Setup slurm
#!/usr/bin/env bash
set -euo pipefail
# Slurm (slurm-llnl on Arch) single-node setup.
# Nothing here runs automatically; the system-level changes below happen only when you execute it.
#
# What it does when run:
# - Installs: slurm-llnl, munge, mariadb (for slurmdbd)
# - Configures munge
# - Writes minimal /etc/slurm-llnl/slurm.conf
# - Initializes MariaDB + creates slurm_acct_db + user
# - Writes /etc/slurm-llnl/slurmdbd.conf (0600)
# - Enables/starts: munge, mariadb, slurmdbd, slurmctld, slurmd
# - Registers a default cluster in Slurm accounting
#
# Uninstall:
# sudo bash setup.sh --uninstall
# sudo bash setup.sh --uninstall --purge # also deletes spools/logs/datadir
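#
# Example invocations (illustrative values; 'lab', 16 and 'node2' below are
# placeholders, not defaults of this script):
# sudo bash setup.sh
# sudo CLUSTER_NAME=lab NODE_CPUS=16 bash setup.sh
# sudo bash setup.sh --add-node node2
# sudo bash setup.sh --update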
get_os_release() {
if [[ -r /etc/os-release ]]; then
. /etc/os-release
echo "${ID:-unknown}"
else
echo "unknown"
fi
}
OS_ID="$(get_os_release)"
DEFAULT_SLURM_ETC_DIR="/etc/slurm"
if [[ "$OS_ID" == "arch" || "$OS_ID" == "manjaro" ]]; then
DEFAULT_SLURM_ETC_DIR="/etc/slurm-llnl"
fi
SLURM_ETC_DIR="${SLURM_ETC_DIR:-$DEFAULT_SLURM_ETC_DIR}"
SLURM_CONF="${SLURM_CONF:-$SLURM_ETC_DIR/slurm.conf}"
SLURMDBD_CONF="${SLURMDBD_CONF:-$SLURM_ETC_DIR/slurmdbd.conf}"
CLUSTER_NAME="${CLUSTER_NAME:-cluster}"
CONTROL_HOST="${CONTROL_HOST:-}"
NODE_NAME="${NODE_NAME:-}"
# Choose a reasonable default CPU count; override with NODE_CPUS.
NODE_CPUS="${NODE_CPUS:-$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 1)}"
# Database settings (dedicated MariaDB/MySQL instance)
SLURM_DB_NAME="${SLURM_DB_NAME:-slurm_acct_db}"
SLURM_DB_USER="${SLURM_DB_USER:-slurm}"
# Provide your own password via env var, or let the script generate one.
SLURM_DB_PASS="${SLURM_DB_PASS:-}"
MARIADB_INSTANCE_NAME="${MARIADB_INSTANCE_NAME:-slurm}"
MARIADB_INSTANCE_UNIT="${MARIADB_INSTANCE_UNIT:-mariadb-${MARIADB_INSTANCE_NAME}.service}"
MARIADB_PORT="${MARIADB_PORT:-3307}"
MARIADB_DATADIR="${MARIADB_DATADIR:-/var/lib/mysql-${MARIADB_INSTANCE_NAME}}"
MARIADB_SOCKET="${MARIADB_SOCKET:-/run/mysqld/mysqld-${MARIADB_INSTANCE_NAME}.sock}"
MARIADB_PIDFILE="${MARIADB_PIDFILE:-/run/mysqld/mysqld-${MARIADB_INSTANCE_NAME}.pid}"
MARIADB_CNF="${MARIADB_CNF:-/etc/my.cnf.d/${MARIADB_INSTANCE_NAME}.cnf}"
require_root() {
if [[ "${EUID:-$(id -u)}" -ne 0 ]]; then
echo "ERROR: run as root (or via sudo)." >&2
exit 1
fi
}
usage() {
cat <<EOF
Usage:
sudo bash setup.sh
sudo bash setup.sh --update
sudo bash setup.sh --add-node <host>
sudo bash setup.sh --node
sudo bash setup.sh --uninstall [--purge]
Options:
--update Updates the NodeName config from 'slurmd -C' and restarts Slurm.
--add-node Adds a compute node via SSH (installs/configures munge+slurmd on target).
--node Configures this machine as a compute node (intended to be run remotely).
--uninstall Stops/disables Slurm services and removes files created by this script.
--purge Additionally removes Slurm spools/logs and the dedicated MariaDB datadir.
Environment overrides (setup):
CLUSTER_NAME, CONTROL_HOST, NODE_NAME, NODE_CPUS,
MARIADB_INSTANCE_NAME, MARIADB_PORT, MARIADB_DATADIR, MARIADB_SOCKET,
SLURM_DB_NAME, SLURM_DB_USER, SLURM_DB_PASS
Environment overrides (--add-node):
SSH_USER (default: ${SUDO_USER:-user}), SSH_OPTS (default: "-o StrictHostKeyChecking=accept-new")
SSH_SUDO_PASS (plain text password to feed sudo -S on the remote side)
SSH_SUDO_NO_PASSWORD=1 (skip the prompt when remote sudo already runs without a password)
Environment overrides (--node):
MUNGE_KEY_B64 Base64-encoded munge.key (required)
SLURM_CONF_B64 Base64-encoded slurm.conf to install (required)
EOF
}
short_hostname() {
local h
if command -v hostname >/dev/null 2>&1; then
h="$(hostname 2>/dev/null || true)"
if [[ -n "${h:-}" ]]; then
echo "$h" | cut -d. -f1
return 0
fi
fi
if command -v hostnamectl >/dev/null 2>&1; then
h="$(hostnamectl --static 2>/dev/null || true)"
if [[ -n "${h:-}" ]]; then
echo "$h" | cut -d. -f1
return 0
fi
fi
if [[ -r /etc/hostname ]]; then
h="$(head -n1 /etc/hostname 2>/dev/null | tr -d '\r' || true)"
if [[ -n "${h:-}" ]]; then
echo "$h" | cut -d. -f1
return 0
fi
fi
h="$(uname -n 2>/dev/null || true)"
if [[ -n "${h:-}" ]]; then
echo "$h" | cut -d. -f1
return 0
fi
echo "localhost"
}
install_packages() {
local role="${1:-control}"
local packages
if command -v pacman >/dev/null 2>&1; then
packages=(munge openssl inetutils)
if [[ "$role" == "control" ]]; then
packages+=(slurm-llnl mariadb)
else
packages+=(slurm-llnl)
fi
# -Syy refreshes the sync databases; on an out-of-date Arch system run a full 'pacman -Syu' first to avoid partial upgrades.
pacman -Syy --noconfirm --needed "${packages[@]}"
return
fi
if command -v dnf >/dev/null 2>&1; then
# Fedora/EPEL ship hostname(1) in the "hostname" package and split the Slurm daemons into subpackages.
packages=(munge openssl hostname)
if [[ "$role" == "control" ]]; then
packages+=(slurm slurm-slurmctld slurm-slurmd slurm-slurmdbd mariadb-server)
else
packages+=(slurm slurm-slurmd)
fi
dnf install -y --setopt=install_weak_deps=false "${packages[@]}"
return
fi
if command -v apt-get >/dev/null 2>&1; then
if [[ "$role" == "control" ]]; then
packages=(slurm-wlm slurmdbd munge mariadb-server openssl hostname)
else
packages=(slurmd munge openssl hostname)
fi
apt-get update
apt-get install -y "${packages[@]}"
return
fi
echo "ERROR: no supported package manager found (pacman/dnf/apt-get)." >&2
exit 1
}
require_cmd() {
local cmd="$1"
if ! command -v "$cmd" >/dev/null 2>&1; then
echo "ERROR: missing required command: $cmd" >&2
exit 1
fi
}
b64_file() {
local file="$1"
# GNU coreutils base64 supports -w; fall back if unavailable.
if base64 --help 2>/dev/null | grep -q -- ' -w'; then
base64 -w0 "$file"
else
base64 "$file" | tr -d '\n'
fi
}
slurmd_node_line() {
if ! command -v slurmd >/dev/null 2>&1; then
echo ""; return 1
fi
local line
line="$(slurmd -C 2>/dev/null | head -n1 || true)"
if [[ -z "${line:-}" || "$line" != NodeName=* ]]; then
echo ""; return 1
fi
echo "$line"
}
apply_node_line_to_slurm_conf() {
# Pure transformer:
# apply_node_line_to_slurm_conf <node_line> <slurm_conf_path>
# Writes an updated copy of <slurm_conf_path> to a new temp file and prints the path.
local node_line="${1:-}"
local slurm_conf="${2:-}"
if [[ -z "${node_line:-}" || "$node_line" != NodeName=* ]]; then
echo "ERROR: apply_node_line_to_slurm_conf requires a NodeName=... line (got: ${node_line:-<empty>})" >&2
return 2
fi
if [[ -z "${slurm_conf:-}" || ! -f "$slurm_conf" ]]; then
echo "ERROR: apply_node_line_to_slurm_conf requires an existing slurm.conf path (got: ${slurm_conf:-<empty>})" >&2
return 2
fi
local node_name
node_name="$(echo "$node_line" | sed -n 's/^NodeName=\([^ ]\+\).*/\1/p')"
if [[ -z "${node_name:-}" ]]; then
echo "ERROR: could not parse NodeName from: $node_line" >&2
return 2
fi
local out
out="$(mktemp "${slurm_conf}.updated.XXXXXX")"
awk -v node_line="$node_line" -v node_name="$node_name" '
function add_to_nodes_list(line, node, nodes, i, n, found, rest, newnodes) {
# Two-argument match() sets RSTART/RLENGTH and works in POSIX awk (no gawk-only array form needed).
if (match(line, /(^|[[:space:]])Nodes=[^[:space:]]+/)) {
rest = substr(line, RSTART)
sub(/^[[:space:]]*Nodes=/, "", rest)
sub(/[[:space:]].*$/, "", rest)
n = split(rest, nodes, /,/)
found=0
for (i=1; i<=n; i++) if (nodes[i]==node) found=1
newnodes = rest
if (!found) newnodes = rest "," node
sub(/(^|[[:space:]])Nodes=[^[:space:]]+/, " Nodes=" newnodes, line)
return line
}
return line " Nodes=" node
}
BEGIN {
wrote_node=0; saw_part=0
node_count=0; buffered_first=0
first_line=""; first_is_match=0
}
$0 ~ /^NodeName=/ {
node_count++
if (node_count==1) {
# Buffer the first NodeName line until we know if this is a single-node config.
first_line=$0
first_is_match = ($0 ~ ("^NodeName=" node_name "( |$)"))
buffered_first=1
next
}
if (node_count==2 && buffered_first==1) {
# Multi-node config: flush the buffered first NodeName line unchanged unless it matches.
if (first_is_match) {
print node_line
wrote_node=1
} else {
print first_line
}
buffered_first=0
}
# Multi-node NodeName lines: replace only on exact match.
if ($0 ~ ("^NodeName=" node_name "( |$)")) {
print node_line
wrote_node=1
next
}
print
next
}
$0 ~ /^PartitionName=/ {
print add_to_nodes_list($0, node_name)
saw_part=1
next
}
{ print }
END {
if (buffered_first==1) {
# Exactly one NodeName line in the file.
# If it matches, replace it; otherwise keep it and append the new node.
if (first_is_match) {
print node_line
wrote_node=1
} else {
print first_line
}
buffered_first=0
}
# If there were zero NodeName lines, or this is multi-node and the node was missing, append.
if (wrote_node==0) print node_line
if (saw_part==0) print "PartitionName=debug Nodes=" node_name " Default=YES MaxTime=INFINITE State=UP"
}
' "$slurm_conf" >"$out"
chmod --reference="$slurm_conf" "$out" >/dev/null 2>&1 || chmod 0644 "$out" || true
printf '%s\n' "$out"
}
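# Illustrative sketch of the transformer above (hypothetical node "node2" and
# hardware values; in real use the NodeName line comes from 'slurmd -C' on that node):
# new_conf="$(apply_node_line_to_slurm_conf 'NodeName=node2 CPUs=16 RealMemory=64000' "$SLURM_CONF")"
# The temp file at "$new_conf" then carries the node2 NodeName line and a debug
# partition whose Nodes= list includes node2; the original slurm.conf is untouched.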
ssh_target() {
local host="$1"
# Prefer connecting as the invoking (non-root) user; elevate remotely with sudo.
local user="${SSH_USER:-${SUDO_USER:-user}}"
echo "${user}@${host}"
}
ssh_opts() {
# StrictHostKeyChecking=accept-new is convenient for first-time setup.
# Override with SSH_OPTS if you want stricter behavior.
echo "${SSH_OPTS:--o StrictHostKeyChecking=accept-new}"
}
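# Illustrative override (hypothetical user name and key path, not defaults of this script):
# sudo SSH_USER=admin SSH_OPTS="-o StrictHostKeyChecking=yes -i /home/admin/.ssh/id_ed25519" bash setup.sh --add-node node2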
prompt_remote_sudo_password() {
local host="$1"
local user="${SSH_USER:-${SUDO_USER:-user}}"
if [[ "$user" == "root" || "${SSH_SUDO_NO_PASSWORD:-0}" == "1" ]]; then
# No password needed when connecting as root or the caller already opted out.
return 0
fi
if [[ -n "${SSH_SUDO_PASS:-}" ]]; then
printf '%s' "$SSH_SUDO_PASS"
return 0
fi
if [[ ! -e /dev/tty ]]; then
echo "ERROR: cannot prompt for remote sudo password (no /dev/tty)" >&2
exit 1
fi
local pass
read -r -s -p "Password for sudo on ${user}@${host}: " pass </dev/tty
printf '\n' >&2
if [[ -z "${pass:-}" ]]; then
echo "ERROR: password cannot be empty" >&2
exit 1
fi
printf '%s' "$pass"
}
setup_munge() {
install -d -m 0700 -o munge -g munge /etc/munge
if [[ ! -f /etc/munge/munge.key ]]; then
dd if=/dev/urandom of=/etc/munge/munge.key bs=1 count=1024 status=none
chown munge:munge /etc/munge/munge.key
chmod 0400 /etc/munge/munge.key
fi
systemctl enable --now munge.service
}
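# Optional manual sanity check for munge (commands shipped with the munge package):
# munge -n | unmunge # local round trip
# munge -n | ssh <node> unmunge # cross-node check once a compute node shares the key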
setup_slurm_dirs() {
install -d -m 0755 -o slurm -g slurm /var/spool/slurmctld
install -d -m 0755 -o slurm -g slurm /var/spool/slurmd
install -d -m 0755 -o slurm -g slurm /var/log/slurm
}
write_slurm_conf() {
install -d -m 0755 "$SLURM_ETC_DIR"
cat >"$SLURM_CONF" <<EOF
# Minimal single-node Slurm config
ClusterName=$CLUSTER_NAME
SlurmctldHost=$CONTROL_HOST
SlurmUser=slurm
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
# CryptoType was renamed CredType in newer Slurm releases; drop this line if slurmctld rejects it.
CryptoType=crypto/munge
StateSaveLocation=/var/spool/slurmctld
SlurmdSpoolDir=/var/spool/slurmd
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/run/slurmctld.pid
SlurmdPidFile=/run/slurmd.pid
ProctrackType=proctrack/cgroup
TaskPlugin=task/cgroup
ReturnToService=2
SlurmctldTimeout=120
SlurmdTimeout=300
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_Core
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=localhost
NodeName=$NODE_NAME CPUs=$NODE_CPUS State=UNKNOWN
PartitionName=debug Nodes=$NODE_NAME Default=YES MaxTime=INFINITE State=UP
EOF
chmod 0644 "$SLURM_CONF"
# Convenience symlink for tools/docs that expect /etc/slurm.conf
# if [[ ! -e /etc/slurm.conf ]]; then
# ln -s "$SLURM_CONF" /etc/slurm.conf
# fi
}
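# Optional manual checks once slurmctld/slurmd are running:
# scontrol show config # shows the configuration as parsed by the controller
# sinfo # the node should appear in the debug partition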
write_mariadb_instance_conf() {
install -d -m 0755 /etc/my.cnf.d
cat >"$MARIADB_CNF" <<EOF
[mysqld]
user=mysql
datadir=$MARIADB_DATADIR
socket=$MARIADB_SOCKET
port=$MARIADB_PORT
pid-file=$MARIADB_PIDFILE
bind-address=127.0.0.1
skip-name-resolve
[client]
socket=$MARIADB_SOCKET
EOF
chmod 0644 "$MARIADB_CNF"
}
write_mariadb_instance_unit() {
local mariadbd_bin
mariadbd_bin=$(command -v mariadbd 2>/dev/null || echo /usr/sbin/mariadbd)
install -d -m 0755 /etc/systemd/system
cat >"/etc/systemd/system/$MARIADB_INSTANCE_UNIT" <<EOF
[Unit]
Description=MariaDB instance for Slurm accounting (%i)
After=network.target
[Service]
Type=notify
User=mysql
Group=mysql
ExecStartPre=/usr/bin/install -d -o mysql -g mysql -m 0755 /run/mysqld
ExecStart=${mariadbd_bin} --defaults-file=$MARIADB_CNF
Restart=on-failure
TimeoutStartSec=120
[Install]
WantedBy=multi-user.target
EOF
}
init_mariadb_instance() {
install -d -m 0755 -o mysql -g mysql "$MARIADB_DATADIR"
if [[ ! -d "$MARIADB_DATADIR/mysql" ]]; then
mariadb-install-db --user=mysql --basedir=/usr --datadir="$MARIADB_DATADIR"
fi
systemctl daemon-reload
systemctl enable --now "$MARIADB_INSTANCE_UNIT"
# Wait until the instance answers over its socket.
for _ in {1..60}; do
if mariadb-admin --protocol=socket --socket="$MARIADB_SOCKET" ping >/dev/null 2>&1; then
return 0
fi
sleep 0.5
done
echo "ERROR: MariaDB instance did not become ready." >&2
exit 1
}
ensure_db_user_and_schema() {
if [[ -z "$SLURM_DB_PASS" ]]; then
SLURM_DB_PASS="$(openssl rand -base64 24 | tr -d '\n' | tr '/+' '_-' | cut -c1-32)"
echo "Generated SLURM_DB_PASS=$SLURM_DB_PASS"
echo "Save it somewhere safe; it is written into $SLURMDBD_CONF"
fi
# Create DB and user if missing; idempotent.
mariadb --protocol=socket --socket="$MARIADB_SOCKET" <<EOF
CREATE DATABASE IF NOT EXISTS \`$SLURM_DB_NAME\`;
CREATE USER IF NOT EXISTS '$SLURM_DB_USER'@'localhost' IDENTIFIED BY '$SLURM_DB_PASS';
ALTER USER '$SLURM_DB_USER'@'localhost' IDENTIFIED BY '$SLURM_DB_PASS';
GRANT ALL PRIVILEGES ON \`$SLURM_DB_NAME\`.* TO '$SLURM_DB_USER'@'localhost';
CREATE USER IF NOT EXISTS '$SLURM_DB_USER'@'127.0.0.1' IDENTIFIED BY '$SLURM_DB_PASS';
ALTER USER '$SLURM_DB_USER'@'127.0.0.1' IDENTIFIED BY '$SLURM_DB_PASS';
GRANT ALL PRIVILEGES ON \`$SLURM_DB_NAME\`.* TO '$SLURM_DB_USER'@'127.0.0.1';
FLUSH PRIVILEGES;
EOF
}
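# Optional manual check of the accounting database over the instance socket:
# mariadb --protocol=socket --socket="$MARIADB_SOCKET" -e "SHOW DATABASES; SHOW GRANTS FOR '$SLURM_DB_USER'@'localhost';"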
write_slurmdbd_conf() {
install -d -m 0755 "$SLURM_ETC_DIR"
cat >"$SLURMDBD_CONF" <<EOF
# SlurmDBD configuration (MariaDB local)
AuthType=auth/munge
DbdHost=localhost
DbdPort=6819
SlurmUser=slurm
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/run/slurmdbd/slurmdbd.pid
StorageType=accounting_storage/mysql
StorageHost=127.0.0.1
StoragePort=$MARIADB_PORT
StorageUser=$SLURM_DB_USER
StoragePass=$SLURM_DB_PASS
StorageLoc=$SLURM_DB_NAME
EOF
# slurmdbd.service runs as slurm:slurm on Arch; it must be able to read this file.
chown slurm:slurm "$SLURMDBD_CONF"
chmod 0600 "$SLURMDBD_CONF"
# Convenience symlink for tools/docs that expect /etc/slurmdbd.conf
# if [[ ! -e /etc/slurmdbd.conf ]]; then
# ln -s "$SLURMDBD_CONF" /etc/slurmdbd.conf
# fi
}
write_slurmdbd_systemd_dropin() {
# Arch's slurmdbd.service only depends on mariadb.service/mysql.service.
# When using a dedicated instance (e.g. mariadb-slurm.service), add explicit ordering.
install -d -m 0755 /etc/systemd/system/slurmdbd.service.d
cat >"/etc/systemd/system/slurmdbd.service.d/override.conf" <<EOF
[Unit]
Wants=$MARIADB_INSTANCE_UNIT
After=$MARIADB_INSTANCE_UNIT
EOF
systemctl daemon-reload
}
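# Optional check that the drop-in took effect:
# systemctl cat slurmdbd.service # should list override.conf with the Wants=/After= ordering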
start_slurm_services() {
systemctl enable --now slurmdbd.service
systemctl enable --now slurmctld.service
systemctl enable --now slurmd.service
}
safe_rm_symlink() {
local link_path="$1"
local expected_target="$2"
if [[ -L "$link_path" ]]; then
local actual_target
actual_target="$(readlink -f "$link_path" 2>/dev/null || true)"
if [[ -n "${actual_target:-}" && "$actual_target" == "$expected_target" ]]; then
rm -f "$link_path"
fi
fi
}
uninstall() {
require_root
local purge=0
if [[ "${1:-}" == "--purge" ]]; then
purge=1
fi
echo "Stopping/disabling Slurm services..."
systemctl disable --now slurmd.service slurmctld.service slurmdbd.service >/dev/null 2>&1 || true
echo "Stopping/disabling dedicated MariaDB instance ($MARIADB_INSTANCE_UNIT)..."
systemctl disable --now "$MARIADB_INSTANCE_UNIT" >/dev/null 2>&1 || true
echo "Removing systemd drop-ins/units created by this script..."
rm -f "/etc/systemd/system/$MARIADB_INSTANCE_UNIT" 2>/dev/null || true
rm -f "/etc/systemd/system/slurmdbd.service.d/override.conf" 2>/dev/null || true
rmdir "/etc/systemd/system/slurmdbd.service.d" 2>/dev/null || true
echo "Removing MariaDB instance config created by this script..."
rm -f "$MARIADB_CNF" 2>/dev/null || true
echo "Removing Slurm configs created by this script (safe check)..."
if [[ -f "$SLURM_CONF" ]] && grep -q '^# Minimal single-node Slurm config' "$SLURM_CONF"; then
rm -f "$SLURM_CONF"
fi
if [[ -f "$SLURMDBD_CONF" ]] && grep -q '^# SlurmDBD configuration (MariaDB local)' "$SLURMDBD_CONF"; then
rm -f "$SLURMDBD_CONF"
fi
# Remove symlinks only if they point to our files.
safe_rm_symlink /etc/slurm.conf "$SLURM_CONF"
safe_rm_symlink /etc/slurmdbd.conf "$SLURMDBD_CONF"
if [[ "$purge" -eq 1 ]]; then
echo "Purging Slurm spools/logs and MariaDB datadir..."
rm -rf /var/spool/slurmctld /var/spool/slurmd /var/log/slurm 2>/dev/null || true
if [[ -n "${MARIADB_DATADIR:-}" ]]; then
rm -rf "$MARIADB_DATADIR" 2>/dev/null || true
fi
elif [[ -d "$MARIADB_DATADIR" ]]; then
echo "NOTE: Keeping $MARIADB_DATADIR (pass --purge to delete)."
fi
systemctl daemon-reload
echo "OK: uninstall completed."
}
update_node_config() {
# CLI behavior for --update: detect node_line from slurmd -C, rewrite slurm.conf, restart services.
require_root
if [[ ! -f "$SLURM_CONF" ]]; then
echo "ERROR: $SLURM_CONF does not exist." >&2
exit 1
fi
if ! command -v slurmd >/dev/null 2>&1; then
echo "ERROR: slurmd not found. Install slurm-llnl first." >&2
exit 1
fi
local detected
detected="$(slurmd -C 2>/dev/null | head -n1 || true)"
if [[ -z "${detected:-}" || "$detected" != NodeName=* ]]; then
echo "ERROR: failed to get NodeName line from 'slurmd -C'." >&2
exit 1
fi
local detected_node
detected_node="$(echo "$detected" | sed -n 's/^NodeName=\([^ ]\+\).*/\1/p')"
if [[ -z "${detected_node:-}" ]]; then
echo "ERROR: could not parse NodeName from: $detected" >&2
exit 1
fi
local backup
backup="${SLURM_CONF}.bak.$(date +%Y%m%d%H%M%S)"
cp -a "$SLURM_CONF" "$backup"
local updated
updated="$(apply_node_line_to_slurm_conf "$detected" "$backup")"
if [[ -z "${updated:-}" || ! -f "$updated" ]]; then
echo "ERROR: failed to generate updated slurm.conf" >&2
exit 1
fi
mv "$updated" "$SLURM_CONF"
chmod 0644 "$SLURM_CONF" || true
echo "Updated $SLURM_CONF using: $detected"
# Apply changes
systemctl restart slurmctld.service slurmd.service >/dev/null 2>&1 || true
# Clear DRAIN if it was caused by old config mismatch.
scontrol update NodeName="$detected_node" State=RESUME >/dev/null 2>&1 || true
echo "OK: node config updated. Backup: $backup"
}
add_node() {
require_root
require_cmd ssh
require_cmd base64
local host="${1:-}"
if [[ -z "${host:-}" ]]; then
echo "ERROR: --add-node requires a host argument" >&2
usage
exit 2
fi
if [[ ! -f "$SLURM_CONF" ]]; then
echo "ERROR: $SLURM_CONF does not exist. Run setup first." >&2
exit 1
fi
if [[ ! -r /etc/munge/munge.key ]]; then
echo "ERROR: /etc/munge/munge.key not readable. Run setup_munge first." >&2
exit 1
fi
echo "Configuring remote node via SSH (sudo on remote): $(ssh_target "$host")"
local ssh_opts_array=()
read -r -a ssh_opts_array <<< "$(ssh_opts)"
local control_dir
control_dir="$(mktemp -d)"
if [[ -z "${control_dir:-}" || ! -d "$control_dir" ]]; then
echo "ERROR: could not create control directory" >&2
exit 1
fi
local control_path="${control_dir}/control-master.sock"
local ssh_all_opts=("${ssh_opts_array[@]}" -o ControlMaster=auto -o ControlPersist=300s -o ControlPath="$control_path")
ssh_exec() {
ssh "${ssh_all_opts[@]}" "$@"
}
scp_exec() {
scp "${ssh_all_opts[@]}" "$@"
}
cleanup_control() {
ssh_exec -O exit "$(ssh_target "$host")" >/dev/null 2>&1 || true
rm -rf "$control_dir"
}
trap cleanup_control RETURN
# Upload this script to the remote node, then run it under sudo.
local self
self="$(readlink -f "$0" 2>/dev/null || echo "$0")"
if [[ ! -f "$self" ]]; then
echo "ERROR: could not resolve script path for copying (got: $self)" >&2
exit 1
fi
# Step 1: create a temp script file on the remote host.
local remote_script
remote_script="$(ssh_exec "$(ssh_target "$host")" "mktemp /tmp/slurm-setup-node.XXXXXX.sh")"
remote_script="${remote_script//$'\r'/}"
remote_script="${remote_script//$'\n'/}"
if [[ -z "${remote_script:-}" ]]; then
echo "ERROR: failed to create remote temp file" >&2
exit 1
fi
# Step 2: upload the script as the SSH user (no sudo needed).
scp_exec "$self" "$(ssh_target "$host"):$remote_script"
ssh_exec "$(ssh_target "$host")" "chmod 0700 '$remote_script'"
# Step 3: run it with sudo in a TTY so sudo can prompt for a password if needed.
local remote_sudo_pass
remote_sudo_pass="$(prompt_remote_sudo_password "$host")"
# Phase 1: probe remote node hardware to get its NodeName line.
local probe_cmd
probe_cmd="bash '$remote_script' --probe-node-line"
local probe_out
if [[ -n "${remote_sudo_pass:-}" ]]; then
probe_out="$(printf '%s\n' "$remote_sudo_pass" | ssh_exec "$(ssh_target "$host")" "sudo -S -p '' $probe_cmd")"
else
probe_out="$(ssh_exec -tt "$(ssh_target "$host")" "sudo $probe_cmd")"
fi
local node_line
node_line="$(echo "$probe_out" | sed -n 's/^NODELINE://p' | tail -n1)"
if [[ -z "${node_line:-}" || "$node_line" != NodeName=* ]]; then
echo "ERROR: failed to obtain NodeName line from remote slurmd -C" >&2
echo "Remote output:" >&2
echo "$probe_out" >&2
exit 1
fi
local node_name
node_name="$(echo "$node_line" | sed -n 's/^NodeName=\([^ ]\+\).*/\1/p')"
if [[ -z "${node_name:-}" ]]; then
echo "ERROR: could not parse NodeName from: $node_line" >&2
exit 1
fi
# Phase 2: update controller slurm.conf locally using the probed node_line.
# - add/replace a NodeName line for this node
# - ensure PartitionName Nodes includes this node
local backup
backup="${SLURM_CONF}.bak.add-node.$(date +%Y%m%d%H%M%S)"
cp -a "$SLURM_CONF" "$backup"
local updated
updated="$(apply_node_line_to_slurm_conf "$node_line" "$backup")"
if [[ -z "${updated:-}" || ! -f "$updated" ]]; then
echo "ERROR: failed to generate updated slurm.conf for add-node" >&2
exit 1
fi
mv "$updated" "$SLURM_CONF"
chmod 0644 "$SLURM_CONF" || true
echo "Added/updated node in controller config: $node_name"
echo "Restarting controller services..."
systemctl restart slurmctld.service slurmd.service >/dev/null 2>&1 || true
# Phase 3: send updated config + munge key to the remote node and start services.
local munge_key_b64
local slurm_conf_b64
munge_key_b64="$(b64_file /etc/munge/munge.key)"
slurm_conf_b64="$(b64_file "$SLURM_CONF")"
local node_cmd
node_cmd="env MUNGE_KEY_B64='$munge_key_b64' SLURM_CONF_B64='$slurm_conf_b64' bash '$remote_script' --node"
local node_out
if [[ -n "${remote_sudo_pass:-}" ]]; then
node_out="$(printf '%s\n' "$remote_sudo_pass" | ssh_exec "$(ssh_target "$host")" "sudo -S -p '' $node_cmd")"
else
node_out="$(ssh_exec -tt "$(ssh_target "$host")" "sudo $node_cmd")"
fi
# Cleanup remote script after successful runs.
ssh_exec "$(ssh_target "$host")" "rm -f '$remote_script'" >/dev/null 2>&1 || true
remote_sudo_pass=""
# Clear DRAIN if it was caused by old config mismatch.
scontrol update NodeName="$node_name" State=RESUME >/dev/null 2>&1 || true
echo "OK: node added. Backup: $backup"
}
node_probe_mode() {
# Intended to be executed on a compute node to discover its slurmd -C NodeName line.
require_root
install_packages node
local node_line
node_line="$(slurmd_node_line 2>/dev/null || true)"
if [[ -z "${node_line:-}" || "$node_line" != NodeName=* ]]; then
echo "ERROR: failed to get NodeName line from 'slurmd -C'." >&2
exit 1
fi
echo "NODELINE:$node_line"
}
node_mode() {
# Intended to be executed on a compute node (often via --add-node).
require_root
install_packages node
if [[ -z "${MUNGE_KEY_B64:-}" ]]; then
echo "ERROR: MUNGE_KEY_B64 is required in --node mode" >&2
exit 2
fi
if [[ -z "${SLURM_CONF_B64:-}" ]]; then
echo "ERROR: SLURM_CONF_B64 is required in --node mode" >&2
exit 2
fi
# Install munge key.
install -d -m 0700 -o munge -g munge /etc/munge
local tmp_key
tmp_key="$(mktemp)"
printf '%s' "$MUNGE_KEY_B64" | base64 -d >"$tmp_key"
install -m 0400 -o munge -g munge "$tmp_key" /etc/munge/munge.key
rm -f "$tmp_key"
# Install slurm.conf (then patch NodeName from slurmd -C before starting slurmd).
install -d -m 0755 "$SLURM_ETC_DIR"
local tmp_conf
tmp_conf="$(mktemp)"
printf '%s' "$SLURM_CONF_B64" | base64 -d >"$tmp_conf"
install -m 0644 -o root -g root "$tmp_conf" "$SLURM_CONF"
rm -f "$tmp_conf"
setup_slurm_dirs
local node_line
node_line="$(slurmd_node_line 2>/dev/null || true)"
if [[ -n "${node_line:-}" && "$node_line" == NodeName=* ]]; then
local backup
backup="${SLURM_CONF}.bak.slurmdC.$(date +%Y%m%d%H%M%S)"
cp -a "$SLURM_CONF" "$backup"
local updated
updated="$(apply_node_line_to_slurm_conf "$node_line" "$backup")" || true
if [[ -n "${updated:-}" && -f "$updated" ]]; then
mv "$updated" "$SLURM_CONF"
chmod 0644 "$SLURM_CONF" || true
echo "Updated $SLURM_CONF from slurmd -C (backup: $backup)"
fi
fi
systemctl enable munge.service slurmd.service
systemctl restart munge.service slurmd.service
echo "NODELINE:$node_line"
}
register_cluster() {
# Poll sacctmgr: if a cluster is already registered we are done; otherwise this
# doubles as a short wait for slurmdbd to come up before attempting the add below.
for _ in {1..20}; do
if sacctmgr -n show cluster 2>/dev/null | grep -q .; then
return 0
fi
sleep 0.5
done
# Idempotent create (ignore errors if it already exists).
sacctmgr -i add cluster "$CLUSTER_NAME" >/dev/null 2>&1 || true
}
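# Typical follow-up accounting steps (the "research" account name below is
# illustrative; nothing like it is created by this script):
# sacctmgr show cluster
# sacctmgr -i add account research Description="research jobs"
# sacctmgr -i add user "$SUDO_USER" DefaultAccount=research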
main() {
require_root
install_packages
if [[ -z "${CONTROL_HOST:-}" ]]; then
CONTROL_HOST="$(short_hostname)"
fi
if [[ -z "${NODE_NAME:-}" ]]; then
NODE_NAME="$CONTROL_HOST"
fi
setup_munge
setup_slurm_dirs
write_slurm_conf
# Before starting slurmd (local node), rewrite NodeName line to match actual hardware.
if node_line="$(slurmd_node_line 2>/dev/null)"; then
local backup
backup="${SLURM_CONF}.bak.slurmdC.$(date +%Y%m%d%H%M%S)"
cp -a "$SLURM_CONF" "$backup"
local updated
updated="$(apply_node_line_to_slurm_conf "$node_line" "$backup")" || true
if [[ -n "${updated:-}" && -f "$updated" ]]; then
mv "$updated" "$SLURM_CONF"
chmod 0644 "$SLURM_CONF" || true
echo "Updated $SLURM_CONF from slurmd -C (backup: $backup)"
fi
fi
write_mariadb_instance_conf
write_mariadb_instance_unit
init_mariadb_instance
ensure_db_user_and_schema
write_slurmdbd_conf
write_slurmdbd_systemd_dropin
start_slurm_services
register_cluster
echo "OK: Slurm services enabled and started."
echo "Next checks: scontrol ping; sinfo; squeue; sacctmgr show cluster"
}
case "${1:-}" in
-h|--help)
usage
exit 0
;;
--update)
update_node_config
exit 0
;;
--add-node)
add_node "${2:-}"
exit 0
;;
--node)
node_mode
exit 0
;;
--probe-node-line)
node_probe_mode
exit 0
;;
--uninstall)
uninstall "${2:-}"
exit 0
;;
--*)
echo "ERROR: unknown option: $1" >&2
usage
exit 2
;;
esac
main "$@"
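# Post-setup smoke test (run manually as a regular user once services are up):
# sinfo # node should be idle in the debug partition
# srun -N1 hostname # runs hostname on an allocated node
# sbatch --wrap="sleep 10" && squeue # submits a batch job and lists the queue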