Last active
December 23, 2025 23:45
-
-
Save clouds56/f6c0678c680ebbf079a9b575a96bc303 to your computer and use it in GitHub Desktop.
Setup slurm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
set -euo pipefail
# Slurm (slurm-llnl on Arch) single-node setup.
# This script DOES NOT run unless you execute it.
#
# What it does when run:
#  - Installs: slurm-llnl, munge, mariadb (for slurmdbd)
#  - Configures munge
#  - Writes minimal /etc/slurm-llnl/slurm.conf
#  - Initializes MariaDB + creates slurm_acct_db + user
#  - Writes /etc/slurm-llnl/slurmdbd.conf (0600)
#  - Enables/starts: munge, mariadb, slurmdbd, slurmctld, slurmd
#  - Registers a default cluster in Slurm accounting
#
# Uninstall:
#   sudo bash setup.sh --uninstall
#   sudo bash setup.sh --uninstall --purge   # also deletes spools/logs/datadir
get_os_release() {
  # Print the distro ID from /etc/os-release ("arch", "ubuntu", ...) or
  # "unknown" when the file is absent/unreadable.
  local id="unknown"
  if [[ -r /etc/os-release ]]; then
    . /etc/os-release
    id="${ID:-unknown}"
  fi
  echo "$id"
}
OS_ID="$(get_os_release)"
# Arch-family packages install their config under /etc/slurm-llnl; everything
# else uses the upstream default /etc/slurm.
case "$OS_ID" in
  arch|manjaro) DEFAULT_SLURM_ETC_DIR="/etc/slurm-llnl" ;;
  *)            DEFAULT_SLURM_ETC_DIR="/etc/slurm" ;;
esac
SLURM_ETC_DIR="${SLURM_ETC_DIR:-$DEFAULT_SLURM_ETC_DIR}"
SLURM_CONF="${SLURM_CONF:-$SLURM_ETC_DIR/slurm.conf}"
SLURMDBD_CONF="${SLURMDBD_CONF:-$SLURM_ETC_DIR/slurmdbd.conf}"
CLUSTER_NAME="${CLUSTER_NAME:-cluster}"
CONTROL_HOST="${CONTROL_HOST:-}"
NODE_NAME="${NODE_NAME:-}"
# Choose a reasonable default CPU count; override with NODE_CPUS.
NODE_CPUS="${NODE_CPUS:-$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 1)}"
# Database settings (dedicated MariaDB/MySQL instance).
SLURM_DB_NAME="${SLURM_DB_NAME:-slurm_acct_db}"
SLURM_DB_USER="${SLURM_DB_USER:-slurm}"
# Provide your own password via env var, or let the script generate one.
SLURM_DB_PASS="${SLURM_DB_PASS:-}"
# The dedicated instance gets its own unit, datadir, socket, pidfile and cnf
# so it never collides with a system-wide MariaDB on port 3306.
MARIADB_INSTANCE_NAME="${MARIADB_INSTANCE_NAME:-slurm}"
MARIADB_INSTANCE_UNIT="${MARIADB_INSTANCE_UNIT:-mariadb-${MARIADB_INSTANCE_NAME}.service}"
MARIADB_PORT="${MARIADB_PORT:-3307}"
MARIADB_DATADIR="${MARIADB_DATADIR:-/var/lib/mysql-${MARIADB_INSTANCE_NAME}}"
MARIADB_SOCKET="${MARIADB_SOCKET:-/run/mysqld/mysqld-${MARIADB_INSTANCE_NAME}.sock}"
MARIADB_PIDFILE="${MARIADB_PIDFILE:-/run/mysqld/mysqld-${MARIADB_INSTANCE_NAME}.pid}"
MARIADB_CNF="${MARIADB_CNF:-/etc/my.cnf.d/${MARIADB_INSTANCE_NAME}.cnf}"
require_root() {
  # Abort unless the effective UID is 0 (root or sudo).
  local uid="${EUID:-$(id -u)}"
  if (( uid != 0 )); then
    echo "ERROR: run as root (or via sudo)." >&2
    exit 1
  fi
}
usage() {
  # Print CLI usage to stdout.
  # The heredoc delimiter is intentionally UNQUOTED so ${SUDO_USER:-user}
  # expands to the real default; that means backticks must be escaped —
  # the original `slurmd -C` was being EXECUTED as a command substitution
  # every time usage() ran.
  cat <<EOF
Usage:
  sudo bash setup.sh
  sudo bash setup.sh --update
  sudo bash setup.sh --add-node <host>
  sudo bash setup.sh --node
  sudo bash setup.sh --uninstall [--purge]
Options:
  --update      Updates NodeName config from \`slurmd -C\` and restarts Slurm.
  --add-node    Adds a compute node via SSH (installs/configures munge+slurmd on target).
  --node        Configures this machine as a compute node (intended to be run remotely).
  --uninstall   Stops/disables Slurm services and removes files created by this script.
  --purge       Additionally removes Slurm spools/logs and the dedicated MariaDB datadir.
Environment overrides (setup):
  CLUSTER_NAME, CONTROL_HOST, NODE_NAME, NODE_CPUS,
  MARIADB_INSTANCE_NAME, MARIADB_PORT, MARIADB_DATADIR, MARIADB_SOCKET,
  SLURM_DB_NAME, SLURM_DB_USER, SLURM_DB_PASS
Environment overrides (--add-node):
  SSH_USER (default: ${SUDO_USER:-user}), SSH_OPTS (default: "-o StrictHostKeyChecking=accept-new")
  SSH_SUDO_PASS (plain text password to feed sudo -S on the remote side)
  SSH_SUDO_NO_PASSWORD=1 (skip the prompt when remote sudo already runs without a password)
Environment overrides (--node):
  MUNGE_KEY_B64   Base64-encoded munge.key (required)
  SLURM_CONF_B64  Base64-encoded slurm.conf to install (required)
EOF
}
short_hostname() {
  # Best-effort short (unqualified) hostname. Tries, in order: hostname(1),
  # hostnamectl --static, /etc/hostname, uname -n; falls back to "localhost".
  # Always strips everything after the first dot.
  local name=""
  if command -v hostname >/dev/null 2>&1; then
    name="$(hostname 2>/dev/null || true)"
  fi
  if [[ -z "${name:-}" ]] && command -v hostnamectl >/dev/null 2>&1; then
    name="$(hostnamectl --static 2>/dev/null || true)"
  fi
  if [[ -z "${name:-}" && -r /etc/hostname ]]; then
    # tr strips a stray CR in case the file has DOS line endings.
    name="$(head -n1 /etc/hostname 2>/dev/null | tr -d '\r' || true)"
  fi
  if [[ -z "${name:-}" ]]; then
    name="$(uname -n 2>/dev/null || true)"
  fi
  if [[ -n "${name:-}" ]]; then
    echo "$name" | cut -d. -f1
  else
    echo "localhost"
  fi
}
install_packages() {
  # Install Slurm and its dependencies with whichever supported package
  # manager is present (pacman, dnf, apt-get — checked in that order).
  # role: "control" (controller — adds the accounting DB server) or anything
  # else (compute node — slurmd only).
  local role="${1:-control}"
  local pkgs
  if command -v pacman >/dev/null 2>&1; then
    pkgs=(munge openssl inetutils)
    if [[ "$role" == "control" ]]; then
      pkgs+=(slurm-llnl mariadb)
    else
      pkgs+=(slurm-llnl)
    fi
    pacman -Syy --noconfirm --needed "${pkgs[@]}"
    return
  fi
  if command -v dnf >/dev/null 2>&1; then
    pkgs=(munge openssl inetutils)
    if [[ "$role" == "control" ]]; then
      pkgs+=(slurm mariadb-server)
    else
      pkgs+=(slurm)
    fi
    dnf install -y --setopt=install_weak_deps=false "${pkgs[@]}"
    return
  fi
  if command -v apt-get >/dev/null 2>&1; then
    if [[ "$role" == "control" ]]; then
      pkgs=(slurm-wlm slurmdbd munge mariadb-server openssl hostname)
    else
      pkgs=(slurmd munge openssl hostname)
    fi
    apt-get update
    apt-get install -y "${pkgs[@]}"
    return
  fi
  echo "ERROR: no supported package manager found (pacman/dnf/apt-get)." >&2
  exit 1
}
require_cmd() {
  # Exit with an error unless $1 is an executable command on PATH.
  local name="$1"
  command -v "$name" >/dev/null 2>&1 && return 0
  echo "ERROR: missing required command: $name" >&2
  exit 1
}
b64_file() {
  # Emit the base64 encoding of file $1 as a single unwrapped line.
  local path="$1"
  # GNU coreutils base64 supports -w0 (no wrapping); other implementations
  # wrap at 76 columns, so strip the newlines manually instead.
  if base64 --help 2>/dev/null | grep -q -- ' -w'; then
    base64 -w0 "$path"
  else
    base64 "$path" | tr -d '\n'
  fi
}
slurmd_node_line() {
  # Print the first "NodeName=..." line reported by `slurmd -C` (this host's
  # detected hardware). Prints an empty line and returns 1 when slurmd is
  # missing or its output is unusable.
  command -v slurmd >/dev/null 2>&1 || { echo ""; return 1; }
  local probed
  probed="$(slurmd -C 2>/dev/null | head -n1 || true)"
  case "$probed" in
    NodeName=*) echo "$probed" ;;
    *) echo ""; return 1 ;;
  esac
}
apply_node_line_to_slurm_conf() {
  # Pure transformer:
  #   apply_node_line_to_slurm_conf <node_line> <slurm_conf_path>
  # Writes an updated copy of <slurm_conf_path> to a new temp file and prints
  # the temp file's path on stdout. Effects on the copy:
  #   - Replaces the NodeName line whose node matches <node_line>, or appends
  #     the new NodeName line when no match exists.
  #   - Ensures every PartitionName line's Nodes= list contains the node.
  #   - Appends a default "debug" partition when no PartitionName line exists.
  # Returns 2 on bad arguments.
  local node_line="${1:-}"
  local slurm_conf="${2:-}"
  if [[ -z "${node_line:-}" || "$node_line" != NodeName=* ]]; then
    echo "ERROR: apply_node_line_to_slurm_conf requires a NodeName=... line (got: ${node_line:-<empty>})" >&2
    return 2
  fi
  if [[ -z "${slurm_conf:-}" || ! -f "$slurm_conf" ]]; then
    echo "ERROR: apply_node_line_to_slurm_conf requires an existing slurm.conf path (got: ${slurm_conf:-<empty>})" >&2
    return 2
  fi
  local node_name
  node_name="$(echo "$node_line" | sed -n 's/^NodeName=\([^ ]\+\).*/\1/p')"
  if [[ -z "${node_name:-}" ]]; then
    echo "ERROR: could not parse NodeName from: $node_line" >&2
    return 2
  fi
  local out
  out="$(mktemp "${slurm_conf}.updated.XXXXXX")"
  # FIX: use the POSIX two-argument form of match(). The original used gawk's
  # three-argument match(line, re, m) — a hard syntax error under mawk and
  # busybox awk — and the capture array `m` was never read anyway; RSTART is
  # what the code actually uses.
  awk -v node_line="$node_line" -v node_name="$node_name" '
    # Append node to the Nodes= list of a PartitionName line (idempotent);
    # add a Nodes= field when the line has none.
    function add_to_nodes_list(line, node,    nodes, i, n, found, rest, newnodes) {
      if (match(line, /(^|[[:space:]])Nodes=[^[:space:]]+/)) {
        rest = substr(line, RSTART)
        sub(/^[[:space:]]*Nodes=/, "", rest)
        sub(/[[:space:]].*$/, "", rest)
        n = split(rest, nodes, /,/)
        found = 0
        for (i = 1; i <= n; i++) if (nodes[i] == node) found = 1
        newnodes = rest
        if (!found) newnodes = rest "," node
        sub(/(^|[[:space:]])Nodes=[^[:space:]]+/, " Nodes=" newnodes, line)
        return line
      }
      return line " Nodes=" node
    }
    BEGIN {
      wrote_node=0; saw_part=0
      node_count=0; buffered_first=0
      first_line=""; first_is_match=0
    }
    $0 ~ /^NodeName=/ {
      node_count++
      if (node_count==1) {
        # Buffer the first NodeName line until we know if this is a single-node config.
        first_line=$0
        first_is_match = ($0 ~ ("^NodeName=" node_name "( |$)"))
        buffered_first=1
        next
      }
      if (node_count==2 && buffered_first==1) {
        # Multi-node config: flush the buffered first NodeName line unchanged unless it matches.
        if (first_is_match) {
          print node_line
          wrote_node=1
        } else {
          print first_line
        }
        buffered_first=0
      }
      # Multi-node NodeName lines: replace only on exact match.
      if ($0 ~ ("^NodeName=" node_name "( |$)")) {
        print node_line
        wrote_node=1
        next
      }
      next
    }
    $0 ~ /^PartitionName=/ {
      print add_to_nodes_list($0, node_name)
      saw_part=1
      next
    }
    { print }
    END {
      if (buffered_first==1) {
        # Exactly one NodeName line in the file.
        # If it matches, replace it; otherwise keep it and append the new node.
        if (first_is_match) {
          print node_line
          wrote_node=1
        } else {
          print first_line
        }
        buffered_first=0
      }
      # If there were zero NodeName lines, or this is multi-node and the node was missing, append.
      if (wrote_node==0) print node_line
      if (saw_part==0) print "PartitionName=debug Nodes=" node_name " Default=YES MaxTime=INFINITE State=UP"
    }
  ' "$slurm_conf" >"$out"
  # chmod --reference is GNU-only; fall back to a sane default mode elsewhere.
  chmod --reference="$slurm_conf" "$out" >/dev/null 2>&1 || chmod 0644 "$out" || true
  printf '%s\n' "$out"
}
ssh_target() {
  # Print "user@host" for ssh/scp. Prefers connecting as the invoking
  # (non-root) user; root is obtained on the remote side via sudo.
  local host="$1"
  printf '%s@%s\n' "${SSH_USER:-${SUDO_USER:-user}}" "$host"
}
ssh_opts() {
  # Extra ssh options as a single string. accept-new is convenient for
  # first-time setup; set SSH_OPTS for stricter host-key behavior.
  printf '%s\n' "${SSH_OPTS:--o StrictHostKeyChecking=accept-new}"
}
prompt_remote_sudo_password() {
  # Emit on stdout the password to feed `sudo -S` on the remote host.
  # Emits nothing when connecting as root or SSH_SUDO_NO_PASSWORD=1;
  # otherwise uses SSH_SUDO_PASS, or interactively prompts on /dev/tty.
  local host="$1"
  local remote_user="${SSH_USER:-${SUDO_USER:-user}}"
  if [[ "$remote_user" == "root" || "${SSH_SUDO_NO_PASSWORD:-0}" == "1" ]]; then
    # No password needed when connecting as root or the caller opted out.
    return 0
  fi
  if [[ -n "${SSH_SUDO_PASS:-}" ]]; then
    printf '%s' "$SSH_SUDO_PASS"
    return 0
  fi
  if [[ ! -e /dev/tty ]]; then
    echo "ERROR: cannot prompt for remote sudo password (no /dev/tty)" >&2
    exit 1
  fi
  local secret
  read -r -s -p "Password for sudo on ${remote_user}@${host}: " secret </dev/tty
  printf '\n' >&2
  if [[ -z "${secret:-}" ]]; then
    echo "ERROR: password cannot be empty" >&2
    exit 1
  fi
  printf '%s' "$secret"
}
setup_munge() {
  # Ensure /etc/munge exists with strict permissions, generate a munge key
  # once (1024 random bytes, readable only by the munge user), start munged.
  local key=/etc/munge/munge.key
  install -d -m 0700 -o munge -g munge /etc/munge
  if [[ ! -f "$key" ]]; then
    dd if=/dev/urandom of="$key" bs=1 count=1024 status=none
    chown munge:munge "$key"
    chmod 0400 "$key"
  fi
  systemctl enable --now munge.service
}
setup_slurm_dirs() {
  # Spool and log directories, owned by the slurm user.
  local dir
  for dir in /var/spool/slurmctld /var/spool/slurmd /var/log/slurm; do
    install -d -m 0755 -o slurm -g slurm "$dir"
  done
}
write_slurm_conf() {
  # Render the minimal single-node slurm.conf from the current globals.
  # Reads: SLURM_ETC_DIR, SLURM_CONF, CLUSTER_NAME, CONTROL_HOST, NODE_NAME,
  # NODE_CPUS. NOTE: the "# Minimal single-node Slurm config" marker line is
  # load-bearing — uninstall() greps for it before deleting the file.
  install -d -m 0755 "$SLURM_ETC_DIR"
  cat >"$SLURM_CONF" <<EOF
# Minimal single-node Slurm config
ClusterName=$CLUSTER_NAME
SlurmctldHost=$CONTROL_HOST
SlurmUser=slurm
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
CryptoType=crypto/munge
StateSaveLocation=/var/spool/slurmctld
SlurmdSpoolDir=/var/spool/slurmd
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/run/slurmctld.pid
SlurmdPidFile=/run/slurmd.pid
ProctrackType=proctrack/cgroup
TaskPlugin=task/cgroup
ReturnToService=2
SlurmctldTimeout=120
SlurmdTimeout=300
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_Core
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=localhost
NodeName=$NODE_NAME CPUs=$NODE_CPUS State=UNKNOWN
PartitionName=debug Nodes=$NODE_NAME Default=YES MaxTime=INFINITE State=UP
EOF
  chmod 0644 "$SLURM_CONF"
  # Convenience symlink for tools/docs that expect /etc/slurm.conf
  # (intentionally disabled; uninstall() still knows how to remove it safely).
  # if [[ ! -e /etc/slurm.conf ]]; then
  #   ln -s "$SLURM_CONF" /etc/slurm.conf
  # fi
}
write_mariadb_instance_conf() {
  # Write the config for the dedicated MariaDB instance to $MARIADB_CNF.
  # FIX: create the parent directory of $MARIADB_CNF instead of a hard-coded
  # /etc/my.cnf.d, so the documented MARIADB_CNF override actually works.
  install -d -m 0755 "$(dirname "$MARIADB_CNF")"
  cat >"$MARIADB_CNF" <<EOF
[mysqld]
user=mysql
datadir=$MARIADB_DATADIR
socket=$MARIADB_SOCKET
port=$MARIADB_PORT
pid-file=$MARIADB_PIDFILE
bind-address=127.0.0.1
skip-name-resolve
[client]
socket=$MARIADB_SOCKET
EOF
  chmod 0644 "$MARIADB_CNF"
}
write_mariadb_instance_unit() {
  # Install a systemd service unit running the dedicated MariaDB instance
  # under its own defaults file.
  # FIX: this is NOT a template unit (no @ in the name), so the %i specifier
  # expands to an empty instance name; embed the instance name directly.
  local mariadbd_bin
  mariadbd_bin=$(command -v mariadbd 2>/dev/null || echo /usr/sbin/mariadbd)
  install -d -m 0755 /etc/systemd/system
  cat >"/etc/systemd/system/$MARIADB_INSTANCE_UNIT" <<EOF
[Unit]
Description=MariaDB instance for Slurm accounting (${MARIADB_INSTANCE_NAME})
After=network.target
[Service]
Type=notify
User=mysql
Group=mysql
ExecStartPre=/usr/bin/install -d -o mysql -g mysql -m 0755 /run/mysqld
ExecStart=${mariadbd_bin} --defaults-file=$MARIADB_CNF
Restart=on-failure
TimeoutStartSec=120
[Install]
WantedBy=multi-user.target
EOF
}
init_mariadb_instance() {
  # Initialize the datadir on first run, start the dedicated instance, then
  # wait (up to ~30s) until it answers a ping on its socket.
  install -d -m 0755 -o mysql -g mysql "$MARIADB_DATADIR"
  if [[ ! -d "$MARIADB_DATADIR/mysql" ]]; then
    # The presence of the system "mysql" schema marks an initialized datadir.
    mariadb-install-db --user=mysql --basedir=/usr --datadir="$MARIADB_DATADIR"
  fi
  systemctl daemon-reload
  systemctl enable --now "$MARIADB_INSTANCE_UNIT"
  local attempt
  for ((attempt = 0; attempt < 60; attempt++)); do
    if mariadb-admin --protocol=socket --socket="$MARIADB_SOCKET" ping >/dev/null 2>&1; then
      return 0
    fi
    sleep 0.5
  done
  echo "ERROR: MariaDB instance did not become ready." >&2
  exit 1
}
ensure_db_user_and_schema() {
  # Generate a DB password if the caller did not supply one, then create the
  # accounting database and grant the Slurm DB user access. Idempotent.
  if [[ -z "$SLURM_DB_PASS" ]]; then
    # base64 of 24 random bytes -> 32 chars; '/' and '+' remapped to '_'/'-'
    # so the value is shell/config-safe.
    SLURM_DB_PASS="$(openssl rand -base64 24 | tr -d '\n' | tr '/+' '_-' | cut -c1-32)"
    echo "Generated SLURM_DB_PASS=$SLURM_DB_PASS"
    echo "Save it somewhere safe; it is written into $SLURMDBD_CONF"
  fi
  # Create DB and user if missing; idempotent. User is granted on both
  # 'localhost' (socket) and '127.0.0.1' (TCP, used by slurmdbd's StorageHost).
  # NOTE(review): SLURM_DB_USER/NAME/PASS are interpolated directly into the
  # SQL; a caller-supplied password containing a single quote would break (or
  # inject into) these statements — consider validating the value.
  mariadb --protocol=socket --socket="$MARIADB_SOCKET" <<EOF
CREATE DATABASE IF NOT EXISTS \`$SLURM_DB_NAME\`;
CREATE USER IF NOT EXISTS '$SLURM_DB_USER'@'localhost' IDENTIFIED BY '$SLURM_DB_PASS';
ALTER USER '$SLURM_DB_USER'@'localhost' IDENTIFIED BY '$SLURM_DB_PASS';
GRANT ALL PRIVILEGES ON \`$SLURM_DB_NAME\`.* TO '$SLURM_DB_USER'@'localhost';
CREATE USER IF NOT EXISTS '$SLURM_DB_USER'@'127.0.0.1' IDENTIFIED BY '$SLURM_DB_PASS';
ALTER USER '$SLURM_DB_USER'@'127.0.0.1' IDENTIFIED BY '$SLURM_DB_PASS';
GRANT ALL PRIVILEGES ON \`$SLURM_DB_NAME\`.* TO '$SLURM_DB_USER'@'127.0.0.1';
FLUSH PRIVILEGES;
EOF
}
write_slurmdbd_conf() {
  # Render slurmdbd.conf pointing at the dedicated MariaDB instance.
  # Reads: SLURM_ETC_DIR, SLURMDBD_CONF, MARIADB_PORT, SLURM_DB_USER,
  # SLURM_DB_PASS, SLURM_DB_NAME. NOTE: the "# SlurmDBD configuration
  # (MariaDB local)" marker line is load-bearing — uninstall() greps for it
  # before deleting the file.
  install -d -m 0755 "$SLURM_ETC_DIR"
  cat >"$SLURMDBD_CONF" <<EOF
# SlurmDBD configuration (MariaDB local)
AuthType=auth/munge
DbdHost=localhost
DbdPort=6819
SlurmUser=slurm
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/run/slurmdbd/slurmdbd.pid
StorageType=accounting_storage/mysql
StorageHost=127.0.0.1
StoragePort=$MARIADB_PORT
StorageUser=$SLURM_DB_USER
StoragePass=$SLURM_DB_PASS
StorageLoc=$SLURM_DB_NAME
EOF
  # slurmdbd.service runs as slurm:slurm on Arch; it must be able to read this
  # file — and 0600 keeps StoragePass away from other users.
  chown slurm:slurm "$SLURMDBD_CONF"
  chmod 0600 "$SLURMDBD_CONF"
  # Convenience symlink for tools/docs that expect /etc/slurmdbd.conf
  # (intentionally disabled; uninstall() still knows how to remove it safely).
  # if [[ ! -e /etc/slurmdbd.conf ]]; then
  #   ln -s "$SLURMDBD_CONF" /etc/slurmdbd.conf
  # fi
}
write_slurmdbd_systemd_dropin() {
  # Arch's slurmdbd.service only depends on mariadb.service/mysql.service.
  # When using a dedicated instance (e.g. mariadb-slurm.service), add explicit
  # Wants=/After= ordering via a drop-in so slurmdbd starts after our DB.
  install -d -m 0755 /etc/systemd/system/slurmdbd.service.d
  cat >"/etc/systemd/system/slurmdbd.service.d/override.conf" <<EOF
[Unit]
Wants=$MARIADB_INSTANCE_UNIT
After=$MARIADB_INSTANCE_UNIT
EOF
  systemctl daemon-reload
}
start_slurm_services() {
  # Enable and start the Slurm daemons, slurmdbd first so accounting is up
  # before the controller.
  local svc
  for svc in slurmdbd slurmctld slurmd; do
    systemctl enable --now "${svc}.service"
  done
}
safe_rm_symlink() {
  # Remove $1 only when it is a symlink whose fully-resolved target equals $2.
  # Anything else (regular file, missing, different target) is left alone.
  local link="$1" want="$2" have
  [[ -L "$link" ]] || return 0
  have="$(readlink -f "$link" 2>/dev/null || true)"
  if [[ -n "${have:-}" && "$have" == "$want" ]]; then
    rm -f "$link"
  fi
}
uninstall() {
  # Reverse what setup created: stop/disable services, remove the unit,
  # drop-in, instance cnf, and (marker-checked) Slurm configs.
  # With --purge, additionally delete spools/logs and the MariaDB datadir.
  require_root
  local purge=0
  if [[ "${1:-}" == "--purge" ]]; then
    purge=1
  fi
  echo "Stopping/disabling Slurm services..."
  systemctl disable --now slurmd.service slurmctld.service slurmdbd.service >/dev/null 2>&1 || true
  echo "Stopping/disabling dedicated MariaDB instance ($MARIADB_INSTANCE_UNIT)..."
  systemctl disable --now "$MARIADB_INSTANCE_UNIT" >/dev/null 2>&1 || true
  echo "Removing systemd drop-ins/units created by this script..."
  rm -f "/etc/systemd/system/$MARIADB_INSTANCE_UNIT" 2>/dev/null || true
  rm -f "/etc/systemd/system/slurmdbd.service.d/override.conf" 2>/dev/null || true
  # rmdir (not rm -rf): only removes the drop-in dir when it is now empty,
  # so user-added drop-ins survive.
  rmdir "/etc/systemd/system/slurmdbd.service.d" 2>/dev/null || true
  echo "Removing MariaDB instance config created by this script..."
  rm -f "$MARIADB_CNF" 2>/dev/null || true
  echo "Removing Slurm configs created by this script (safe check)..."
  # Only delete configs that carry the marker comment our writers emit —
  # never a hand-maintained slurm.conf/slurmdbd.conf.
  if [[ -f "$SLURM_CONF" ]] && grep -q '^# Minimal single-node Slurm config' "$SLURM_CONF"; then
    rm -f "$SLURM_CONF"
  fi
  if [[ -f "$SLURMDBD_CONF" ]] && grep -q '^# SlurmDBD configuration (MariaDB local)' "$SLURMDBD_CONF"; then
    rm -f "$SLURMDBD_CONF"
  fi
  # Remove symlinks only if they point to our files.
  safe_rm_symlink /etc/slurm.conf "$SLURM_CONF"
  safe_rm_symlink /etc/slurmdbd.conf "$SLURMDBD_CONF"
  if [[ "$purge" -eq 1 ]]; then
    echo "Purging Slurm spools/logs and MariaDB datadir..."
    rm -rf /var/spool/slurmctld /var/spool/slurmd /var/log/slurm 2>/dev/null || true
    if [[ -n "${MARIADB_DATADIR:-}" ]]; then
      rm -rf "$MARIADB_DATADIR" 2>/dev/null || true
    fi
  elif [ -d "$MARIADB_DATADIR" ]; then
    echo "NOTE: Keeping ${MARIADB_DATADIR:-<unset>} (pass --purge to delete)."
  fi
  systemctl daemon-reload
  echo "OK: uninstall completed."
}
update_node_config() {
  # CLI behavior for --update: detect this host's NodeName line from
  # `slurmd -C`, rewrite slurm.conf via apply_node_line_to_slurm_conf
  # (keeping a timestamped backup), restart Slurm, and clear any DRAIN state
  # caused by the old config.
  require_root
  if [[ ! -f "$SLURM_CONF" ]]; then
    echo "ERROR: $SLURM_CONF does not exist." >&2
    exit 1
  fi
  if ! command -v slurmd >/dev/null 2>&1; then
    echo "ERROR: slurmd not found. Install slurm-llnl first." >&2
    exit 1
  fi
  local detected
  detected="$(slurmd -C 2>/dev/null | head -n1 || true)"
  if [[ -z "${detected:-}" || "$detected" != NodeName=* ]]; then
    echo "ERROR: failed to get NodeName line from 'slurmd -C'." >&2
    exit 1
  fi
  local detected_node
  detected_node="$(echo "$detected" | sed -n 's/^NodeName=\([^ ]\+\).*/\1/p')"
  if [[ -z "${detected_node:-}" ]]; then
    echo "ERROR: could not parse NodeName from: $detected" >&2
    exit 1
  fi
  local backup
  backup="${SLURM_CONF}.bak.$(date +%Y%m%d%H%M%S)"
  cp -a "$SLURM_CONF" "$backup"
  # Transform the backup copy; only replace the live file once the updated
  # temp file definitely exists.
  local updated
  updated="$(apply_node_line_to_slurm_conf "$detected" "$backup")"
  if [[ -z "${updated:-}" || ! -f "$updated" ]]; then
    echo "ERROR: failed to generate updated slurm.conf" >&2
    exit 1
  fi
  mv "$updated" "$SLURM_CONF"
  chmod 0644 "$SLURM_CONF" || true
  echo "Updated $SLURM_CONF using: $detected"
  # Apply changes (best-effort; services may not both be present).
  systemctl restart slurmctld.service slurmd.service >/dev/null 2>&1 || true
  # Clear DRAIN if it was caused by old config mismatch.
  scontrol update NodeName="$detected_node" State=RESUME >/dev/null 2>&1 || true
  echo "OK: node config updated. Backup: $backup"
}
add_node() {
  # CLI behavior for --add-node <host>:
  #   Phase 1: upload this script to <host>, run it with --probe-node-line
  #            under sudo, and parse the "NODELINE:" marker it prints.
  #   Phase 2: merge that NodeName line into the local (controller) slurm.conf
  #            and restart the controller daemons.
  #   Phase 3: rerun the uploaded script with --node, shipping the munge key
  #            and updated slurm.conf base64-encoded via the remote env.
  require_root
  require_cmd ssh
  require_cmd base64
  local host="${1:-}"
  if [[ -z "${host:-}" ]]; then
    echo "ERROR: --add-node requires a host argument" >&2
    usage
    exit 2
  fi
  if [[ ! -f "$SLURM_CONF" ]]; then
    echo "ERROR: $SLURM_CONF does not exist. Run setup first." >&2
    exit 1
  fi
  if [[ ! -r /etc/munge/munge.key ]]; then
    echo "ERROR: /etc/munge/munge.key not readable. Run setup_munge first." >&2
    exit 1
  fi
  echo "Configuring remote node via SSH (sudo on remote): $(ssh_target "$host")"
  local ssh_opts_array=()
  # Intentional word-splitting of the ssh_opts string into an array.
  read -r -a ssh_opts_array <<< "$(ssh_opts)"
  # ControlMaster multiplexing: one authenticated connection reused by every
  # ssh/scp call below (avoids repeated password/key prompts).
  local control_dir
  control_dir="$(mktemp -d)"
  if [[ -z "${control_dir:-}" || ! -d "$control_dir" ]]; then
    echo "ERROR: could not create control directory" >&2
    exit 1
  fi
  local control_path="${control_dir}/control-master.sock"
  local ssh_all_opts=("${ssh_opts_array[@]}" -o ControlMaster=auto -o ControlPersist=300s -o ControlPath="$control_path")
  # Helper wrappers; NOTE(review): bash functions are global, so these remain
  # defined after add_node returns.
  ssh_exec() {
    ssh "${ssh_all_opts[@]}" "$@"
  }
  scp_exec() {
    scp "${ssh_all_opts[@]}" "$@"
  }
  cleanup_control() {
    ssh_exec -O exit "$(ssh_target "$host")" >/dev/null 2>&1 || true
    rm -rf "$control_dir"
  }
  # RETURN trap fires when add_node returns (including via exit paths through
  # the function), tearing down the multiplexed connection.
  trap cleanup_control RETURN
  # Upload this script to the remote node, then run it under sudo.
  local self
  self="$(readlink -f "$0" 2>/dev/null || echo "$0")"
  if [[ ! -f "$self" ]]; then
    echo "ERROR: could not resolve script path for copying (got: $self)" >&2
    exit 1
  fi
  # Step 1: create a temp script file on the remote host.
  local remote_script
  remote_script="$(ssh_exec "$(ssh_target "$host")" "mktemp /tmp/slurm-setup-node.XXXXXX.sh")"
  # Strip CR/LF that a remote TTY may append to the captured path.
  remote_script="${remote_script//$'\r'/}"
  remote_script="${remote_script//$'\n'/}"
  if [[ -z "${remote_script:-}" ]]; then
    echo "ERROR: failed to create remote temp file" >&2
    exit 1
  fi
  # Step 2: upload the script as the SSH user (no sudo needed).
  scp_exec "$self" "$(ssh_target "$host"):$remote_script"
  ssh_exec "$(ssh_target "$host")" "chmod 0700 '$remote_script'"
  # Step 3: run it with sudo in a TTY so sudo can prompt for a password if needed.
  local remote_sudo_pass
  remote_sudo_pass="$(prompt_remote_sudo_password "$host")"
  # Phase 1: probe remote node hardware to get its NodeName line.
  local probe_cmd
  probe_cmd="bash '$remote_script' --probe-node-line"
  local probe_out
  if [[ -n "${remote_sudo_pass:-}" ]]; then
    # Feed the password to `sudo -S` on stdin; empty -p suppresses the prompt.
    probe_out="$(printf '%s\n' "$remote_sudo_pass" | ssh_exec "$(ssh_target "$host")" "sudo -S -p '' $probe_cmd")"
  else
    probe_out="$(ssh_exec -tt "$(ssh_target "$host")" "sudo $probe_cmd")"
  fi
  local node_line
  # node_probe_mode prints "NODELINE:<line>"; take the last occurrence in
  # case package-manager output interleaves.
  node_line="$(echo "$probe_out" | sed -n 's/^NODELINE://p' | tail -n1)"
  if [[ -z "${node_line:-}" || "$node_line" != NodeName=* ]]; then
    echo "ERROR: failed to obtain NodeName line from remote slurmd -C" >&2
    echo "Remote output:" >&2
    echo "$probe_out" >&2
    exit 1
  fi
  local node_name
  node_name="$(echo "$node_line" | sed -n 's/^NodeName=\([^ ]\+\).*/\1/p')"
  if [[ -z "${node_name:-}" ]]; then
    echo "ERROR: could not parse NodeName from: $node_line" >&2
    exit 1
  fi
  # Phase 2: update controller slurm.conf locally using the probed node_line.
  # - add/replace a NodeName line for this node
  # - ensure PartitionName Nodes includes this node
  local backup
  backup="${SLURM_CONF}.bak.add-node.$(date +%Y%m%d%H%M%S)"
  cp -a "$SLURM_CONF" "$backup"
  local updated
  updated="$(apply_node_line_to_slurm_conf "$node_line" "$backup")"
  if [[ -z "${updated:-}" || ! -f "$updated" ]]; then
    echo "ERROR: failed to generate updated slurm.conf for add-node" >&2
    exit 1
  fi
  mv "$updated" "$SLURM_CONF"
  chmod 0644 "$SLURM_CONF" || true
  echo "Added/updated node in controller config: $node_name"
  echo "Restarting controller services..."
  systemctl restart slurmctld.service slurmd.service >/dev/null 2>&1 || true
  # Phase 3: send updated config + munge key to the remote node and start services.
  # Base64 keeps the binary munge key and multi-line config safe inside a
  # single-quoted env assignment (base64 output never contains quotes).
  local munge_key_b64
  local slurm_conf_b64
  munge_key_b64="$(b64_file /etc/munge/munge.key)"
  slurm_conf_b64="$(b64_file "$SLURM_CONF")"
  local node_cmd
  node_cmd="env MUNGE_KEY_B64='$munge_key_b64' SLURM_CONF_B64='$slurm_conf_b64' bash '$remote_script' --node"
  local node_out
  if [[ -n "${remote_sudo_pass:-}" ]]; then
    node_out="$(printf '%s\n' "$remote_sudo_pass" | ssh_exec "$(ssh_target "$host")" "sudo -S -p '' $node_cmd")"
  else
    node_out="$(ssh_exec -tt "$(ssh_target "$host")" "sudo $node_cmd")"
  fi
  # Cleanup remote script after successful runs.
  ssh_exec "$(ssh_target "$host")" "rm -f '$remote_script'" >/dev/null 2>&1 || true
  # Drop the password from memory as soon as it is no longer needed.
  remote_sudo_pass=""
  # Clear DRAIN if it was caused by old config mismatch.
  scontrol update NodeName="$node_name" State=RESUME >/dev/null 2>&1 || true
  echo "OK: node added. Backup: $backup"
}
node_probe_mode() {
  # Compute-node helper (run remotely by --add-node): install slurmd, then
  # report this machine's `slurmd -C` hardware line as "NODELINE:<line>" so
  # the controller can parse it out of mixed remote output.
  require_root
  install_packages node
  local probed
  probed="$(slurmd_node_line 2>/dev/null || true)"
  case "$probed" in
    NodeName=*)
      echo "NODELINE:$probed"
      ;;
    *)
      echo "ERROR: failed to get NodeName line from 'slurmd -C'." >&2
      exit 1
      ;;
  esac
}
node_mode() {
  # CLI behavior for --node; intended to be executed ON a compute node
  # (usually via --add-node). Requires MUNGE_KEY_B64 and SLURM_CONF_B64 in
  # the environment: installs packages, the munge key, and slurm.conf, then
  # patches the local NodeName line from `slurmd -C` and starts services.
  require_root
  install_packages node
  if [[ -z "${MUNGE_KEY_B64:-}" ]]; then
    echo "ERROR: MUNGE_KEY_B64 is required in --node mode" >&2
    exit 2
  fi
  if [[ -z "${SLURM_CONF_B64:-}" ]]; then
    echo "ERROR: SLURM_CONF_B64 is required in --node mode" >&2
    exit 2
  fi
  # Install munge key (decode to a temp file, then atomically install with
  # the strict ownership/mode munged requires).
  install -d -m 0700 -o munge -g munge /etc/munge
  local tmp_key
  tmp_key="$(mktemp)"
  printf '%s' "$MUNGE_KEY_B64" | base64 -d >"$tmp_key"
  install -m 0400 -o munge -g munge "$tmp_key" /etc/munge/munge.key
  rm -f "$tmp_key"
  # Install slurm.conf (then patch NodeName from slurmd -C before starting slurmd).
  install -d -m 0755 "$SLURM_ETC_DIR"
  local tmp_conf
  tmp_conf="$(mktemp)"
  printf '%s' "$SLURM_CONF_B64" | base64 -d >"$tmp_conf"
  install -m 0644 -o root -g root "$tmp_conf" "$SLURM_CONF"
  rm -f "$tmp_conf"
  setup_slurm_dirs
  local node_line
  node_line="$(slurmd_node_line 2>/dev/null || true)"
  if [[ -n "${node_line:-}" && "$node_line" == NodeName=* ]]; then
    local backup
    backup="${SLURM_CONF}.bak.slurmdC.$(date +%Y%m%d%H%M%S)"
    cp -a "$SLURM_CONF" "$backup"
    # Best-effort: keep the controller-shipped config when patching fails.
    local updated
    updated="$(apply_node_line_to_slurm_conf "$node_line" "$backup")" || true
    if [[ -n "${updated:-}" && -f "$updated" ]]; then
      mv "$updated" "$SLURM_CONF"
      chmod 0644 "$SLURM_CONF" || true
      echo "Updated $SLURM_CONF from slurmd -C (backup: $backup)"
    fi
  fi
  systemctl enable munge.service slurmd.service
  systemctl restart munge.service slurmd.service
  # Echo the detected line for the controller's logs (may be empty when
  # slurmd -C failed above).
  echo "NODELINE:$node_line"
}
register_cluster() {
  # Wait up to ~10s for slurmdbd to answer sacctmgr queries; if a cluster
  # record already shows up, nothing to do. Otherwise attempt an idempotent
  # create (errors ignored when it already exists).
  local i
  for ((i = 0; i < 20; i++)); do
    if sacctmgr -n show cluster 2>/dev/null | grep -q .; then
      return 0
    fi
    sleep 0.5
  done
  sacctmgr -i add cluster "$CLUSTER_NAME" >/dev/null 2>&1 || true
}
main() {
  # Default CLI path: full single-node controller setup on this machine.
  require_root
  install_packages
  # Default both the controller host and the node name to this machine's
  # short hostname.
  if [[ -z "${CONTROL_HOST:-}" ]]; then
    CONTROL_HOST="$(short_hostname)"
  fi
  if [[ -z "${NODE_NAME:-}" ]]; then
    NODE_NAME="$CONTROL_HOST"
  fi
  setup_munge
  setup_slurm_dirs
  write_slurm_conf
  # Before starting slurmd (local node), rewrite NodeName line to match actual
  # hardware as reported by `slurmd -C`. Best-effort: a probe/patch failure
  # leaves the freshly written config in place.
  # NOTE(review): node_line is intentionally(?) not declared local here.
  if node_line="$(slurmd_node_line 2>/dev/null)"; then
    local backup
    backup="${SLURM_CONF}.bak.slurmdC.$(date +%Y%m%d%H%M%S)"
    cp -a "$SLURM_CONF" "$backup"
    local updated
    updated="$(apply_node_line_to_slurm_conf "$node_line" "$backup")" || true
    if [[ -n "${updated:-}" && -f "$updated" ]]; then
      mv "$updated" "$SLURM_CONF"
      chmod 0644 "$SLURM_CONF" || true
      echo "Updated $SLURM_CONF from slurmd -C (backup: $backup)"
    fi
  fi
  # Accounting stack: dedicated MariaDB instance -> schema/user -> slurmdbd.
  write_mariadb_instance_conf
  write_mariadb_instance_unit
  init_mariadb_instance
  ensure_db_user_and_schema
  write_slurmdbd_conf
  write_slurmdbd_systemd_dropin
  start_slurm_services
  register_cluster
  echo "OK: Slurm services enabled and started."
  echo "Next checks: scontrol ping; sinfo; squeue; sacctmgr show cluster"
}
# ---- CLI dispatch -----------------------------------------------------------
# Flag-handled modes exit directly; anything else (including no arguments)
# falls through to main for the full controller setup.
# NOTE(review): an unrecognized flag (e.g. --frobnicate) also falls through
# and silently runs the full setup — consider rejecting unknown "--" options.
case "${1:-}" in
  -h|--help)
    usage
    exit 0
    ;;
  --update)
    update_node_config
    exit 0
    ;;
  --add-node)
    add_node "${2:-}"
    exit 0
    ;;
  --node)
    node_mode
    exit 0
    ;;
  --probe-node-line)
    node_probe_mode
    exit 0
    ;;
  --uninstall)
    uninstall "${2:-}"
    exit 0
    ;;
esac
main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment