Skip to content

Instantly share code, notes, and snippets.

@bio-punk
Created February 10, 2026 18:56
Show Gist options
  • Select an option

  • Save bio-punk/0cd8ce2cb2396b813e4bc2d2516a62e9 to your computer and use it in GitHub Desktop.

Select an option

Save bio-punk/0cd8ce2cb2396b813e4bc2d2516a62e9 to your computer and use it in GitHub Desktop.
slurm multi node IB check #slurm
# Slurm batch directives for the multi-node InfiniBand / NCCL check below:
#   --nodes=4     request four nodes
#   -p gpu_h100   submit to the "gpu_h100" partition
#   --gres=gpu:1  request one GPU generic resource per node
#   --qos=gpugpu  run under the "gpugpu" quality of service
# NOTE(review): no "#!" interpreter line is visible above these directives in
# this view; sbatch normally expects one as the first line of the script.
#SBATCH --nodes=4
#SBATCH -p gpu_h100
#SBATCH --gres=gpu:1
#SBATCH --qos=gpugpu
# NOTE(review): the next directive carries no option; presumably a leftover
# placeholder. Confirm whether an option was lost here.
#SBATCH
# ---------------------------------------------------------------------------
# InfiniBand pre-flight check.
#
# For every entry of NCCL_IB_HCA (comma-separated "device[:port]" items) run
# ibstat once per allocated node, append the per-node summary to a
# job-specific log, then abort the job with exit code 10 if any mlx5_* device
# is reported as Down.
# ---------------------------------------------------------------------------
ib_log="H100_MRPC_32_512_1024_compare-$SLURM_JOB_ID-ib_dev.log"

# Print every line of log file $1 that contains both "down" (any case) and
# "mlx5_". Returns 0 only when at least one such line exists; a missing log
# file counts as "nothing down".
ib_down_devices() {
  [ -f "$1" ] || return 1
  grep -i down -- "$1" | grep mlx5_
}

echo ""
echo "=================================================================="
echo "===================== START: IB check ====================="
echo "=================================================================="
# Word splitting of the tr output is intentional: one loop pass per HCA entry.
for dev in $(echo "$NCCL_IB_HCA" | tr ',' ' '); do
  dev_name=${dev%%:*}
  # Only treat the text after ':' as a port when the entry actually has one;
  # without a port ibstat is called with the device name alone.
  case "$dev" in
    *:*)
      dev_port=${dev#*:}
      dev_port=${dev_port%%:*}
      ;;
    *)
      dev_port=
      ;;
  esac
  # $dev_name/$dev_port are expanded here on the batch host on purpose.
  # Everything written as \$ is expanded by the remote bash on each node, so
  # hostname must be escaped to report the node the task really ran on.
  srun --label --ntasks="$SLURM_NNODES" --ntasks-per-node=1 bash -c "
    nodename=\$(hostname)
    res=\$(ibstat $dev_name $dev_port | grep -e 'CA' -e 'State' -e 'Rate' -e 'Link layer' |cut -d':' -f2|xargs echo)
    echo \"\$nodename \$res\"
  " 2>&1 | tee -a "$ib_log"
done

echo "问题IB设备如下:"
ib_down_devices "$ib_log"
# Capture the detection status right away; the following echo would
# otherwise overwrite $? before it is tested.
ib_down_found=$?
echo "done......"
if [ "$ib_down_found" -eq 0 ]; then
  echo "检测到 IB 设备 Down, 具体节点和设备如上, 退出作业..."
  exit 10
fi
# Print, for every allocated node, its hostname followed by the lines of
# `ip a` that contain the literal text 10.252.
# The remote command is single-quoted so the inner quotes around the pattern
# survive, and grep -F keeps the '.' from acting as a regex wildcard.
srun --label --ntasks="$SLURM_NNODES" --ntasks-per-node=1 \
  bash -c 'hostname && ip a | grep -F "10.252"'

# Echo the launch parameters read from the environment for the job log.
echo "MASTER_ADDR: $MASTER_ADDR"
echo "MASTER_PORT: $MASTER_PORT"
echo "GPUS: $GPUS_PRE_NODE"
echo "SLURM_NNODES: $SLURM_NNODES"
# ---------------------------------------------------------------------------
# NCCL all-reduce benchmark across all allocated nodes.
#
# Reads MASTER_ADDR, GPUS_PRE_NODE and SLURM_NNODES from the environment,
# builds an mpirun host list with GPUS_PRE_NODE slots per host (MASTER_ADDR
# first) and runs ./build/all_reduce_perf from the nccl-tests directory.
# ---------------------------------------------------------------------------
echo "NCCL TESTS"
nowpath=$(pwd)

# Print "<master>:<slots>[,<node>:<slots>...]" for mpirun -H.
#   $1    master host, always listed first
#   $2    slots per host
#   $3... candidate hosts; any entry equal to the master is skipped
# The body is a subshell so its working variables do not leak into the script.
nccl_build_hostlist() (
  master=$1
  slots=$2
  shift 2
  list="$master:$slots"
  for node in "$@"; do
    if [ "$node" != "$master" ]; then
      list="$list,$node:$slots"
    fi
  done
  printf '%s\n' "$list"
)

# Only run the benchmark when the directory change succeeded; otherwise
# mpirun would look for ./build/all_reduce_perf in the wrong place.
if cd nccl-tests; then
  module load openmpi/4.1.5_cuda12.8 nccl/2.26_cuda12.8
  # make MPI=1 MPI_HOME=/data/apps/openmpi/4.1.5_cuda12.8 CUDA_HOME=/data/apps/cuda/12.8 NCCL_HOME=/data/apps/nccl/nccl-2.26.6-1
  # shellcheck disable=SC2046 -- one argument per hostname is intended
  hostlist=$(nccl_build_hostlist "$MASTER_ADDR" "$GPUS_PRE_NODE" $(scontrol show hostnames))
  echo "HOSTLIST: $hostlist"
  WORLD_SIZE=$((SLURM_NNODES * GPUS_PRE_NODE))
  mpirun -H "$hostlist" -n "$WORLD_SIZE" ./build/all_reduce_perf -b 1G -e 8G -f 2 -g 1
  cd "$nowpath" || echo "WARNING: cannot return to $nowpath" >&2
else
  echo "ERROR: cannot enter nccl-tests directory, skipping NCCL benchmark" >&2
fi
module purge
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment