Created
February 10, 2026 18:56
-
-
Save bio-punk/0cd8ce2cb2396b813e4bc2d2516a62e9 to your computer and use it in GitHub Desktop.
slurm multi node IB check #slurm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#SBATCH --nodes=4
#SBATCH -p gpu_h100
#SBATCH --gres=gpu:1
#SBATCH --qos=gpugpu
#SBATCH
#
# Multi-node InfiniBand sanity check followed by an NCCL all-reduce benchmark.
#
# Steps:
#   1. For every entry of NCCL_IB_HCA, run `ibstat` on one task per node and
#      append the labelled output to a per-job log file.
#   2. If any logged mlx5_ line mentions "down", print it and exit with 10.
#   3. Print each node's addresses matching 10.252 and the launch settings.
#   4. Run nccl-tests' all_reduce_perf through mpirun across all job nodes.
#
# Environment read by this script (none of them is set here):
#   NCCL_IB_HCA    comma-separated entries of the form name or name:port.
#                  NOTE(review): NCCL's "^" / "=" prefixes are not interpreted
#                  here; entries are assumed to be plain device names.
#   MASTER_ADDR    host placed first in the mpirun host list (required).
#   GPUS_PRE_NODE  slots per host given to mpirun (required).
#   MASTER_PORT    only echoed.
#   SLURM_JOB_ID, SLURM_NNODES  read from the job environment.
#
# The bare "#SBATCH" line above carries no option; it is kept unchanged.
# Strict mode (set -eu) is deliberately not enabled: several commands below
# (grep without matches, optional environment variables) fail or are empty
# in normal operation.

#######################################
# Print the device-name part of one NCCL_IB_HCA entry.
# Arguments: $1 - entry such as "mlx5_0" or "mlx5_0:1"
# Outputs:   the text before the first ':' (the whole entry if there is none)
#######################################
hca_name() {
  printf '%s\n' "${1%%:*}"
}

#######################################
# Print the port part of one NCCL_IB_HCA entry.
# Arguments: $1 - entry such as "mlx5_0" or "mlx5_0:1"
# Outputs:   the text after the first ':'; an empty line when the entry has
#            no ':' (cut -f2 would have echoed the whole entry in that case)
#######################################
hca_port() {
  case "$1" in
    *:*) printf '%s\n' "${1#*:}" ;;
    *) printf '\n' ;;
  esac
}

#######################################
# Run ibstat for one device on one task per node and append the output to a log.
# Globals:   SLURM_NNODES (read)
# Arguments: $1 - device name, $2 - port (may be empty), $3 - log file
# Outputs:   labelled "<hostname> <ibstat fields>" lines on stdout and in $3
#######################################
probe_ib_device() {
  local dev_name=$1 dev_port=$2 log_file=$3
  local -a ibstat_args=("$dev_name")
  if [[ -n "$dev_port" ]]; then
    ibstat_args+=("$dev_port")
  fi
  # The remote script is single-quoted so that $(hostname) and $(ibstat ...)
  # are evaluated by each task on its own node, not by the batch shell.
  # Device name and port travel as positional parameters instead of being
  # spliced into the script text.
  srun --label --ntasks="$SLURM_NNODES" --ntasks-per-node=1 bash -c '
    nodename=$(hostname)
    res=$(ibstat "$@" | grep -e "CA" -e "State" -e "Rate" -e "Link layer" | cut -d: -f2 | xargs echo)
    echo "$nodename $res"
  ' _ "${ibstat_args[@]}" 2>&1 | tee -a "$log_file"
}

#######################################
# Print the log lines that report a down mlx5_ device.
# Arguments: $1 - log file written by probe_ib_device
# Outputs:   matching lines on stdout
# Returns:   0 if at least one line matched, non-zero otherwise
#            (including when the log file does not exist)
#######################################
find_down_ib_devices() {
  local log_file=$1
  [[ -r "$log_file" ]] || return 1
  grep -i down "$log_file" | grep mlx5_
}

#######################################
# Build the host list passed to `mpirun -H`.
# Arguments: $1 - master host, $2 - slots per host, $3... - all job hosts
# Outputs:   "master:slots,hostA:slots,..." with the master listed first and
#            not repeated
#######################################
build_hostlist() {
  local master=$1 slots=$2
  shift 2
  local hostlist="${master}:${slots}"
  local node
  for node in "$@"; do
    if [[ "$node" != "$master" ]]; then
      hostlist+=",${node}:${slots}"
    fi
  done
  printf '%s\n' "$hostlist"
}

#######################################
# Probe every NCCL_IB_HCA entry and abort the job when a device is down.
# Globals:   NCCL_IB_HCA, SLURM_JOB_ID, SLURM_NNODES (read)
# Outputs:   banner, probe results and any down devices on stdout
# Returns:   0 when no down device was found; exits the shell with 10 otherwise
#######################################
check_ib() {
  local log_file="H100_MRPC_32_512_1024_compare-${SLURM_JOB_ID}-ib_dev.log"
  local -a hca_entries=()
  local dev

  echo ""
  echo "=================================================================="
  echo "===================== START: IB check ====================="
  echo "=================================================================="

  # Split on commas and blanks, like the former `tr ',' ' '` word splitting.
  IFS=', ' read -r -a hca_entries <<<"${NCCL_IB_HCA:-}"
  for dev in "${hca_entries[@]}"; do
    [[ -n "$dev" ]] || continue
    probe_ib_device "$(hca_name "$dev")" "$(hca_port "$dev")" "$log_file"
  done

  echo "问题IB设备如下:"
  # Record the grep result before any other command overwrites $?.
  local down_found=1
  if find_down_ib_devices "$log_file"; then
    down_found=0
  fi
  echo "done......"
  if ((down_found == 0)); then
    echo "检测到 IB 设备 Down, 具体节点和设备如上, 退出作业..."
    exit 10
  fi
}

#######################################
# Print per-node addresses matching 10.252 and the launch settings.
# Globals:   SLURM_NNODES, MASTER_ADDR, MASTER_PORT, GPUS_PRE_NODE (read)
#######################################
show_network_info() {
  # -F: "10.252" is meant literally, not as a regular expression.
  srun --label --ntasks="$SLURM_NNODES" --ntasks-per-node=1 \
    bash -c 'hostname && ip a | grep -F "10.252"'
  echo "MASTER_ADDR: $MASTER_ADDR"
  echo "MASTER_PORT: $MASTER_PORT"
  echo "GPUS: $GPUS_PRE_NODE"
  echo "SLURM_NNODES: $SLURM_NNODES"
}

#######################################
# Run all_reduce_perf from ./nccl-tests across every node of the job.
# Globals:   MASTER_ADDR, GPUS_PRE_NODE, SLURM_NNODES (read); WORLD_SIZE (written)
# Returns:   status of `module purge`; exits with 1 when a required variable
#            is missing or a directory cannot be entered
#######################################
run_nccl_tests() {
  echo "NCCL TESTS"
  if [[ -z "${MASTER_ADDR:-}" || -z "${GPUS_PRE_NODE:-}" ]]; then
    echo "error: MASTER_ADDR and GPUS_PRE_NODE must be set to run the NCCL test" >&2
    exit 1
  fi

  local nowpath
  nowpath=$(pwd)
  # Without this check a failed cd would launch ./build/all_reduce_perf from
  # the wrong directory.
  cd nccl-tests || {
    echo "error: cannot enter directory nccl-tests" >&2
    exit 1
  }
  module load openmpi/4.1.5_cuda12.8 nccl/2.26_cuda12.8
  # make MPI=1 MPI_HOME=/data/apps/openmpi/4.1.5_cuda12.8 CUDA_HOME=/data/apps/cuda/12.8 NCCL_HOME=/data/apps/nccl/nccl-2.26.6-1

  local -a nodes=()
  mapfile -t nodes < <(scontrol show hostnames)
  local hostlist
  hostlist=$(build_hostlist "$MASTER_ADDR" "$GPUS_PRE_NODE" "${nodes[@]}")
  echo "HOSTLIST: $hostlist"

  WORLD_SIZE=$((SLURM_NNODES * GPUS_PRE_NODE))
  mpirun -H "$hostlist" -n "$WORLD_SIZE" ./build/all_reduce_perf -b 1G -e 8G -f 2 -g 1

  cd "$nowpath" || exit 1
  module purge
}

main() {
  check_ib
  show_network_info
  run_nccl_tests
}

# Every step needs the Slurm job environment (SLURM_JOB_ID, SLURM_NNODES), so
# the steps only run inside a job; elsewhere the functions are merely defined.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  main "$@"
else
  echo "Not inside a Slurm job: submit this script with sbatch. Nothing was executed." >&2
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment