Spaces:
Running
on
Zero
Running
on
Zero
File size: 4,599 Bytes
246c106 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
#!/bin/bash
# Slurm batch job script.
# Usage: sbatch <this file> <script_to_run>
#   The first positional argument is the script that gets launched with srun.
#
# Resource request (read by sbatch from the #SBATCH lines below): 8 nodes,
# one task per node, 8 GPUs per node, exclusive node access, time limit 3-0
# (days-hours). stderr/stdout go to logs/std-<jobid>.err / .out.
#SBATCH --nodes=8
#SBATCH --ntasks-per-node=1
#SBATCH --exclusive
#SBATCH --time=3-0
# NOTE(review): a second --constraint directive (volta32gb) appears further
# down in this header; presumably only one of the two is intended — confirm
# which feature name the cluster actually defines.
#SBATCH --constraint=volta32
#SBATCH --partition=learnlab,learnfair
#SBATCH --error=logs/std-%j.err
#SBATCH --output=logs/std-%j.out
#SBATCH --gpus-per-node=8
#SBATCH --exclude=learnfair7516,learnfair7518,learnfair7519,learnfair7576,learnfair7578,learnfair7625,learnfair7627,learnfair7552,learnfair7553,learnfair7554,learnfair7555,learnfair7596,learnfair7597,learnfair7620,learnfair7621,learnfair7622,learnfair7623,learnfair7573,learnfair7564,learnfair7565,learnfair7566,learnfair7567,learnfair7664,learnfair7665,learnfair7666,learnfair7667,learnfair7556,learnfair7557,learnfair7558,learnfair7559,learnfair7560,learnfair7561,learnfair7562,learnfair7563,learnfair7636,learnfair7637,learnfair7638,learnfair7677,learnfair7678,learnfair7679,learnfair7685,learnfair7686,learnfair7687,learnfair7545,learnfair7546,learnfair7547,learnfair7483,learnfair7633,learnfair7635,learnfair7650,learnfair7651,learnfair7672,learnfair7675,learnfair7688,learnfair7690,learnfair7702,learnfair7703,learnfair7528,learnfair7530,learnfair7531,learnfair7540,learnfair7541,learnfair7542,learnfair7543,learnfair7585,learnfair7586,learnfair7587,learnfair7616,learnfair7619,learnfair7536,learnfair7537,learnfair7538,learnfair7539,learnfair7648,learnfair7663,learnfair7704,learnfair7705,learnfair7706,learnfair7707,learnfair7590,learnfair7591,learnfair7626,learnfair7649,learnfair7662,learnfair7548,learnfair7549,learnfair7550,learnfair7551,learnfair7470,learnfair7488,learnfair7490,learnfair7491,learnfair7657,learnfair7708,learnfair7568,learnfair7569,learnfair7570,learnfair7571,learnfair7628,learnfair7629,learnfair7630,learnfair7631,learnfair7644,learnfair7645,learnfair7646,learnfair7647,learnfair7656,learnfair7658,learnfair7659,learnfair7700,learnfair7701
#SBATCH --constraint=volta32gb
#SBATCH --cpus-per-task=80
# ---------------------------------------------------------------------------
# Runtime setup: shell options, job parameters and the environment variables
# consumed by the distributed launcher.
# ---------------------------------------------------------------------------
set -x      # trace every command into the job log
ulimit -c 0 # core file size limit 0

# First positional argument: the script that is launched through srun.
script_name=${1}
CHK_DIR="logs/" # Define CHK_DIR
LOG_DIR="logs/" # Define LOG_DIR

##### Number of total processes
# Expansions are quoted: a compressed nodelist such as host[1-8] would
# otherwise be subject to pathname expansion and word splitting.
echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX "
echo "Nodelist:= " "${SLURM_JOB_NODELIST}"
echo "Number of nodes:= " "${SLURM_JOB_NUM_NODES}"
echo "Ntasks per node:= " "${SLURM_NTASKS_PER_NODE}"
echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX "

# pretrain
# Start from this host's short name; it is replaced further down by the first
# host of the allocation when scontrol can resolve one. Assignment and export
# are split so a failing command is not masked by `export`.
MASTER_ADDR=$(hostname -s)
export MASTER_ADDR

export TORCH_DISTRIBUTED_DEBUG=DETAIL
export GLOO_SOCKET_IFNAME= # Set to your network interface
export NCCL_SOCKET_IFNAME= # Set to your network interface
# Python enables unbuffered stdio for ANY non-empty value, so "0" still turns
# it on.
export PYTHONUNBUFFERED=0
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_BLOCKING_WAIT=1
export CUDA_LAUNCH_BLOCKING=1
export TORCH_NCCL_BLOCKING_WAIT=1
export NCCL_DEBUG=INFO
export NUM_GPU=8

# Keep the node count Slurm reports for this allocation; 8 is only the
# fallback when the variables are not set (e.g. when run outside Slurm).
export SLURM_JOB_NUM_NODES=${SLURM_JOB_NUM_NODES:-8}
export SLURM_NNODES=${SLURM_NNODES:-${SLURM_JOB_NUM_NODES}}
export SLURM_NODEID=${SLURM_NODEID:-0} # Default to 0 if not set
export NCCL_IB_DISABLE=1
export NCCL_P2P_DISABLE=1

# find free port
# The socket is bound to port 0 so the OS picks an unused port, which is
# printed and then released again.
MASTER_PORT=$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')

# Timestamp plus shell PID.
DATE="$(date +'%d_%m_%Y_%H_%M_%S')_$$"

# Prefer the first host of the allocation as rendezvous address, but keep the
# hostname-based value when scontrol is unavailable or returns nothing.
first_alloc_host=$(scontrol show hostname "${SLURM_JOB_NODELIST:-${SLURM_NODELIST}}" | head -n1)
if [[ -n "${first_alloc_host}" ]]; then
  MASTER_ADDR=${first_alloc_host}
fi

WORLD_SIZE=$((NUM_GPU * SLURM_JOB_NUM_NODES))
RANK=${SLURM_NODEID} # Ensure this is set correctly in your environment

# NOTE(review): MASTER_PORT, WORLD_SIZE and RANK are plain shell variables
# here (never exported) — confirm whether the launched script expects to find
# them in its environment.
echo "MASTER_ADDR : ${MASTER_ADDR}"
echo "MASTER_PORT : ${MASTER_PORT}"
echo "WORLD_SIZE : ${WORLD_SIZE}"
echo "RANK : ${RANK}"
#######################################
# Run a command under srun and requeue the Slurm job when it is interrupted.
#
# srun is started in the background and awaited with `wait`: Bash postpones
# trap handlers while a foreground command is running, but `wait` returns as
# soon as a trapped signal arrives, so the requeue handlers fire immediately.
#
# Globals:   SLURM_JOB_ID (read)
# Arguments: "$@" - command line handed to srun unchanged; $1 is also used
#            in the error message
# Outputs:   status messages on stdout
# Returns:   never returns to the caller; exits the shell with
#              0      after a requeue (SIGTERM/SIGUSR1 trap, or srun exit
#                     code 124 or 143) or after a successful run
#              <code> srun's own exit code for any other failure
#######################################
function srun_with_requeue {
  local ret srun_pid

  # Exit codes are inspected by hand below, so errexit must be off.
  set +e

  # Trap SIGTERM to requeue if killed by timeout
  trap 'echo "Caught SIGTERM signal. Requeuing..."; scontrol requeue "$SLURM_JOB_ID"; exit 0' SIGTERM
  # Trap SIGUSR1 to requeue if node failure is detected
  trap 'echo "Caught SIGUSR1 signal (node failure). Requeuing..."; scontrol requeue "$SLURM_JOB_ID"; exit 0' SIGUSR1

  srun "$@" &
  srun_pid=$!
  wait "${srun_pid}"
  ret=$?

  case "${ret}" in
    124 | 143)
      # 124 and 143 (128 + SIGTERM) are both treated as a timeout.
      echo "Job timed out. Requeuing..."
      scontrol requeue "${SLURM_JOB_ID}"
      exit 0
      ;;
    0)
      echo "Job completed successfully."
      exit 0
      ;;
    *)
      echo "Error in $1 with exit code $ret. Not requeuing."
      exit "${ret}"
      ;;
  esac
}
# Record the submission in the user's history file, then launch the target
# script through the requeue-aware srun wrapper.

# Without a script argument there is nothing to chmod or launch.
if [[ -z "${script_name}" ]]; then
  echo "Usage: sbatch <this file> <script_to_run>" >&2
  exit 2
fi

# NOTE(review): JOB_ID is not assigned anywhere in this script; presumably it
# is expected from the submitting environment — confirm.
{
  echo "--------------------------------------------------"
  echo "Slurm job id | job id | command"
  echo "$SLURM_JOB_ID | $JOB_ID | $script_name"
} >> ~/history.txt

# srun_with_requeue $script_name
# Quoted so a path containing spaces or glob characters stays one argument.
chmod +x -- "$script_name"
srun_with_requeue "$script_name"