#!/bin/bash
#SBATCH --nodes=16
#SBATCH --ntasks-per-node=1
#SBATCH --exclusive
#SBATCH --time=3-0
#SBATCH --partition=learnfair
#SBATCH --error=logs/std-%j.err
#SBATCH --output=logs/std-%j.out
#SBATCH --gpus-per-node=8
#SBATCH --cpus-per-task=32
#SBATCH --exclude=learnfair[021,025,045,081,082,089,097,098,101,102,103,105]
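# Usage (assumed): submit this launcher with the per-node training script as its
# only argument, e.g.
#   sbatch this_launcher.sh scripts/pretrain.sh
# where "this_launcher.sh" and "scripts/pretrain.sh" are placeholder names.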
set -x             # echo each command before it runs
ulimit -c 0        # disable core dumps
script_name=${1}   # per-node training script, passed as the first argument
CHK_DIR="logs/"    # checkpoint directory
LOG_DIR="logs/"    # log directory
# Print the Slurm allocation details
echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
echo "Nodelist:= " $SLURM_JOB_NODELIST
echo "Number of nodes:= " $SLURM_JOB_NUM_NODES
echo "Ntasks per node:= " $SLURM_NTASKS_PER_NODE
echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
# Distributed environment for pretraining
export MASTER_ADDR=$(hostname -s)
export TORCH_DISTRIBUTED_DEBUG=DETAIL
export GLOO_SOCKET_IFNAME=   # set to your network interface (e.g. eth0)
export NCCL_SOCKET_IFNAME=   # set to your network interface (e.g. eth0)
export PYTHONUNBUFFERED=1    # any non-empty value makes Python output unbuffered
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_BLOCKING_WAIT=1
export CUDA_LAUNCH_BLOCKING=1   # synchronous CUDA launches; helpful for debugging, slows training
export TORCH_NCCL_BLOCKING_WAIT=1
export NCCL_DEBUG=INFO
export NUM_GPU=8                    # GPUs per node; must match --gpus-per-node
export SLURM_NNODES=16              # must match #SBATCH --nodes
export SLURM_JOB_NUM_NODES=16       # must match #SBATCH --nodes
export SLURM_NODEID=${SLURM_NODEID:-0}   # default to 0 if not set
export NCCL_IB_DISABLE=1   # disable the InfiniBand transport
export NCCL_P2P_DISABLE=1  # disable GPU peer-to-peer transfers
WANDB_KEY=4c1540ebf8cb9964703ac212a937c00848a79b67
wandb login ${WANDB_KEY}
# Find a free port on this node for the rendezvous
MASTER_PORT=$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')
DATE="$(date +'%d_%m_%Y_%H_%M_%S')_$$"
# Use the first node in the allocation as the rendezvous host
MASTER_ADDR=$(scontrol show hostname "$SLURM_NODELIST" | head -n1)
WORLD_SIZE=$((NUM_GPU * SLURM_JOB_NUM_NODES))   # total number of GPU processes
RANK=$SLURM_NODEID                              # node index within the allocation
echo "MASTER_ADDR : ${MASTER_ADDR}"
echo "MASTER_PORT : ${MASTER_PORT}"
echo "WORLD_SIZE  : ${WORLD_SIZE}"
echo "RANK        : ${RANK}"
| echo "--------------------------------------------------" >> ~/history.txt | |
| echo "Slurm job id | job id | command" >> ~/history.txt | |
| echo "$SLURM_JOB_ID | $JOB_ID | $script_name" >> ~/history.txt | |
function srun_with_requeue {
  set +e
  # Trap SIGTERM to requeue if killed by timeout
  trap 'echo "Caught SIGTERM signal. Requeuing..."; scontrol requeue $SLURM_JOB_ID; exit 0' SIGTERM
  # Trap SIGUSR1 to requeue if node failure is detected
  trap 'echo "Caught SIGUSR1 signal (node failure). Requeuing..."; scontrol requeue $SLURM_JOB_ID; exit 0' SIGUSR1
  srun --cpu-bind=none "$@"
  ret=$?
  if [ $ret -eq 124 ]; then
    echo "Job timed out. Requeuing..."
    scontrol requeue $SLURM_JOB_ID
    exit 0
  elif [ $ret -eq 143 ]; then
    echo "Job was terminated with SIGTERM (exit 128+15). Requeuing..."
    scontrol requeue $SLURM_JOB_ID
    exit 0
  elif [ $ret -ne 0 ]; then
    echo "Error in $1 with exit code $ret. Not requeuing."
    exit $ret
  else
    echo "Job completed successfully."
    exit 0
  fi
  set -e
}
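# Note: srun_with_requeue forwards all of its arguments to srun, so extra srun
# flags can be passed ahead of the script if needed, e.g. (hypothetical):
#   srun_with_requeue --label "$script_name"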
chmod +x "$script_name"
srun_with_requeue "$script_name"
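# For reference, ${script_name} is assumed to be a per-node launcher that consumes
# the variables exported above; the actual script is not shown here. A minimal
# sketch (names such as train.py are placeholders) might look like:
#
#   #!/bin/bash
#   torchrun \
#     --nnodes="$SLURM_JOB_NUM_NODES" \
#     --nproc_per_node="$NUM_GPU" \
#     --node_rank="$SLURM_NODEID" \
#     --master_addr="$MASTER_ADDR" \
#     --master_port="$MASTER_PORT" \
#     train.py
#
# With --ntasks-per-node=1, srun runs one copy of that launcher per node, and
# torchrun spawns NUM_GPU processes on each node, matching WORLD_SIZE above.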