Spaces:

liruiw
/

hma

Runtime error

App Files Files Community

hma / experiments /scripts /submit /sixteennode_wrapper2.sh

LeroyWaa

draft

246c106 11 months ago

raw

history blame

3.04 kB

	#!/bin/bash
	#SBATCH --nodes=16
	#SBATCH --ntasks-per-node=1
	#SBATCH --exclusive
	#SBATCH --time=3-0
	#SBATCH --partition=learnfair
	#SBATCH --error=logs/std-%j.err
	#SBATCH --output=logs/std-%j.out
	#SBATCH --gpus-per-node=8
	#SBATCH --cpus-per-task=32
	#SBATCH --exclude=learnfair[021,025,045,081,082,089,097,098,101,102,103,105]


	set -x
	ulimit -c 0


	script_name=${1}



	set -x
	ulimit -c 0
	CHK_DIR="logs/" # Define CHK_DIR
	LOG_DIR="logs/" # Define LOG_DIR

	##### Number of total processes
	echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX "
	echo "Nodelist:= " $SLURM_JOB_NODELIST
	echo "Number of nodes:= " $SLURM_JOB_NUM_NODES
	echo "Ntasks per node:= " $SLURM_NTASKS_PER_NODE
	echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX "

	# pretrain
	export MASTER_ADDR=$(hostname -s)
	export TORCH_DISTRIBUTED_DEBUG=DETAIL
	export GLOO_SOCKET_IFNAME= # Set to your network interface
	export NCCL_SOCKET_IFNAME= # Set to your network interface

	export PYTHONUNBUFFERED=0
	export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
	export NCCL_BLOCKING_WAIT=1
	export CUDA_LAUNCH_BLOCKING=1
	export TORCH_NCCL_BLOCKING_WAIT=1
	export NCCL_DEBUG=INFO
	export NUM_GPU=8
	export SLURM_NNODES=16
	export SLURM_JOB_NUM_NODES=16
	export SLURM_NODEID=${SLURM_NODEID:-0} # Default to 0 if not set

	export NCCL_IB_DISABLE=1
	export NCCL_P2P_DISABLE=1
	WANDB_KEY=4c1540ebf8cb9964703ac212a937c00848a79b67
	wandb login ${WANDB_KEY}

	# find free port
	MASTER_PORT=$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')
	DATE="`date +'%d_%m_%Y_%H_%M_%S'`_$$"
	MASTER_ADDR=$(scontrol show hostname "$SLURM_NODELIST" \| head -n1)
	WORLD_SIZE=$((${NUM_GPU} * ${SLURM_JOB_NUM_NODES}))
	RANK=$SLURM_NODEID # Ensure this is set correctly in your environment

	echo "MASTER_ADDR : ${MASTER_ADDR}"
	echo "MASTER_PORT : ${MASTER_PORT}"
	echo "WORLD_SIZE : ${WORLD_SIZE}"
	echo "RANK : ${RANK}"

	echo "--------------------------------------------------" >> ~/history.txt
	echo "Slurm job id \| job id \| command" >> ~/history.txt
	echo "$SLURM_JOB_ID \| $JOB_ID \| $script_name" >> ~/history.txt
	function srun_with_requeue {
	set +e

	# Trap SIGTERM to requeue if killed by timeout
	# Trap SIGTERM to requeue if killed by timeout
	trap 'echo "Caught SIGTERM signal. Requeuing..."; scontrol requeue $SLURM_JOB_ID; exit 0' SIGTERM

	# Trap SIGUSR1 to requeue if node failure is detected
	trap 'echo "Caught SIGUSR1 signal (node failure). Requeuing..."; scontrol requeue $SLURM_JOB_ID; exit 0' SIGUSR1

	srun --cpu-bind=none "$@"
	ret=$?
	if [ $ret -eq 124 ]; then
	echo "Job timed out. Requeuing..."
	scontrol requeue $SLURM_JOB_ID
	exit 0
	elif [ $ret -eq 143 ]; then
	echo "Job timed out. Requeuing..."
	scontrol requeue $SLURM_JOB_ID
	exit 0
	elif [ $ret -ne 0 ]; then
	echo "Error in $1 with exit code $ret. Not requeuing."
	exit $ret
	else
	echo "Job completed successfully."
	exit 0
	fi

	set -e
	}
	chmod +x $script_name
	srun_with_requeue $script_name