VerbCentric-RIS / scripts /train_repro.sh
dianecy's picture
Upload folder using huggingface_hub
599450c verified
#!/bin/bash
# Slurm batch job that reproduces CRIS training on a single node.
# Resources requested below: 1 node, 4 GPUs, 12 CPUs, 60G of memory and a
# 12 hour time limit; the batch shell's output goes to CRIS_REPRO.txt.
#SBATCH --job-name=CRIS_repro
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --time=0-12:00:00
#SBATCH --mem=60G
#SBATCH --cpus-per-task=12
#SBATCH --output=CRIS_REPRO.txt

# Load the user's shell configuration, then the conda base environment so
# that `conda activate` is usable in this non-interactive shell.
# The status of .bashrc is deliberately not checked: it is user-controlled
# and its final command's status says nothing about whether setup worked.
source "${HOME}/.bashrc"
source "${HOME}/miniconda3/bin/activate" base || {
  echo "Failed to load the conda base environment." >&2
  exit 1
}

# Abort early instead of silently training with the wrong interpreter.
conda activate cris || {
  echo "Failed to activate the 'cris' conda environment." >&2
  exit 1
}

# The training entry point and debug.log are resolved relative to this
# directory, so a failed cd must stop the job.
cd /home/s1/chaeyunkim/VerbCentric_CY || {
  echo "Cannot cd to /home/s1/chaeyunkim/VerbCentric_CY." >&2
  exit 1
}
# Trap SIGUSR1 to handle job requeueing.
# NOTE(review): Slurm only delivers SIGUSR1 to the batch shell when asked to
# (e.g. `--signal=B:USR1@<seconds>`); no such directive is visible in this
# script's header, so confirm it is passed at submission time.
max_restarts=3

#######################################
# SIGUSR1 handler: requeue this job unless it has already been restarted
# max_restarts times.
# Globals:   SLURM_JOB_ID (read), max_restarts (read)
# Outputs:   progress messages on stdout; a requeue failure on stderr
# Exits:     0 once the requeue request is accepted; 1 when the restart
#            limit is reached or the requeue request fails. Never returns.
#######################################
resubmit() {
  local job_info
  local restarts=0

  # The job record contains a "Restarts=<n>" field. If the lookup fails or
  # the field is missing, keep the count at 0 and still attempt the requeue.
  job_info=$(scontrol show job "${SLURM_JOB_ID}")
  if [[ ${job_info} =~ Restarts=([0-9]+) ]]; then
    restarts=${BASH_REMATCH[1]}
  fi

  if (( restarts < max_restarts )); then
    echo "Resubmitting job (restart $restarts/$max_restarts)..."
    # Only report success when Slurm actually accepted the requeue;
    # otherwise the job would finish with status 0 without being requeued.
    if scontrol requeue "${SLURM_JOB_ID}"; then
      exit 0
    fi
    echo "Failed to requeue job ${SLURM_JOB_ID}." >&2
    exit 1
  else
    echo "Job has exceeded the maximum restart limit ($max_restarts restarts)."
    exit 1
  fi
}
trap 'resubmit' SIGUSR1
# NCCL settings for the multi-GPU run.
export NCCL_P2P_DISABLE=1              # turn off NCCL's peer-to-peer transport
export NCCL_DEBUG=INFO                 # verbose NCCL logging
export NCCL_SOCKET_IFNAME=^docker0,lo  # leading ^ excludes these interfaces
# NOTE(review): this overrides any CUDA_VISIBLE_DEVICES the scheduler set
# for the --gres=gpu:4 allocation; confirm indices 0-3 are always correct.
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Run training in the background so the batch shell sits in `wait`, where a
# trapped signal is handled immediately instead of after python finishes.
# pipefail inside the subshell makes its status reflect a python failure
# rather than tee's (almost always zero) status.
(
  set -o pipefail
  python -u train_angular_verb.py \
    --config /home/s1/chaeyunkim/VerbCentric_CY/config/cris_r50.yaml 2>&1 \
    | tee debug.log
) &
train_pid=$!

# Propagate the training result so Slurm does not record a crashed run as
# successful. If a trapped signal interrupts the wait, the trap handler runs
# next and decides the exit status itself.
wait "${train_pid}"
train_status=$?
exit "${train_status}"