#!/bin/bash
#SBATCH --job-name=CRIS_repro
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --time=0-12:00:00
#SBATCH --mem=60G
#SBATCH --cpus-per-task=12
#SBATCH --output=CRIS_REPRO.txt
# Resources requested above: 1 node, 4 GPUs, 12 CPUs per task, 60G of memory,
# and a 12-hour time limit; job output is written to CRIS_REPRO.txt.
# NOTE(review): this script traps SIGUSR1 to requeue itself, but no --signal
# directive is set here. Presumably the signal is requested on the sbatch
# command line (e.g. "--signal=B:USR1@<seconds>"); confirm, otherwise the
# handler is never triggered.
# Load the user's shell configuration, then activate the conda "cris"
# environment (via the miniconda "base" environment).
source "${HOME}/.bashrc"
source "${HOME}/miniconda3/bin/activate" base
conda activate cris

# The training script is launched by a relative path, so stop immediately if
# the project directory cannot be entered instead of running from elsewhere.
cd /home/s1/chaeyunkim/VerbCentric_CY || {
  echo "Cannot cd to /home/s1/chaeyunkim/VerbCentric_CY" >&2
  exit 1
}
# Trap SIGUSR1 to handle job requeueing.
# Maximum number of times the job may be requeued before giving up.
max_restarts=3

#######################################
# SIGUSR1 handler: requeue the current job unless it has already been
# restarted max_restarts times. Always terminates the script.
# Globals:
#   SLURM_JOB_ID        (read) id of the job to inspect and requeue
#   SLURM_RESTART_COUNT (read) fallback restart count when the scontrol
#                       output contains no "Restarts=<n>" field
#   max_restarts        (read) restart limit
# Outputs:
#   Progress and limit messages on stdout; requeue failures on stderr.
# Exits:
#   0 if the job was requeued; 1 if the restart limit has been reached or
#   the requeue request failed.
#######################################
resubmit() {
  local job_info restarts

  # A failed query leaves job_info empty; the fallback below then applies.
  job_info=$(scontrol show job "${SLURM_JOB_ID}") || job_info=""

  if [[ "${job_info}" =~ Restarts=([0-9]+) ]]; then
    restarts=${BASH_REMATCH[1]}
  else
    restarts=${SLURM_RESTART_COUNT:-0}
  fi

  if [[ "${restarts}" -lt "${max_restarts}" ]]; then
    echo "Resubmitting job (restart $restarts/$max_restarts)..."
    # Report a failed requeue instead of exiting as if it had succeeded.
    if ! scontrol requeue "${SLURM_JOB_ID}"; then
      echo "Failed to requeue job ${SLURM_JOB_ID}." >&2
      exit 1
    fi
    exit 0
  fi

  echo "Job has exceeded the maximum restart limit ($max_restarts restarts)."
  exit 1
}
trap 'resubmit' SIGUSR1
# NCCL and CUDA settings for the run (see the NCCL environment-variable docs).
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Run training in the background and block in `wait`, so that a trapped signal
# (the script installs a SIGUSR1 handler) is handled promptly rather than
# only after python finishes. Combined stdout/stderr is copied to debug.log.
# pipefail makes the pipeline report python's failure rather than tee's status.
(
  set -o pipefail
  python -u train_angular_verb.py --config /home/s1/chaeyunkim/VerbCentric_CY/config/cris_r50.yaml 2>&1 | tee debug.log
) &
train_pid=$!

# Propagate the training result so a crashed run is not reported as success.
wait "${train_pid}"
exit $?