#!/bin/bash
#SBATCH --job-name=dbs6-ace3
#SBATCH --partition=a6000
#SBATCH --gres=gpu:1
#SBATCH --time=12-00:00:00        # d-hh:mm:ss, job time limit
#SBATCH --mem=28000               # CPU (host) memory in MB
#SBATCH --cpus-per-task=4
#SBATCH --output=./trainlog/dmmi_ACE_gref_m10_tmp005_bs6.log
ml purge
ml load cuda/11.8
eval "$(conda shell.bash hook)"
conda activate risall
cd /data2/projects/chaeyun/RIS-DMMI
export NCCL_P2P_DISABLE=1       # disable NCCL peer-to-peer GPU transfers
export NVIDIA_TF32_OVERRIDE=0   # keep matmuls in full FP32 (disable TF32 on Ampere GPUs)
GPUS=1
OUTPUT_DIR=$1
EXP_NAME=$2
MARGIN=$3
TEMP=$4
MODE=$5
MASTER_PORT=5728
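# Not in the original script: a minimal guard on the five positional arguments,
# assuming the sbatch call pattern shown in the example submissions at the end
# of this file. Drop or adapt as needed.
if [ "$#" -lt 5 ]; then
    echo "Usage: sbatch $0 OUTPUT_DIR EXP_NAME MARGIN TEMP MODE" >&2
    exit 1
fi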
# TRAIN
# hardpos_only, hardpos_only_rev
python_args="--model dmmi_swin_hardpos_only \
--dataset refcocog \
--splitBy umd \
--output_dir ${OUTPUT_DIR} \
--model_id ${EXP_NAME} \
--batch-size 6 \
--lr 0.00005 \
--wd 1e-2 \
--window12 \
--swin_type base \
--pretrained_backbone /data2/projects/chaeyun/LAVT-RIS/pretrained_weights/swin_base_patch4_window12_384_22k.pth \
--epochs 40 \
--img_size 480 \
--metric_learning \
--margin_value ${MARGIN} \
--temperature ${TEMP} \
--metric_mode ${MODE} \
--exclude_multiobj "
CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=$GPUS --master_port=$MASTER_PORT train_rev.py $python_args
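# Note (added comment): with GPUS=1, torchrun launches a single worker process;
# --master_port is pinned so concurrent jobs on the same node do not collide on
# the default rendezvous port.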
# python -m torch.distributed.launch --nproc_per_node=$GPUS train_rev.py $python_args
# CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=$GPUS train_rev.py $python_args

# Example submissions:
# sbatch train_ace_bs4.sh ./experiments/dmmi_grefu_ace/gref_m10_tmp007_bs6 gref_m10_tmp007_bs6 10 0.07 hardpos_only
# sbatch train_ace_bs4.sh ./experiments/dmmi_grefu_ace/gref_m12_tmp007_bs6 gref_m12_tmp007_bs6 12 0.07 hardpos_only
# sbatch train_ace_bs4.sh ./experiments/dmmi_grefu_ace/gref_m10_tmp005_bs6 gref_m10_tmp005_bs6 10 0.05 hardpos_only