File size: 1,984 Bytes
0b32e3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/bin/bash
#SBATCH --job-name=dbs8-ace4
#SBATCH --partition=a6000
#SBATCH --gres=gpu:2
#SBATCH --time=12-00:00:00  # d-hh:mm:ss, job time limit
#SBATCH --mem=60000 # cpu memory size
#SBATCH --cpus-per-task=6
#
# Launches a 2-process torchrun training job for train_rev.py
# (dmmi_swin_hardpos_only on refcocog/umd) and mirrors its output to a log.
#
# Usage:
#   sbatch <this-script> OUTPUT_DIR EXP_NAME MARGIN TEMP MODE LOG_NAME MASTER_PORT
#
# Arguments:
#   $1 OUTPUT_DIR   passed to --output_dir
#   $2 EXP_NAME     passed to --model_id
#   $3 MARGIN       passed to --margin_value
#   $4 TEMP         passed to --temperature
#   $5 MODE         passed to --metric_mode
#   $6 LOG_NAME     log is written to ./trainlog/<LOG_NAME>.log, relative to
#                   the project directory this script cd's into
#   $7 MASTER_PORT  passed to torchrun --master_port
#
# NOTE: all #SBATCH directives must stay above the first executable command;
# sbatch stops parsing them once it reaches one.

# Without pipefail the job's exit status would be tee's (almost always 0), so
# SLURM would report a crashed training run as successful.
# `set -e` / `set -u` are intentionally NOT enabled: `module` and conda's
# activation hooks are sourced shell code that may not be strict-mode clean.
set -o pipefail

die() {
    printf 'error: %s\n' "$*" >&2
    exit 1
}

if (( $# < 7 )); then
    printf 'usage: sbatch <script> OUTPUT_DIR EXP_NAME MARGIN TEMP MODE LOG_NAME MASTER_PORT\n' >&2
    printf 'error: expected 7 arguments, got %d\n' "$#" >&2
    exit 2
fi

# Job configuration
OUTPUT_DIR=$1
EXP_NAME=$2
MARGIN=$3
TEMP=$4
MODE=$5
LOG_NAME=$6
MASTER_PORT=$7
GPUS=2  # keep in sync with "#SBATCH --gres=gpu:2" above
LOG_FILE="./trainlog/${LOG_NAME}.log"

# Environment setup
module purge
module load cuda/11.8
eval "$(conda shell.bash hook)"
conda activate risall || die "could not activate conda env 'risall'"

# train_rev.py and LOG_FILE are both resolved relative to this directory, so
# a failed cd must abort rather than run from wherever the job started.
cd /data2/projects/chaeyun/RIS-DMMI || die "cannot cd to /data2/projects/chaeyun/RIS-DMMI"

# tee cannot create missing parent directories; make sure the log dir exists.
mkdir -p -- "$(dirname -- "$LOG_FILE")" || die "cannot create log directory for ${LOG_FILE}"

export NCCL_P2P_DISABLE=1       # NCCL: disable the GPU peer-to-peer transport
export NVIDIA_TF32_OVERRIDE=0   # NVIDIA libraries: disable TF32 math

# Run the training script; stderr is merged into stdout so the log captures
# both streams. With pipefail set, a torchrun failure becomes the job status.
torchrun \
    --nproc_per_node="$GPUS" \
    --master_port="$MASTER_PORT" \
    train_rev.py \
    --model dmmi_swin_hardpos_only \
    --dataset refcocog \
    --splitBy umd \
    --output_dir "$OUTPUT_DIR" \
    --model_id "$EXP_NAME" \
    --batch-size 4 \
    --lr 0.00005 \
    --wd 1e-2 \
    --window12 \
    --swin_type base \
    --pretrained_backbone /data2/projects/chaeyun/LAVT-RIS/pretrained_weights/swin_base_patch4_window12_384_22k.pth \
    --epochs 40 \
    --img_size 480 \
    --metric_learning \
    --margin_value "$MARGIN" \
    --temperature "$TEMP" \
    --metric_mode "$MODE" \
    --exclude_multiobj \
    2>&1 | tee -- "$LOG_FILE"


# bs12-ace4
# sbatch train_ace_bash.sh ./experiments/dmmi_grefu_ace_/gref_m15_tmp007_bs12 gref_m15_tmp007_bs12 15 0.07 hardpos_only dmmi_ACE_gref_m15_tmp007_bs12 2837

# bs12-ace3
# sbatch train_ace_bash.sh ./experiments/dmmi_grefu_ace_/gref_m10_tmp005_bs12 gref_m10_tmp005_bs12 10 0.05 hardpos_only dmmi_ACE_gref_m10_tmp005_bs12 8236

# bs12-ace4 refined
# sbatch train_ace_bash.sh ./experiments/dmmi_grefu_ace_/gref_m10_tmp007_refined_bs12 gref_m10_tmp007_refined_bs12 10 0.07 hardpos_only_refined dmmi_ACE_gref_m10_tmp007_refined_bs12 1873

# bs8-ace4 refined
# sbatch train_ace_bash_bs8.sh ./experiments/dmmi_grefu_ace_/gref_m10_tmp007_refined_bs8 gref_m10_tmp007_refined_bs8 10 0.07 hardpos_only_refined dmmi_ACE_gref_m10_tmp007_refined_bs8 9873