#!/bin/bash
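
# Distributed training launcher for CGFormer's train_rcc_sbert.py with an
# SBERT-similarity metric loss; each run logs to refcoco_filter_exp/<EXP_NAME>.log.

# One worker process per GPU; MASTER_PORT is the torch.distributed rendezvous
# port, so pick another free port if 2948 is already taken on your machine.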
GPUS=4
MASTER_PORT=2948

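# Require exactly two positional arguments.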
if [ "$#" -ne 2 ]; then
    echo "Usage: bash train.sh <OUTPUT_DIR> <EXP_NAME>"
    exit 1
fi

OUTPUT_DIR=$1
EXP_NAME=$2

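# Per-experiment log file under a shared experiment directory.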
LOG_DIR="./refcoco_filter_exp"
LOG_FILE="${LOG_DIR}/${EXP_NAME}.log"

mkdir -p "${LOG_DIR}"

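# Hyperparameters forwarded to the YAML config via --opts: metric-loss margin,
# temperature, variant (MODE) and weight (MLW), plus batch size and two
# boolean feature toggles.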
MARGIN=12
TEMP=0.07
MODE=hardpos_only_sbertsim_refined
MLW=0.1
BATCH_SIZE=48
MIXUP_FQ=False
USE_PROJECTIONS=False

echo "Starting distributed training with ${GPUS} GPUs on port ${MASTER_PORT}..."
echo "Experiment Name: ${EXP_NAME}, Output Dir: ${OUTPUT_DIR}"
echo "Logging to: ${LOG_FILE}"

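# Environment setup: reset Lmod modules, load the CUDA 11.8 toolkit, and
# activate the training conda environment.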
ml purge
ml load cuda/11.8
eval "$(conda shell.bash hook)"
conda activate risall

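# Run from the repository root so the relative config path resolves.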
cd /data2/projects/chaeyun/CGFormer/ || exit 1

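# Interconnect and math-mode settings: disable P2P GPU transfers (a common
# workaround for NCCL hangs on some topologies), opt in to TF32 math, and turn
# on verbose NCCL logging. NCCL_TIMEOUT and NCCL_IB_RETRY_CNT are intended to
# raise tolerance for slow or flaky InfiniBand links.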
export NCCL_P2P_DISABLE=1
export NVIDIA_TF32_OVERRIDE=1
export NCCL_DEBUG=INFO
export NCCL_TIMEOUT=7200
export NCCL_IB_RETRY_CNT=15

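# Launch one training process per visible GPU. Note that
# torch.distributed.launch is deprecated in recent PyTorch releases; an
# equivalent launch (assuming train_rcc_sbert.py reads LOCAL_RANK from the
# environment) would be:
#   torchrun --nproc_per_node=${GPUS} --master_port=${MASTER_PORT} train_rcc_sbert.py ...
# The --opts key/value pairs override matching entries in config_rcc_ace.yaml.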
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch \
    --nproc_per_node=${GPUS} \
    --master_port=${MASTER_PORT} \
    train_rcc_sbert.py \
    --config config/config_rcc_ace.yaml \
    --opts TRAIN.batch_size ${BATCH_SIZE} \
           TRAIN.exp_name "${EXP_NAME}" \
           TRAIN.output_folder "${OUTPUT_DIR}" \
           TRAIN.metric_mode ${MODE} \
           TRAIN.metric_loss_weight ${MLW} \
           TRAIN.margin_value ${MARGIN} \
           TRAIN.temperature ${TEMP} \
           TRAIN.filter_threshold 0.52 \
           TRAIN.mixup_lasttwo ${MIXUP_FQ} \
           TRAIN.use_projections ${USE_PROJECTIONS} \
    > "${LOG_FILE}" 2>&1
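
# All stdout/stderr is captured in the log; follow a running job with:
#   tail -f "${LOG_FILE}"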