#!/bin/bash

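# Distributed launch settings: number of GPU worker processes and the master port.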
GPUS=6
MASTER_PORT=7031

if [ "$#" -ne 2 ]; then |
|
|
echo "Usage: bash train.sh <OUTPUT_DIR> <EXP_NAME>" |
|
|
exit 1 |
|
|
fi |
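# Positional arguments: training output directory and experiment name.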
OUTPUT_DIR=$1
EXP_NAME=$2

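# Per-experiment log file; all training output is redirected here.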
LOG_DIR="./bash_logs"
LOG_FILE="${LOG_DIR}/${EXP_NAME}.log"
mkdir -p "${LOG_DIR}"

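# Training hyperparameters, forwarded to train_gref.py through --opts below.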
MARGIN=12                            # TRAIN.margin_value
TEMP=0.07                            # TRAIN.temperature
MODE=hardpos_only_sbertsim_refined   # TRAIN.metric_mode
MLW=0.1                              # TRAIN.metric_loss_weight
BATCH_SIZE=30                        # TRAIN.batch_size
MIXUP_FQ=False                       # TRAIN.mixup_lasttwo

echo "Starting distributed training with ${GPUS} GPUs on port ${MASTER_PORT}..."
echo "Experiment Name: ${EXP_NAME}, Output Dir: ${OUTPUT_DIR}"
echo "Logging to: ${LOG_FILE}"

ml purge
ml load cuda/11.8
eval "$(conda shell.bash hook)"
conda activate ris_all

cd /data2/projects/chaeyun/CGFormer/ || exit 1

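# TF32 override for NVIDIA math libraries, plus NCCL tuning: verbose logging and
# more tolerant InfiniBand timeout/retry settings.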
export NVIDIA_TF32_OVERRIDE=1
export NCCL_DEBUG=INFO
export NCCL_IB_TIMEOUT=100
export NCCL_IB_RETRY_CNT=15

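# Launch one training process per GPU via torch.distributed.launch, redirecting
# all stdout/stderr to the log file. torch.distributed.launch is deprecated in
# recent PyTorch releases in favor of torchrun; a commented-out torchrun sketch
# is included at the end of this script.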
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 python -m torch.distributed.launch \
    --nproc_per_node=${GPUS} \
    --master_port=${MASTER_PORT} \
    train_gref.py \
    --config config/config_gref_ace.yaml \
    --opts TRAIN.batch_size ${BATCH_SIZE} \
           TRAIN.exp_name "${EXP_NAME}" \
           TRAIN.output_folder "${OUTPUT_DIR}" \
           TRAIN.metric_mode ${MODE} \
           TRAIN.metric_loss_weight ${MLW} \
           TRAIN.margin_value ${MARGIN} \
           TRAIN.temperature ${TEMP} \
           TRAIN.mixup_lasttwo ${MIXUP_FQ} \
    > "${LOG_FILE}" 2>&1