File size: 5,379 Bytes
ee21b96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env bash
#
# Configuration for the `image_gen` training run that is launched further
# down in this script via `python3 -m torch.distributed.launch ../../train.py`.
# Bash is required: the sweep loops below rely on brace lists.

# Guide:
# This script supports distributed training on multi-GPU workers (as well as single-worker training).
# Please set the options below according to the comments.
# For multi-worker training, these options must be set manually on each worker.
# After setting the options, run the script on every worker.
# To use the pre-shuffled data (if it exists), uncomment the alternative `data=` line below.

# Number of GPUs per GPU worker
GPUS_PER_NODE=8
# Number of GPU workers; for single-worker training, set to 1
WORKER_CNT=4
# The IP address of the rank-0 worker; for single-worker training, set to localhost.
# NOTE: XX.XX.XX.XX is a placeholder and must be replaced before running.
export MASTER_ADDR=XX.XX.XX.XX
# The port used for communication
export MASTER_PORT=8214
# The rank of this worker, in {0, ..., WORKER_CNT-1}; for single-worker training, set to 0
export RANK=0

# Comma-separated list of TSV files handed to train.py as its positional data argument.
data_dir=../../dataset/coco_image_gen_data
data="${data_dir}/coco_vqgan_train.tsv,${data_dir}/coco_vqgan_dev.tsv"
# Note: If you have shuffled the data in advance, please uncomment the line below.
# data=${data_dir}/coco_vqgan_train_1.tsv,${data_dir}/coco_vqgan_train_2.tsv,${data_dir}/coco_vqgan_train_3.tsv,${data_dir}/coco_vqgan_train_4.tsv,${data_dir}/coco_vqgan_train_5.tsv,${data_dir}/coco_vqgan_train_6.tsv,${data_dir}/coco_vqgan_train_7.tsv,${data_dir}/coco_vqgan_train_8.tsv,${data_dir}/coco_vqgan_train_9.tsv,${data_dir}/coco_vqgan_train_10.tsv,${data_dir}/coco_vqgan_dev.tsv
# Checkpoint passed to --restore-file
restore_file=../../checkpoints/ofa_large.pt
# Value passed to --selected-cols
selected_cols=0,2,1

# Output locations: one log file and one checkpoint sub-directory per sweep
# combination are created underneath these directories.
log_dir=./image_gen_logs
save_dir=./image_gen_checkpoints
mkdir -p -- "${log_dir}" "${save_dir}"

bpe_dir=../../utils/BPE
user_dir=../../ofa_module

# Task, model and loss selection
task=image_gen
arch=ofa_large
criterion=adjust_label_smoothed_cross_entropy
label_smoothing=0.0

# Batch sizing (--batch-size / --update-freq)
batch_size=4
update_freq=4

# Regularisation
encoder_drop_path_rate=0.1
decoder_drop_path_rate=0.1
dropout=0.1
attention_dropout=0.0

# Sequence limits and task-specific values, forwarded verbatim to the
# identically named train.py options.
max_src_length=64
max_tgt_length=1024
num_bins=1000
code_image_size=256
constraint_range=50265,58457

# External model files and the output directory for generated images
VQGAN_MODEL_PATH=../../checkpoints/vqgan/last.ckpt
VQGAN_CONFIG_PATH=../../checkpoints/vqgan/model.yaml
CLIP_MODEL_PATH=../../checkpoints/clip/ViT-B-16.pt
GEN_IMAGES_PATH=../../results/image_gen_stage1

# Hyper-parameter sweep: one training run is launched for every
# (total_num_updates, warmup_updates, lr) combination. Each brace list
# currently holds a single value; add further comma-separated values inside
# the braces to sweep over more settings.
#
# For every combination the run's stdout and stderr are written to
#   ${log_dir}/<total>_<warmup>_<lr>_rank${RANK}.log
# and checkpoints are saved under
#   ${save_dir}/<total>_<warmup>_<lr>
#
# All expansions are quoted so that user-edited paths containing spaces or
# glob characters reach train.py as single, unmodified arguments.
for total_num_updates in {40000,}; do
  echo "total_num_updates ${total_num_updates}"
  for warmup_updates in {2000,}; do
    echo "warmup_updates ${warmup_updates}"
    for lr in {1e-3,}; do
      echo "lr ${lr}"

      log_file="${log_dir}/${total_num_updates}_${warmup_updates}_${lr}_rank${RANK}.log"
      save_path="${save_dir}/${total_num_updates}_${warmup_updates}_${lr}"
      mkdir -p -- "${save_path}"

      # Arguments for ../../train.py, kept in the original order. An array is
      # used so each element stays one argument regardless of its content.
      train_args=(
        # Input data and restore/checkpoint locations
        "${data}"
        --selected-cols="${selected_cols}"
        --bpe-dir="${bpe_dir}"
        --user-dir="${user_dir}"
        --restore-file="${restore_file}"
        --reset-optimizer --reset-dataloader --reset-meters
        --save-dir="${save_path}"

        # Task, architecture and loss
        --task="${task}"
        --arch="${arch}"
        --criterion="${criterion}"
        --label-smoothing="${label_smoothing}"

        # Batch sizing
        --batch-size="${batch_size}"
        --batch-size-valid=1
        --update-freq="${update_freq}"

        # Model structure switches and regularisation
        --encoder-normalize-before
        --decoder-normalize-before
        --share-decoder-input-output-embed
        --share-all-embeddings
        --layernorm-embedding
        --patch-layernorm-embedding
        --code-layernorm-embedding
        --encoder-drop-path-rate="${encoder_drop_path_rate}"
        --decoder-drop-path-rate="${decoder_drop_path_rate}"
        --dropout="${dropout}"
        --attention-dropout="${attention_dropout}"

        # Optimiser and learning-rate schedule
        --weight-decay=0.01
        --optimizer=adam
        --adam-betas="(0.9,0.999)"
        --adam-eps=1e-08
        --clip-norm=1.0
        --lr-scheduler=polynomial_decay
        --lr="${lr}"
        --total-num-update="${total_num_updates}"
        --warmup-updates="${warmup_updates}"

        # Logging, checkpointing and validation cadence
        --log-format=simple
        --log-interval=10
        --fixed-validation-seed=7
        --keep-last-epochs=15
        --save-interval=3 --validate-interval=3
        --max-update="${total_num_updates}"
        --best-checkpoint-metric=score --maximize-best-checkpoint-metric
        --eval-args='{"beam":24,"min_len":1024,"max_len_a":0,"max_len_b":1024,"sampling_topk":256,"temperature":1.0}'

        # Sequence limits and remaining model options
        --max-src-length="${max_src_length}"
        --max-tgt-length="${max_tgt_length}"
        --find-unused-parameters
        --add-type-embedding
        --scale-attn
        --scale-fc
        --scale-heads
        --disable-entangle

        # Image-generation specific values and external model files
        --num-bins="${num_bins}"
        --code-image-size="${code_image_size}"
        --constraint-range="${constraint_range}"
        --vqgan-model-path="${VQGAN_MODEL_PATH}"
        --vqgan-config-path="${VQGAN_CONFIG_PATH}"
        --clip-model-path="${CLIP_MODEL_PATH}"
        --gen-images-path="${GEN_IMAGES_PATH}"

        # Mixed precision and data loading
        --fp16
        --fp16-scale-window=256
        --num-workers=0
      )

      # NOTE(review): torch.distributed.launch is deprecated in recent PyTorch
      # releases in favour of torchrun. It is kept here because switching
      # changes how the local rank is handed to train.py — confirm that
      # train.py supports it before migrating.
      python3 -m torch.distributed.launch \
        --nproc_per_node="${GPUS_PER_NODE}" \
        --nnodes="${WORKER_CNT}" \
        --node_rank="${RANK}" \
        --master_addr="${MASTER_ADDR}" \
        --master_port="${MASTER_PORT}" \
        ../../train.py "${train_args[@]}" > "${log_file}" 2>&1
    done
  done
done