#!/usr/bin/env bash
# Launch fairseq-train for a WMT en->de transformer-big back-translation run.
# -e: abort on error, -u: error on unset vars, -x: trace commands,
# -o pipefail: a pipeline fails if any stage fails (the gpu-count pipeline below).
set -euxo pipefail
| |
|
# ---------------- hardware ----------------
train_device="0,1,2,3,4,5,6,7"   # GPUs used for training (CUDA_VISIBLE_DEVICES)
eval_device="0"                  # GPU reserved for decoding/evaluation scripts

# ---------------- locations ---------------
root_dir="$(dirname "$PWD")"     # project root = parent of the scripts directory

# ---------------- task --------------------
src_lang="en"
tgt_lang="de"
threshold="0.7"                  # back-translation filtering threshold

data_name="wmt23"

task_name="${src_lang}2${tgt_lang}"
# NOTE(review): data intentionally lives under the reverse direction
# (de2en) — presumably the back-translated corpus; confirm against the
# data-preparation scripts.
data_dir="${root_dir}/data/${tgt_lang}2${src_lang}/${threshold}"
raw_data_dir="${data_dir}/raw"
trainable_data_dir="${data_dir}/trainable_data"
| |
|
| | |
# ---- decoding defaults (read by companion eval scripts) ----
decode_max_tokens="2048"   # max tokens per decode batch
beam="5"                   # beam size
nbest="1"                  # hypotheses to emit per sentence
lenpen="1.0"               # length penalty

# ---- training hyperparameters ----
criterion="label_smoothed_cross_entropy"
label_smoothing="0.1"
seed="42"
max_epoch="40"
keep_last_epochs="1"       # epoch checkpoints to retain
keep_best_checkpoints="5"  # best-scoring checkpoints to retain
patience="5"               # early-stop after this many non-improving validations
num_workers="8"            # dataloader workers
| |
|
| | |
# ---- model configuration preset ----
conf_name=transformer_big

if [[ "$conf_name" == "transformer_big" ]]; then
    arch=transformer_vaswani_wmt_en_de_big
    activation_fn=relu
    encoder_ffn_embed_dim=4096
    share_all_embeddings=1
    share_decoder_input_output_embed=1
    # NOTE(review): variable name is misspelled ("learing") but is referenced
    # with the same spelling at the fairseq-train invocation below; rename in
    # both places at once or not at all.
    learing_rate=1e-3
    warmup=4000
    max_tokens=8192
    weight_decay=0.0
    dropout=0.3
    gradient_accumulation_steps=4
else
    # Fail loudly: message to stderr and a non-zero status so `set -e`
    # actually stops the script (bare `exit` returned 0 here before).
    echo "unknown conf_name=$conf_name" >&2
    exit 1
fi
| |
|
# Experiment directory: exps/<task>_backtrans/<threshold>/<conf>_<data>.
model_dir="${root_dir}/exps/${task_name}_backtrans/${threshold}/${conf_name}_${data_name}"
mkdir -p "$model_dir"
# Snapshot this script next to the checkpoints for reproducibility.
cp -- "${BASH_SOURCE[0]}" "$model_dir"
| |
|
# Number of GPUs = number of comma-separated entries in $train_device.
# Builtin `read -a` replaces the old echo|awk subprocess pipeline; it yields
# the same count (including 0 for an empty device list).
IFS=',' read -r -a _train_devs <<< "$train_device"
gpu_num=${#_train_devs[@]}
export CUDA_VISIBLE_DEVICES=$train_device

# Assemble the fairseq-train invocation as a string; optional flags are
# appended below, then the whole thing is launched via eval + nohup.
cmd="fairseq-train $trainable_data_dir \
--distributed-world-size $gpu_num -s $src_lang -t $tgt_lang \
--arch $arch \
--fp16 \
--optimizer adam --clip-norm 0.0 \
--lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates $warmup \
--lr $learing_rate --adam-betas '(0.9, 0.98)' \
--weight-decay $weight_decay \
--dropout $dropout \
--criterion $criterion --label-smoothing $label_smoothing \
--max-epoch $max_epoch \
--max-tokens $max_tokens \
--update-freq $gradient_accumulation_steps \
--activation-fn $activation_fn \
--encoder-ffn-embed-dim $encoder_ffn_embed_dim \
--seed $seed \
--num-workers $num_workers \
--no-epoch-checkpoints \
--keep-last-epochs $keep_last_epochs \
--keep-best-checkpoints $keep_best_checkpoints \
--patience $patience \
--no-progress-bar \
--log-interval 100 \
--task "translation" \
--ddp-backend no_c10d \
--save-dir $model_dir \
--tensorboard-logdir $model_dir"
| |
|
| | |
# Append optional flags driven by the configuration preset above.
# [[ ]] tolerates empty values where the old `[ $var -eq 1 ]` would have
# died with "unary operator expected".
if [[ "$share_all_embeddings" -eq 1 ]]; then
    cmd+=" --share-all-embeddings "
fi
if [[ "$share_decoder_input_output_embed" -eq 1 ]]; then
    cmd+=" --share-decoder-input-output-embed "
fi
# ${max_update:=0} assigns a default of 0 when max_update is unset/empty,
# so a caller can export max_update to cap training by updates.
if [[ "${max_update:=0}" -ne 0 ]]; then
    cmd+=" --max-update $max_update"
fi
| |
|
| | |
# Timestamp a separator into the log, then launch training detached;
# all trainer output is appended to train.log.
cur_time=$(date +"%Y-%m-%d %H:%M:%S")
echo "=============$cur_time===================" >> "$model_dir/train.log"
cmd="nohup ${cmd} >> $model_dir/train.log 2>&1 &"

eval "$cmd"
| |
|
| | |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|