#!/usr/bin/env bash
# Launch distributed training of a 16-layer transformer LM on the full
# LibriSpeech BPE-500 LM data, using 8 GPUs (one process per GPU).
#
# Expects to be run from the repo root: paths to train.py and the .pt
# data files below are relative to the current working directory.

# Strict mode: exit on error, error on unset vars, fail pipelines on any stage.
set -euo pipefail

export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"  # expose all 8 GPUs to the job
export TORCH_DISTRIBUTED_DEBUG="INFO"          # verbose torch.distributed diagnostics

python ./transformer_lm/train.py \
  --start-epoch 0 \
  --world-size 8 \
  --exp-dir transformer_lm/exp_full_libri_16layer_8gpu \
  --num-epochs 20 \
  --lm-data ./transformer_lm/libri_lm_training_bpe500/sorted-lm-data-libri-lm_maxlen200.pt \
  --lm-data-valid ./transformer_lm/libri_lm_training_bpe500/sorted_lm_data-valid.pt \
  --use-fp16 0 \
  --num-layers 16 \
  --batch-size 70