# Resolve this script's absolute path and directory so relative resources
# can be found regardless of the caller's working directory.
# Quoted expansions avoid word-splitting/globbing if the path has spaces.
script_path=$(realpath "${BASH_SOURCE[0]}")
script_dir=$(dirname "$script_path")
# DeepSpeed JSON config, passed to the trainer via --deepspeed_config below.
config_json="./config/config_block_large_chinese.json"

# Training options for GLM blocklm-large-chinese pretraining.
# NOTE(review): ${MP_SIZE} is expected to be set/exported by the launcher
# script that sources this file — confirm it is defined before use.
# Each option line ends in a backslash-newline continuation so the whole
# set is accumulated into a single string.
gpt_options=" \
       --block-lm \
       --task-mask \
       --bert-prob 0.4 \
       --gap-sentence-prob 0.3 \
       --avg-block-length 3 \
       --gpt-min-ratio 0.25 \
       --block-mask-prob 0.1 \
       --short-seq-prob 0.02 \
       --experiment-name blocklm-large-chinese \
       --model-parallel-size ${MP_SIZE} \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --seq-length 512 \
       --max-position-embeddings 1024 \
       --save ../model_save/checkpoints/ \
       --load ../model_save/checkpoints/ \
       --log-interval 50 \
       --eval-interval 1000 \
       --save-interval 2000 \
       --train-iters 250000000 \
       --train-data wudao \
       --resume-dataloader \
       --loader-scatter 4 \
       --no-lazy-loader \
       --tokenizer-type ChineseSPTokenizer \
       --fix-command-token \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr-decay-style cosine \
       --lr-decay-ratio 0.1 \
       --lr-decay-iters 200000 \
       --warmup 0.04 \
       --checkpoint-activations \
       --deepspeed-activation-checkpointing \
       --fp16 \
"
# Append the DeepSpeed engine flags and its JSON config path.
gpt_options="${gpt_options} \
       --deepspeed \
       --deepspeed_config ${config_json} \
"