# Teacher preparation: continue pretraining bert-base-uncased (MLM with
# segment-pair NSP) on English Wikipedia; the resulting checkpoint is used
# as the teacher model.
TEACHER_PATH=./bert-base-uncased-teacher-preparation-pretrain
OUTPUT_DIR=$TEACHER_PATH
# Cache directory for the preprocessed Wikipedia dataset (machine-specific path).
DATA_CACHE_DIR=/root/kaokao/Model-Compression-Research-Package/examples/transformers/language-modeling/wikipedia_processed_for_pretrain
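# Effective batch size: 8 GPUs x 8 sequences/GPU x 4 accumulation steps = 256.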
python -m torch.distributed.launch \
    --nproc_per_node=8 \
    ../../examples/transformers/language-modeling/run_mlm.py \
    --model_name_or_path bert-base-uncased \
    --datasets_name_config wikipedia:20200501.en \
    --data_process_type segment_pair_nsp \
    --dataset_cache_dir "$DATA_CACHE_DIR" \
    --do_train \
    --learning_rate 5e-5 \
    --max_steps 100000 \
    --warmup_ratio 0.01 \
    --weight_decay 0.01 \
    --per_device_train_batch_size 8 \
    --gradient_accumulation_steps 4 \
    --logging_steps 10 \
    --save_steps 5000 \
    --save_total_limit 2 \
    --output_dir "$OUTPUT_DIR" \
    --run_name pofa-teacher-prepare-pretrain
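
# Note: torch.distributed.launch is deprecated since PyTorch 1.10 in favor of
# torchrun. If the installed transformers version reads LOCAL_RANK from the
# environment (recent releases do; the version pinned by this repo may not),
# the same run can be launched as below, with the remaining flags unchanged:
#
#   torchrun --nproc_per_node=8 \
#       ../../examples/transformers/language-modeling/run_mlm.py \
#       --model_name_or_path bert-base-uncased ...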