#!/bin/bash
#
# Hyper-parameter sweep for the OCNLI task.
#
# Launches ./tasks/main.py through torch.distributed.launch once for every
# combination of learning rate, micro-batch size and epoch count listed in the
# loops below, starting from the checkpoint in ./yuyan-10b.
#
# Each run gets its own timestamped directory under outputs/${TASK}/; the
# run's stdout is copied to job.log inside that directory.
#
# All paths are relative, so start the script from the directory that holds
# clue_data/, tasks/, bert-vocab.txt and yuyan-10b/.

# Number of processes torch.distributed.launch starts on this (single) node.
WORLD_SIZE=8

# Launcher options, kept as an array so each option/value pair reaches the
# launcher as separate words without relying on unquoted word-splitting.
DISTRIBUTED_ARGS=(
  --nproc_per_node "$WORLD_SIZE"
  --nnodes 1
  --node_rank 0
  --master_addr localhost
  --master_port 6000
)

TASK="OCNLI"
TRAIN_DATA="clue_data/ocnli/train.json"
VALID_DATA="clue_data/ocnli/dev.json"
TEST_DATA="clue_data/ocnli/test.json"
PRETRAINED_CHECKPOINT="./yuyan-10b"

VOCAB_FILE=bert-vocab.txt

for lr in 2e-5 1e-5 7e-6; do
  for bs in 32 16; do
    for ep in 3 5 10 100; do
      # Timestamp (month, day, hour, minute, second) is part of the directory
      # name so repeated runs of one configuration do not share a directory.
      ct=$(date +"%m%d%H%M%S")
      OUTPUTS_PATH="outputs/${TASK}/yuyan_bs_${bs}_lr_${lr}_ep_${ep}_${ct}"

      if [[ ! -d "${OUTPUTS_PATH}" ]]; then
        mkdir -p "${OUTPUTS_PATH}"
      else
        echo "dir exist, not mkdir"
      fi

      python -m torch.distributed.launch "${DISTRIBUTED_ARGS[@]}" ./tasks/main.py \
        --task "$TASK" \
        --seed 1236 \
        --pretrained-checkpoint "$PRETRAINED_CHECKPOINT" \
        --train-data "$TRAIN_DATA" \
        --valid-data "$VALID_DATA" \
        --test-data "$TEST_DATA" \
        --tokenizer-type BertWordPieceLowerCase \
        --vocab-file "$VOCAB_FILE" \
        --epochs "$ep" \
        --tensor-model-parallel-size 8 \
        --num-layers 48 \
        --hidden-size 4096 \
        --num-attention-heads 64 \
        --micro-batch-size "$bs" \
        --lr "$lr" \
        --lr-decay-style linear \
        --lr-warmup-fraction 0.1 \
        --seq-length 128 \
        --max-position-embeddings 512 \
        --log-interval 10 \
        --eval-interval 800 \
        --eval-iters 50 \
        --weight-decay 1.0e-1 \
        --res-path "${OUTPUTS_PATH}" \
        --fp16 | tee "${OUTPUTS_PATH}/job.log"

      # The pipeline's own status is tee's, which would hide a failed run;
      # read the launcher's status instead. The sweep still moves on to the
      # next configuration, it just no longer fails silently.
      train_status=${PIPESTATUS[0]}
      if ((train_status != 0)); then
        echo "run failed (exit ${train_status}): ${OUTPUTS_PATH}" >&2
      fi
    done
  done
done