set -eux

NOW=$(date +%Y-%m-%d-%H:%M:%S)
LLM_RECIPES_DIR=/project
source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
rm -f /tmp/hffs-*

export WANDB_NOTES="Train sample"
wandb login

NUM_GPU_PER_NODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
NUM_NODES=1
NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE}))

# training config
SEQ_LENGTH=4096
SLIDING_WINDOW_SIZE=131072
DATA_PARALLEL_SIZE=$NUM_GPUS
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=320
TRAIN_STEPS=20000
VALID_MICRO_BATCH_SIZE=1

# optimizer config
LR=2e-5
MIN_LR=1e-6
LR_WARMUP_STEPS=500
LR_DECAY_STEPS=$TRAIN_STEPS
WEIGHT_DECAY=0.1
GRAD_CLIP=1.0

# checkpoint & tokenizer
TOKENIZER_MODEL=/share/pretrained_lm/Phi/Phi-2
BASE_MODEL=$TOKENIZER_MODEL
LOAD_DIR=$BASE_MODEL
SAVE_DIR=/work/llm_recipes/models/yans-baseline-Phi-2
mkdir -p $(dirname $SAVE_DIR)
SAVE_BASE_NAME=$(basename $SAVE_DIR)
LOG_FILE_PATH=$SAVE_DIR/train_${NOW}.log
mkdir -p ${SAVE_DIR}

# data config (each entry: <weight> <dataset prefix>)
TRAIN_DATA_PATH="519177757 /work/llm_recipes/datasets/bin/baseline_phi2/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document"
TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 519177757 /work/llm_recipes/datasets/bin/baseline_phi2/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document"
VALID_DATA_PATH="519177757 /work/llm_recipes/datasets/bin/baseline_phi2/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document"
TEST_DATA_PATH=${VALID_DATA_PATH}

set +e
cd $LLM_RECIPES_DIR

# run
DISTRIBUTED_ARGS="--nproc_per_node $NUM_GPU_PER_NODE --nnodes 1 --node_rank 0 --master_addr localhost --master_port 8000"

torchrun $DISTRIBUTED_ARGS examples/finetuning.py \
    --seq-length ${SEQ_LENGTH} \
    --sliding-window-size ${SLIDING_WINDOW_SIZE} \
    --micro-batch-size ${MICRO_BATCH_SIZE} \
    --valid_micro_batch_size ${VALID_MICRO_BATCH_SIZE} \
    --global-batch-size ${GLOBAL_BATCH_SIZE} \
    --train-iters ${TRAIN_STEPS} \
    --tokenizer-type HFPreTrainedTokenizer \
    --tokenizer-model ${TOKENIZER_MODEL} \
    --train-data-path ${TRAIN_DATA_PATH} \
    --valid-data-path ${VALID_DATA_PATH} \
    --test-data-path ${TEST_DATA_PATH} \
    --lr ${LR} \
    --min-lr ${MIN_LR} \
    --lr-decay-style cosine \
    --lr-warmup-iters ${LR_WARMUP_STEPS} \
    --lr-decay-iters ${LR_DECAY_STEPS} \
    --weight-decay ${WEIGHT_DECAY} \
    --grad-clip-norm ${GRAD_CLIP} \
    --optimizer anyprecision \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --adam-eps 1e-6 \
    --save-interval 500 \
    --eval-interval 500 \
    --eval-iters 10 \
    --bf16 \
    --mixed-precision \
    --base-model ${BASE_MODEL} \
    --save ${SAVE_DIR} \
    --load ${SAVE_DIR} \
    --fsdp-activation-checkpointing \
    --sharding-strategy FULL_SHARD \
    --checkpoint-type LOCAL_STATE_DICT \
    --save-n-checkpoints 10 \
    --upload-all-checkpoints-to-hf \
    --hf-upload-retry-limit 2 \
    --hf-repo-id shirayukikun/$SAVE_BASE_NAME \
    --wandb-entity "keitokudo" \
    --wandb-project "llm_tutorial" \
    --wandb-name ${SAVE_BASE_NAME}_train_${NOW} 2>&1 | tee $LOG_FILE_PATH
    # --attn-implementation eager \
    # --upload-all-checkpoints-to-hf

rm -f /tmp/hffs-*
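
# ---------------------------------------------------------------------------
# Optional sanity check (an assumption, not part of the original recipe):
# trainers in the Megatron/llm-recipes family typically derive the number of
# gradient-accumulation steps as GLOBAL_BATCH_SIZE / (MICRO_BATCH_SIZE * data-
# parallel size), so the global batch size should divide evenly. If useful,
# move this block above the torchrun call.
if (( GLOBAL_BATCH_SIZE % (MICRO_BATCH_SIZE * NUM_GPUS) != 0 )); then
    echo "WARNING: GLOBAL_BATCH_SIZE=${GLOBAL_BATCH_SIZE} is not divisible by MICRO_BATCH_SIZE*NUM_GPUS=$((MICRO_BATCH_SIZE * NUM_GPUS))" >&2
fi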