lm1-4b2-84b-c4-repetitions/4b284b6bc4subopt/sbatch_4b284b6bc4subopt.sh
#!/bin/bash
#SBATCH --nodes=32
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=40 ##############
#SBATCH --mem=256G ##############
#SBATCH -p standard-g ##############
#SBATCH -t 48:00:00 ##############
#SBATCH --gpus-per-node=mi250:8 ##############
#SBATCH --exclusive=user
#SBATCH --hint=nomultithread ##############
#SBATCH --account=project_462000119 ##############
#SBATCH -o logs/%j.out ##############
#SBATCH -e logs/%j.err ##############
VARIANT=4b284b6bc4subopt
# If run directly rather than via sbatch, submit this script as a batch job and exit.
if [ -z "${SLURM_JOB_ID:-}" ]; then
    mkdir -p logs
    sbatch "$0"
    exit
fi
set -euo pipefail
# symlink logs/latest.out and logs/latest.err to the current job's log files
ln -f -s "$SLURM_JOB_ID.out" logs/latest.out
ln -f -s "$SLURM_JOB_ID.err" logs/latest.err
KILL_SWITCH_PATH=kill-switch-$VARIANT
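# Creating a file at $KILL_SWITCH_PATH is expected to make the BigScience Megatron-DeepSpeed
# fork stop training gracefully at its next check (see --kill-switch-path below).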
CHECKPOINT_PATH=checkpoints_$VARIANT
TENSORBOARD_PATH=tensorboard_$VARIANT
# Data
VOCAB_FILE="gpt2/vocab.json"
MERGE_FILE="gpt2/merges.txt"
TRAIN_DATA_PATH=train6b.txt
# contents of train6b.txt: "train: 1.0 0:1 gpt2tok_c4_en_6B_text_document"
VALID_DATA_PATH=val.txt
# contents of val.txt: "validation: 1.0 0:1 gpt2tok_c4validation_text_document"
PP_SIZE=1
TP_SIZE=2
MICRO_BATCH_SIZE=2
GRADIENT_ACCUMULATION_STEPS=1
WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))
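# Worked example for the allocation requested above: 32 nodes x 8 GPUs = 256 GPUs,
# so GLOBAL_BATCH_SIZE = 2 x 256 x 1 = 512 sequences per optimizer step.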
# Model parameters
source model_params.sh
MODEL_PARAM=("${PARAM_2639M[@]}")
NHIDDEN=${MODEL_PARAM[0]}
FFN_HIDDEN_SIZE=${MODEL_PARAM[1]}
KV_SIZE=${MODEL_PARAM[2]}
NHEADS=${MODEL_PARAM[3]}
NLAYERS=${MODEL_PARAM[4]}
SEQ_LEN=2048
echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS"
SAVE_INTERVAL=10000
# Tokens: 84_000_000_000
# -> Samples: 84_000_000_000 tokens / 2048 tokens per sample = 41_015_625
# TRAIN_SAMPLES=41_015_625
TRAIN_SAMPLES=62_550_493
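# For reference: 62_550_493 samples * 2048 tokens/sample = 128_103_409_664 tokens (~128B).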
OPTIMIZER_ARGS=" \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-8 \
--lr 2e-4 \
--min-lr 2e-5 \
--lr-decay-style cosine \
--lr-decay-samples $TRAIN_SAMPLES \
--lr-warmup-samples 625_505 \
--clip-grad 1.0 \
--weight-decay 1e-1 \
"
GPT_ARGS=" \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--num-attention-heads $NHEADS \
--kv-channels $KV_SIZE \
--ffn-hidden-size $FFN_HIDDEN_SIZE \
--seq-length $SEQ_LEN \
--max-position-embeddings $SEQ_LEN \
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $GLOBAL_BATCH_SIZE \
--train-samples $TRAIN_SAMPLES \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--clip-grad 1.0 \
--kill-switch-path $KILL_SWITCH_PATH \
--bf16 \
$OPTIMIZER_ARGS \
"
OUTPUT_ARGS=" \
--log-interval 10 \
--save-interval $SAVE_INTERVAL \
--eval-interval 1000 \
--eval-iters 100 \
--tensorboard-dir $TENSORBOARD_PATH \
--tensorboard-queue-size 5 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
"
ZERO_STAGE=0
mkdir -p ds_configs
DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"
cat <<EOF > "$DS_CONFIG_PATH"
{
  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
  "train_batch_size": $GLOBAL_BATCH_SIZE,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": $ZERO_STAGE
  },
  "bf16": {
    "enabled": true
  },
  "steps_per_print": 2000,
  "wall_clock_breakdown": false
}
EOF
DEEPSPEED_ARGS=" \
--deepspeed \
--deepspeed_config $DS_CONFIG_PATH \
--zero-stage $ZERO_STAGE \
"
CMD=" \
Megatron-DeepSpeed/pretrain_gpt.py \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
$GPT_ARGS \
$OUTPUT_ARGS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--train-weighted-split-paths-path $TRAIN_DATA_PATH \
--valid-weighted-split-paths-path $VALID_DATA_PATH \
--data-impl mmap \
$DEEPSPEED_ARGS \
"
echo "$CMD"
echo "START $SLURM_JOBID: $(date)"
# bash launch_srun.sh $CMD
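# launch.sh (not included in this file) is assumed to set up the node-local environment
# (master address, ranks, etc.) and launch python with the arguments in $CMD on each process.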
srun --label launch.sh $CMD
echo "END $SLURM_JOBID: $(date)"