#!/bin/bash
#SBATCH --nodes=8
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=256G
#SBATCH -p pilot
#SBATCH -t 48:00:00
#SBATCH --gpus-per-node=mi250:8
#SBATCH --exclusive=user
#SBATCH --hint=nomultithread
#SBATCH --account=project_462000119
#SBATCH -o logs/%j.out
#SBATCH -e logs/%j.err

# If run without sbatch, submit this script to sbatch and exit.
#if [ -z "${SLURM_JOB_ID:-}" ]; then
#    mkdir -p logs
#    sbatch "$0"
#    exit
#fi

VARIANT=7b1xp3ru

set -euo pipefail

# Symlink logs/latest.out and logs/latest.err to this job's log files.
ln -f -s $SLURM_JOB_ID.out logs/latest.out
ln -f -s $SLURM_JOB_ID.err logs/latest.err

# Creating the kill-switch file signals the training run to exit cleanly.
KILL_SWITCH_PATH=kill-switch-$VARIANT
CHECKPOINT_PATH=checkpoints_$VARIANT
TENSORBOARD_PATH=tensorboard_$VARIANT

# Data
TOKENIZER_NAME_OR_PATH=bigscience/tokenizer
TRAIN_DATA_PATH=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-mtf/xp3_train_ru.txt
VALID_DATA_PATH=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-mtf/xp3_validation_ru.txt

# No pipeline or tensor parallelism: the model is replicated and trained
# purely data-parallel, with ZeRO stage 1 sharding the optimizer states.
PP_SIZE=1
TP_SIZE=1

# With 8 nodes x 8 GPUs, WORLD_SIZE is 64, so
# GLOBAL_BATCH_SIZE = 2 * 64 * 16 = 2048 samples per optimizer step.
MICRO_BATCH_SIZE=2
GRADIENT_ACCUMULATION_STEPS=16
WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))

# Model parameters
NLAYERS=30
NHIDDEN=4096
NHEADS=32
SEQ_LEN=2048
TRAIN_SAMPLES=6_348_800

SAVE_INTERVAL=500

ZERO_STAGE=1

# Write the DeepSpeed config for this job.
mkdir -p ds_configs
config_json="ds_configs/$SLURM_JOB_ID.json"
cat <<EOT > $config_json
{
    "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
    "train_batch_size": $GLOBAL_BATCH_SIZE,
    "gradient_clipping": 1.0,
    "zero_optimization": {
        "stage": $ZERO_STAGE
    },
    "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 500,
        "hysteresis": 2,
        "min_loss_scale": 1,
        "initial_scale_power": 12
    },
    "steps_per_print": 2000,
    "wall_clock_breakdown": false
}
EOT

CMD=" \
    Megatron-DeepSpeed/finetune_t0.py \
    --tensor-model-parallel-size $TP_SIZE \
    --pipeline-model-parallel-size $PP_SIZE \
    --num-layers $NLAYERS \
    --hidden-size $NHIDDEN \
    --num-attention-heads $NHEADS \
    --seq-length $SEQ_LEN \
    --max-position-embeddings $SEQ_LEN \
    --micro-batch-size $MICRO_BATCH_SIZE \
    --global-batch-size $GLOBAL_BATCH_SIZE \
    --train-samples $TRAIN_SAMPLES \
    --tokenizer-type PretrainedFromHF \
    --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \
    --init-method-std 0.0048 \
    --embed-layernorm \
    --fp16 \
    --seed 42 \
    --position-embedding-type alibi \
    --abort-on-unmet-fused-kernel-constraints \
    --clip-grad 1.0 \
    --kill-switch-path $KILL_SWITCH_PATH \
    --checkpoint-activations \
    --pad-vocab-size-to 250880 \
    --optimizer adam \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --adam-eps 1e-8 \
    --lr 2e-5 \
    --lr-decay-style constant \
    --lr-warmup-samples 0 \
    --weight-decay 1e-4 \
    --no-load-optim \
    --reset-progress \
    --norm-target-loss \
    --log-interval 10 \
    --save-interval $SAVE_INTERVAL \
    --eval-interval 500 \
    --eval-iters 1 \
    --tensorboard-dir $TENSORBOARD_PATH \
    --tensorboard-queue-size 5 \
    --log-timers-to-tensorboard \
    --log-batch-size-to-tensorboard \
    --log-validation-ppl-to-tensorboard \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH \
    --train-weighted-split-paths-path $TRAIN_DATA_PATH \
    --valid-weighted-split-paths-path $VALID_DATA_PATH \
    --dataloader-type single \
    --data-impl mmap \
    --deepspeed \
    --deepspeed_config $config_json \
    --zero-stage $ZERO_STAGE \
    "

echo "$CMD"

echo "START $SLURM_JOBID: $(date)"

srun --label launch.sh $CMD

echo "END $SLURM_JOBID: $(date)"
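
# ---------------------------------------------------------------------------
# Usage sketch. Assumptions (not stated in the script itself): the file is
# saved as finetune_7b1xp3ru.sh (hypothetical name), and Megatron-DeepSpeed/
# and launch.sh live in the submission directory.
#
#   mkdir -p logs                  # SBATCH -o/-e write here, so it must exist
#   sbatch finetune_7b1xp3ru.sh    # submit the job
#   squeue -u $USER                # check queue/run state
#   tail -f logs/latest.out        # follow training output via the symlink
#
# To request a clean early stop, create the file passed to
# --kill-switch-path; the trainer watches for it:
#
#   touch kill-switch-7b1xp3ru
# ---------------------------------------------------------------------------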