Muennighoff committed on
Commit
13e40b3
1 Parent(s): 46d0e17

Create sbatch_2b855btasky.sh

Browse files
Files changed (1) hide show
  1. sbatch_2b855btasky.sh +162 -0
sbatch_2b855btasky.sh ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# Slurm batch script for the "2b855btasky" Megatron-DeepSpeed GPT pretraining
# run. It can be submitted with sbatch or executed directly; when executed
# outside of a Slurm job it submits itself (see the guard below).
#
# Resources requested by the directives: 32 nodes, 1 task per node,
# 8 mi250 GPUs per node, 48 hours of wall time. Job stdout/stderr are written
# to logs/<jobid>.out and logs/<jobid>.err.
#SBATCH --nodes=32
#SBATCH --ntasks-per-node=1
#SBATCH -p standard-g
#SBATCH -t 48:00:00
#SBATCH --gpus-per-node=mi250:8
#SBATCH --exclusive=user
#SBATCH --hint=nomultithread
#SBATCH --account=project_462000241
#SBATCH -o logs/%j.out
#SBATCH -e logs/%j.err

# Run identifier; the kill-switch, checkpoint and tensorboard paths are
# derived from it further down.
VARIANT=2b855btasky

# If run without sbatch, submit this script as a batch job and stop here.
# The expansion is quoted and given an empty default so this is a well-formed
# "unset or empty" check instead of relying on `[ -z ]` with no operand.
if [[ -z "${SLURM_JOB_ID:-}" ]]; then
  # The -o/-e directives above write into logs/, so make sure it exists
  # before the job is queued.
  mkdir -p logs
  sbatch "$0"
  exit
fi

# From here on, abort on command failures, on use of unset variables, and on
# a failure in any stage of a pipeline.
set -euo pipefail

# Symlink logs/latest.out and logs/latest.err to this job's log files.
# The link targets are relative names, so they resolve inside logs/.
for stream in out err; do
  ln -f -s "${SLURM_JOB_ID}.${stream}" "logs/latest.${stream}"
done

# Per-run locations, all derived from VARIANT.
KILL_SWITCH_PATH="kill-switch-${VARIANT}"
CHECKPOINT_PATH="checkpoints_${VARIANT}"
TENSORBOARD_PATH="tensorboard_${VARIANT}"

# Data
# Vocabulary and merges files, passed on as --vocab-file / --merge-file.
VOCAB_FILE=gpt2/vocab.json
MERGE_FILE=gpt2/merges.txt
# Previously used dataset prefixes, kept for reference:
#TRAIN_DATA_PATH="/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_10B_text_document"
#VAL_DATA_PATH="/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"

# Files handed to --train-weighted-split-paths-path and
# --valid-weighted-split-paths-path. The quoted line under each assignment is
# presumably the content of that file -- TODO confirm.
TRAIN_DATA_PATH="traintasky.txt"
# "train: 1.0 0:1 /scratch/project_462000241/data/tasky/gpt2tok_tasky_text_document"
VALID_DATA_PATH="val.txt"
# "validation_c4: 1.0 0:1 /scratch/project_462000241/data/c4_validation/gpt2tok_c4validation_rerun_text_document"

# Pipeline-parallel and tensor-parallel sizes passed to pretrain_gpt.py.
PP_SIZE=1
TP_SIZE=1

# Batch sizing.
# WORLD_SIZE is GPUs per node times number of nodes, both read from the
# environment variables Slurm exports inside the job. GLOBAL_BATCH_SIZE is
# the per-GPU micro batch scaled by WORLD_SIZE and the accumulation steps.
MICRO_BATCH_SIZE=2
GRADIENT_ACCUMULATION_STEPS=1
WORLD_SIZE=$(( SLURM_GPUS_ON_NODE * SLURM_JOB_NUM_NODES ))
GLOBAL_BATCH_SIZE=$(( MICRO_BATCH_SIZE * WORLD_SIZE * GRADIENT_ACCUMULATION_STEPS ))

# Model parameters
# model_params.sh is expected to define the PARAM_2980M array; its elements
# are read positionally as hidden size, FFN hidden size, KV channels,
# attention heads and layer count.
source model_params.sh
MODEL_PARAM=("${PARAM_2980M[@]}")
NHIDDEN="${MODEL_PARAM[0]}"
FFN_HIDDEN_SIZE="${MODEL_PARAM[1]}"
KV_SIZE="${MODEL_PARAM[2]}"
NHEADS="${MODEL_PARAM[3]}"
NLAYERS="${MODEL_PARAM[4]}"
SEQ_LEN=2048

# Record the selected architecture in the job log.
printf 'Model parameters: d_model %s ffw_size %s kv_size %s n_heads %s n_layers %s\n' \
  "$NHIDDEN" "$FFN_HIDDEN_SIZE" "$KV_SIZE" "$NHEADS" "$NLAYERS"

# Value passed to --save-interval.
SAVE_INTERVAL=10000

# Training length.
# Tokens: 55000000000
# -> Samples: 26855469 (55e9 tokens / 2048-token sequences, rounded up)
# Written as plain digits (no "_" separators) so the value is a valid integer
# for shell arithmetic as well as for whatever parses the command line.
TRAIN_SAMPLES=26855469

# Optimizer and learning-rate schedule flags. The learning rate decays over
# the whole run (TRAIN_SAMPLES); the warmup of 268555 samples is about 1% of
# TRAIN_SAMPLES.
OPTIMIZER_ARGS=" \
    --optimizer adam \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --adam-eps 1e-8 \
    --lr 2e-4 \
    --min-lr 2e-5 \
    --lr-decay-style cosine \
    --lr-decay-samples $TRAIN_SAMPLES \
    --lr-warmup-samples 268555 \
    --clip-grad 1.0 \
    --weight-decay 1e-1 \
    "

# Model, batch, tokenizer and precision flags for pretrain_gpt.py, followed
# by the optimizer flags.
# --clip-grad is intentionally not repeated here: OPTIMIZER_ARGS already
# carries it and is appended last, so a second copy in this list was
# redundant and would be overridden anyway.
GPT_ARGS=" \
    --num-layers $NLAYERS \
    --hidden-size $NHIDDEN \
    --num-attention-heads $NHEADS \
    --kv-channels $KV_SIZE \
    --ffn-hidden-size $FFN_HIDDEN_SIZE \
    --seq-length $SEQ_LEN \
    --max-position-embeddings $SEQ_LEN \
    --micro-batch-size $MICRO_BATCH_SIZE \
    --global-batch-size $GLOBAL_BATCH_SIZE \
    --train-samples $TRAIN_SAMPLES \
    --vocab-file $VOCAB_FILE \
    --merge-file $MERGE_FILE \
    --kill-switch-path $KILL_SWITCH_PATH \
    --bf16 \
    $OPTIMIZER_ARGS \
    "

# Logging, checkpoint-interval, evaluation and tensorboard flags.
OUTPUT_ARGS=" \
    --log-interval 10 \
    --save-interval ${SAVE_INTERVAL} \
    --eval-interval 100 \
    --eval-iters 100 \
    --tensorboard-dir ${TENSORBOARD_PATH} \
    --tensorboard-queue-size 5 \
    --log-timers-to-tensorboard \
    --log-batch-size-to-tensorboard \
    --log-validation-ppl-to-tensorboard \
    "

# DeepSpeed setup: the ZeRO stage, a per-job JSON config file, and the flags
# that point the training script at that file.
ZERO_STAGE=0

mkdir -p ds_configs
DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"

# The heredoc delimiter is left unquoted on purpose so the shell variables
# are expanded into the JSON. The redirect target is quoted so the path is
# always treated as a single word.
cat <<EOF > "$DS_CONFIG_PATH"
{
    "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
    "train_batch_size": $GLOBAL_BATCH_SIZE,
    "gradient_clipping": 1.0,
    "zero_optimization": {
        "stage": $ZERO_STAGE
    },
    "bf16": {
        "enabled": true
    },
    "steps_per_print": 2000,
    "wall_clock_breakdown": false
}
EOF

DEEPSPEED_ARGS=" \
    --deepspeed \
    --deepspeed_config $DS_CONFIG_PATH \
    --zero-stage $ZERO_STAGE \
    "

# Assemble the full training command line. CMD is a flat string that is
# deliberately expanded unquoted below, so it relies on word splitting and
# none of its pieces may contain whitespace.
CMD=" \
    Megatron-DeepSpeed/pretrain_gpt.py \
    --tensor-model-parallel-size $TP_SIZE \
    --pipeline-model-parallel-size $PP_SIZE \
    $GPT_ARGS \
    $OUTPUT_ARGS \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH \
    --train-weighted-split-paths-path $TRAIN_DATA_PATH \
    --valid-weighted-split-paths-path $VALID_DATA_PATH \
    --data-impl mmap \
    $DEEPSPEED_ARGS \
    "

# Log the command; left unquoted so the padding whitespace collapses.
# shellcheck disable=SC2086
echo $CMD

# SLURM_JOB_ID is used here (rather than the legacy SLURM_JOBID alias) to
# match the rest of this script.
echo "START $SLURM_JOB_ID: $(date)"

# bash launch_srun.sh $CMD
# shellcheck disable=SC2086
srun --label launch.sh $CMD

# With `set -e` active, this line is only reached when srun succeeded.
echo "END $SLURM_JOB_ID: $(date)"