Muennighoff committed on
Commit
6498905
1 Parent(s): b383632
Files changed (47)
  1. 8b7178b178b/3430821.err +0 -0
  2. 8b7178b178b/3430821.out +0 -0
  3. 8b7178b178b/latest +1 -0
  4. 8b7178b178b/sbatch_8b7178b178bval.sh +173 -0
  5. 8b7178b178b/tensorboard_8b7178b178bval/events.out.tfevents.1682714736.nid006757.9395.0 +3 -0
  6. 8b7178b178b/tensorboard_8b7178b178bval/events.out.tfevents.1682715750.nid006757.18568.0 +3 -0
  7. 8b7178b178b/tensorboard_8b7178b178bval/events.out.tfevents.1682716321.nid006401.79744.0 +3 -0
  8. 8b7178b178b/tensorboard_8b7178b178bval/events.out.tfevents.1682718055.nid006004.12707.0 +3 -0
  9. 8b7178b178b/tensorboard_8b7178b178bval/events.out.tfevents.1682747563.nid006443.125406.0 +3 -0
  10. 8b7178b178b/tensorboard_8b7178b178bval/events.out.tfevents.1682750644.nid006808.34149.0 +3 -0
  11. 8b7178b25b/3430923.err +0 -0
  12. 8b7178b25b/3430923.out +0 -0
  13. 8b7178b25b/latest +1 -0
  14. 8b7178b25b/sbatch_8b7178b25bval.sh +172 -0
  15. 8b7178b25b/tensorboard_8b7178b25bval/events.out.tfevents.1682747563.nid006004.70670.0 +3 -0
  16. 8b7178b25b/tensorboard_8b7178b25bval/events.out.tfevents.1682755053.nid006808.64028.0 +3 -0
  17. 8b7178b28b/events.out.tfevents.1682718827.nid006697.29435.0 +3 -0
  18. 8b7178b28b/events.out.tfevents.1682719334.nid007026.3295.0 +3 -0
  19. 8b7178b35b/3430973.err +0 -0
  20. 8b7178b35b/3430973.out +0 -0
  21. 8b7178b35b/latest +1 -0
  22. 8b7178b35b/sbatch_8b7178b35bval.sh +172 -0
  23. 8b7178b35b/tensorboard_8b7178b35bval/events.out.tfevents.1682718827.nid007222.7917.0 +3 -0
  24. 8b7178b35b/tensorboard_8b7178b35bval/events.out.tfevents.1682719334.nid006443.81826.0 +3 -0
  25. 8b7178b35b/tensorboard_8b7178b35bval/events.out.tfevents.1682755053.nid007165.106247.0 +3 -0
  26. 8b7178b35b/tensorboard_8b7178b35bval/events.out.tfevents.1682756717.nid007165.119497.0 +3 -0
  27. 8b7178b35b/tensorboard_8b7178b35bval/events.out.tfevents.1682757384.nid006808.89370.0 +3 -0
  28. 8b7178b44b/3430925.err +0 -0
  29. 8b7178b44b/3430925.out +0 -0
  30. 8b7178b44b/latest +1 -0
  31. 8b7178b44b/sbatch_8b7178b44bval.sh +172 -0
  32. 8b7178b44b/tensorboard_8b7178b44bval/events.out.tfevents.1682718827.nid006888.86342.0 +3 -0
  33. 8b7178b44b/tensorboard_8b7178b44bval/events.out.tfevents.1682755053.nid007026.114219.0 +3 -0
  34. 8b7178b55b/events.out.tfevents.1682718827.nid005807.38076.0 +3 -0
  35. 8b7178b55b/events.out.tfevents.1682755053.nid006888.50037.0 +3 -0
  36. 8b7178b58b/3430964.err +0 -0
  37. 8b7178b58b/3430964.out +0 -0
  38. 8b7178b58b/latest +1 -0
  39. 8b7178b58b/sbatch_8b7178b58bval.sh +172 -0
  40. 8b7178b58b/tensorboard_8b7178b58bval/events.out.tfevents.1682756534.nid006808.73988.0 +3 -0
  41. 8b7178b58b/tensorboard_8b7178b58bval/events.out.tfevents.1682756738.nid006808.81065.0 +3 -0
  42. 8b7178b88b/3430928.err +0 -0
  43. 8b7178b88b/3430928.out +0 -0
  44. 8b7178b88b/latest +1 -0
  45. 8b7178b88b/sbatch_8b7178b88bval.sh +172 -0
  46. 8b7178b88b/tensorboard_8b7178b88bval/events.out.tfevents.1682718827.nid007026.128605.0 +3 -0
  47. 8b7178b88b/tensorboard_8b7178b88bval/events.out.tfevents.1682755053.nid007222.92615.0 +3 -0
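
All of these directories come from the same sweep: the repository name (lm1-8b7-178b-c4-repetitions) suggests the 8b7 model (~8.7B parameters) trained for a 178b-token budget on C4 with varying amounts of unique data, so 8b7178b25b, 8b7178b35b, ..., 8b7178b178b appear to differ only in the unique-token budget named by the final segment. Each directory carries some mix of a validation sbatch script, SLURM logs, a latest checkpoint pointer, and TensorBoard event files.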
8b7178b178b/3430821.err ADDED
The diff for this file is too large to render. See raw diff
 
8b7178b178b/3430821.out ADDED
The diff for this file is too large to render. See raw diff
 
8b7178b178b/latest ADDED
@@ -0,0 +1 @@
+ global_step84877
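
latest is the checkpoint index Megatron-DeepSpeed maintains next to its checkpoints: a one-line file naming the tag to resume from, which --load $CHECKPOINT_PATH in the script below reads to find the weights:

cat 8b7178b178b/latest
# global_step84877  -> checkpoint files live under 8b7178b178b/global_step84877/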
8b7178b178b/sbatch_8b7178b178bval.sh ADDED
@@ -0,0 +1,173 @@
+ #!/bin/bash
+ #SBATCH --exclude=nid007542
+ #SBATCH --nodes=64
+ #SBATCH --ntasks-per-node=1
+ #SBATCH --cpus-per-task=40
+ #SBATCH --mem=256G
+ #SBATCH -p standard-g
+ #SBATCH -t 48:00:00
+ #SBATCH --gpus-per-node=mi250:8
+ #SBATCH --exclusive=user
+ #SBATCH --hint=nomultithread
+ #SBATCH --account=project_462000119
+ #SBATCH -o logs/%j.out
+ #SBATCH -e logs/%j.err
+
+ VARIANT=8b7178b178bval
+ VARIANT_CKPT=lm1-8b7-178b-c4-repetitions/8b7178b178b
+
+ # if run without sbatch, invoke here
+ if [ -z $SLURM_JOB_ID ]; then
+ mkdir -p logs
+ sbatch "$0"
+ exit
+ fi
+
+ set -euo pipefail
+
+ # symlink logs/latest.out and logs/latest.err
+ ln -f -s $SLURM_JOB_ID.out logs/latest.out
+ ln -f -s $SLURM_JOB_ID.err logs/latest.err
+
+ KILL_SWITCH_PATH=kill-switch-$VARIANT
+ CHECKPOINT_PATH=$VARIANT_CKPT
+ TENSORBOARD_PATH=tensorboard_$VARIANT
+
+ # Data
+ VOCAB_FILE="gpt2/vocab.json"
+ MERGE_FILE="gpt2/merges.txt"
+ #DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document"
+
+ TRAIN_DATA_PATH=train400m.txt
+ # "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_12B_text_document"
+ VALID_DATA_PATH=val.txt
+ # "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"
+
+ PP_SIZE=2
+ TP_SIZE=2
+
+ MICRO_BATCH_SIZE=1
+ GRADIENT_ACCUMULATION_STEPS=1
+ WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
+ GLOBAL_BATCH_SIZE=512
+ #$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))
+
+ # Model parameters
+ source model_params.sh
+ MODEL_PARAM=("${PARAM_9293M[@]}")
+ NHIDDEN=${MODEL_PARAM[0]}
+ FFN_HIDDEN_SIZE=${MODEL_PARAM[1]}
+ KV_SIZE=${MODEL_PARAM[2]}
+ NHEADS=${MODEL_PARAM[3]}
+ NLAYERS=${MODEL_PARAM[4]}
+ SEQ_LEN=2048
+
+ echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS"
+
+ SAVE_INTERVAL=5000
+
+ # Tokens: 11522010000
+ # -> Samples: 5625981
+ TRAIN_SAMPLES=1
+
+ OPTIMIZER_ARGS=" \
+ --optimizer adam \
+ --adam-beta1 0.9 \
+ --adam-beta2 0.999 \
+ --adam-eps 1e-8 \
+ --lr 2e-4 \
+ --min-lr 2e-5 \
+ --lr-decay-style cosine \
+ --lr-decay-samples $TRAIN_SAMPLES \
+ --lr-warmup-samples 0 \
+ --clip-grad 1.0 \
+ --weight-decay 1e-1 \
+ --override-lr-scheduler \
+ --reset-progress \
+ --no-load-optim \
+ "
+
+ GPT_ARGS=" \
+ --num-layers $NLAYERS \
+ --hidden-size $NHIDDEN \
+ --num-attention-heads $NHEADS \
+ --kv-channels $KV_SIZE \
+ --ffn-hidden-size $FFN_HIDDEN_SIZE \
+ --seq-length $SEQ_LEN \
+ --max-position-embeddings $SEQ_LEN \
+ --micro-batch-size $MICRO_BATCH_SIZE \
+ --global-batch-size $GLOBAL_BATCH_SIZE \
+ --train-samples $TRAIN_SAMPLES \
+ --vocab-file $VOCAB_FILE \
+ --merge-file $MERGE_FILE \
+ --clip-grad 1.0 \
+ --kill-switch-path $KILL_SWITCH_PATH \
+ --bf16 \
+ $OPTIMIZER_ARGS \
+ "
+
+ OUTPUT_ARGS=" \
+ --log-interval 10 \
+ --save-interval $SAVE_INTERVAL \
+ --eval-interval 1 \
+ --eval-iters 100 \
+ --eval-only true \
+ --tensorboard-dir $TENSORBOARD_PATH \
+ --tensorboard-queue-size 5 \
+ --log-timers-to-tensorboard \
+ --log-batch-size-to-tensorboard \
+ --log-validation-ppl-to-tensorboard \
+ "
+
+ ZERO_STAGE=0
+
+ mkdir -p ds_configs
+ DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"
+
+ cat <<EOF > $DS_CONFIG_PATH
+ {
+ "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
+ "train_batch_size": $GLOBAL_BATCH_SIZE,
+ "gradient_clipping": 1.0,
+ "zero_optimization": {
+ "stage": $ZERO_STAGE
+ },
+ "bf16": {
+ "enabled": true
+ },
+ "steps_per_print": 2000,
+ "wall_clock_breakdown": false
+ }
+ EOF
+
+ DEEPSPEED_ARGS=" \
+ --deepspeed \
+ --deepspeed_config $DS_CONFIG_PATH \
+ --zero-stage $ZERO_STAGE \
+ "
+
+ CMD=" \
+ Megatron-DeepSpeed/pretrain_gpt.py \
+ --tensor-model-parallel-size $TP_SIZE \
+ --pipeline-model-parallel-size $PP_SIZE \
+ $GPT_ARGS \
+ $OUTPUT_ARGS \
+ --save $CHECKPOINT_PATH \
+ --load $CHECKPOINT_PATH \
+ --train-weighted-split-paths-path $TRAIN_DATA_PATH \
+ --valid-weighted-split-paths-path $VALID_DATA_PATH \
+ --data-impl mmap \
+ --num-workers 0 \
+ --valid-num-workers 0 \
+ $DEEPSPEED_ARGS \
+ "
+
+ echo $CMD
+
+ echo "START $SLURM_JOBID: $(date)"
+
+ # bash launch_srun.sh $CMD
+ srun --label launch.sh $CMD
+
+ echo "END $SLURM_JOBID: $(date)"
+
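
Two details of this script recur in every variant below. First, it is self-submitting: invoked directly, SLURM_JOB_ID is unset, so it creates logs/ and resubmits itself via sbatch; under the scheduler the guard is skipped and the job body runs. The unquoted $SLURM_JOB_ID only works because the test precedes set -u; a quoted sketch of the same idiom:

#!/bin/bash
# SLURM defines SLURM_JOB_ID only inside an allocation, so this
# guard fires exactly once and re-queues the same file via sbatch.
if [ -z "${SLURM_JOB_ID:-}" ]; then
    mkdir -p logs
    sbatch "$0"
    exit
fi

Second, the job geometry explains the hard-coded batch size: --gpus-per-node=mi250:8 on 64 nodes gives WORLD_SIZE=$((8*64))=512, so GLOBAL_BATCH_SIZE=512 equals the commented-out formula MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS = 1*512*1.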
8b7178b178b/tensorboard_8b7178b178bval/events.out.tfevents.1682714736.nid006757.9395.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5542c39702cb30793aa90d4369c5c39ff19c328b2fe2d9050cf5fe86133baa1f
+ size 980
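
The event files in this commit are Git LFS pointer stubs, not the TensorBoard binaries themselves: three lines giving the pointer spec version, the sha256 of the stored object, and its byte size (the 40-byte ones are likely little more than an empty event file's header). The same stub can be regenerated for a local file with the stock CLI:

git lfs pointer --file=events.out.tfevents.1682714736.nid006757.9395.0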
8b7178b178b/tensorboard_8b7178b178bval/events.out.tfevents.1682715750.nid006757.18568.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:56ea33b121154fb8f3c107049f6aa229463283a983efabfd749113bda41cbbb1
+ size 980
8b7178b178b/tensorboard_8b7178b178bval/events.out.tfevents.1682716321.nid006401.79744.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d4cd8c3c3f9a6141bed384d0b1162995e7f65908e260918b8b8b3aa3ee3eaade
+ size 40
8b7178b178b/tensorboard_8b7178b178bval/events.out.tfevents.1682718055.nid006004.12707.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cb41e17cbe01233ac3d2c82899b1cb160511922e19e4adbc9b4c448c7a24a063
+ size 980
8b7178b178b/tensorboard_8b7178b178bval/events.out.tfevents.1682747563.nid006443.125406.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ae1c6676a155204f70686cb112f9b94ea242773aa5c096bd6f556092bf9d94d8
+ size 40
8b7178b178b/tensorboard_8b7178b178bval/events.out.tfevents.1682750644.nid006808.34149.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b5dd71824827897bf129fb5b849a6c6e0d5efaa803f59459b0bf2906d7007c4f
+ size 980
8b7178b25b/3430923.err ADDED
The diff for this file is too large to render. See raw diff
 
8b7178b25b/3430923.out ADDED
The diff for this file is too large to render. See raw diff
 
8b7178b25b/latest ADDED
@@ -0,0 +1 @@
+ global_step84877
8b7178b25b/sbatch_8b7178b25bval.sh ADDED
@@ -0,0 +1,172 @@
+ #!/bin/bash
+ #SBATCH --exclude=nid007542
+ #SBATCH --nodes=64
+ #SBATCH --ntasks-per-node=1
+ #SBATCH --cpus-per-task=40
+ #SBATCH --mem=256G
+ #SBATCH -p standard-g
+ #SBATCH -t 48:00:00
+ #SBATCH --gpus-per-node=mi250:8
+ #SBATCH --exclusive=user
+ #SBATCH --hint=nomultithread
+ #SBATCH --account=project_462000119
+ #SBATCH -o logs/%j.out
+ #SBATCH -e logs/%j.err
+
+ VARIANT=8b7178b25bval
+ VARIANT_CKPT=lm1-8b7-178b-c4-repetitions/8b7178b25b
+
+ # if run without sbatch, invoke here
+ if [ -z $SLURM_JOB_ID ]; then
+ mkdir -p logs
+ sbatch "$0"
+ exit
+ fi
+
+ set -euo pipefail
+
+ # symlink logs/latest.out and logs/latest.err
+ ln -f -s $SLURM_JOB_ID.out logs/latest.out
+ ln -f -s $SLURM_JOB_ID.err logs/latest.err
+
+ KILL_SWITCH_PATH=kill-switch-$VARIANT
+ CHECKPOINT_PATH=$VARIANT_CKPT
+ TENSORBOARD_PATH=tensorboard_$VARIANT
+
+ # Data
+ VOCAB_FILE="gpt2/vocab.json"
+ MERGE_FILE="gpt2/merges.txt"
+ #DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document"
+
+ TRAIN_DATA_PATH=train400m.txt
+ # "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_12B_text_document"
+ VALID_DATA_PATH=val.txt
+ # "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"
+
+ PP_SIZE=2
+ TP_SIZE=2
+
+ MICRO_BATCH_SIZE=1
+ GRADIENT_ACCUMULATION_STEPS=1
+ WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
+ GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))
+
+ # Model parameters
+ source model_params.sh
+ MODEL_PARAM=("${PARAM_9293M[@]}")
+ NHIDDEN=${MODEL_PARAM[0]}
+ FFN_HIDDEN_SIZE=${MODEL_PARAM[1]}
+ KV_SIZE=${MODEL_PARAM[2]}
+ NHEADS=${MODEL_PARAM[3]}
+ NLAYERS=${MODEL_PARAM[4]}
+ SEQ_LEN=2048
+
+ echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS"
+
+ SAVE_INTERVAL=5000
+
+ # Tokens: 11522010000
+ # -> Samples: 5625981
+ TRAIN_SAMPLES=1
+
+ OPTIMIZER_ARGS=" \
+ --optimizer adam \
+ --adam-beta1 0.9 \
+ --adam-beta2 0.999 \
+ --adam-eps 1e-8 \
+ --lr 2e-4 \
+ --min-lr 2e-5 \
+ --lr-decay-style cosine \
+ --lr-decay-samples $TRAIN_SAMPLES \
+ --lr-warmup-samples 0 \
+ --clip-grad 1.0 \
+ --weight-decay 1e-1 \
+ --override-lr-scheduler \
+ --reset-progress \
+ --no-load-optim \
+ "
+
+ GPT_ARGS=" \
+ --num-layers $NLAYERS \
+ --hidden-size $NHIDDEN \
+ --num-attention-heads $NHEADS \
+ --kv-channels $KV_SIZE \
+ --ffn-hidden-size $FFN_HIDDEN_SIZE \
+ --seq-length $SEQ_LEN \
+ --max-position-embeddings $SEQ_LEN \
+ --micro-batch-size $MICRO_BATCH_SIZE \
+ --global-batch-size $GLOBAL_BATCH_SIZE \
+ --train-samples $TRAIN_SAMPLES \
+ --vocab-file $VOCAB_FILE \
+ --merge-file $MERGE_FILE \
+ --clip-grad 1.0 \
+ --kill-switch-path $KILL_SWITCH_PATH \
+ --bf16 \
+ $OPTIMIZER_ARGS \
+ "
+
+ OUTPUT_ARGS=" \
+ --log-interval 10 \
+ --save-interval $SAVE_INTERVAL \
+ --eval-interval 1 \
+ --eval-iters 100 \
+ --eval-only true \
+ --tensorboard-dir $TENSORBOARD_PATH \
+ --tensorboard-queue-size 5 \
+ --log-timers-to-tensorboard \
+ --log-batch-size-to-tensorboard \
+ --log-validation-ppl-to-tensorboard \
+ "
+
+ ZERO_STAGE=0
+
+ mkdir -p ds_configs
+ DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"
+
+ cat <<EOF > $DS_CONFIG_PATH
+ {
+ "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
+ "train_batch_size": $GLOBAL_BATCH_SIZE,
+ "gradient_clipping": 1.0,
+ "zero_optimization": {
+ "stage": $ZERO_STAGE
+ },
+ "bf16": {
+ "enabled": true
+ },
+ "steps_per_print": 2000,
+ "wall_clock_breakdown": false
+ }
+ EOF
+
+ DEEPSPEED_ARGS=" \
+ --deepspeed \
+ --deepspeed_config $DS_CONFIG_PATH \
+ --zero-stage $ZERO_STAGE \
+ "
+
+ CMD=" \
+ Megatron-DeepSpeed/pretrain_gpt.py \
+ --tensor-model-parallel-size $TP_SIZE \
+ --pipeline-model-parallel-size $PP_SIZE \
+ $GPT_ARGS \
+ $OUTPUT_ARGS \
+ --save $CHECKPOINT_PATH \
+ --load $CHECKPOINT_PATH \
+ --train-weighted-split-paths-path $TRAIN_DATA_PATH \
+ --valid-weighted-split-paths-path $VALID_DATA_PATH \
+ --data-impl mmap \
+ --num-workers 0 \
+ --valid-num-workers 0 \
+ $DEEPSPEED_ARGS \
+ "
+
+ echo $CMD
+
+ echo "START $SLURM_JOBID: $(date)"
+
+ # bash launch_srun.sh $CMD
+ srun --label launch.sh $CMD
+
+ echo "END $SLURM_JOBID: $(date)"
+
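
This script is identical to sbatch_8b7178b178bval.sh apart from VARIANT/VARIANT_CKPT and one line: GLOBAL_BATCH_SIZE is now computed from the job geometry rather than hard-coded. Under these SBATCH settings the two forms agree, as a one-liner confirms:

SLURM_GPUS_ON_NODE=8 SLURM_JOB_NUM_NODES=64 \
bash -c 'echo $((1 * SLURM_GPUS_ON_NODE * SLURM_JOB_NUM_NODES * 1))'   # prints 512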
8b7178b25b/tensorboard_8b7178b25bval/events.out.tfevents.1682747563.nid006004.70670.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50dfe80463282d616d29275c3932dc5f0555fc6698846cd3d19be4d475489d40
+ size 980
8b7178b25b/tensorboard_8b7178b25bval/events.out.tfevents.1682755053.nid006808.64028.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4590dfe90c7d7e1c30e30b9c88bb4019f987f597ac98f9980d6f487c310f0f93
+ size 980
8b7178b28b/events.out.tfevents.1682718827.nid006697.29435.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3f1717a7427c157695d446decc0538688cc9d5fa4081cf18a0c4cd0350e9ba6c
+ size 980
8b7178b28b/events.out.tfevents.1682719334.nid007026.3295.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9bf1fca79277013e5fba0a634b8bbde837ba63739545079b5c1e3ba348dcff26
+ size 980
8b7178b35b/3430973.err ADDED
The diff for this file is too large to render. See raw diff
 
8b7178b35b/3430973.out ADDED
The diff for this file is too large to render. See raw diff
 
8b7178b35b/latest ADDED
@@ -0,0 +1 @@
+ global_step84877
8b7178b35b/sbatch_8b7178b35bval.sh ADDED
@@ -0,0 +1,172 @@
+ #!/bin/bash
+ #SBATCH --exclude=nid007542
+ #SBATCH --nodes=64
+ #SBATCH --ntasks-per-node=1
+ #SBATCH --cpus-per-task=40
+ #SBATCH --mem=256G
+ #SBATCH -p standard-g
+ #SBATCH -t 48:00:00
+ #SBATCH --gpus-per-node=mi250:8
+ #SBATCH --exclusive=user
+ #SBATCH --hint=nomultithread
+ #SBATCH --account=project_462000119
+ #SBATCH -o logs/%j.out
+ #SBATCH -e logs/%j.err
+
+ VARIANT=8b7178b35bval
+ VARIANT_CKPT=lm1-8b7-178b-c4-repetitions/8b7178b35b
+
+ # if run without sbatch, invoke here
+ if [ -z $SLURM_JOB_ID ]; then
+ mkdir -p logs
+ sbatch "$0"
+ exit
+ fi
+
+ set -euo pipefail
+
+ # symlink logs/latest.out and logs/latest.err
+ ln -f -s $SLURM_JOB_ID.out logs/latest.out
+ ln -f -s $SLURM_JOB_ID.err logs/latest.err
+
+ KILL_SWITCH_PATH=kill-switch-$VARIANT
+ CHECKPOINT_PATH=$VARIANT_CKPT
+ TENSORBOARD_PATH=tensorboard_$VARIANT
+
+ # Data
+ VOCAB_FILE="gpt2/vocab.json"
+ MERGE_FILE="gpt2/merges.txt"
+ #DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document"
+
+ TRAIN_DATA_PATH=train400m.txt
+ # "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_12B_text_document"
+ VALID_DATA_PATH=val.txt
+ # "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"
+
+ PP_SIZE=2
+ TP_SIZE=2
+
+ MICRO_BATCH_SIZE=1
+ GRADIENT_ACCUMULATION_STEPS=1
+ WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
+ GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))
+
+ # Model parameters
+ source model_params.sh
+ MODEL_PARAM=("${PARAM_9293M[@]}")
+ NHIDDEN=${MODEL_PARAM[0]}
+ FFN_HIDDEN_SIZE=${MODEL_PARAM[1]}
+ KV_SIZE=${MODEL_PARAM[2]}
+ NHEADS=${MODEL_PARAM[3]}
+ NLAYERS=${MODEL_PARAM[4]}
+ SEQ_LEN=2048
+
+ echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS"
+
+ SAVE_INTERVAL=5000
+
+ # Tokens: 11522010000
+ # -> Samples: 5625981
+ TRAIN_SAMPLES=1
+
+ OPTIMIZER_ARGS=" \
+ --optimizer adam \
+ --adam-beta1 0.9 \
+ --adam-beta2 0.999 \
+ --adam-eps 1e-8 \
+ --lr 2e-4 \
+ --min-lr 2e-5 \
+ --lr-decay-style cosine \
+ --lr-decay-samples $TRAIN_SAMPLES \
+ --lr-warmup-samples 0 \
+ --clip-grad 1.0 \
+ --weight-decay 1e-1 \
+ --override-lr-scheduler \
+ --reset-progress \
+ --no-load-optim \
+ "
+
+ GPT_ARGS=" \
+ --num-layers $NLAYERS \
+ --hidden-size $NHIDDEN \
+ --num-attention-heads $NHEADS \
+ --kv-channels $KV_SIZE \
+ --ffn-hidden-size $FFN_HIDDEN_SIZE \
+ --seq-length $SEQ_LEN \
+ --max-position-embeddings $SEQ_LEN \
+ --micro-batch-size $MICRO_BATCH_SIZE \
+ --global-batch-size $GLOBAL_BATCH_SIZE \
+ --train-samples $TRAIN_SAMPLES \
+ --vocab-file $VOCAB_FILE \
+ --merge-file $MERGE_FILE \
+ --clip-grad 1.0 \
+ --kill-switch-path $KILL_SWITCH_PATH \
+ --bf16 \
+ $OPTIMIZER_ARGS \
+ "
+
+ OUTPUT_ARGS=" \
+ --log-interval 10 \
+ --save-interval $SAVE_INTERVAL \
+ --eval-interval 1 \
+ --eval-iters 100 \
+ --eval-only true \
+ --tensorboard-dir $TENSORBOARD_PATH \
+ --tensorboard-queue-size 5 \
+ --log-timers-to-tensorboard \
+ --log-batch-size-to-tensorboard \
+ --log-validation-ppl-to-tensorboard \
+ "
+
+ ZERO_STAGE=0
+
+ mkdir -p ds_configs
+ DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"
+
+ cat <<EOF > $DS_CONFIG_PATH
+ {
+ "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
+ "train_batch_size": $GLOBAL_BATCH_SIZE,
+ "gradient_clipping": 1.0,
+ "zero_optimization": {
+ "stage": $ZERO_STAGE
+ },
+ "bf16": {
+ "enabled": true
+ },
+ "steps_per_print": 2000,
+ "wall_clock_breakdown": false
+ }
+ EOF
+
+ DEEPSPEED_ARGS=" \
+ --deepspeed \
+ --deepspeed_config $DS_CONFIG_PATH \
+ --zero-stage $ZERO_STAGE \
+ "
+
+ CMD=" \
+ Megatron-DeepSpeed/pretrain_gpt.py \
+ --tensor-model-parallel-size $TP_SIZE \
+ --pipeline-model-parallel-size $PP_SIZE \
+ $GPT_ARGS \
+ $OUTPUT_ARGS \
+ --save $CHECKPOINT_PATH \
+ --load $CHECKPOINT_PATH \
+ --train-weighted-split-paths-path $TRAIN_DATA_PATH \
+ --valid-weighted-split-paths-path $VALID_DATA_PATH \
+ --data-impl mmap \
+ --num-workers 0 \
+ --valid-num-workers 0 \
+ $DEEPSPEED_ARGS \
+ "
+
+ echo $CMD
+
+ echo "START $SLURM_JOBID: $(date)"
+
+ # bash launch_srun.sh $CMD
+ srun --label launch.sh $CMD
+
+ echo "END $SLURM_JOBID: $(date)"
+
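
model_params.sh is sourced but not included in this commit. From the way MODEL_PARAM is unpacked above, each PARAM_* array is ordered (NHIDDEN, FFN_HIDDEN_SIZE, KV_SIZE, NHEADS, NLAYERS), and the name PARAM_9293M suggests a roughly 9.3B-parameter configuration. A hypothetical entry in that format, with illustrative values only:

# sketch of a model_params.sh entry; ordering inferred from the consumer above
PARAM_EXAMPLE=(
    4096   # NHIDDEN (d_model)
    16384  # FFN_HIDDEN_SIZE
    128    # KV_SIZE (kv-channels)
    32     # NHEADS
    32     # NLAYERS
)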
8b7178b35b/tensorboard_8b7178b35bval/events.out.tfevents.1682718827.nid007222.7917.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9405e43bca3f799eb5c822050f89e3f4c857e6ff5717361d4dc607f418fbb730
+ size 980
8b7178b35b/tensorboard_8b7178b35bval/events.out.tfevents.1682719334.nid006443.81826.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d3a69c0d31facfb47becc3177ebb1e5ab6b8857126e6ad20f11363c33cbadb82
+ size 40
8b7178b35b/tensorboard_8b7178b35bval/events.out.tfevents.1682755053.nid007165.106247.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:447fa98b32dec72d105837f4d9b1e96e21368a5dfd25e7c2abf72f94dfacade7
+ size 40
8b7178b35b/tensorboard_8b7178b35bval/events.out.tfevents.1682756717.nid007165.119497.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dce5393ca599e61e1c47f20287278aa9964163b9d8ee9cb9bf35010873db7880
+ size 40
8b7178b35b/tensorboard_8b7178b35bval/events.out.tfevents.1682757384.nid006808.89370.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:40e9af9251de04949780d63c5b2f74a4c0f8a96be6815d506f30d3e5708ad976
+ size 980
8b7178b44b/3430925.err ADDED
The diff for this file is too large to render. See raw diff
 
8b7178b44b/3430925.out ADDED
The diff for this file is too large to render. See raw diff
 
8b7178b44b/latest ADDED
@@ -0,0 +1 @@
+ global_step84877
8b7178b44b/sbatch_8b7178b44bval.sh ADDED
@@ -0,0 +1,172 @@
+ #!/bin/bash
+ #SBATCH --exclude=nid007542
+ #SBATCH --nodes=64
+ #SBATCH --ntasks-per-node=1
+ #SBATCH --cpus-per-task=40
+ #SBATCH --mem=256G
+ #SBATCH -p standard-g
+ #SBATCH -t 48:00:00
+ #SBATCH --gpus-per-node=mi250:8
+ #SBATCH --exclusive=user
+ #SBATCH --hint=nomultithread
+ #SBATCH --account=project_462000119
+ #SBATCH -o logs/%j.out
+ #SBATCH -e logs/%j.err
+
+ VARIANT=8b7178b44bval
+ VARIANT_CKPT=lm1-8b7-178b-c4-repetitions/8b7178b44b
+
+ # if run without sbatch, invoke here
+ if [ -z $SLURM_JOB_ID ]; then
+ mkdir -p logs
+ sbatch "$0"
+ exit
+ fi
+
+ set -euo pipefail
+
+ # symlink logs/latest.out and logs/latest.err
+ ln -f -s $SLURM_JOB_ID.out logs/latest.out
+ ln -f -s $SLURM_JOB_ID.err logs/latest.err
+
+ KILL_SWITCH_PATH=kill-switch-$VARIANT
+ CHECKPOINT_PATH=$VARIANT_CKPT
+ TENSORBOARD_PATH=tensorboard_$VARIANT
+
+ # Data
+ VOCAB_FILE="gpt2/vocab.json"
+ MERGE_FILE="gpt2/merges.txt"
+ #DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document"
+
+ TRAIN_DATA_PATH=train400m.txt
+ # "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_12B_text_document"
+ VALID_DATA_PATH=val.txt
+ # "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"
+
+ PP_SIZE=2
+ TP_SIZE=2
+
+ MICRO_BATCH_SIZE=1
+ GRADIENT_ACCUMULATION_STEPS=1
+ WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
+ GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))
+
+ # Model parameters
+ source model_params.sh
+ MODEL_PARAM=("${PARAM_9293M[@]}")
+ NHIDDEN=${MODEL_PARAM[0]}
+ FFN_HIDDEN_SIZE=${MODEL_PARAM[1]}
+ KV_SIZE=${MODEL_PARAM[2]}
+ NHEADS=${MODEL_PARAM[3]}
+ NLAYERS=${MODEL_PARAM[4]}
+ SEQ_LEN=2048
+
+ echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS"
+
+ SAVE_INTERVAL=5000
+
+ # Tokens: 11522010000
+ # -> Samples: 5625981
+ TRAIN_SAMPLES=1
+
+ OPTIMIZER_ARGS=" \
+ --optimizer adam \
+ --adam-beta1 0.9 \
+ --adam-beta2 0.999 \
+ --adam-eps 1e-8 \
+ --lr 2e-4 \
+ --min-lr 2e-5 \
+ --lr-decay-style cosine \
+ --lr-decay-samples $TRAIN_SAMPLES \
+ --lr-warmup-samples 0 \
+ --clip-grad 1.0 \
+ --weight-decay 1e-1 \
+ --override-lr-scheduler \
+ --reset-progress \
+ --no-load-optim \
+ "
+
+ GPT_ARGS=" \
+ --num-layers $NLAYERS \
+ --hidden-size $NHIDDEN \
+ --num-attention-heads $NHEADS \
+ --kv-channels $KV_SIZE \
+ --ffn-hidden-size $FFN_HIDDEN_SIZE \
+ --seq-length $SEQ_LEN \
+ --max-position-embeddings $SEQ_LEN \
+ --micro-batch-size $MICRO_BATCH_SIZE \
+ --global-batch-size $GLOBAL_BATCH_SIZE \
+ --train-samples $TRAIN_SAMPLES \
+ --vocab-file $VOCAB_FILE \
+ --merge-file $MERGE_FILE \
+ --clip-grad 1.0 \
+ --kill-switch-path $KILL_SWITCH_PATH \
+ --bf16 \
+ $OPTIMIZER_ARGS \
+ "
+
+ OUTPUT_ARGS=" \
+ --log-interval 10 \
+ --save-interval $SAVE_INTERVAL \
+ --eval-interval 1 \
+ --eval-iters 100 \
+ --eval-only true \
+ --tensorboard-dir $TENSORBOARD_PATH \
+ --tensorboard-queue-size 5 \
+ --log-timers-to-tensorboard \
+ --log-batch-size-to-tensorboard \
+ --log-validation-ppl-to-tensorboard \
+ "
+
+ ZERO_STAGE=0
+
+ mkdir -p ds_configs
+ DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"
+
+ cat <<EOF > $DS_CONFIG_PATH
+ {
+ "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
+ "train_batch_size": $GLOBAL_BATCH_SIZE,
+ "gradient_clipping": 1.0,
+ "zero_optimization": {
+ "stage": $ZERO_STAGE
+ },
+ "bf16": {
+ "enabled": true
+ },
+ "steps_per_print": 2000,
+ "wall_clock_breakdown": false
+ }
+ EOF
+
+ DEEPSPEED_ARGS=" \
+ --deepspeed \
+ --deepspeed_config $DS_CONFIG_PATH \
+ --zero-stage $ZERO_STAGE \
+ "
+
+ CMD=" \
+ Megatron-DeepSpeed/pretrain_gpt.py \
+ --tensor-model-parallel-size $TP_SIZE \
+ --pipeline-model-parallel-size $PP_SIZE \
+ $GPT_ARGS \
+ $OUTPUT_ARGS \
+ --save $CHECKPOINT_PATH \
+ --load $CHECKPOINT_PATH \
+ --train-weighted-split-paths-path $TRAIN_DATA_PATH \
+ --valid-weighted-split-paths-path $VALID_DATA_PATH \
+ --data-impl mmap \
+ --num-workers 0 \
+ --valid-num-workers 0 \
+ $DEEPSPEED_ARGS \
+ "
+
+ echo $CMD
+
+ echo "START $SLURM_JOBID: $(date)"
+
+ # bash launch_srun.sh $CMD
+ srun --label launch.sh $CMD
+
+ echo "END $SLURM_JOBID: $(date)"
+
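
Since MICRO_BATCH_SIZE=1, GLOBAL_BATCH_SIZE=512, and ZERO_STAGE=0 in every run here, the heredoc above renders each ds_configs/<jobid>.json to the same content:

{
  "train_micro_batch_size_per_gpu": 1,
  "train_batch_size": 512,
  "gradient_clipping": 1.0,
  "zero_optimization": { "stage": 0 },
  "bf16": { "enabled": true },
  "steps_per_print": 2000,
  "wall_clock_breakdown": false
}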
8b7178b44b/tensorboard_8b7178b44bval/events.out.tfevents.1682718827.nid006888.86342.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:584c654b19370eb1b57ff569c18841a726d81a0a0b6bb40d9fce3e72021c47c9
+ size 980
8b7178b44b/tensorboard_8b7178b44bval/events.out.tfevents.1682755053.nid007026.114219.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:322579ebf8fe2a2e8e009e670a44269dd2f374d4492bc464b4d253bd8d715a5d
+ size 980
8b7178b55b/events.out.tfevents.1682718827.nid005807.38076.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:98db65fe1f99dcfba821394bde3a08ce8ab449c8a3dc4c61724a810cbbb77458
+ size 40
8b7178b55b/events.out.tfevents.1682755053.nid006888.50037.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d99f5c10f125933bb9f0002e961ea4aacef8e519fa1b8637022038dedb7062e1
+ size 980
8b7178b58b/3430964.err ADDED
The diff for this file is too large to render. See raw diff
 
8b7178b58b/3430964.out ADDED
The diff for this file is too large to render. See raw diff
 
8b7178b58b/latest ADDED
@@ -0,0 +1 @@
+ global_step84877
8b7178b58b/sbatch_8b7178b58bval.sh ADDED
@@ -0,0 +1,172 @@
+ #!/bin/bash
+ #SBATCH --exclude=nid007542
+ #SBATCH --nodes=64
+ #SBATCH --ntasks-per-node=1
+ #SBATCH --cpus-per-task=40
+ #SBATCH --mem=256G
+ #SBATCH -p standard-g
+ #SBATCH -t 48:00:00
+ #SBATCH --gpus-per-node=mi250:8
+ #SBATCH --exclusive=user
+ #SBATCH --hint=nomultithread
+ #SBATCH --account=project_462000119
+ #SBATCH -o logs/%j.out
+ #SBATCH -e logs/%j.err
+
+ VARIANT=8b7178b58bval
+ VARIANT_CKPT=lm1-8b7-178b-c4-repetitions/8b7178b58b
+
+ # if run without sbatch, invoke here
+ if [ -z $SLURM_JOB_ID ]; then
+ mkdir -p logs
+ sbatch "$0"
+ exit
+ fi
+
+ set -euo pipefail
+
+ # symlink logs/latest.out and logs/latest.err
+ ln -f -s $SLURM_JOB_ID.out logs/latest.out
+ ln -f -s $SLURM_JOB_ID.err logs/latest.err
+
+ KILL_SWITCH_PATH=kill-switch-$VARIANT
+ CHECKPOINT_PATH=$VARIANT_CKPT
+ TENSORBOARD_PATH=tensorboard_$VARIANT
+
+ # Data
+ VOCAB_FILE="gpt2/vocab.json"
+ MERGE_FILE="gpt2/merges.txt"
+ #DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document"
+
+ TRAIN_DATA_PATH=train400m.txt
+ # "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_12B_text_document"
+ VALID_DATA_PATH=val.txt
+ # "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"
+
+ PP_SIZE=2
+ TP_SIZE=2
+
+ MICRO_BATCH_SIZE=1
+ GRADIENT_ACCUMULATION_STEPS=1
+ WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
+ GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))
+
+ # Model parameters
+ source model_params.sh
+ MODEL_PARAM=("${PARAM_9293M[@]}")
+ NHIDDEN=${MODEL_PARAM[0]}
+ FFN_HIDDEN_SIZE=${MODEL_PARAM[1]}
+ KV_SIZE=${MODEL_PARAM[2]}
+ NHEADS=${MODEL_PARAM[3]}
+ NLAYERS=${MODEL_PARAM[4]}
+ SEQ_LEN=2048
+
+ echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS"
+
+ SAVE_INTERVAL=5000
+
+ # Tokens: 11522010000
+ # -> Samples: 5625981
+ TRAIN_SAMPLES=1
+
+ OPTIMIZER_ARGS=" \
+ --optimizer adam \
+ --adam-beta1 0.9 \
+ --adam-beta2 0.999 \
+ --adam-eps 1e-8 \
+ --lr 2e-4 \
+ --min-lr 2e-5 \
+ --lr-decay-style cosine \
+ --lr-decay-samples $TRAIN_SAMPLES \
+ --lr-warmup-samples 0 \
+ --clip-grad 1.0 \
+ --weight-decay 1e-1 \
+ --override-lr-scheduler \
+ --reset-progress \
+ --no-load-optim \
+ "
+
+ GPT_ARGS=" \
+ --num-layers $NLAYERS \
+ --hidden-size $NHIDDEN \
+ --num-attention-heads $NHEADS \
+ --kv-channels $KV_SIZE \
+ --ffn-hidden-size $FFN_HIDDEN_SIZE \
+ --seq-length $SEQ_LEN \
+ --max-position-embeddings $SEQ_LEN \
+ --micro-batch-size $MICRO_BATCH_SIZE \
+ --global-batch-size $GLOBAL_BATCH_SIZE \
+ --train-samples $TRAIN_SAMPLES \
+ --vocab-file $VOCAB_FILE \
+ --merge-file $MERGE_FILE \
+ --clip-grad 1.0 \
+ --kill-switch-path $KILL_SWITCH_PATH \
+ --bf16 \
+ $OPTIMIZER_ARGS \
+ "
+
+ OUTPUT_ARGS=" \
+ --log-interval 10 \
+ --save-interval $SAVE_INTERVAL \
+ --eval-interval 1 \
+ --eval-iters 100 \
+ --eval-only true \
+ --tensorboard-dir $TENSORBOARD_PATH \
+ --tensorboard-queue-size 5 \
+ --log-timers-to-tensorboard \
+ --log-batch-size-to-tensorboard \
+ --log-validation-ppl-to-tensorboard \
+ "
+
+ ZERO_STAGE=0
+
+ mkdir -p ds_configs
+ DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"
+
+ cat <<EOF > $DS_CONFIG_PATH
+ {
+ "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
+ "train_batch_size": $GLOBAL_BATCH_SIZE,
+ "gradient_clipping": 1.0,
+ "zero_optimization": {
+ "stage": $ZERO_STAGE
+ },
+ "bf16": {
+ "enabled": true
+ },
+ "steps_per_print": 2000,
+ "wall_clock_breakdown": false
+ }
+ EOF
+
+ DEEPSPEED_ARGS=" \
+ --deepspeed \
+ --deepspeed_config $DS_CONFIG_PATH \
+ --zero-stage $ZERO_STAGE \
+ "
+
+ CMD=" \
+ Megatron-DeepSpeed/pretrain_gpt.py \
+ --tensor-model-parallel-size $TP_SIZE \
+ --pipeline-model-parallel-size $PP_SIZE \
+ $GPT_ARGS \
+ $OUTPUT_ARGS \
+ --save $CHECKPOINT_PATH \
+ --load $CHECKPOINT_PATH \
+ --train-weighted-split-paths-path $TRAIN_DATA_PATH \
+ --valid-weighted-split-paths-path $VALID_DATA_PATH \
+ --data-impl mmap \
+ --num-workers 0 \
+ --valid-num-workers 0 \
+ $DEEPSPEED_ARGS \
+ "
+
+ echo $CMD
+
+ echo "START $SLURM_JOBID: $(date)"
+
+ # bash launch_srun.sh $CMD
+ srun --label launch.sh $CMD
+
+ echo "END $SLURM_JOBID: $(date)"
+
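
Although the entry point is pretrain_gpt.py, none of these jobs trains: TRAIN_SAMPLES=1 with --eval-only true, --eval-interval 1, and --eval-iters 100 loads the final checkpoint (--no-load-optim, --reset-progress) and runs a pure validation pass over the C4 validation split, writing perplexity to the tensorboard_* directories committed here. At this geometry one pass covers a fixed token count:

echo $((100 * 512 * 2048))   # eval-iters * global-batch * seq-len = 104857600 tokens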
8b7178b58b/tensorboard_8b7178b58bval/events.out.tfevents.1682756534.nid006808.73988.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:383c7fafbe73647f2490692eede41a4b8bac5275ee7ec76b573d332caba16c7d
+ size 40
8b7178b58b/tensorboard_8b7178b58bval/events.out.tfevents.1682756738.nid006808.81065.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:36d06b6980ef2cd1ac862051d93739ba49b666406b4f124cd7b864ee064b081f
+ size 980
8b7178b88b/3430928.err ADDED
The diff for this file is too large to render. See raw diff
 
8b7178b88b/3430928.out ADDED
The diff for this file is too large to render. See raw diff
 
8b7178b88b/latest ADDED
@@ -0,0 +1 @@
+ global_step84877
8b7178b88b/sbatch_8b7178b88bval.sh ADDED
@@ -0,0 +1,172 @@
+ #!/bin/bash
+ #SBATCH --exclude=nid007542
+ #SBATCH --nodes=64
+ #SBATCH --ntasks-per-node=1
+ #SBATCH --cpus-per-task=40
+ #SBATCH --mem=256G
+ #SBATCH -p standard-g
+ #SBATCH -t 48:00:00
+ #SBATCH --gpus-per-node=mi250:8
+ #SBATCH --exclusive=user
+ #SBATCH --hint=nomultithread
+ #SBATCH --account=project_462000119
+ #SBATCH -o logs/%j.out
+ #SBATCH -e logs/%j.err
+
+ VARIANT=8b7178b88bval
+ VARIANT_CKPT=lm1-8b7-178b-c4-repetitions/8b7178b88b
+
+ # if run without sbatch, invoke here
+ if [ -z $SLURM_JOB_ID ]; then
+ mkdir -p logs
+ sbatch "$0"
+ exit
+ fi
+
+ set -euo pipefail
+
+ # symlink logs/latest.out and logs/latest.err
+ ln -f -s $SLURM_JOB_ID.out logs/latest.out
+ ln -f -s $SLURM_JOB_ID.err logs/latest.err
+
+ KILL_SWITCH_PATH=kill-switch-$VARIANT
+ CHECKPOINT_PATH=$VARIANT_CKPT
+ TENSORBOARD_PATH=tensorboard_$VARIANT
+
+ # Data
+ VOCAB_FILE="gpt2/vocab.json"
+ MERGE_FILE="gpt2/merges.txt"
+ #DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document"
+
+ TRAIN_DATA_PATH=train400m.txt
+ # "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_12B_text_document"
+ VALID_DATA_PATH=val.txt
+ # "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"
+
+ PP_SIZE=2
+ TP_SIZE=2
+
+ MICRO_BATCH_SIZE=1
+ GRADIENT_ACCUMULATION_STEPS=1
+ WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
+ GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))
+
+ # Model parameters
+ source model_params.sh
+ MODEL_PARAM=("${PARAM_9293M[@]}")
+ NHIDDEN=${MODEL_PARAM[0]}
+ FFN_HIDDEN_SIZE=${MODEL_PARAM[1]}
+ KV_SIZE=${MODEL_PARAM[2]}
+ NHEADS=${MODEL_PARAM[3]}
+ NLAYERS=${MODEL_PARAM[4]}
+ SEQ_LEN=2048
+
+ echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS"
+
+ SAVE_INTERVAL=5000
+
+ # Tokens: 11522010000
+ # -> Samples: 5625981
+ TRAIN_SAMPLES=1
+
+ OPTIMIZER_ARGS=" \
+ --optimizer adam \
+ --adam-beta1 0.9 \
+ --adam-beta2 0.999 \
+ --adam-eps 1e-8 \
+ --lr 2e-4 \
+ --min-lr 2e-5 \
+ --lr-decay-style cosine \
+ --lr-decay-samples $TRAIN_SAMPLES \
+ --lr-warmup-samples 0 \
+ --clip-grad 1.0 \
+ --weight-decay 1e-1 \
+ --override-lr-scheduler \
+ --reset-progress \
+ --no-load-optim \
+ "
+
+ GPT_ARGS=" \
+ --num-layers $NLAYERS \
+ --hidden-size $NHIDDEN \
+ --num-attention-heads $NHEADS \
+ --kv-channels $KV_SIZE \
+ --ffn-hidden-size $FFN_HIDDEN_SIZE \
+ --seq-length $SEQ_LEN \
+ --max-position-embeddings $SEQ_LEN \
+ --micro-batch-size $MICRO_BATCH_SIZE \
+ --global-batch-size $GLOBAL_BATCH_SIZE \
+ --train-samples $TRAIN_SAMPLES \
+ --vocab-file $VOCAB_FILE \
+ --merge-file $MERGE_FILE \
+ --clip-grad 1.0 \
+ --kill-switch-path $KILL_SWITCH_PATH \
+ --bf16 \
+ $OPTIMIZER_ARGS \
+ "
+
+ OUTPUT_ARGS=" \
+ --log-interval 10 \
+ --save-interval $SAVE_INTERVAL \
+ --eval-interval 1 \
+ --eval-iters 100 \
+ --eval-only true \
+ --tensorboard-dir $TENSORBOARD_PATH \
+ --tensorboard-queue-size 5 \
+ --log-timers-to-tensorboard \
+ --log-batch-size-to-tensorboard \
+ --log-validation-ppl-to-tensorboard \
+ "
+
+ ZERO_STAGE=0
+
+ mkdir -p ds_configs
+ DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"
+
+ cat <<EOF > $DS_CONFIG_PATH
+ {
+ "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
+ "train_batch_size": $GLOBAL_BATCH_SIZE,
+ "gradient_clipping": 1.0,
+ "zero_optimization": {
+ "stage": $ZERO_STAGE
+ },
+ "bf16": {
+ "enabled": true
+ },
+ "steps_per_print": 2000,
+ "wall_clock_breakdown": false
+ }
+ EOF
+
+ DEEPSPEED_ARGS=" \
+ --deepspeed \
+ --deepspeed_config $DS_CONFIG_PATH \
+ --zero-stage $ZERO_STAGE \
+ "
+
+ CMD=" \
+ Megatron-DeepSpeed/pretrain_gpt.py \
+ --tensor-model-parallel-size $TP_SIZE \
+ --pipeline-model-parallel-size $PP_SIZE \
+ $GPT_ARGS \
+ $OUTPUT_ARGS \
+ --save $CHECKPOINT_PATH \
+ --load $CHECKPOINT_PATH \
+ --train-weighted-split-paths-path $TRAIN_DATA_PATH \
+ --valid-weighted-split-paths-path $VALID_DATA_PATH \
+ --data-impl mmap \
+ --num-workers 0 \
+ --valid-num-workers 0 \
+ $DEEPSPEED_ARGS \
+ "
+
+ echo $CMD
+
+ echo "START $SLURM_JOBID: $(date)"
+
+ # bash launch_srun.sh $CMD
+ srun --label launch.sh $CMD
+
+ echo "END $SLURM_JOBID: $(date)"
+
8b7178b88b/tensorboard_8b7178b88bval/events.out.tfevents.1682718827.nid007026.128605.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3bae6c921c72ff96644d681410e22722438c92393d6634f6edf75e9d973cba9d
+ size 980
8b7178b88b/tensorboard_8b7178b88bval/events.out.tfevents.1682755053.nid007222.92615.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfc2a651d9a738cb37fc7638d352811fe232f4ddb764112863c0fb0278c157e9
+ size 980