Muennighoff committed on
Commit
95c3d56
1 Parent(s): f10dd9f
3490971.err ADDED
The diff for this file is too large to render. See raw diff
 
3490971.out ADDED
The diff for this file is too large to render. See raw diff
 
latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step52452
sbatch_2b855b55bdedupval.sh ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ #SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901
3
+ #SBATCH --nodes=32
4
+ #SBATCH --ntasks-per-node=1
5
+ #SBATCH --cpus-per-task=32
6
+ #SBATCH --mem=256G
7
+ #SBATCH -p standard-g
8
+ #SBATCH -t 48:00:00
9
+ #SBATCH --gpus-per-node=mi250:8
10
+ #SBATCH --exclusive=user
11
+ #SBATCH --hint=nomultithread
12
+ #SBATCH --account=project_462000119
13
+ #SBATCH -o logs/%j.out
14
+ #SBATCH -e logs/%j.err
15
+
16
+ VARIANT=2b855b55bdedupval  # run name; suffix of the kill-switch and tensorboard paths below
17
+ VARIANT_CKPT=lm1-2b8-55b-dedup  # checkpoint directory name (becomes CHECKPOINT_PATH)
18
+
19
+ # If not running under Slurm (SLURM_JOB_ID empty or unset), create logs/, submit this script with sbatch and exit.
20
+ if [ -z "${SLURM_JOB_ID:-}" ]; then
21
+ mkdir -p logs
22
+ sbatch "$0"
23
+ exit
24
+ fi
25
+
26
+ set -euo pipefail
27
+
28
+ # Point logs/latest.out and logs/latest.err at this job's log files (relative links inside logs/).
29
+ ln -f -s "${SLURM_JOB_ID}.out" logs/latest.out
30
+ ln -f -s "${SLURM_JOB_ID}.err" logs/latest.err
31
+
32
+ KILL_SWITCH_PATH=kill-switch-$VARIANT  # passed to --kill-switch-path
33
+ CHECKPOINT_PATH=$VARIANT_CKPT  # used for both --save and --load
34
+ TENSORBOARD_PATH=tensorboard_$VARIANT  # passed to --tensorboard-dir
35
+
36
+ # Data
37
+ VOCAB_FILE="gpt2/vocab.json"  # passed to --vocab-file
38
+ MERGE_FILE="gpt2/merges.txt"  # passed to --merge-file
39
+ #DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document"
40
+ TRAIN_DATA_PATH=train1b5.txt  # passed to --train-weighted-split-paths-path
41
+ # "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_12B_text_document"
42
+ VALID_DATA_PATH=val.txt  # passed to --valid-weighted-split-paths-path
43
+ # "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"
44
+
45
+ PP_SIZE=1  # value for --pipeline-model-parallel-size
46
+ TP_SIZE=1  # value for --tensor-model-parallel-size
47
+
48
+ MICRO_BATCH_SIZE=2
49
+ GRADIENT_ACCUMULATION_STEPS=1
50
+ WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))  # GPUs per node times number of nodes, both taken from the Slurm environment
51
+ GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))  # micro batch x world size x accumulation steps
52
+
53
+ # Model parameters: read the architecture sizes from the PARAM_2980M array
54
+ source model_params.sh  # presumably defines PARAM_2980M; not visible here, confirm
55
+ MODEL_PARAM=("${PARAM_2980M[@]}")
56
+ NHIDDEN=${MODEL_PARAM[0]}  # passed to --hidden-size
57
+ FFN_HIDDEN_SIZE=${MODEL_PARAM[1]}  # passed to --ffn-hidden-size
58
+ KV_SIZE=${MODEL_PARAM[2]}  # passed to --kv-channels
59
+ NHEADS=${MODEL_PARAM[3]}  # passed to --num-attention-heads
60
+ NLAYERS=${MODEL_PARAM[4]}  # passed to --num-layers
61
+ SEQ_LEN=2048  # used for both --seq-length and --max-position-embeddings
62
+
63
+ echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS"
64
+
65
+ SAVE_INTERVAL=1000  # passed to --save-interval
66
+
67
+ # Tokens: 35546190000
68
+ # -> Samples: 17356538 (tokens / 2048, rounded down)
69
+ TRAIN_SAMPLES=1  # NOTE(review): differs from the sample count above; presumably because this run passes --eval-only, confirm
70
+
71
+ OPTIMIZER_ARGS=" \
72
+ --optimizer adam \
73
+ --adam-beta1 0.9 \
74
+ --adam-beta2 0.999 \
75
+ --adam-eps 1e-8 \
76
+ --lr 2e-4 \
77
+ --min-lr 2e-5 \
78
+ --lr-decay-style cosine \
79
+ --lr-decay-samples $TRAIN_SAMPLES \
80
+ --lr-warmup-samples 0 \
81
+ --clip-grad 1.0 \
82
+ --weight-decay 1e-1 \
83
+ --override-lr-scheduler \
84
+ --reset-progress \
85
+ --no-load-optim \
86
+ "  # optimizer and LR-schedule flags; expanded inside GPT_ARGS
87
+
88
+ GPT_ARGS=" \
89
+ --num-layers $NLAYERS \
90
+ --hidden-size $NHIDDEN \
91
+ --num-attention-heads $NHEADS \
92
+ --kv-channels $KV_SIZE \
93
+ --ffn-hidden-size $FFN_HIDDEN_SIZE \
94
+ --seq-length $SEQ_LEN \
95
+ --max-position-embeddings $SEQ_LEN \
96
+ --micro-batch-size $MICRO_BATCH_SIZE \
97
+ --global-batch-size $GLOBAL_BATCH_SIZE \
98
+ --train-samples $TRAIN_SAMPLES \
99
+ --vocab-file $VOCAB_FILE \
100
+ --merge-file $MERGE_FILE \
101
+ --clip-grad 1.0 \
102
+ --kill-switch-path $KILL_SWITCH_PATH \
103
+ --bf16 \
104
+ $OPTIMIZER_ARGS \
105
+ "  # model/tokenizer/batch flags plus the optimizer flags; NOTE(review): --clip-grad 1.0 appears here and again inside OPTIMIZER_ARGS
106
+
107
+ OUTPUT_ARGS=" \
108
+ --log-interval 10 \
109
+ --save-interval $SAVE_INTERVAL \
110
+ --eval-interval 1 \
111
+ --eval-iters 100 \
112
+ --eval-only true \
113
+ --tensorboard-dir $TENSORBOARD_PATH \
114
+ --tensorboard-queue-size 5 \
115
+ --log-timers-to-tensorboard \
116
+ --log-batch-size-to-tensorboard \
117
+ --log-validation-ppl-to-tensorboard \
118
+ "  # logging, evaluation (--eval-only true, 100 eval iterations) and tensorboard flags
119
+
120
+ ZERO_STAGE=0  # written into the DeepSpeed config and also passed as --zero-stage
121
+
122
+ mkdir -p ds_configs
123
+ DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"  # one generated config file per Slurm job
124
+
125
+ cat <<EOF > "$DS_CONFIG_PATH"
126
+ {
127
+ "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
128
+ "train_batch_size": $GLOBAL_BATCH_SIZE,
129
+ "gradient_clipping": 1.0,
130
+ "zero_optimization": {
131
+ "stage": $ZERO_STAGE
132
+ },
133
+ "bf16": {
134
+ "enabled": true
135
+ },
136
+ "steps_per_print": 2000,
137
+ "wall_clock_breakdown": false
138
+ }
139
+ EOF
140
+
141
+ DEEPSPEED_ARGS=" \
142
+ --deepspeed \
143
+ --deepspeed_config $DS_CONFIG_PATH \
144
+ --zero-stage $ZERO_STAGE \
145
+ "  # enables DeepSpeed and points it at the config file stored in DS_CONFIG_PATH
146
+
147
+ CMD=" \
148
+ Megatron-DeepSpeed/pretrain_gpt.py \
149
+ --tensor-model-parallel-size $TP_SIZE \
150
+ --pipeline-model-parallel-size $PP_SIZE \
151
+ $GPT_ARGS \
152
+ $OUTPUT_ARGS \
153
+ --save $CHECKPOINT_PATH \
154
+ --load $CHECKPOINT_PATH \
155
+ --train-weighted-split-paths-path $TRAIN_DATA_PATH \
156
+ --valid-weighted-split-paths-path $VALID_DATA_PATH \
157
+ --data-impl mmap \
158
+ $DEEPSPEED_ARGS \
159
+ "  # full pretrain_gpt.py command line as one flat string; saves to and loads from the same checkpoint path
160
+
161
+ echo $CMD  # unquoted: word splitting makes echo print the command with single spaces
162
+
163
+ echo "START $SLURM_JOB_ID: $(date)"
164
+
165
+ # Alternative launcher, disabled: bash launch_srun_32.sh $CMD
166
+ srun --label launch.sh $CMD  # CMD must stay unquoted: it is a flat string that relies on word splitting to become separate arguments
167
+
168
+ echo "END $SLURM_JOB_ID: $(date)"
tensorboard_2b855b55bdedupval/events.out.tfevents.1683703572.nid005175.86332.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c8048d61bec6349ee28630eb1e6da60c655f068a9c4f71e24dde126f2703fa6
3
+ size 980
tensorboard_2b855b55bdedupval/events.out.tfevents.1683704235.nid007191.7351.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce73ca343c2c90f2ccb3d260766c827fc55c38bf058b7e020d928086c467002f
3
+ size 980
tensorboard_2b855b55bdedupval/events.out.tfevents.1683710537.nid005934.52157.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b23669c951074958bd9fcf5143c4914bef64293b882886c3e3cc8fd3ee8792c8
3
+ size 980