Muennighoff
commited on
Commit
·
f9fc05c
1
Parent(s):
0420974
Add
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- 146m14b100mdedup/3326961.err +0 -0
- 146m14b100mdedup/3326961.out +0 -0
- 146m14b100mdedup/3328731.err +0 -0
- 146m14b100mdedup/3328731.out +446 -0
- 146m14b400m/3318392.err +0 -0
- 146m14b400m/3318392.out +0 -0
- 146m174b100m/3319491.err +0 -0
- 146m174b100m/3319491.out +367 -0
- 146m174b100m/3418230.err +0 -0
- 146m174b100m/3418230.out +0 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +3 -0
- 146m174b100m/global_step331103/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3 -0
146m14b100mdedup/3326961.err
ADDED
The diff for this file is too large to render.
See raw diff
|
|
146m14b100mdedup/3326961.out
ADDED
The diff for this file is too large to render.
See raw diff
|
|
146m14b100mdedup/3328731.err
ADDED
The diff for this file is too large to render.
See raw diff
|
|
146m14b100mdedup/3328731.out
ADDED
@@ -0,0 +1,446 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model parameters: d_model 768 ffw_size 3072 kv_size 64 n_heads 12 n_layers 15
|
2 |
+
Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 15 --hidden-size 768 --num-attention-heads 12 --kv-channels 64 --ffn-hidden-size 3072 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 4 --global-batch-size 256 --train-samples 1 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-146m14b100mdedupval --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 1 --lr-warmup-samples 0 --clip-grad 1.0 --weight-decay 1e-1 --no-load-optim --reset-progress --override-lr-scheduler --log-interval 10 --save-interval 1000 --eval-interval 1 --eval-iters 100 --eval-only true --tensorboard-dir tensorboard_146m14b100mdedupval --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_146m14b100mdedup --load checkpoints_146m14b100mdedup --train-weighted-split-paths-path train14b.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3328731.json --zero-stage 0
|
3 |
+
START 3328731: Fri 17 Mar 2023 10:24:10 AM EET
|
4 |
+
0:
|
5 |
+
0:
|
6 |
+
0: ======================= ROCm System Management Interface =======================
|
7 |
+
0: ================================= Concise Info =================================
|
8 |
+
0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
|
9 |
+
0: 0 43.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
10 |
+
0: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
11 |
+
0: 2 47.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
12 |
+
0: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
13 |
+
0: 4 42.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
14 |
+
0: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
15 |
+
0: 6 42.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
16 |
+
0: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
17 |
+
0: ================================================================================
|
18 |
+
0: ============================= End of ROCm SMI Log ==============================
|
19 |
+
6:
|
20 |
+
6:
|
21 |
+
6: ======================= ROCm System Management Interface =======================
|
22 |
+
6: ================================= Concise Info =================================
|
23 |
+
6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
|
24 |
+
6: 0 41.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
25 |
+
6: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
26 |
+
6: 2 41.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
27 |
+
6: 3 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
28 |
+
6: 4 45.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
29 |
+
6: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
30 |
+
6: 6 43.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
31 |
+
6: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
32 |
+
6: ================================================================================
|
33 |
+
6: ============================= End of ROCm SMI Log ==============================
|
34 |
+
2:
|
35 |
+
2:
|
36 |
+
2: ======================= ROCm System Management Interface =======================
|
37 |
+
2: ================================= Concise Info =================================
|
38 |
+
2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
|
39 |
+
2: 0 43.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
40 |
+
2: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
41 |
+
2: 2 39.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
42 |
+
2: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
43 |
+
2: 4 37.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
44 |
+
2: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
45 |
+
2: 6 37.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
46 |
+
2: 7 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
47 |
+
2: ================================================================================
|
48 |
+
2: ============================= End of ROCm SMI Log ==============================
|
49 |
+
3:
|
50 |
+
3:
|
51 |
+
3: ======================= ROCm System Management Interface =======================
|
52 |
+
3: ================================= Concise Info =================================
|
53 |
+
3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
|
54 |
+
3: 0 44.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
55 |
+
3: 1 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
56 |
+
3: 2 41.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
57 |
+
3: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
58 |
+
3: 4 43.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
59 |
+
3: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
60 |
+
3: 6 41.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
61 |
+
3: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
62 |
+
3: ================================================================================
|
63 |
+
3: ============================= End of ROCm SMI Log ==============================
|
64 |
+
5:
|
65 |
+
5:
|
66 |
+
5: ======================= ROCm System Management Interface =======================
|
67 |
+
5: ================================= Concise Info =================================
|
68 |
+
5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
|
69 |
+
5: 0 47.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
70 |
+
5: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
71 |
+
5: 2 40.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
72 |
+
5: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
73 |
+
5: 4 38.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
74 |
+
5: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
75 |
+
5: 6 45.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
76 |
+
5: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
77 |
+
5: ================================================================================
|
78 |
+
5: ============================= End of ROCm SMI Log ==============================
|
79 |
+
7:
|
80 |
+
7:
|
81 |
+
7: ======================= ROCm System Management Interface =======================
|
82 |
+
7: ================================= Concise Info =================================
|
83 |
+
7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
|
84 |
+
7: 0 47.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
85 |
+
7: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
86 |
+
7: 2 45.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
87 |
+
7: 3 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
88 |
+
7: 4 44.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
89 |
+
7: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
90 |
+
7: 6 43.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
91 |
+
7: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
92 |
+
7: ================================================================================
|
93 |
+
7: ============================= End of ROCm SMI Log ==============================
|
94 |
+
4:
|
95 |
+
4:
|
96 |
+
4: ======================= ROCm System Management Interface =======================
|
97 |
+
4: ================================= Concise Info =================================
|
98 |
+
4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
|
99 |
+
4: 0 40.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
100 |
+
4: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
101 |
+
4: 2 40.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
102 |
+
4: 3 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
103 |
+
4: 4 42.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
104 |
+
4: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
105 |
+
4: 6 42.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
106 |
+
4: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
107 |
+
4: ================================================================================
|
108 |
+
4: ============================= End of ROCm SMI Log ==============================
|
109 |
+
1:
|
110 |
+
1:
|
111 |
+
1: ======================= ROCm System Management Interface =======================
|
112 |
+
1: ================================= Concise Info =================================
|
113 |
+
1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
|
114 |
+
1: 0 49.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
115 |
+
1: 1 54.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
116 |
+
1: 2 42.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
117 |
+
1: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
118 |
+
1: 4 41.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
119 |
+
1: 5 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
120 |
+
1: 6 46.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
121 |
+
1: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
122 |
+
1: ================================================================================
|
123 |
+
1: ============================= End of ROCm SMI Log ==============================
|
124 |
+
2: Launching on nid005360 (2/8), master nid005358 port 9999, GPUs 8, CUDA: True
|
125 |
+
6: Launching on nid005364 (6/8), master nid005358 port 9999, GPUs 8, CUDA: True
|
126 |
+
1: Launching on nid005359 (1/8), master nid005358 port 9999, GPUs 8, CUDA: True
|
127 |
+
7: Launching on nid005365 (7/8), master nid005358 port 9999, GPUs 8, CUDA: True
|
128 |
+
0: Launching on nid005358 (0/8), master nid005358 port 9999, GPUs 8, CUDA: True
|
129 |
+
4: Launching on nid005362 (4/8), master nid005358 port 9999, GPUs 8, CUDA: True
|
130 |
+
5: Launching on nid005363 (5/8), master nid005358 port 9999, GPUs 8, CUDA: True
|
131 |
+
3: Launching on nid005361 (3/8), master nid005358 port 9999, GPUs 8, CUDA: True
|
132 |
+
0: using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1
|
133 |
+
0: accumulate and all-reduce gradients in fp32 for bfloat16 data type.
|
134 |
+
0: using torch.bfloat16 for parameters ...
|
135 |
+
0: ------------------------ arguments ------------------------
|
136 |
+
0: abort_on_unmet_fused_kernel_constraints ......... False
|
137 |
+
0: accumulate_allreduce_grads_in_fp32 .............. True
|
138 |
+
0: adam_beta1 ...................................... 0.9
|
139 |
+
0: adam_beta2 ...................................... 0.999
|
140 |
+
0: adam_eps ........................................ 1e-08
|
141 |
+
0: adlr_autoresume ................................. False
|
142 |
+
0: adlr_autoresume_interval ........................ 1000
|
143 |
+
0: apply_query_key_layer_scaling ................... True
|
144 |
+
0: apply_residual_connection_post_layernorm ........ False
|
145 |
+
0: attention_dropout ............................... 0.1
|
146 |
+
0: attention_softmax_in_fp32 ....................... False
|
147 |
+
0: bert_binary_head ................................ True
|
148 |
+
0: bert_load ....................................... None
|
149 |
+
0: bf16 ............................................ True
|
150 |
+
0: bias_dropout_fusion ............................. True
|
151 |
+
0: bias_gelu_fusion ................................ True
|
152 |
+
0: biencoder_projection_dim ........................ 0
|
153 |
+
0: biencoder_shared_query_context_model ............ False
|
154 |
+
0: block_data_path ................................. None
|
155 |
+
0: checkpoint_activations .......................... False
|
156 |
+
0: checkpoint_in_cpu ............................... False
|
157 |
+
0: checkpoint_num_layers ........................... 1
|
158 |
+
0: clip_grad ....................................... 1.0
|
159 |
+
0: codecarbon_dir .................................. None
|
160 |
+
0: consumed_train_samples .......................... 0
|
161 |
+
0: consumed_train_tokens ........................... 0
|
162 |
+
0: consumed_valid_samples .......................... 0
|
163 |
+
0: contigious_checkpointing ........................ False
|
164 |
+
0: cpu_optimizer ................................... False
|
165 |
+
0: cpu_torch_adam .................................. False
|
166 |
+
0: curriculum_learning ............................. False
|
167 |
+
0: data_impl ....................................... mmap
|
168 |
+
0: data_parallel_size .............................. 64
|
169 |
+
0: data_path ....................................... None
|
170 |
+
0: dataloader_type ................................. single
|
171 |
+
0: DDP_impl ........................................ local
|
172 |
+
0: decoder_seq_length .............................. None
|
173 |
+
0: deepscale ....................................... False
|
174 |
+
0: deepscale_config ................................ None
|
175 |
+
0: deepspeed ....................................... True
|
176 |
+
0: deepspeed_activation_checkpointing .............. False
|
177 |
+
0: deepspeed_config ................................ ds_configs/3328731.json
|
178 |
+
0: deepspeed_mpi ................................... False
|
179 |
+
0: distribute_checkpointed_activations ............. False
|
180 |
+
0: distributed_backend ............................. nccl
|
181 |
+
0: embed_layernorm ................................. False
|
182 |
+
0: embedding_path .................................. None
|
183 |
+
0: encoder_seq_length .............................. 2048
|
184 |
+
0: eod_mask_loss ................................... False
|
185 |
+
0: eval_interval ................................... 1
|
186 |
+
0: eval_iters ...................................... 100
|
187 |
+
0: eval_only ....................................... True
|
188 |
+
0: evidence_data_path .............................. None
|
189 |
+
0: exit_duration_in_mins ........................... None
|
190 |
+
0: exit_interval ................................... None
|
191 |
+
0: ffn_hidden_size ................................. 3072
|
192 |
+
0: finetune ........................................ False
|
193 |
+
0: fp16 ............................................ False
|
194 |
+
0: fp16_lm_cross_entropy ........................... False
|
195 |
+
0: fp32_residual_connection ........................ False
|
196 |
+
0: gigaflos_no_embeds .............................. 0
|
197 |
+
0: global_batch_size ............................... 256
|
198 |
+
0: glu_activation .................................. None
|
199 |
+
0: hidden_dropout .................................. 0.1
|
200 |
+
0: hidden_size ..................................... 768
|
201 |
+
0: hysteresis ...................................... 2
|
202 |
+
0: ict_head_size ................................... None
|
203 |
+
0: ict_load ........................................ None
|
204 |
+
0: img_dim ......................................... 224
|
205 |
+
0: indexer_batch_size .............................. 128
|
206 |
+
0: indexer_log_interval ............................ 1000
|
207 |
+
0: inference ....................................... False
|
208 |
+
0: init_method_std ................................. 0.02
|
209 |
+
0: init_method_xavier_uniform ...................... False
|
210 |
+
0: initial_loss_scale .............................. 4294967296
|
211 |
+
0: kill_switch_path ................................ kill-switch-146m14b100mdedupval
|
212 |
+
0: kv_channels ..................................... 64
|
213 |
+
0: layer_norm_fusion ............................... True
|
214 |
+
0: layernorm_epsilon ............................... 1e-05
|
215 |
+
0: lazy_mpu_init ................................... None
|
216 |
+
0: load ............................................ checkpoints_146m14b100mdedup
|
217 |
+
0: local_rank ...................................... None
|
218 |
+
0: log_batch_size_to_tensorboard ................... True
|
219 |
+
0: log_interval .................................... 10
|
220 |
+
0: log_learning_rate_to_tensorboard ................ True
|
221 |
+
0: log_level ....................................... None
|
222 |
+
0: log_level_replica ............................... None
|
223 |
+
0: log_loss_scale_to_tensorboard ................... True
|
224 |
+
0: log_num_zeros_in_grad ........................... False
|
225 |
+
0: log_params_norm ................................. False
|
226 |
+
0: log_path ........................................ None
|
227 |
+
0: log_timers_to_tensorboard ....................... True
|
228 |
+
0: log_validation_ppl_to_tensorboard ............... True
|
229 |
+
0: loss_on_targets_only ............................ False
|
230 |
+
0: loss_scale ...................................... None
|
231 |
+
0: loss_scale_window ............................... 1000
|
232 |
+
0: lr .............................................. 0.0002
|
233 |
+
0: lr_decay_iters .................................. None
|
234 |
+
0: lr_decay_samples ................................ 1
|
235 |
+
0: lr_decay_style .................................. cosine
|
236 |
+
0: lr_decay_tokens ................................. None
|
237 |
+
0: lr_warmup_fraction .............................. None
|
238 |
+
0: lr_warmup_iters ................................. 0
|
239 |
+
0: lr_warmup_samples ............................... 0
|
240 |
+
0: make_vocab_size_divisible_by .................... 128
|
241 |
+
0: mask_prob ....................................... 0.15
|
242 |
+
0: masked_softmax_fusion ........................... True
|
243 |
+
0: max_position_embeddings ......................... 2048
|
244 |
+
0: mean_noise_span_length .......................... None
|
245 |
+
0: memory_centric_tiled_linear ..................... False
|
246 |
+
0: merge_file ...................................... gpt2/merges.txt
|
247 |
+
0: micro_batch_size ................................ 4
|
248 |
+
0: min_loss_scale .................................. 1.0
|
249 |
+
0: min_lr .......................................... 2e-05
|
250 |
+
0: mmap_warmup ..................................... False
|
251 |
+
0: no_load_optim ................................... True
|
252 |
+
0: no_load_rng ..................................... None
|
253 |
+
0: no_save_optim ................................... None
|
254 |
+
0: no_save_rng ..................................... None
|
255 |
+
0: noise_density ................................... None
|
256 |
+
0: num_attention_heads ............................. 12
|
257 |
+
0: num_channels .................................... 3
|
258 |
+
0: num_classes ..................................... 1000
|
259 |
+
0: num_layers ...................................... 15
|
260 |
+
0: num_layers_per_virtual_pipeline_stage ........... None
|
261 |
+
0: num_workers ..................................... 2
|
262 |
+
0: onnx_safe ....................................... None
|
263 |
+
0: openai_gelu ..................................... False
|
264 |
+
0: optimizer ....................................... adam
|
265 |
+
0: optimizer_fusion ................................ True
|
266 |
+
0: override_lr_scheduler ........................... True
|
267 |
+
0: pad_vocab_size_to ............................... None
|
268 |
+
0: params_dtype .................................... torch.bfloat16
|
269 |
+
0: partition_activations ........................... False
|
270 |
+
0: patch_dim ....................................... 16
|
271 |
+
0: pipeline_model_parallel_size .................... 1
|
272 |
+
0: position_embedding_type ......................... PositionEmbeddingType.absolute
|
273 |
+
0: pp_partition_method ............................. None
|
274 |
+
0: profile_backward ................................ False
|
275 |
+
0: query_in_block_prob ............................. 0.1
|
276 |
+
0: rampup_batch_size ............................... None
|
277 |
+
0: rank ............................................ 0
|
278 |
+
0: remote_device ................................... none
|
279 |
+
0: reset_attention_mask ............................ False
|
280 |
+
0: reset_position_ids .............................. False
|
281 |
+
0: reset_progress .................................. True
|
282 |
+
0: retriever_report_topk_accuracies ................ []
|
283 |
+
0: retriever_score_scaling ......................... False
|
284 |
+
0: retriever_seq_length ............................ 256
|
285 |
+
0: reweight_loss_based_on_position_frequency ....... False
|
286 |
+
0: sample_rate ..................................... 1.0
|
287 |
+
0: save ............................................ checkpoints_146m14b100mdedup
|
288 |
+
0: save_interval ................................... 1000
|
289 |
+
0: scatter_gather_tensors_in_pipeline .............. True
|
290 |
+
0: scattered_embeddings ............................ False
|
291 |
+
0: seed ............................................ 1234
|
292 |
+
0: seq_length ...................................... 2048
|
293 |
+
0: sgd_momentum .................................... 0.9
|
294 |
+
0: short_seq_prob .................................. 0.1
|
295 |
+
0: skip_train_iteration_range ...................... None
|
296 |
+
0: split ........................................... None
|
297 |
+
0: split_transformers .............................. False
|
298 |
+
0: sync_tp_duplicated_parameters ................... False
|
299 |
+
0: synchronize_each_layer .......................... False
|
300 |
+
0: tensor_model_parallel_size ...................... 1
|
301 |
+
0: tensorboard_dir ................................. tensorboard_146m14b100mdedupval
|
302 |
+
0: tensorboard_log_interval ........................ 1
|
303 |
+
0: tensorboard_queue_size .......................... 5
|
304 |
+
0: test_weighted_split_paths ....................... None
|
305 |
+
0: test_weighted_split_paths_path .................. None
|
306 |
+
0: tile_factor ..................................... 1
|
307 |
+
0: titles_data_path ................................ None
|
308 |
+
0: tokenizer_name_or_path .......................... None
|
309 |
+
0: tokenizer_type .................................. GPT2BPETokenizer
|
310 |
+
0: train_iters ..................................... None
|
311 |
+
0: train_samples ................................... 1
|
312 |
+
0: train_tokens .................................... None
|
313 |
+
0: train_weighted_split_names ...................... ['train']
|
314 |
+
0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_14B_text_document']]
|
315 |
+
0: train_weighted_split_paths_path ................. None
|
316 |
+
0: train_weighted_split_splits ..................... [['0:1']]
|
317 |
+
0: train_weighted_split_weights .................... [['1.0']]
|
318 |
+
0: universal_checkpoint ............................ False
|
319 |
+
0: use_bnb_optimizer ............................... False
|
320 |
+
0: use_checkpoint_lr_scheduler ..................... False
|
321 |
+
0: use_contiguous_buffers_in_ddp ................... True
|
322 |
+
0: use_cpu_initialization .......................... None
|
323 |
+
0: use_one_sent_docs ............................... False
|
324 |
+
0: use_pin_memory .................................. False
|
325 |
+
0: valid_num_workers ............................... 2
|
326 |
+
0: valid_weighted_split_names ...................... ['validation']
|
327 |
+
0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']]
|
328 |
+
0: valid_weighted_split_paths_path ................. None
|
329 |
+
0: valid_weighted_split_splits ..................... [['0:1']]
|
330 |
+
0: valid_weighted_split_weights .................... [['1.0']]
|
331 |
+
0: virtual_pipeline_model_parallel_size ............ None
|
332 |
+
0: vocab_extra_ids ................................. 0
|
333 |
+
0: vocab_file ...................................... gpt2/vocab.json
|
334 |
+
0: weight_decay .................................... 0.1
|
335 |
+
0: world_size ...................................... 64
|
336 |
+
0: zero_allgather_bucket_size ...................... 0.0
|
337 |
+
0: zero_contigious_gradients ....................... False
|
338 |
+
0: zero_reduce_bucket_size ......................... 0.0
|
339 |
+
0: zero_reduce_scatter ............................. False
|
340 |
+
0: zero_stage ...................................... 0
|
341 |
+
0: -------------------- end of arguments ---------------------
|
342 |
+
0: setting number of micro-batches to constant 1
|
343 |
+
0: > building GPT2BPETokenizer tokenizer ...
|
344 |
+
0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304)
|
345 |
+
0: DeepSpeed general environment info:
|
346 |
+
0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch']
|
347 |
+
0: torch version .................... 1.13.0+rocm5.2
|
348 |
+
0: torch cuda version ............... None
|
349 |
+
0: torch hip version ................ 5.2.21151-afdc89f8
|
350 |
+
0: nvcc version ..................... None
|
351 |
+
0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed']
|
352 |
+
0: deepspeed info ................... 0.7.5, unknown, unknown
|
353 |
+
0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1
|
354 |
+
0: **** Git info for Megatron: git_hash=unknown git_branch=unknown ****
|
355 |
+
0: > initializing torch distributed ...
|
356 |
+
0: [2023-03-17 10:27:13,249] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
357 |
+
7: > setting tensorboard ...
|
358 |
+
0: > initializing tensor model parallel with size 1
|
359 |
+
0: > initializing pipeline model parallel with size 1
|
360 |
+
0: > setting random seeds to 1234 ...
|
361 |
+
0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234
|
362 |
+
0: > compiling dataset index builder ...
|
363 |
+
0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data'
|
364 |
+
0: make: Nothing to be done for 'default'.
|
365 |
+
0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data'
|
366 |
+
0: >>> done with dataset index builder. Compilation time: 0.111 seconds
|
367 |
+
0: > compiling and loading fused kernels ...
|
368 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.cpp [skipped, already hipified]
|
369 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified]
|
370 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes]
|
371 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes]
|
372 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.hip [skipped, already hipified]
|
373 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes]
|
374 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes]
|
375 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified]
|
376 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified]
|
377 |
+
0: Total number of unsupported CUDA function calls: 0
|
378 |
+
0:
|
379 |
+
0:
|
380 |
+
0: Total number of replaced kernel launches: 87
|
381 |
+
0: [1/1] c++ scaled_upper_triang_masked_softmax_hip.o scaled_upper_triang_masked_softmax_hip.cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o scaled_upper_triang_masked_softmax_cuda.so
|
382 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified]
|
383 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified]
|
384 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes]
|
385 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes]
|
386 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified]
|
387 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified]
|
388 |
+
0: Total number of unsupported CUDA function calls: 0
|
389 |
+
0:
|
390 |
+
0:
|
391 |
+
0: Total number of replaced kernel launches: 63
|
392 |
+
0: [1/1] c++ scaled_masked_softmax_hip.o scaled_masked_softmax_hip.cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o scaled_masked_softmax_cuda.so
|
393 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes]
|
394 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified]
|
395 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes]
|
396 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes]
|
397 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified]
|
398 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified]
|
399 |
+
0: Total number of unsupported CUDA function calls: 0
|
400 |
+
0:
|
401 |
+
0:
|
402 |
+
0: Total number of replaced kernel launches: 67
|
403 |
+
0: ninja: no work to do.
|
404 |
+
0: >>> done with compiling and loading fused kernels. Compilation time: 23.698 seconds
|
405 |
+
0: time to initialize megatron (seconds): -4.253
|
406 |
+
0: [after megatron is initialized] datetime: 2023-03-17 10:27:39
|
407 |
+
0: building GPT model ...
|
408 |
+
0: [2023-03-17 10:27:39,879] [INFO] [utils.py:827:see_memory_usage] Before Building Model
|
409 |
+
0: [2023-03-17 10:27:39,880] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB
|
410 |
+
0: [2023-03-17 10:27:39,880] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.59 GB, percent = 6.1%
|
411 |
+
0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None
|
412 |
+
0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pi
|
413 |
+
0: pe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31, ProcessCoord(pipe=0, data=32, model=0): 32, ProcessCoord(pipe=0, data=33, model=0): 33, ProcessCoord(pipe=0, data=34, model=0): 34, ProcessCoord(pipe=0, data=35, model=0): 35, ProcessCoord(pipe=0, data=36, model=0): 36, ProcessCoord(pipe=0, data=37, model=0): 37, ProcessCoord(pipe=0, data=38, model=0): 38, ProcessCoord(pipe=0, data=39, model=0): 39, ProcessCoord(pipe=0, data=40, model=0): 40, ProcessCoord(pipe=0, data=41, model=0): 41, ProcessCoord(pipe=0, data=42, model=0): 42, ProcessCoord(pipe=0, data=43, model=0): 43, ProcessCoord(pipe=0, data=44, model=0): 44, ProcessCoord(pipe=0, data=45, model=0): 45, ProcessCoord(pipe=0, data=4
|
414 |
+
0: 6, model=0): 46, ProcessCoord(pipe=0, data=47, model=0): 47, ProcessCoord(pipe=0, data=48, model=0): 48, ProcessCoord(pipe=0, data=49, model=0): 49, ProcessCoord(pipe=0, data=50, model=0): 50, ProcessCoord(pipe=0, data=51, model=0): 51, ProcessCoord(pipe=0, data=52, model=0): 52, ProcessCoord(pipe=0, data=53, model=0): 53, ProcessCoord(pipe=0, data=54, model=0): 54, ProcessCoord(pipe=0, data=55, model=0): 55, ProcessCoord(pipe=0, data=56, model=0): 56, ProcessCoord(pipe=0, data=57, model=0): 57, ProcessCoord(pipe=0, data=58, model=0): 58, ProcessCoord(pipe=0, data=59, model=0): 59, ProcessCoord(pipe=0, data=60, model=0): 60, ProcessCoord(pipe=0, data=61, model=0): 61, ProcessCoord(pipe=0, data=62, model=0): 62, ProcessCoord(pipe=0, data=63, model=0): 63}
|
415 |
+
0: [2023-03-17 10:27:41,886] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer
|
416 |
+
0: stage=0 layers=22
|
417 |
+
0: 0: _to_float16
|
418 |
+
0: 1: EmbeddingPipe
|
419 |
+
0: 2: <lambda>
|
420 |
+
0: 3: ParallelTransformerLayerPipe
|
421 |
+
0: 4: ParallelTransformerLayerPipe
|
422 |
+
0: 5: ParallelTransformerLayerPipe
|
423 |
+
0: 6: ParallelTransformerLayerPipe
|
424 |
+
0: 7: ParallelTransformerLayerPipe
|
425 |
+
0: 8: ParallelTransformerLayerPipe
|
426 |
+
0: 9: ParallelTransformerLayerPipe
|
427 |
+
0: 10: ParallelTransformerLayerPipe
|
428 |
+
0: 11: ParallelTransformerLayerPipe
|
429 |
+
0: 12: ParallelTransformerLayerPipe
|
430 |
+
0: 13: ParallelTransformerLayerPipe
|
431 |
+
0: 14: ParallelTransformerLayerPipe
|
432 |
+
0: 15: ParallelTransformerLayerPipe
|
433 |
+
0: 16: ParallelTransformerLayerPipe
|
434 |
+
0: 17: ParallelTransformerLayerPipe
|
435 |
+
0: 18: undo
|
436 |
+
0: 19: MixedFusedLayerNorm
|
437 |
+
0: 20: EmbeddingPipe
|
438 |
+
0: 21: float16_to_fp32
|
439 |
+
0: loss: CrossEntropy
|
440 |
+
0: [2023-03-17 10:27:42,188] [INFO] [utils.py:827:see_memory_usage] After Building Model
|
441 |
+
0: [2023-03-17 10:27:42,189] [INFO] [utils.py:828:see_memory_usage] MA 0.28 GB Max_MA 0.28 GB CA 0.29 GB Max_CA 0 GB
|
442 |
+
0: [2023-03-17 10:27:42,189] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.61 GB, percent = 6.1%
|
443 |
+
0: setting training iterations to 0
|
444 |
+
0: > learning rate decay style: cosine
|
445 |
+
0: DeepSpeed is enabled.
|
446 |
+
0: [2023-03-17 10:27:42,191] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown
|
146m14b400m/3318392.err
ADDED
The diff for this file is too large to render.
See raw diff
|
|
146m14b400m/3318392.out
ADDED
The diff for this file is too large to render.
See raw diff
|
|
146m174b100m/3319491.err
ADDED
The diff for this file is too large to render.
See raw diff
|
|
146m174b100m/3319491.out
ADDED
@@ -0,0 +1,367 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model parameters: d_model 768 ffw_size 3072 kv_size 64 n_heads 12 n_layers 15
|
2 |
+
Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 15 --hidden-size 768 --num-attention-heads 12 --kv-channels 64 --ffn-hidden-size 3072 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 4 --global-batch-size 256 --train-samples 84_762_549 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --loss-scale 12 --clip-grad 1.0 --kill-switch-path kill-switch-146m174b100m --bf16 --checkpoint-activations --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 84_762_549 --lr-warmup-samples 847_625 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 100 --save-interval 10000 --eval-interval 10000 --eval-iters 1 --tensorboard-dir tensorboard_146m174b100m --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_146m174b100m --load checkpoints_146m174b100m --train-weighted-split-paths-path train100m.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3319491.json --zero-stage 0
|
3 |
+
START 3319491: Fri 17 Mar 2023 01:50:53 PM EET
|
4 |
+
0:
|
5 |
+
0:
|
6 |
+
0: ======================= ROCm System Management Interface =======================
|
7 |
+
0: ================================= Concise Info =================================
|
8 |
+
0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
|
9 |
+
0: 0 46.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
10 |
+
0: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
11 |
+
0: 2 40.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
12 |
+
0: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
13 |
+
0: 4 45.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
14 |
+
0: 5 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
15 |
+
0: 6 38.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
16 |
+
0: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
17 |
+
0: ================================================================================
|
18 |
+
0: ============================= End of ROCm SMI Log ==============================
|
19 |
+
7:
|
20 |
+
7:
|
21 |
+
7: ======================= ROCm System Management Interface =======================
|
22 |
+
7: ================================= Concise Info =================================
|
23 |
+
7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
|
24 |
+
7: 0 45.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
25 |
+
7: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
26 |
+
7: 2 38.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
27 |
+
7: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
28 |
+
7: 4 43.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
29 |
+
7: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
30 |
+
7: 6 42.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
31 |
+
7: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
32 |
+
7: ================================================================================
|
33 |
+
7: ============================= End of ROCm SMI Log ==============================
|
34 |
+
1:
|
35 |
+
1:
|
36 |
+
1: ======================= ROCm System Management Interface =======================
|
37 |
+
1: ================================= Concise Info =================================
|
38 |
+
1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
|
39 |
+
1: 0 45.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
40 |
+
1: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
41 |
+
1: 2 42.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
42 |
+
1: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
43 |
+
1: 4 49.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
44 |
+
1: 5 52.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
45 |
+
1: 6 42.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
46 |
+
1: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
47 |
+
1: ================================================================================
|
48 |
+
1: ============================= End of ROCm SMI Log ==============================
|
49 |
+
4:
|
50 |
+
4:
|
51 |
+
4: ======================= ROCm System Management Interface =======================
|
52 |
+
4: ================================= Concise Info =================================
|
53 |
+
4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
|
54 |
+
4: 0 49.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
55 |
+
4: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
56 |
+
4: 2 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
57 |
+
4: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
58 |
+
4: 4 42.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
59 |
+
4: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
60 |
+
4: 6 43.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
61 |
+
4: 7 39.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
62 |
+
4: ================================================================================
|
63 |
+
4: ============================= End of ROCm SMI Log ==============================
|
64 |
+
5:
|
65 |
+
5:
|
66 |
+
5: ======================= ROCm System Management Interface =======================
|
67 |
+
5: ================================= Concise Info =================================
|
68 |
+
5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
|
69 |
+
5: 0 46.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
70 |
+
5: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
71 |
+
5: 2 42.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
72 |
+
5: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
73 |
+
5: 4 44.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
74 |
+
5: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
75 |
+
5: 6 35.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
76 |
+
5: 7 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
77 |
+
5: ================================================================================
|
78 |
+
5: ============================= End of ROCm SMI Log ==============================
|
79 |
+
3:
|
80 |
+
3:
|
81 |
+
3: ======================= ROCm System Management Interface =======================
|
82 |
+
3: ================================= Concise Info =================================
|
83 |
+
3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
|
84 |
+
3: 0 46.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
85 |
+
3: 1 52.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
86 |
+
3: 2 46.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
87 |
+
3: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
88 |
+
3: 4 42.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
89 |
+
3: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
90 |
+
3: 6 47.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
91 |
+
3: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
92 |
+
3: ================================================================================
|
93 |
+
3: ============================= End of ROCm SMI Log ==============================
|
94 |
+
2:
|
95 |
+
2:
|
96 |
+
2: ======================= ROCm System Management Interface =======================
|
97 |
+
2: ================================= Concise Info =================================
|
98 |
+
2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
|
99 |
+
2: 0 45.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
100 |
+
2: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
101 |
+
2: 2 40.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
102 |
+
2: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
103 |
+
2: 4 45.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
104 |
+
2: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
105 |
+
2: 6 40.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
106 |
+
2: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
107 |
+
2: ================================================================================
|
108 |
+
2: ============================= End of ROCm SMI Log ==============================
|
109 |
+
6:
|
110 |
+
6:
|
111 |
+
6: ======================= ROCm System Management Interface =======================
|
112 |
+
6: ================================= Concise Info =================================
|
113 |
+
6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
|
114 |
+
6: 0 48.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
115 |
+
6: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
116 |
+
6: 2 41.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
117 |
+
6: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
118 |
+
6: 4 42.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
119 |
+
6: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
120 |
+
6: 6 39.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
121 |
+
6: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
122 |
+
6: ================================================================================
|
123 |
+
6: ============================= End of ROCm SMI Log ==============================
|
124 |
+
7: Launching on nid006946 (7/8), master nid006939 port 9999, GPUs 8, CUDA: True
|
125 |
+
4: Launching on nid006943 (4/8), master nid006939 port 9999, GPUs 8, CUDA: True
|
126 |
+
6: Launching on nid006945 (6/8), master nid006939 port 9999, GPUs 8, CUDA: True
|
127 |
+
3: Launching on nid006942 (3/8), master nid006939 port 9999, GPUs 8, CUDA: True
|
128 |
+
0: Launching on nid006939 (0/8), master nid006939 port 9999, GPUs 8, CUDA: True
|
129 |
+
5: Launching on nid006944 (5/8), master nid006939 port 9999, GPUs 8, CUDA: True
|
130 |
+
1: Launching on nid006940 (1/8), master nid006939 port 9999, GPUs 8, CUDA: True
|
131 |
+
2: Launching on nid006941 (2/8), master nid006939 port 9999, GPUs 8, CUDA: True
|
132 |
+
7: > setting tensorboard ...
|
133 |
+
0: using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1
|
134 |
+
0: accumulate and all-reduce gradients in fp32 for bfloat16 data type.
|
135 |
+
0: using torch.bfloat16 for parameters ...
|
136 |
+
0: ------------------------ arguments ------------------------
|
137 |
+
0: abort_on_unmet_fused_kernel_constraints ......... False
|
138 |
+
0: accumulate_allreduce_grads_in_fp32 .............. True
|
139 |
+
0: adam_beta1 ...................................... 0.9
|
140 |
+
0: adam_beta2 ...................................... 0.999
|
141 |
+
0: adam_eps ........................................ 1e-08
|
142 |
+
0: adlr_autoresume ................................. False
|
143 |
+
0: adlr_autoresume_interval ........................ 1000
|
144 |
+
0: apply_query_key_layer_scaling ................... True
|
145 |
+
0: apply_residual_connection_post_layernorm ........ False
|
146 |
+
0: attention_dropout ............................... 0.1
|
147 |
+
0: attention_softmax_in_fp32 ....................... False
|
148 |
+
0: bert_binary_head ................................ True
|
149 |
+
0: bert_load ....................................... None
|
150 |
+
0: bf16 ............................................ True
|
151 |
+
0: bias_dropout_fusion ............................. True
|
152 |
+
0: bias_gelu_fusion ................................ True
|
153 |
+
0: biencoder_projection_dim ........................ 0
|
154 |
+
0: biencoder_shared_query_context_model ............ False
|
155 |
+
0: block_data_path ................................. None
|
156 |
+
0: checkpoint_activations .......................... True
|
157 |
+
0: checkpoint_in_cpu ............................... False
|
158 |
+
0: checkpoint_num_layers ........................... 1
|
159 |
+
0: clip_grad ....................................... 1.0
|
160 |
+
0: codecarbon_dir .................................. None
|
161 |
+
0: consumed_train_samples .......................... 0
|
162 |
+
0: consumed_train_tokens ........................... 0
|
163 |
+
0: consumed_valid_samples .......................... 0
|
164 |
+
0: contigious_checkpointing ........................ False
|
165 |
+
0: cpu_optimizer ................................... False
|
166 |
+
0: cpu_torch_adam .................................. False
|
167 |
+
0: curriculum_learning ............................. False
|
168 |
+
0: data_impl ....................................... mmap
|
169 |
+
0: data_parallel_size .............................. 64
|
170 |
+
0: data_path ....................................... None
|
171 |
+
0: dataloader_type ................................. single
|
172 |
+
0: DDP_impl ........................................ local
|
173 |
+
0: decoder_seq_length .............................. None
|
174 |
+
0: deepscale ....................................... False
|
175 |
+
0: deepscale_config ................................ None
|
176 |
+
0: deepspeed ....................................... True
|
177 |
+
0: deepspeed_activation_checkpointing .............. False
|
178 |
+
0: deepspeed_config ................................ ds_configs/3319491.json
|
179 |
+
0: deepspeed_mpi ................................... False
|
180 |
+
0: distribute_checkpointed_activations ............. False
|
181 |
+
0: distributed_backend ............................. nccl
|
182 |
+
0: embed_layernorm ................................. False
|
183 |
+
0: embedding_path .................................. None
|
184 |
+
0: encoder_seq_length .............................. 2048
|
185 |
+
0: eod_mask_loss ................................... False
|
186 |
+
0: eval_interval ................................... 10000
|
187 |
+
0: eval_iters ...................................... 1
|
188 |
+
0: eval_only ....................................... None
|
189 |
+
0: evidence_data_path .............................. None
|
190 |
+
0: exit_duration_in_mins ........................... None
|
191 |
+
0: exit_interval ................................... None
|
192 |
+
0: ffn_hidden_size ................................. 3072
|
193 |
+
0: finetune ........................................ False
|
194 |
+
0: fp16 ............................................ False
|
195 |
+
0: fp16_lm_cross_entropy ........................... False
|
196 |
+
0: fp32_residual_connection ........................ False
|
197 |
+
0: gigaflos_no_embeds .............................. 0
|
198 |
+
0: global_batch_size ............................... 256
|
199 |
+
0: glu_activation .................................. None
|
200 |
+
0: hidden_dropout .................................. 0.1
|
201 |
+
0: hidden_size ..................................... 768
|
202 |
+
0: hysteresis ...................................... 2
|
203 |
+
0: ict_head_size ................................... None
|
204 |
+
0: ict_load ........................................ None
|
205 |
+
0: img_dim ......................................... 224
|
206 |
+
0: indexer_batch_size .............................. 128
|
207 |
+
0: indexer_log_interval ............................ 1000
|
208 |
+
0: inference ....................................... False
|
209 |
+
0: init_method_std ................................. 0.02
|
210 |
+
0: init_method_xavier_uniform ...................... False
|
211 |
+
0: initial_loss_scale .............................. 4294967296
|
212 |
+
0: kill_switch_path ................................ kill-switch-146m174b100m
|
213 |
+
0: kv_channels ..................................... 64
|
214 |
+
0: layer_norm_fusion ............................... True
|
215 |
+
0: layernorm_epsilon ............................... 1e-05
|
216 |
+
0: lazy_mpu_init ................................... None
|
217 |
+
0: load ............................................ checkpoints_146m174b100m
|
218 |
+
0: local_rank ...................................... None
|
219 |
+
0: log_batch_size_to_tensorboard ................... True
|
220 |
+
0: log_interval .................................... 100
|
221 |
+
0: log_learning_rate_to_tensorboard ................ True
|
222 |
+
0: log_level ....................................... None
|
223 |
+
0: log_level_replica ............................... None
|
224 |
+
0: log_loss_scale_to_tensorboard ................... True
|
225 |
+
0: log_num_zeros_in_grad ........................... False
|
226 |
+
0: log_params_norm ................................. False
|
227 |
+
0: log_path ........................................ None
|
228 |
+
0: log_timers_to_tensorboard ....................... True
|
229 |
+
0: log_validation_ppl_to_tensorboard ............... True
|
230 |
+
0: loss_on_targets_only ............................ False
|
231 |
+
0: loss_scale ...................................... 12.0
|
232 |
+
0: loss_scale_window ............................... 1000
|
233 |
+
0: lr .............................................. 0.0002
|
234 |
+
0: lr_decay_iters .................................. None
|
235 |
+
0: lr_decay_samples ................................ 84762549
|
236 |
+
0: lr_decay_style .................................. cosine
|
237 |
+
0: lr_decay_tokens ................................. None
|
238 |
+
0: lr_warmup_fraction .............................. None
|
239 |
+
0: lr_warmup_iters ................................. 0
|
240 |
+
0: lr_warmup_samples ............................... 847625
|
241 |
+
0: make_vocab_size_divisible_by .................... 128
|
242 |
+
0: mask_prob ....................................... 0.15
|
243 |
+
0: masked_softmax_fusion ........................... True
|
244 |
+
0: max_position_embeddings ......................... 2048
|
245 |
+
0: mean_noise_span_length .......................... None
|
246 |
+
0: memory_centric_tiled_linear ..................... False
|
247 |
+
0: merge_file ...................................... gpt2/merges.txt
|
248 |
+
0: micro_batch_size ................................ 4
|
249 |
+
0: min_loss_scale .................................. 1.0
|
250 |
+
0: min_lr .......................................... 2e-05
|
251 |
+
0: mmap_warmup ..................................... False
|
252 |
+
0: no_load_optim ................................... None
|
253 |
+
0: no_load_rng ..................................... None
|
254 |
+
0: no_save_optim ................................... None
|
255 |
+
0: no_save_rng ..................................... None
|
256 |
+
0: noise_density ................................... None
|
257 |
+
0: num_attention_heads ............................. 12
|
258 |
+
0: num_channels .................................... 3
|
259 |
+
0: num_classes ..................................... 1000
|
260 |
+
0: num_layers ...................................... 15
|
261 |
+
0: num_layers_per_virtual_pipeline_stage ........... None
|
262 |
+
0: num_workers ..................................... 2
|
263 |
+
0: onnx_safe ....................................... None
|
264 |
+
0: openai_gelu ..................................... False
|
265 |
+
0: optimizer ....................................... adam
|
266 |
+
0: optimizer_fusion ................................ True
|
267 |
+
0: override_lr_scheduler ........................... False
|
268 |
+
0: pad_vocab_size_to ............................... None
|
269 |
+
0: params_dtype .................................... torch.bfloat16
|
270 |
+
0: partition_activations ........................... False
|
271 |
+
0: patch_dim ....................................... 16
|
272 |
+
0: pipeline_model_parallel_size .................... 1
|
273 |
+
0: position_embedding_type ......................... PositionEmbeddingType.absolute
|
274 |
+
0: pp_partition_method ............................. None
|
275 |
+
0: profile_backward ................................ False
|
276 |
+
0: query_in_block_prob ............................. 0.1
|
277 |
+
0: rampup_batch_size ............................... None
|
278 |
+
0: rank ............................................ 0
|
279 |
+
0: remote_device ................................... none
|
280 |
+
0: reset_attention_mask ............................ False
|
281 |
+
0: reset_position_ids .............................. False
|
282 |
+
0: reset_progress .................................. None
|
283 |
+
0: retriever_report_topk_accuracies ................ []
|
284 |
+
0: retriever_score_scaling ......................... False
|
285 |
+
0: retriever_seq_length ............................ 256
|
286 |
+
0: reweight_loss_based_on_position_frequency ....... False
|
287 |
+
0: sample_rate ..................................... 1.0
|
288 |
+
0: save ............................................ checkpoints_146m174b100m
|
289 |
+
0: save_interval ................................... 10000
|
290 |
+
0: scatter_gather_tensors_in_pipeline .............. True
|
291 |
+
0: scattered_embeddings ............................ False
|
292 |
+
0: seed ............................................ 1234
|
293 |
+
0: seq_length ...................................... 2048
|
294 |
+
0: sgd_momentum .................................... 0.9
|
295 |
+
0: short_seq_prob .................................. 0.1
|
296 |
+
0: skip_train_iteration_range ...................... None
|
297 |
+
0: split ........................................... None
|
298 |
+
0: split_transformers .............................. False
|
299 |
+
0: sync_tp_duplicated_parameters ................... False
|
300 |
+
0: synchronize_each_layer .......................... False
|
301 |
+
0: tensor_model_parallel_size ...................... 1
|
302 |
+
0: tensorboard_dir ................................. tensorboard_146m174b100m
|
303 |
+
0: tensorboard_log_interval ........................ 1
|
304 |
+
0: tensorboard_queue_size .......................... 5
|
305 |
+
0: test_weighted_split_paths ....................... None
|
306 |
+
0: test_weighted_split_paths_path .................. None
|
307 |
+
0: tile_factor ..................................... 1
|
308 |
+
0: titles_data_path ................................ None
|
309 |
+
0: tokenizer_name_or_path .......................... None
|
310 |
+
0: tokenizer_type .................................. GPT2BPETokenizer
|
311 |
+
0: train_iters ..................................... None
|
312 |
+
0: train_samples ................................... 84762549
|
313 |
+
0: train_tokens .................................... None
|
314 |
+
0: train_weighted_split_names ...................... ['train']
|
315 |
+
0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document']]
|
316 |
+
0: train_weighted_split_paths_path ................. None
|
317 |
+
0: train_weighted_split_splits ..................... [['0:1']]
|
318 |
+
0: train_weighted_split_weights .................... [['1.0']]
|
319 |
+
0: universal_checkpoint ............................ False
|
320 |
+
0: use_bnb_optimizer ............................... False
|
321 |
+
0: use_checkpoint_lr_scheduler ..................... False
|
322 |
+
0: use_contiguous_buffers_in_ddp ................... True
|
323 |
+
0: use_cpu_initialization .......................... None
|
324 |
+
0: use_one_sent_docs ............................... False
|
325 |
+
0: use_pin_memory .................................. False
|
326 |
+
0: valid_num_workers ............................... 2
|
327 |
+
0: valid_weighted_split_names ...................... ['validation']
|
328 |
+
0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']]
|
329 |
+
0: valid_weighted_split_paths_path ................. None
|
330 |
+
0: valid_weighted_split_splits ..................... [['0:1']]
|
331 |
+
0: valid_weighted_split_weights .................... [['1.0']]
|
332 |
+
0: virtual_pipeline_model_parallel_size ............ None
|
333 |
+
0: vocab_extra_ids ................................. 0
|
334 |
+
0: vocab_file ...................................... gpt2/vocab.json
|
335 |
+
0: weight_decay .................................... 0.1
|
336 |
+
0: world_size ...................................... 64
|
337 |
+
0: zero_allgather_bucket_size ...................... 0.0
|
338 |
+
0: zero_contigious_gradients ....................... False
|
339 |
+
0: zero_reduce_bucket_size ......................... 0.0
|
340 |
+
0: zero_reduce_scatter ............................. False
|
341 |
+
0: zero_stage ...................................... 0
|
342 |
+
0: -------------------- end of arguments ---------------------
|
343 |
+
0: setting number of micro-batches to constant 1
|
344 |
+
0: > building GPT2BPETokenizer tokenizer ...
|
345 |
+
0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304)
|
346 |
+
0: DeepSpeed general environment info:
|
347 |
+
0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch']
|
348 |
+
0: torch version .................... 1.13.0+rocm5.2
|
349 |
+
0: torch cuda version ............... None
|
350 |
+
0: torch hip version ................ 5.2.21151-afdc89f8
|
351 |
+
0: nvcc version ..................... None
|
352 |
+
0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed']
|
353 |
+
0: deepspeed info ................... 0.7.5, unknown, unknown
|
354 |
+
0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1
|
355 |
+
0: **** Git info for Megatron: git_hash=unknown git_branch=unknown ****
|
356 |
+
0: > initializing torch distributed ...
|
357 |
+
0: [2023-03-17 13:53:41,482] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
358 |
+
0: > initializing tensor model parallel with size 1
|
359 |
+
0: > initializing pipeline model parallel with size 1
|
360 |
+
0: > setting random seeds to 1234 ...
|
361 |
+
0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234
|
362 |
+
0: > compiling dataset index builder ...
|
363 |
+
0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data'
|
364 |
+
0: make: Nothing to be done for 'default'.
|
365 |
+
0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data'
|
366 |
+
0: >>> done with dataset index builder. Compilation time: 0.065 seconds
|
367 |
+
0: > compiling and loading fused kernels ...
|
146m174b100m/3418230.err
ADDED
The diff for this file is too large to render.
See raw diff
|
|
146m174b100m/3418230.out
ADDED
The diff for this file is too large to render.
See raw diff
|
|
146m174b100m/global_step331103/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a4c98e1090308db2255181e2aca94a74aac9746fb46cf9414a935ddbb0dd077a
|
3 |
+
size 27478295
|
146m174b100m/global_step331103/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6d1476ab6d4eec77e81881cc1908787cc4a6b15b8c32bb86bfad8310edb936f1
|
3 |
+
size 27478306
|
146m174b100m/global_step331103/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:def884516c09d88f9b176fb0e4b7b1700e6c5c85e0d665c7969b61cfdfd015dc
|
3 |
+
size 27478242
|
146m174b100m/global_step331103/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c0c097836ff5fb58384147ab6aa7d120af066a5d470ef80578a31649e94d9ac0
|
3 |
+
size 27478242
|
146m174b100m/global_step331103/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a28f9c3d1ae830f174a8dc5745d6d0ff79f21b6e27f6968bf863cc1c1ba304ca
|
3 |
+
size 27478178
|
146m174b100m/global_step331103/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:edbe12ea347202878b4230cf3ab2196fd87af005efba9b184ccbde4b935b2cb9
|
3 |
+
size 27478370
|
146m174b100m/global_step331103/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f79afd73bb01137b7b3e54056734c71668c5a82383845eb5cff4ebf1737fee96
|
3 |
+
size 27478178
|
146m174b100m/global_step331103/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5c444c291f99ffd4f733da747f3eaff0689372c8f30c93dd076eabfa2243b412
|
3 |
+
size 27478306
|
146m174b100m/global_step331103/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:845d58f9b80108560061ae7213d29d9862ac91b4ae9554da8dced8148e2e263c
|
3 |
+
size 27478306
|
146m174b100m/global_step331103/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:55f979cb703553da50ca567dbe8ff7e08e9fd90af3fa63f87b32584876b6ab49
|
3 |
+
size 27478242
|
146m174b100m/global_step331103/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1d2c051ef04a09bb5be2767f4b405a38917cd287ce3dceb225361e9c39720005
|
3 |
+
size 27478306
|
146m174b100m/global_step331103/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:da67e26bde64738bd30f66866cefa0bd96ee748f639a58aef5f5464b22a110d3
|
3 |
+
size 27478231
|
146m174b100m/global_step331103/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:66f7e9052fc56c60e056a0cabe75f3bfbb9c4e2aa4c140947cc1448f482a131b
|
3 |
+
size 27478178
|
146m174b100m/global_step331103/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1884a01bed9a83088aa569b3133274de92219359e511244c6a251926e04ba2d9
|
3 |
+
size 27478242
|
146m174b100m/global_step331103/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a7dd3fa0eca179f104d51a8d960b9224195add2be77fc4cd0d519ad80e859fd6
|
3 |
+
size 27478242
|
146m174b100m/global_step331103/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:42c46817934ef755388554003b278835fa955198b4006f425cc5f6c56f8a13a8
|
3 |
+
size 27478306
|
146m174b100m/global_step331103/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b7acf871012c794ef8b27cef7d7d9ccc460403493abb1086c5bcb4c086a3270a
|
3 |
+
size 27478178
|
146m174b100m/global_step331103/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8e57871912ddb7b687ea1599b09ebd8c35ffa6af36da6e6d014725c7575b7e5c
|
3 |
+
size 27478370
|
146m174b100m/global_step331103/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:577b35e4a5c49f4918ff08bf588b76aebef387a3c38ba48b646109841e938cb4
|
3 |
+
size 27478178
|
146m174b100m/global_step331103/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:07b748167b5717e57b41fcdcf2f56ca9816aa3d01203d1724680ed9db4aa1684
|
3 |
+
size 27478370
|
146m174b100m/global_step331103/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e02a9b7880f7ef23bf9da8732dbe54f679ce866a6e948fc768a13cad4c95f514
|
3 |
+
size 27478242
|
146m174b100m/global_step331103/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:be012283f5da09de0c09aadfce59f56ebcfe99f6ba57cdf8292f37f991c54bd5
|
3 |
+
size 27478370
|
146m174b100m/global_step331103/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4d1c020a58e8218cad82e7f1496a04ad2a5b9fb0a4e18d2766c37e479c55ea57
|
3 |
+
size 27478231
|
146m174b100m/global_step331103/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:75dd11e69fb3fdd308cd8c04c914e5da7b4eb9e910794c61336541300bbd7deb
|
3 |
+
size 27478242
|
146m174b100m/global_step331103/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3dfbcd217b5363f82c4b59ad5f033b0826596514af0c6bc76b8f3e2d25fb0f8d
|
3 |
+
size 27478370
|
146m174b100m/global_step331103/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8c74eb1b55c392ff5210506eb075303c2fbb3cc0fccd5ef18075bfcc10f362e3
|
3 |
+
size 27478178
|
146m174b100m/global_step331103/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1dce65e517d122fb1794bc6b16b09eafab4e0dd91782ca19bb365ed221d70465
|
3 |
+
size 27478434
|
146m174b100m/global_step331103/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:739896e4af1c14fabab2949c2becb4ad280faabaedcaf3c583cfb3ca20f25867
|
3 |
+
size 27478370
|
146m174b100m/global_step331103/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:44ef176168409b59ff9a2d3039dfb29d30964813f9d1425d9b4cf1492414c05f
|
3 |
+
size 27478242
|
146m174b100m/global_step331103/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2d68662500747b667c8bc925ce368fff78693ecaac7836f48afae3de9b25ffc5
|
3 |
+
size 27478306
|
146m174b100m/global_step331103/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f842ef4f790cc356a71f8c4dfe8aac0bd01a39915544fabb50748ddf361bd986
|
3 |
+
size 27478114
|
146m174b100m/global_step331103/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ebeacef06838267442e5501541edda74e38255784fdda27036cb15f964b3118d
|
3 |
+
size 27478306
|
146m174b100m/global_step331103/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9228d5c5c4e0ed700b2132d40acf57e7face3a7fd627f79b914f1b2f8eaac2f1
|
3 |
+
size 27478434
|
146m174b100m/global_step331103/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1c5485d13731d7a9df1a5091d29966b4de19bef1b1a7869e87ee1d6faafb55a5
|
3 |
+
size 27478231
|
146m174b100m/global_step331103/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a4f4b805b4d92c005b78d66cc7220dcdd3df361f372b8381dd9154f98d221f05
|
3 |
+
size 27478242
|
146m174b100m/global_step331103/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ed0845169d18c8425f17850dd74239a4988b8aa952a2055ca4ea527b85d025bc
|
3 |
+
size 27478306
|
146m174b100m/global_step331103/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ff04b43ad01b17675404f821c59281315d264dcadd9f8143fa97fa8634fa8d95
|
3 |
+
size 27478306
|
146m174b100m/global_step331103/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a5968e82c36a8f31445b66ddd50b17a3bb56618012a8004073aa8c2e0ff74e14
|
3 |
+
size 27478178
|
146m174b100m/global_step331103/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3f74714e68f68ca003e8600b8a74738d4da295d89ddeeed9633758aa3616456c
|
3 |
+
size 27478434
|
146m174b100m/global_step331103/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5b41da6605be07d3fe2d2a1111d9aea1c48c91b209d1376072fd3ff71e3f45ab
|
3 |
+
size 27478114
|