bbytxt committed on
Commit
6760a63
·
verified ·
1 Parent(s): 19fa1ee

Training in progress, step 25, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "o_proj",
 
24
  "v_proj",
25
- "gate_proj",
26
  "down_proj",
27
- "up_proj",
28
  "k_proj",
29
- "q_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "up_proj",
24
+ "q_proj",
25
  "v_proj",
26
+ "o_proj",
27
  "down_proj",
 
28
  "k_proj",
29
+ "gate_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d4e18070f1fbb323ff7ff66736dbb195610e6b5ce5e81b24898f35e97a06abc
3
  size 335604696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27ea72c3fef81e660c61caf875f657fc3522ea93174492ce188ba45643557473
3
  size 335604696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:39f29dcc78510dc497cd6eee55d10960c256b53fccec8fb8f066642446661b29
3
  size 170920084
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2d7b3b44effdefdd96a1c4f3c20120ee3d7bd2811acdd3d73aeba990d9c6ff2
3
  size 170920084
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b628271795a824ff26c9ba2c9328f6e4104a7f47aab89bc3ae57b145971748ed
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5537e910b76a6f610d3778b948cabe4cd17467c9f407942be9530729a5c41a5a
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1df0528620c07325b8faa7567e59b0c1e86a1f1ee6af1245a69c6c0463fe4e2
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6676fe28230ae15b45fb334c871c6fdf1a7984a935952b9f8650896c37a8c106
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "best_metric": 1.06325101852417,
3
- "best_model_checkpoint": "miner_id_24/checkpoint-50",
4
- "epoch": 0.15464913976416006,
5
  "eval_steps": 25,
6
- "global_step": 50,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.003092982795283201,
13
- "grad_norm": 62.20377731323242,
14
  "learning_rate": 1e-05,
15
  "loss": 25.075,
16
  "step": 1
@@ -18,369 +18,186 @@
18
  {
19
  "epoch": 0.003092982795283201,
20
  "eval_loss": 1.5490102767944336,
21
- "eval_runtime": 149.2461,
22
- "eval_samples_per_second": 3.652,
23
  "eval_steps_per_second": 1.829,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.006185965590566402,
28
- "grad_norm": 66.19660949707031,
29
  "learning_rate": 2e-05,
30
  "loss": 23.6882,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.009278948385849604,
35
- "grad_norm": 63.24820327758789,
36
  "learning_rate": 3e-05,
37
  "loss": 23.8152,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.012371931181132804,
42
- "grad_norm": 65.91133880615234,
43
  "learning_rate": 4e-05,
44
- "loss": 24.4938,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.015464913976416006,
49
- "grad_norm": 37.1609001159668,
50
  "learning_rate": 5e-05,
51
- "loss": 20.4001,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.01855789677169921,
56
- "grad_norm": 37.836673736572266,
57
  "learning_rate": 6e-05,
58
- "loss": 21.0493,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.02165087956698241,
63
- "grad_norm": 35.000755310058594,
64
  "learning_rate": 7e-05,
65
- "loss": 19.5046,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.02474386236226561,
70
- "grad_norm": 31.371307373046875,
71
  "learning_rate": 8e-05,
72
- "loss": 18.7178,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.02783684515754881,
77
- "grad_norm": 36.98615264892578,
78
  "learning_rate": 9e-05,
79
- "loss": 19.6468,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.03092982795283201,
84
- "grad_norm": 33.72000503540039,
85
  "learning_rate": 0.0001,
86
- "loss": 19.1595,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.034022810748115216,
91
- "grad_norm": 32.670860290527344,
92
  "learning_rate": 9.98458666866564e-05,
93
- "loss": 18.3969,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.03711579354339842,
98
- "grad_norm": 35.979820251464844,
99
  "learning_rate": 9.938441702975689e-05,
100
- "loss": 19.4862,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.04020877633868162,
105
- "grad_norm": 28.703033447265625,
106
  "learning_rate": 9.861849601988383e-05,
107
- "loss": 17.8546,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.04330175913396482,
112
- "grad_norm": 24.526432037353516,
113
  "learning_rate": 9.755282581475769e-05,
114
- "loss": 17.7441,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.046394741929248015,
119
- "grad_norm": 25.215848922729492,
120
  "learning_rate": 9.619397662556435e-05,
121
- "loss": 17.3959,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.04948772472453122,
126
- "grad_norm": 26.21254539489746,
127
  "learning_rate": 9.45503262094184e-05,
128
- "loss": 17.1178,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.05258070751981442,
133
- "grad_norm": 27.829181671142578,
134
  "learning_rate": 9.263200821770461e-05,
135
- "loss": 19.3058,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.05567369031509762,
140
- "grad_norm": 25.054349899291992,
141
  "learning_rate": 9.045084971874738e-05,
142
- "loss": 17.6997,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.05876667311038082,
147
- "grad_norm": 25.53619384765625,
148
  "learning_rate": 8.802029828000156e-05,
149
- "loss": 17.6693,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.06185965590566402,
154
- "grad_norm": 29.731338500976562,
155
  "learning_rate": 8.535533905932738e-05,
156
- "loss": 18.1907,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.06495263870094722,
161
- "grad_norm": 27.52916145324707,
162
  "learning_rate": 8.247240241650918e-05,
163
- "loss": 17.7958,
164
  "step": 21
165
  },
166
  {
167
  "epoch": 0.06804562149623043,
168
- "grad_norm": 27.228439331054688,
169
  "learning_rate": 7.938926261462366e-05,
170
- "loss": 18.8158,
171
  "step": 22
172
  },
173
  {
174
  "epoch": 0.07113860429151363,
175
- "grad_norm": 26.449893951416016,
176
  "learning_rate": 7.612492823579745e-05,
177
- "loss": 17.0907,
178
  "step": 23
179
  },
180
  {
181
  "epoch": 0.07423158708679684,
182
- "grad_norm": 26.7717227935791,
183
  "learning_rate": 7.269952498697734e-05,
184
- "loss": 19.3673,
185
  "step": 24
186
  },
187
  {
188
  "epoch": 0.07732456988208003,
189
- "grad_norm": 24.446931838989258,
190
  "learning_rate": 6.91341716182545e-05,
191
- "loss": 17.9268,
192
  "step": 25
193
  },
194
  {
195
  "epoch": 0.07732456988208003,
196
- "eval_loss": 1.0832998752593994,
197
- "eval_runtime": 150.2298,
198
- "eval_samples_per_second": 3.628,
199
- "eval_steps_per_second": 1.817,
200
  "step": 25
201
- },
202
- {
203
- "epoch": 0.08041755267736324,
204
- "grad_norm": 24.669111251831055,
205
- "learning_rate": 6.545084971874738e-05,
206
- "loss": 17.5459,
207
- "step": 26
208
- },
209
- {
210
- "epoch": 0.08351053547264643,
211
- "grad_norm": 23.340457916259766,
212
- "learning_rate": 6.167226819279528e-05,
213
- "loss": 16.5561,
214
- "step": 27
215
- },
216
- {
217
- "epoch": 0.08660351826792964,
218
- "grad_norm": 25.84408950805664,
219
- "learning_rate": 5.782172325201155e-05,
220
- "loss": 17.3012,
221
- "step": 28
222
- },
223
- {
224
- "epoch": 0.08969650106321284,
225
- "grad_norm": 25.935504913330078,
226
- "learning_rate": 5.392295478639225e-05,
227
- "loss": 18.7042,
228
- "step": 29
229
- },
230
- {
231
- "epoch": 0.09278948385849603,
232
- "grad_norm": 23.08302879333496,
233
- "learning_rate": 5e-05,
234
- "loss": 16.2506,
235
- "step": 30
236
- },
237
- {
238
- "epoch": 0.09588246665377924,
239
- "grad_norm": 24.507280349731445,
240
- "learning_rate": 4.607704521360776e-05,
241
- "loss": 18.5361,
242
- "step": 31
243
- },
244
- {
245
- "epoch": 0.09897544944906243,
246
- "grad_norm": 22.492021560668945,
247
- "learning_rate": 4.2178276747988446e-05,
248
- "loss": 16.9076,
249
- "step": 32
250
- },
251
- {
252
- "epoch": 0.10206843224434564,
253
- "grad_norm": 24.12836456298828,
254
- "learning_rate": 3.832773180720475e-05,
255
- "loss": 18.5358,
256
- "step": 33
257
- },
258
- {
259
- "epoch": 0.10516141503962884,
260
- "grad_norm": 22.608657836914062,
261
- "learning_rate": 3.4549150281252636e-05,
262
- "loss": 16.6398,
263
- "step": 34
264
- },
265
- {
266
- "epoch": 0.10825439783491204,
267
- "grad_norm": 21.76448631286621,
268
- "learning_rate": 3.086582838174551e-05,
269
- "loss": 16.4674,
270
- "step": 35
271
- },
272
- {
273
- "epoch": 0.11134738063019524,
274
- "grad_norm": 24.399173736572266,
275
- "learning_rate": 2.7300475013022663e-05,
276
- "loss": 16.219,
277
- "step": 36
278
- },
279
- {
280
- "epoch": 0.11444036342547845,
281
- "grad_norm": 21.240434646606445,
282
- "learning_rate": 2.3875071764202563e-05,
283
- "loss": 14.7258,
284
- "step": 37
285
- },
286
- {
287
- "epoch": 0.11753334622076164,
288
- "grad_norm": 23.667985916137695,
289
- "learning_rate": 2.061073738537635e-05,
290
- "loss": 17.4903,
291
- "step": 38
292
- },
293
- {
294
- "epoch": 0.12062632901604485,
295
- "grad_norm": 24.75313377380371,
296
- "learning_rate": 1.7527597583490822e-05,
297
- "loss": 18.4833,
298
- "step": 39
299
- },
300
- {
301
- "epoch": 0.12371931181132804,
302
- "grad_norm": 28.104583740234375,
303
- "learning_rate": 1.4644660940672627e-05,
304
- "loss": 16.1857,
305
- "step": 40
306
- },
307
- {
308
- "epoch": 0.12681229460661125,
309
- "grad_norm": 24.995805740356445,
310
- "learning_rate": 1.1979701719998453e-05,
311
- "loss": 16.9785,
312
- "step": 41
313
- },
314
- {
315
- "epoch": 0.12990527740189445,
316
- "grad_norm": 26.10293197631836,
317
- "learning_rate": 9.549150281252633e-06,
318
- "loss": 17.8392,
319
- "step": 42
320
- },
321
- {
322
- "epoch": 0.13299826019717764,
323
- "grad_norm": 22.538623809814453,
324
- "learning_rate": 7.367991782295391e-06,
325
- "loss": 16.8663,
326
- "step": 43
327
- },
328
- {
329
- "epoch": 0.13609124299246086,
330
- "grad_norm": 23.371549606323242,
331
- "learning_rate": 5.449673790581611e-06,
332
- "loss": 17.0096,
333
- "step": 44
334
- },
335
- {
336
- "epoch": 0.13918422578774406,
337
- "grad_norm": 22.70631980895996,
338
- "learning_rate": 3.8060233744356633e-06,
339
- "loss": 16.9947,
340
- "step": 45
341
- },
342
- {
343
- "epoch": 0.14227720858302725,
344
- "grad_norm": 26.935110092163086,
345
- "learning_rate": 2.4471741852423237e-06,
346
- "loss": 17.6166,
347
- "step": 46
348
- },
349
- {
350
- "epoch": 0.14537019137831045,
351
- "grad_norm": 24.898109436035156,
352
- "learning_rate": 1.3815039801161721e-06,
353
- "loss": 17.4678,
354
- "step": 47
355
- },
356
- {
357
- "epoch": 0.14846317417359367,
358
- "grad_norm": 23.253236770629883,
359
- "learning_rate": 6.15582970243117e-07,
360
- "loss": 17.145,
361
- "step": 48
362
- },
363
- {
364
- "epoch": 0.15155615696887687,
365
- "grad_norm": 23.85171890258789,
366
- "learning_rate": 1.5413331334360182e-07,
367
- "loss": 17.1861,
368
- "step": 49
369
- },
370
- {
371
- "epoch": 0.15464913976416006,
372
- "grad_norm": 22.377422332763672,
373
- "learning_rate": 0.0,
374
- "loss": 19.1859,
375
- "step": 50
376
- },
377
- {
378
- "epoch": 0.15464913976416006,
379
- "eval_loss": 1.06325101852417,
380
- "eval_runtime": 150.4088,
381
- "eval_samples_per_second": 3.623,
382
- "eval_steps_per_second": 1.815,
383
- "step": 50
384
  }
385
  ],
386
  "logging_steps": 1,
@@ -404,12 +221,12 @@
404
  "should_evaluate": false,
405
  "should_log": false,
406
  "should_save": true,
407
- "should_training_stop": true
408
  },
409
  "attributes": {}
410
  }
411
  },
412
- "total_flos": 1.5435789463506125e+17,
413
  "train_batch_size": 2,
414
  "trial_name": null,
415
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.0824270248413086,
3
+ "best_model_checkpoint": "miner_id_24/checkpoint-25",
4
+ "epoch": 0.07732456988208003,
5
  "eval_steps": 25,
6
+ "global_step": 25,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.003092982795283201,
13
+ "grad_norm": 63.227630615234375,
14
  "learning_rate": 1e-05,
15
  "loss": 25.075,
16
  "step": 1
 
18
  {
19
  "epoch": 0.003092982795283201,
20
  "eval_loss": 1.5490102767944336,
21
+ "eval_runtime": 149.2754,
22
+ "eval_samples_per_second": 3.651,
23
  "eval_steps_per_second": 1.829,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.006185965590566402,
28
+ "grad_norm": 66.90909576416016,
29
  "learning_rate": 2e-05,
30
  "loss": 23.6882,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.009278948385849604,
35
+ "grad_norm": 63.644508361816406,
36
  "learning_rate": 3e-05,
37
  "loss": 23.8152,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.012371931181132804,
42
+ "grad_norm": 66.12242126464844,
43
  "learning_rate": 4e-05,
44
+ "loss": 24.5011,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.015464913976416006,
49
+ "grad_norm": 37.521697998046875,
50
  "learning_rate": 5e-05,
51
+ "loss": 20.4137,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.01855789677169921,
56
+ "grad_norm": 36.76587677001953,
57
  "learning_rate": 6e-05,
58
+ "loss": 21.0439,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.02165087956698241,
63
+ "grad_norm": 35.1934928894043,
64
  "learning_rate": 7e-05,
65
+ "loss": 19.4824,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.02474386236226561,
70
+ "grad_norm": 34.91908645629883,
71
  "learning_rate": 8e-05,
72
+ "loss": 18.7023,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.02783684515754881,
77
+ "grad_norm": 37.3280143737793,
78
  "learning_rate": 9e-05,
79
+ "loss": 19.6331,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.03092982795283201,
84
+ "grad_norm": 33.82084655761719,
85
  "learning_rate": 0.0001,
86
+ "loss": 19.1644,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.034022810748115216,
91
+ "grad_norm": 28.904260635375977,
92
  "learning_rate": 9.98458666866564e-05,
93
+ "loss": 18.3894,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.03711579354339842,
98
+ "grad_norm": 33.33706283569336,
99
  "learning_rate": 9.938441702975689e-05,
100
+ "loss": 19.5411,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.04020877633868162,
105
+ "grad_norm": 28.71995735168457,
106
  "learning_rate": 9.861849601988383e-05,
107
+ "loss": 17.7854,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.04330175913396482,
112
+ "grad_norm": 24.508121490478516,
113
  "learning_rate": 9.755282581475769e-05,
114
+ "loss": 17.7444,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.046394741929248015,
119
+ "grad_norm": 25.298206329345703,
120
  "learning_rate": 9.619397662556435e-05,
121
+ "loss": 17.419,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.04948772472453122,
126
+ "grad_norm": 26.203685760498047,
127
  "learning_rate": 9.45503262094184e-05,
128
+ "loss": 17.0621,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.05258070751981442,
133
+ "grad_norm": 28.71617889404297,
134
  "learning_rate": 9.263200821770461e-05,
135
+ "loss": 19.3019,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.05567369031509762,
140
+ "grad_norm": 24.17940902709961,
141
  "learning_rate": 9.045084971874738e-05,
142
+ "loss": 17.6426,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.05876667311038082,
147
+ "grad_norm": 25.524850845336914,
148
  "learning_rate": 8.802029828000156e-05,
149
+ "loss": 17.6575,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.06185965590566402,
154
+ "grad_norm": 30.970611572265625,
155
  "learning_rate": 8.535533905932738e-05,
156
+ "loss": 18.1651,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.06495263870094722,
161
+ "grad_norm": 26.73845100402832,
162
  "learning_rate": 8.247240241650918e-05,
163
+ "loss": 17.7731,
164
  "step": 21
165
  },
166
  {
167
  "epoch": 0.06804562149623043,
168
+ "grad_norm": 27.083934783935547,
169
  "learning_rate": 7.938926261462366e-05,
170
+ "loss": 18.8107,
171
  "step": 22
172
  },
173
  {
174
  "epoch": 0.07113860429151363,
175
+ "grad_norm": 26.337148666381836,
176
  "learning_rate": 7.612492823579745e-05,
177
+ "loss": 17.086,
178
  "step": 23
179
  },
180
  {
181
  "epoch": 0.07423158708679684,
182
+ "grad_norm": 24.6472225189209,
183
  "learning_rate": 7.269952498697734e-05,
184
+ "loss": 19.3557,
185
  "step": 24
186
  },
187
  {
188
  "epoch": 0.07732456988208003,
189
+ "grad_norm": 24.704971313476562,
190
  "learning_rate": 6.91341716182545e-05,
191
+ "loss": 17.915,
192
  "step": 25
193
  },
194
  {
195
  "epoch": 0.07732456988208003,
196
+ "eval_loss": 1.0824270248413086,
197
+ "eval_runtime": 150.1751,
198
+ "eval_samples_per_second": 3.629,
199
+ "eval_steps_per_second": 1.818,
200
  "step": 25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  }
202
  ],
203
  "logging_steps": 1,
 
221
  "should_evaluate": false,
222
  "should_log": false,
223
  "should_save": true,
224
+ "should_training_stop": false
225
  },
226
  "attributes": {}
227
  }
228
  },
229
+ "total_flos": 7.709054073411994e+16,
230
  "train_batch_size": 2,
231
  "trial_name": null,
232
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c7893fcc1d921507987621a11ba58392fa5578668e213df95a55c81995950c5
3
  size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6f6e4fa385cc360d0456add741e899e95c6f03dade6986e9b6175844c1c2dab
3
  size 6776