sedrickkeh commited on
Commit
10f9d89
·
verified ·
1 Parent(s): 94ec1ab

End of training

Browse files
README.md CHANGED
@@ -4,6 +4,7 @@ license: llama3.1
4
  base_model: meta-llama/Llama-3.1-8B
5
  tags:
6
  - llama-factory
 
7
  - generated_from_trainer
8
  model-index:
9
  - name: OH_DCFT_V3_wo_camel_ai_biology
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # OH_DCFT_V3_wo_camel_ai_biology
17
 
18
- This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
  - Loss: 0.6390
21
 
 
4
  base_model: meta-llama/Llama-3.1-8B
5
  tags:
6
  - llama-factory
7
+ - full
8
  - generated_from_trainer
9
  model-index:
10
  - name: OH_DCFT_V3_wo_camel_ai_biology
 
16
 
17
  # OH_DCFT_V3_wo_camel_ai_biology
18
 
19
+ This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the mlfoundations-dev/OH_DCFT_V3_wo_camel_ai_biology dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 0.6390
22
 
all_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
  "epoch": 2.9945750452079567,
3
- "eval_loss": 0.6422178745269775,
4
- "eval_runtime": 223.8998,
5
- "eval_samples_per_second": 49.915,
6
- "eval_steps_per_second": 0.393,
7
  "total_flos": 2079977499525120.0,
8
- "train_loss": 0.6134321775029439,
9
- "train_runtime": 37323.5248,
10
- "train_samples_per_second": 17.067,
11
  "train_steps_per_second": 0.033
12
  }
 
1
  {
2
  "epoch": 2.9945750452079567,
3
+ "eval_loss": 0.6389562487602234,
4
+ "eval_runtime": 221.8509,
5
+ "eval_samples_per_second": 50.376,
6
+ "eval_steps_per_second": 0.397,
7
  "total_flos": 2079977499525120.0,
8
+ "train_loss": 0.615725677754376,
9
+ "train_runtime": 37262.9121,
10
+ "train_samples_per_second": 17.094,
11
  "train_steps_per_second": 0.033
12
  }
eval_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 2.9945750452079567,
3
- "eval_loss": 0.6422178745269775,
4
- "eval_runtime": 223.8998,
5
- "eval_samples_per_second": 49.915,
6
- "eval_steps_per_second": 0.393
7
  }
 
1
  {
2
  "epoch": 2.9945750452079567,
3
+ "eval_loss": 0.6389562487602234,
4
+ "eval_runtime": 221.8509,
5
+ "eval_samples_per_second": 50.376,
6
+ "eval_steps_per_second": 0.397
7
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.9945750452079567,
3
  "total_flos": 2079977499525120.0,
4
- "train_loss": 0.6134321775029439,
5
- "train_runtime": 37323.5248,
6
- "train_samples_per_second": 17.067,
7
  "train_steps_per_second": 0.033
8
  }
 
1
  {
2
  "epoch": 2.9945750452079567,
3
  "total_flos": 2079977499525120.0,
4
+ "train_loss": 0.615725677754376,
5
+ "train_runtime": 37262.9121,
6
+ "train_samples_per_second": 17.094,
7
  "train_steps_per_second": 0.033
8
  }
trainer_state.json CHANGED
@@ -10,903 +10,903 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.024110910186859555,
13
- "grad_norm": 2.685936547429143,
14
  "learning_rate": 5e-06,
15
- "loss": 0.9133,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.04822182037371911,
20
- "grad_norm": 2.8983680167236696,
21
  "learning_rate": 5e-06,
22
- "loss": 0.7794,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.07233273056057866,
27
- "grad_norm": 1.4715031186017837,
28
  "learning_rate": 5e-06,
29
- "loss": 0.759,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.09644364074743822,
34
- "grad_norm": 0.8377570160614484,
35
  "learning_rate": 5e-06,
36
- "loss": 0.7317,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.12055455093429777,
41
- "grad_norm": 1.0172390996140888,
42
  "learning_rate": 5e-06,
43
- "loss": 0.728,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.14466546112115733,
48
- "grad_norm": 2.076736903100202,
49
  "learning_rate": 5e-06,
50
- "loss": 0.7065,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.16877637130801687,
55
- "grad_norm": 0.9324814149418421,
56
  "learning_rate": 5e-06,
57
- "loss": 0.7025,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.19288728149487644,
62
- "grad_norm": 0.8961533346444337,
63
  "learning_rate": 5e-06,
64
- "loss": 0.6948,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.21699819168173598,
69
- "grad_norm": 0.9833679322557037,
70
  "learning_rate": 5e-06,
71
- "loss": 0.6997,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.24110910186859555,
76
- "grad_norm": 0.6320425718923794,
77
  "learning_rate": 5e-06,
78
- "loss": 0.6781,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.2652200120554551,
83
- "grad_norm": 0.8047932595929222,
84
  "learning_rate": 5e-06,
85
- "loss": 0.6857,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.28933092224231466,
90
- "grad_norm": 0.6873832051216665,
91
  "learning_rate": 5e-06,
92
- "loss": 0.6762,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.3134418324291742,
97
- "grad_norm": 0.8515251273118922,
98
  "learning_rate": 5e-06,
99
- "loss": 0.6741,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.33755274261603374,
104
- "grad_norm": 0.6815399625732373,
105
  "learning_rate": 5e-06,
106
- "loss": 0.6786,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.3616636528028933,
111
- "grad_norm": 0.6231823130658575,
112
  "learning_rate": 5e-06,
113
- "loss": 0.6763,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.3857745629897529,
118
- "grad_norm": 0.552827319574485,
119
  "learning_rate": 5e-06,
120
- "loss": 0.6711,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.4098854731766124,
125
- "grad_norm": 0.6826986498299203,
126
  "learning_rate": 5e-06,
127
- "loss": 0.6706,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.43399638336347196,
132
- "grad_norm": 0.5574310360062503,
133
  "learning_rate": 5e-06,
134
- "loss": 0.6659,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.45810729355033153,
139
- "grad_norm": 0.7613567669157012,
140
  "learning_rate": 5e-06,
141
- "loss": 0.6658,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.4822182037371911,
146
- "grad_norm": 0.5609659476480818,
147
  "learning_rate": 5e-06,
148
- "loss": 0.6598,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.5063291139240507,
153
- "grad_norm": 1.078834895881199,
154
  "learning_rate": 5e-06,
155
- "loss": 0.6687,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.5304400241109102,
160
- "grad_norm": 0.6016551358752319,
161
  "learning_rate": 5e-06,
162
- "loss": 0.6667,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.5545509342977697,
167
- "grad_norm": 0.5329067498961892,
168
  "learning_rate": 5e-06,
169
- "loss": 0.6568,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.5786618444846293,
174
- "grad_norm": 0.5844269800148942,
175
  "learning_rate": 5e-06,
176
- "loss": 0.6656,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.6027727546714888,
181
- "grad_norm": 0.713015217035973,
182
  "learning_rate": 5e-06,
183
- "loss": 0.6584,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.6268836648583485,
188
- "grad_norm": 0.7063878216983879,
189
  "learning_rate": 5e-06,
190
- "loss": 0.665,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.650994575045208,
195
- "grad_norm": 0.50774960805631,
196
  "learning_rate": 5e-06,
197
- "loss": 0.6615,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.6751054852320675,
202
- "grad_norm": 0.6111313528033431,
203
  "learning_rate": 5e-06,
204
- "loss": 0.6551,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.6992163954189271,
209
- "grad_norm": 0.6458858962308502,
210
  "learning_rate": 5e-06,
211
- "loss": 0.6535,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.7233273056057866,
216
- "grad_norm": 0.6797329430329018,
217
  "learning_rate": 5e-06,
218
- "loss": 0.6616,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.7474382157926461,
223
- "grad_norm": 1.0271382997748104,
224
  "learning_rate": 5e-06,
225
- "loss": 0.6593,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.7715491259795058,
230
- "grad_norm": 0.5821025343959978,
231
  "learning_rate": 5e-06,
232
- "loss": 0.6556,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.7956600361663653,
237
- "grad_norm": 0.575144218324774,
238
  "learning_rate": 5e-06,
239
- "loss": 0.6522,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.8197709463532248,
244
- "grad_norm": 0.4992177743591918,
245
  "learning_rate": 5e-06,
246
- "loss": 0.6472,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.8438818565400844,
251
- "grad_norm": 0.5518799725500897,
252
  "learning_rate": 5e-06,
253
- "loss": 0.6486,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.8679927667269439,
258
- "grad_norm": 0.6827706978670125,
259
  "learning_rate": 5e-06,
260
- "loss": 0.6527,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.8921036769138035,
265
- "grad_norm": 0.5370276906753118,
266
  "learning_rate": 5e-06,
267
- "loss": 0.6564,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.9162145871006631,
272
- "grad_norm": 0.5011748190469159,
273
  "learning_rate": 5e-06,
274
- "loss": 0.648,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.9403254972875226,
279
- "grad_norm": 0.7289445343800255,
280
  "learning_rate": 5e-06,
281
- "loss": 0.645,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.9644364074743822,
286
- "grad_norm": 0.5223137931656774,
287
  "learning_rate": 5e-06,
288
- "loss": 0.6481,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.9885473176612417,
293
- "grad_norm": 0.5702001612329072,
294
  "learning_rate": 5e-06,
295
- "loss": 0.6417,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 0.9981916817359855,
300
- "eval_loss": 0.6491908431053162,
301
- "eval_runtime": 223.3579,
302
- "eval_samples_per_second": 50.036,
303
- "eval_steps_per_second": 0.394,
304
  "step": 414
305
  },
306
  {
307
  "epoch": 1.0126582278481013,
308
- "grad_norm": 0.6141037997267318,
309
  "learning_rate": 5e-06,
310
- "loss": 0.6207,
311
  "step": 420
312
  },
313
  {
314
  "epoch": 1.0367691380349608,
315
- "grad_norm": 0.5738222179228437,
316
  "learning_rate": 5e-06,
317
- "loss": 0.6073,
318
  "step": 430
319
  },
320
  {
321
  "epoch": 1.0608800482218204,
322
- "grad_norm": 0.6152321417799828,
323
  "learning_rate": 5e-06,
324
- "loss": 0.5963,
325
  "step": 440
326
  },
327
  {
328
  "epoch": 1.0849909584086799,
329
- "grad_norm": 0.6230797448075694,
330
  "learning_rate": 5e-06,
331
- "loss": 0.6035,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 1.1091018685955394,
336
- "grad_norm": 0.5547485435536735,
337
  "learning_rate": 5e-06,
338
- "loss": 0.6043,
339
  "step": 460
340
  },
341
  {
342
  "epoch": 1.1332127787823991,
343
- "grad_norm": 0.6897788261093171,
344
  "learning_rate": 5e-06,
345
- "loss": 0.6059,
346
  "step": 470
347
  },
348
  {
349
  "epoch": 1.1573236889692586,
350
- "grad_norm": 0.5319379437293987,
351
  "learning_rate": 5e-06,
352
- "loss": 0.5991,
353
  "step": 480
354
  },
355
  {
356
  "epoch": 1.1814345991561181,
357
- "grad_norm": 0.5927433636509655,
358
  "learning_rate": 5e-06,
359
- "loss": 0.6033,
360
  "step": 490
361
  },
362
  {
363
  "epoch": 1.2055455093429777,
364
- "grad_norm": 0.6178241976987927,
365
  "learning_rate": 5e-06,
366
- "loss": 0.6,
367
  "step": 500
368
  },
369
  {
370
  "epoch": 1.2296564195298372,
371
- "grad_norm": 0.5009847922110348,
372
  "learning_rate": 5e-06,
373
- "loss": 0.5987,
374
  "step": 510
375
  },
376
  {
377
  "epoch": 1.253767329716697,
378
- "grad_norm": 0.6865827636690425,
379
  "learning_rate": 5e-06,
380
- "loss": 0.6039,
381
  "step": 520
382
  },
383
  {
384
  "epoch": 1.2778782399035564,
385
- "grad_norm": 0.6419339118636196,
386
  "learning_rate": 5e-06,
387
- "loss": 0.5977,
388
  "step": 530
389
  },
390
  {
391
  "epoch": 1.301989150090416,
392
- "grad_norm": 0.5403820568820131,
393
  "learning_rate": 5e-06,
394
- "loss": 0.6053,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 1.3261000602772754,
399
- "grad_norm": 0.496944344094317,
400
  "learning_rate": 5e-06,
401
- "loss": 0.6043,
402
  "step": 550
403
  },
404
  {
405
  "epoch": 1.350210970464135,
406
- "grad_norm": 0.6835364259470225,
407
  "learning_rate": 5e-06,
408
- "loss": 0.6015,
409
  "step": 560
410
  },
411
  {
412
  "epoch": 1.3743218806509945,
413
- "grad_norm": 0.5433357613957998,
414
  "learning_rate": 5e-06,
415
- "loss": 0.5979,
416
  "step": 570
417
  },
418
  {
419
  "epoch": 1.3984327908378542,
420
- "grad_norm": 0.48293592352088544,
421
  "learning_rate": 5e-06,
422
- "loss": 0.6039,
423
  "step": 580
424
  },
425
  {
426
  "epoch": 1.4225437010247137,
427
- "grad_norm": 0.5167692584382013,
428
  "learning_rate": 5e-06,
429
- "loss": 0.6029,
430
  "step": 590
431
  },
432
  {
433
  "epoch": 1.4466546112115732,
434
- "grad_norm": 0.5467014681458703,
435
  "learning_rate": 5e-06,
436
- "loss": 0.6056,
437
  "step": 600
438
  },
439
  {
440
  "epoch": 1.4707655213984328,
441
- "grad_norm": 0.48669984975762765,
442
  "learning_rate": 5e-06,
443
- "loss": 0.6053,
444
  "step": 610
445
  },
446
  {
447
  "epoch": 1.4948764315852923,
448
- "grad_norm": 0.5052139384494145,
449
  "learning_rate": 5e-06,
450
- "loss": 0.607,
451
  "step": 620
452
  },
453
  {
454
  "epoch": 1.518987341772152,
455
- "grad_norm": 0.5189039466272587,
456
  "learning_rate": 5e-06,
457
- "loss": 0.6079,
458
  "step": 630
459
  },
460
  {
461
  "epoch": 1.5430982519590115,
462
- "grad_norm": 0.5340411087467901,
463
  "learning_rate": 5e-06,
464
- "loss": 0.5966,
465
  "step": 640
466
  },
467
  {
468
  "epoch": 1.567209162145871,
469
- "grad_norm": 0.6320951914134804,
470
  "learning_rate": 5e-06,
471
- "loss": 0.6119,
472
  "step": 650
473
  },
474
  {
475
  "epoch": 1.5913200723327305,
476
- "grad_norm": 0.5402636477743581,
477
  "learning_rate": 5e-06,
478
- "loss": 0.6018,
479
  "step": 660
480
  },
481
  {
482
  "epoch": 1.61543098251959,
483
- "grad_norm": 0.6023321834042192,
484
  "learning_rate": 5e-06,
485
- "loss": 0.6023,
486
  "step": 670
487
  },
488
  {
489
  "epoch": 1.6395418927064496,
490
- "grad_norm": 0.49282224066247415,
491
  "learning_rate": 5e-06,
492
- "loss": 0.6002,
493
  "step": 680
494
  },
495
  {
496
  "epoch": 1.663652802893309,
497
- "grad_norm": 0.6838051107799483,
498
  "learning_rate": 5e-06,
499
- "loss": 0.601,
500
  "step": 690
501
  },
502
  {
503
  "epoch": 1.6877637130801688,
504
- "grad_norm": 0.4809683173497573,
505
  "learning_rate": 5e-06,
506
- "loss": 0.6012,
507
  "step": 700
508
  },
509
  {
510
  "epoch": 1.7118746232670283,
511
- "grad_norm": 0.5130004764470846,
512
  "learning_rate": 5e-06,
513
- "loss": 0.6019,
514
  "step": 710
515
  },
516
  {
517
  "epoch": 1.7359855334538878,
518
- "grad_norm": 0.5222089493788711,
519
  "learning_rate": 5e-06,
520
- "loss": 0.6029,
521
  "step": 720
522
  },
523
  {
524
  "epoch": 1.7600964436407476,
525
- "grad_norm": 0.5537154673186192,
526
  "learning_rate": 5e-06,
527
- "loss": 0.6039,
528
  "step": 730
529
  },
530
  {
531
  "epoch": 1.784207353827607,
532
- "grad_norm": 0.5081950888314039,
533
  "learning_rate": 5e-06,
534
- "loss": 0.5973,
535
  "step": 740
536
  },
537
  {
538
  "epoch": 1.8083182640144666,
539
- "grad_norm": 0.5806567422134803,
540
  "learning_rate": 5e-06,
541
- "loss": 0.6072,
542
  "step": 750
543
  },
544
  {
545
  "epoch": 1.8324291742013261,
546
- "grad_norm": 0.5192410257029635,
547
  "learning_rate": 5e-06,
548
- "loss": 0.6026,
549
  "step": 760
550
  },
551
  {
552
  "epoch": 1.8565400843881856,
553
- "grad_norm": 0.5487344170749389,
554
  "learning_rate": 5e-06,
555
- "loss": 0.6009,
556
  "step": 770
557
  },
558
  {
559
  "epoch": 1.8806509945750451,
560
- "grad_norm": 0.5324805374861366,
561
  "learning_rate": 5e-06,
562
- "loss": 0.5994,
563
  "step": 780
564
  },
565
  {
566
  "epoch": 1.9047619047619047,
567
- "grad_norm": 0.6058321884008855,
568
  "learning_rate": 5e-06,
569
- "loss": 0.6025,
570
  "step": 790
571
  },
572
  {
573
  "epoch": 1.9288728149487642,
574
- "grad_norm": 0.57365525151735,
575
  "learning_rate": 5e-06,
576
- "loss": 0.6026,
577
  "step": 800
578
  },
579
  {
580
  "epoch": 1.952983725135624,
581
- "grad_norm": 0.5436955562661013,
582
  "learning_rate": 5e-06,
583
- "loss": 0.5953,
584
  "step": 810
585
  },
586
  {
587
  "epoch": 1.9770946353224834,
588
- "grad_norm": 0.6042343773815075,
589
  "learning_rate": 5e-06,
590
- "loss": 0.6,
591
  "step": 820
592
  },
593
  {
594
  "epoch": 1.998794454490657,
595
- "eval_loss": 0.6393378973007202,
596
- "eval_runtime": 225.3367,
597
- "eval_samples_per_second": 49.597,
598
- "eval_steps_per_second": 0.391,
599
  "step": 829
600
  },
601
  {
602
  "epoch": 2.001205545509343,
603
- "grad_norm": 0.9670296692720087,
604
  "learning_rate": 5e-06,
605
- "loss": 0.6016,
606
  "step": 830
607
  },
608
  {
609
  "epoch": 2.0253164556962027,
610
- "grad_norm": 0.6837527713124405,
611
  "learning_rate": 5e-06,
612
- "loss": 0.5631,
613
  "step": 840
614
  },
615
  {
616
  "epoch": 2.049427365883062,
617
- "grad_norm": 0.5935688974373606,
618
  "learning_rate": 5e-06,
619
- "loss": 0.5531,
620
  "step": 850
621
  },
622
  {
623
  "epoch": 2.0735382760699217,
624
- "grad_norm": 0.5980530217682797,
625
  "learning_rate": 5e-06,
626
- "loss": 0.5554,
627
  "step": 860
628
  },
629
  {
630
  "epoch": 2.097649186256781,
631
- "grad_norm": 0.5752374885434699,
632
  "learning_rate": 5e-06,
633
- "loss": 0.5557,
634
  "step": 870
635
  },
636
  {
637
  "epoch": 2.1217600964436407,
638
- "grad_norm": 0.5042143345935887,
639
  "learning_rate": 5e-06,
640
- "loss": 0.5522,
641
  "step": 880
642
  },
643
  {
644
  "epoch": 2.1458710066305002,
645
- "grad_norm": 0.5980920545311946,
646
  "learning_rate": 5e-06,
647
- "loss": 0.553,
648
  "step": 890
649
  },
650
  {
651
  "epoch": 2.1699819168173597,
652
- "grad_norm": 0.5290062022586566,
653
  "learning_rate": 5e-06,
654
- "loss": 0.5541,
655
  "step": 900
656
  },
657
  {
658
  "epoch": 2.1940928270042193,
659
- "grad_norm": 0.6029389321066391,
660
  "learning_rate": 5e-06,
661
- "loss": 0.5527,
662
  "step": 910
663
  },
664
  {
665
  "epoch": 2.2182037371910788,
666
- "grad_norm": 0.5761620842575014,
667
  "learning_rate": 5e-06,
668
- "loss": 0.561,
669
  "step": 920
670
  },
671
  {
672
  "epoch": 2.2423146473779383,
673
- "grad_norm": 0.5382086109948551,
674
  "learning_rate": 5e-06,
675
- "loss": 0.5528,
676
  "step": 930
677
  },
678
  {
679
  "epoch": 2.2664255575647982,
680
- "grad_norm": 0.5536204411197307,
681
  "learning_rate": 5e-06,
682
- "loss": 0.5552,
683
  "step": 940
684
  },
685
  {
686
  "epoch": 2.2905364677516578,
687
- "grad_norm": 0.7414422036930762,
688
  "learning_rate": 5e-06,
689
- "loss": 0.557,
690
  "step": 950
691
  },
692
  {
693
  "epoch": 2.3146473779385173,
694
- "grad_norm": 0.6072913873182035,
695
  "learning_rate": 5e-06,
696
- "loss": 0.5573,
697
  "step": 960
698
  },
699
  {
700
  "epoch": 2.338758288125377,
701
- "grad_norm": 0.5786725716853928,
702
  "learning_rate": 5e-06,
703
- "loss": 0.5577,
704
  "step": 970
705
  },
706
  {
707
  "epoch": 2.3628691983122363,
708
- "grad_norm": 0.5958758621711483,
709
  "learning_rate": 5e-06,
710
- "loss": 0.5573,
711
  "step": 980
712
  },
713
  {
714
  "epoch": 2.386980108499096,
715
- "grad_norm": 0.5427800525323759,
716
  "learning_rate": 5e-06,
717
- "loss": 0.5605,
718
  "step": 990
719
  },
720
  {
721
  "epoch": 2.4110910186859553,
722
- "grad_norm": 0.5008520202035274,
723
  "learning_rate": 5e-06,
724
- "loss": 0.55,
725
  "step": 1000
726
  },
727
  {
728
  "epoch": 2.435201928872815,
729
- "grad_norm": 0.5438627458062395,
730
  "learning_rate": 5e-06,
731
- "loss": 0.5591,
732
  "step": 1010
733
  },
734
  {
735
  "epoch": 2.4593128390596743,
736
- "grad_norm": 0.523458598668171,
737
  "learning_rate": 5e-06,
738
- "loss": 0.5523,
739
  "step": 1020
740
  },
741
  {
742
  "epoch": 2.483423749246534,
743
- "grad_norm": 0.562845339140823,
744
  "learning_rate": 5e-06,
745
- "loss": 0.5513,
746
  "step": 1030
747
  },
748
  {
749
  "epoch": 2.507534659433394,
750
- "grad_norm": 0.71192454951128,
751
  "learning_rate": 5e-06,
752
- "loss": 0.5617,
753
  "step": 1040
754
  },
755
  {
756
  "epoch": 2.5316455696202533,
757
- "grad_norm": 0.5488684911452221,
758
  "learning_rate": 5e-06,
759
- "loss": 0.5594,
760
  "step": 1050
761
  },
762
  {
763
  "epoch": 2.555756479807113,
764
- "grad_norm": 0.6322721667592042,
765
  "learning_rate": 5e-06,
766
- "loss": 0.5603,
767
  "step": 1060
768
  },
769
  {
770
  "epoch": 2.5798673899939724,
771
- "grad_norm": 0.5208011078844106,
772
  "learning_rate": 5e-06,
773
- "loss": 0.5564,
774
  "step": 1070
775
  },
776
  {
777
  "epoch": 2.603978300180832,
778
- "grad_norm": 0.5150689754075237,
779
  "learning_rate": 5e-06,
780
- "loss": 0.5624,
781
  "step": 1080
782
  },
783
  {
784
  "epoch": 2.6280892103676914,
785
- "grad_norm": 0.5338754237375813,
786
  "learning_rate": 5e-06,
787
- "loss": 0.5628,
788
  "step": 1090
789
  },
790
  {
791
  "epoch": 2.652200120554551,
792
- "grad_norm": 0.5072044155960452,
793
  "learning_rate": 5e-06,
794
- "loss": 0.5606,
795
  "step": 1100
796
  },
797
  {
798
  "epoch": 2.6763110307414104,
799
- "grad_norm": 0.7238515722776927,
800
  "learning_rate": 5e-06,
801
- "loss": 0.5557,
802
  "step": 1110
803
  },
804
  {
805
  "epoch": 2.70042194092827,
806
- "grad_norm": 0.5147434745712806,
807
  "learning_rate": 5e-06,
808
- "loss": 0.553,
809
  "step": 1120
810
  },
811
  {
812
  "epoch": 2.7245328511151294,
813
- "grad_norm": 0.5564967074947503,
814
  "learning_rate": 5e-06,
815
- "loss": 0.5635,
816
  "step": 1130
817
  },
818
  {
819
  "epoch": 2.748643761301989,
820
- "grad_norm": 0.5501220049253929,
821
  "learning_rate": 5e-06,
822
- "loss": 0.5583,
823
  "step": 1140
824
  },
825
  {
826
  "epoch": 2.7727546714888485,
827
- "grad_norm": 0.5103459117518057,
828
  "learning_rate": 5e-06,
829
- "loss": 0.5597,
830
  "step": 1150
831
  },
832
  {
833
  "epoch": 2.7968655816757084,
834
- "grad_norm": 0.5479118611862815,
835
  "learning_rate": 5e-06,
836
- "loss": 0.5579,
837
  "step": 1160
838
  },
839
  {
840
  "epoch": 2.820976491862568,
841
- "grad_norm": 0.5471001762934908,
842
  "learning_rate": 5e-06,
843
- "loss": 0.5591,
844
  "step": 1170
845
  },
846
  {
847
  "epoch": 2.8450874020494274,
848
- "grad_norm": 0.6232136492982399,
849
  "learning_rate": 5e-06,
850
- "loss": 0.5606,
851
  "step": 1180
852
  },
853
  {
854
  "epoch": 2.869198312236287,
855
- "grad_norm": 0.5669388319949817,
856
  "learning_rate": 5e-06,
857
- "loss": 0.5649,
858
  "step": 1190
859
  },
860
  {
861
  "epoch": 2.8933092224231465,
862
- "grad_norm": 0.6969387028585086,
863
  "learning_rate": 5e-06,
864
- "loss": 0.5651,
865
  "step": 1200
866
  },
867
  {
868
  "epoch": 2.917420132610006,
869
- "grad_norm": 0.6374387529410114,
870
  "learning_rate": 5e-06,
871
- "loss": 0.56,
872
  "step": 1210
873
  },
874
  {
875
  "epoch": 2.9415310427968655,
876
- "grad_norm": 0.560816628841587,
877
  "learning_rate": 5e-06,
878
- "loss": 0.5594,
879
  "step": 1220
880
  },
881
  {
882
  "epoch": 2.965641952983725,
883
- "grad_norm": 0.6033572013760955,
884
  "learning_rate": 5e-06,
885
- "loss": 0.5604,
886
  "step": 1230
887
  },
888
  {
889
  "epoch": 2.9897528631705845,
890
- "grad_norm": 0.5557325437050415,
891
  "learning_rate": 5e-06,
892
- "loss": 0.5631,
893
  "step": 1240
894
  },
895
  {
896
  "epoch": 2.9945750452079567,
897
- "eval_loss": 0.6422178745269775,
898
- "eval_runtime": 225.1456,
899
- "eval_samples_per_second": 49.639,
900
- "eval_steps_per_second": 0.391,
901
  "step": 1242
902
  },
903
  {
904
  "epoch": 2.9945750452079567,
905
  "step": 1242,
906
  "total_flos": 2079977499525120.0,
907
- "train_loss": 0.6134321775029439,
908
- "train_runtime": 37323.5248,
909
- "train_samples_per_second": 17.067,
910
  "train_steps_per_second": 0.033
911
  }
912
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.024110910186859555,
13
+ "grad_norm": 6.533456606849755,
14
  "learning_rate": 5e-06,
15
+ "loss": 0.8842,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.04822182037371911,
20
+ "grad_norm": 1.7431134609583179,
21
  "learning_rate": 5e-06,
22
+ "loss": 0.7716,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.07233273056057866,
27
+ "grad_norm": 1.3259257434449123,
28
  "learning_rate": 5e-06,
29
+ "loss": 0.7528,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.09644364074743822,
34
+ "grad_norm": 0.7933729448787181,
35
  "learning_rate": 5e-06,
36
+ "loss": 0.7272,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.12055455093429777,
41
+ "grad_norm": 0.9489596846030721,
42
  "learning_rate": 5e-06,
43
+ "loss": 0.7249,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.14466546112115733,
48
+ "grad_norm": 0.9021467855462361,
49
  "learning_rate": 5e-06,
50
+ "loss": 0.7048,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.16877637130801687,
55
+ "grad_norm": 0.9161730770097936,
56
  "learning_rate": 5e-06,
57
+ "loss": 0.7005,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.19288728149487644,
62
+ "grad_norm": 0.7938637800361754,
63
  "learning_rate": 5e-06,
64
+ "loss": 0.6932,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.21699819168173598,
69
+ "grad_norm": 0.5753057373815283,
70
  "learning_rate": 5e-06,
71
+ "loss": 0.6974,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.24110910186859555,
76
+ "grad_norm": 0.9667031835337068,
77
  "learning_rate": 5e-06,
78
+ "loss": 0.6757,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.2652200120554551,
83
+ "grad_norm": 0.5028322572120897,
84
  "learning_rate": 5e-06,
85
+ "loss": 0.6834,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.28933092224231466,
90
+ "grad_norm": 0.6797693061745307,
91
  "learning_rate": 5e-06,
92
+ "loss": 0.674,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.3134418324291742,
97
+ "grad_norm": 1.1680360190006298,
98
  "learning_rate": 5e-06,
99
+ "loss": 0.6722,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.33755274261603374,
104
+ "grad_norm": 0.544648561957048,
105
  "learning_rate": 5e-06,
106
+ "loss": 0.677,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.3616636528028933,
111
+ "grad_norm": 0.7257586557706087,
112
  "learning_rate": 5e-06,
113
+ "loss": 0.6747,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.3857745629897529,
118
+ "grad_norm": 0.4119617826643094,
119
  "learning_rate": 5e-06,
120
+ "loss": 0.6693,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.4098854731766124,
125
+ "grad_norm": 0.5605900141967505,
126
  "learning_rate": 5e-06,
127
+ "loss": 0.6688,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.43399638336347196,
132
+ "grad_norm": 0.6385688604944322,
133
  "learning_rate": 5e-06,
134
+ "loss": 0.6643,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.45810729355033153,
139
+ "grad_norm": 1.0125290764918353,
140
  "learning_rate": 5e-06,
141
+ "loss": 0.6641,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.4822182037371911,
146
+ "grad_norm": 0.6186627046406172,
147
  "learning_rate": 5e-06,
148
+ "loss": 0.658,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.5063291139240507,
153
+ "grad_norm": 0.8414739576765752,
154
  "learning_rate": 5e-06,
155
+ "loss": 0.6671,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.5304400241109102,
160
+ "grad_norm": 0.6025383396507406,
161
  "learning_rate": 5e-06,
162
+ "loss": 0.6651,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.5545509342977697,
167
+ "grad_norm": 0.5410823595468066,
168
  "learning_rate": 5e-06,
169
+ "loss": 0.6554,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.5786618444846293,
174
+ "grad_norm": 0.6181513401688427,
175
  "learning_rate": 5e-06,
176
+ "loss": 0.6641,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.6027727546714888,
181
+ "grad_norm": 0.4896208841371711,
182
  "learning_rate": 5e-06,
183
+ "loss": 0.657,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.6268836648583485,
188
+ "grad_norm": 0.544546111477725,
189
  "learning_rate": 5e-06,
190
+ "loss": 0.6638,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.650994575045208,
195
+ "grad_norm": 0.5356265326461168,
196
  "learning_rate": 5e-06,
197
+ "loss": 0.6602,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.6751054852320675,
202
+ "grad_norm": 0.5475932069244179,
203
  "learning_rate": 5e-06,
204
+ "loss": 0.6538,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.6992163954189271,
209
+ "grad_norm": 0.8345098978281534,
210
  "learning_rate": 5e-06,
211
+ "loss": 0.6523,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.7233273056057866,
216
+ "grad_norm": 0.8160477568039888,
217
  "learning_rate": 5e-06,
218
+ "loss": 0.6604,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.7474382157926461,
223
+ "grad_norm": 0.5563594159462366,
224
  "learning_rate": 5e-06,
225
+ "loss": 0.6581,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.7715491259795058,
230
+ "grad_norm": 0.6104670026137493,
231
  "learning_rate": 5e-06,
232
+ "loss": 0.6543,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.7956600361663653,
237
+ "grad_norm": 0.5818143425968119,
238
  "learning_rate": 5e-06,
239
+ "loss": 0.6508,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.8197709463532248,
244
+ "grad_norm": 0.39317653113678785,
245
  "learning_rate": 5e-06,
246
+ "loss": 0.6459,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.8438818565400844,
251
+ "grad_norm": 0.4869964807571895,
252
  "learning_rate": 5e-06,
253
+ "loss": 0.6476,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.8679927667269439,
258
+ "grad_norm": 0.9839633535279524,
259
  "learning_rate": 5e-06,
260
+ "loss": 0.6517,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.8921036769138035,
265
+ "grad_norm": 0.4947132075136725,
266
  "learning_rate": 5e-06,
267
+ "loss": 0.6554,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.9162145871006631,
272
+ "grad_norm": 0.42196728270115014,
273
  "learning_rate": 5e-06,
274
+ "loss": 0.647,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.9403254972875226,
279
+ "grad_norm": 0.7036293961206416,
280
  "learning_rate": 5e-06,
281
+ "loss": 0.6437,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.9644364074743822,
286
+ "grad_norm": 0.4303638291795801,
287
  "learning_rate": 5e-06,
288
+ "loss": 0.647,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.9885473176612417,
293
+ "grad_norm": 0.4993150805880552,
294
  "learning_rate": 5e-06,
295
+ "loss": 0.6408,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 0.9981916817359855,
300
+ "eval_loss": 0.6481794714927673,
301
+ "eval_runtime": 221.8793,
302
+ "eval_samples_per_second": 50.37,
303
+ "eval_steps_per_second": 0.397,
304
  "step": 414
305
  },
306
  {
307
  "epoch": 1.0126582278481013,
308
+ "grad_norm": 0.5861902646980864,
309
  "learning_rate": 5e-06,
310
+ "loss": 0.6219,
311
  "step": 420
312
  },
313
  {
314
  "epoch": 1.0367691380349608,
315
+ "grad_norm": 0.4608104690581376,
316
  "learning_rate": 5e-06,
317
+ "loss": 0.6109,
318
  "step": 430
319
  },
320
  {
321
  "epoch": 1.0608800482218204,
322
+ "grad_norm": 0.7019806195266277,
323
  "learning_rate": 5e-06,
324
+ "loss": 0.5999,
325
  "step": 440
326
  },
327
  {
328
  "epoch": 1.0849909584086799,
329
+ "grad_norm": 0.4666118598227287,
330
  "learning_rate": 5e-06,
331
+ "loss": 0.6071,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 1.1091018685955394,
336
+ "grad_norm": 0.49273088471001014,
337
  "learning_rate": 5e-06,
338
+ "loss": 0.6079,
339
  "step": 460
340
  },
341
  {
342
  "epoch": 1.1332127787823991,
343
+ "grad_norm": 0.5608412041594104,
344
  "learning_rate": 5e-06,
345
+ "loss": 0.6093,
346
  "step": 470
347
  },
348
  {
349
  "epoch": 1.1573236889692586,
350
+ "grad_norm": 0.5133766270512516,
351
  "learning_rate": 5e-06,
352
+ "loss": 0.6023,
353
  "step": 480
354
  },
355
  {
356
  "epoch": 1.1814345991561181,
357
+ "grad_norm": 0.4639503656965253,
358
  "learning_rate": 5e-06,
359
+ "loss": 0.6067,
360
  "step": 490
361
  },
362
  {
363
  "epoch": 1.2055455093429777,
364
+ "grad_norm": 0.4941484591532595,
365
  "learning_rate": 5e-06,
366
+ "loss": 0.6034,
367
  "step": 500
368
  },
369
  {
370
  "epoch": 1.2296564195298372,
371
+ "grad_norm": 0.532046568060987,
372
  "learning_rate": 5e-06,
373
+ "loss": 0.6021,
374
  "step": 510
375
  },
376
  {
377
  "epoch": 1.253767329716697,
378
+ "grad_norm": 0.6313451414543506,
379
  "learning_rate": 5e-06,
380
+ "loss": 0.6072,
381
  "step": 520
382
  },
383
  {
384
  "epoch": 1.2778782399035564,
385
+ "grad_norm": 0.48840150221258893,
386
  "learning_rate": 5e-06,
387
+ "loss": 0.6008,
388
  "step": 530
389
  },
390
  {
391
  "epoch": 1.301989150090416,
392
+ "grad_norm": 0.4346073819877919,
393
  "learning_rate": 5e-06,
394
+ "loss": 0.6084,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 1.3261000602772754,
399
+ "grad_norm": 0.5696969325867375,
400
  "learning_rate": 5e-06,
401
+ "loss": 0.6073,
402
  "step": 550
403
  },
404
  {
405
  "epoch": 1.350210970464135,
406
+ "grad_norm": 0.6029521082479712,
407
  "learning_rate": 5e-06,
408
+ "loss": 0.6045,
409
  "step": 560
410
  },
411
  {
412
  "epoch": 1.3743218806509945,
413
+ "grad_norm": 0.5359000000861764,
414
  "learning_rate": 5e-06,
415
+ "loss": 0.601,
416
  "step": 570
417
  },
418
  {
419
  "epoch": 1.3984327908378542,
420
+ "grad_norm": 0.4280776424781654,
421
  "learning_rate": 5e-06,
422
+ "loss": 0.6068,
423
  "step": 580
424
  },
425
  {
426
  "epoch": 1.4225437010247137,
427
+ "grad_norm": 0.42975173635641,
428
  "learning_rate": 5e-06,
429
+ "loss": 0.6058,
430
  "step": 590
431
  },
432
  {
433
  "epoch": 1.4466546112115732,
434
+ "grad_norm": 0.4148935722421534,
435
  "learning_rate": 5e-06,
436
+ "loss": 0.6084,
437
  "step": 600
438
  },
439
  {
440
  "epoch": 1.4707655213984328,
441
+ "grad_norm": 0.4346895040838288,
442
  "learning_rate": 5e-06,
443
+ "loss": 0.6083,
444
  "step": 610
445
  },
446
  {
447
  "epoch": 1.4948764315852923,
448
+ "grad_norm": 0.456872099031643,
449
  "learning_rate": 5e-06,
450
+ "loss": 0.6101,
451
  "step": 620
452
  },
453
  {
454
  "epoch": 1.518987341772152,
455
+ "grad_norm": 0.518636393965265,
456
  "learning_rate": 5e-06,
457
+ "loss": 0.6107,
458
  "step": 630
459
  },
460
  {
461
  "epoch": 1.5430982519590115,
462
+ "grad_norm": 0.4976317739138397,
463
  "learning_rate": 5e-06,
464
+ "loss": 0.5995,
465
  "step": 640
466
  },
467
  {
468
  "epoch": 1.567209162145871,
469
+ "grad_norm": 0.5121056663367101,
470
  "learning_rate": 5e-06,
471
+ "loss": 0.6147,
472
  "step": 650
473
  },
474
  {
475
  "epoch": 1.5913200723327305,
476
+ "grad_norm": 0.49181051844188867,
477
  "learning_rate": 5e-06,
478
+ "loss": 0.6046,
479
  "step": 660
480
  },
481
  {
482
  "epoch": 1.61543098251959,
483
+ "grad_norm": 0.4913489094366748,
484
  "learning_rate": 5e-06,
485
+ "loss": 0.605,
486
  "step": 670
487
  },
488
  {
489
  "epoch": 1.6395418927064496,
490
+ "grad_norm": 0.4360413141924259,
491
  "learning_rate": 5e-06,
492
+ "loss": 0.603,
493
  "step": 680
494
  },
495
  {
496
  "epoch": 1.663652802893309,
497
+ "grad_norm": 0.5553873036504335,
498
  "learning_rate": 5e-06,
499
+ "loss": 0.6037,
500
  "step": 690
501
  },
502
  {
503
  "epoch": 1.6877637130801688,
504
+ "grad_norm": 0.439159626571011,
505
  "learning_rate": 5e-06,
506
+ "loss": 0.6037,
507
  "step": 700
508
  },
509
  {
510
  "epoch": 1.7118746232670283,
511
+ "grad_norm": 0.5009323338564864,
512
  "learning_rate": 5e-06,
513
+ "loss": 0.6046,
514
  "step": 710
515
  },
516
  {
517
  "epoch": 1.7359855334538878,
518
+ "grad_norm": 0.49820787215486934,
519
  "learning_rate": 5e-06,
520
+ "loss": 0.6057,
521
  "step": 720
522
  },
523
  {
524
  "epoch": 1.7600964436407476,
525
+ "grad_norm": 0.553637472752945,
526
  "learning_rate": 5e-06,
527
+ "loss": 0.6066,
528
  "step": 730
529
  },
530
  {
531
  "epoch": 1.784207353827607,
532
+ "grad_norm": 0.44541140483577896,
533
  "learning_rate": 5e-06,
534
+ "loss": 0.6,
535
  "step": 740
536
  },
537
  {
538
  "epoch": 1.8083182640144666,
539
+ "grad_norm": 0.5310706794248644,
540
  "learning_rate": 5e-06,
541
+ "loss": 0.6098,
542
  "step": 750
543
  },
544
  {
545
  "epoch": 1.8324291742013261,
546
+ "grad_norm": 0.6630764624549126,
547
  "learning_rate": 5e-06,
548
+ "loss": 0.6054,
549
  "step": 760
550
  },
551
  {
552
  "epoch": 1.8565400843881856,
553
+ "grad_norm": 0.553711920694149,
554
  "learning_rate": 5e-06,
555
+ "loss": 0.6037,
556
  "step": 770
557
  },
558
  {
559
  "epoch": 1.8806509945750451,
560
+ "grad_norm": 0.566305473833487,
561
  "learning_rate": 5e-06,
562
+ "loss": 0.6019,
563
  "step": 780
564
  },
565
  {
566
  "epoch": 1.9047619047619047,
567
+ "grad_norm": 0.582333160680419,
568
  "learning_rate": 5e-06,
569
+ "loss": 0.6051,
570
  "step": 790
571
  },
572
  {
573
  "epoch": 1.9288728149487642,
574
+ "grad_norm": 0.509141986707748,
575
  "learning_rate": 5e-06,
576
+ "loss": 0.6052,
577
  "step": 800
578
  },
579
  {
580
  "epoch": 1.952983725135624,
581
+ "grad_norm": 0.4543923308424651,
582
  "learning_rate": 5e-06,
583
+ "loss": 0.598,
584
  "step": 810
585
  },
586
  {
587
  "epoch": 1.9770946353224834,
588
+ "grad_norm": 0.45958164108182104,
589
  "learning_rate": 5e-06,
590
+ "loss": 0.6026,
591
  "step": 820
592
  },
593
  {
594
  "epoch": 1.998794454490657,
595
+ "eval_loss": 0.6378007531166077,
596
+ "eval_runtime": 222.8588,
597
+ "eval_samples_per_second": 50.148,
598
+ "eval_steps_per_second": 0.395,
599
  "step": 829
600
  },
601
  {
602
  "epoch": 2.001205545509343,
603
+ "grad_norm": 0.7867156441363433,
604
  "learning_rate": 5e-06,
605
+ "loss": 0.6043,
606
  "step": 830
607
  },
608
  {
609
  "epoch": 2.0253164556962027,
610
+ "grad_norm": 0.5284619907031279,
611
  "learning_rate": 5e-06,
612
+ "loss": 0.5707,
613
  "step": 840
614
  },
615
  {
616
  "epoch": 2.049427365883062,
617
+ "grad_norm": 0.5795012320295118,
618
  "learning_rate": 5e-06,
619
+ "loss": 0.5606,
620
  "step": 850
621
  },
622
  {
623
  "epoch": 2.0735382760699217,
624
+ "grad_norm": 0.5627294692682645,
625
  "learning_rate": 5e-06,
626
+ "loss": 0.563,
627
  "step": 860
628
  },
629
  {
630
  "epoch": 2.097649186256781,
631
+ "grad_norm": 0.5052179539566712,
632
  "learning_rate": 5e-06,
633
+ "loss": 0.5631,
634
  "step": 870
635
  },
636
  {
637
  "epoch": 2.1217600964436407,
638
+ "grad_norm": 0.4428407773542258,
639
  "learning_rate": 5e-06,
640
+ "loss": 0.5595,
641
  "step": 880
642
  },
643
  {
644
  "epoch": 2.1458710066305002,
645
+ "grad_norm": 0.5267499633401915,
646
  "learning_rate": 5e-06,
647
+ "loss": 0.5601,
648
  "step": 890
649
  },
650
  {
651
  "epoch": 2.1699819168173597,
652
+ "grad_norm": 0.4655374512529405,
653
  "learning_rate": 5e-06,
654
+ "loss": 0.5607,
655
  "step": 900
656
  },
657
  {
658
  "epoch": 2.1940928270042193,
659
+ "grad_norm": 0.48398838396056276,
660
  "learning_rate": 5e-06,
661
+ "loss": 0.5595,
662
  "step": 910
663
  },
664
  {
665
  "epoch": 2.2182037371910788,
666
+ "grad_norm": 0.48096941817619093,
667
  "learning_rate": 5e-06,
668
+ "loss": 0.5677,
669
  "step": 920
670
  },
671
  {
672
  "epoch": 2.2423146473779383,
673
+ "grad_norm": 0.5154141010470734,
674
  "learning_rate": 5e-06,
675
+ "loss": 0.5594,
676
  "step": 930
677
  },
678
  {
679
  "epoch": 2.2664255575647982,
680
+ "grad_norm": 0.4799488446079912,
681
  "learning_rate": 5e-06,
682
+ "loss": 0.5621,
683
  "step": 940
684
  },
685
  {
686
  "epoch": 2.2905364677516578,
687
+ "grad_norm": 0.5540016498502853,
688
  "learning_rate": 5e-06,
689
+ "loss": 0.5638,
690
  "step": 950
691
  },
692
  {
693
  "epoch": 2.3146473779385173,
694
+ "grad_norm": 0.6082357481189948,
695
  "learning_rate": 5e-06,
696
+ "loss": 0.564,
697
  "step": 960
698
  },
699
  {
700
  "epoch": 2.338758288125377,
701
+ "grad_norm": 0.5420853183530063,
702
  "learning_rate": 5e-06,
703
+ "loss": 0.5643,
704
  "step": 970
705
  },
706
  {
707
  "epoch": 2.3628691983122363,
708
+ "grad_norm": 0.42570128293415416,
709
  "learning_rate": 5e-06,
710
+ "loss": 0.5642,
711
  "step": 980
712
  },
713
  {
714
  "epoch": 2.386980108499096,
715
+ "grad_norm": 0.5255517048498499,
716
  "learning_rate": 5e-06,
717
+ "loss": 0.5672,
718
  "step": 990
719
  },
720
  {
721
  "epoch": 2.4110910186859553,
722
+ "grad_norm": 0.5353694927594205,
723
  "learning_rate": 5e-06,
724
+ "loss": 0.5565,
725
  "step": 1000
726
  },
727
  {
728
  "epoch": 2.435201928872815,
729
+ "grad_norm": 0.4617633168683323,
730
  "learning_rate": 5e-06,
731
+ "loss": 0.5657,
732
  "step": 1010
733
  },
734
  {
735
  "epoch": 2.4593128390596743,
736
+ "grad_norm": 0.449869806649973,
737
  "learning_rate": 5e-06,
738
+ "loss": 0.5586,
739
  "step": 1020
740
  },
741
  {
742
  "epoch": 2.483423749246534,
743
+ "grad_norm": 0.5115337318725849,
744
  "learning_rate": 5e-06,
745
+ "loss": 0.5576,
746
  "step": 1030
747
  },
748
  {
749
  "epoch": 2.507534659433394,
750
+ "grad_norm": 0.6907411145245406,
751
  "learning_rate": 5e-06,
752
+ "loss": 0.5681,
753
  "step": 1040
754
  },
755
  {
756
  "epoch": 2.5316455696202533,
757
+ "grad_norm": 0.5238948140915647,
758
  "learning_rate": 5e-06,
759
+ "loss": 0.5659,
760
  "step": 1050
761
  },
762
  {
763
  "epoch": 2.555756479807113,
764
+ "grad_norm": 0.6589003211840228,
765
  "learning_rate": 5e-06,
766
+ "loss": 0.5667,
767
  "step": 1060
768
  },
769
  {
770
  "epoch": 2.5798673899939724,
771
+ "grad_norm": 0.4764556136945032,
772
  "learning_rate": 5e-06,
773
+ "loss": 0.5629,
774
  "step": 1070
775
  },
776
  {
777
  "epoch": 2.603978300180832,
778
+ "grad_norm": 0.44468254080490577,
779
  "learning_rate": 5e-06,
780
+ "loss": 0.5687,
781
  "step": 1080
782
  },
783
  {
784
  "epoch": 2.6280892103676914,
785
+ "grad_norm": 0.5124860341949249,
786
  "learning_rate": 5e-06,
787
+ "loss": 0.5692,
788
  "step": 1090
789
  },
790
  {
791
  "epoch": 2.652200120554551,
792
+ "grad_norm": 0.5228826110878407,
793
  "learning_rate": 5e-06,
794
+ "loss": 0.5667,
795
  "step": 1100
796
  },
797
  {
798
  "epoch": 2.6763110307414104,
799
+ "grad_norm": 0.5458373344595544,
800
  "learning_rate": 5e-06,
801
+ "loss": 0.5617,
802
  "step": 1110
803
  },
804
  {
805
  "epoch": 2.70042194092827,
806
+ "grad_norm": 0.43248189186264496,
807
  "learning_rate": 5e-06,
808
+ "loss": 0.5589,
809
  "step": 1120
810
  },
811
  {
812
  "epoch": 2.7245328511151294,
813
+ "grad_norm": 0.44951413853647815,
814
  "learning_rate": 5e-06,
815
+ "loss": 0.5696,
816
  "step": 1130
817
  },
818
  {
819
  "epoch": 2.748643761301989,
820
+ "grad_norm": 0.5059427152996532,
821
  "learning_rate": 5e-06,
822
+ "loss": 0.5645,
823
  "step": 1140
824
  },
825
  {
826
  "epoch": 2.7727546714888485,
827
+ "grad_norm": 0.4713166756254001,
828
  "learning_rate": 5e-06,
829
+ "loss": 0.5659,
830
  "step": 1150
831
  },
832
  {
833
  "epoch": 2.7968655816757084,
834
+ "grad_norm": 0.4662277376061737,
835
  "learning_rate": 5e-06,
836
+ "loss": 0.5638,
837
  "step": 1160
838
  },
839
  {
840
  "epoch": 2.820976491862568,
841
+ "grad_norm": 0.5055943494520574,
842
  "learning_rate": 5e-06,
843
+ "loss": 0.5651,
844
  "step": 1170
845
  },
846
  {
847
  "epoch": 2.8450874020494274,
848
+ "grad_norm": 0.49826856850045714,
849
  "learning_rate": 5e-06,
850
+ "loss": 0.5664,
851
  "step": 1180
852
  },
853
  {
854
  "epoch": 2.869198312236287,
855
+ "grad_norm": 0.46906591997365343,
856
  "learning_rate": 5e-06,
857
+ "loss": 0.5708,
858
  "step": 1190
859
  },
860
  {
861
  "epoch": 2.8933092224231465,
862
+ "grad_norm": 0.5743140790459712,
863
  "learning_rate": 5e-06,
864
+ "loss": 0.5713,
865
  "step": 1200
866
  },
867
  {
868
  "epoch": 2.917420132610006,
869
+ "grad_norm": 0.5413293244789124,
870
  "learning_rate": 5e-06,
871
+ "loss": 0.566,
872
  "step": 1210
873
  },
874
  {
875
  "epoch": 2.9415310427968655,
876
+ "grad_norm": 0.4769984493754597,
877
  "learning_rate": 5e-06,
878
+ "loss": 0.5653,
879
  "step": 1220
880
  },
881
  {
882
  "epoch": 2.965641952983725,
883
+ "grad_norm": 0.4784113431133355,
884
  "learning_rate": 5e-06,
885
+ "loss": 0.5663,
886
  "step": 1230
887
  },
888
  {
889
  "epoch": 2.9897528631705845,
890
+ "grad_norm": 0.46857130335535624,
891
  "learning_rate": 5e-06,
892
+ "loss": 0.569,
893
  "step": 1240
894
  },
895
  {
896
  "epoch": 2.9945750452079567,
897
+ "eval_loss": 0.6389562487602234,
898
+ "eval_runtime": 223.5725,
899
+ "eval_samples_per_second": 49.988,
900
+ "eval_steps_per_second": 0.394,
901
  "step": 1242
902
  },
903
  {
904
  "epoch": 2.9945750452079567,
905
  "step": 1242,
906
  "total_flos": 2079977499525120.0,
907
+ "train_loss": 0.615725677754376,
908
+ "train_runtime": 37262.9121,
909
+ "train_samples_per_second": 17.094,
910
  "train_steps_per_second": 0.033
911
  }
912
  ],
training_eval_loss.png CHANGED
training_loss.png CHANGED