kerpr commited on
Commit
8f1c71f
1 Parent(s): 95a3649

End of training

Browse files
Files changed (4) hide show
  1. all_results.json +10 -10
  2. eval_results.json +6 -6
  3. train_results.json +5 -5
  4. trainer_state.json +247 -1096
all_results.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
- "epoch": 10.0,
3
- "eval_f1": 0.0,
4
- "eval_loss": 0.34448280930519104,
5
- "eval_runtime": 20.531,
6
  "eval_samples": 1916,
7
- "eval_samples_per_second": 93.322,
8
- "eval_steps_per_second": 1.461,
9
- "train_loss": 0.3510875488058106,
10
- "train_runtime": 3949.5247,
11
  "train_samples": 5743,
12
- "train_samples_per_second": 14.541,
13
- "train_steps_per_second": 0.909
14
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "eval_f1": 0.19553868058851448,
4
+ "eval_loss": 5.8615827560424805,
5
+ "eval_runtime": 20.4376,
6
  "eval_samples": 1916,
7
+ "eval_samples_per_second": 93.749,
8
+ "eval_steps_per_second": 1.468,
9
+ "train_loss": 15.830373929303002,
10
+ "train_runtime": 1144.0282,
11
  "train_samples": 5743,
12
+ "train_samples_per_second": 15.06,
13
+ "train_steps_per_second": 0.472
14
  }
eval_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 10.0,
3
- "eval_f1": 0.0,
4
- "eval_loss": 0.34448280930519104,
5
- "eval_runtime": 20.531,
6
  "eval_samples": 1916,
7
- "eval_samples_per_second": 93.322,
8
- "eval_steps_per_second": 1.461
9
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "eval_f1": 0.19553868058851448,
4
+ "eval_loss": 5.8615827560424805,
5
+ "eval_runtime": 20.4376,
6
  "eval_samples": 1916,
7
+ "eval_samples_per_second": 93.749,
8
+ "eval_steps_per_second": 1.468
9
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 10.0,
3
- "train_loss": 0.3510875488058106,
4
- "train_runtime": 3949.5247,
5
  "train_samples": 5743,
6
- "train_samples_per_second": 14.541,
7
- "train_steps_per_second": 0.909
8
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "train_loss": 15.830373929303002,
4
+ "train_runtime": 1144.0282,
5
  "train_samples": 5743,
6
+ "train_samples_per_second": 15.06,
7
+ "train_steps_per_second": 0.472
8
  }
trainer_state.json CHANGED
@@ -1,1230 +1,381 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 10.0,
5
  "eval_steps": 500,
6
- "global_step": 3590,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
 
 
 
 
 
 
11
  {
12
  "epoch": 0.11,
13
- "learning_rate": 2e-05,
14
- "loss": 95.2205,
15
  "step": 20
16
  },
 
 
 
 
 
 
17
  {
18
  "epoch": 0.22,
19
- "learning_rate": 2e-05,
20
- "loss": 48.3233,
21
  "step": 40
22
  },
 
 
 
 
 
 
23
  {
24
  "epoch": 0.33,
25
- "learning_rate": 2e-05,
26
- "loss": 13.9954,
27
  "step": 60
28
  },
29
  {
30
- "epoch": 0.45,
31
- "learning_rate": 2e-05,
32
- "loss": 2.0884,
 
 
 
 
 
 
33
  "step": 80
34
  },
 
 
 
 
 
 
35
  {
36
  "epoch": 0.56,
37
- "learning_rate": 2e-05,
38
- "loss": 1.5549,
39
  "step": 100
40
  },
 
 
 
 
 
 
41
  {
42
  "epoch": 0.67,
43
- "learning_rate": 2e-05,
44
- "loss": 2.0528,
45
  "step": 120
46
  },
 
 
 
 
 
 
47
  {
48
  "epoch": 0.78,
49
- "learning_rate": 2e-05,
50
- "loss": 3.102,
51
  "step": 140
52
  },
 
 
 
 
 
 
53
  {
54
  "epoch": 0.89,
55
- "learning_rate": 2e-05,
56
- "loss": 1.2929,
57
  "step": 160
58
  },
59
  {
60
- "epoch": 1.0,
61
- "eval_f1": 0.1941564561734213,
62
- "eval_loss": 13.452169418334961,
63
- "eval_runtime": 23.1921,
64
- "eval_samples_per_second": 82.614,
65
- "eval_steps_per_second": 1.294,
66
- "step": 179
67
  },
68
  {
69
  "epoch": 1.0,
70
- "learning_rate": 2e-05,
71
- "loss": 0.5906,
72
  "step": 180
73
  },
74
  {
75
- "epoch": 1.11,
76
- "learning_rate": 2e-05,
77
- "loss": 0.3674,
78
- "step": 200
79
- },
80
- {
81
- "epoch": 1.23,
82
- "learning_rate": 2e-05,
83
- "loss": 0.5531,
84
- "step": 220
85
- },
86
- {
87
- "epoch": 1.34,
88
- "learning_rate": 2e-05,
89
- "loss": 0.2448,
90
- "step": 240
91
- },
92
- {
93
- "epoch": 1.45,
94
- "learning_rate": 2e-05,
95
- "loss": 0.1894,
96
- "step": 260
97
- },
98
- {
99
- "epoch": 1.56,
100
- "learning_rate": 2e-05,
101
- "loss": 0.2385,
102
- "step": 280
103
- },
104
- {
105
- "epoch": 1.67,
106
- "learning_rate": 2e-05,
107
- "loss": 0.2773,
108
- "step": 300
109
- },
110
- {
111
- "epoch": 1.78,
112
- "learning_rate": 2e-05,
113
- "loss": 0.1879,
114
- "step": 320
115
- },
116
- {
117
- "epoch": 1.89,
118
- "learning_rate": 2e-05,
119
- "loss": 0.1541,
120
- "step": 340
121
- },
122
- {
123
- "epoch": 2.0,
124
- "eval_f1": 0.1941564561734213,
125
- "eval_loss": 8.468379020690918,
126
- "eval_runtime": 22.9961,
127
- "eval_samples_per_second": 83.318,
128
- "eval_steps_per_second": 1.305,
129
- "step": 359
130
- },
131
- {
132
- "epoch": 2.01,
133
- "learning_rate": 2e-05,
134
- "loss": 0.1381,
135
- "step": 360
136
- },
137
- {
138
- "epoch": 2.12,
139
- "learning_rate": 2e-05,
140
- "loss": 0.1464,
141
- "step": 380
142
- },
143
- {
144
- "epoch": 2.23,
145
- "learning_rate": 2e-05,
146
- "loss": 0.2856,
147
- "step": 400
148
- },
149
- {
150
- "epoch": 2.34,
151
- "learning_rate": 2e-05,
152
- "loss": 0.3727,
153
- "step": 420
154
- },
155
- {
156
- "epoch": 2.45,
157
- "learning_rate": 2e-05,
158
- "loss": 0.2145,
159
- "step": 440
160
- },
161
- {
162
- "epoch": 2.56,
163
- "learning_rate": 2e-05,
164
- "loss": 0.0867,
165
- "step": 460
166
- },
167
- {
168
- "epoch": 2.67,
169
- "learning_rate": 2e-05,
170
- "loss": 0.232,
171
- "step": 480
172
- },
173
- {
174
- "epoch": 2.79,
175
- "learning_rate": 2e-05,
176
- "loss": 0.1407,
177
- "step": 500
178
- },
179
- {
180
- "epoch": 2.9,
181
- "learning_rate": 2e-05,
182
- "loss": 0.1257,
183
- "step": 520
184
- },
185
- {
186
- "epoch": 3.0,
187
  "eval_f1": 0.1941564561734213,
188
- "eval_loss": 7.637044906616211,
189
- "eval_runtime": 22.6323,
190
- "eval_samples_per_second": 84.658,
191
- "eval_steps_per_second": 1.326,
192
- "step": 538
193
- },
194
- {
195
- "epoch": 3.01,
196
- "learning_rate": 2e-05,
197
- "loss": 0.1963,
198
- "step": 540
199
  },
200
  {
201
- "epoch": 3.12,
202
- "learning_rate": 2e-05,
203
- "loss": 0.1485,
204
- "step": 560
205
  },
206
  {
207
- "epoch": 3.23,
208
- "learning_rate": 2e-05,
209
- "loss": 0.1373,
210
- "step": 580
211
  },
212
  {
213
- "epoch": 3.34,
214
- "learning_rate": 2e-05,
215
- "loss": 0.0971,
216
- "step": 600
217
  },
218
  {
219
- "epoch": 3.45,
220
- "learning_rate": 2e-05,
221
- "loss": 0.2036,
222
- "step": 620
223
  },
224
  {
225
- "epoch": 3.57,
226
- "learning_rate": 2e-05,
227
- "loss": 0.2205,
228
- "step": 640
229
  },
230
  {
231
- "epoch": 3.68,
232
- "learning_rate": 2e-05,
233
- "loss": 0.0547,
234
- "step": 660
235
  },
236
  {
237
- "epoch": 3.79,
238
- "learning_rate": 2e-05,
239
- "loss": 0.0957,
240
- "step": 680
241
  },
242
  {
243
- "epoch": 3.9,
244
- "learning_rate": 2e-05,
245
- "loss": 0.1684,
246
- "step": 700
247
  },
248
  {
249
- "epoch": 4.0,
250
- "eval_f1": 0.6376360808709176,
251
- "eval_loss": 0.7054294943809509,
252
- "eval_runtime": 22.968,
253
- "eval_samples_per_second": 83.42,
254
- "eval_steps_per_second": 1.306,
255
- "step": 718
256
  },
257
  {
258
- "epoch": 4.01,
259
- "learning_rate": 2e-05,
260
- "loss": 0.145,
261
- "step": 720
262
  },
263
  {
264
- "epoch": 4.12,
265
- "learning_rate": 2e-05,
266
- "loss": 0.1186,
267
- "step": 740
268
  },
269
  {
270
- "epoch": 4.23,
271
- "learning_rate": 2e-05,
272
- "loss": 0.0227,
273
- "step": 760
274
  },
275
  {
276
- "epoch": 4.35,
277
- "learning_rate": 2e-05,
278
- "loss": 0.0556,
279
- "step": 780
280
  },
281
  {
282
- "epoch": 4.46,
283
- "learning_rate": 2e-05,
284
- "loss": 0.141,
285
- "step": 800
286
  },
287
  {
288
- "epoch": 4.57,
289
- "learning_rate": 2e-05,
290
- "loss": 0.1328,
291
- "step": 820
292
  },
293
  {
294
- "epoch": 4.68,
295
- "learning_rate": 2e-05,
296
- "loss": 0.0992,
297
- "step": 840
298
  },
299
  {
300
- "epoch": 4.79,
301
- "learning_rate": 2e-05,
302
- "loss": 0.1691,
303
- "step": 860
304
  },
305
  {
306
- "epoch": 4.9,
307
- "learning_rate": 2e-05,
308
- "loss": 0.0911,
309
- "step": 880
310
  },
311
  {
312
- "epoch": 5.0,
313
  "eval_f1": 0.1941564561734213,
314
- "eval_loss": 5.119464874267578,
315
- "eval_runtime": 22.7812,
316
- "eval_samples_per_second": 84.104,
317
- "eval_steps_per_second": 1.317,
318
- "step": 897
319
- },
320
- {
321
- "epoch": 5.01,
322
- "learning_rate": 2e-05,
323
- "loss": 0.0993,
324
- "step": 900
325
- },
326
- {
327
- "epoch": 5.13,
328
- "learning_rate": 2e-05,
329
- "loss": 0.0972,
330
- "step": 920
331
- },
332
- {
333
- "epoch": 5.24,
334
- "learning_rate": 2e-05,
335
- "loss": 0.0389,
336
- "step": 940
337
- },
338
- {
339
- "epoch": 5.35,
340
- "learning_rate": 2e-05,
341
- "loss": 0.1366,
342
- "step": 960
343
- },
344
- {
345
- "epoch": 5.46,
346
- "learning_rate": 2e-05,
347
- "loss": 0.0833,
348
- "step": 980
349
- },
350
- {
351
- "epoch": 5.57,
352
- "learning_rate": 2e-05,
353
- "loss": 0.1634,
354
- "step": 1000
355
- },
356
- {
357
- "epoch": 5.68,
358
- "learning_rate": 2e-05,
359
- "loss": 0.0691,
360
- "step": 1020
361
- },
362
- {
363
- "epoch": 5.79,
364
- "learning_rate": 2e-05,
365
- "loss": 0.1487,
366
- "step": 1040
367
- },
368
- {
369
- "epoch": 5.91,
370
- "learning_rate": 2e-05,
371
- "loss": 0.145,
372
- "step": 1060
373
- },
374
- {
375
- "epoch": 6.0,
376
- "eval_f1": 0.7984031936127745,
377
- "eval_loss": 0.2693595290184021,
378
- "eval_runtime": 22.9118,
379
- "eval_samples_per_second": 83.625,
380
- "eval_steps_per_second": 1.309,
381
- "step": 1077
382
- },
383
- {
384
- "epoch": 6.02,
385
- "learning_rate": 2e-05,
386
- "loss": 0.0373,
387
- "step": 1080
388
- },
389
- {
390
- "epoch": 6.13,
391
- "learning_rate": 2e-05,
392
- "loss": 0.0409,
393
- "step": 1100
394
- },
395
- {
396
- "epoch": 6.24,
397
- "learning_rate": 2e-05,
398
- "loss": 0.0714,
399
- "step": 1120
400
- },
401
- {
402
- "epoch": 6.35,
403
- "learning_rate": 2e-05,
404
- "loss": 0.0915,
405
- "step": 1140
406
- },
407
- {
408
- "epoch": 6.46,
409
- "learning_rate": 2e-05,
410
- "loss": 0.1359,
411
- "step": 1160
412
- },
413
- {
414
- "epoch": 6.57,
415
- "learning_rate": 2e-05,
416
- "loss": 0.1016,
417
- "step": 1180
418
- },
419
- {
420
- "epoch": 6.69,
421
- "learning_rate": 2e-05,
422
- "loss": 0.0346,
423
- "step": 1200
424
- },
425
- {
426
- "epoch": 6.8,
427
- "learning_rate": 2e-05,
428
- "loss": 0.0437,
429
- "step": 1220
430
- },
431
- {
432
- "epoch": 6.91,
433
- "learning_rate": 2e-05,
434
- "loss": 0.1191,
435
- "step": 1240
436
- },
437
- {
438
- "epoch": 7.0,
439
- "eval_f1": 0.20265617314313825,
440
- "eval_loss": 2.941455602645874,
441
- "eval_runtime": 22.6391,
442
- "eval_samples_per_second": 84.632,
443
- "eval_steps_per_second": 1.325,
444
- "step": 1256
445
- },
446
- {
447
- "epoch": 7.02,
448
- "learning_rate": 2e-05,
449
- "loss": 0.069,
450
- "step": 1260
451
- },
452
- {
453
- "epoch": 7.13,
454
- "learning_rate": 2e-05,
455
- "loss": 0.009,
456
- "step": 1280
457
- },
458
- {
459
- "epoch": 7.24,
460
- "learning_rate": 2e-05,
461
- "loss": 0.0485,
462
- "step": 1300
463
- },
464
- {
465
- "epoch": 7.35,
466
- "learning_rate": 2e-05,
467
- "loss": 0.0105,
468
- "step": 1320
469
- },
470
- {
471
- "epoch": 7.47,
472
- "learning_rate": 2e-05,
473
- "loss": 0.0835,
474
- "step": 1340
475
- },
476
- {
477
- "epoch": 7.58,
478
- "learning_rate": 2e-05,
479
- "loss": 0.1458,
480
- "step": 1360
481
- },
482
- {
483
- "epoch": 7.69,
484
- "learning_rate": 2e-05,
485
- "loss": 0.0553,
486
- "step": 1380
487
- },
488
- {
489
- "epoch": 7.8,
490
- "learning_rate": 2e-05,
491
- "loss": 0.005,
492
- "step": 1400
493
- },
494
- {
495
- "epoch": 7.91,
496
- "learning_rate": 2e-05,
497
- "loss": 0.1008,
498
- "step": 1420
499
- },
500
- {
501
- "epoch": 8.0,
502
- "eval_f1": 0.9023255813953488,
503
- "eval_loss": 0.17851048707962036,
504
- "eval_runtime": 22.8798,
505
- "eval_samples_per_second": 83.742,
506
- "eval_steps_per_second": 1.311,
507
- "step": 1436
508
- },
509
- {
510
- "epoch": 8.02,
511
- "learning_rate": 2e-05,
512
- "loss": 0.0414,
513
- "step": 1440
514
- },
515
- {
516
- "epoch": 8.13,
517
- "learning_rate": 2e-05,
518
- "loss": 0.0351,
519
- "step": 1460
520
- },
521
- {
522
- "epoch": 8.25,
523
- "learning_rate": 2e-05,
524
- "loss": 0.115,
525
- "step": 1480
526
- },
527
- {
528
- "epoch": 8.36,
529
- "learning_rate": 2e-05,
530
- "loss": 0.0453,
531
- "step": 1500
532
- },
533
- {
534
- "epoch": 4.23,
535
- "learning_rate": 0.001,
536
- "loss": 18.7223,
537
- "step": 1520
538
- },
539
- {
540
- "epoch": 4.29,
541
- "learning_rate": 0.001,
542
- "loss": 5.5701,
543
- "step": 1540
544
- },
545
- {
546
- "epoch": 4.35,
547
- "learning_rate": 0.001,
548
- "loss": 1.2935,
549
- "step": 1560
550
- },
551
- {
552
- "epoch": 4.4,
553
- "learning_rate": 0.001,
554
- "loss": 0.6161,
555
- "step": 1580
556
- },
557
- {
558
- "epoch": 4.46,
559
- "learning_rate": 0.001,
560
- "loss": 0.757,
561
- "step": 1600
562
- },
563
- {
564
- "epoch": 4.51,
565
- "learning_rate": 0.001,
566
- "loss": 0.6241,
567
- "step": 1620
568
- },
569
- {
570
- "epoch": 4.57,
571
- "learning_rate": 0.001,
572
- "loss": 0.5211,
573
- "step": 1640
574
- },
575
- {
576
- "epoch": 4.62,
577
- "learning_rate": 0.001,
578
- "loss": 0.4467,
579
- "step": 1660
580
- },
581
- {
582
- "epoch": 4.68,
583
- "learning_rate": 0.001,
584
- "loss": 0.424,
585
- "step": 1680
586
- },
587
- {
588
- "epoch": 4.74,
589
- "learning_rate": 0.001,
590
- "loss": 0.3741,
591
- "step": 1700
592
- },
593
- {
594
- "epoch": 4.79,
595
- "learning_rate": 0.001,
596
- "loss": 0.3276,
597
- "step": 1720
598
  },
599
  {
600
- "epoch": 4.85,
601
- "learning_rate": 0.001,
602
- "loss": 0.3692,
603
- "step": 1740
604
  },
605
  {
606
- "epoch": 4.9,
607
- "learning_rate": 0.001,
608
  "loss": 0.3626,
609
- "step": 1760
610
- },
611
- {
612
- "epoch": 4.96,
613
- "learning_rate": 0.001,
614
- "loss": 0.3698,
615
- "step": 1780
616
- },
617
- {
618
- "epoch": 5.0,
619
- "eval_f1": 0.0,
620
- "eval_loss": 0.3513816297054291,
621
- "eval_runtime": 20.8512,
622
- "eval_samples_per_second": 91.889,
623
- "eval_steps_per_second": 1.439,
624
- "step": 1795
625
- },
626
- {
627
- "epoch": 5.01,
628
- "learning_rate": 0.001,
629
- "loss": 0.3672,
630
- "step": 1800
631
- },
632
- {
633
- "epoch": 5.07,
634
- "learning_rate": 0.001,
635
- "loss": 0.3879,
636
- "step": 1820
637
- },
638
- {
639
- "epoch": 5.13,
640
- "learning_rate": 0.001,
641
- "loss": 0.458,
642
- "step": 1840
643
- },
644
- {
645
- "epoch": 5.18,
646
- "learning_rate": 0.001,
647
- "loss": 0.3949,
648
- "step": 1860
649
- },
650
- {
651
- "epoch": 5.24,
652
- "learning_rate": 0.001,
653
- "loss": 0.372,
654
- "step": 1880
655
- },
656
- {
657
- "epoch": 5.29,
658
- "learning_rate": 0.001,
659
- "loss": 0.3578,
660
- "step": 1900
661
- },
662
- {
663
- "epoch": 5.35,
664
- "learning_rate": 0.001,
665
- "loss": 0.3906,
666
- "step": 1920
667
- },
668
- {
669
- "epoch": 5.4,
670
- "learning_rate": 0.001,
671
- "loss": 0.3888,
672
- "step": 1940
673
- },
674
- {
675
- "epoch": 5.46,
676
- "learning_rate": 0.001,
677
- "loss": 0.4049,
678
- "step": 1960
679
- },
680
- {
681
- "epoch": 5.52,
682
- "learning_rate": 0.001,
683
- "loss": 0.3692,
684
- "step": 1980
685
- },
686
- {
687
- "epoch": 5.57,
688
- "learning_rate": 0.001,
689
- "loss": 0.3299,
690
- "step": 2000
691
- },
692
- {
693
- "epoch": 5.63,
694
- "learning_rate": 0.001,
695
- "loss": 0.3714,
696
- "step": 2020
697
- },
698
- {
699
- "epoch": 5.68,
700
- "learning_rate": 0.001,
701
- "loss": 0.3423,
702
- "step": 2040
703
- },
704
- {
705
- "epoch": 5.74,
706
- "learning_rate": 0.001,
707
- "loss": 0.3534,
708
- "step": 2060
709
- },
710
- {
711
- "epoch": 5.79,
712
- "learning_rate": 0.001,
713
- "loss": 0.3426,
714
- "step": 2080
715
- },
716
- {
717
- "epoch": 5.85,
718
- "learning_rate": 0.001,
719
- "loss": 0.3684,
720
- "step": 2100
721
- },
722
- {
723
- "epoch": 5.91,
724
- "learning_rate": 0.001,
725
- "loss": 0.3472,
726
- "step": 2120
727
- },
728
- {
729
- "epoch": 5.96,
730
- "learning_rate": 0.001,
731
- "loss": 0.299,
732
- "step": 2140
733
- },
734
- {
735
- "epoch": 6.0,
736
- "eval_f1": 0.0,
737
- "eval_loss": 0.3469391465187073,
738
- "eval_runtime": 20.5335,
739
- "eval_samples_per_second": 93.311,
740
- "eval_steps_per_second": 1.461,
741
- "step": 2154
742
- },
743
- {
744
- "epoch": 6.02,
745
- "learning_rate": 0.001,
746
- "loss": 0.3336,
747
- "step": 2160
748
- },
749
- {
750
- "epoch": 6.07,
751
- "learning_rate": 0.001,
752
- "loss": 0.4366,
753
- "step": 2180
754
- },
755
- {
756
- "epoch": 6.13,
757
- "learning_rate": 0.001,
758
- "loss": 0.3709,
759
- "step": 2200
760
- },
761
- {
762
- "epoch": 6.18,
763
- "learning_rate": 0.001,
764
- "loss": 0.3357,
765
- "step": 2220
766
- },
767
- {
768
- "epoch": 6.24,
769
- "learning_rate": 0.001,
770
- "loss": 0.4034,
771
- "step": 2240
772
- },
773
- {
774
- "epoch": 6.3,
775
- "learning_rate": 0.001,
776
- "loss": 0.3868,
777
- "step": 2260
778
- },
779
- {
780
- "epoch": 6.35,
781
- "learning_rate": 0.001,
782
- "loss": 0.3328,
783
- "step": 2280
784
- },
785
- {
786
- "epoch": 6.41,
787
- "learning_rate": 0.001,
788
- "loss": 0.3974,
789
- "step": 2300
790
- },
791
- {
792
- "epoch": 6.46,
793
- "learning_rate": 0.001,
794
- "loss": 0.3707,
795
- "step": 2320
796
- },
797
- {
798
- "epoch": 6.52,
799
- "learning_rate": 0.001,
800
- "loss": 0.3753,
801
- "step": 2340
802
- },
803
- {
804
- "epoch": 6.57,
805
- "learning_rate": 0.001,
806
- "loss": 0.3255,
807
- "step": 2360
808
- },
809
- {
810
- "epoch": 6.63,
811
- "learning_rate": 0.001,
812
- "loss": 0.4284,
813
- "step": 2380
814
- },
815
- {
816
- "epoch": 6.69,
817
- "learning_rate": 0.001,
818
- "loss": 0.3699,
819
- "step": 2400
820
- },
821
- {
822
- "epoch": 6.74,
823
- "learning_rate": 0.001,
824
- "loss": 0.3705,
825
- "step": 2420
826
- },
827
- {
828
- "epoch": 6.8,
829
- "learning_rate": 0.001,
830
- "loss": 0.2841,
831
- "step": 2440
832
- },
833
- {
834
- "epoch": 6.85,
835
- "learning_rate": 0.001,
836
- "loss": 0.2687,
837
- "step": 2460
838
- },
839
- {
840
- "epoch": 6.91,
841
- "learning_rate": 0.001,
842
- "loss": 0.3294,
843
- "step": 2480
844
- },
845
- {
846
- "epoch": 6.96,
847
- "learning_rate": 0.001,
848
- "loss": 0.3531,
849
- "step": 2500
850
- },
851
- {
852
- "epoch": 7.0,
853
- "eval_f1": 0.0,
854
- "eval_loss": 0.3420043885707855,
855
- "eval_runtime": 20.5195,
856
- "eval_samples_per_second": 93.374,
857
- "eval_steps_per_second": 1.462,
858
- "step": 2513
859
- },
860
- {
861
- "epoch": 7.02,
862
- "learning_rate": 0.001,
863
- "loss": 0.3396,
864
- "step": 2520
865
- },
866
- {
867
- "epoch": 7.08,
868
- "learning_rate": 0.001,
869
- "loss": 0.3824,
870
- "step": 2540
871
- },
872
- {
873
- "epoch": 7.13,
874
- "learning_rate": 0.001,
875
- "loss": 0.2518,
876
- "step": 2560
877
- },
878
- {
879
- "epoch": 7.19,
880
- "learning_rate": 0.001,
881
- "loss": 0.3822,
882
- "step": 2580
883
- },
884
- {
885
- "epoch": 7.24,
886
- "learning_rate": 0.001,
887
- "loss": 0.3969,
888
- "step": 2600
889
- },
890
- {
891
- "epoch": 7.3,
892
- "learning_rate": 0.001,
893
- "loss": 0.2551,
894
- "step": 2620
895
- },
896
- {
897
- "epoch": 7.35,
898
- "learning_rate": 0.001,
899
- "loss": 0.3387,
900
- "step": 2640
901
- },
902
- {
903
- "epoch": 7.41,
904
- "learning_rate": 0.001,
905
- "loss": 0.3761,
906
- "step": 2660
907
- },
908
- {
909
- "epoch": 7.47,
910
- "learning_rate": 0.001,
911
- "loss": 0.3899,
912
- "step": 2680
913
- },
914
- {
915
- "epoch": 7.52,
916
- "learning_rate": 0.001,
917
- "loss": 0.3691,
918
- "step": 2700
919
- },
920
- {
921
- "epoch": 7.58,
922
- "learning_rate": 0.001,
923
- "loss": 0.3172,
924
- "step": 2720
925
- },
926
- {
927
- "epoch": 7.63,
928
- "learning_rate": 0.001,
929
- "loss": 0.3358,
930
- "step": 2740
931
- },
932
- {
933
- "epoch": 7.69,
934
- "learning_rate": 0.001,
935
- "loss": 0.3459,
936
- "step": 2760
937
- },
938
- {
939
- "epoch": 7.74,
940
- "learning_rate": 0.001,
941
- "loss": 0.3347,
942
- "step": 2780
943
- },
944
- {
945
- "epoch": 7.8,
946
- "learning_rate": 0.001,
947
- "loss": 0.3459,
948
- "step": 2800
949
- },
950
- {
951
- "epoch": 7.86,
952
- "learning_rate": 0.001,
953
- "loss": 0.3797,
954
- "step": 2820
955
- },
956
- {
957
- "epoch": 7.91,
958
- "learning_rate": 0.001,
959
- "loss": 0.3721,
960
- "step": 2840
961
- },
962
- {
963
- "epoch": 7.97,
964
- "learning_rate": 0.001,
965
- "loss": 0.3892,
966
- "step": 2860
967
- },
968
- {
969
- "epoch": 8.0,
970
- "eval_f1": 0.0,
971
- "eval_loss": 0.34283891320228577,
972
- "eval_runtime": 20.5477,
973
- "eval_samples_per_second": 93.247,
974
- "eval_steps_per_second": 1.46,
975
- "step": 2872
976
- },
977
- {
978
- "epoch": 8.02,
979
- "learning_rate": 0.001,
980
- "loss": 0.3234,
981
- "step": 2880
982
- },
983
- {
984
- "epoch": 8.08,
985
- "learning_rate": 0.001,
986
- "loss": 0.3979,
987
- "step": 2900
988
- },
989
- {
990
- "epoch": 8.13,
991
- "learning_rate": 0.001,
992
- "loss": 0.4032,
993
- "step": 2920
994
- },
995
- {
996
- "epoch": 8.19,
997
- "learning_rate": 0.001,
998
- "loss": 0.3787,
999
- "step": 2940
1000
- },
1001
- {
1002
- "epoch": 8.25,
1003
- "learning_rate": 0.001,
1004
- "loss": 0.3144,
1005
- "step": 2960
1006
- },
1007
- {
1008
- "epoch": 8.3,
1009
- "learning_rate": 0.001,
1010
- "loss": 0.4071,
1011
- "step": 2980
1012
- },
1013
- {
1014
- "epoch": 8.36,
1015
- "learning_rate": 0.001,
1016
- "loss": 0.3192,
1017
- "step": 3000
1018
- },
1019
- {
1020
- "epoch": 8.41,
1021
- "learning_rate": 0.001,
1022
- "loss": 0.3194,
1023
- "step": 3020
1024
- },
1025
- {
1026
- "epoch": 8.47,
1027
- "learning_rate": 0.001,
1028
- "loss": 0.3468,
1029
- "step": 3040
1030
- },
1031
- {
1032
- "epoch": 8.52,
1033
- "learning_rate": 0.001,
1034
- "loss": 0.325,
1035
- "step": 3060
1036
- },
1037
- {
1038
- "epoch": 8.58,
1039
- "learning_rate": 0.001,
1040
- "loss": 0.3631,
1041
- "step": 3080
1042
- },
1043
- {
1044
- "epoch": 8.64,
1045
- "learning_rate": 0.001,
1046
- "loss": 0.3464,
1047
- "step": 3100
1048
- },
1049
- {
1050
- "epoch": 8.69,
1051
- "learning_rate": 0.001,
1052
- "loss": 0.3378,
1053
- "step": 3120
1054
- },
1055
- {
1056
- "epoch": 8.75,
1057
- "learning_rate": 0.001,
1058
- "loss": 0.3808,
1059
- "step": 3140
1060
- },
1061
- {
1062
- "epoch": 8.8,
1063
- "learning_rate": 0.001,
1064
- "loss": 0.3668,
1065
- "step": 3160
1066
- },
1067
- {
1068
- "epoch": 8.86,
1069
- "learning_rate": 0.001,
1070
- "loss": 0.3045,
1071
- "step": 3180
1072
- },
1073
- {
1074
- "epoch": 8.91,
1075
- "learning_rate": 0.001,
1076
- "loss": 0.2805,
1077
- "step": 3200
1078
- },
1079
- {
1080
- "epoch": 8.97,
1081
- "learning_rate": 0.001,
1082
- "loss": 0.3706,
1083
- "step": 3220
1084
- },
1085
- {
1086
- "epoch": 9.0,
1087
- "eval_f1": 0.0,
1088
- "eval_loss": 0.3420598804950714,
1089
- "eval_runtime": 20.5266,
1090
- "eval_samples_per_second": 93.342,
1091
- "eval_steps_per_second": 1.462,
1092
- "step": 3231
1093
- },
1094
- {
1095
- "epoch": 9.03,
1096
- "learning_rate": 0.001,
1097
- "loss": 0.3502,
1098
- "step": 3240
1099
- },
1100
- {
1101
- "epoch": 9.08,
1102
- "learning_rate": 0.001,
1103
- "loss": 0.3414,
1104
- "step": 3260
1105
  },
1106
  {
1107
- "epoch": 9.14,
1108
- "learning_rate": 0.001,
1109
- "loss": 0.4037,
1110
- "step": 3280
1111
  },
1112
  {
1113
- "epoch": 9.19,
1114
- "learning_rate": 0.001,
1115
- "loss": 0.3548,
1116
- "step": 3300
1117
  },
1118
  {
1119
- "epoch": 9.25,
1120
- "learning_rate": 0.001,
1121
- "loss": 0.3426,
1122
- "step": 3320
1123
  },
1124
  {
1125
- "epoch": 9.3,
1126
- "learning_rate": 0.001,
1127
- "loss": 0.3614,
1128
- "step": 3340
1129
  },
1130
  {
1131
- "epoch": 9.36,
1132
- "learning_rate": 0.001,
1133
- "loss": 0.2505,
1134
- "step": 3360
1135
  },
1136
  {
1137
- "epoch": 9.42,
1138
- "learning_rate": 0.001,
1139
- "loss": 0.402,
1140
- "step": 3380
1141
  },
1142
  {
1143
- "epoch": 9.47,
1144
- "learning_rate": 0.001,
1145
- "loss": 0.3029,
1146
- "step": 3400
1147
  },
1148
  {
1149
- "epoch": 9.53,
1150
- "learning_rate": 0.001,
1151
- "loss": 0.2799,
1152
- "step": 3420
1153
  },
1154
  {
1155
- "epoch": 9.58,
1156
- "learning_rate": 0.001,
1157
- "loss": 0.3046,
1158
- "step": 3440
1159
  },
1160
  {
1161
- "epoch": 9.64,
1162
- "learning_rate": 0.001,
1163
- "loss": 0.3707,
1164
- "step": 3460
1165
  },
1166
  {
1167
- "epoch": 9.69,
1168
- "learning_rate": 0.001,
1169
- "loss": 0.3417,
1170
- "step": 3480
1171
  },
1172
  {
1173
- "epoch": 9.75,
1174
- "learning_rate": 0.001,
1175
- "loss": 0.3826,
1176
- "step": 3500
1177
  },
1178
  {
1179
- "epoch": 9.81,
1180
- "learning_rate": 0.001,
1181
- "loss": 0.3658,
1182
- "step": 3520
1183
  },
1184
  {
1185
- "epoch": 9.86,
1186
- "learning_rate": 0.001,
1187
- "loss": 0.3185,
1188
- "step": 3540
1189
  },
1190
  {
1191
- "epoch": 9.92,
1192
- "learning_rate": 0.001,
1193
- "loss": 0.3596,
1194
- "step": 3560
1195
  },
1196
  {
1197
- "epoch": 9.97,
1198
- "learning_rate": 0.001,
1199
- "loss": 0.3863,
1200
- "step": 3580
1201
  },
1202
  {
1203
- "epoch": 10.0,
1204
- "eval_f1": 0.0,
1205
- "eval_loss": 0.34448280930519104,
1206
- "eval_runtime": 20.4875,
1207
- "eval_samples_per_second": 93.52,
1208
- "eval_steps_per_second": 1.464,
1209
- "step": 3590
1210
  },
1211
  {
1212
- "epoch": 10.0,
1213
- "step": 3590,
1214
- "total_flos": 3792562624069632.0,
1215
- "train_loss": 0.3510875488058106,
1216
- "train_runtime": 3949.5247,
1217
- "train_samples_per_second": 14.541,
1218
- "train_steps_per_second": 0.909
1219
  }
1220
  ],
1221
- "logging_steps": 20,
1222
- "max_steps": 3590,
1223
  "num_input_tokens_seen": 0,
1224
- "num_train_epochs": 10,
1225
  "save_steps": 500,
1226
- "total_flos": 3792562624069632.0,
1227
- "train_batch_size": 2,
1228
  "trial_name": null,
1229
  "trial_params": null
1230
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
  "eval_steps": 500,
6
+ "global_step": 540,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
+ {
12
+ "epoch": 0.06,
13
+ "learning_rate": 1.4814814814814812e-06,
14
+ "loss": 118.6129,
15
+ "step": 10
16
+ },
17
  {
18
  "epoch": 0.11,
19
+ "learning_rate": 2.9629629629629625e-06,
20
+ "loss": 96.0751,
21
  "step": 20
22
  },
23
+ {
24
+ "epoch": 0.17,
25
+ "learning_rate": 4.444444444444444e-06,
26
+ "loss": 96.9119,
27
+ "step": 30
28
+ },
29
  {
30
  "epoch": 0.22,
31
+ "learning_rate": 5.925925925925925e-06,
32
+ "loss": 85.8261,
33
  "step": 40
34
  },
35
+ {
36
+ "epoch": 0.28,
37
+ "learning_rate": 7.4074074074074075e-06,
38
+ "loss": 76.0406,
39
+ "step": 50
40
+ },
41
  {
42
  "epoch": 0.33,
43
+ "learning_rate": 7.99699181001692e-06,
44
+ "loss": 62.1596,
45
  "step": 60
46
  },
47
  {
48
+ "epoch": 0.39,
49
+ "learning_rate": 7.978624809626406e-06,
50
+ "loss": 60.1637,
51
+ "step": 70
52
+ },
53
+ {
54
+ "epoch": 0.44,
55
+ "learning_rate": 7.943638653379184e-06,
56
+ "loss": 49.0233,
57
  "step": 80
58
  },
59
+ {
60
+ "epoch": 0.5,
61
+ "learning_rate": 7.892179482319294e-06,
62
+ "loss": 61.9877,
63
+ "step": 90
64
+ },
65
  {
66
  "epoch": 0.56,
67
+ "learning_rate": 7.824462247095518e-06,
68
+ "loss": 41.0057,
69
  "step": 100
70
  },
71
+ {
72
+ "epoch": 0.61,
73
+ "learning_rate": 7.740769810088759e-06,
74
+ "loss": 28.3225,
75
+ "step": 110
76
+ },
77
  {
78
  "epoch": 0.67,
79
+ "learning_rate": 7.641451763864587e-06,
80
+ "loss": 21.0979,
81
  "step": 120
82
  },
83
+ {
84
+ "epoch": 0.72,
85
+ "learning_rate": 7.526922970886431e-06,
86
+ "loss": 18.2794,
87
+ "step": 130
88
+ },
89
  {
90
  "epoch": 0.78,
91
+ "learning_rate": 7.3976618305891895e-06,
92
+ "loss": 9.044,
93
  "step": 140
94
  },
95
+ {
96
+ "epoch": 0.83,
97
+ "learning_rate": 7.2542082810518696e-06,
98
+ "loss": 6.2416,
99
+ "step": 150
100
+ },
101
  {
102
  "epoch": 0.89,
103
+ "learning_rate": 7.097161543616529e-06,
104
+ "loss": 2.6869,
105
  "step": 160
106
  },
107
  {
108
+ "epoch": 0.94,
109
+ "learning_rate": 6.927177619874449e-06,
110
+ "loss": 1.5778,
111
+ "step": 170
 
 
 
112
  },
113
  {
114
  "epoch": 1.0,
115
+ "learning_rate": 6.744966551474935e-06,
116
+ "loss": 1.5514,
117
  "step": 180
118
  },
119
  {
120
+ "epoch": 1.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  "eval_f1": 0.1941564561734213,
122
+ "eval_loss": 15.080679893493652,
123
+ "eval_runtime": 20.7366,
124
+ "eval_samples_per_second": 92.397,
125
+ "eval_steps_per_second": 1.447,
126
+ "step": 180
 
 
 
 
 
 
127
  },
128
  {
129
+ "epoch": 1.06,
130
+ "learning_rate": 6.551289454202823e-06,
131
+ "loss": 1.0316,
132
+ "step": 190
133
  },
134
  {
135
+ "epoch": 1.11,
136
+ "learning_rate": 6.346955338713672e-06,
137
+ "loss": 1.3113,
138
+ "step": 200
139
  },
140
  {
141
+ "epoch": 1.17,
142
+ "learning_rate": 6.132817731206765e-06,
143
+ "loss": 0.9913,
144
+ "step": 210
145
  },
146
  {
147
+ "epoch": 1.22,
148
+ "learning_rate": 5.9097711081517955e-06,
149
+ "loss": 0.9418,
150
+ "step": 220
151
  },
152
  {
153
+ "epoch": 1.28,
154
+ "learning_rate": 5.678747159961716e-06,
155
+ "loss": 1.3579,
156
+ "step": 230
157
  },
158
  {
159
+ "epoch": 1.33,
160
+ "learning_rate": 5.440710899218841e-06,
161
+ "loss": 0.7793,
162
+ "step": 240
163
  },
164
  {
165
+ "epoch": 1.39,
166
+ "learning_rate": 5.196656629710567e-06,
167
+ "loss": 0.8691,
168
+ "step": 250
169
  },
170
  {
171
+ "epoch": 1.44,
172
+ "learning_rate": 4.947603793112476e-06,
173
+ "loss": 0.7268,
174
+ "step": 260
175
  },
176
  {
177
+ "epoch": 1.5,
178
+ "learning_rate": 4.694592710667722e-06,
179
+ "loss": 1.3357,
180
+ "step": 270
 
 
 
181
  },
182
  {
183
+ "epoch": 1.56,
184
+ "learning_rate": 4.438680237650126e-06,
185
+ "loss": 0.7497,
186
+ "step": 280
187
  },
188
  {
189
+ "epoch": 1.61,
190
+ "learning_rate": 4.180935348762825e-06,
191
+ "loss": 0.5343,
192
+ "step": 290
193
  },
194
  {
195
+ "epoch": 1.67,
196
+ "learning_rate": 3.922434672912703e-06,
197
+ "loss": 1.0287,
198
+ "step": 300
199
  },
200
  {
201
+ "epoch": 1.72,
202
+ "learning_rate": 3.664257996012372e-06,
203
+ "loss": 0.45,
204
+ "step": 310
205
  },
206
  {
207
+ "epoch": 1.78,
208
+ "learning_rate": 3.4074837505950056e-06,
209
+ "loss": 1.4606,
210
+ "step": 320
211
  },
212
  {
213
+ "epoch": 1.83,
214
+ "learning_rate": 3.1531845110823583e-06,
215
+ "loss": 0.4987,
216
+ "step": 330
217
  },
218
  {
219
+ "epoch": 1.89,
220
+ "learning_rate": 2.9024225135227944e-06,
221
+ "loss": 0.4837,
222
+ "step": 340
223
  },
224
  {
225
+ "epoch": 1.94,
226
+ "learning_rate": 2.6562452185138318e-06,
227
+ "loss": 0.4317,
228
+ "step": 350
229
  },
230
  {
231
+ "epoch": 2.0,
232
+ "learning_rate": 2.4156809358433725e-06,
233
+ "loss": 0.309,
234
+ "step": 360
235
  },
236
  {
237
+ "epoch": 2.0,
238
  "eval_f1": 0.1941564561734213,
239
+ "eval_loss": 13.09052562713623,
240
+ "eval_runtime": 20.4809,
241
+ "eval_samples_per_second": 93.55,
242
+ "eval_steps_per_second": 1.465,
243
+ "step": 360
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  },
245
  {
246
+ "epoch": 2.06,
247
+ "learning_rate": 2.181734529125998e-06,
248
+ "loss": 0.4364,
249
+ "step": 370
250
  },
251
  {
252
+ "epoch": 2.11,
253
+ "learning_rate": 1.9553832183765073e-06,
254
  "loss": 0.3626,
255
+ "step": 380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  },
257
  {
258
+ "epoch": 2.17,
259
+ "learning_rate": 1.7375724980538462e-06,
260
+ "loss": 0.1876,
261
+ "step": 390
262
  },
263
  {
264
+ "epoch": 2.22,
265
+ "learning_rate": 1.529212187626172e-06,
266
+ "loss": 0.1443,
267
+ "step": 400
268
  },
269
  {
270
+ "epoch": 2.28,
271
+ "learning_rate": 1.3311726311543211e-06,
272
+ "loss": 0.1844,
273
+ "step": 410
274
  },
275
  {
276
+ "epoch": 2.33,
277
+ "learning_rate": 1.1442810617684044e-06,
278
+ "loss": 0.1862,
279
+ "step": 420
280
  },
281
  {
282
+ "epoch": 2.39,
283
+ "learning_rate": 9.693181462235283e-07,
284
+ "loss": 0.0915,
285
+ "step": 430
286
  },
287
  {
288
+ "epoch": 2.44,
289
+ "learning_rate": 8.070147239684279e-07,
290
+ "loss": 0.0219,
291
+ "step": 440
292
  },
293
  {
294
+ "epoch": 2.5,
295
+ "learning_rate": 6.580487543482549e-07,
296
+ "loss": 0.1046,
297
+ "step": 450
298
  },
299
  {
300
+ "epoch": 2.56,
301
+ "learning_rate": 5.230424846934088e-07,
302
+ "loss": 0.0806,
303
+ "step": 460
304
  },
305
  {
306
+ "epoch": 2.61,
307
+ "learning_rate": 4.025598511236281e-07,
308
+ "loss": 0.1472,
309
+ "step": 470
310
  },
311
  {
312
+ "epoch": 2.67,
313
+ "learning_rate": 2.9710412292443863e-07,
314
+ "loss": 0.1118,
315
+ "step": 480
316
  },
317
  {
318
+ "epoch": 2.72,
319
+ "learning_rate": 2.071158003356941e-07,
320
+ "loss": 0.0869,
321
+ "step": 490
322
  },
323
  {
324
+ "epoch": 2.78,
325
+ "learning_rate": 1.3297077453335193e-07,
326
+ "loss": 0.1594,
327
+ "step": 500
328
  },
329
  {
330
+ "epoch": 2.83,
331
+ "learning_rate": 7.497875749046122e-08,
332
+ "loss": 0.1333,
333
+ "step": 510
334
  },
335
  {
336
+ "epoch": 2.89,
337
+ "learning_rate": 3.3381988275995585e-08,
338
+ "loss": 0.1171,
339
+ "step": 520
340
  },
341
  {
342
+ "epoch": 2.94,
343
+ "learning_rate": 8.354221195471912e-09,
344
+ "loss": 0.2449,
345
+ "step": 530
346
  },
347
  {
348
+ "epoch": 3.0,
349
+ "learning_rate": 0.0,
350
+ "loss": 0.1401,
351
+ "step": 540
352
  },
353
  {
354
+ "epoch": 3.0,
355
+ "eval_f1": 0.19553868058851448,
356
+ "eval_loss": 5.8615827560424805,
357
+ "eval_runtime": 20.4644,
358
+ "eval_samples_per_second": 93.626,
359
+ "eval_steps_per_second": 1.466,
360
+ "step": 540
361
  },
362
  {
363
+ "epoch": 3.0,
364
+ "step": 540,
365
+ "total_flos": 17340365537280.0,
366
+ "train_loss": 15.830373929303002,
367
+ "train_runtime": 1144.0282,
368
+ "train_samples_per_second": 15.06,
369
+ "train_steps_per_second": 0.472
370
  }
371
  ],
372
+ "logging_steps": 10,
373
+ "max_steps": 540,
374
  "num_input_tokens_seen": 0,
375
+ "num_train_epochs": 3,
376
  "save_steps": 500,
377
+ "total_flos": 17340365537280.0,
378
+ "train_batch_size": 4,
379
  "trial_name": null,
380
  "trial_params": null
381
  }