younggi commited on
Commit
daf2799
·
1 Parent(s): d82f812

Training in progress, epoch 0

Browse files
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 49.02,
3
- "eval_accuracy": 0.9290322580645162,
4
- "eval_loss": 0.18336611986160278,
5
- "eval_runtime": 12.5538,
6
- "eval_samples_per_second": 12.347,
7
- "eval_steps_per_second": 3.107
8
  }
 
1
  {
2
  "epoch": 49.02,
3
+ "eval_accuracy": 0.8850574712643678,
4
+ "eval_loss": 0.38414719700813293,
5
+ "eval_runtime": 6.1302,
6
+ "eval_samples_per_second": 14.192,
7
+ "eval_steps_per_second": 3.589
8
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9cb154e0d13e9e3d27632b626366e31b3a8cc4aaf395306ec53c67bb9cf8f30f
3
- size 345004516
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4883b0eb29ea209f1f563a296a757202cbed167c64820f562d4ec4df2c54d1a9
3
+ size 345004552
test_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 49.02,
3
- "eval_accuracy": 0.9290322580645162,
4
- "eval_loss": 0.18336611986160278,
5
- "eval_runtime": 12.5538,
6
- "eval_samples_per_second": 12.347,
7
- "eval_steps_per_second": 3.107
8
  }
 
1
  {
2
  "epoch": 49.02,
3
+ "eval_accuracy": 0.8850574712643678,
4
+ "eval_loss": 0.38414719700813293,
5
+ "eval_runtime": 6.1302,
6
+ "eval_samples_per_second": 14.192,
7
+ "eval_steps_per_second": 3.589
8
  }
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 0.9857142857142858,
3
- "best_model_checkpoint": "videomae-base-finetuned-ucf101-subset/checkpoint-1425",
4
  "epoch": 49.02,
5
  "global_step": 3750,
6
  "is_hyper_param_search": false,
@@ -10,883 +10,883 @@
10
  {
11
  "epoch": 0.0,
12
  "learning_rate": 1.3333333333333334e-06,
13
- "loss": 2.3169,
14
  "step": 10
15
  },
16
  {
17
  "epoch": 0.01,
18
  "learning_rate": 2.666666666666667e-06,
19
- "loss": 2.3252,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 0.01,
24
  "learning_rate": 4.000000000000001e-06,
25
- "loss": 2.2807,
26
  "step": 30
27
  },
28
  {
29
  "epoch": 0.01,
30
  "learning_rate": 5.333333333333334e-06,
31
- "loss": 2.2992,
32
  "step": 40
33
  },
34
  {
35
  "epoch": 0.01,
36
  "learning_rate": 6.666666666666667e-06,
37
- "loss": 2.2765,
38
  "step": 50
39
  },
40
  {
41
  "epoch": 0.02,
42
  "learning_rate": 8.000000000000001e-06,
43
- "loss": 2.2692,
44
  "step": 60
45
  },
46
  {
47
  "epoch": 0.02,
48
  "learning_rate": 9.333333333333334e-06,
49
- "loss": 2.2357,
50
  "step": 70
51
  },
52
  {
53
  "epoch": 0.02,
54
- "eval_accuracy": 0.34285714285714286,
55
- "eval_loss": 2.195152759552002,
56
- "eval_runtime": 5.4445,
57
- "eval_samples_per_second": 12.857,
58
- "eval_steps_per_second": 3.306,
59
  "step": 75
60
  },
61
  {
62
  "epoch": 1.0,
63
  "learning_rate": 1.0666666666666667e-05,
64
- "loss": 2.2979,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 1.0,
69
  "learning_rate": 1.2e-05,
70
- "loss": 2.1564,
71
  "step": 90
72
  },
73
  {
74
  "epoch": 1.01,
75
  "learning_rate": 1.3333333333333333e-05,
76
- "loss": 2.1848,
77
  "step": 100
78
  },
79
  {
80
  "epoch": 1.01,
81
  "learning_rate": 1.4666666666666668e-05,
82
- "loss": 2.072,
83
  "step": 110
84
  },
85
  {
86
  "epoch": 1.01,
87
  "learning_rate": 1.6000000000000003e-05,
88
- "loss": 2.0733,
89
  "step": 120
90
  },
91
  {
92
  "epoch": 1.01,
93
  "learning_rate": 1.7333333333333336e-05,
94
- "loss": 2.0719,
95
  "step": 130
96
  },
97
  {
98
  "epoch": 1.02,
99
  "learning_rate": 1.866666666666667e-05,
100
- "loss": 2.0285,
101
  "step": 140
102
  },
103
  {
104
  "epoch": 1.02,
105
  "learning_rate": 2e-05,
106
- "loss": 1.8271,
107
  "step": 150
108
  },
109
  {
110
  "epoch": 1.02,
111
- "eval_accuracy": 0.35714285714285715,
112
- "eval_loss": 1.8570266962051392,
113
- "eval_runtime": 5.776,
114
- "eval_samples_per_second": 12.119,
115
- "eval_steps_per_second": 3.116,
116
  "step": 150
117
  },
118
  {
119
  "epoch": 2.0,
120
  "learning_rate": 2.1333333333333335e-05,
121
- "loss": 1.6139,
122
  "step": 160
123
  },
124
  {
125
  "epoch": 2.01,
126
  "learning_rate": 2.2666666666666668e-05,
127
- "loss": 1.424,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 2.01,
132
  "learning_rate": 2.4e-05,
133
- "loss": 1.3944,
134
  "step": 180
135
  },
136
  {
137
  "epoch": 2.01,
138
  "learning_rate": 2.5333333333333337e-05,
139
- "loss": 1.1121,
140
  "step": 190
141
  },
142
  {
143
  "epoch": 2.01,
144
  "learning_rate": 2.6666666666666667e-05,
145
- "loss": 1.0835,
146
  "step": 200
147
  },
148
  {
149
  "epoch": 2.02,
150
  "learning_rate": 2.8000000000000003e-05,
151
- "loss": 0.9839,
152
  "step": 210
153
  },
154
  {
155
  "epoch": 2.02,
156
  "learning_rate": 2.9333333333333336e-05,
157
- "loss": 0.8947,
158
  "step": 220
159
  },
160
  {
161
  "epoch": 2.02,
162
- "eval_accuracy": 0.7285714285714285,
163
- "eval_loss": 0.839695155620575,
164
- "eval_runtime": 5.7359,
165
- "eval_samples_per_second": 12.204,
166
- "eval_steps_per_second": 3.138,
167
  "step": 225
168
  },
169
  {
170
  "epoch": 3.0,
171
  "learning_rate": 3.066666666666667e-05,
172
- "loss": 0.8324,
173
  "step": 230
174
  },
175
  {
176
  "epoch": 3.0,
177
  "learning_rate": 3.2000000000000005e-05,
178
- "loss": 0.6889,
179
  "step": 240
180
  },
181
  {
182
  "epoch": 3.01,
183
  "learning_rate": 3.3333333333333335e-05,
184
- "loss": 0.6668,
185
  "step": 250
186
  },
187
  {
188
  "epoch": 3.01,
189
  "learning_rate": 3.466666666666667e-05,
190
- "loss": 0.6746,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 3.01,
195
  "learning_rate": 3.6e-05,
196
- "loss": 0.7853,
197
  "step": 270
198
  },
199
  {
200
  "epoch": 3.01,
201
  "learning_rate": 3.733333333333334e-05,
202
- "loss": 0.3688,
203
  "step": 280
204
  },
205
  {
206
  "epoch": 3.02,
207
  "learning_rate": 3.866666666666667e-05,
208
- "loss": 0.4888,
209
  "step": 290
210
  },
211
  {
212
  "epoch": 3.02,
213
  "learning_rate": 4e-05,
214
- "loss": 0.5347,
215
  "step": 300
216
  },
217
  {
218
  "epoch": 3.02,
219
- "eval_accuracy": 0.8285714285714286,
220
- "eval_loss": 0.5631577968597412,
221
- "eval_runtime": 6.1294,
222
- "eval_samples_per_second": 11.42,
223
- "eval_steps_per_second": 2.937,
224
  "step": 300
225
  },
226
  {
227
  "epoch": 4.0,
228
  "learning_rate": 4.133333333333333e-05,
229
- "loss": 0.4236,
230
  "step": 310
231
  },
232
  {
233
  "epoch": 4.01,
234
  "learning_rate": 4.266666666666667e-05,
235
- "loss": 0.4206,
236
  "step": 320
237
  },
238
  {
239
  "epoch": 4.01,
240
  "learning_rate": 4.4000000000000006e-05,
241
- "loss": 0.4133,
242
  "step": 330
243
  },
244
  {
245
  "epoch": 4.01,
246
  "learning_rate": 4.5333333333333335e-05,
247
- "loss": 0.6152,
248
  "step": 340
249
  },
250
  {
251
  "epoch": 4.01,
252
  "learning_rate": 4.666666666666667e-05,
253
- "loss": 0.7022,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 4.02,
258
  "learning_rate": 4.8e-05,
259
- "loss": 0.4414,
260
  "step": 360
261
  },
262
  {
263
  "epoch": 4.02,
264
  "learning_rate": 4.933333333333334e-05,
265
- "loss": 0.4101,
266
  "step": 370
267
  },
268
  {
269
  "epoch": 4.02,
270
- "eval_accuracy": 0.8,
271
- "eval_loss": 0.690824031829834,
272
- "eval_runtime": 5.752,
273
- "eval_samples_per_second": 12.17,
274
- "eval_steps_per_second": 3.129,
275
  "step": 375
276
  },
277
  {
278
  "epoch": 5.0,
279
  "learning_rate": 4.9925925925925926e-05,
280
- "loss": 0.1599,
281
  "step": 380
282
  },
283
  {
284
  "epoch": 5.0,
285
  "learning_rate": 4.977777777777778e-05,
286
- "loss": 0.3312,
287
  "step": 390
288
  },
289
  {
290
  "epoch": 5.01,
291
  "learning_rate": 4.962962962962963e-05,
292
- "loss": 0.3967,
293
  "step": 400
294
  },
295
  {
296
  "epoch": 5.01,
297
  "learning_rate": 4.9481481481481485e-05,
298
- "loss": 0.3926,
299
  "step": 410
300
  },
301
  {
302
  "epoch": 5.01,
303
  "learning_rate": 4.933333333333334e-05,
304
- "loss": 0.8443,
305
  "step": 420
306
  },
307
  {
308
  "epoch": 5.01,
309
  "learning_rate": 4.918518518518519e-05,
310
- "loss": 0.4227,
311
  "step": 430
312
  },
313
  {
314
  "epoch": 5.02,
315
  "learning_rate": 4.903703703703704e-05,
316
- "loss": 0.2168,
317
  "step": 440
318
  },
319
  {
320
  "epoch": 5.02,
321
  "learning_rate": 4.888888888888889e-05,
322
- "loss": 0.0855,
323
  "step": 450
324
  },
325
  {
326
  "epoch": 5.02,
327
- "eval_accuracy": 0.9571428571428572,
328
- "eval_loss": 0.1541331708431244,
329
- "eval_runtime": 5.9819,
330
- "eval_samples_per_second": 11.702,
331
- "eval_steps_per_second": 3.009,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 6.0,
336
  "learning_rate": 4.874074074074074e-05,
337
- "loss": 0.0751,
338
  "step": 460
339
  },
340
  {
341
  "epoch": 6.01,
342
  "learning_rate": 4.8592592592592596e-05,
343
- "loss": 0.2454,
344
  "step": 470
345
  },
346
  {
347
  "epoch": 6.01,
348
  "learning_rate": 4.844444444444445e-05,
349
- "loss": 0.3209,
350
  "step": 480
351
  },
352
  {
353
  "epoch": 6.01,
354
  "learning_rate": 4.82962962962963e-05,
355
- "loss": 0.3679,
356
  "step": 490
357
  },
358
  {
359
  "epoch": 6.01,
360
  "learning_rate": 4.814814814814815e-05,
361
- "loss": 0.2122,
362
  "step": 500
363
  },
364
  {
365
  "epoch": 6.02,
366
  "learning_rate": 4.8e-05,
367
- "loss": 0.1339,
368
  "step": 510
369
  },
370
  {
371
  "epoch": 6.02,
372
  "learning_rate": 4.7851851851851854e-05,
373
- "loss": 0.2286,
374
  "step": 520
375
  },
376
  {
377
  "epoch": 6.02,
378
- "eval_accuracy": 0.9,
379
- "eval_loss": 0.2762179374694824,
380
- "eval_runtime": 5.6813,
381
- "eval_samples_per_second": 12.321,
382
- "eval_steps_per_second": 3.168,
383
  "step": 525
384
  },
385
  {
386
  "epoch": 7.0,
387
  "learning_rate": 4.770370370370371e-05,
388
- "loss": 0.3897,
389
  "step": 530
390
  },
391
  {
392
  "epoch": 7.0,
393
  "learning_rate": 4.755555555555556e-05,
394
- "loss": 0.0845,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 7.01,
399
  "learning_rate": 4.740740740740741e-05,
400
- "loss": 0.3024,
401
  "step": 550
402
  },
403
  {
404
  "epoch": 7.01,
405
  "learning_rate": 4.7259259259259266e-05,
406
- "loss": 0.0144,
407
  "step": 560
408
  },
409
  {
410
  "epoch": 7.01,
411
  "learning_rate": 4.711111111111111e-05,
412
- "loss": 0.4922,
413
  "step": 570
414
  },
415
  {
416
  "epoch": 7.01,
417
  "learning_rate": 4.6962962962962966e-05,
418
- "loss": 0.0182,
419
  "step": 580
420
  },
421
  {
422
  "epoch": 7.02,
423
  "learning_rate": 4.681481481481482e-05,
424
- "loss": 0.0056,
425
  "step": 590
426
  },
427
  {
428
  "epoch": 7.02,
429
  "learning_rate": 4.666666666666667e-05,
430
- "loss": 0.2905,
431
  "step": 600
432
  },
433
  {
434
  "epoch": 7.02,
435
- "eval_accuracy": 0.9,
436
- "eval_loss": 0.23063896596431732,
437
- "eval_runtime": 5.9851,
438
- "eval_samples_per_second": 11.696,
439
- "eval_steps_per_second": 3.007,
440
  "step": 600
441
  },
442
  {
443
  "epoch": 8.0,
444
  "learning_rate": 4.6518518518518525e-05,
445
- "loss": 0.0139,
446
  "step": 610
447
  },
448
  {
449
  "epoch": 8.01,
450
  "learning_rate": 4.637037037037038e-05,
451
- "loss": 0.3338,
452
  "step": 620
453
  },
454
  {
455
  "epoch": 8.01,
456
  "learning_rate": 4.6222222222222224e-05,
457
- "loss": 0.0218,
458
  "step": 630
459
  },
460
  {
461
  "epoch": 8.01,
462
  "learning_rate": 4.607407407407408e-05,
463
- "loss": 0.0053,
464
  "step": 640
465
  },
466
  {
467
  "epoch": 8.01,
468
  "learning_rate": 4.592592592592593e-05,
469
- "loss": 0.0042,
470
  "step": 650
471
  },
472
  {
473
  "epoch": 8.02,
474
  "learning_rate": 4.577777777777778e-05,
475
- "loss": 0.0115,
476
  "step": 660
477
  },
478
  {
479
  "epoch": 8.02,
480
  "learning_rate": 4.5629629629629636e-05,
481
- "loss": 0.0051,
482
  "step": 670
483
  },
484
  {
485
  "epoch": 8.02,
486
- "eval_accuracy": 0.9571428571428572,
487
- "eval_loss": 0.2053740918636322,
488
- "eval_runtime": 5.9611,
489
- "eval_samples_per_second": 11.743,
490
- "eval_steps_per_second": 3.02,
491
  "step": 675
492
  },
493
  {
494
  "epoch": 9.0,
495
  "learning_rate": 4.548148148148149e-05,
496
- "loss": 0.0034,
497
  "step": 680
498
  },
499
  {
500
  "epoch": 9.0,
501
  "learning_rate": 4.5333333333333335e-05,
502
- "loss": 0.006,
503
  "step": 690
504
  },
505
  {
506
  "epoch": 9.01,
507
  "learning_rate": 4.518518518518519e-05,
508
- "loss": 0.0663,
509
  "step": 700
510
  },
511
  {
512
  "epoch": 9.01,
513
  "learning_rate": 4.503703703703704e-05,
514
- "loss": 0.0074,
515
  "step": 710
516
  },
517
  {
518
  "epoch": 9.01,
519
  "learning_rate": 4.4888888888888894e-05,
520
- "loss": 0.1151,
521
  "step": 720
522
  },
523
  {
524
  "epoch": 9.01,
525
  "learning_rate": 4.474074074074075e-05,
526
- "loss": 0.0627,
527
  "step": 730
528
  },
529
  {
530
  "epoch": 9.02,
531
  "learning_rate": 4.4592592592592594e-05,
532
- "loss": 0.016,
533
  "step": 740
534
  },
535
  {
536
  "epoch": 9.02,
537
  "learning_rate": 4.4444444444444447e-05,
538
- "loss": 0.1142,
539
  "step": 750
540
  },
541
  {
542
  "epoch": 9.02,
543
- "eval_accuracy": 0.8714285714285714,
544
- "eval_loss": 0.704865038394928,
545
- "eval_runtime": 5.6807,
546
- "eval_samples_per_second": 12.323,
547
- "eval_steps_per_second": 3.169,
548
  "step": 750
549
  },
550
  {
551
  "epoch": 10.0,
552
  "learning_rate": 4.42962962962963e-05,
553
- "loss": 0.0937,
554
  "step": 760
555
  },
556
  {
557
  "epoch": 10.01,
558
  "learning_rate": 4.414814814814815e-05,
559
- "loss": 0.1874,
560
  "step": 770
561
  },
562
  {
563
  "epoch": 10.01,
564
  "learning_rate": 4.4000000000000006e-05,
565
- "loss": 0.0798,
566
  "step": 780
567
  },
568
  {
569
  "epoch": 10.01,
570
  "learning_rate": 4.385185185185185e-05,
571
- "loss": 0.0028,
572
  "step": 790
573
  },
574
  {
575
  "epoch": 10.01,
576
  "learning_rate": 4.3703703703703705e-05,
577
- "loss": 0.1547,
578
  "step": 800
579
  },
580
  {
581
  "epoch": 10.02,
582
  "learning_rate": 4.355555555555556e-05,
583
- "loss": 0.1714,
584
  "step": 810
585
  },
586
  {
587
  "epoch": 10.02,
588
  "learning_rate": 4.340740740740741e-05,
589
- "loss": 0.0022,
590
  "step": 820
591
  },
592
  {
593
  "epoch": 10.02,
594
- "eval_accuracy": 0.9428571428571428,
595
- "eval_loss": 0.1919040083885193,
596
- "eval_runtime": 5.707,
597
- "eval_samples_per_second": 12.266,
598
- "eval_steps_per_second": 3.154,
599
  "step": 825
600
  },
601
  {
602
  "epoch": 11.0,
603
  "learning_rate": 4.325925925925926e-05,
604
- "loss": 0.0026,
605
  "step": 830
606
  },
607
  {
608
  "epoch": 11.0,
609
  "learning_rate": 4.311111111111111e-05,
610
- "loss": 0.0029,
611
  "step": 840
612
  },
613
  {
614
  "epoch": 11.01,
615
  "learning_rate": 4.296296296296296e-05,
616
- "loss": 0.0794,
617
  "step": 850
618
  },
619
  {
620
  "epoch": 11.01,
621
  "learning_rate": 4.2814814814814816e-05,
622
- "loss": 0.0708,
623
  "step": 860
624
  },
625
  {
626
  "epoch": 11.01,
627
  "learning_rate": 4.266666666666667e-05,
628
- "loss": 0.0705,
629
  "step": 870
630
  },
631
  {
632
  "epoch": 11.01,
633
  "learning_rate": 4.2518518518518515e-05,
634
- "loss": 0.0549,
635
  "step": 880
636
  },
637
  {
638
  "epoch": 11.02,
639
  "learning_rate": 4.237037037037037e-05,
640
- "loss": 0.0022,
641
  "step": 890
642
  },
643
  {
644
  "epoch": 11.02,
645
  "learning_rate": 4.222222222222222e-05,
646
- "loss": 0.0019,
647
  "step": 900
648
  },
649
  {
650
  "epoch": 11.02,
651
- "eval_accuracy": 0.8857142857142857,
652
- "eval_loss": 0.5478063225746155,
653
- "eval_runtime": 5.7039,
654
- "eval_samples_per_second": 12.272,
655
- "eval_steps_per_second": 3.156,
656
  "step": 900
657
  },
658
  {
659
  "epoch": 12.0,
660
  "learning_rate": 4.2074074074074075e-05,
661
- "loss": 0.0032,
662
  "step": 910
663
  },
664
  {
665
  "epoch": 12.01,
666
  "learning_rate": 4.192592592592593e-05,
667
- "loss": 0.0159,
668
  "step": 920
669
  },
670
  {
671
  "epoch": 12.01,
672
  "learning_rate": 4.177777777777778e-05,
673
- "loss": 0.0861,
674
  "step": 930
675
  },
676
  {
677
  "epoch": 12.01,
678
  "learning_rate": 4.162962962962963e-05,
679
- "loss": 0.2058,
680
  "step": 940
681
  },
682
  {
683
  "epoch": 12.01,
684
  "learning_rate": 4.148148148148148e-05,
685
- "loss": 0.0042,
686
  "step": 950
687
  },
688
  {
689
  "epoch": 12.02,
690
  "learning_rate": 4.133333333333333e-05,
691
- "loss": 0.1814,
692
  "step": 960
693
  },
694
  {
695
  "epoch": 12.02,
696
  "learning_rate": 4.1185185185185186e-05,
697
- "loss": 0.0021,
698
  "step": 970
699
  },
700
  {
701
  "epoch": 12.02,
702
- "eval_accuracy": 0.9285714285714286,
703
- "eval_loss": 0.4232262670993805,
704
- "eval_runtime": 5.5509,
705
- "eval_samples_per_second": 12.611,
706
- "eval_steps_per_second": 3.243,
707
  "step": 975
708
  },
709
  {
710
  "epoch": 13.0,
711
  "learning_rate": 4.103703703703704e-05,
712
- "loss": 0.0036,
713
  "step": 980
714
  },
715
  {
716
  "epoch": 13.0,
717
  "learning_rate": 4.088888888888889e-05,
718
- "loss": 0.0761,
719
  "step": 990
720
  },
721
  {
722
  "epoch": 13.01,
723
  "learning_rate": 4.074074074074074e-05,
724
- "loss": 0.0036,
725
  "step": 1000
726
  },
727
  {
728
  "epoch": 13.01,
729
  "learning_rate": 4.059259259259259e-05,
730
- "loss": 0.1052,
731
  "step": 1010
732
  },
733
  {
734
  "epoch": 13.01,
735
  "learning_rate": 4.0444444444444444e-05,
736
- "loss": 0.0165,
737
  "step": 1020
738
  },
739
  {
740
  "epoch": 13.01,
741
  "learning_rate": 4.02962962962963e-05,
742
- "loss": 0.0347,
743
  "step": 1030
744
  },
745
  {
746
  "epoch": 13.02,
747
  "learning_rate": 4.014814814814815e-05,
748
- "loss": 0.007,
749
  "step": 1040
750
  },
751
  {
752
  "epoch": 13.02,
753
  "learning_rate": 4e-05,
754
- "loss": 0.0019,
755
  "step": 1050
756
  },
757
  {
758
  "epoch": 13.02,
759
- "eval_accuracy": 0.8857142857142857,
760
- "eval_loss": 0.5436500310897827,
761
- "eval_runtime": 5.8387,
762
- "eval_samples_per_second": 11.989,
763
- "eval_steps_per_second": 3.083,
764
  "step": 1050
765
  },
766
  {
767
  "epoch": 14.0,
768
  "learning_rate": 3.985185185185185e-05,
769
- "loss": 0.0304,
770
  "step": 1060
771
  },
772
  {
773
  "epoch": 14.01,
774
  "learning_rate": 3.97037037037037e-05,
775
- "loss": 0.0019,
776
  "step": 1070
777
  },
778
  {
779
  "epoch": 14.01,
780
  "learning_rate": 3.9555555555555556e-05,
781
- "loss": 0.1148,
782
  "step": 1080
783
  },
784
  {
785
  "epoch": 14.01,
786
  "learning_rate": 3.940740740740741e-05,
787
- "loss": 0.0148,
788
  "step": 1090
789
  },
790
  {
791
  "epoch": 14.01,
792
  "learning_rate": 3.925925925925926e-05,
793
- "loss": 0.0016,
794
  "step": 1100
795
  },
796
  {
797
  "epoch": 14.02,
798
  "learning_rate": 3.9111111111111115e-05,
799
- "loss": 0.0113,
800
  "step": 1110
801
  },
802
  {
803
  "epoch": 14.02,
804
  "learning_rate": 3.896296296296296e-05,
805
- "loss": 0.002,
806
  "step": 1120
807
  },
808
  {
809
  "epoch": 14.02,
810
- "eval_accuracy": 0.7857142857142857,
811
- "eval_loss": 1.0354480743408203,
812
- "eval_runtime": 5.6448,
813
- "eval_samples_per_second": 12.401,
814
- "eval_steps_per_second": 3.189,
815
  "step": 1125
816
  },
817
  {
818
  "epoch": 15.0,
819
  "learning_rate": 3.8814814814814814e-05,
820
- "loss": 0.0017,
821
  "step": 1130
822
  },
823
  {
824
  "epoch": 15.0,
825
  "learning_rate": 3.866666666666667e-05,
826
- "loss": 0.1468,
827
  "step": 1140
828
  },
829
  {
830
  "epoch": 15.01,
831
  "learning_rate": 3.851851851851852e-05,
832
- "loss": 0.0023,
833
  "step": 1150
834
  },
835
  {
836
  "epoch": 15.01,
837
  "learning_rate": 3.837037037037037e-05,
838
- "loss": 0.0082,
839
  "step": 1160
840
  },
841
  {
842
  "epoch": 15.01,
843
  "learning_rate": 3.8222222222222226e-05,
844
- "loss": 0.003,
845
  "step": 1170
846
  },
847
  {
848
  "epoch": 15.01,
849
  "learning_rate": 3.807407407407408e-05,
850
- "loss": 0.0822,
851
  "step": 1180
852
  },
853
  {
854
  "epoch": 15.02,
855
  "learning_rate": 3.7925925925925925e-05,
856
- "loss": 0.0015,
857
  "step": 1190
858
  },
859
  {
860
  "epoch": 15.02,
861
  "learning_rate": 3.777777777777778e-05,
862
- "loss": 0.0028,
863
  "step": 1200
864
  },
865
  {
866
  "epoch": 15.02,
867
- "eval_accuracy": 0.9714285714285714,
868
- "eval_loss": 0.10393200069665909,
869
- "eval_runtime": 5.6145,
870
- "eval_samples_per_second": 12.468,
871
- "eval_steps_per_second": 3.206,
872
  "step": 1200
873
  },
874
  {
875
  "epoch": 16.0,
876
  "learning_rate": 3.762962962962963e-05,
877
- "loss": 0.0019,
878
  "step": 1210
879
  },
880
  {
881
  "epoch": 16.01,
882
  "learning_rate": 3.7481481481481484e-05,
883
- "loss": 0.0029,
884
  "step": 1220
885
  },
886
  {
887
  "epoch": 16.01,
888
  "learning_rate": 3.733333333333334e-05,
889
- "loss": 0.0042,
890
  "step": 1230
891
  },
892
  {
@@ -904,301 +904,301 @@
904
  {
905
  "epoch": 16.02,
906
  "learning_rate": 3.688888888888889e-05,
907
- "loss": 0.1956,
908
  "step": 1260
909
  },
910
  {
911
  "epoch": 16.02,
912
  "learning_rate": 3.674074074074074e-05,
913
- "loss": 0.0013,
914
  "step": 1270
915
  },
916
  {
917
  "epoch": 16.02,
918
- "eval_accuracy": 0.9714285714285714,
919
- "eval_loss": 0.1551610380411148,
920
- "eval_runtime": 5.9927,
921
- "eval_samples_per_second": 11.681,
922
- "eval_steps_per_second": 3.004,
923
  "step": 1275
924
  },
925
  {
926
  "epoch": 17.0,
927
  "learning_rate": 3.6592592592592596e-05,
928
- "loss": 0.2917,
929
  "step": 1280
930
  },
931
  {
932
  "epoch": 17.0,
933
  "learning_rate": 3.644444444444445e-05,
934
- "loss": 0.0015,
935
  "step": 1290
936
  },
937
  {
938
  "epoch": 17.01,
939
  "learning_rate": 3.62962962962963e-05,
940
- "loss": 0.3129,
941
  "step": 1300
942
  },
943
  {
944
  "epoch": 17.01,
945
  "learning_rate": 3.614814814814815e-05,
946
- "loss": 0.0016,
947
  "step": 1310
948
  },
949
  {
950
  "epoch": 17.01,
951
  "learning_rate": 3.6e-05,
952
- "loss": 0.1919,
953
  "step": 1320
954
  },
955
  {
956
  "epoch": 17.01,
957
  "learning_rate": 3.5851851851851854e-05,
958
- "loss": 0.0335,
959
  "step": 1330
960
  },
961
  {
962
  "epoch": 17.02,
963
  "learning_rate": 3.570370370370371e-05,
964
- "loss": 0.0014,
965
  "step": 1340
966
  },
967
  {
968
  "epoch": 17.02,
969
  "learning_rate": 3.555555555555556e-05,
970
- "loss": 0.2309,
971
  "step": 1350
972
  },
973
  {
974
  "epoch": 17.02,
975
- "eval_accuracy": 0.9285714285714286,
976
- "eval_loss": 0.2719881236553192,
977
- "eval_runtime": 5.759,
978
- "eval_samples_per_second": 12.155,
979
- "eval_steps_per_second": 3.126,
980
  "step": 1350
981
  },
982
  {
983
  "epoch": 18.0,
984
  "learning_rate": 3.540740740740741e-05,
985
- "loss": 0.0013,
986
  "step": 1360
987
  },
988
  {
989
  "epoch": 18.01,
990
  "learning_rate": 3.525925925925926e-05,
991
- "loss": 0.0018,
992
  "step": 1370
993
  },
994
  {
995
  "epoch": 18.01,
996
  "learning_rate": 3.511111111111111e-05,
997
- "loss": 0.2065,
998
  "step": 1380
999
  },
1000
  {
1001
  "epoch": 18.01,
1002
  "learning_rate": 3.4962962962962965e-05,
1003
- "loss": 0.0106,
1004
  "step": 1390
1005
  },
1006
  {
1007
  "epoch": 18.01,
1008
  "learning_rate": 3.481481481481482e-05,
1009
- "loss": 0.0019,
1010
  "step": 1400
1011
  },
1012
  {
1013
  "epoch": 18.02,
1014
  "learning_rate": 3.466666666666667e-05,
1015
- "loss": 0.0179,
1016
  "step": 1410
1017
  },
1018
  {
1019
  "epoch": 18.02,
1020
  "learning_rate": 3.4518518518518524e-05,
1021
- "loss": 0.1662,
1022
  "step": 1420
1023
  },
1024
  {
1025
  "epoch": 18.02,
1026
- "eval_accuracy": 0.9857142857142858,
1027
- "eval_loss": 0.031152786687016487,
1028
- "eval_runtime": 5.8068,
1029
- "eval_samples_per_second": 12.055,
1030
- "eval_steps_per_second": 3.1,
1031
  "step": 1425
1032
  },
1033
  {
1034
  "epoch": 19.0,
1035
  "learning_rate": 3.437037037037037e-05,
1036
- "loss": 0.0945,
1037
  "step": 1430
1038
  },
1039
  {
1040
  "epoch": 19.0,
1041
  "learning_rate": 3.4222222222222224e-05,
1042
- "loss": 0.1443,
1043
  "step": 1440
1044
  },
1045
  {
1046
  "epoch": 19.01,
1047
  "learning_rate": 3.4074074074074077e-05,
1048
- "loss": 0.0583,
1049
  "step": 1450
1050
  },
1051
  {
1052
  "epoch": 19.01,
1053
  "learning_rate": 3.392592592592593e-05,
1054
- "loss": 0.0015,
1055
  "step": 1460
1056
  },
1057
  {
1058
  "epoch": 19.01,
1059
  "learning_rate": 3.377777777777778e-05,
1060
- "loss": 0.1515,
1061
  "step": 1470
1062
  },
1063
  {
1064
  "epoch": 19.01,
1065
  "learning_rate": 3.3629629629629636e-05,
1066
- "loss": 0.008,
1067
  "step": 1480
1068
  },
1069
  {
1070
  "epoch": 19.02,
1071
  "learning_rate": 3.348148148148148e-05,
1072
- "loss": 0.2176,
1073
  "step": 1490
1074
  },
1075
  {
1076
  "epoch": 19.02,
1077
  "learning_rate": 3.3333333333333335e-05,
1078
- "loss": 0.0199,
1079
  "step": 1500
1080
  },
1081
  {
1082
  "epoch": 19.02,
1083
- "eval_accuracy": 0.9571428571428572,
1084
- "eval_loss": 0.14778082072734833,
1085
- "eval_runtime": 5.8359,
1086
- "eval_samples_per_second": 11.995,
1087
- "eval_steps_per_second": 3.084,
1088
  "step": 1500
1089
  },
1090
  {
1091
  "epoch": 20.0,
1092
  "learning_rate": 3.318518518518519e-05,
1093
- "loss": 0.0017,
1094
  "step": 1510
1095
  },
1096
  {
1097
  "epoch": 20.01,
1098
  "learning_rate": 3.303703703703704e-05,
1099
- "loss": 0.001,
1100
  "step": 1520
1101
  },
1102
  {
1103
  "epoch": 20.01,
1104
  "learning_rate": 3.2888888888888894e-05,
1105
- "loss": 0.0442,
1106
  "step": 1530
1107
  },
1108
  {
1109
  "epoch": 20.01,
1110
  "learning_rate": 3.274074074074075e-05,
1111
- "loss": 0.0019,
1112
  "step": 1540
1113
  },
1114
  {
1115
  "epoch": 20.01,
1116
  "learning_rate": 3.25925925925926e-05,
1117
- "loss": 0.0015,
1118
  "step": 1550
1119
  },
1120
  {
1121
  "epoch": 20.02,
1122
  "learning_rate": 3.2444444444444446e-05,
1123
- "loss": 0.0012,
1124
  "step": 1560
1125
  },
1126
  {
1127
  "epoch": 20.02,
1128
  "learning_rate": 3.22962962962963e-05,
1129
- "loss": 0.001,
1130
  "step": 1570
1131
  },
1132
  {
1133
  "epoch": 20.02,
1134
- "eval_accuracy": 0.9714285714285714,
1135
- "eval_loss": 0.21894989907741547,
1136
- "eval_runtime": 5.952,
1137
- "eval_samples_per_second": 11.761,
1138
- "eval_steps_per_second": 3.024,
1139
  "step": 1575
1140
  },
1141
  {
1142
  "epoch": 21.0,
1143
  "learning_rate": 3.214814814814815e-05,
1144
- "loss": 0.0058,
1145
  "step": 1580
1146
  },
1147
  {
1148
  "epoch": 21.0,
1149
  "learning_rate": 3.2000000000000005e-05,
1150
- "loss": 0.001,
1151
  "step": 1590
1152
  },
1153
  {
1154
  "epoch": 21.01,
1155
  "learning_rate": 3.185185185185185e-05,
1156
- "loss": 0.0271,
1157
  "step": 1600
1158
  },
1159
  {
1160
  "epoch": 21.01,
1161
  "learning_rate": 3.1703703703703705e-05,
1162
- "loss": 0.0009,
1163
  "step": 1610
1164
  },
1165
  {
1166
  "epoch": 21.01,
1167
  "learning_rate": 3.155555555555556e-05,
1168
- "loss": 0.001,
1169
  "step": 1620
1170
  },
1171
  {
1172
  "epoch": 21.01,
1173
  "learning_rate": 3.140740740740741e-05,
1174
- "loss": 0.1786,
1175
  "step": 1630
1176
  },
1177
  {
1178
  "epoch": 21.02,
1179
  "learning_rate": 3.1259259259259264e-05,
1180
- "loss": 0.0056,
1181
  "step": 1640
1182
  },
1183
  {
1184
  "epoch": 21.02,
1185
  "learning_rate": 3.111111111111111e-05,
1186
- "loss": 0.0009,
1187
  "step": 1650
1188
  },
1189
  {
1190
  "epoch": 21.02,
1191
- "eval_accuracy": 0.9714285714285714,
1192
- "eval_loss": 0.15684819221496582,
1193
- "eval_runtime": 5.7784,
1194
- "eval_samples_per_second": 12.114,
1195
- "eval_steps_per_second": 3.115,
1196
  "step": 1650
1197
  },
1198
  {
1199
  "epoch": 22.0,
1200
  "learning_rate": 3.096296296296296e-05,
1201
- "loss": 0.1822,
1202
  "step": 1660
1203
  },
1204
  {
@@ -1210,13 +1210,13 @@
1210
  {
1211
  "epoch": 22.01,
1212
  "learning_rate": 3.066666666666667e-05,
1213
- "loss": 0.0009,
1214
  "step": 1680
1215
  },
1216
  {
1217
  "epoch": 22.01,
1218
  "learning_rate": 3.0518518518518515e-05,
1219
- "loss": 0.0008,
1220
  "step": 1690
1221
  },
1222
  {
@@ -1228,7 +1228,7 @@
1228
  {
1229
  "epoch": 22.02,
1230
  "learning_rate": 3.0222222222222225e-05,
1231
- "loss": 0.001,
1232
  "step": 1710
1233
  },
1234
  {
@@ -1239,17 +1239,17 @@
1239
  },
1240
  {
1241
  "epoch": 22.02,
1242
- "eval_accuracy": 0.9428571428571428,
1243
- "eval_loss": 0.213576078414917,
1244
- "eval_runtime": 5.6561,
1245
- "eval_samples_per_second": 12.376,
1246
- "eval_steps_per_second": 3.182,
1247
  "step": 1725
1248
  },
1249
  {
1250
  "epoch": 23.0,
1251
  "learning_rate": 2.992592592592593e-05,
1252
- "loss": 0.0008,
1253
  "step": 1730
1254
  },
1255
  {
@@ -1261,13 +1261,13 @@
1261
  {
1262
  "epoch": 23.01,
1263
  "learning_rate": 2.962962962962963e-05,
1264
- "loss": 0.0008,
1265
  "step": 1750
1266
  },
1267
  {
1268
  "epoch": 23.01,
1269
  "learning_rate": 2.9481481481481483e-05,
1270
- "loss": 0.0238,
1271
  "step": 1760
1272
  },
1273
  {
@@ -1279,52 +1279,52 @@
1279
  {
1280
  "epoch": 23.01,
1281
  "learning_rate": 2.918518518518519e-05,
1282
- "loss": 0.0007,
1283
  "step": 1780
1284
  },
1285
  {
1286
  "epoch": 23.02,
1287
  "learning_rate": 2.9037037037037042e-05,
1288
- "loss": 0.0009,
1289
  "step": 1790
1290
  },
1291
  {
1292
  "epoch": 23.02,
1293
  "learning_rate": 2.8888888888888888e-05,
1294
- "loss": 0.0007,
1295
  "step": 1800
1296
  },
1297
  {
1298
  "epoch": 23.02,
1299
- "eval_accuracy": 0.9714285714285714,
1300
- "eval_loss": 0.10322745889425278,
1301
- "eval_runtime": 5.6572,
1302
- "eval_samples_per_second": 12.374,
1303
- "eval_steps_per_second": 3.182,
1304
  "step": 1800
1305
  },
1306
  {
1307
  "epoch": 24.0,
1308
  "learning_rate": 2.874074074074074e-05,
1309
- "loss": 0.0008,
1310
  "step": 1810
1311
  },
1312
  {
1313
  "epoch": 24.01,
1314
  "learning_rate": 2.8592592592592594e-05,
1315
- "loss": 0.0007,
1316
  "step": 1820
1317
  },
1318
  {
1319
  "epoch": 24.01,
1320
  "learning_rate": 2.8444444444444447e-05,
1321
- "loss": 0.0008,
1322
  "step": 1830
1323
  },
1324
  {
1325
  "epoch": 24.01,
1326
  "learning_rate": 2.8296296296296297e-05,
1327
- "loss": 0.0007,
1328
  "step": 1840
1329
  },
1330
  {
@@ -1336,22 +1336,22 @@
1336
  {
1337
  "epoch": 24.02,
1338
  "learning_rate": 2.8000000000000003e-05,
1339
- "loss": 0.0006,
1340
  "step": 1860
1341
  },
1342
  {
1343
  "epoch": 24.02,
1344
  "learning_rate": 2.7851851851851853e-05,
1345
- "loss": 0.0007,
1346
  "step": 1870
1347
  },
1348
  {
1349
  "epoch": 24.02,
1350
- "eval_accuracy": 0.9714285714285714,
1351
- "eval_loss": 0.10255812108516693,
1352
- "eval_runtime": 5.7489,
1353
- "eval_samples_per_second": 12.176,
1354
- "eval_steps_per_second": 3.131,
1355
  "step": 1875
1356
  },
1357
  {
@@ -1375,58 +1375,58 @@
1375
  {
1376
  "epoch": 25.01,
1377
  "learning_rate": 2.725925925925926e-05,
1378
- "loss": 0.0007,
1379
  "step": 1910
1380
  },
1381
  {
1382
  "epoch": 25.01,
1383
  "learning_rate": 2.7111111111111114e-05,
1384
- "loss": 0.0006,
1385
  "step": 1920
1386
  },
1387
  {
1388
  "epoch": 25.01,
1389
  "learning_rate": 2.696296296296296e-05,
1390
- "loss": 0.0006,
1391
  "step": 1930
1392
  },
1393
  {
1394
  "epoch": 25.02,
1395
  "learning_rate": 2.6814814814814814e-05,
1396
- "loss": 0.0006,
1397
  "step": 1940
1398
  },
1399
  {
1400
  "epoch": 25.02,
1401
  "learning_rate": 2.6666666666666667e-05,
1402
- "loss": 0.0006,
1403
  "step": 1950
1404
  },
1405
  {
1406
  "epoch": 25.02,
1407
- "eval_accuracy": 0.9571428571428572,
1408
- "eval_loss": 0.11298384517431259,
1409
- "eval_runtime": 5.7507,
1410
- "eval_samples_per_second": 12.173,
1411
- "eval_steps_per_second": 3.13,
1412
  "step": 1950
1413
  },
1414
  {
1415
  "epoch": 26.0,
1416
  "learning_rate": 2.651851851851852e-05,
1417
- "loss": 0.0006,
1418
  "step": 1960
1419
  },
1420
  {
1421
  "epoch": 26.01,
1422
  "learning_rate": 2.6370370370370373e-05,
1423
- "loss": 0.0006,
1424
  "step": 1970
1425
  },
1426
  {
1427
  "epoch": 26.01,
1428
  "learning_rate": 2.6222222222222226e-05,
1429
- "loss": 0.0006,
1430
  "step": 1980
1431
  },
1432
  {
@@ -1438,115 +1438,115 @@
1438
  {
1439
  "epoch": 26.01,
1440
  "learning_rate": 2.5925925925925925e-05,
1441
- "loss": 0.0006,
1442
  "step": 2000
1443
  },
1444
  {
1445
  "epoch": 26.02,
1446
  "learning_rate": 2.5777777777777778e-05,
1447
- "loss": 0.0006,
1448
  "step": 2010
1449
  },
1450
  {
1451
  "epoch": 26.02,
1452
  "learning_rate": 2.562962962962963e-05,
1453
- "loss": 0.0006,
1454
  "step": 2020
1455
  },
1456
  {
1457
  "epoch": 26.02,
1458
- "eval_accuracy": 0.9714285714285714,
1459
- "eval_loss": 0.11467116326093674,
1460
- "eval_runtime": 5.8139,
1461
- "eval_samples_per_second": 12.04,
1462
- "eval_steps_per_second": 3.096,
1463
  "step": 2025
1464
  },
1465
  {
1466
  "epoch": 27.0,
1467
  "learning_rate": 2.5481481481481484e-05,
1468
- "loss": 0.0007,
1469
  "step": 2030
1470
  },
1471
  {
1472
  "epoch": 27.0,
1473
  "learning_rate": 2.5333333333333337e-05,
1474
- "loss": 0.0007,
1475
  "step": 2040
1476
  },
1477
  {
1478
  "epoch": 27.01,
1479
  "learning_rate": 2.5185185185185183e-05,
1480
- "loss": 0.0006,
1481
  "step": 2050
1482
  },
1483
  {
1484
  "epoch": 27.01,
1485
  "learning_rate": 2.5037037037037036e-05,
1486
- "loss": 0.0006,
1487
  "step": 2060
1488
  },
1489
  {
1490
  "epoch": 27.01,
1491
  "learning_rate": 2.488888888888889e-05,
1492
- "loss": 0.0006,
1493
  "step": 2070
1494
  },
1495
  {
1496
  "epoch": 27.01,
1497
  "learning_rate": 2.4740740740740742e-05,
1498
- "loss": 0.0006,
1499
  "step": 2080
1500
  },
1501
  {
1502
  "epoch": 27.02,
1503
  "learning_rate": 2.4592592592592595e-05,
1504
- "loss": 0.0006,
1505
  "step": 2090
1506
  },
1507
  {
1508
  "epoch": 27.02,
1509
  "learning_rate": 2.4444444444444445e-05,
1510
- "loss": 0.0006,
1511
  "step": 2100
1512
  },
1513
  {
1514
  "epoch": 27.02,
1515
- "eval_accuracy": 0.9857142857142858,
1516
- "eval_loss": 0.0858125165104866,
1517
- "eval_runtime": 5.7371,
1518
- "eval_samples_per_second": 12.201,
1519
- "eval_steps_per_second": 3.137,
1520
  "step": 2100
1521
  },
1522
  {
1523
  "epoch": 28.0,
1524
  "learning_rate": 2.4296296296296298e-05,
1525
- "loss": 0.0006,
1526
  "step": 2110
1527
  },
1528
  {
1529
  "epoch": 28.01,
1530
  "learning_rate": 2.414814814814815e-05,
1531
- "loss": 0.0006,
1532
  "step": 2120
1533
  },
1534
  {
1535
  "epoch": 28.01,
1536
  "learning_rate": 2.4e-05,
1537
- "loss": 0.0006,
1538
  "step": 2130
1539
  },
1540
  {
1541
  "epoch": 28.01,
1542
  "learning_rate": 2.3851851851851854e-05,
1543
- "loss": 0.0005,
1544
  "step": 2140
1545
  },
1546
  {
1547
  "epoch": 28.01,
1548
  "learning_rate": 2.3703703703703707e-05,
1549
- "loss": 0.0006,
1550
  "step": 2150
1551
  },
1552
  {
@@ -1558,16 +1558,16 @@
1558
  {
1559
  "epoch": 28.02,
1560
  "learning_rate": 2.340740740740741e-05,
1561
- "loss": 0.0006,
1562
  "step": 2170
1563
  },
1564
  {
1565
  "epoch": 28.02,
1566
- "eval_accuracy": 0.9857142857142858,
1567
- "eval_loss": 0.08683785051107407,
1568
- "eval_runtime": 5.8537,
1569
- "eval_samples_per_second": 11.958,
1570
- "eval_steps_per_second": 3.075,
1571
  "step": 2175
1572
  },
1573
  {
@@ -1579,58 +1579,58 @@
1579
  {
1580
  "epoch": 29.0,
1581
  "learning_rate": 2.3111111111111112e-05,
1582
- "loss": 0.0005,
1583
  "step": 2190
1584
  },
1585
  {
1586
  "epoch": 29.01,
1587
  "learning_rate": 2.2962962962962965e-05,
1588
- "loss": 0.0006,
1589
  "step": 2200
1590
  },
1591
  {
1592
  "epoch": 29.01,
1593
  "learning_rate": 2.2814814814814818e-05,
1594
- "loss": 0.0006,
1595
  "step": 2210
1596
  },
1597
  {
1598
  "epoch": 29.01,
1599
  "learning_rate": 2.2666666666666668e-05,
1600
- "loss": 0.0005,
1601
  "step": 2220
1602
  },
1603
  {
1604
  "epoch": 29.01,
1605
  "learning_rate": 2.251851851851852e-05,
1606
- "loss": 0.0005,
1607
  "step": 2230
1608
  },
1609
  {
1610
  "epoch": 29.02,
1611
  "learning_rate": 2.2370370370370374e-05,
1612
- "loss": 0.0005,
1613
  "step": 2240
1614
  },
1615
  {
1616
  "epoch": 29.02,
1617
  "learning_rate": 2.2222222222222223e-05,
1618
- "loss": 0.0006,
1619
  "step": 2250
1620
  },
1621
  {
1622
  "epoch": 29.02,
1623
- "eval_accuracy": 0.9857142857142858,
1624
- "eval_loss": 0.08798635751008987,
1625
- "eval_runtime": 5.7007,
1626
- "eval_samples_per_second": 12.279,
1627
- "eval_steps_per_second": 3.158,
1628
  "step": 2250
1629
  },
1630
  {
1631
  "epoch": 30.0,
1632
  "learning_rate": 2.2074074074074076e-05,
1633
- "loss": 0.0005,
1634
  "step": 2260
1635
  },
1636
  {
@@ -1642,25 +1642,25 @@
1642
  {
1643
  "epoch": 30.01,
1644
  "learning_rate": 2.177777777777778e-05,
1645
- "loss": 0.0005,
1646
  "step": 2280
1647
  },
1648
  {
1649
  "epoch": 30.01,
1650
  "learning_rate": 2.162962962962963e-05,
1651
- "loss": 0.0005,
1652
  "step": 2290
1653
  },
1654
  {
1655
  "epoch": 30.01,
1656
  "learning_rate": 2.148148148148148e-05,
1657
- "loss": 0.0006,
1658
  "step": 2300
1659
  },
1660
  {
1661
  "epoch": 30.02,
1662
  "learning_rate": 2.1333333333333335e-05,
1663
- "loss": 0.0005,
1664
  "step": 2310
1665
  },
1666
  {
@@ -1671,17 +1671,17 @@
1671
  },
1672
  {
1673
  "epoch": 30.02,
1674
- "eval_accuracy": 0.9857142857142858,
1675
- "eval_loss": 0.08717390149831772,
1676
- "eval_runtime": 5.8675,
1677
- "eval_samples_per_second": 11.93,
1678
- "eval_steps_per_second": 3.068,
1679
  "step": 2325
1680
  },
1681
  {
1682
  "epoch": 31.0,
1683
  "learning_rate": 2.1037037037037037e-05,
1684
- "loss": 0.0005,
1685
  "step": 2330
1686
  },
1687
  {
@@ -1693,7 +1693,7 @@
1693
  {
1694
  "epoch": 31.01,
1695
  "learning_rate": 2.074074074074074e-05,
1696
- "loss": 0.0005,
1697
  "step": 2350
1698
  },
1699
  {
@@ -1728,11 +1728,11 @@
1728
  },
1729
  {
1730
  "epoch": 31.02,
1731
- "eval_accuracy": 0.9857142857142858,
1732
- "eval_loss": 0.09013553708791733,
1733
- "eval_runtime": 5.7699,
1734
- "eval_samples_per_second": 12.132,
1735
- "eval_steps_per_second": 3.12,
1736
  "step": 2400
1737
  },
1738
  {
@@ -1779,11 +1779,11 @@
1779
  },
1780
  {
1781
  "epoch": 32.02,
1782
- "eval_accuracy": 0.9857142857142858,
1783
- "eval_loss": 0.08940133452415466,
1784
- "eval_runtime": 5.7037,
1785
- "eval_samples_per_second": 12.273,
1786
- "eval_steps_per_second": 3.156,
1787
  "step": 2475
1788
  },
1789
  {
@@ -1813,7 +1813,7 @@
1813
  {
1814
  "epoch": 33.01,
1815
  "learning_rate": 1.8222222222222224e-05,
1816
- "loss": 0.0004,
1817
  "step": 2520
1818
  },
1819
  {
@@ -1836,17 +1836,17 @@
1836
  },
1837
  {
1838
  "epoch": 33.02,
1839
- "eval_accuracy": 0.9857142857142858,
1840
- "eval_loss": 0.08584433048963547,
1841
- "eval_runtime": 5.7242,
1842
- "eval_samples_per_second": 12.229,
1843
- "eval_steps_per_second": 3.145,
1844
  "step": 2550
1845
  },
1846
  {
1847
  "epoch": 34.0,
1848
  "learning_rate": 1.762962962962963e-05,
1849
- "loss": 0.0004,
1850
  "step": 2560
1851
  },
1852
  {
@@ -1864,7 +1864,7 @@
1864
  {
1865
  "epoch": 34.01,
1866
  "learning_rate": 1.7185185185185185e-05,
1867
- "loss": 0.0004,
1868
  "step": 2590
1869
  },
1870
  {
@@ -1887,11 +1887,11 @@
1887
  },
1888
  {
1889
  "epoch": 34.02,
1890
- "eval_accuracy": 0.9857142857142858,
1891
- "eval_loss": 0.0912405401468277,
1892
- "eval_runtime": 5.7854,
1893
- "eval_samples_per_second": 12.099,
1894
- "eval_steps_per_second": 3.111,
1895
  "step": 2625
1896
  },
1897
  {
@@ -1903,7 +1903,7 @@
1903
  {
1904
  "epoch": 35.0,
1905
  "learning_rate": 1.6444444444444447e-05,
1906
- "loss": 0.0004,
1907
  "step": 2640
1908
  },
1909
  {
@@ -1915,13 +1915,13 @@
1915
  {
1916
  "epoch": 35.01,
1917
  "learning_rate": 1.614814814814815e-05,
1918
- "loss": 0.0005,
1919
  "step": 2660
1920
  },
1921
  {
1922
  "epoch": 35.01,
1923
  "learning_rate": 1.6000000000000003e-05,
1924
- "loss": 0.1029,
1925
  "step": 2670
1926
  },
1927
  {
@@ -1933,7 +1933,7 @@
1933
  {
1934
  "epoch": 35.02,
1935
  "learning_rate": 1.5703703703703705e-05,
1936
- "loss": 0.0006,
1937
  "step": 2690
1938
  },
1939
  {
@@ -1944,11 +1944,11 @@
1944
  },
1945
  {
1946
  "epoch": 35.02,
1947
- "eval_accuracy": 0.9428571428571428,
1948
- "eval_loss": 0.33030033111572266,
1949
- "eval_runtime": 5.6632,
1950
- "eval_samples_per_second": 12.361,
1951
- "eval_steps_per_second": 3.178,
1952
  "step": 2700
1953
  },
1954
  {
@@ -1960,13 +1960,13 @@
1960
  {
1961
  "epoch": 36.01,
1962
  "learning_rate": 1.5259259259259258e-05,
1963
- "loss": 0.0004,
1964
  "step": 2720
1965
  },
1966
  {
1967
  "epoch": 36.01,
1968
  "learning_rate": 1.5111111111111112e-05,
1969
- "loss": 0.0004,
1970
  "step": 2730
1971
  },
1972
  {
@@ -1984,7 +1984,7 @@
1984
  {
1985
  "epoch": 36.02,
1986
  "learning_rate": 1.4666666666666668e-05,
1987
- "loss": 0.0004,
1988
  "step": 2760
1989
  },
1990
  {
@@ -1995,17 +1995,17 @@
1995
  },
1996
  {
1997
  "epoch": 36.02,
1998
- "eval_accuracy": 0.9428571428571428,
1999
- "eval_loss": 0.17365017533302307,
2000
- "eval_runtime": 5.6606,
2001
- "eval_samples_per_second": 12.366,
2002
- "eval_steps_per_second": 3.18,
2003
  "step": 2775
2004
  },
2005
  {
2006
  "epoch": 37.0,
2007
  "learning_rate": 1.437037037037037e-05,
2008
- "loss": 0.0004,
2009
  "step": 2780
2010
  },
2011
  {
@@ -2023,7 +2023,7 @@
2023
  {
2024
  "epoch": 37.01,
2025
  "learning_rate": 1.3925925925925926e-05,
2026
- "loss": 0.0004,
2027
  "step": 2810
2028
  },
2029
  {
@@ -2035,13 +2035,13 @@
2035
  {
2036
  "epoch": 37.01,
2037
  "learning_rate": 1.362962962962963e-05,
2038
- "loss": 0.0004,
2039
  "step": 2830
2040
  },
2041
  {
2042
  "epoch": 37.02,
2043
  "learning_rate": 1.348148148148148e-05,
2044
- "loss": 0.0004,
2045
  "step": 2840
2046
  },
2047
  {
@@ -2052,11 +2052,11 @@
2052
  },
2053
  {
2054
  "epoch": 37.02,
2055
- "eval_accuracy": 0.9714285714285714,
2056
- "eval_loss": 0.15336854755878448,
2057
- "eval_runtime": 5.7371,
2058
- "eval_samples_per_second": 12.201,
2059
- "eval_steps_per_second": 3.137,
2060
  "step": 2850
2061
  },
2062
  {
@@ -2068,7 +2068,7 @@
2068
  {
2069
  "epoch": 38.01,
2070
  "learning_rate": 1.3037037037037036e-05,
2071
- "loss": 0.0004,
2072
  "step": 2870
2073
  },
2074
  {
@@ -2098,16 +2098,16 @@
2098
  {
2099
  "epoch": 38.02,
2100
  "learning_rate": 1.2296296296296298e-05,
2101
- "loss": 0.0004,
2102
  "step": 2920
2103
  },
2104
  {
2105
  "epoch": 38.02,
2106
- "eval_accuracy": 0.9714285714285714,
2107
- "eval_loss": 0.13503512740135193,
2108
- "eval_runtime": 5.8822,
2109
- "eval_samples_per_second": 11.9,
2110
- "eval_steps_per_second": 3.06,
2111
  "step": 2925
2112
  },
2113
  {
@@ -2131,7 +2131,7 @@
2131
  {
2132
  "epoch": 39.01,
2133
  "learning_rate": 1.1703703703703705e-05,
2134
- "loss": 0.0004,
2135
  "step": 2960
2136
  },
2137
  {
@@ -2160,11 +2160,11 @@
2160
  },
2161
  {
2162
  "epoch": 39.02,
2163
- "eval_accuracy": 0.9714285714285714,
2164
- "eval_loss": 0.1270487755537033,
2165
- "eval_runtime": 5.5912,
2166
- "eval_samples_per_second": 12.52,
2167
- "eval_steps_per_second": 3.219,
2168
  "step": 3000
2169
  },
2170
  {
@@ -2211,11 +2211,11 @@
2211
  },
2212
  {
2213
  "epoch": 40.02,
2214
- "eval_accuracy": 0.9714285714285714,
2215
- "eval_loss": 0.1252710521221161,
2216
- "eval_runtime": 5.672,
2217
- "eval_samples_per_second": 12.341,
2218
- "eval_steps_per_second": 3.173,
2219
  "step": 3075
2220
  },
2221
  {
@@ -2227,7 +2227,7 @@
2227
  {
2228
  "epoch": 41.0,
2229
  "learning_rate": 9.777777777777779e-06,
2230
- "loss": 0.0004,
2231
  "step": 3090
2232
  },
2233
  {
@@ -2257,7 +2257,7 @@
2257
  {
2258
  "epoch": 41.02,
2259
  "learning_rate": 9.037037037037037e-06,
2260
- "loss": 0.0004,
2261
  "step": 3140
2262
  },
2263
  {
@@ -2268,11 +2268,11 @@
2268
  },
2269
  {
2270
  "epoch": 41.02,
2271
- "eval_accuracy": 0.9714285714285714,
2272
- "eval_loss": 0.12408071011304855,
2273
- "eval_runtime": 5.7302,
2274
- "eval_samples_per_second": 12.216,
2275
- "eval_steps_per_second": 3.141,
2276
  "step": 3150
2277
  },
2278
  {
@@ -2284,7 +2284,7 @@
2284
  {
2285
  "epoch": 42.01,
2286
  "learning_rate": 8.592592592592593e-06,
2287
- "loss": 0.0004,
2288
  "step": 3170
2289
  },
2290
  {
@@ -2319,17 +2319,17 @@
2319
  },
2320
  {
2321
  "epoch": 42.02,
2322
- "eval_accuracy": 0.9714285714285714,
2323
- "eval_loss": 0.12468262761831284,
2324
- "eval_runtime": 5.8752,
2325
- "eval_samples_per_second": 11.915,
2326
- "eval_steps_per_second": 3.064,
2327
  "step": 3225
2328
  },
2329
  {
2330
  "epoch": 43.0,
2331
  "learning_rate": 7.703703703703704e-06,
2332
- "loss": 0.0004,
2333
  "step": 3230
2334
  },
2335
  {
@@ -2371,16 +2371,16 @@
2371
  {
2372
  "epoch": 43.02,
2373
  "learning_rate": 6.666666666666667e-06,
2374
- "loss": 0.0004,
2375
  "step": 3300
2376
  },
2377
  {
2378
  "epoch": 43.02,
2379
- "eval_accuracy": 0.9714285714285714,
2380
- "eval_loss": 0.12618018686771393,
2381
- "eval_runtime": 5.7926,
2382
- "eval_samples_per_second": 12.084,
2383
- "eval_steps_per_second": 3.107,
2384
  "step": 3300
2385
  },
2386
  {
@@ -2427,11 +2427,11 @@
2427
  },
2428
  {
2429
  "epoch": 44.02,
2430
- "eval_accuracy": 0.9571428571428572,
2431
- "eval_loss": 0.12661002576351166,
2432
- "eval_runtime": 5.8019,
2433
- "eval_samples_per_second": 12.065,
2434
- "eval_steps_per_second": 3.102,
2435
  "step": 3375
2436
  },
2437
  {
@@ -2461,7 +2461,7 @@
2461
  {
2462
  "epoch": 45.01,
2463
  "learning_rate": 4.888888888888889e-06,
2464
- "loss": 0.0253,
2465
  "step": 3420
2466
  },
2467
  {
@@ -2484,11 +2484,11 @@
2484
  },
2485
  {
2486
  "epoch": 45.02,
2487
- "eval_accuracy": 0.9714285714285714,
2488
- "eval_loss": 0.17405559122562408,
2489
- "eval_runtime": 5.6182,
2490
- "eval_samples_per_second": 12.459,
2491
- "eval_steps_per_second": 3.204,
2492
  "step": 3450
2493
  },
2494
  {
@@ -2535,11 +2535,11 @@
2535
  },
2536
  {
2537
  "epoch": 46.02,
2538
- "eval_accuracy": 0.9714285714285714,
2539
- "eval_loss": 0.17525674402713776,
2540
- "eval_runtime": 5.8413,
2541
- "eval_samples_per_second": 11.984,
2542
- "eval_steps_per_second": 3.082,
2543
  "step": 3525
2544
  },
2545
  {
@@ -2581,7 +2581,7 @@
2581
  {
2582
  "epoch": 47.02,
2583
  "learning_rate": 2.3703703703703703e-06,
2584
- "loss": 0.0006,
2585
  "step": 3590
2586
  },
2587
  {
@@ -2592,11 +2592,11 @@
2592
  },
2593
  {
2594
  "epoch": 47.02,
2595
- "eval_accuracy": 0.9714285714285714,
2596
- "eval_loss": 0.1633566915988922,
2597
- "eval_runtime": 5.6957,
2598
- "eval_samples_per_second": 12.29,
2599
- "eval_steps_per_second": 3.16,
2600
  "step": 3600
2601
  },
2602
  {
@@ -2620,7 +2620,7 @@
2620
  {
2621
  "epoch": 48.01,
2622
  "learning_rate": 1.6296296296296295e-06,
2623
- "loss": 0.0004,
2624
  "step": 3640
2625
  },
2626
  {
@@ -2643,11 +2643,11 @@
2643
  },
2644
  {
2645
  "epoch": 48.02,
2646
- "eval_accuracy": 0.9714285714285714,
2647
- "eval_loss": 0.1603454053401947,
2648
- "eval_runtime": 5.9217,
2649
- "eval_samples_per_second": 11.821,
2650
- "eval_steps_per_second": 3.04,
2651
  "step": 3675
2652
  },
2653
  {
@@ -2700,38 +2700,38 @@
2700
  },
2701
  {
2702
  "epoch": 49.02,
2703
- "eval_accuracy": 0.9714285714285714,
2704
- "eval_loss": 0.16023388504981995,
2705
- "eval_runtime": 5.6893,
2706
- "eval_samples_per_second": 12.304,
2707
- "eval_steps_per_second": 3.164,
2708
  "step": 3750
2709
  },
2710
  {
2711
  "epoch": 49.02,
2712
  "step": 3750,
2713
  "total_flos": 1.86923023515648e+19,
2714
- "train_loss": 0.1646428666052719,
2715
- "train_runtime": 3006.4065,
2716
- "train_samples_per_second": 4.989,
2717
- "train_steps_per_second": 1.247
2718
  },
2719
  {
2720
  "epoch": 49.02,
2721
- "eval_accuracy": 0.9290322580645162,
2722
- "eval_loss": 0.18336611986160278,
2723
- "eval_runtime": 15.2988,
2724
- "eval_samples_per_second": 10.132,
2725
- "eval_steps_per_second": 2.549,
2726
  "step": 3750
2727
  },
2728
  {
2729
  "epoch": 49.02,
2730
- "eval_accuracy": 0.9290322580645162,
2731
- "eval_loss": 0.18336611986160278,
2732
- "eval_runtime": 12.5538,
2733
- "eval_samples_per_second": 12.347,
2734
- "eval_steps_per_second": 3.107,
2735
  "step": 3750
2736
  }
2737
  ],
 
1
  {
2
+ "best_metric": 0.972972972972973,
3
+ "best_model_checkpoint": "videomae-base-finetuned-ucf101-subset/checkpoint-375",
4
  "epoch": 49.02,
5
  "global_step": 3750,
6
  "is_hyper_param_search": false,
 
10
  {
11
  "epoch": 0.0,
12
  "learning_rate": 1.3333333333333334e-06,
13
+ "loss": 2.4049,
14
  "step": 10
15
  },
16
  {
17
  "epoch": 0.01,
18
  "learning_rate": 2.666666666666667e-06,
19
+ "loss": 2.3782,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 0.01,
24
  "learning_rate": 4.000000000000001e-06,
25
+ "loss": 2.3345,
26
  "step": 30
27
  },
28
  {
29
  "epoch": 0.01,
30
  "learning_rate": 5.333333333333334e-06,
31
+ "loss": 2.3762,
32
  "step": 40
33
  },
34
  {
35
  "epoch": 0.01,
36
  "learning_rate": 6.666666666666667e-06,
37
+ "loss": 2.3277,
38
  "step": 50
39
  },
40
  {
41
  "epoch": 0.02,
42
  "learning_rate": 8.000000000000001e-06,
43
+ "loss": 2.2711,
44
  "step": 60
45
  },
46
  {
47
  "epoch": 0.02,
48
  "learning_rate": 9.333333333333334e-06,
49
+ "loss": 2.369,
50
  "step": 70
51
  },
52
  {
53
  "epoch": 0.02,
54
+ "eval_accuracy": 0.2972972972972973,
55
+ "eval_loss": 2.2216224670410156,
56
+ "eval_runtime": 2.6845,
57
+ "eval_samples_per_second": 13.783,
58
+ "eval_steps_per_second": 3.725,
59
  "step": 75
60
  },
61
  {
62
  "epoch": 1.0,
63
  "learning_rate": 1.0666666666666667e-05,
64
+ "loss": 2.2595,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 1.0,
69
  "learning_rate": 1.2e-05,
70
+ "loss": 2.2602,
71
  "step": 90
72
  },
73
  {
74
  "epoch": 1.01,
75
  "learning_rate": 1.3333333333333333e-05,
76
+ "loss": 2.1993,
77
  "step": 100
78
  },
79
  {
80
  "epoch": 1.01,
81
  "learning_rate": 1.4666666666666668e-05,
82
+ "loss": 2.2198,
83
  "step": 110
84
  },
85
  {
86
  "epoch": 1.01,
87
  "learning_rate": 1.6000000000000003e-05,
88
+ "loss": 2.1327,
89
  "step": 120
90
  },
91
  {
92
  "epoch": 1.01,
93
  "learning_rate": 1.7333333333333336e-05,
94
+ "loss": 2.0154,
95
  "step": 130
96
  },
97
  {
98
  "epoch": 1.02,
99
  "learning_rate": 1.866666666666667e-05,
100
+ "loss": 2.0248,
101
  "step": 140
102
  },
103
  {
104
  "epoch": 1.02,
105
  "learning_rate": 2e-05,
106
+ "loss": 1.8283,
107
  "step": 150
108
  },
109
  {
110
  "epoch": 1.02,
111
+ "eval_accuracy": 0.4864864864864865,
112
+ "eval_loss": 1.7583507299423218,
113
+ "eval_runtime": 2.7247,
114
+ "eval_samples_per_second": 13.58,
115
+ "eval_steps_per_second": 3.67,
116
  "step": 150
117
  },
118
  {
119
  "epoch": 2.0,
120
  "learning_rate": 2.1333333333333335e-05,
121
+ "loss": 1.5075,
122
  "step": 160
123
  },
124
  {
125
  "epoch": 2.01,
126
  "learning_rate": 2.2666666666666668e-05,
127
+ "loss": 1.4104,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 2.01,
132
  "learning_rate": 2.4e-05,
133
+ "loss": 1.1892,
134
  "step": 180
135
  },
136
  {
137
  "epoch": 2.01,
138
  "learning_rate": 2.5333333333333337e-05,
139
+ "loss": 0.9929,
140
  "step": 190
141
  },
142
  {
143
  "epoch": 2.01,
144
  "learning_rate": 2.6666666666666667e-05,
145
+ "loss": 0.8859,
146
  "step": 200
147
  },
148
  {
149
  "epoch": 2.02,
150
  "learning_rate": 2.8000000000000003e-05,
151
+ "loss": 0.7763,
152
  "step": 210
153
  },
154
  {
155
  "epoch": 2.02,
156
  "learning_rate": 2.9333333333333336e-05,
157
+ "loss": 0.8729,
158
  "step": 220
159
  },
160
  {
161
  "epoch": 2.02,
162
+ "eval_accuracy": 0.7027027027027027,
163
+ "eval_loss": 1.0192049741744995,
164
+ "eval_runtime": 2.6893,
165
+ "eval_samples_per_second": 13.758,
166
+ "eval_steps_per_second": 3.718,
167
  "step": 225
168
  },
169
  {
170
  "epoch": 3.0,
171
  "learning_rate": 3.066666666666667e-05,
172
+ "loss": 0.7605,
173
  "step": 230
174
  },
175
  {
176
  "epoch": 3.0,
177
  "learning_rate": 3.2000000000000005e-05,
178
+ "loss": 0.709,
179
  "step": 240
180
  },
181
  {
182
  "epoch": 3.01,
183
  "learning_rate": 3.3333333333333335e-05,
184
+ "loss": 0.4213,
185
  "step": 250
186
  },
187
  {
188
  "epoch": 3.01,
189
  "learning_rate": 3.466666666666667e-05,
190
+ "loss": 0.6716,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 3.01,
195
  "learning_rate": 3.6e-05,
196
+ "loss": 0.7687,
197
  "step": 270
198
  },
199
  {
200
  "epoch": 3.01,
201
  "learning_rate": 3.733333333333334e-05,
202
+ "loss": 0.4992,
203
  "step": 280
204
  },
205
  {
206
  "epoch": 3.02,
207
  "learning_rate": 3.866666666666667e-05,
208
+ "loss": 0.5896,
209
  "step": 290
210
  },
211
  {
212
  "epoch": 3.02,
213
  "learning_rate": 4e-05,
214
+ "loss": 0.4077,
215
  "step": 300
216
  },
217
  {
218
  "epoch": 3.02,
219
+ "eval_accuracy": 0.8378378378378378,
220
+ "eval_loss": 0.48492956161499023,
221
+ "eval_runtime": 2.6873,
222
+ "eval_samples_per_second": 13.768,
223
+ "eval_steps_per_second": 3.721,
224
  "step": 300
225
  },
226
  {
227
  "epoch": 4.0,
228
  "learning_rate": 4.133333333333333e-05,
229
+ "loss": 0.2703,
230
  "step": 310
231
  },
232
  {
233
  "epoch": 4.01,
234
  "learning_rate": 4.266666666666667e-05,
235
+ "loss": 0.3431,
236
  "step": 320
237
  },
238
  {
239
  "epoch": 4.01,
240
  "learning_rate": 4.4000000000000006e-05,
241
+ "loss": 0.4284,
242
  "step": 330
243
  },
244
  {
245
  "epoch": 4.01,
246
  "learning_rate": 4.5333333333333335e-05,
247
+ "loss": 0.5786,
248
  "step": 340
249
  },
250
  {
251
  "epoch": 4.01,
252
  "learning_rate": 4.666666666666667e-05,
253
+ "loss": 0.5347,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 4.02,
258
  "learning_rate": 4.8e-05,
259
+ "loss": 0.2638,
260
  "step": 360
261
  },
262
  {
263
  "epoch": 4.02,
264
  "learning_rate": 4.933333333333334e-05,
265
+ "loss": 0.3742,
266
  "step": 370
267
  },
268
  {
269
  "epoch": 4.02,
270
+ "eval_accuracy": 0.972972972972973,
271
+ "eval_loss": 0.1343977451324463,
272
+ "eval_runtime": 2.7118,
273
+ "eval_samples_per_second": 13.644,
274
+ "eval_steps_per_second": 3.688,
275
  "step": 375
276
  },
277
  {
278
  "epoch": 5.0,
279
  "learning_rate": 4.9925925925925926e-05,
280
+ "loss": 0.2036,
281
  "step": 380
282
  },
283
  {
284
  "epoch": 5.0,
285
  "learning_rate": 4.977777777777778e-05,
286
+ "loss": 0.1899,
287
  "step": 390
288
  },
289
  {
290
  "epoch": 5.01,
291
  "learning_rate": 4.962962962962963e-05,
292
+ "loss": 0.3761,
293
  "step": 400
294
  },
295
  {
296
  "epoch": 5.01,
297
  "learning_rate": 4.9481481481481485e-05,
298
+ "loss": 0.3704,
299
  "step": 410
300
  },
301
  {
302
  "epoch": 5.01,
303
  "learning_rate": 4.933333333333334e-05,
304
+ "loss": 0.2038,
305
  "step": 420
306
  },
307
  {
308
  "epoch": 5.01,
309
  "learning_rate": 4.918518518518519e-05,
310
+ "loss": 0.4751,
311
  "step": 430
312
  },
313
  {
314
  "epoch": 5.02,
315
  "learning_rate": 4.903703703703704e-05,
316
+ "loss": 0.1951,
317
  "step": 440
318
  },
319
  {
320
  "epoch": 5.02,
321
  "learning_rate": 4.888888888888889e-05,
322
+ "loss": 0.094,
323
  "step": 450
324
  },
325
  {
326
  "epoch": 5.02,
327
+ "eval_accuracy": 0.8918918918918919,
328
+ "eval_loss": 0.24485082924365997,
329
+ "eval_runtime": 2.7174,
330
+ "eval_samples_per_second": 13.616,
331
+ "eval_steps_per_second": 3.68,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 6.0,
336
  "learning_rate": 4.874074074074074e-05,
337
+ "loss": 0.1336,
338
  "step": 460
339
  },
340
  {
341
  "epoch": 6.01,
342
  "learning_rate": 4.8592592592592596e-05,
343
+ "loss": 0.0813,
344
  "step": 470
345
  },
346
  {
347
  "epoch": 6.01,
348
  "learning_rate": 4.844444444444445e-05,
349
+ "loss": 0.1722,
350
  "step": 480
351
  },
352
  {
353
  "epoch": 6.01,
354
  "learning_rate": 4.82962962962963e-05,
355
+ "loss": 0.5579,
356
  "step": 490
357
  },
358
  {
359
  "epoch": 6.01,
360
  "learning_rate": 4.814814814814815e-05,
361
+ "loss": 0.3787,
362
  "step": 500
363
  },
364
  {
365
  "epoch": 6.02,
366
  "learning_rate": 4.8e-05,
367
+ "loss": 0.5287,
368
  "step": 510
369
  },
370
  {
371
  "epoch": 6.02,
372
  "learning_rate": 4.7851851851851854e-05,
373
+ "loss": 0.1005,
374
  "step": 520
375
  },
376
  {
377
  "epoch": 6.02,
378
+ "eval_accuracy": 0.7837837837837838,
379
+ "eval_loss": 1.0794074535369873,
380
+ "eval_runtime": 2.7101,
381
+ "eval_samples_per_second": 13.653,
382
+ "eval_steps_per_second": 3.69,
383
  "step": 525
384
  },
385
  {
386
  "epoch": 7.0,
387
  "learning_rate": 4.770370370370371e-05,
388
+ "loss": 0.5154,
389
  "step": 530
390
  },
391
  {
392
  "epoch": 7.0,
393
  "learning_rate": 4.755555555555556e-05,
394
+ "loss": 0.1985,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 7.01,
399
  "learning_rate": 4.740740740740741e-05,
400
+ "loss": 0.2665,
401
  "step": 550
402
  },
403
  {
404
  "epoch": 7.01,
405
  "learning_rate": 4.7259259259259266e-05,
406
+ "loss": 0.0321,
407
  "step": 560
408
  },
409
  {
410
  "epoch": 7.01,
411
  "learning_rate": 4.711111111111111e-05,
412
+ "loss": 0.159,
413
  "step": 570
414
  },
415
  {
416
  "epoch": 7.01,
417
  "learning_rate": 4.6962962962962966e-05,
418
+ "loss": 0.3711,
419
  "step": 580
420
  },
421
  {
422
  "epoch": 7.02,
423
  "learning_rate": 4.681481481481482e-05,
424
+ "loss": 0.0089,
425
  "step": 590
426
  },
427
  {
428
  "epoch": 7.02,
429
  "learning_rate": 4.666666666666667e-05,
430
+ "loss": 0.0053,
431
  "step": 600
432
  },
433
  {
434
  "epoch": 7.02,
435
+ "eval_accuracy": 0.9459459459459459,
436
+ "eval_loss": 0.23636393249034882,
437
+ "eval_runtime": 2.6811,
438
+ "eval_samples_per_second": 13.8,
439
+ "eval_steps_per_second": 3.73,
440
  "step": 600
441
  },
442
  {
443
  "epoch": 8.0,
444
  "learning_rate": 4.6518518518518525e-05,
445
+ "loss": 0.0056,
446
  "step": 610
447
  },
448
  {
449
  "epoch": 8.01,
450
  "learning_rate": 4.637037037037038e-05,
451
+ "loss": 0.3093,
452
  "step": 620
453
  },
454
  {
455
  "epoch": 8.01,
456
  "learning_rate": 4.6222222222222224e-05,
457
+ "loss": 0.3216,
458
  "step": 630
459
  },
460
  {
461
  "epoch": 8.01,
462
  "learning_rate": 4.607407407407408e-05,
463
+ "loss": 0.1304,
464
  "step": 640
465
  },
466
  {
467
  "epoch": 8.01,
468
  "learning_rate": 4.592592592592593e-05,
469
+ "loss": 0.148,
470
  "step": 650
471
  },
472
  {
473
  "epoch": 8.02,
474
  "learning_rate": 4.577777777777778e-05,
475
+ "loss": 0.132,
476
  "step": 660
477
  },
478
  {
479
  "epoch": 8.02,
480
  "learning_rate": 4.5629629629629636e-05,
481
+ "loss": 0.0807,
482
  "step": 670
483
  },
484
  {
485
  "epoch": 8.02,
486
+ "eval_accuracy": 0.8378378378378378,
487
+ "eval_loss": 0.6658951640129089,
488
+ "eval_runtime": 2.6902,
489
+ "eval_samples_per_second": 13.754,
490
+ "eval_steps_per_second": 3.717,
491
  "step": 675
492
  },
493
  {
494
  "epoch": 9.0,
495
  "learning_rate": 4.548148148148149e-05,
496
+ "loss": 0.0775,
497
  "step": 680
498
  },
499
  {
500
  "epoch": 9.0,
501
  "learning_rate": 4.5333333333333335e-05,
502
+ "loss": 0.1351,
503
  "step": 690
504
  },
505
  {
506
  "epoch": 9.01,
507
  "learning_rate": 4.518518518518519e-05,
508
+ "loss": 0.1521,
509
  "step": 700
510
  },
511
  {
512
  "epoch": 9.01,
513
  "learning_rate": 4.503703703703704e-05,
514
+ "loss": 0.4525,
515
  "step": 710
516
  },
517
  {
518
  "epoch": 9.01,
519
  "learning_rate": 4.4888888888888894e-05,
520
+ "loss": 0.4013,
521
  "step": 720
522
  },
523
  {
524
  "epoch": 9.01,
525
  "learning_rate": 4.474074074074075e-05,
526
+ "loss": 0.0261,
527
  "step": 730
528
  },
529
  {
530
  "epoch": 9.02,
531
  "learning_rate": 4.4592592592592594e-05,
532
+ "loss": 0.0091,
533
  "step": 740
534
  },
535
  {
536
  "epoch": 9.02,
537
  "learning_rate": 4.4444444444444447e-05,
538
+ "loss": 0.0031,
539
  "step": 750
540
  },
541
  {
542
  "epoch": 9.02,
543
+ "eval_accuracy": 0.918918918918919,
544
+ "eval_loss": 0.4496133029460907,
545
+ "eval_runtime": 2.7338,
546
+ "eval_samples_per_second": 13.534,
547
+ "eval_steps_per_second": 3.658,
548
  "step": 750
549
  },
550
  {
551
  "epoch": 10.0,
552
  "learning_rate": 4.42962962962963e-05,
553
+ "loss": 0.0035,
554
  "step": 760
555
  },
556
  {
557
  "epoch": 10.01,
558
  "learning_rate": 4.414814814814815e-05,
559
+ "loss": 0.0199,
560
  "step": 770
561
  },
562
  {
563
  "epoch": 10.01,
564
  "learning_rate": 4.4000000000000006e-05,
565
+ "loss": 0.0028,
566
  "step": 780
567
  },
568
  {
569
  "epoch": 10.01,
570
  "learning_rate": 4.385185185185185e-05,
571
+ "loss": 0.0021,
572
  "step": 790
573
  },
574
  {
575
  "epoch": 10.01,
576
  "learning_rate": 4.3703703703703705e-05,
577
+ "loss": 0.0035,
578
  "step": 800
579
  },
580
  {
581
  "epoch": 10.02,
582
  "learning_rate": 4.355555555555556e-05,
583
+ "loss": 0.0025,
584
  "step": 810
585
  },
586
  {
587
  "epoch": 10.02,
588
  "learning_rate": 4.340740740740741e-05,
589
+ "loss": 0.0203,
590
  "step": 820
591
  },
592
  {
593
  "epoch": 10.02,
594
+ "eval_accuracy": 0.918918918918919,
595
+ "eval_loss": 0.3398858308792114,
596
+ "eval_runtime": 2.8361,
597
+ "eval_samples_per_second": 13.046,
598
+ "eval_steps_per_second": 3.526,
599
  "step": 825
600
  },
601
  {
602
  "epoch": 11.0,
603
  "learning_rate": 4.325925925925926e-05,
604
+ "loss": 0.0042,
605
  "step": 830
606
  },
607
  {
608
  "epoch": 11.0,
609
  "learning_rate": 4.311111111111111e-05,
610
+ "loss": 0.0024,
611
  "step": 840
612
  },
613
  {
614
  "epoch": 11.01,
615
  "learning_rate": 4.296296296296296e-05,
616
+ "loss": 0.0023,
617
  "step": 850
618
  },
619
  {
620
  "epoch": 11.01,
621
  "learning_rate": 4.2814814814814816e-05,
622
+ "loss": 0.0266,
623
  "step": 860
624
  },
625
  {
626
  "epoch": 11.01,
627
  "learning_rate": 4.266666666666667e-05,
628
+ "loss": 0.018,
629
  "step": 870
630
  },
631
  {
632
  "epoch": 11.01,
633
  "learning_rate": 4.2518518518518515e-05,
634
+ "loss": 0.0028,
635
  "step": 880
636
  },
637
  {
638
  "epoch": 11.02,
639
  "learning_rate": 4.237037037037037e-05,
640
+ "loss": 0.2435,
641
  "step": 890
642
  },
643
  {
644
  "epoch": 11.02,
645
  "learning_rate": 4.222222222222222e-05,
646
+ "loss": 0.0093,
647
  "step": 900
648
  },
649
  {
650
  "epoch": 11.02,
651
+ "eval_accuracy": 0.9459459459459459,
652
+ "eval_loss": 0.3724738359451294,
653
+ "eval_runtime": 2.729,
654
+ "eval_samples_per_second": 13.558,
655
+ "eval_steps_per_second": 3.664,
656
  "step": 900
657
  },
658
  {
659
  "epoch": 12.0,
660
  "learning_rate": 4.2074074074074075e-05,
661
+ "loss": 0.1554,
662
  "step": 910
663
  },
664
  {
665
  "epoch": 12.01,
666
  "learning_rate": 4.192592592592593e-05,
667
+ "loss": 0.0049,
668
  "step": 920
669
  },
670
  {
671
  "epoch": 12.01,
672
  "learning_rate": 4.177777777777778e-05,
673
+ "loss": 0.1019,
674
  "step": 930
675
  },
676
  {
677
  "epoch": 12.01,
678
  "learning_rate": 4.162962962962963e-05,
679
+ "loss": 0.2167,
680
  "step": 940
681
  },
682
  {
683
  "epoch": 12.01,
684
  "learning_rate": 4.148148148148148e-05,
685
+ "loss": 0.1541,
686
  "step": 950
687
  },
688
  {
689
  "epoch": 12.02,
690
  "learning_rate": 4.133333333333333e-05,
691
+ "loss": 0.0032,
692
  "step": 960
693
  },
694
  {
695
  "epoch": 12.02,
696
  "learning_rate": 4.1185185185185186e-05,
697
+ "loss": 0.0022,
698
  "step": 970
699
  },
700
  {
701
  "epoch": 12.02,
702
+ "eval_accuracy": 0.918918918918919,
703
+ "eval_loss": 0.5498412847518921,
704
+ "eval_runtime": 2.7424,
705
+ "eval_samples_per_second": 13.492,
706
+ "eval_steps_per_second": 3.647,
707
  "step": 975
708
  },
709
  {
710
  "epoch": 13.0,
711
  "learning_rate": 4.103703703703704e-05,
712
+ "loss": 0.1514,
713
  "step": 980
714
  },
715
  {
716
  "epoch": 13.0,
717
  "learning_rate": 4.088888888888889e-05,
718
+ "loss": 0.0019,
719
  "step": 990
720
  },
721
  {
722
  "epoch": 13.01,
723
  "learning_rate": 4.074074074074074e-05,
724
+ "loss": 0.0018,
725
  "step": 1000
726
  },
727
  {
728
  "epoch": 13.01,
729
  "learning_rate": 4.059259259259259e-05,
730
+ "loss": 0.0017,
731
  "step": 1010
732
  },
733
  {
734
  "epoch": 13.01,
735
  "learning_rate": 4.0444444444444444e-05,
736
+ "loss": 0.0017,
737
  "step": 1020
738
  },
739
  {
740
  "epoch": 13.01,
741
  "learning_rate": 4.02962962962963e-05,
742
+ "loss": 0.0015,
743
  "step": 1030
744
  },
745
  {
746
  "epoch": 13.02,
747
  "learning_rate": 4.014814814814815e-05,
748
+ "loss": 0.0015,
749
  "step": 1040
750
  },
751
  {
752
  "epoch": 13.02,
753
  "learning_rate": 4e-05,
754
+ "loss": 0.0017,
755
  "step": 1050
756
  },
757
  {
758
  "epoch": 13.02,
759
+ "eval_accuracy": 0.972972972972973,
760
+ "eval_loss": 0.16978278756141663,
761
+ "eval_runtime": 2.7539,
762
+ "eval_samples_per_second": 13.436,
763
+ "eval_steps_per_second": 3.631,
764
  "step": 1050
765
  },
766
  {
767
  "epoch": 14.0,
768
  "learning_rate": 3.985185185185185e-05,
769
+ "loss": 0.0015,
770
  "step": 1060
771
  },
772
  {
773
  "epoch": 14.01,
774
  "learning_rate": 3.97037037037037e-05,
775
+ "loss": 0.0014,
776
  "step": 1070
777
  },
778
  {
779
  "epoch": 14.01,
780
  "learning_rate": 3.9555555555555556e-05,
781
+ "loss": 0.0014,
782
  "step": 1080
783
  },
784
  {
785
  "epoch": 14.01,
786
  "learning_rate": 3.940740740740741e-05,
787
+ "loss": 0.0013,
788
  "step": 1090
789
  },
790
  {
791
  "epoch": 14.01,
792
  "learning_rate": 3.925925925925926e-05,
793
+ "loss": 0.0012,
794
  "step": 1100
795
  },
796
  {
797
  "epoch": 14.02,
798
  "learning_rate": 3.9111111111111115e-05,
799
+ "loss": 0.0013,
800
  "step": 1110
801
  },
802
  {
803
  "epoch": 14.02,
804
  "learning_rate": 3.896296296296296e-05,
805
+ "loss": 0.0014,
806
  "step": 1120
807
  },
808
  {
809
  "epoch": 14.02,
810
+ "eval_accuracy": 0.9459459459459459,
811
+ "eval_loss": 0.19233068823814392,
812
+ "eval_runtime": 2.8803,
813
+ "eval_samples_per_second": 12.846,
814
+ "eval_steps_per_second": 3.472,
815
  "step": 1125
816
  },
817
  {
818
  "epoch": 15.0,
819
  "learning_rate": 3.8814814814814814e-05,
820
+ "loss": 0.0012,
821
  "step": 1130
822
  },
823
  {
824
  "epoch": 15.0,
825
  "learning_rate": 3.866666666666667e-05,
826
+ "loss": 0.0013,
827
  "step": 1140
828
  },
829
  {
830
  "epoch": 15.01,
831
  "learning_rate": 3.851851851851852e-05,
832
+ "loss": 0.0206,
833
  "step": 1150
834
  },
835
  {
836
  "epoch": 15.01,
837
  "learning_rate": 3.837037037037037e-05,
838
+ "loss": 0.0014,
839
  "step": 1160
840
  },
841
  {
842
  "epoch": 15.01,
843
  "learning_rate": 3.8222222222222226e-05,
844
+ "loss": 0.2069,
845
  "step": 1170
846
  },
847
  {
848
  "epoch": 15.01,
849
  "learning_rate": 3.807407407407408e-05,
850
+ "loss": 0.0014,
851
  "step": 1180
852
  },
853
  {
854
  "epoch": 15.02,
855
  "learning_rate": 3.7925925925925925e-05,
856
+ "loss": 0.0012,
857
  "step": 1190
858
  },
859
  {
860
  "epoch": 15.02,
861
  "learning_rate": 3.777777777777778e-05,
862
+ "loss": 0.0014,
863
  "step": 1200
864
  },
865
  {
866
  "epoch": 15.02,
867
+ "eval_accuracy": 0.972972972972973,
868
+ "eval_loss": 0.15707702934741974,
869
+ "eval_runtime": 2.8394,
870
+ "eval_samples_per_second": 13.031,
871
+ "eval_steps_per_second": 3.522,
872
  "step": 1200
873
  },
874
  {
875
  "epoch": 16.0,
876
  "learning_rate": 3.762962962962963e-05,
877
+ "loss": 0.0012,
878
  "step": 1210
879
  },
880
  {
881
  "epoch": 16.01,
882
  "learning_rate": 3.7481481481481484e-05,
883
+ "loss": 0.0018,
884
  "step": 1220
885
  },
886
  {
887
  "epoch": 16.01,
888
  "learning_rate": 3.733333333333334e-05,
889
+ "loss": 0.1871,
890
  "step": 1230
891
  },
892
  {
 
904
  {
905
  "epoch": 16.02,
906
  "learning_rate": 3.688888888888889e-05,
907
+ "loss": 0.132,
908
  "step": 1260
909
  },
910
  {
911
  "epoch": 16.02,
912
  "learning_rate": 3.674074074074074e-05,
913
+ "loss": 0.0474,
914
  "step": 1270
915
  },
916
  {
917
  "epoch": 16.02,
918
+ "eval_accuracy": 0.8918918918918919,
919
+ "eval_loss": 0.5192855596542358,
920
+ "eval_runtime": 2.8857,
921
+ "eval_samples_per_second": 12.822,
922
+ "eval_steps_per_second": 3.465,
923
  "step": 1275
924
  },
925
  {
926
  "epoch": 17.0,
927
  "learning_rate": 3.6592592592592596e-05,
928
+ "loss": 0.0019,
929
  "step": 1280
930
  },
931
  {
932
  "epoch": 17.0,
933
  "learning_rate": 3.644444444444445e-05,
934
+ "loss": 0.0013,
935
  "step": 1290
936
  },
937
  {
938
  "epoch": 17.01,
939
  "learning_rate": 3.62962962962963e-05,
940
+ "loss": 0.1087,
941
  "step": 1300
942
  },
943
  {
944
  "epoch": 17.01,
945
  "learning_rate": 3.614814814814815e-05,
946
+ "loss": 0.081,
947
  "step": 1310
948
  },
949
  {
950
  "epoch": 17.01,
951
  "learning_rate": 3.6e-05,
952
+ "loss": 0.0019,
953
  "step": 1320
954
  },
955
  {
956
  "epoch": 17.01,
957
  "learning_rate": 3.5851851851851854e-05,
958
+ "loss": 0.0012,
959
  "step": 1330
960
  },
961
  {
962
  "epoch": 17.02,
963
  "learning_rate": 3.570370370370371e-05,
964
+ "loss": 0.0012,
965
  "step": 1340
966
  },
967
  {
968
  "epoch": 17.02,
969
  "learning_rate": 3.555555555555556e-05,
970
+ "loss": 0.0011,
971
  "step": 1350
972
  },
973
  {
974
  "epoch": 17.02,
975
+ "eval_accuracy": 0.972972972972973,
976
+ "eval_loss": 0.14079852402210236,
977
+ "eval_runtime": 3.0095,
978
+ "eval_samples_per_second": 12.294,
979
+ "eval_steps_per_second": 3.323,
980
  "step": 1350
981
  },
982
  {
983
  "epoch": 18.0,
984
  "learning_rate": 3.540740740740741e-05,
985
+ "loss": 0.0009,
986
  "step": 1360
987
  },
988
  {
989
  "epoch": 18.01,
990
  "learning_rate": 3.525925925925926e-05,
991
+ "loss": 0.0011,
992
  "step": 1370
993
  },
994
  {
995
  "epoch": 18.01,
996
  "learning_rate": 3.511111111111111e-05,
997
+ "loss": 0.0277,
998
  "step": 1380
999
  },
1000
  {
1001
  "epoch": 18.01,
1002
  "learning_rate": 3.4962962962962965e-05,
1003
+ "loss": 0.0011,
1004
  "step": 1390
1005
  },
1006
  {
1007
  "epoch": 18.01,
1008
  "learning_rate": 3.481481481481482e-05,
1009
+ "loss": 0.0011,
1010
  "step": 1400
1011
  },
1012
  {
1013
  "epoch": 18.02,
1014
  "learning_rate": 3.466666666666667e-05,
1015
+ "loss": 0.1377,
1016
  "step": 1410
1017
  },
1018
  {
1019
  "epoch": 18.02,
1020
  "learning_rate": 3.4518518518518524e-05,
1021
+ "loss": 0.001,
1022
  "step": 1420
1023
  },
1024
  {
1025
  "epoch": 18.02,
1026
+ "eval_accuracy": 0.9459459459459459,
1027
+ "eval_loss": 0.34057652950286865,
1028
+ "eval_runtime": 2.8674,
1029
+ "eval_samples_per_second": 12.904,
1030
+ "eval_steps_per_second": 3.488,
1031
  "step": 1425
1032
  },
1033
  {
1034
  "epoch": 19.0,
1035
  "learning_rate": 3.437037037037037e-05,
1036
+ "loss": 0.002,
1037
  "step": 1430
1038
  },
1039
  {
1040
  "epoch": 19.0,
1041
  "learning_rate": 3.4222222222222224e-05,
1042
+ "loss": 0.0018,
1043
  "step": 1440
1044
  },
1045
  {
1046
  "epoch": 19.01,
1047
  "learning_rate": 3.4074074074074077e-05,
1048
+ "loss": 0.2287,
1049
  "step": 1450
1050
  },
1051
  {
1052
  "epoch": 19.01,
1053
  "learning_rate": 3.392592592592593e-05,
1054
+ "loss": 0.102,
1055
  "step": 1460
1056
  },
1057
  {
1058
  "epoch": 19.01,
1059
  "learning_rate": 3.377777777777778e-05,
1060
+ "loss": 0.0785,
1061
  "step": 1470
1062
  },
1063
  {
1064
  "epoch": 19.01,
1065
  "learning_rate": 3.3629629629629636e-05,
1066
+ "loss": 0.002,
1067
  "step": 1480
1068
  },
1069
  {
1070
  "epoch": 19.02,
1071
  "learning_rate": 3.348148148148148e-05,
1072
+ "loss": 0.0357,
1073
  "step": 1490
1074
  },
1075
  {
1076
  "epoch": 19.02,
1077
  "learning_rate": 3.3333333333333335e-05,
1078
+ "loss": 0.0034,
1079
  "step": 1500
1080
  },
1081
  {
1082
  "epoch": 19.02,
1083
+ "eval_accuracy": 0.9459459459459459,
1084
+ "eval_loss": 0.25160789489746094,
1085
+ "eval_runtime": 2.8519,
1086
+ "eval_samples_per_second": 12.974,
1087
+ "eval_steps_per_second": 3.506,
1088
  "step": 1500
1089
  },
1090
  {
1091
  "epoch": 20.0,
1092
  "learning_rate": 3.318518518518519e-05,
1093
+ "loss": 0.0609,
1094
  "step": 1510
1095
  },
1096
  {
1097
  "epoch": 20.01,
1098
  "learning_rate": 3.303703703703704e-05,
1099
+ "loss": 0.0013,
1100
  "step": 1520
1101
  },
1102
  {
1103
  "epoch": 20.01,
1104
  "learning_rate": 3.2888888888888894e-05,
1105
+ "loss": 0.0012,
1106
  "step": 1530
1107
  },
1108
  {
1109
  "epoch": 20.01,
1110
  "learning_rate": 3.274074074074075e-05,
1111
+ "loss": 0.0014,
1112
  "step": 1540
1113
  },
1114
  {
1115
  "epoch": 20.01,
1116
  "learning_rate": 3.25925925925926e-05,
1117
+ "loss": 0.1104,
1118
  "step": 1550
1119
  },
1120
  {
1121
  "epoch": 20.02,
1122
  "learning_rate": 3.2444444444444446e-05,
1123
+ "loss": 0.0011,
1124
  "step": 1560
1125
  },
1126
  {
1127
  "epoch": 20.02,
1128
  "learning_rate": 3.22962962962963e-05,
1129
+ "loss": 0.0029,
1130
  "step": 1570
1131
  },
1132
  {
1133
  "epoch": 20.02,
1134
+ "eval_accuracy": 0.918918918918919,
1135
+ "eval_loss": 0.29616308212280273,
1136
+ "eval_runtime": 2.9222,
1137
+ "eval_samples_per_second": 12.662,
1138
+ "eval_steps_per_second": 3.422,
1139
  "step": 1575
1140
  },
1141
  {
1142
  "epoch": 21.0,
1143
  "learning_rate": 3.214814814814815e-05,
1144
+ "loss": 0.0014,
1145
  "step": 1580
1146
  },
1147
  {
1148
  "epoch": 21.0,
1149
  "learning_rate": 3.2000000000000005e-05,
1150
+ "loss": 0.0769,
1151
  "step": 1590
1152
  },
1153
  {
1154
  "epoch": 21.01,
1155
  "learning_rate": 3.185185185185185e-05,
1156
+ "loss": 0.001,
1157
  "step": 1600
1158
  },
1159
  {
1160
  "epoch": 21.01,
1161
  "learning_rate": 3.1703703703703705e-05,
1162
+ "loss": 0.001,
1163
  "step": 1610
1164
  },
1165
  {
1166
  "epoch": 21.01,
1167
  "learning_rate": 3.155555555555556e-05,
1168
+ "loss": 0.1219,
1169
  "step": 1620
1170
  },
1171
  {
1172
  "epoch": 21.01,
1173
  "learning_rate": 3.140740740740741e-05,
1174
+ "loss": 0.001,
1175
  "step": 1630
1176
  },
1177
  {
1178
  "epoch": 21.02,
1179
  "learning_rate": 3.1259259259259264e-05,
1180
+ "loss": 0.0013,
1181
  "step": 1640
1182
  },
1183
  {
1184
  "epoch": 21.02,
1185
  "learning_rate": 3.111111111111111e-05,
1186
+ "loss": 0.0008,
1187
  "step": 1650
1188
  },
1189
  {
1190
  "epoch": 21.02,
1191
+ "eval_accuracy": 0.918918918918919,
1192
+ "eval_loss": 0.4023502469062805,
1193
+ "eval_runtime": 2.9031,
1194
+ "eval_samples_per_second": 12.745,
1195
+ "eval_steps_per_second": 3.445,
1196
  "step": 1650
1197
  },
1198
  {
1199
  "epoch": 22.0,
1200
  "learning_rate": 3.096296296296296e-05,
1201
+ "loss": 0.001,
1202
  "step": 1660
1203
  },
1204
  {
 
1210
  {
1211
  "epoch": 22.01,
1212
  "learning_rate": 3.066666666666667e-05,
1213
+ "loss": 0.0022,
1214
  "step": 1680
1215
  },
1216
  {
1217
  "epoch": 22.01,
1218
  "learning_rate": 3.0518518518518515e-05,
1219
+ "loss": 0.0009,
1220
  "step": 1690
1221
  },
1222
  {
 
1228
  {
1229
  "epoch": 22.02,
1230
  "learning_rate": 3.0222222222222225e-05,
1231
+ "loss": 0.0008,
1232
  "step": 1710
1233
  },
1234
  {
 
1239
  },
1240
  {
1241
  "epoch": 22.02,
1242
+ "eval_accuracy": 0.918918918918919,
1243
+ "eval_loss": 0.4643724262714386,
1244
+ "eval_runtime": 2.9019,
1245
+ "eval_samples_per_second": 12.75,
1246
+ "eval_steps_per_second": 3.446,
1247
  "step": 1725
1248
  },
1249
  {
1250
  "epoch": 23.0,
1251
  "learning_rate": 2.992592592592593e-05,
1252
+ "loss": 0.0009,
1253
  "step": 1730
1254
  },
1255
  {
 
1261
  {
1262
  "epoch": 23.01,
1263
  "learning_rate": 2.962962962962963e-05,
1264
+ "loss": 0.0336,
1265
  "step": 1750
1266
  },
1267
  {
1268
  "epoch": 23.01,
1269
  "learning_rate": 2.9481481481481483e-05,
1270
+ "loss": 0.0044,
1271
  "step": 1760
1272
  },
1273
  {
 
1279
  {
1280
  "epoch": 23.01,
1281
  "learning_rate": 2.918518518518519e-05,
1282
+ "loss": 0.001,
1283
  "step": 1780
1284
  },
1285
  {
1286
  "epoch": 23.02,
1287
  "learning_rate": 2.9037037037037042e-05,
1288
+ "loss": 0.0008,
1289
  "step": 1790
1290
  },
1291
  {
1292
  "epoch": 23.02,
1293
  "learning_rate": 2.8888888888888888e-05,
1294
+ "loss": 0.1521,
1295
  "step": 1800
1296
  },
1297
  {
1298
  "epoch": 23.02,
1299
+ "eval_accuracy": 0.918918918918919,
1300
+ "eval_loss": 0.48252156376838684,
1301
+ "eval_runtime": 2.9421,
1302
+ "eval_samples_per_second": 12.576,
1303
+ "eval_steps_per_second": 3.399,
1304
  "step": 1800
1305
  },
1306
  {
1307
  "epoch": 24.0,
1308
  "learning_rate": 2.874074074074074e-05,
1309
+ "loss": 0.0023,
1310
  "step": 1810
1311
  },
1312
  {
1313
  "epoch": 24.01,
1314
  "learning_rate": 2.8592592592592594e-05,
1315
+ "loss": 0.0008,
1316
  "step": 1820
1317
  },
1318
  {
1319
  "epoch": 24.01,
1320
  "learning_rate": 2.8444444444444447e-05,
1321
+ "loss": 0.0074,
1322
  "step": 1830
1323
  },
1324
  {
1325
  "epoch": 24.01,
1326
  "learning_rate": 2.8296296296296297e-05,
1327
+ "loss": 0.0009,
1328
  "step": 1840
1329
  },
1330
  {
 
1336
  {
1337
  "epoch": 24.02,
1338
  "learning_rate": 2.8000000000000003e-05,
1339
+ "loss": 0.0007,
1340
  "step": 1860
1341
  },
1342
  {
1343
  "epoch": 24.02,
1344
  "learning_rate": 2.7851851851851853e-05,
1345
+ "loss": 0.001,
1346
  "step": 1870
1347
  },
1348
  {
1349
  "epoch": 24.02,
1350
+ "eval_accuracy": 0.918918918918919,
1351
+ "eval_loss": 0.6339796185493469,
1352
+ "eval_runtime": 2.8461,
1353
+ "eval_samples_per_second": 13.0,
1354
+ "eval_steps_per_second": 3.514,
1355
  "step": 1875
1356
  },
1357
  {
 
1375
  {
1376
  "epoch": 25.01,
1377
  "learning_rate": 2.725925925925926e-05,
1378
+ "loss": 0.001,
1379
  "step": 1910
1380
  },
1381
  {
1382
  "epoch": 25.01,
1383
  "learning_rate": 2.7111111111111114e-05,
1384
+ "loss": 0.0007,
1385
  "step": 1920
1386
  },
1387
  {
1388
  "epoch": 25.01,
1389
  "learning_rate": 2.696296296296296e-05,
1390
+ "loss": 0.0007,
1391
  "step": 1930
1392
  },
1393
  {
1394
  "epoch": 25.02,
1395
  "learning_rate": 2.6814814814814814e-05,
1396
+ "loss": 0.0007,
1397
  "step": 1940
1398
  },
1399
  {
1400
  "epoch": 25.02,
1401
  "learning_rate": 2.6666666666666667e-05,
1402
+ "loss": 0.0245,
1403
  "step": 1950
1404
  },
1405
  {
1406
  "epoch": 25.02,
1407
+ "eval_accuracy": 0.9459459459459459,
1408
+ "eval_loss": 0.3778836727142334,
1409
+ "eval_runtime": 2.8022,
1410
+ "eval_samples_per_second": 13.204,
1411
+ "eval_steps_per_second": 3.569,
1412
  "step": 1950
1413
  },
1414
  {
1415
  "epoch": 26.0,
1416
  "learning_rate": 2.651851851851852e-05,
1417
+ "loss": 0.0007,
1418
  "step": 1960
1419
  },
1420
  {
1421
  "epoch": 26.01,
1422
  "learning_rate": 2.6370370370370373e-05,
1423
+ "loss": 0.0007,
1424
  "step": 1970
1425
  },
1426
  {
1427
  "epoch": 26.01,
1428
  "learning_rate": 2.6222222222222226e-05,
1429
+ "loss": 0.0007,
1430
  "step": 1980
1431
  },
1432
  {
 
1438
  {
1439
  "epoch": 26.01,
1440
  "learning_rate": 2.5925925925925925e-05,
1441
+ "loss": 0.0947,
1442
  "step": 2000
1443
  },
1444
  {
1445
  "epoch": 26.02,
1446
  "learning_rate": 2.5777777777777778e-05,
1447
+ "loss": 0.001,
1448
  "step": 2010
1449
  },
1450
  {
1451
  "epoch": 26.02,
1452
  "learning_rate": 2.562962962962963e-05,
1453
+ "loss": 0.0007,
1454
  "step": 2020
1455
  },
1456
  {
1457
  "epoch": 26.02,
1458
+ "eval_accuracy": 0.9459459459459459,
1459
+ "eval_loss": 0.3375699818134308,
1460
+ "eval_runtime": 2.7928,
1461
+ "eval_samples_per_second": 13.248,
1462
+ "eval_steps_per_second": 3.581,
1463
  "step": 2025
1464
  },
1465
  {
1466
  "epoch": 27.0,
1467
  "learning_rate": 2.5481481481481484e-05,
1468
+ "loss": 0.0006,
1469
  "step": 2030
1470
  },
1471
  {
1472
  "epoch": 27.0,
1473
  "learning_rate": 2.5333333333333337e-05,
1474
+ "loss": 0.0012,
1475
  "step": 2040
1476
  },
1477
  {
1478
  "epoch": 27.01,
1479
  "learning_rate": 2.5185185185185183e-05,
1480
+ "loss": 0.0007,
1481
  "step": 2050
1482
  },
1483
  {
1484
  "epoch": 27.01,
1485
  "learning_rate": 2.5037037037037036e-05,
1486
+ "loss": 0.0007,
1487
  "step": 2060
1488
  },
1489
  {
1490
  "epoch": 27.01,
1491
  "learning_rate": 2.488888888888889e-05,
1492
+ "loss": 0.0927,
1493
  "step": 2070
1494
  },
1495
  {
1496
  "epoch": 27.01,
1497
  "learning_rate": 2.4740740740740742e-05,
1498
+ "loss": 0.0817,
1499
  "step": 2080
1500
  },
1501
  {
1502
  "epoch": 27.02,
1503
  "learning_rate": 2.4592592592592595e-05,
1504
+ "loss": 0.2909,
1505
  "step": 2090
1506
  },
1507
  {
1508
  "epoch": 27.02,
1509
  "learning_rate": 2.4444444444444445e-05,
1510
+ "loss": 0.0011,
1511
  "step": 2100
1512
  },
1513
  {
1514
  "epoch": 27.02,
1515
+ "eval_accuracy": 0.9459459459459459,
1516
+ "eval_loss": 0.28334298729896545,
1517
+ "eval_runtime": 2.7397,
1518
+ "eval_samples_per_second": 13.505,
1519
+ "eval_steps_per_second": 3.65,
1520
  "step": 2100
1521
  },
1522
  {
1523
  "epoch": 28.0,
1524
  "learning_rate": 2.4296296296296298e-05,
1525
+ "loss": 0.0007,
1526
  "step": 2110
1527
  },
1528
  {
1529
  "epoch": 28.01,
1530
  "learning_rate": 2.414814814814815e-05,
1531
+ "loss": 0.0007,
1532
  "step": 2120
1533
  },
1534
  {
1535
  "epoch": 28.01,
1536
  "learning_rate": 2.4e-05,
1537
+ "loss": 0.0007,
1538
  "step": 2130
1539
  },
1540
  {
1541
  "epoch": 28.01,
1542
  "learning_rate": 2.3851851851851854e-05,
1543
+ "loss": 0.0007,
1544
  "step": 2140
1545
  },
1546
  {
1547
  "epoch": 28.01,
1548
  "learning_rate": 2.3703703703703707e-05,
1549
+ "loss": 0.0007,
1550
  "step": 2150
1551
  },
1552
  {
 
1558
  {
1559
  "epoch": 28.02,
1560
  "learning_rate": 2.340740740740741e-05,
1561
+ "loss": 0.0008,
1562
  "step": 2170
1563
  },
1564
  {
1565
  "epoch": 28.02,
1566
+ "eval_accuracy": 0.972972972972973,
1567
+ "eval_loss": 0.15925714373588562,
1568
+ "eval_runtime": 2.6329,
1569
+ "eval_samples_per_second": 14.053,
1570
+ "eval_steps_per_second": 3.798,
1571
  "step": 2175
1572
  },
1573
  {
 
1579
  {
1580
  "epoch": 29.0,
1581
  "learning_rate": 2.3111111111111112e-05,
1582
+ "loss": 0.0015,
1583
  "step": 2190
1584
  },
1585
  {
1586
  "epoch": 29.01,
1587
  "learning_rate": 2.2962962962962965e-05,
1588
+ "loss": 0.0008,
1589
  "step": 2200
1590
  },
1591
  {
1592
  "epoch": 29.01,
1593
  "learning_rate": 2.2814814814814818e-05,
1594
+ "loss": 0.0007,
1595
  "step": 2210
1596
  },
1597
  {
1598
  "epoch": 29.01,
1599
  "learning_rate": 2.2666666666666668e-05,
1600
+ "loss": 0.0006,
1601
  "step": 2220
1602
  },
1603
  {
1604
  "epoch": 29.01,
1605
  "learning_rate": 2.251851851851852e-05,
1606
+ "loss": 0.0006,
1607
  "step": 2230
1608
  },
1609
  {
1610
  "epoch": 29.02,
1611
  "learning_rate": 2.2370370370370374e-05,
1612
+ "loss": 0.0953,
1613
  "step": 2240
1614
  },
1615
  {
1616
  "epoch": 29.02,
1617
  "learning_rate": 2.2222222222222223e-05,
1618
+ "loss": 0.0008,
1619
  "step": 2250
1620
  },
1621
  {
1622
  "epoch": 29.02,
1623
+ "eval_accuracy": 0.972972972972973,
1624
+ "eval_loss": 0.08563826233148575,
1625
+ "eval_runtime": 2.7114,
1626
+ "eval_samples_per_second": 13.646,
1627
+ "eval_steps_per_second": 3.688,
1628
  "step": 2250
1629
  },
1630
  {
1631
  "epoch": 30.0,
1632
  "learning_rate": 2.2074074074074076e-05,
1633
+ "loss": 0.0006,
1634
  "step": 2260
1635
  },
1636
  {
 
1642
  {
1643
  "epoch": 30.01,
1644
  "learning_rate": 2.177777777777778e-05,
1645
+ "loss": 0.0006,
1646
  "step": 2280
1647
  },
1648
  {
1649
  "epoch": 30.01,
1650
  "learning_rate": 2.162962962962963e-05,
1651
+ "loss": 0.0006,
1652
  "step": 2290
1653
  },
1654
  {
1655
  "epoch": 30.01,
1656
  "learning_rate": 2.148148148148148e-05,
1657
+ "loss": 0.0007,
1658
  "step": 2300
1659
  },
1660
  {
1661
  "epoch": 30.02,
1662
  "learning_rate": 2.1333333333333335e-05,
1663
+ "loss": 0.0006,
1664
  "step": 2310
1665
  },
1666
  {
 
1671
  },
1672
  {
1673
  "epoch": 30.02,
1674
+ "eval_accuracy": 0.972972972972973,
1675
+ "eval_loss": 0.1049196645617485,
1676
+ "eval_runtime": 2.6802,
1677
+ "eval_samples_per_second": 13.805,
1678
+ "eval_steps_per_second": 3.731,
1679
  "step": 2325
1680
  },
1681
  {
1682
  "epoch": 31.0,
1683
  "learning_rate": 2.1037037037037037e-05,
1684
+ "loss": 0.0006,
1685
  "step": 2330
1686
  },
1687
  {
 
1693
  {
1694
  "epoch": 31.01,
1695
  "learning_rate": 2.074074074074074e-05,
1696
+ "loss": 0.0006,
1697
  "step": 2350
1698
  },
1699
  {
 
1728
  },
1729
  {
1730
  "epoch": 31.02,
1731
+ "eval_accuracy": 0.972972972972973,
1732
+ "eval_loss": 0.11323297023773193,
1733
+ "eval_runtime": 3.2205,
1734
+ "eval_samples_per_second": 11.489,
1735
+ "eval_steps_per_second": 3.105,
1736
  "step": 2400
1737
  },
1738
  {
 
1779
  },
1780
  {
1781
  "epoch": 32.02,
1782
+ "eval_accuracy": 0.972972972972973,
1783
+ "eval_loss": 0.11638977378606796,
1784
+ "eval_runtime": 2.5685,
1785
+ "eval_samples_per_second": 14.405,
1786
+ "eval_steps_per_second": 3.893,
1787
  "step": 2475
1788
  },
1789
  {
 
1813
  {
1814
  "epoch": 33.01,
1815
  "learning_rate": 1.8222222222222224e-05,
1816
+ "loss": 0.0005,
1817
  "step": 2520
1818
  },
1819
  {
 
1836
  },
1837
  {
1838
  "epoch": 33.02,
1839
+ "eval_accuracy": 0.972972972972973,
1840
+ "eval_loss": 0.12427014112472534,
1841
+ "eval_runtime": 2.5792,
1842
+ "eval_samples_per_second": 14.345,
1843
+ "eval_steps_per_second": 3.877,
1844
  "step": 2550
1845
  },
1846
  {
1847
  "epoch": 34.0,
1848
  "learning_rate": 1.762962962962963e-05,
1849
+ "loss": 0.0005,
1850
  "step": 2560
1851
  },
1852
  {
 
1864
  {
1865
  "epoch": 34.01,
1866
  "learning_rate": 1.7185185185185185e-05,
1867
+ "loss": 0.0005,
1868
  "step": 2590
1869
  },
1870
  {
 
1887
  },
1888
  {
1889
  "epoch": 34.02,
1890
+ "eval_accuracy": 0.972972972972973,
1891
+ "eval_loss": 0.1306069791316986,
1892
+ "eval_runtime": 2.6024,
1893
+ "eval_samples_per_second": 14.217,
1894
+ "eval_steps_per_second": 3.843,
1895
  "step": 2625
1896
  },
1897
  {
 
1903
  {
1904
  "epoch": 35.0,
1905
  "learning_rate": 1.6444444444444447e-05,
1906
+ "loss": 0.0005,
1907
  "step": 2640
1908
  },
1909
  {
 
1915
  {
1916
  "epoch": 35.01,
1917
  "learning_rate": 1.614814814814815e-05,
1918
+ "loss": 0.0007,
1919
  "step": 2660
1920
  },
1921
  {
1922
  "epoch": 35.01,
1923
  "learning_rate": 1.6000000000000003e-05,
1924
+ "loss": 0.0005,
1925
  "step": 2670
1926
  },
1927
  {
 
1933
  {
1934
  "epoch": 35.02,
1935
  "learning_rate": 1.5703703703703705e-05,
1936
+ "loss": 0.0005,
1937
  "step": 2690
1938
  },
1939
  {
 
1944
  },
1945
  {
1946
  "epoch": 35.02,
1947
+ "eval_accuracy": 0.9459459459459459,
1948
+ "eval_loss": 0.39192795753479004,
1949
+ "eval_runtime": 2.5502,
1950
+ "eval_samples_per_second": 14.509,
1951
+ "eval_steps_per_second": 3.921,
1952
  "step": 2700
1953
  },
1954
  {
 
1960
  {
1961
  "epoch": 36.01,
1962
  "learning_rate": 1.5259259259259258e-05,
1963
+ "loss": 0.0005,
1964
  "step": 2720
1965
  },
1966
  {
1967
  "epoch": 36.01,
1968
  "learning_rate": 1.5111111111111112e-05,
1969
+ "loss": 0.0005,
1970
  "step": 2730
1971
  },
1972
  {
 
1984
  {
1985
  "epoch": 36.02,
1986
  "learning_rate": 1.4666666666666668e-05,
1987
+ "loss": 0.0005,
1988
  "step": 2760
1989
  },
1990
  {
 
1995
  },
1996
  {
1997
  "epoch": 36.02,
1998
+ "eval_accuracy": 0.9459459459459459,
1999
+ "eval_loss": 0.36302804946899414,
2000
+ "eval_runtime": 2.4982,
2001
+ "eval_samples_per_second": 14.811,
2002
+ "eval_steps_per_second": 4.003,
2003
  "step": 2775
2004
  },
2005
  {
2006
  "epoch": 37.0,
2007
  "learning_rate": 1.437037037037037e-05,
2008
+ "loss": 0.0005,
2009
  "step": 2780
2010
  },
2011
  {
 
2023
  {
2024
  "epoch": 37.01,
2025
  "learning_rate": 1.3925925925925926e-05,
2026
+ "loss": 0.0005,
2027
  "step": 2810
2028
  },
2029
  {
 
2035
  {
2036
  "epoch": 37.01,
2037
  "learning_rate": 1.362962962962963e-05,
2038
+ "loss": 0.0005,
2039
  "step": 2830
2040
  },
2041
  {
2042
  "epoch": 37.02,
2043
  "learning_rate": 1.348148148148148e-05,
2044
+ "loss": 0.0005,
2045
  "step": 2840
2046
  },
2047
  {
 
2052
  },
2053
  {
2054
  "epoch": 37.02,
2055
+ "eval_accuracy": 0.9459459459459459,
2056
+ "eval_loss": 0.2762458324432373,
2057
+ "eval_runtime": 2.5203,
2058
+ "eval_samples_per_second": 14.681,
2059
+ "eval_steps_per_second": 3.968,
2060
  "step": 2850
2061
  },
2062
  {
 
2068
  {
2069
  "epoch": 38.01,
2070
  "learning_rate": 1.3037037037037036e-05,
2071
+ "loss": 0.0005,
2072
  "step": 2870
2073
  },
2074
  {
 
2098
  {
2099
  "epoch": 38.02,
2100
  "learning_rate": 1.2296296296296298e-05,
2101
+ "loss": 0.0005,
2102
  "step": 2920
2103
  },
2104
  {
2105
  "epoch": 38.02,
2106
+ "eval_accuracy": 0.9459459459459459,
2107
+ "eval_loss": 0.23680266737937927,
2108
+ "eval_runtime": 2.5362,
2109
+ "eval_samples_per_second": 14.589,
2110
+ "eval_steps_per_second": 3.943,
2111
  "step": 2925
2112
  },
2113
  {
 
2131
  {
2132
  "epoch": 39.01,
2133
  "learning_rate": 1.1703703703703705e-05,
2134
+ "loss": 0.0005,
2135
  "step": 2960
2136
  },
2137
  {
 
2160
  },
2161
  {
2162
  "epoch": 39.02,
2163
+ "eval_accuracy": 0.972972972972973,
2164
+ "eval_loss": 0.1935373693704605,
2165
+ "eval_runtime": 2.5511,
2166
+ "eval_samples_per_second": 14.504,
2167
+ "eval_steps_per_second": 3.92,
2168
  "step": 3000
2169
  },
2170
  {
 
2211
  },
2212
  {
2213
  "epoch": 40.02,
2214
+ "eval_accuracy": 0.972972972972973,
2215
+ "eval_loss": 0.19308657944202423,
2216
+ "eval_runtime": 2.5348,
2217
+ "eval_samples_per_second": 14.597,
2218
+ "eval_steps_per_second": 3.945,
2219
  "step": 3075
2220
  },
2221
  {
 
2227
  {
2228
  "epoch": 41.0,
2229
  "learning_rate": 9.777777777777779e-06,
2230
+ "loss": 0.0085,
2231
  "step": 3090
2232
  },
2233
  {
 
2257
  {
2258
  "epoch": 41.02,
2259
  "learning_rate": 9.037037037037037e-06,
2260
+ "loss": 0.0005,
2261
  "step": 3140
2262
  },
2263
  {
 
2268
  },
2269
  {
2270
  "epoch": 41.02,
2271
+ "eval_accuracy": 0.9459459459459459,
2272
+ "eval_loss": 0.21387839317321777,
2273
+ "eval_runtime": 2.5103,
2274
+ "eval_samples_per_second": 14.739,
2275
+ "eval_steps_per_second": 3.984,
2276
  "step": 3150
2277
  },
2278
  {
 
2284
  {
2285
  "epoch": 42.01,
2286
  "learning_rate": 8.592592592592593e-06,
2287
+ "loss": 0.0005,
2288
  "step": 3170
2289
  },
2290
  {
 
2319
  },
2320
  {
2321
  "epoch": 42.02,
2322
+ "eval_accuracy": 0.972972972972973,
2323
+ "eval_loss": 0.1899683177471161,
2324
+ "eval_runtime": 2.6252,
2325
+ "eval_samples_per_second": 14.094,
2326
+ "eval_steps_per_second": 3.809,
2327
  "step": 3225
2328
  },
2329
  {
2330
  "epoch": 43.0,
2331
  "learning_rate": 7.703703703703704e-06,
2332
+ "loss": 0.0015,
2333
  "step": 3230
2334
  },
2335
  {
 
2371
  {
2372
  "epoch": 43.02,
2373
  "learning_rate": 6.666666666666667e-06,
2374
+ "loss": 0.0006,
2375
  "step": 3300
2376
  },
2377
  {
2378
  "epoch": 43.02,
2379
+ "eval_accuracy": 0.972972972972973,
2380
+ "eval_loss": 0.1750660091638565,
2381
+ "eval_runtime": 2.5683,
2382
+ "eval_samples_per_second": 14.407,
2383
+ "eval_steps_per_second": 3.894,
2384
  "step": 3300
2385
  },
2386
  {
 
2427
  },
2428
  {
2429
  "epoch": 44.02,
2430
+ "eval_accuracy": 0.9459459459459459,
2431
+ "eval_loss": 0.29775792360305786,
2432
+ "eval_runtime": 2.6767,
2433
+ "eval_samples_per_second": 13.823,
2434
+ "eval_steps_per_second": 3.736,
2435
  "step": 3375
2436
  },
2437
  {
 
2461
  {
2462
  "epoch": 45.01,
2463
  "learning_rate": 4.888888888888889e-06,
2464
+ "loss": 0.0004,
2465
  "step": 3420
2466
  },
2467
  {
 
2484
  },
2485
  {
2486
  "epoch": 45.02,
2487
+ "eval_accuracy": 0.9459459459459459,
2488
+ "eval_loss": 0.2776608467102051,
2489
+ "eval_runtime": 2.5843,
2490
+ "eval_samples_per_second": 14.317,
2491
+ "eval_steps_per_second": 3.87,
2492
  "step": 3450
2493
  },
2494
  {
 
2535
  },
2536
  {
2537
  "epoch": 46.02,
2538
+ "eval_accuracy": 0.9459459459459459,
2539
+ "eval_loss": 0.2706151008605957,
2540
+ "eval_runtime": 2.6031,
2541
+ "eval_samples_per_second": 14.214,
2542
+ "eval_steps_per_second": 3.842,
2543
  "step": 3525
2544
  },
2545
  {
 
2581
  {
2582
  "epoch": 47.02,
2583
  "learning_rate": 2.3703703703703703e-06,
2584
+ "loss": 0.0004,
2585
  "step": 3590
2586
  },
2587
  {
 
2592
  },
2593
  {
2594
  "epoch": 47.02,
2595
+ "eval_accuracy": 0.9459459459459459,
2596
+ "eval_loss": 0.26377302408218384,
2597
+ "eval_runtime": 2.5538,
2598
+ "eval_samples_per_second": 14.488,
2599
+ "eval_steps_per_second": 3.916,
2600
  "step": 3600
2601
  },
2602
  {
 
2620
  {
2621
  "epoch": 48.01,
2622
  "learning_rate": 1.6296296296296295e-06,
2623
+ "loss": 0.0005,
2624
  "step": 3640
2625
  },
2626
  {
 
2643
  },
2644
  {
2645
  "epoch": 48.02,
2646
+ "eval_accuracy": 0.9459459459459459,
2647
+ "eval_loss": 0.21225357055664062,
2648
+ "eval_runtime": 2.6586,
2649
+ "eval_samples_per_second": 13.917,
2650
+ "eval_steps_per_second": 3.761,
2651
  "step": 3675
2652
  },
2653
  {
 
2700
  },
2701
  {
2702
  "epoch": 49.02,
2703
+ "eval_accuracy": 0.9459459459459459,
2704
+ "eval_loss": 0.21062754094600677,
2705
+ "eval_runtime": 2.7099,
2706
+ "eval_samples_per_second": 13.653,
2707
+ "eval_steps_per_second": 3.69,
2708
  "step": 3750
2709
  },
2710
  {
2711
  "epoch": 49.02,
2712
  "step": 3750,
2713
  "total_flos": 1.86923023515648e+19,
2714
+ "train_loss": 0.16172246270999313,
2715
+ "train_runtime": 3621.2802,
2716
+ "train_samples_per_second": 4.142,
2717
+ "train_steps_per_second": 1.036
2718
  },
2719
  {
2720
  "epoch": 49.02,
2721
+ "eval_accuracy": 0.8850574712643678,
2722
+ "eval_loss": 0.38414710760116577,
2723
+ "eval_runtime": 8.6129,
2724
+ "eval_samples_per_second": 10.101,
2725
+ "eval_steps_per_second": 2.554,
2726
  "step": 3750
2727
  },
2728
  {
2729
  "epoch": 49.02,
2730
+ "eval_accuracy": 0.8850574712643678,
2731
+ "eval_loss": 0.38414719700813293,
2732
+ "eval_runtime": 6.1302,
2733
+ "eval_samples_per_second": 14.192,
2734
+ "eval_steps_per_second": 3.589,
2735
  "step": 3750
2736
  }
2737
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d6e3454d677f838f72c44c0d25ed5379fc596af582a8d2edaaedf4baaf9e9edc
3
  size 3439
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54be8404f3a06187402196bf7251b8491aa663c5d9c26aa1c6f1e5be6bcb9d33
3
  size 3439