alkzar90 commited on
Commit
aad544d
1 Parent(s): b6e15d7

🍻 cheers

Browse files
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 25.0,
3
- "eval_accuracy": 0.6839080459770115,
4
- "eval_loss": 1.1035996675491333,
5
- "eval_runtime": 1.6448,
6
- "eval_samples_per_second": 105.785,
7
- "eval_steps_per_second": 13.375,
8
- "total_flos": 2.8110723118700544e+18,
9
- "train_loss": 0.38098634693648786,
10
- "train_runtime": 883.423,
11
- "train_samples_per_second": 41.062,
12
- "train_steps_per_second": 2.575
13
  }
 
1
  {
2
+ "epoch": 50.0,
3
+ "eval_accuracy": 0.7471264367816092,
4
+ "eval_loss": 0.847703754901886,
5
+ "eval_runtime": 1.6712,
6
+ "eval_samples_per_second": 104.114,
7
+ "eval_steps_per_second": 13.164,
8
+ "total_flos": 5.622144623740109e+18,
9
+ "train_loss": 0.2358846340533141,
10
+ "train_runtime": 1979.1189,
11
+ "train_samples_per_second": 36.658,
12
+ "train_steps_per_second": 2.299
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 25.0,
3
- "eval_accuracy": 0.711764705882353,
4
- "eval_loss": 0.90943843126297,
5
- "eval_runtime": 1.6048,
6
- "eval_samples_per_second": 105.932,
7
- "eval_steps_per_second": 13.709
8
  }
 
1
  {
2
+ "epoch": 50.0,
3
+ "eval_accuracy": 0.7470588235294118,
4
+ "eval_loss": 0.7239104509353638,
5
+ "eval_runtime": 1.5971,
6
+ "eval_samples_per_second": 106.444,
7
+ "eval_steps_per_second": 13.775
8
  }
runs/Aug06_22-45-08_a3e0ec26426d/events.out.tfevents.1659828394.a3e0ec26426d.71.18 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6b9f5654c4e0fa78cb3d60e5b5dfbe4004df50154dc337900c535690f6d6525
3
+ size 686
test_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 25.0,
3
- "eval_accuracy": 0.6839080459770115,
4
- "eval_loss": 1.1035996675491333,
5
- "eval_runtime": 1.6448,
6
- "eval_samples_per_second": 105.785,
7
- "eval_steps_per_second": 13.375
8
  }
 
1
  {
2
+ "epoch": 50.0,
3
+ "eval_accuracy": 0.7471264367816092,
4
+ "eval_loss": 0.847703754901886,
5
+ "eval_runtime": 1.6712,
6
+ "eval_samples_per_second": 104.114,
7
+ "eval_steps_per_second": 13.164
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 25.0,
3
- "total_flos": 2.8110723118700544e+18,
4
- "train_loss": 0.38098634693648786,
5
- "train_runtime": 883.423,
6
- "train_samples_per_second": 41.062,
7
- "train_steps_per_second": 2.575
8
  }
 
1
  {
2
+ "epoch": 50.0,
3
+ "total_flos": 5.622144623740109e+18,
4
+ "train_loss": 0.2358846340533141,
5
+ "train_runtime": 1979.1189,
6
+ "train_samples_per_second": 36.658,
7
+ "train_steps_per_second": 2.299
8
  }
trainer_state.json CHANGED
@@ -1,1585 +1,3160 @@
1
  {
2
- "best_metric": 0.7445889711380005,
3
- "best_model_checkpoint": "./croupier-creature-classifier/checkpoint-2200",
4
- "epoch": 25.0,
5
- "global_step": 2275,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.11,
12
- "learning_rate": 0.00019912087912087913,
13
- "loss": 1.3637,
14
  "step": 10
15
  },
16
  {
17
  "epoch": 0.22,
18
- "learning_rate": 0.00019824175824175827,
19
- "loss": 1.3785,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 0.33,
24
- "learning_rate": 0.00019736263736263738,
25
- "loss": 1.3807,
26
  "step": 30
27
  },
28
  {
29
  "epoch": 0.44,
30
- "learning_rate": 0.00019648351648351647,
31
- "loss": 1.3582,
32
  "step": 40
33
  },
34
  {
35
  "epoch": 0.55,
36
- "learning_rate": 0.00019560439560439562,
37
- "loss": 1.2273,
38
  "step": 50
39
  },
40
  {
41
  "epoch": 0.66,
42
- "learning_rate": 0.00019472527472527473,
43
- "loss": 1.3811,
44
  "step": 60
45
  },
46
  {
47
  "epoch": 0.77,
48
- "learning_rate": 0.00019384615384615385,
49
- "loss": 1.3425,
50
  "step": 70
51
  },
52
  {
53
  "epoch": 0.88,
54
- "learning_rate": 0.000192967032967033,
55
- "loss": 1.3062,
56
  "step": 80
57
  },
58
  {
59
  "epoch": 0.99,
60
- "learning_rate": 0.0001920879120879121,
61
- "loss": 1.2959,
62
  "step": 90
63
  },
64
  {
65
  "epoch": 1.1,
66
- "learning_rate": 0.00019120879120879122,
67
- "loss": 1.1638,
68
  "step": 100
69
  },
70
  {
71
  "epoch": 1.1,
72
- "eval_accuracy": 0.5470588235294118,
73
- "eval_loss": 1.0563907623291016,
74
- "eval_runtime": 1.5249,
75
- "eval_samples_per_second": 111.483,
76
- "eval_steps_per_second": 14.427,
77
  "step": 100
78
  },
79
  {
80
  "epoch": 1.21,
81
- "learning_rate": 0.00019032967032967034,
82
- "loss": 1.0851,
83
  "step": 110
84
  },
85
  {
86
  "epoch": 1.32,
87
- "learning_rate": 0.00018945054945054945,
88
- "loss": 1.1777,
89
  "step": 120
90
  },
91
  {
92
  "epoch": 1.43,
93
- "learning_rate": 0.00018857142857142857,
94
- "loss": 1.1352,
95
  "step": 130
96
  },
97
  {
98
  "epoch": 1.54,
99
- "learning_rate": 0.0001876923076923077,
100
- "loss": 1.0112,
101
  "step": 140
102
  },
103
  {
104
  "epoch": 1.65,
105
- "learning_rate": 0.00018681318681318683,
106
- "loss": 1.0718,
107
  "step": 150
108
  },
109
  {
110
  "epoch": 1.76,
111
- "learning_rate": 0.00018593406593406595,
112
- "loss": 0.9871,
113
  "step": 160
114
  },
115
  {
116
  "epoch": 1.87,
117
- "learning_rate": 0.00018505494505494506,
118
- "loss": 1.2279,
119
  "step": 170
120
  },
121
  {
122
  "epoch": 1.98,
123
- "learning_rate": 0.00018417582417582418,
124
- "loss": 1.148,
125
  "step": 180
126
  },
127
  {
128
  "epoch": 2.09,
129
- "learning_rate": 0.0001832967032967033,
130
- "loss": 0.8905,
131
  "step": 190
132
  },
133
  {
134
  "epoch": 2.2,
135
- "learning_rate": 0.0001824175824175824,
136
- "loss": 0.8524,
137
  "step": 200
138
  },
139
  {
140
  "epoch": 2.2,
141
- "eval_accuracy": 0.611764705882353,
142
- "eval_loss": 0.9402753114700317,
143
- "eval_runtime": 1.611,
144
- "eval_samples_per_second": 105.524,
145
- "eval_steps_per_second": 13.656,
146
  "step": 200
147
  },
148
  {
149
  "epoch": 2.31,
150
- "learning_rate": 0.00018153846153846155,
151
- "loss": 0.8388,
152
  "step": 210
153
  },
154
  {
155
  "epoch": 2.42,
156
- "learning_rate": 0.00018065934065934067,
157
- "loss": 0.7519,
158
  "step": 220
159
  },
160
  {
161
  "epoch": 2.53,
162
- "learning_rate": 0.00017978021978021978,
163
- "loss": 1.0743,
164
  "step": 230
165
  },
166
  {
167
  "epoch": 2.64,
168
- "learning_rate": 0.00017890109890109893,
169
- "loss": 0.8687,
170
  "step": 240
171
  },
172
  {
173
  "epoch": 2.75,
174
- "learning_rate": 0.00017802197802197802,
175
- "loss": 0.886,
176
  "step": 250
177
  },
178
  {
179
  "epoch": 2.86,
180
- "learning_rate": 0.00017714285714285713,
181
- "loss": 0.8484,
182
  "step": 260
183
  },
184
  {
185
  "epoch": 2.97,
186
- "learning_rate": 0.00017626373626373627,
187
- "loss": 0.8394,
188
  "step": 270
189
  },
190
  {
191
  "epoch": 3.08,
192
- "learning_rate": 0.0001753846153846154,
193
- "loss": 0.8304,
194
  "step": 280
195
  },
196
  {
197
  "epoch": 3.19,
198
- "learning_rate": 0.0001745054945054945,
199
- "loss": 0.6359,
200
  "step": 290
201
  },
202
  {
203
  "epoch": 3.3,
204
- "learning_rate": 0.00017362637362637365,
205
- "loss": 0.8231,
206
  "step": 300
207
  },
208
  {
209
  "epoch": 3.3,
210
- "eval_accuracy": 0.7176470588235294,
211
- "eval_loss": 0.8282244801521301,
212
- "eval_runtime": 2.0367,
213
- "eval_samples_per_second": 83.467,
214
- "eval_steps_per_second": 10.802,
215
  "step": 300
216
  },
217
  {
218
  "epoch": 3.41,
219
- "learning_rate": 0.00017274725274725276,
220
- "loss": 0.6735,
221
  "step": 310
222
  },
223
  {
224
  "epoch": 3.52,
225
- "learning_rate": 0.00017186813186813185,
226
- "loss": 0.6721,
227
  "step": 320
228
  },
229
  {
230
  "epoch": 3.63,
231
- "learning_rate": 0.000170989010989011,
232
- "loss": 0.8183,
233
  "step": 330
234
  },
235
  {
236
  "epoch": 3.74,
237
- "learning_rate": 0.0001701098901098901,
238
- "loss": 0.7158,
239
  "step": 340
240
  },
241
  {
242
  "epoch": 3.85,
243
- "learning_rate": 0.00016923076923076923,
244
- "loss": 0.796,
245
  "step": 350
246
  },
247
  {
248
  "epoch": 3.96,
249
- "learning_rate": 0.00016835164835164837,
250
- "loss": 0.7814,
251
  "step": 360
252
  },
253
  {
254
  "epoch": 4.07,
255
- "learning_rate": 0.0001674725274725275,
256
- "loss": 0.6363,
257
  "step": 370
258
  },
259
  {
260
  "epoch": 4.18,
261
- "learning_rate": 0.0001665934065934066,
262
- "loss": 0.57,
263
  "step": 380
264
  },
265
  {
266
  "epoch": 4.29,
267
- "learning_rate": 0.00016571428571428575,
268
- "loss": 0.5695,
269
  "step": 390
270
  },
271
  {
272
  "epoch": 4.4,
273
- "learning_rate": 0.00016483516483516484,
274
- "loss": 0.7398,
275
  "step": 400
276
  },
277
  {
278
  "epoch": 4.4,
279
- "eval_accuracy": 0.6294117647058823,
280
- "eval_loss": 0.9056094884872437,
281
- "eval_runtime": 1.6835,
282
- "eval_samples_per_second": 100.978,
283
- "eval_steps_per_second": 13.068,
284
  "step": 400
285
  },
286
  {
287
  "epoch": 4.51,
288
- "learning_rate": 0.00016395604395604395,
289
- "loss": 0.5678,
290
  "step": 410
291
  },
292
  {
293
  "epoch": 4.62,
294
- "learning_rate": 0.0001630769230769231,
295
- "loss": 0.5507,
296
  "step": 420
297
  },
298
  {
299
  "epoch": 4.73,
300
- "learning_rate": 0.0001621978021978022,
301
- "loss": 0.4983,
302
  "step": 430
303
  },
304
  {
305
  "epoch": 4.84,
306
- "learning_rate": 0.00016131868131868133,
307
- "loss": 0.6078,
308
  "step": 440
309
  },
310
  {
311
  "epoch": 4.95,
312
- "learning_rate": 0.00016043956043956044,
313
- "loss": 0.584,
314
  "step": 450
315
  },
316
  {
317
  "epoch": 5.05,
318
- "learning_rate": 0.00015956043956043958,
319
- "loss": 0.4714,
320
  "step": 460
321
  },
322
  {
323
  "epoch": 5.16,
324
- "learning_rate": 0.00015868131868131867,
325
- "loss": 0.6575,
326
  "step": 470
327
  },
328
  {
329
  "epoch": 5.27,
330
- "learning_rate": 0.00015780219780219782,
331
- "loss": 0.522,
332
  "step": 480
333
  },
334
  {
335
  "epoch": 5.38,
336
- "learning_rate": 0.00015692307692307693,
337
- "loss": 0.5475,
338
  "step": 490
339
  },
340
  {
341
  "epoch": 5.49,
342
- "learning_rate": 0.00015604395604395605,
343
- "loss": 0.41,
344
  "step": 500
345
  },
346
  {
347
  "epoch": 5.49,
348
- "eval_accuracy": 0.6235294117647059,
349
- "eval_loss": 0.8814617991447449,
350
- "eval_runtime": 1.5991,
351
- "eval_samples_per_second": 106.312,
352
- "eval_steps_per_second": 13.758,
353
  "step": 500
354
  },
355
  {
356
  "epoch": 5.6,
357
- "learning_rate": 0.00015516483516483516,
358
- "loss": 0.4955,
359
  "step": 510
360
  },
361
  {
362
  "epoch": 5.71,
363
- "learning_rate": 0.0001542857142857143,
364
- "loss": 0.5234,
365
  "step": 520
366
  },
367
  {
368
  "epoch": 5.82,
369
- "learning_rate": 0.00015340659340659342,
370
- "loss": 0.4696,
371
  "step": 530
372
  },
373
  {
374
  "epoch": 5.93,
375
- "learning_rate": 0.00015252747252747254,
376
- "loss": 0.4195,
377
  "step": 540
378
  },
379
  {
380
  "epoch": 6.04,
381
- "learning_rate": 0.00015164835164835165,
382
- "loss": 0.5375,
383
  "step": 550
384
  },
385
  {
386
  "epoch": 6.15,
387
- "learning_rate": 0.00015076923076923077,
388
- "loss": 0.3726,
389
  "step": 560
390
  },
391
  {
392
  "epoch": 6.26,
393
- "learning_rate": 0.0001498901098901099,
394
- "loss": 0.3705,
395
  "step": 570
396
  },
397
  {
398
  "epoch": 6.37,
399
- "learning_rate": 0.00014901098901098903,
400
- "loss": 0.3445,
401
  "step": 580
402
  },
403
  {
404
  "epoch": 6.48,
405
- "learning_rate": 0.00014813186813186815,
406
- "loss": 0.4785,
407
  "step": 590
408
  },
409
  {
410
  "epoch": 6.59,
411
- "learning_rate": 0.00014725274725274726,
412
- "loss": 0.4849,
413
  "step": 600
414
  },
415
  {
416
  "epoch": 6.59,
417
- "eval_accuracy": 0.6294117647058823,
418
- "eval_loss": 0.9504889249801636,
419
- "eval_runtime": 1.7834,
420
- "eval_samples_per_second": 95.325,
421
- "eval_steps_per_second": 12.336,
422
  "step": 600
423
  },
424
  {
425
  "epoch": 6.7,
426
- "learning_rate": 0.00014637362637362638,
427
- "loss": 0.5062,
428
  "step": 610
429
  },
430
  {
431
  "epoch": 6.81,
432
- "learning_rate": 0.0001454945054945055,
433
- "loss": 0.4269,
434
  "step": 620
435
  },
436
  {
437
  "epoch": 6.92,
438
- "learning_rate": 0.0001446153846153846,
439
- "loss": 0.3566,
440
  "step": 630
441
  },
442
  {
443
  "epoch": 7.03,
444
- "learning_rate": 0.00014373626373626375,
445
- "loss": 0.3931,
446
  "step": 640
447
  },
448
  {
449
  "epoch": 7.14,
450
- "learning_rate": 0.00014285714285714287,
451
- "loss": 0.3896,
452
  "step": 650
453
  },
454
  {
455
  "epoch": 7.25,
456
- "learning_rate": 0.00014197802197802198,
457
- "loss": 0.3306,
458
  "step": 660
459
  },
460
  {
461
  "epoch": 7.36,
462
- "learning_rate": 0.00014109890109890113,
463
- "loss": 0.4898,
464
  "step": 670
465
  },
466
  {
467
  "epoch": 7.47,
468
- "learning_rate": 0.00014021978021978022,
469
- "loss": 0.4136,
470
  "step": 680
471
  },
472
  {
473
  "epoch": 7.58,
474
- "learning_rate": 0.00013934065934065933,
475
- "loss": 0.4874,
476
  "step": 690
477
  },
478
  {
479
  "epoch": 7.69,
480
- "learning_rate": 0.00013846153846153847,
481
- "loss": 0.3894,
482
  "step": 700
483
  },
484
  {
485
  "epoch": 7.69,
486
- "eval_accuracy": 0.6882352941176471,
487
- "eval_loss": 0.8051896095275879,
488
- "eval_runtime": 2.0897,
489
- "eval_samples_per_second": 81.352,
490
- "eval_steps_per_second": 10.528,
491
  "step": 700
492
  },
493
  {
494
  "epoch": 7.8,
495
- "learning_rate": 0.0001375824175824176,
496
- "loss": 0.355,
497
  "step": 710
498
  },
499
  {
500
  "epoch": 7.91,
501
- "learning_rate": 0.0001367032967032967,
502
- "loss": 0.4024,
503
  "step": 720
504
  },
505
  {
506
  "epoch": 8.02,
507
- "learning_rate": 0.00013582417582417585,
508
- "loss": 0.458,
509
  "step": 730
510
  },
511
  {
512
  "epoch": 8.13,
513
- "learning_rate": 0.00013494505494505497,
514
- "loss": 0.2912,
515
  "step": 740
516
  },
517
  {
518
  "epoch": 8.24,
519
- "learning_rate": 0.00013406593406593405,
520
- "loss": 0.3608,
521
  "step": 750
522
  },
523
  {
524
  "epoch": 8.35,
525
- "learning_rate": 0.0001331868131868132,
526
- "loss": 0.3897,
527
  "step": 760
528
  },
529
  {
530
  "epoch": 8.46,
531
- "learning_rate": 0.0001323076923076923,
532
- "loss": 0.3451,
533
  "step": 770
534
  },
535
  {
536
  "epoch": 8.57,
537
- "learning_rate": 0.00013142857142857143,
538
- "loss": 0.3122,
539
  "step": 780
540
  },
541
  {
542
  "epoch": 8.68,
543
- "learning_rate": 0.00013054945054945057,
544
- "loss": 0.2771,
545
  "step": 790
546
  },
547
  {
548
  "epoch": 8.79,
549
- "learning_rate": 0.0001296703296703297,
550
- "loss": 0.4678,
551
  "step": 800
552
  },
553
  {
554
  "epoch": 8.79,
555
- "eval_accuracy": 0.7058823529411765,
556
- "eval_loss": 0.8423882722854614,
557
- "eval_runtime": 1.8864,
558
- "eval_samples_per_second": 90.118,
559
- "eval_steps_per_second": 11.662,
560
  "step": 800
561
  },
562
  {
563
  "epoch": 8.9,
564
- "learning_rate": 0.0001287912087912088,
565
- "loss": 0.4709,
566
  "step": 810
567
  },
568
  {
569
  "epoch": 9.01,
570
- "learning_rate": 0.00012791208791208792,
571
- "loss": 0.3594,
572
  "step": 820
573
  },
574
  {
575
  "epoch": 9.12,
576
- "learning_rate": 0.00012703296703296704,
577
- "loss": 0.3349,
578
  "step": 830
579
  },
580
  {
581
  "epoch": 9.23,
582
- "learning_rate": 0.00012615384615384615,
583
- "loss": 0.4093,
584
  "step": 840
585
  },
586
  {
587
  "epoch": 9.34,
588
- "learning_rate": 0.00012527472527472527,
589
- "loss": 0.2627,
590
  "step": 850
591
  },
592
  {
593
  "epoch": 9.45,
594
- "learning_rate": 0.0001243956043956044,
595
- "loss": 0.2817,
596
  "step": 860
597
  },
598
  {
599
  "epoch": 9.56,
600
- "learning_rate": 0.00012351648351648353,
601
- "loss": 0.3046,
602
  "step": 870
603
  },
604
  {
605
  "epoch": 9.67,
606
- "learning_rate": 0.00012263736263736264,
607
- "loss": 0.5005,
608
  "step": 880
609
  },
610
  {
611
  "epoch": 9.78,
612
- "learning_rate": 0.00012175824175824176,
613
- "loss": 0.1866,
614
  "step": 890
615
  },
616
  {
617
  "epoch": 9.89,
618
- "learning_rate": 0.00012087912087912087,
619
- "loss": 0.4279,
620
  "step": 900
621
  },
622
  {
623
  "epoch": 9.89,
624
- "eval_accuracy": 0.6705882352941176,
625
- "eval_loss": 0.963887095451355,
626
- "eval_runtime": 1.496,
627
- "eval_samples_per_second": 113.639,
628
- "eval_steps_per_second": 14.706,
629
  "step": 900
630
  },
631
  {
632
  "epoch": 10.0,
633
- "learning_rate": 0.00012,
634
- "loss": 0.4008,
635
  "step": 910
636
  },
637
  {
638
  "epoch": 10.11,
639
- "learning_rate": 0.00011912087912087912,
640
- "loss": 0.3001,
641
  "step": 920
642
  },
643
  {
644
  "epoch": 10.22,
645
- "learning_rate": 0.00011824175824175825,
646
- "loss": 0.2588,
647
  "step": 930
648
  },
649
  {
650
  "epoch": 10.33,
651
- "learning_rate": 0.00011736263736263738,
652
- "loss": 0.4082,
653
  "step": 940
654
  },
655
  {
656
  "epoch": 10.44,
657
- "learning_rate": 0.0001164835164835165,
658
- "loss": 0.3957,
659
  "step": 950
660
  },
661
  {
662
  "epoch": 10.55,
663
- "learning_rate": 0.0001156043956043956,
664
- "loss": 0.3988,
665
  "step": 960
666
  },
667
  {
668
  "epoch": 10.66,
669
- "learning_rate": 0.00011472527472527473,
670
- "loss": 0.2264,
671
  "step": 970
672
  },
673
  {
674
  "epoch": 10.77,
675
- "learning_rate": 0.00011384615384615384,
676
- "loss": 0.3202,
677
  "step": 980
678
  },
679
  {
680
  "epoch": 10.88,
681
- "learning_rate": 0.00011296703296703297,
682
- "loss": 0.2622,
683
  "step": 990
684
  },
685
  {
686
  "epoch": 10.99,
687
- "learning_rate": 0.0001120879120879121,
688
- "loss": 0.3461,
689
  "step": 1000
690
  },
691
  {
692
  "epoch": 10.99,
693
- "eval_accuracy": 0.7058823529411765,
694
- "eval_loss": 0.8496671915054321,
695
- "eval_runtime": 1.4967,
696
- "eval_samples_per_second": 113.586,
697
- "eval_steps_per_second": 14.699,
698
  "step": 1000
699
  },
700
  {
701
  "epoch": 11.1,
702
- "learning_rate": 0.00011120879120879122,
703
- "loss": 0.1967,
704
  "step": 1010
705
  },
706
  {
707
  "epoch": 11.21,
708
- "learning_rate": 0.00011032967032967035,
709
- "loss": 0.2573,
710
  "step": 1020
711
  },
712
  {
713
  "epoch": 11.32,
714
- "learning_rate": 0.00010945054945054946,
715
- "loss": 0.3313,
716
  "step": 1030
717
  },
718
  {
719
  "epoch": 11.43,
720
- "learning_rate": 0.00010857142857142856,
721
- "loss": 0.2774,
722
  "step": 1040
723
  },
724
  {
725
  "epoch": 11.54,
726
- "learning_rate": 0.0001076923076923077,
727
- "loss": 0.1745,
728
  "step": 1050
729
  },
730
  {
731
  "epoch": 11.65,
732
- "learning_rate": 0.00010681318681318682,
733
- "loss": 0.2563,
734
  "step": 1060
735
  },
736
  {
737
  "epoch": 11.76,
738
- "learning_rate": 0.00010593406593406594,
739
- "loss": 0.15,
740
  "step": 1070
741
  },
742
  {
743
  "epoch": 11.87,
744
- "learning_rate": 0.00010505494505494507,
745
- "loss": 0.346,
746
  "step": 1080
747
  },
748
  {
749
  "epoch": 11.98,
750
- "learning_rate": 0.00010417582417582418,
751
- "loss": 0.3187,
752
  "step": 1090
753
  },
754
  {
755
  "epoch": 12.09,
756
- "learning_rate": 0.00010329670329670331,
757
- "loss": 0.2741,
758
  "step": 1100
759
  },
760
  {
761
  "epoch": 12.09,
762
- "eval_accuracy": 0.7,
763
- "eval_loss": 0.9090209007263184,
764
- "eval_runtime": 1.5007,
765
- "eval_samples_per_second": 113.284,
766
- "eval_steps_per_second": 14.66,
767
  "step": 1100
768
  },
769
  {
770
  "epoch": 12.2,
771
- "learning_rate": 0.00010241758241758242,
772
- "loss": 0.2168,
773
  "step": 1110
774
  },
775
  {
776
  "epoch": 12.31,
777
- "learning_rate": 0.00010153846153846153,
778
- "loss": 0.1902,
779
  "step": 1120
780
  },
781
  {
782
  "epoch": 12.42,
783
- "learning_rate": 0.00010065934065934066,
784
- "loss": 0.2154,
785
  "step": 1130
786
  },
787
  {
788
  "epoch": 12.53,
789
- "learning_rate": 9.978021978021979e-05,
790
- "loss": 0.3392,
791
  "step": 1140
792
  },
793
  {
794
  "epoch": 12.64,
795
- "learning_rate": 9.89010989010989e-05,
796
- "loss": 0.2567,
797
  "step": 1150
798
  },
799
  {
800
  "epoch": 12.75,
801
- "learning_rate": 9.802197802197802e-05,
802
- "loss": 0.3863,
803
  "step": 1160
804
  },
805
  {
806
  "epoch": 12.86,
807
- "learning_rate": 9.714285714285715e-05,
808
- "loss": 0.3635,
809
  "step": 1170
810
  },
811
  {
812
  "epoch": 12.97,
813
- "learning_rate": 9.626373626373627e-05,
814
- "loss": 0.2915,
815
  "step": 1180
816
  },
817
  {
818
  "epoch": 13.08,
819
- "learning_rate": 9.53846153846154e-05,
820
- "loss": 0.3021,
821
  "step": 1190
822
  },
823
  {
824
  "epoch": 13.19,
825
- "learning_rate": 9.450549450549451e-05,
826
- "loss": 0.1771,
827
  "step": 1200
828
  },
829
  {
830
  "epoch": 13.19,
831
- "eval_accuracy": 0.711764705882353,
832
- "eval_loss": 0.8292282819747925,
833
- "eval_runtime": 1.5182,
834
- "eval_samples_per_second": 111.974,
835
- "eval_steps_per_second": 14.491,
836
  "step": 1200
837
  },
838
  {
839
  "epoch": 13.3,
840
- "learning_rate": 9.362637362637363e-05,
841
- "loss": 0.1475,
842
  "step": 1210
843
  },
844
  {
845
  "epoch": 13.41,
846
- "learning_rate": 9.274725274725276e-05,
847
- "loss": 0.1051,
848
  "step": 1220
849
  },
850
  {
851
  "epoch": 13.52,
852
- "learning_rate": 9.186813186813187e-05,
853
- "loss": 0.1564,
854
  "step": 1230
855
  },
856
  {
857
  "epoch": 13.63,
858
- "learning_rate": 9.098901098901099e-05,
859
- "loss": 0.4018,
860
  "step": 1240
861
  },
862
  {
863
  "epoch": 13.74,
864
- "learning_rate": 9.010989010989012e-05,
865
- "loss": 0.3191,
866
  "step": 1250
867
  },
868
  {
869
  "epoch": 13.85,
870
- "learning_rate": 8.923076923076924e-05,
871
- "loss": 0.2845,
872
  "step": 1260
873
  },
874
  {
875
  "epoch": 13.96,
876
- "learning_rate": 8.835164835164835e-05,
877
- "loss": 0.3667,
878
  "step": 1270
879
  },
880
  {
881
  "epoch": 14.07,
882
- "learning_rate": 8.747252747252748e-05,
883
- "loss": 0.2433,
884
  "step": 1280
885
  },
886
  {
887
  "epoch": 14.18,
888
- "learning_rate": 8.65934065934066e-05,
889
- "loss": 0.1794,
890
  "step": 1290
891
  },
892
  {
893
  "epoch": 14.29,
894
- "learning_rate": 8.571428571428571e-05,
895
- "loss": 0.1779,
896
  "step": 1300
897
  },
898
  {
899
  "epoch": 14.29,
900
- "eval_accuracy": 0.6294117647058823,
901
- "eval_loss": 1.1313611268997192,
902
- "eval_runtime": 1.4881,
903
- "eval_samples_per_second": 114.237,
904
- "eval_steps_per_second": 14.784,
905
  "step": 1300
906
  },
907
  {
908
  "epoch": 14.4,
909
- "learning_rate": 8.483516483516484e-05,
910
- "loss": 0.2517,
911
  "step": 1310
912
  },
913
  {
914
  "epoch": 14.51,
915
- "learning_rate": 8.395604395604396e-05,
916
- "loss": 0.2737,
917
  "step": 1320
918
  },
919
  {
920
  "epoch": 14.62,
921
- "learning_rate": 8.307692307692309e-05,
922
- "loss": 0.2092,
923
  "step": 1330
924
  },
925
  {
926
  "epoch": 14.73,
927
- "learning_rate": 8.21978021978022e-05,
928
- "loss": 0.2281,
929
  "step": 1340
930
  },
931
  {
932
  "epoch": 14.84,
933
- "learning_rate": 8.131868131868132e-05,
934
- "loss": 0.2375,
935
  "step": 1350
936
  },
937
  {
938
  "epoch": 14.95,
939
- "learning_rate": 8.043956043956045e-05,
940
- "loss": 0.2062,
941
  "step": 1360
942
  },
943
  {
944
  "epoch": 15.05,
945
- "learning_rate": 7.956043956043956e-05,
946
- "loss": 0.1698,
947
  "step": 1370
948
  },
949
  {
950
  "epoch": 15.16,
951
- "learning_rate": 7.868131868131868e-05,
952
- "loss": 0.1904,
953
  "step": 1380
954
  },
955
  {
956
  "epoch": 15.27,
957
- "learning_rate": 7.780219780219781e-05,
958
- "loss": 0.2469,
959
  "step": 1390
960
  },
961
  {
962
  "epoch": 15.38,
963
- "learning_rate": 7.692307692307693e-05,
964
- "loss": 0.2044,
965
  "step": 1400
966
  },
967
  {
968
  "epoch": 15.38,
969
- "eval_accuracy": 0.7294117647058823,
970
- "eval_loss": 0.8349283337593079,
971
- "eval_runtime": 1.502,
972
- "eval_samples_per_second": 113.184,
973
- "eval_steps_per_second": 14.647,
974
  "step": 1400
975
  },
976
  {
977
  "epoch": 15.49,
978
- "learning_rate": 7.604395604395604e-05,
979
- "loss": 0.2405,
980
  "step": 1410
981
  },
982
  {
983
  "epoch": 15.6,
984
- "learning_rate": 7.516483516483517e-05,
985
- "loss": 0.1326,
986
  "step": 1420
987
  },
988
  {
989
  "epoch": 15.71,
990
- "learning_rate": 7.428571428571429e-05,
991
- "loss": 0.2253,
992
  "step": 1430
993
  },
994
  {
995
  "epoch": 15.82,
996
- "learning_rate": 7.34065934065934e-05,
997
- "loss": 0.2029,
998
  "step": 1440
999
  },
1000
  {
1001
  "epoch": 15.93,
1002
- "learning_rate": 7.252747252747253e-05,
1003
- "loss": 0.1161,
1004
  "step": 1450
1005
  },
1006
  {
1007
  "epoch": 16.04,
1008
- "learning_rate": 7.164835164835165e-05,
1009
- "loss": 0.3051,
1010
  "step": 1460
1011
  },
1012
  {
1013
  "epoch": 16.15,
1014
- "learning_rate": 7.076923076923078e-05,
1015
- "loss": 0.3765,
1016
  "step": 1470
1017
  },
1018
  {
1019
  "epoch": 16.26,
1020
- "learning_rate": 6.98901098901099e-05,
1021
- "loss": 0.1976,
1022
  "step": 1480
1023
  },
1024
  {
1025
  "epoch": 16.37,
1026
- "learning_rate": 6.901098901098901e-05,
1027
- "loss": 0.1887,
1028
  "step": 1490
1029
  },
1030
  {
1031
  "epoch": 16.48,
1032
- "learning_rate": 6.813186813186814e-05,
1033
- "loss": 0.1543,
1034
  "step": 1500
1035
  },
1036
  {
1037
  "epoch": 16.48,
1038
- "eval_accuracy": 0.6941176470588235,
1039
- "eval_loss": 0.8952152132987976,
1040
- "eval_runtime": 1.5318,
1041
- "eval_samples_per_second": 110.978,
1042
- "eval_steps_per_second": 14.362,
1043
  "step": 1500
1044
  },
1045
  {
1046
  "epoch": 16.59,
1047
- "learning_rate": 6.725274725274725e-05,
1048
- "loss": 0.2324,
1049
  "step": 1510
1050
  },
1051
  {
1052
  "epoch": 16.7,
1053
- "learning_rate": 6.637362637362637e-05,
1054
- "loss": 0.1539,
1055
  "step": 1520
1056
  },
1057
  {
1058
  "epoch": 16.81,
1059
- "learning_rate": 6.54945054945055e-05,
1060
- "loss": 0.2305,
1061
  "step": 1530
1062
  },
1063
  {
1064
  "epoch": 16.92,
1065
- "learning_rate": 6.461538461538462e-05,
1066
- "loss": 0.1771,
1067
  "step": 1540
1068
  },
1069
  {
1070
  "epoch": 17.03,
1071
- "learning_rate": 6.373626373626373e-05,
1072
- "loss": 0.1604,
1073
  "step": 1550
1074
  },
1075
  {
1076
  "epoch": 17.14,
1077
- "learning_rate": 6.285714285714286e-05,
1078
- "loss": 0.1297,
1079
  "step": 1560
1080
  },
1081
  {
1082
  "epoch": 17.25,
1083
- "learning_rate": 6.197802197802198e-05,
1084
- "loss": 0.1646,
1085
  "step": 1570
1086
  },
1087
  {
1088
  "epoch": 17.36,
1089
- "learning_rate": 6.10989010989011e-05,
1090
- "loss": 0.1942,
1091
  "step": 1580
1092
  },
1093
  {
1094
  "epoch": 17.47,
1095
- "learning_rate": 6.021978021978022e-05,
1096
- "loss": 0.105,
1097
  "step": 1590
1098
  },
1099
  {
1100
  "epoch": 17.58,
1101
- "learning_rate": 5.9340659340659345e-05,
1102
- "loss": 0.1283,
1103
  "step": 1600
1104
  },
1105
  {
1106
  "epoch": 17.58,
1107
- "eval_accuracy": 0.7352941176470589,
1108
- "eval_loss": 0.8053585290908813,
1109
- "eval_runtime": 1.4888,
1110
- "eval_samples_per_second": 114.184,
1111
- "eval_steps_per_second": 14.777,
1112
  "step": 1600
1113
  },
1114
  {
1115
  "epoch": 17.69,
1116
- "learning_rate": 5.846153846153847e-05,
1117
- "loss": 0.1779,
1118
  "step": 1610
1119
  },
1120
  {
1121
  "epoch": 17.8,
1122
- "learning_rate": 5.7582417582417584e-05,
1123
- "loss": 0.1588,
1124
  "step": 1620
1125
  },
1126
  {
1127
  "epoch": 17.91,
1128
- "learning_rate": 5.6703296703296706e-05,
1129
- "loss": 0.1868,
1130
  "step": 1630
1131
  },
1132
  {
1133
  "epoch": 18.02,
1134
- "learning_rate": 5.582417582417583e-05,
1135
- "loss": 0.1537,
1136
  "step": 1640
1137
  },
1138
  {
1139
  "epoch": 18.13,
1140
- "learning_rate": 5.494505494505495e-05,
1141
- "loss": 0.1426,
1142
  "step": 1650
1143
  },
1144
  {
1145
  "epoch": 18.24,
1146
- "learning_rate": 5.406593406593407e-05,
1147
- "loss": 0.1759,
1148
  "step": 1660
1149
  },
1150
  {
1151
  "epoch": 18.35,
1152
- "learning_rate": 5.318681318681319e-05,
1153
- "loss": 0.1908,
1154
  "step": 1670
1155
  },
1156
  {
1157
  "epoch": 18.46,
1158
- "learning_rate": 5.230769230769231e-05,
1159
- "loss": 0.1562,
1160
  "step": 1680
1161
  },
1162
  {
1163
  "epoch": 18.57,
1164
- "learning_rate": 5.142857142857143e-05,
1165
- "loss": 0.1137,
1166
  "step": 1690
1167
  },
1168
  {
1169
  "epoch": 18.68,
1170
- "learning_rate": 5.054945054945055e-05,
1171
- "loss": 0.1721,
1172
  "step": 1700
1173
  },
1174
  {
1175
  "epoch": 18.68,
1176
- "eval_accuracy": 0.7235294117647059,
1177
- "eval_loss": 0.9094342589378357,
1178
- "eval_runtime": 1.4871,
1179
- "eval_samples_per_second": 114.319,
1180
- "eval_steps_per_second": 14.794,
1181
  "step": 1700
1182
  },
1183
  {
1184
  "epoch": 18.79,
1185
- "learning_rate": 4.9670329670329674e-05,
1186
- "loss": 0.1623,
1187
  "step": 1710
1188
  },
1189
  {
1190
  "epoch": 18.9,
1191
- "learning_rate": 4.879120879120879e-05,
1192
- "loss": 0.1731,
1193
  "step": 1720
1194
  },
1195
  {
1196
  "epoch": 19.01,
1197
- "learning_rate": 4.791208791208792e-05,
1198
- "loss": 0.1458,
1199
  "step": 1730
1200
  },
1201
  {
1202
  "epoch": 19.12,
1203
- "learning_rate": 4.7032967032967035e-05,
1204
- "loss": 0.1717,
1205
  "step": 1740
1206
  },
1207
  {
1208
  "epoch": 19.23,
1209
- "learning_rate": 4.615384615384616e-05,
1210
- "loss": 0.1125,
1211
  "step": 1750
1212
  },
1213
  {
1214
  "epoch": 19.34,
1215
- "learning_rate": 4.5274725274725274e-05,
1216
- "loss": 0.1776,
1217
  "step": 1760
1218
  },
1219
  {
1220
  "epoch": 19.45,
1221
- "learning_rate": 4.43956043956044e-05,
1222
- "loss": 0.1259,
1223
  "step": 1770
1224
  },
1225
  {
1226
  "epoch": 19.56,
1227
- "learning_rate": 4.351648351648352e-05,
1228
- "loss": 0.1294,
1229
  "step": 1780
1230
  },
1231
  {
1232
  "epoch": 19.67,
1233
- "learning_rate": 4.2637362637362635e-05,
1234
- "loss": 0.2037,
1235
  "step": 1790
1236
  },
1237
  {
1238
  "epoch": 19.78,
1239
- "learning_rate": 4.1758241758241765e-05,
1240
- "loss": 0.1509,
1241
  "step": 1800
1242
  },
1243
  {
1244
  "epoch": 19.78,
1245
- "eval_accuracy": 0.7411764705882353,
1246
- "eval_loss": 0.9168078303337097,
1247
- "eval_runtime": 1.5237,
1248
- "eval_samples_per_second": 111.573,
1249
- "eval_steps_per_second": 14.439,
1250
  "step": 1800
1251
  },
1252
  {
1253
  "epoch": 19.89,
1254
- "learning_rate": 4.087912087912088e-05,
1255
- "loss": 0.0973,
1256
  "step": 1810
1257
  },
1258
  {
1259
  "epoch": 20.0,
1260
- "learning_rate": 4e-05,
1261
- "loss": 0.2074,
1262
  "step": 1820
1263
  },
1264
  {
1265
  "epoch": 20.11,
1266
- "learning_rate": 3.912087912087912e-05,
1267
- "loss": 0.185,
1268
  "step": 1830
1269
  },
1270
  {
1271
  "epoch": 20.22,
1272
- "learning_rate": 3.824175824175824e-05,
1273
- "loss": 0.0895,
1274
  "step": 1840
1275
  },
1276
  {
1277
  "epoch": 20.33,
1278
- "learning_rate": 3.7362637362637365e-05,
1279
- "loss": 0.1081,
1280
  "step": 1850
1281
  },
1282
  {
1283
  "epoch": 20.44,
1284
- "learning_rate": 3.648351648351648e-05,
1285
- "loss": 0.169,
1286
  "step": 1860
1287
  },
1288
  {
1289
  "epoch": 20.55,
1290
- "learning_rate": 3.560439560439561e-05,
1291
- "loss": 0.1601,
1292
  "step": 1870
1293
  },
1294
  {
1295
  "epoch": 20.66,
1296
- "learning_rate": 3.4725274725274726e-05,
1297
- "loss": 0.0853,
1298
  "step": 1880
1299
  },
1300
  {
1301
  "epoch": 20.77,
1302
- "learning_rate": 3.384615384615385e-05,
1303
- "loss": 0.1067,
1304
  "step": 1890
1305
  },
1306
  {
1307
  "epoch": 20.88,
1308
- "learning_rate": 3.296703296703297e-05,
1309
- "loss": 0.1257,
1310
  "step": 1900
1311
  },
1312
  {
1313
  "epoch": 20.88,
1314
- "eval_accuracy": 0.7411764705882353,
1315
- "eval_loss": 0.939464807510376,
1316
- "eval_runtime": 1.4981,
1317
- "eval_samples_per_second": 113.476,
1318
- "eval_steps_per_second": 14.685,
1319
  "step": 1900
1320
  },
1321
  {
1322
  "epoch": 20.99,
1323
- "learning_rate": 3.2087912087912094e-05,
1324
- "loss": 0.2041,
1325
  "step": 1910
1326
  },
1327
  {
1328
  "epoch": 21.1,
1329
- "learning_rate": 3.120879120879121e-05,
1330
- "loss": 0.144,
1331
  "step": 1920
1332
  },
1333
  {
1334
  "epoch": 21.21,
1335
- "learning_rate": 3.032967032967033e-05,
1336
- "loss": 0.0677,
1337
  "step": 1930
1338
  },
1339
  {
1340
  "epoch": 21.32,
1341
- "learning_rate": 2.945054945054945e-05,
1342
- "loss": 0.1195,
1343
  "step": 1940
1344
  },
1345
  {
1346
  "epoch": 21.43,
1347
- "learning_rate": 2.857142857142857e-05,
1348
- "loss": 0.1599,
1349
  "step": 1950
1350
  },
1351
  {
1352
  "epoch": 21.54,
1353
- "learning_rate": 2.7692307692307694e-05,
1354
- "loss": 0.0681,
1355
  "step": 1960
1356
  },
1357
  {
1358
  "epoch": 21.65,
1359
- "learning_rate": 2.6813186813186813e-05,
1360
- "loss": 0.0854,
1361
  "step": 1970
1362
  },
1363
  {
1364
  "epoch": 21.76,
1365
- "learning_rate": 2.593406593406594e-05,
1366
- "loss": 0.1481,
1367
  "step": 1980
1368
  },
1369
  {
1370
  "epoch": 21.87,
1371
- "learning_rate": 2.5054945054945055e-05,
1372
- "loss": 0.0927,
1373
  "step": 1990
1374
  },
1375
  {
1376
  "epoch": 21.98,
1377
- "learning_rate": 2.4175824175824177e-05,
1378
- "loss": 0.1747,
1379
  "step": 2000
1380
  },
1381
  {
1382
  "epoch": 21.98,
1383
- "eval_accuracy": 0.7470588235294118,
1384
- "eval_loss": 0.8745805025100708,
1385
- "eval_runtime": 1.5068,
1386
- "eval_samples_per_second": 112.823,
1387
- "eval_steps_per_second": 14.601,
1388
  "step": 2000
1389
  },
1390
  {
1391
  "epoch": 22.09,
1392
- "learning_rate": 2.3296703296703297e-05,
1393
- "loss": 0.0698,
1394
  "step": 2010
1395
  },
1396
  {
1397
  "epoch": 22.2,
1398
- "learning_rate": 2.241758241758242e-05,
1399
- "loss": 0.0488,
1400
  "step": 2020
1401
  },
1402
  {
1403
  "epoch": 22.31,
1404
- "learning_rate": 2.1538461538461542e-05,
1405
- "loss": 0.0583,
1406
  "step": 2030
1407
  },
1408
  {
1409
  "epoch": 22.42,
1410
- "learning_rate": 2.0659340659340658e-05,
1411
- "loss": 0.1999,
1412
  "step": 2040
1413
  },
1414
  {
1415
  "epoch": 22.53,
1416
- "learning_rate": 1.978021978021978e-05,
1417
- "loss": 0.0723,
1418
  "step": 2050
1419
  },
1420
  {
1421
  "epoch": 22.64,
1422
- "learning_rate": 1.89010989010989e-05,
1423
- "loss": 0.1596,
1424
  "step": 2060
1425
  },
1426
  {
1427
  "epoch": 22.75,
1428
- "learning_rate": 1.8021978021978023e-05,
1429
- "loss": 0.1527,
1430
  "step": 2070
1431
  },
1432
  {
1433
  "epoch": 22.86,
1434
- "learning_rate": 1.7142857142857145e-05,
1435
- "loss": 0.1554,
1436
  "step": 2080
1437
  },
1438
  {
1439
  "epoch": 22.97,
1440
- "learning_rate": 1.6263736263736265e-05,
1441
- "loss": 0.1464,
1442
  "step": 2090
1443
  },
1444
  {
1445
  "epoch": 23.08,
1446
- "learning_rate": 1.5384615384615387e-05,
1447
- "loss": 0.1506,
1448
  "step": 2100
1449
  },
1450
  {
1451
  "epoch": 23.08,
1452
- "eval_accuracy": 0.7352941176470589,
1453
- "eval_loss": 0.7992474436759949,
1454
- "eval_runtime": 1.5262,
1455
- "eval_samples_per_second": 111.385,
1456
- "eval_steps_per_second": 14.415,
1457
  "step": 2100
1458
  },
1459
  {
1460
  "epoch": 23.19,
1461
- "learning_rate": 1.4505494505494508e-05,
1462
- "loss": 0.1008,
1463
  "step": 2110
1464
  },
1465
  {
1466
  "epoch": 23.3,
1467
- "learning_rate": 1.3626373626373626e-05,
1468
- "loss": 0.0949,
1469
  "step": 2120
1470
  },
1471
  {
1472
  "epoch": 23.41,
1473
- "learning_rate": 1.2747252747252747e-05,
1474
- "loss": 0.09,
1475
  "step": 2130
1476
  },
1477
  {
1478
  "epoch": 23.52,
1479
- "learning_rate": 1.1868131868131868e-05,
1480
- "loss": 0.1631,
1481
  "step": 2140
1482
  },
1483
  {
1484
  "epoch": 23.63,
1485
- "learning_rate": 1.0989010989010989e-05,
1486
- "loss": 0.0929,
1487
  "step": 2150
1488
  },
1489
  {
1490
  "epoch": 23.74,
1491
- "learning_rate": 1.0109890109890111e-05,
1492
- "loss": 0.1425,
1493
  "step": 2160
1494
  },
1495
  {
1496
  "epoch": 23.85,
1497
- "learning_rate": 9.230769230769232e-06,
1498
- "loss": 0.1187,
1499
  "step": 2170
1500
  },
1501
  {
1502
  "epoch": 23.96,
1503
- "learning_rate": 8.351648351648352e-06,
1504
- "loss": 0.1834,
1505
  "step": 2180
1506
  },
1507
  {
1508
  "epoch": 24.07,
1509
- "learning_rate": 7.4725274725274726e-06,
1510
- "loss": 0.1157,
1511
  "step": 2190
1512
  },
1513
  {
1514
  "epoch": 24.18,
1515
- "learning_rate": 6.5934065934065935e-06,
1516
- "loss": 0.1021,
1517
  "step": 2200
1518
  },
1519
  {
1520
  "epoch": 24.18,
1521
- "eval_accuracy": 0.7705882352941177,
1522
- "eval_loss": 0.7445889711380005,
1523
- "eval_runtime": 1.5168,
1524
- "eval_samples_per_second": 112.082,
1525
- "eval_steps_per_second": 14.505,
1526
  "step": 2200
1527
  },
1528
  {
1529
  "epoch": 24.29,
1530
- "learning_rate": 5.7142857142857145e-06,
1531
- "loss": 0.1826,
1532
  "step": 2210
1533
  },
1534
  {
1535
  "epoch": 24.4,
1536
- "learning_rate": 4.8351648351648355e-06,
1537
- "loss": 0.1807,
1538
  "step": 2220
1539
  },
1540
  {
1541
  "epoch": 24.51,
1542
- "learning_rate": 3.9560439560439565e-06,
1543
- "loss": 0.0946,
1544
  "step": 2230
1545
  },
1546
  {
1547
  "epoch": 24.62,
1548
- "learning_rate": 3.0769230769230774e-06,
1549
- "loss": 0.1039,
1550
  "step": 2240
1551
  },
1552
  {
1553
  "epoch": 24.73,
1554
- "learning_rate": 2.197802197802198e-06,
1555
- "loss": 0.1729,
1556
  "step": 2250
1557
  },
1558
  {
1559
  "epoch": 24.84,
1560
- "learning_rate": 1.3186813186813187e-06,
1561
- "loss": 0.1052,
1562
  "step": 2260
1563
  },
1564
  {
1565
  "epoch": 24.95,
1566
- "learning_rate": 4.3956043956043957e-07,
1567
- "loss": 0.1003,
1568
  "step": 2270
1569
  },
1570
  {
1571
- "epoch": 25.0,
1572
- "step": 2275,
1573
- "total_flos": 2.8110723118700544e+18,
1574
- "train_loss": 0.38098634693648786,
1575
- "train_runtime": 883.423,
1576
- "train_samples_per_second": 41.062,
1577
- "train_steps_per_second": 2.575
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1578
  }
1579
  ],
1580
- "max_steps": 2275,
1581
- "num_train_epochs": 25,
1582
- "total_flos": 2.8110723118700544e+18,
1583
  "trial_name": null,
1584
  "trial_params": null
1585
  }
 
1
  {
2
+ "best_metric": 0.6692019104957581,
3
+ "best_model_checkpoint": "./croupier-creature-classifier/checkpoint-2000",
4
+ "epoch": 50.0,
5
+ "global_step": 4550,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.11,
12
+ "learning_rate": 2.9934065934065934e-05,
13
+ "loss": 1.3796,
14
  "step": 10
15
  },
16
  {
17
  "epoch": 0.22,
18
+ "learning_rate": 2.9868131868131868e-05,
19
+ "loss": 1.3597,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 0.33,
24
+ "learning_rate": 2.9802197802197805e-05,
25
+ "loss": 1.3427,
26
  "step": 30
27
  },
28
  {
29
  "epoch": 0.44,
30
+ "learning_rate": 2.973626373626374e-05,
31
+ "loss": 1.3331,
32
  "step": 40
33
  },
34
  {
35
  "epoch": 0.55,
36
+ "learning_rate": 2.9670329670329673e-05,
37
+ "loss": 1.275,
38
  "step": 50
39
  },
40
  {
41
  "epoch": 0.66,
42
+ "learning_rate": 2.9604395604395606e-05,
43
+ "loss": 1.2765,
44
  "step": 60
45
  },
46
  {
47
  "epoch": 0.77,
48
+ "learning_rate": 2.953846153846154e-05,
49
+ "loss": 1.2306,
50
  "step": 70
51
  },
52
  {
53
  "epoch": 0.88,
54
+ "learning_rate": 2.947252747252747e-05,
55
+ "loss": 1.2621,
56
  "step": 80
57
  },
58
  {
59
  "epoch": 0.99,
60
+ "learning_rate": 2.9406593406593407e-05,
61
+ "loss": 1.1997,
62
  "step": 90
63
  },
64
  {
65
  "epoch": 1.1,
66
+ "learning_rate": 2.934065934065934e-05,
67
+ "loss": 1.1159,
68
  "step": 100
69
  },
70
  {
71
  "epoch": 1.1,
72
+ "eval_accuracy": 0.611764705882353,
73
+ "eval_loss": 1.1144014596939087,
74
+ "eval_runtime": 1.5221,
75
+ "eval_samples_per_second": 111.687,
76
+ "eval_steps_per_second": 14.454,
77
  "step": 100
78
  },
79
  {
80
  "epoch": 1.21,
81
+ "learning_rate": 2.9274725274725275e-05,
82
+ "loss": 1.0839,
83
  "step": 110
84
  },
85
  {
86
  "epoch": 1.32,
87
+ "learning_rate": 2.920879120879121e-05,
88
+ "loss": 1.0689,
89
  "step": 120
90
  },
91
  {
92
  "epoch": 1.43,
93
+ "learning_rate": 2.9142857142857142e-05,
94
+ "loss": 0.9898,
95
  "step": 130
96
  },
97
  {
98
  "epoch": 1.54,
99
+ "learning_rate": 2.907692307692308e-05,
100
+ "loss": 0.9495,
101
  "step": 140
102
  },
103
  {
104
  "epoch": 1.65,
105
+ "learning_rate": 2.9010989010989013e-05,
106
+ "loss": 0.9943,
107
  "step": 150
108
  },
109
  {
110
  "epoch": 1.76,
111
+ "learning_rate": 2.8945054945054947e-05,
112
+ "loss": 0.98,
113
  "step": 160
114
  },
115
  {
116
  "epoch": 1.87,
117
+ "learning_rate": 2.887912087912088e-05,
118
+ "loss": 0.9986,
119
  "step": 170
120
  },
121
  {
122
  "epoch": 1.98,
123
+ "learning_rate": 2.8813186813186814e-05,
124
+ "loss": 0.9522,
125
  "step": 180
126
  },
127
  {
128
  "epoch": 2.09,
129
+ "learning_rate": 2.8747252747252748e-05,
130
+ "loss": 0.8533,
131
  "step": 190
132
  },
133
  {
134
  "epoch": 2.2,
135
+ "learning_rate": 2.8681318681318685e-05,
136
+ "loss": 0.8183,
137
  "step": 200
138
  },
139
  {
140
  "epoch": 2.2,
141
+ "eval_accuracy": 0.6882352941176471,
142
+ "eval_loss": 0.9109101891517639,
143
+ "eval_runtime": 1.6142,
144
+ "eval_samples_per_second": 105.317,
145
+ "eval_steps_per_second": 13.629,
146
  "step": 200
147
  },
148
  {
149
  "epoch": 2.31,
150
+ "learning_rate": 2.861538461538462e-05,
151
+ "loss": 0.7843,
152
  "step": 210
153
  },
154
  {
155
  "epoch": 2.42,
156
+ "learning_rate": 2.854945054945055e-05,
157
+ "loss": 0.8093,
158
  "step": 220
159
  },
160
  {
161
  "epoch": 2.53,
162
+ "learning_rate": 2.8483516483516482e-05,
163
+ "loss": 0.8541,
164
  "step": 230
165
  },
166
  {
167
  "epoch": 2.64,
168
+ "learning_rate": 2.8417582417582416e-05,
169
+ "loss": 0.7405,
170
  "step": 240
171
  },
172
  {
173
  "epoch": 2.75,
174
+ "learning_rate": 2.8351648351648353e-05,
175
+ "loss": 0.7858,
176
  "step": 250
177
  },
178
  {
179
  "epoch": 2.86,
180
+ "learning_rate": 2.8285714285714287e-05,
181
+ "loss": 0.7966,
182
  "step": 260
183
  },
184
  {
185
  "epoch": 2.97,
186
+ "learning_rate": 2.821978021978022e-05,
187
+ "loss": 0.695,
188
  "step": 270
189
  },
190
  {
191
  "epoch": 3.08,
192
+ "learning_rate": 2.8153846153846154e-05,
193
+ "loss": 0.6649,
194
  "step": 280
195
  },
196
  {
197
  "epoch": 3.19,
198
+ "learning_rate": 2.8087912087912088e-05,
199
+ "loss": 0.605,
200
  "step": 290
201
  },
202
  {
203
  "epoch": 3.3,
204
+ "learning_rate": 2.802197802197802e-05,
205
+ "loss": 0.6829,
206
  "step": 300
207
  },
208
  {
209
  "epoch": 3.3,
210
+ "eval_accuracy": 0.7235294117647059,
211
+ "eval_loss": 0.7676671743392944,
212
+ "eval_runtime": 1.6247,
213
+ "eval_samples_per_second": 104.633,
214
+ "eval_steps_per_second": 13.541,
215
  "step": 300
216
  },
217
  {
218
  "epoch": 3.41,
219
+ "learning_rate": 2.795604395604396e-05,
220
+ "loss": 0.5722,
221
  "step": 310
222
  },
223
  {
224
  "epoch": 3.52,
225
+ "learning_rate": 2.7890109890109892e-05,
226
+ "loss": 0.654,
227
  "step": 320
228
  },
229
  {
230
  "epoch": 3.63,
231
+ "learning_rate": 2.7824175824175826e-05,
232
+ "loss": 0.7001,
233
  "step": 330
234
  },
235
  {
236
  "epoch": 3.74,
237
+ "learning_rate": 2.775824175824176e-05,
238
+ "loss": 0.5804,
239
  "step": 340
240
  },
241
  {
242
  "epoch": 3.85,
243
+ "learning_rate": 2.7692307692307694e-05,
244
+ "loss": 0.6638,
245
  "step": 350
246
  },
247
  {
248
  "epoch": 3.96,
249
+ "learning_rate": 2.7626373626373624e-05,
250
+ "loss": 0.6668,
251
  "step": 360
252
  },
253
  {
254
  "epoch": 4.07,
255
+ "learning_rate": 2.756043956043956e-05,
256
+ "loss": 0.5599,
257
  "step": 370
258
  },
259
  {
260
  "epoch": 4.18,
261
+ "learning_rate": 2.7494505494505495e-05,
262
+ "loss": 0.5317,
263
  "step": 380
264
  },
265
  {
266
  "epoch": 4.29,
267
+ "learning_rate": 2.7428571428571428e-05,
268
+ "loss": 0.5403,
269
  "step": 390
270
  },
271
  {
272
  "epoch": 4.4,
273
+ "learning_rate": 2.7362637362637362e-05,
274
+ "loss": 0.5575,
275
  "step": 400
276
  },
277
  {
278
  "epoch": 4.4,
279
+ "eval_accuracy": 0.6764705882352942,
280
+ "eval_loss": 0.7670463919639587,
281
+ "eval_runtime": 1.6237,
282
+ "eval_samples_per_second": 104.698,
283
+ "eval_steps_per_second": 13.549,
284
  "step": 400
285
  },
286
  {
287
  "epoch": 4.51,
288
+ "learning_rate": 2.7296703296703296e-05,
289
+ "loss": 0.5018,
290
  "step": 410
291
  },
292
  {
293
  "epoch": 4.62,
294
+ "learning_rate": 2.7230769230769233e-05,
295
+ "loss": 0.5715,
296
  "step": 420
297
  },
298
  {
299
  "epoch": 4.73,
300
+ "learning_rate": 2.7164835164835166e-05,
301
+ "loss": 0.4884,
302
  "step": 430
303
  },
304
  {
305
  "epoch": 4.84,
306
+ "learning_rate": 2.7105494505494504e-05,
307
+ "loss": 0.5533,
308
  "step": 440
309
  },
310
  {
311
  "epoch": 4.95,
312
+ "learning_rate": 2.703956043956044e-05,
313
+ "loss": 0.5183,
314
  "step": 450
315
  },
316
  {
317
  "epoch": 5.05,
318
+ "learning_rate": 2.6973626373626375e-05,
319
+ "loss": 0.4451,
320
  "step": 460
321
  },
322
  {
323
  "epoch": 5.16,
324
+ "learning_rate": 2.690769230769231e-05,
325
+ "loss": 0.4605,
326
  "step": 470
327
  },
328
  {
329
  "epoch": 5.27,
330
+ "learning_rate": 2.6841758241758243e-05,
331
+ "loss": 0.4289,
332
  "step": 480
333
  },
334
  {
335
  "epoch": 5.38,
336
+ "learning_rate": 2.6775824175824176e-05,
337
+ "loss": 0.4433,
338
  "step": 490
339
  },
340
  {
341
  "epoch": 5.49,
342
+ "learning_rate": 2.670989010989011e-05,
343
+ "loss": 0.4644,
344
  "step": 500
345
  },
346
  {
347
  "epoch": 5.49,
348
+ "eval_accuracy": 0.6647058823529411,
349
+ "eval_loss": 0.8459996581077576,
350
+ "eval_runtime": 1.6386,
351
+ "eval_samples_per_second": 103.747,
352
+ "eval_steps_per_second": 13.426,
353
  "step": 500
354
  },
355
  {
356
  "epoch": 5.6,
357
+ "learning_rate": 2.6643956043956047e-05,
358
+ "loss": 0.4732,
359
  "step": 510
360
  },
361
  {
362
  "epoch": 5.71,
363
+ "learning_rate": 2.657802197802198e-05,
364
+ "loss": 0.4939,
365
  "step": 520
366
  },
367
  {
368
  "epoch": 5.82,
369
+ "learning_rate": 2.651208791208791e-05,
370
+ "loss": 0.4209,
371
  "step": 530
372
  },
373
  {
374
  "epoch": 5.93,
375
+ "learning_rate": 2.6446153846153845e-05,
376
+ "loss": 0.3783,
377
  "step": 540
378
  },
379
  {
380
  "epoch": 6.04,
381
+ "learning_rate": 2.638021978021978e-05,
382
+ "loss": 0.4665,
383
  "step": 550
384
  },
385
  {
386
  "epoch": 6.15,
387
+ "learning_rate": 2.6314285714285715e-05,
388
+ "loss": 0.2572,
389
  "step": 560
390
  },
391
  {
392
  "epoch": 6.26,
393
+ "learning_rate": 2.624835164835165e-05,
394
+ "loss": 0.3752,
395
  "step": 570
396
  },
397
  {
398
  "epoch": 6.37,
399
+ "learning_rate": 2.6182417582417583e-05,
400
+ "loss": 0.2943,
401
  "step": 580
402
  },
403
  {
404
  "epoch": 6.48,
405
+ "learning_rate": 2.6116483516483517e-05,
406
+ "loss": 0.3949,
407
  "step": 590
408
  },
409
  {
410
  "epoch": 6.59,
411
+ "learning_rate": 2.605054945054945e-05,
412
+ "loss": 0.3096,
413
  "step": 600
414
  },
415
  {
416
  "epoch": 6.59,
417
+ "eval_accuracy": 0.7529411764705882,
418
+ "eval_loss": 0.7081632614135742,
419
+ "eval_runtime": 1.6198,
420
+ "eval_samples_per_second": 104.953,
421
+ "eval_steps_per_second": 13.582,
422
  "step": 600
423
  },
424
  {
425
  "epoch": 6.7,
426
+ "learning_rate": 2.5984615384615384e-05,
427
+ "loss": 0.4314,
428
  "step": 610
429
  },
430
  {
431
  "epoch": 6.81,
432
+ "learning_rate": 2.591868131868132e-05,
433
+ "loss": 0.3389,
434
  "step": 620
435
  },
436
  {
437
  "epoch": 6.92,
438
+ "learning_rate": 2.5852747252747255e-05,
439
+ "loss": 0.399,
440
  "step": 630
441
  },
442
  {
443
  "epoch": 7.03,
444
+ "learning_rate": 2.578681318681319e-05,
445
+ "loss": 0.3425,
446
  "step": 640
447
  },
448
  {
449
  "epoch": 7.14,
450
+ "learning_rate": 2.5720879120879122e-05,
451
+ "loss": 0.3395,
452
  "step": 650
453
  },
454
  {
455
  "epoch": 7.25,
456
+ "learning_rate": 2.5654945054945056e-05,
457
+ "loss": 0.3558,
458
  "step": 660
459
  },
460
  {
461
  "epoch": 7.36,
462
+ "learning_rate": 2.558901098901099e-05,
463
+ "loss": 0.3773,
464
  "step": 670
465
  },
466
  {
467
  "epoch": 7.47,
468
+ "learning_rate": 2.5523076923076923e-05,
469
+ "loss": 0.3493,
470
  "step": 680
471
  },
472
  {
473
  "epoch": 7.58,
474
+ "learning_rate": 2.5457142857142857e-05,
475
+ "loss": 0.402,
476
  "step": 690
477
  },
478
  {
479
  "epoch": 7.69,
480
+ "learning_rate": 2.539120879120879e-05,
481
+ "loss": 0.305,
482
  "step": 700
483
  },
484
  {
485
  "epoch": 7.69,
486
+ "eval_accuracy": 0.7647058823529411,
487
+ "eval_loss": 0.693938136100769,
488
+ "eval_runtime": 1.7847,
489
+ "eval_samples_per_second": 95.254,
490
+ "eval_steps_per_second": 12.327,
491
  "step": 700
492
  },
493
  {
494
  "epoch": 7.8,
495
+ "learning_rate": 2.5325274725274724e-05,
496
+ "loss": 0.3702,
497
  "step": 710
498
  },
499
  {
500
  "epoch": 7.91,
501
+ "learning_rate": 2.5259340659340658e-05,
502
+ "loss": 0.374,
503
  "step": 720
504
  },
505
  {
506
  "epoch": 8.02,
507
+ "learning_rate": 2.5193406593406595e-05,
508
+ "loss": 0.3374,
509
  "step": 730
510
  },
511
  {
512
  "epoch": 8.13,
513
+ "learning_rate": 2.512747252747253e-05,
514
+ "loss": 0.2479,
515
  "step": 740
516
  },
517
  {
518
  "epoch": 8.24,
519
+ "learning_rate": 2.5061538461538462e-05,
520
+ "loss": 0.3161,
521
  "step": 750
522
  },
523
  {
524
  "epoch": 8.35,
525
+ "learning_rate": 2.4995604395604396e-05,
526
+ "loss": 0.3229,
527
  "step": 760
528
  },
529
  {
530
  "epoch": 8.46,
531
+ "learning_rate": 2.492967032967033e-05,
532
+ "loss": 0.2871,
533
  "step": 770
534
  },
535
  {
536
  "epoch": 8.57,
537
+ "learning_rate": 2.4863736263736267e-05,
538
+ "loss": 0.3116,
539
  "step": 780
540
  },
541
  {
542
  "epoch": 8.68,
543
+ "learning_rate": 2.47978021978022e-05,
544
+ "loss": 0.3632,
545
  "step": 790
546
  },
547
  {
548
  "epoch": 8.79,
549
+ "learning_rate": 2.4731868131868134e-05,
550
+ "loss": 0.3349,
551
  "step": 800
552
  },
553
  {
554
  "epoch": 8.79,
555
+ "eval_accuracy": 0.7235294117647059,
556
+ "eval_loss": 0.7285170555114746,
557
+ "eval_runtime": 1.7358,
558
+ "eval_samples_per_second": 97.94,
559
+ "eval_steps_per_second": 12.675,
560
  "step": 800
561
  },
562
  {
563
  "epoch": 8.9,
564
+ "learning_rate": 2.4665934065934068e-05,
565
+ "loss": 0.3465,
566
  "step": 810
567
  },
568
  {
569
  "epoch": 9.01,
570
+ "learning_rate": 2.4599999999999998e-05,
571
+ "loss": 0.2579,
572
  "step": 820
573
  },
574
  {
575
  "epoch": 9.12,
576
+ "learning_rate": 2.4534065934065932e-05,
577
+ "loss": 0.324,
578
  "step": 830
579
  },
580
  {
581
  "epoch": 9.23,
582
+ "learning_rate": 2.446813186813187e-05,
583
+ "loss": 0.3192,
584
  "step": 840
585
  },
586
  {
587
  "epoch": 9.34,
588
+ "learning_rate": 2.4402197802197803e-05,
589
+ "loss": 0.2076,
590
  "step": 850
591
  },
592
  {
593
  "epoch": 9.45,
594
+ "learning_rate": 2.4336263736263736e-05,
595
+ "loss": 0.2456,
596
  "step": 860
597
  },
598
  {
599
  "epoch": 9.56,
600
+ "learning_rate": 2.427032967032967e-05,
601
+ "loss": 0.3255,
602
  "step": 870
603
  },
604
  {
605
  "epoch": 9.67,
606
+ "learning_rate": 2.4204395604395604e-05,
607
+ "loss": 0.3601,
608
  "step": 880
609
  },
610
  {
611
  "epoch": 9.78,
612
+ "learning_rate": 2.413846153846154e-05,
613
+ "loss": 0.1986,
614
  "step": 890
615
  },
616
  {
617
  "epoch": 9.89,
618
+ "learning_rate": 2.4072527472527475e-05,
619
+ "loss": 0.36,
620
  "step": 900
621
  },
622
  {
623
  "epoch": 9.89,
624
+ "eval_accuracy": 0.7294117647058823,
625
+ "eval_loss": 0.7664376497268677,
626
+ "eval_runtime": 1.6387,
627
+ "eval_samples_per_second": 103.742,
628
+ "eval_steps_per_second": 13.425,
629
  "step": 900
630
  },
631
  {
632
  "epoch": 10.0,
633
+ "learning_rate": 2.4006593406593408e-05,
634
+ "loss": 0.259,
635
  "step": 910
636
  },
637
  {
638
  "epoch": 10.11,
639
+ "learning_rate": 2.3940659340659342e-05,
640
+ "loss": 0.2984,
641
  "step": 920
642
  },
643
  {
644
  "epoch": 10.22,
645
+ "learning_rate": 2.3874725274725276e-05,
646
+ "loss": 0.2073,
647
  "step": 930
648
  },
649
  {
650
  "epoch": 10.33,
651
+ "learning_rate": 2.380879120879121e-05,
652
+ "loss": 0.3265,
653
  "step": 940
654
  },
655
  {
656
  "epoch": 10.44,
657
+ "learning_rate": 2.3742857142857146e-05,
658
+ "loss": 0.3372,
659
  "step": 950
660
  },
661
  {
662
  "epoch": 10.55,
663
+ "learning_rate": 2.3676923076923077e-05,
664
+ "loss": 0.3518,
665
  "step": 960
666
  },
667
  {
668
  "epoch": 10.66,
669
+ "learning_rate": 2.361098901098901e-05,
670
+ "loss": 0.2707,
671
  "step": 970
672
  },
673
  {
674
  "epoch": 10.77,
675
+ "learning_rate": 2.3545054945054944e-05,
676
+ "loss": 0.2529,
677
  "step": 980
678
  },
679
  {
680
  "epoch": 10.88,
681
+ "learning_rate": 2.3479120879120878e-05,
682
+ "loss": 0.258,
683
  "step": 990
684
  },
685
  {
686
  "epoch": 10.99,
687
+ "learning_rate": 2.341978021978022e-05,
688
+ "loss": 0.3184,
689
  "step": 1000
690
  },
691
  {
692
  "epoch": 10.99,
693
+ "eval_accuracy": 0.7588235294117647,
694
+ "eval_loss": 0.6806656718254089,
695
+ "eval_runtime": 1.6209,
696
+ "eval_samples_per_second": 104.882,
697
+ "eval_steps_per_second": 13.573,
698
  "step": 1000
699
  },
700
  {
701
  "epoch": 11.1,
702
+ "learning_rate": 2.3353846153846153e-05,
703
+ "loss": 0.2439,
704
  "step": 1010
705
  },
706
  {
707
  "epoch": 11.21,
708
+ "learning_rate": 2.3287912087912087e-05,
709
+ "loss": 0.1827,
710
  "step": 1020
711
  },
712
  {
713
  "epoch": 11.32,
714
+ "learning_rate": 2.3221978021978024e-05,
715
+ "loss": 0.2845,
716
  "step": 1030
717
  },
718
  {
719
  "epoch": 11.43,
720
+ "learning_rate": 2.3156043956043957e-05,
721
+ "loss": 0.2503,
722
  "step": 1040
723
  },
724
  {
725
  "epoch": 11.54,
726
+ "learning_rate": 2.309010989010989e-05,
727
+ "loss": 0.2009,
728
  "step": 1050
729
  },
730
  {
731
  "epoch": 11.65,
732
+ "learning_rate": 2.3024175824175825e-05,
733
+ "loss": 0.2486,
734
  "step": 1060
735
  },
736
  {
737
  "epoch": 11.76,
738
+ "learning_rate": 2.295824175824176e-05,
739
+ "loss": 0.1877,
740
  "step": 1070
741
  },
742
  {
743
  "epoch": 11.87,
744
+ "learning_rate": 2.2892307692307692e-05,
745
+ "loss": 0.3497,
746
  "step": 1080
747
  },
748
  {
749
  "epoch": 11.98,
750
+ "learning_rate": 2.282637362637363e-05,
751
+ "loss": 0.2559,
752
  "step": 1090
753
  },
754
  {
755
  "epoch": 12.09,
756
+ "learning_rate": 2.2760439560439563e-05,
757
+ "loss": 0.2815,
758
  "step": 1100
759
  },
760
  {
761
  "epoch": 12.09,
762
+ "eval_accuracy": 0.7352941176470589,
763
+ "eval_loss": 0.7407693266868591,
764
+ "eval_runtime": 1.6974,
765
+ "eval_samples_per_second": 100.156,
766
+ "eval_steps_per_second": 12.961,
767
  "step": 1100
768
  },
769
  {
770
  "epoch": 12.2,
771
+ "learning_rate": 2.2694505494505497e-05,
772
+ "loss": 0.2352,
773
  "step": 1110
774
  },
775
  {
776
  "epoch": 12.31,
777
+ "learning_rate": 2.262857142857143e-05,
778
+ "loss": 0.2422,
779
  "step": 1120
780
  },
781
  {
782
  "epoch": 12.42,
783
+ "learning_rate": 2.256263736263736e-05,
784
+ "loss": 0.2287,
785
  "step": 1130
786
  },
787
  {
788
  "epoch": 12.53,
789
+ "learning_rate": 2.2496703296703298e-05,
790
+ "loss": 0.282,
791
  "step": 1140
792
  },
793
  {
794
  "epoch": 12.64,
795
+ "learning_rate": 2.243076923076923e-05,
796
+ "loss": 0.2579,
797
  "step": 1150
798
  },
799
  {
800
  "epoch": 12.75,
801
+ "learning_rate": 2.2364835164835165e-05,
802
+ "loss": 0.2907,
803
  "step": 1160
804
  },
805
  {
806
  "epoch": 12.86,
807
+ "learning_rate": 2.22989010989011e-05,
808
+ "loss": 0.3221,
809
  "step": 1170
810
  },
811
  {
812
  "epoch": 12.97,
813
+ "learning_rate": 2.2232967032967032e-05,
814
+ "loss": 0.2657,
815
  "step": 1180
816
  },
817
  {
818
  "epoch": 13.08,
819
+ "learning_rate": 2.2167032967032966e-05,
820
+ "loss": 0.2456,
821
  "step": 1190
822
  },
823
  {
824
  "epoch": 13.19,
825
+ "learning_rate": 2.2101098901098903e-05,
826
+ "loss": 0.1745,
827
  "step": 1200
828
  },
829
  {
830
  "epoch": 13.19,
831
+ "eval_accuracy": 0.7294117647058823,
832
+ "eval_loss": 0.7527948617935181,
833
+ "eval_runtime": 1.6243,
834
+ "eval_samples_per_second": 104.661,
835
+ "eval_steps_per_second": 13.544,
836
  "step": 1200
837
  },
838
  {
839
  "epoch": 13.3,
840
+ "learning_rate": 2.2035164835164837e-05,
841
+ "loss": 0.202,
842
  "step": 1210
843
  },
844
  {
845
  "epoch": 13.41,
846
+ "learning_rate": 2.196923076923077e-05,
847
+ "loss": 0.1601,
848
  "step": 1220
849
  },
850
  {
851
  "epoch": 13.52,
852
+ "learning_rate": 2.1903296703296704e-05,
853
+ "loss": 0.1467,
854
  "step": 1230
855
  },
856
  {
857
  "epoch": 13.63,
858
+ "learning_rate": 2.1837362637362638e-05,
859
+ "loss": 0.287,
860
  "step": 1240
861
  },
862
  {
863
  "epoch": 13.74,
864
+ "learning_rate": 2.177142857142857e-05,
865
+ "loss": 0.2805,
866
  "step": 1250
867
  },
868
  {
869
  "epoch": 13.85,
870
+ "learning_rate": 2.170549450549451e-05,
871
+ "loss": 0.2405,
872
  "step": 1260
873
  },
874
  {
875
  "epoch": 13.96,
876
+ "learning_rate": 2.163956043956044e-05,
877
+ "loss": 0.3035,
878
  "step": 1270
879
  },
880
  {
881
  "epoch": 14.07,
882
+ "learning_rate": 2.1573626373626373e-05,
883
+ "loss": 0.2273,
884
  "step": 1280
885
  },
886
  {
887
  "epoch": 14.18,
888
+ "learning_rate": 2.1507692307692306e-05,
889
+ "loss": 0.1933,
890
  "step": 1290
891
  },
892
  {
893
  "epoch": 14.29,
894
+ "learning_rate": 2.144175824175824e-05,
895
+ "loss": 0.1894,
896
  "step": 1300
897
  },
898
  {
899
  "epoch": 14.29,
900
+ "eval_accuracy": 0.7470588235294118,
901
+ "eval_loss": 0.7634099721908569,
902
+ "eval_runtime": 1.6355,
903
+ "eval_samples_per_second": 103.947,
904
+ "eval_steps_per_second": 13.452,
905
  "step": 1300
906
  },
907
  {
908
  "epoch": 14.4,
909
+ "learning_rate": 2.1375824175824177e-05,
910
+ "loss": 0.1862,
911
  "step": 1310
912
  },
913
  {
914
  "epoch": 14.51,
915
+ "learning_rate": 2.130989010989011e-05,
916
+ "loss": 0.258,
917
  "step": 1320
918
  },
919
  {
920
  "epoch": 14.62,
921
+ "learning_rate": 2.1243956043956045e-05,
922
+ "loss": 0.231,
923
  "step": 1330
924
  },
925
  {
926
  "epoch": 14.73,
927
+ "learning_rate": 2.1178021978021978e-05,
928
+ "loss": 0.2558,
929
  "step": 1340
930
  },
931
  {
932
  "epoch": 14.84,
933
+ "learning_rate": 2.1112087912087912e-05,
934
+ "loss": 0.1848,
935
  "step": 1350
936
  },
937
  {
938
  "epoch": 14.95,
939
+ "learning_rate": 2.1046153846153846e-05,
940
+ "loss": 0.2495,
941
  "step": 1360
942
  },
943
  {
944
  "epoch": 15.05,
945
+ "learning_rate": 2.0980219780219783e-05,
946
+ "loss": 0.2079,
947
  "step": 1370
948
  },
949
  {
950
  "epoch": 15.16,
951
+ "learning_rate": 2.0914285714285716e-05,
952
+ "loss": 0.1803,
953
  "step": 1380
954
  },
955
  {
956
  "epoch": 15.27,
957
+ "learning_rate": 2.084835164835165e-05,
958
+ "loss": 0.2434,
959
  "step": 1390
960
  },
961
  {
962
  "epoch": 15.38,
963
+ "learning_rate": 2.0782417582417584e-05,
964
+ "loss": 0.1641,
965
  "step": 1400
966
  },
967
  {
968
  "epoch": 15.38,
969
+ "eval_accuracy": 0.7647058823529411,
970
+ "eval_loss": 0.7208631634712219,
971
+ "eval_runtime": 1.7468,
972
+ "eval_samples_per_second": 97.323,
973
+ "eval_steps_per_second": 12.595,
974
  "step": 1400
975
  },
976
  {
977
  "epoch": 15.49,
978
+ "learning_rate": 2.0716483516483514e-05,
979
+ "loss": 0.2347,
980
  "step": 1410
981
  },
982
  {
983
  "epoch": 15.6,
984
+ "learning_rate": 2.065054945054945e-05,
985
+ "loss": 0.1929,
986
  "step": 1420
987
  },
988
  {
989
  "epoch": 15.71,
990
+ "learning_rate": 2.0584615384615385e-05,
991
+ "loss": 0.1588,
992
  "step": 1430
993
  },
994
  {
995
  "epoch": 15.82,
996
+ "learning_rate": 2.051868131868132e-05,
997
+ "loss": 0.2216,
998
  "step": 1440
999
  },
1000
  {
1001
  "epoch": 15.93,
1002
+ "learning_rate": 2.0452747252747252e-05,
1003
+ "loss": 0.1755,
1004
  "step": 1450
1005
  },
1006
  {
1007
  "epoch": 16.04,
1008
+ "learning_rate": 2.0386813186813186e-05,
1009
+ "loss": 0.2623,
1010
  "step": 1460
1011
  },
1012
  {
1013
  "epoch": 16.15,
1014
+ "learning_rate": 2.032087912087912e-05,
1015
+ "loss": 0.2498,
1016
  "step": 1470
1017
  },
1018
  {
1019
  "epoch": 16.26,
1020
+ "learning_rate": 2.0254945054945057e-05,
1021
+ "loss": 0.2103,
1022
  "step": 1480
1023
  },
1024
  {
1025
  "epoch": 16.37,
1026
+ "learning_rate": 2.018901098901099e-05,
1027
+ "loss": 0.1713,
1028
  "step": 1490
1029
  },
1030
  {
1031
  "epoch": 16.48,
1032
+ "learning_rate": 2.0123076923076924e-05,
1033
+ "loss": 0.1932,
1034
  "step": 1500
1035
  },
1036
  {
1037
  "epoch": 16.48,
1038
+ "eval_accuracy": 0.7,
1039
+ "eval_loss": 0.9090902805328369,
1040
+ "eval_runtime": 1.6369,
1041
+ "eval_samples_per_second": 103.854,
1042
+ "eval_steps_per_second": 13.44,
1043
  "step": 1500
1044
  },
1045
  {
1046
  "epoch": 16.59,
1047
+ "learning_rate": 2.0057142857142858e-05,
1048
+ "loss": 0.256,
1049
  "step": 1510
1050
  },
1051
  {
1052
  "epoch": 16.7,
1053
+ "learning_rate": 1.999120879120879e-05,
1054
+ "loss": 0.1954,
1055
  "step": 1520
1056
  },
1057
  {
1058
  "epoch": 16.81,
1059
+ "learning_rate": 1.992527472527473e-05,
1060
+ "loss": 0.2644,
1061
  "step": 1530
1062
  },
1063
  {
1064
  "epoch": 16.92,
1065
+ "learning_rate": 1.9859340659340662e-05,
1066
+ "loss": 0.2049,
1067
  "step": 1540
1068
  },
1069
  {
1070
  "epoch": 17.03,
1071
+ "learning_rate": 1.9793406593406596e-05,
1072
+ "loss": 0.1822,
1073
  "step": 1550
1074
  },
1075
  {
1076
  "epoch": 17.14,
1077
+ "learning_rate": 1.9727472527472526e-05,
1078
+ "loss": 0.1925,
1079
  "step": 1560
1080
  },
1081
  {
1082
  "epoch": 17.25,
1083
+ "learning_rate": 1.966153846153846e-05,
1084
+ "loss": 0.2075,
1085
  "step": 1570
1086
  },
1087
  {
1088
  "epoch": 17.36,
1089
+ "learning_rate": 1.9595604395604394e-05,
1090
+ "loss": 0.2604,
1091
  "step": 1580
1092
  },
1093
  {
1094
  "epoch": 17.47,
1095
+ "learning_rate": 1.952967032967033e-05,
1096
+ "loss": 0.1662,
1097
  "step": 1590
1098
  },
1099
  {
1100
  "epoch": 17.58,
1101
+ "learning_rate": 1.9463736263736264e-05,
1102
+ "loss": 0.1609,
1103
  "step": 1600
1104
  },
1105
  {
1106
  "epoch": 17.58,
1107
+ "eval_accuracy": 0.7588235294117647,
1108
+ "eval_loss": 0.7208409309387207,
1109
+ "eval_runtime": 1.6376,
1110
+ "eval_samples_per_second": 103.809,
1111
+ "eval_steps_per_second": 13.434,
1112
  "step": 1600
1113
  },
1114
  {
1115
  "epoch": 17.69,
1116
+ "learning_rate": 1.9397802197802198e-05,
1117
+ "loss": 0.1817,
1118
  "step": 1610
1119
  },
1120
  {
1121
  "epoch": 17.8,
1122
+ "learning_rate": 1.9331868131868132e-05,
1123
+ "loss": 0.2423,
1124
  "step": 1620
1125
  },
1126
  {
1127
  "epoch": 17.91,
1128
+ "learning_rate": 1.9265934065934065e-05,
1129
+ "loss": 0.1722,
1130
  "step": 1630
1131
  },
1132
  {
1133
  "epoch": 18.02,
1134
+ "learning_rate": 1.9200000000000003e-05,
1135
+ "loss": 0.1544,
1136
  "step": 1640
1137
  },
1138
  {
1139
  "epoch": 18.13,
1140
+ "learning_rate": 1.9134065934065936e-05,
1141
+ "loss": 0.1976,
1142
  "step": 1650
1143
  },
1144
  {
1145
  "epoch": 18.24,
1146
+ "learning_rate": 1.906813186813187e-05,
1147
+ "loss": 0.2191,
1148
  "step": 1660
1149
  },
1150
  {
1151
  "epoch": 18.35,
1152
+ "learning_rate": 1.9002197802197804e-05,
1153
+ "loss": 0.1458,
1154
  "step": 1670
1155
  },
1156
  {
1157
  "epoch": 18.46,
1158
+ "learning_rate": 1.8936263736263737e-05,
1159
+ "loss": 0.2027,
1160
  "step": 1680
1161
  },
1162
  {
1163
  "epoch": 18.57,
1164
+ "learning_rate": 1.887032967032967e-05,
1165
+ "loss": 0.1637,
1166
  "step": 1690
1167
  },
1168
  {
1169
  "epoch": 18.68,
1170
+ "learning_rate": 1.8804395604395605e-05,
1171
+ "loss": 0.132,
1172
  "step": 1700
1173
  },
1174
  {
1175
  "epoch": 18.68,
1176
+ "eval_accuracy": 0.7588235294117647,
1177
+ "eval_loss": 0.8486713171005249,
1178
+ "eval_runtime": 1.6357,
1179
+ "eval_samples_per_second": 103.93,
1180
+ "eval_steps_per_second": 13.45,
1181
  "step": 1700
1182
  },
1183
  {
1184
  "epoch": 18.79,
1185
+ "learning_rate": 1.873846153846154e-05,
1186
+ "loss": 0.1319,
1187
  "step": 1710
1188
  },
1189
  {
1190
  "epoch": 18.9,
1191
+ "learning_rate": 1.8672527472527472e-05,
1192
+ "loss": 0.2005,
1193
  "step": 1720
1194
  },
1195
  {
1196
  "epoch": 19.01,
1197
+ "learning_rate": 1.8606593406593406e-05,
1198
+ "loss": 0.2234,
1199
  "step": 1730
1200
  },
1201
  {
1202
  "epoch": 19.12,
1203
+ "learning_rate": 1.854065934065934e-05,
1204
+ "loss": 0.188,
1205
  "step": 1740
1206
  },
1207
  {
1208
  "epoch": 19.23,
1209
+ "learning_rate": 1.8474725274725277e-05,
1210
+ "loss": 0.1322,
1211
  "step": 1750
1212
  },
1213
  {
1214
  "epoch": 19.34,
1215
+ "learning_rate": 1.840879120879121e-05,
1216
+ "loss": 0.2137,
1217
  "step": 1760
1218
  },
1219
  {
1220
  "epoch": 19.45,
1221
+ "learning_rate": 1.8342857142857144e-05,
1222
+ "loss": 0.2314,
1223
  "step": 1770
1224
  },
1225
  {
1226
  "epoch": 19.56,
1227
+ "learning_rate": 1.8276923076923078e-05,
1228
+ "loss": 0.1987,
1229
  "step": 1780
1230
  },
1231
  {
1232
  "epoch": 19.67,
1233
+ "learning_rate": 1.821098901098901e-05,
1234
+ "loss": 0.1955,
1235
  "step": 1790
1236
  },
1237
  {
1238
  "epoch": 19.78,
1239
+ "learning_rate": 1.8145054945054945e-05,
1240
+ "loss": 0.1903,
1241
  "step": 1800
1242
  },
1243
  {
1244
  "epoch": 19.78,
1245
+ "eval_accuracy": 0.7470588235294118,
1246
+ "eval_loss": 0.7911531329154968,
1247
+ "eval_runtime": 1.6371,
1248
+ "eval_samples_per_second": 103.842,
1249
+ "eval_steps_per_second": 13.438,
1250
  "step": 1800
1251
  },
1252
  {
1253
  "epoch": 19.89,
1254
+ "learning_rate": 1.8079120879120882e-05,
1255
+ "loss": 0.1333,
1256
  "step": 1810
1257
  },
1258
  {
1259
  "epoch": 20.0,
1260
+ "learning_rate": 1.8013186813186816e-05,
1261
+ "loss": 0.2119,
1262
  "step": 1820
1263
  },
1264
  {
1265
  "epoch": 20.11,
1266
+ "learning_rate": 1.794725274725275e-05,
1267
+ "loss": 0.1784,
1268
  "step": 1830
1269
  },
1270
  {
1271
  "epoch": 20.22,
1272
+ "learning_rate": 1.788131868131868e-05,
1273
+ "loss": 0.1821,
1274
  "step": 1840
1275
  },
1276
  {
1277
  "epoch": 20.33,
1278
+ "learning_rate": 1.7815384615384613e-05,
1279
+ "loss": 0.2508,
1280
  "step": 1850
1281
  },
1282
  {
1283
  "epoch": 20.44,
1284
+ "learning_rate": 1.7749450549450547e-05,
1285
+ "loss": 0.1599,
1286
  "step": 1860
1287
  },
1288
  {
1289
  "epoch": 20.55,
1290
+ "learning_rate": 1.7683516483516484e-05,
1291
+ "loss": 0.133,
1292
  "step": 1870
1293
  },
1294
  {
1295
  "epoch": 20.66,
1296
+ "learning_rate": 1.7617582417582418e-05,
1297
+ "loss": 0.1344,
1298
  "step": 1880
1299
  },
1300
  {
1301
  "epoch": 20.77,
1302
+ "learning_rate": 1.755164835164835e-05,
1303
+ "loss": 0.163,
1304
  "step": 1890
1305
  },
1306
  {
1307
  "epoch": 20.88,
1308
+ "learning_rate": 1.7485714285714285e-05,
1309
+ "loss": 0.121,
1310
  "step": 1900
1311
  },
1312
  {
1313
  "epoch": 20.88,
1314
+ "eval_accuracy": 0.7470588235294118,
1315
+ "eval_loss": 0.6734881401062012,
1316
+ "eval_runtime": 1.6331,
1317
+ "eval_samples_per_second": 104.095,
1318
+ "eval_steps_per_second": 13.471,
1319
  "step": 1900
1320
  },
1321
  {
1322
  "epoch": 20.99,
1323
+ "learning_rate": 1.741978021978022e-05,
1324
+ "loss": 0.1984,
1325
  "step": 1910
1326
  },
1327
  {
1328
  "epoch": 21.1,
1329
+ "learning_rate": 1.7353846153846156e-05,
1330
+ "loss": 0.15,
1331
  "step": 1920
1332
  },
1333
  {
1334
  "epoch": 21.21,
1335
+ "learning_rate": 1.728791208791209e-05,
1336
+ "loss": 0.11,
1337
  "step": 1930
1338
  },
1339
  {
1340
  "epoch": 21.32,
1341
+ "learning_rate": 1.7221978021978023e-05,
1342
+ "loss": 0.1753,
1343
  "step": 1940
1344
  },
1345
  {
1346
  "epoch": 21.43,
1347
+ "learning_rate": 1.7156043956043957e-05,
1348
+ "loss": 0.1535,
1349
  "step": 1950
1350
  },
1351
  {
1352
  "epoch": 21.54,
1353
+ "learning_rate": 1.709010989010989e-05,
1354
+ "loss": 0.1269,
1355
  "step": 1960
1356
  },
1357
  {
1358
  "epoch": 21.65,
1359
+ "learning_rate": 1.7024175824175825e-05,
1360
+ "loss": 0.1711,
1361
  "step": 1970
1362
  },
1363
  {
1364
  "epoch": 21.76,
1365
+ "learning_rate": 1.6958241758241758e-05,
1366
+ "loss": 0.1871,
1367
  "step": 1980
1368
  },
1369
  {
1370
  "epoch": 21.87,
1371
+ "learning_rate": 1.6892307692307692e-05,
1372
+ "loss": 0.1544,
1373
  "step": 1990
1374
  },
1375
  {
1376
  "epoch": 21.98,
1377
+ "learning_rate": 1.6826373626373626e-05,
1378
+ "loss": 0.1903,
1379
  "step": 2000
1380
  },
1381
  {
1382
  "epoch": 21.98,
1383
+ "eval_accuracy": 0.7823529411764706,
1384
+ "eval_loss": 0.6692019104957581,
1385
+ "eval_runtime": 1.6545,
1386
+ "eval_samples_per_second": 102.753,
1387
+ "eval_steps_per_second": 13.297,
1388
  "step": 2000
1389
  },
1390
  {
1391
  "epoch": 22.09,
1392
+ "learning_rate": 1.676043956043956e-05,
1393
+ "loss": 0.1258,
1394
  "step": 2010
1395
  },
1396
  {
1397
  "epoch": 22.2,
1398
+ "learning_rate": 1.6694505494505493e-05,
1399
+ "loss": 0.1057,
1400
  "step": 2020
1401
  },
1402
  {
1403
  "epoch": 22.31,
1404
+ "learning_rate": 1.662857142857143e-05,
1405
+ "loss": 0.0954,
1406
  "step": 2030
1407
  },
1408
  {
1409
  "epoch": 22.42,
1410
+ "learning_rate": 1.6562637362637364e-05,
1411
+ "loss": 0.2413,
1412
  "step": 2040
1413
  },
1414
  {
1415
  "epoch": 22.53,
1416
+ "learning_rate": 1.6496703296703297e-05,
1417
+ "loss": 0.1073,
1418
  "step": 2050
1419
  },
1420
  {
1421
  "epoch": 22.64,
1422
+ "learning_rate": 1.643076923076923e-05,
1423
+ "loss": 0.2491,
1424
  "step": 2060
1425
  },
1426
  {
1427
  "epoch": 22.75,
1428
+ "learning_rate": 1.6364835164835165e-05,
1429
+ "loss": 0.1972,
1430
  "step": 2070
1431
  },
1432
  {
1433
  "epoch": 22.86,
1434
+ "learning_rate": 1.62989010989011e-05,
1435
+ "loss": 0.1852,
1436
  "step": 2080
1437
  },
1438
  {
1439
  "epoch": 22.97,
1440
+ "learning_rate": 1.6232967032967036e-05,
1441
+ "loss": 0.135,
1442
  "step": 2090
1443
  },
1444
  {
1445
  "epoch": 23.08,
1446
+ "learning_rate": 1.616703296703297e-05,
1447
+ "loss": 0.176,
1448
  "step": 2100
1449
  },
1450
  {
1451
  "epoch": 23.08,
1452
+ "eval_accuracy": 0.7176470588235294,
1453
+ "eval_loss": 0.8350964784622192,
1454
+ "eval_runtime": 1.6244,
1455
+ "eval_samples_per_second": 104.652,
1456
+ "eval_steps_per_second": 13.543,
1457
  "step": 2100
1458
  },
1459
  {
1460
  "epoch": 23.19,
1461
+ "learning_rate": 1.6101098901098903e-05,
1462
+ "loss": 0.1485,
1463
  "step": 2110
1464
  },
1465
  {
1466
  "epoch": 23.3,
1467
+ "learning_rate": 1.6035164835164837e-05,
1468
+ "loss": 0.1436,
1469
  "step": 2120
1470
  },
1471
  {
1472
  "epoch": 23.41,
1473
+ "learning_rate": 1.5969230769230767e-05,
1474
+ "loss": 0.1783,
1475
  "step": 2130
1476
  },
1477
  {
1478
  "epoch": 23.52,
1479
+ "learning_rate": 1.5903296703296704e-05,
1480
+ "loss": 0.1655,
1481
  "step": 2140
1482
  },
1483
  {
1484
  "epoch": 23.63,
1485
+ "learning_rate": 1.5837362637362638e-05,
1486
+ "loss": 0.1274,
1487
  "step": 2150
1488
  },
1489
  {
1490
  "epoch": 23.74,
1491
+ "learning_rate": 1.577142857142857e-05,
1492
+ "loss": 0.1261,
1493
  "step": 2160
1494
  },
1495
  {
1496
  "epoch": 23.85,
1497
+ "learning_rate": 1.5705494505494505e-05,
1498
+ "loss": 0.2141,
1499
  "step": 2170
1500
  },
1501
  {
1502
  "epoch": 23.96,
1503
+ "learning_rate": 1.563956043956044e-05,
1504
+ "loss": 0.2215,
1505
  "step": 2180
1506
  },
1507
  {
1508
  "epoch": 24.07,
1509
+ "learning_rate": 1.5573626373626373e-05,
1510
+ "loss": 0.2243,
1511
  "step": 2190
1512
  },
1513
  {
1514
  "epoch": 24.18,
1515
+ "learning_rate": 1.550769230769231e-05,
1516
+ "loss": 0.1186,
1517
  "step": 2200
1518
  },
1519
  {
1520
  "epoch": 24.18,
1521
+ "eval_accuracy": 0.7470588235294118,
1522
+ "eval_loss": 0.7317846417427063,
1523
+ "eval_runtime": 1.63,
1524
+ "eval_samples_per_second": 104.295,
1525
+ "eval_steps_per_second": 13.497,
1526
  "step": 2200
1527
  },
1528
  {
1529
  "epoch": 24.29,
1530
+ "learning_rate": 1.5441758241758243e-05,
1531
+ "loss": 0.2046,
1532
  "step": 2210
1533
  },
1534
  {
1535
  "epoch": 24.4,
1536
+ "learning_rate": 1.5375824175824177e-05,
1537
+ "loss": 0.1992,
1538
  "step": 2220
1539
  },
1540
  {
1541
  "epoch": 24.51,
1542
+ "learning_rate": 1.530989010989011e-05,
1543
+ "loss": 0.1289,
1544
  "step": 2230
1545
  },
1546
  {
1547
  "epoch": 24.62,
1548
+ "learning_rate": 1.5243956043956046e-05,
1549
+ "loss": 0.1033,
1550
  "step": 2240
1551
  },
1552
  {
1553
  "epoch": 24.73,
1554
+ "learning_rate": 1.517802197802198e-05,
1555
+ "loss": 0.2098,
1556
  "step": 2250
1557
  },
1558
  {
1559
  "epoch": 24.84,
1560
+ "learning_rate": 1.5112087912087913e-05,
1561
+ "loss": 0.0928,
1562
  "step": 2260
1563
  },
1564
  {
1565
  "epoch": 24.95,
1566
+ "learning_rate": 1.5046153846153845e-05,
1567
+ "loss": 0.1469,
1568
  "step": 2270
1569
  },
1570
  {
1571
+ "epoch": 25.05,
1572
+ "learning_rate": 1.498021978021978e-05,
1573
+ "loss": 0.0645,
1574
+ "step": 2280
1575
+ },
1576
+ {
1577
+ "epoch": 25.16,
1578
+ "learning_rate": 1.4914285714285715e-05,
1579
+ "loss": 0.1786,
1580
+ "step": 2290
1581
+ },
1582
+ {
1583
+ "epoch": 25.27,
1584
+ "learning_rate": 1.4848351648351648e-05,
1585
+ "loss": 0.1424,
1586
+ "step": 2300
1587
+ },
1588
+ {
1589
+ "epoch": 25.27,
1590
+ "eval_accuracy": 0.7588235294117647,
1591
+ "eval_loss": 0.7859658598899841,
1592
+ "eval_runtime": 1.6259,
1593
+ "eval_samples_per_second": 104.556,
1594
+ "eval_steps_per_second": 13.531,
1595
+ "step": 2300
1596
+ },
1597
+ {
1598
+ "epoch": 25.38,
1599
+ "learning_rate": 1.4782417582417582e-05,
1600
+ "loss": 0.0742,
1601
+ "step": 2310
1602
+ },
1603
+ {
1604
+ "epoch": 25.49,
1605
+ "learning_rate": 1.4716483516483517e-05,
1606
+ "loss": 0.1224,
1607
+ "step": 2320
1608
+ },
1609
+ {
1610
+ "epoch": 25.6,
1611
+ "learning_rate": 1.4650549450549451e-05,
1612
+ "loss": 0.1334,
1613
+ "step": 2330
1614
+ },
1615
+ {
1616
+ "epoch": 25.71,
1617
+ "learning_rate": 1.4584615384615385e-05,
1618
+ "loss": 0.1924,
1619
+ "step": 2340
1620
+ },
1621
+ {
1622
+ "epoch": 25.82,
1623
+ "learning_rate": 1.451868131868132e-05,
1624
+ "loss": 0.1386,
1625
+ "step": 2350
1626
+ },
1627
+ {
1628
+ "epoch": 25.93,
1629
+ "learning_rate": 1.4452747252747254e-05,
1630
+ "loss": 0.1982,
1631
+ "step": 2360
1632
+ },
1633
+ {
1634
+ "epoch": 26.04,
1635
+ "learning_rate": 1.4386813186813186e-05,
1636
+ "loss": 0.1341,
1637
+ "step": 2370
1638
+ },
1639
+ {
1640
+ "epoch": 26.15,
1641
+ "learning_rate": 1.4320879120879121e-05,
1642
+ "loss": 0.1664,
1643
+ "step": 2380
1644
+ },
1645
+ {
1646
+ "epoch": 26.26,
1647
+ "learning_rate": 1.4254945054945055e-05,
1648
+ "loss": 0.0903,
1649
+ "step": 2390
1650
+ },
1651
+ {
1652
+ "epoch": 26.37,
1653
+ "learning_rate": 1.4189010989010989e-05,
1654
+ "loss": 0.144,
1655
+ "step": 2400
1656
+ },
1657
+ {
1658
+ "epoch": 26.37,
1659
+ "eval_accuracy": 0.788235294117647,
1660
+ "eval_loss": 0.7021328806877136,
1661
+ "eval_runtime": 1.618,
1662
+ "eval_samples_per_second": 105.065,
1663
+ "eval_steps_per_second": 13.597,
1664
+ "step": 2400
1665
+ },
1666
+ {
1667
+ "epoch": 26.48,
1668
+ "learning_rate": 1.4123076923076924e-05,
1669
+ "loss": 0.1747,
1670
+ "step": 2410
1671
+ },
1672
+ {
1673
+ "epoch": 26.59,
1674
+ "learning_rate": 1.4057142857142858e-05,
1675
+ "loss": 0.1655,
1676
+ "step": 2420
1677
+ },
1678
+ {
1679
+ "epoch": 26.7,
1680
+ "learning_rate": 1.3991208791208793e-05,
1681
+ "loss": 0.1003,
1682
+ "step": 2430
1683
+ },
1684
+ {
1685
+ "epoch": 26.81,
1686
+ "learning_rate": 1.3925274725274727e-05,
1687
+ "loss": 0.142,
1688
+ "step": 2440
1689
+ },
1690
+ {
1691
+ "epoch": 26.92,
1692
+ "learning_rate": 1.3859340659340659e-05,
1693
+ "loss": 0.089,
1694
+ "step": 2450
1695
+ },
1696
+ {
1697
+ "epoch": 27.03,
1698
+ "learning_rate": 1.3793406593406594e-05,
1699
+ "loss": 0.2091,
1700
+ "step": 2460
1701
+ },
1702
+ {
1703
+ "epoch": 27.14,
1704
+ "learning_rate": 1.3727472527472528e-05,
1705
+ "loss": 0.1794,
1706
+ "step": 2470
1707
+ },
1708
+ {
1709
+ "epoch": 27.25,
1710
+ "learning_rate": 1.3661538461538461e-05,
1711
+ "loss": 0.133,
1712
+ "step": 2480
1713
+ },
1714
+ {
1715
+ "epoch": 27.36,
1716
+ "learning_rate": 1.3595604395604397e-05,
1717
+ "loss": 0.1039,
1718
+ "step": 2490
1719
+ },
1720
+ {
1721
+ "epoch": 27.47,
1722
+ "learning_rate": 1.352967032967033e-05,
1723
+ "loss": 0.1088,
1724
+ "step": 2500
1725
+ },
1726
+ {
1727
+ "epoch": 27.47,
1728
+ "eval_accuracy": 0.7470588235294118,
1729
+ "eval_loss": 0.8109354972839355,
1730
+ "eval_runtime": 1.7131,
1731
+ "eval_samples_per_second": 99.233,
1732
+ "eval_steps_per_second": 12.842,
1733
+ "step": 2500
1734
+ },
1735
+ {
1736
+ "epoch": 27.58,
1737
+ "learning_rate": 1.3463736263736264e-05,
1738
+ "loss": 0.1443,
1739
+ "step": 2510
1740
+ },
1741
+ {
1742
+ "epoch": 27.69,
1743
+ "learning_rate": 1.3397802197802198e-05,
1744
+ "loss": 0.1106,
1745
+ "step": 2520
1746
+ },
1747
+ {
1748
+ "epoch": 27.8,
1749
+ "learning_rate": 1.3331868131868132e-05,
1750
+ "loss": 0.0908,
1751
+ "step": 2530
1752
+ },
1753
+ {
1754
+ "epoch": 27.91,
1755
+ "learning_rate": 1.3265934065934065e-05,
1756
+ "loss": 0.1268,
1757
+ "step": 2540
1758
+ },
1759
+ {
1760
+ "epoch": 28.02,
1761
+ "learning_rate": 1.32e-05,
1762
+ "loss": 0.194,
1763
+ "step": 2550
1764
+ },
1765
+ {
1766
+ "epoch": 28.13,
1767
+ "learning_rate": 1.3134065934065934e-05,
1768
+ "loss": 0.1085,
1769
+ "step": 2560
1770
+ },
1771
+ {
1772
+ "epoch": 28.24,
1773
+ "learning_rate": 1.306813186813187e-05,
1774
+ "loss": 0.0856,
1775
+ "step": 2570
1776
+ },
1777
+ {
1778
+ "epoch": 28.35,
1779
+ "learning_rate": 1.3002197802197803e-05,
1780
+ "loss": 0.1157,
1781
+ "step": 2580
1782
+ },
1783
+ {
1784
+ "epoch": 28.46,
1785
+ "learning_rate": 1.2936263736263735e-05,
1786
+ "loss": 0.0941,
1787
+ "step": 2590
1788
+ },
1789
+ {
1790
+ "epoch": 28.57,
1791
+ "learning_rate": 1.287032967032967e-05,
1792
+ "loss": 0.1019,
1793
+ "step": 2600
1794
+ },
1795
+ {
1796
+ "epoch": 28.57,
1797
+ "eval_accuracy": 0.7470588235294118,
1798
+ "eval_loss": 0.8157252669334412,
1799
+ "eval_runtime": 1.6087,
1800
+ "eval_samples_per_second": 105.676,
1801
+ "eval_steps_per_second": 13.676,
1802
+ "step": 2600
1803
+ },
1804
+ {
1805
+ "epoch": 28.68,
1806
+ "learning_rate": 1.2804395604395605e-05,
1807
+ "loss": 0.1466,
1808
+ "step": 2610
1809
+ },
1810
+ {
1811
+ "epoch": 28.79,
1812
+ "learning_rate": 1.2738461538461538e-05,
1813
+ "loss": 0.0661,
1814
+ "step": 2620
1815
+ },
1816
+ {
1817
+ "epoch": 28.9,
1818
+ "learning_rate": 1.2672527472527474e-05,
1819
+ "loss": 0.2147,
1820
+ "step": 2630
1821
+ },
1822
+ {
1823
+ "epoch": 29.01,
1824
+ "learning_rate": 1.2606593406593407e-05,
1825
+ "loss": 0.2561,
1826
+ "step": 2640
1827
+ },
1828
+ {
1829
+ "epoch": 29.12,
1830
+ "learning_rate": 1.2540659340659341e-05,
1831
+ "loss": 0.1408,
1832
+ "step": 2650
1833
+ },
1834
+ {
1835
+ "epoch": 29.23,
1836
+ "learning_rate": 1.2474725274725275e-05,
1837
+ "loss": 0.0896,
1838
+ "step": 2660
1839
+ },
1840
+ {
1841
+ "epoch": 29.34,
1842
+ "learning_rate": 1.2408791208791208e-05,
1843
+ "loss": 0.1356,
1844
+ "step": 2670
1845
+ },
1846
+ {
1847
+ "epoch": 29.45,
1848
+ "learning_rate": 1.2342857142857144e-05,
1849
+ "loss": 0.0914,
1850
+ "step": 2680
1851
+ },
1852
+ {
1853
+ "epoch": 29.56,
1854
+ "learning_rate": 1.2276923076923077e-05,
1855
+ "loss": 0.1355,
1856
+ "step": 2690
1857
+ },
1858
+ {
1859
+ "epoch": 29.67,
1860
+ "learning_rate": 1.2210989010989011e-05,
1861
+ "loss": 0.0947,
1862
+ "step": 2700
1863
+ },
1864
+ {
1865
+ "epoch": 29.67,
1866
+ "eval_accuracy": 0.7588235294117647,
1867
+ "eval_loss": 0.8027762174606323,
1868
+ "eval_runtime": 1.6188,
1869
+ "eval_samples_per_second": 105.014,
1870
+ "eval_steps_per_second": 13.59,
1871
+ "step": 2700
1872
+ },
1873
+ {
1874
+ "epoch": 29.78,
1875
+ "learning_rate": 1.2145054945054947e-05,
1876
+ "loss": 0.0745,
1877
+ "step": 2710
1878
+ },
1879
+ {
1880
+ "epoch": 29.89,
1881
+ "learning_rate": 1.207912087912088e-05,
1882
+ "loss": 0.172,
1883
+ "step": 2720
1884
+ },
1885
+ {
1886
+ "epoch": 30.0,
1887
+ "learning_rate": 1.2013186813186812e-05,
1888
+ "loss": 0.1363,
1889
+ "step": 2730
1890
+ },
1891
+ {
1892
+ "epoch": 30.11,
1893
+ "learning_rate": 1.1947252747252748e-05,
1894
+ "loss": 0.1109,
1895
+ "step": 2740
1896
+ },
1897
+ {
1898
+ "epoch": 30.22,
1899
+ "learning_rate": 1.1881318681318681e-05,
1900
+ "loss": 0.0705,
1901
+ "step": 2750
1902
+ },
1903
+ {
1904
+ "epoch": 30.33,
1905
+ "learning_rate": 1.1815384615384615e-05,
1906
+ "loss": 0.1153,
1907
+ "step": 2760
1908
+ },
1909
+ {
1910
+ "epoch": 30.44,
1911
+ "learning_rate": 1.174945054945055e-05,
1912
+ "loss": 0.0552,
1913
+ "step": 2770
1914
+ },
1915
+ {
1916
+ "epoch": 30.55,
1917
+ "learning_rate": 1.1683516483516484e-05,
1918
+ "loss": 0.1828,
1919
+ "step": 2780
1920
+ },
1921
+ {
1922
+ "epoch": 30.66,
1923
+ "learning_rate": 1.1617582417582418e-05,
1924
+ "loss": 0.1159,
1925
+ "step": 2790
1926
+ },
1927
+ {
1928
+ "epoch": 30.77,
1929
+ "learning_rate": 1.1551648351648351e-05,
1930
+ "loss": 0.1715,
1931
+ "step": 2800
1932
+ },
1933
+ {
1934
+ "epoch": 30.77,
1935
+ "eval_accuracy": 0.7470588235294118,
1936
+ "eval_loss": 0.8344667553901672,
1937
+ "eval_runtime": 1.6594,
1938
+ "eval_samples_per_second": 102.448,
1939
+ "eval_steps_per_second": 13.258,
1940
+ "step": 2800
1941
+ },
1942
+ {
1943
+ "epoch": 30.88,
1944
+ "learning_rate": 1.1485714285714285e-05,
1945
+ "loss": 0.2173,
1946
+ "step": 2810
1947
+ },
1948
+ {
1949
+ "epoch": 30.99,
1950
+ "learning_rate": 1.141978021978022e-05,
1951
+ "loss": 0.0776,
1952
+ "step": 2820
1953
+ },
1954
+ {
1955
+ "epoch": 31.1,
1956
+ "learning_rate": 1.1353846153846154e-05,
1957
+ "loss": 0.0849,
1958
+ "step": 2830
1959
+ },
1960
+ {
1961
+ "epoch": 31.21,
1962
+ "learning_rate": 1.1287912087912088e-05,
1963
+ "loss": 0.1367,
1964
+ "step": 2840
1965
+ },
1966
+ {
1967
+ "epoch": 31.32,
1968
+ "learning_rate": 1.1221978021978023e-05,
1969
+ "loss": 0.1146,
1970
+ "step": 2850
1971
+ },
1972
+ {
1973
+ "epoch": 31.43,
1974
+ "learning_rate": 1.1156043956043957e-05,
1975
+ "loss": 0.2185,
1976
+ "step": 2860
1977
+ },
1978
+ {
1979
+ "epoch": 31.54,
1980
+ "learning_rate": 1.1090109890109889e-05,
1981
+ "loss": 0.1426,
1982
+ "step": 2870
1983
+ },
1984
+ {
1985
+ "epoch": 31.65,
1986
+ "learning_rate": 1.1024175824175824e-05,
1987
+ "loss": 0.1934,
1988
+ "step": 2880
1989
+ },
1990
+ {
1991
+ "epoch": 31.76,
1992
+ "learning_rate": 1.0958241758241758e-05,
1993
+ "loss": 0.1535,
1994
+ "step": 2890
1995
+ },
1996
+ {
1997
+ "epoch": 31.87,
1998
+ "learning_rate": 1.0892307692307692e-05,
1999
+ "loss": 0.1046,
2000
+ "step": 2900
2001
+ },
2002
+ {
2003
+ "epoch": 31.87,
2004
+ "eval_accuracy": 0.7411764705882353,
2005
+ "eval_loss": 0.8577510118484497,
2006
+ "eval_runtime": 1.6297,
2007
+ "eval_samples_per_second": 104.313,
2008
+ "eval_steps_per_second": 13.499,
2009
+ "step": 2900
2010
+ },
2011
+ {
2012
+ "epoch": 31.98,
2013
+ "learning_rate": 1.0826373626373627e-05,
2014
+ "loss": 0.1664,
2015
+ "step": 2910
2016
+ },
2017
+ {
2018
+ "epoch": 32.09,
2019
+ "learning_rate": 1.076043956043956e-05,
2020
+ "loss": 0.1792,
2021
+ "step": 2920
2022
+ },
2023
+ {
2024
+ "epoch": 32.2,
2025
+ "learning_rate": 1.0694505494505496e-05,
2026
+ "loss": 0.1166,
2027
+ "step": 2930
2028
+ },
2029
+ {
2030
+ "epoch": 32.31,
2031
+ "learning_rate": 1.0628571428571428e-05,
2032
+ "loss": 0.078,
2033
+ "step": 2940
2034
+ },
2035
+ {
2036
+ "epoch": 32.42,
2037
+ "learning_rate": 1.0562637362637362e-05,
2038
+ "loss": 0.0868,
2039
+ "step": 2950
2040
+ },
2041
+ {
2042
+ "epoch": 32.53,
2043
+ "learning_rate": 1.0496703296703297e-05,
2044
+ "loss": 0.0976,
2045
+ "step": 2960
2046
+ },
2047
+ {
2048
+ "epoch": 32.64,
2049
+ "learning_rate": 1.0430769230769231e-05,
2050
+ "loss": 0.2388,
2051
+ "step": 2970
2052
+ },
2053
+ {
2054
+ "epoch": 32.75,
2055
+ "learning_rate": 1.0364835164835165e-05,
2056
+ "loss": 0.1135,
2057
+ "step": 2980
2058
+ },
2059
+ {
2060
+ "epoch": 32.86,
2061
+ "learning_rate": 1.02989010989011e-05,
2062
+ "loss": 0.1377,
2063
+ "step": 2990
2064
+ },
2065
+ {
2066
+ "epoch": 32.97,
2067
+ "learning_rate": 1.0232967032967034e-05,
2068
+ "loss": 0.1367,
2069
+ "step": 3000
2070
+ },
2071
+ {
2072
+ "epoch": 32.97,
2073
+ "eval_accuracy": 0.788235294117647,
2074
+ "eval_loss": 0.7669554948806763,
2075
+ "eval_runtime": 1.6159,
2076
+ "eval_samples_per_second": 105.202,
2077
+ "eval_steps_per_second": 13.614,
2078
+ "step": 3000
2079
+ },
2080
+ {
2081
+ "epoch": 33.08,
2082
+ "learning_rate": 1.0167032967032966e-05,
2083
+ "loss": 0.1076,
2084
+ "step": 3010
2085
+ },
2086
+ {
2087
+ "epoch": 33.19,
2088
+ "learning_rate": 1.0101098901098901e-05,
2089
+ "loss": 0.1596,
2090
+ "step": 3020
2091
+ },
2092
+ {
2093
+ "epoch": 33.3,
2094
+ "learning_rate": 1.0035164835164835e-05,
2095
+ "loss": 0.1152,
2096
+ "step": 3030
2097
+ },
2098
+ {
2099
+ "epoch": 33.41,
2100
+ "learning_rate": 9.96923076923077e-06,
2101
+ "loss": 0.1093,
2102
+ "step": 3040
2103
+ },
2104
+ {
2105
+ "epoch": 33.52,
2106
+ "learning_rate": 9.903296703296704e-06,
2107
+ "loss": 0.1465,
2108
+ "step": 3050
2109
+ },
2110
+ {
2111
+ "epoch": 33.63,
2112
+ "learning_rate": 9.843956043956044e-06,
2113
+ "loss": 0.1281,
2114
+ "step": 3060
2115
+ },
2116
+ {
2117
+ "epoch": 33.74,
2118
+ "learning_rate": 9.778021978021979e-06,
2119
+ "loss": 0.0871,
2120
+ "step": 3070
2121
+ },
2122
+ {
2123
+ "epoch": 33.85,
2124
+ "learning_rate": 9.712087912087913e-06,
2125
+ "loss": 0.0713,
2126
+ "step": 3080
2127
+ },
2128
+ {
2129
+ "epoch": 33.96,
2130
+ "learning_rate": 9.646153846153846e-06,
2131
+ "loss": 0.0605,
2132
+ "step": 3090
2133
+ },
2134
+ {
2135
+ "epoch": 34.07,
2136
+ "learning_rate": 9.580219780219782e-06,
2137
+ "loss": 0.1339,
2138
+ "step": 3100
2139
+ },
2140
+ {
2141
+ "epoch": 34.07,
2142
+ "eval_accuracy": 0.7647058823529411,
2143
+ "eval_loss": 0.776252269744873,
2144
+ "eval_runtime": 1.6196,
2145
+ "eval_samples_per_second": 104.967,
2146
+ "eval_steps_per_second": 13.584,
2147
+ "step": 3100
2148
+ },
2149
+ {
2150
+ "epoch": 34.18,
2151
+ "learning_rate": 9.514285714285714e-06,
2152
+ "loss": 0.1543,
2153
+ "step": 3110
2154
+ },
2155
+ {
2156
+ "epoch": 34.29,
2157
+ "learning_rate": 9.448351648351647e-06,
2158
+ "loss": 0.0526,
2159
+ "step": 3120
2160
+ },
2161
+ {
2162
+ "epoch": 34.4,
2163
+ "learning_rate": 9.382417582417583e-06,
2164
+ "loss": 0.1342,
2165
+ "step": 3130
2166
+ },
2167
+ {
2168
+ "epoch": 34.51,
2169
+ "learning_rate": 9.316483516483516e-06,
2170
+ "loss": 0.0849,
2171
+ "step": 3140
2172
+ },
2173
+ {
2174
+ "epoch": 34.62,
2175
+ "learning_rate": 9.25054945054945e-06,
2176
+ "loss": 0.0898,
2177
+ "step": 3150
2178
+ },
2179
+ {
2180
+ "epoch": 34.73,
2181
+ "learning_rate": 9.184615384615386e-06,
2182
+ "loss": 0.1102,
2183
+ "step": 3160
2184
+ },
2185
+ {
2186
+ "epoch": 34.84,
2187
+ "learning_rate": 9.11868131868132e-06,
2188
+ "loss": 0.1245,
2189
+ "step": 3170
2190
+ },
2191
+ {
2192
+ "epoch": 34.95,
2193
+ "learning_rate": 9.052747252747255e-06,
2194
+ "loss": 0.1517,
2195
+ "step": 3180
2196
+ },
2197
+ {
2198
+ "epoch": 35.05,
2199
+ "learning_rate": 8.986813186813187e-06,
2200
+ "loss": 0.15,
2201
+ "step": 3190
2202
+ },
2203
+ {
2204
+ "epoch": 35.16,
2205
+ "learning_rate": 8.92087912087912e-06,
2206
+ "loss": 0.1194,
2207
+ "step": 3200
2208
+ },
2209
+ {
2210
+ "epoch": 35.16,
2211
+ "eval_accuracy": 0.7705882352941177,
2212
+ "eval_loss": 0.7726542949676514,
2213
+ "eval_runtime": 1.6435,
2214
+ "eval_samples_per_second": 103.438,
2215
+ "eval_steps_per_second": 13.386,
2216
+ "step": 3200
2217
+ },
2218
+ {
2219
+ "epoch": 35.27,
2220
+ "learning_rate": 8.854945054945056e-06,
2221
+ "loss": 0.0909,
2222
+ "step": 3210
2223
+ },
2224
+ {
2225
+ "epoch": 35.38,
2226
+ "learning_rate": 8.78901098901099e-06,
2227
+ "loss": 0.0842,
2228
+ "step": 3220
2229
+ },
2230
+ {
2231
+ "epoch": 35.49,
2232
+ "learning_rate": 8.723076923076923e-06,
2233
+ "loss": 0.1693,
2234
+ "step": 3230
2235
+ },
2236
+ {
2237
+ "epoch": 35.6,
2238
+ "learning_rate": 8.657142857142858e-06,
2239
+ "loss": 0.1059,
2240
+ "step": 3240
2241
+ },
2242
+ {
2243
+ "epoch": 35.71,
2244
+ "learning_rate": 8.591208791208792e-06,
2245
+ "loss": 0.1426,
2246
+ "step": 3250
2247
+ },
2248
+ {
2249
+ "epoch": 35.82,
2250
+ "learning_rate": 8.525274725274724e-06,
2251
+ "loss": 0.1127,
2252
+ "step": 3260
2253
+ },
2254
+ {
2255
+ "epoch": 35.93,
2256
+ "learning_rate": 8.45934065934066e-06,
2257
+ "loss": 0.0513,
2258
+ "step": 3270
2259
+ },
2260
+ {
2261
+ "epoch": 36.04,
2262
+ "learning_rate": 8.393406593406593e-06,
2263
+ "loss": 0.1005,
2264
+ "step": 3280
2265
+ },
2266
+ {
2267
+ "epoch": 36.15,
2268
+ "learning_rate": 8.327472527472527e-06,
2269
+ "loss": 0.0906,
2270
+ "step": 3290
2271
+ },
2272
+ {
2273
+ "epoch": 36.26,
2274
+ "learning_rate": 8.261538461538462e-06,
2275
+ "loss": 0.151,
2276
+ "step": 3300
2277
+ },
2278
+ {
2279
+ "epoch": 36.26,
2280
+ "eval_accuracy": 0.7470588235294118,
2281
+ "eval_loss": 0.8271679878234863,
2282
+ "eval_runtime": 1.8428,
2283
+ "eval_samples_per_second": 92.249,
2284
+ "eval_steps_per_second": 11.938,
2285
+ "step": 3300
2286
+ },
2287
+ {
2288
+ "epoch": 36.37,
2289
+ "learning_rate": 8.195604395604396e-06,
2290
+ "loss": 0.0569,
2291
+ "step": 3310
2292
+ },
2293
+ {
2294
+ "epoch": 36.48,
2295
+ "learning_rate": 8.129670329670331e-06,
2296
+ "loss": 0.1806,
2297
+ "step": 3320
2298
+ },
2299
+ {
2300
+ "epoch": 36.59,
2301
+ "learning_rate": 8.063736263736263e-06,
2302
+ "loss": 0.1399,
2303
+ "step": 3330
2304
+ },
2305
+ {
2306
+ "epoch": 36.7,
2307
+ "learning_rate": 7.997802197802197e-06,
2308
+ "loss": 0.1186,
2309
+ "step": 3340
2310
+ },
2311
+ {
2312
+ "epoch": 36.81,
2313
+ "learning_rate": 7.931868131868132e-06,
2314
+ "loss": 0.0971,
2315
+ "step": 3350
2316
+ },
2317
+ {
2318
+ "epoch": 36.92,
2319
+ "learning_rate": 7.865934065934066e-06,
2320
+ "loss": 0.1759,
2321
+ "step": 3360
2322
+ },
2323
+ {
2324
+ "epoch": 37.03,
2325
+ "learning_rate": 7.8e-06,
2326
+ "loss": 0.0609,
2327
+ "step": 3370
2328
+ },
2329
+ {
2330
+ "epoch": 37.14,
2331
+ "learning_rate": 7.734065934065935e-06,
2332
+ "loss": 0.101,
2333
+ "step": 3380
2334
+ },
2335
+ {
2336
+ "epoch": 37.25,
2337
+ "learning_rate": 7.668131868131869e-06,
2338
+ "loss": 0.0826,
2339
+ "step": 3390
2340
+ },
2341
+ {
2342
+ "epoch": 37.36,
2343
+ "learning_rate": 7.602197802197802e-06,
2344
+ "loss": 0.0646,
2345
+ "step": 3400
2346
+ },
2347
+ {
2348
+ "epoch": 37.36,
2349
+ "eval_accuracy": 0.7764705882352941,
2350
+ "eval_loss": 0.7721197009086609,
2351
+ "eval_runtime": 1.6753,
2352
+ "eval_samples_per_second": 101.472,
2353
+ "eval_steps_per_second": 13.132,
2354
+ "step": 3400
2355
+ },
2356
+ {
2357
+ "epoch": 37.47,
2358
+ "learning_rate": 7.536263736263736e-06,
2359
+ "loss": 0.1707,
2360
+ "step": 3410
2361
+ },
2362
+ {
2363
+ "epoch": 37.58,
2364
+ "learning_rate": 7.47032967032967e-06,
2365
+ "loss": 0.114,
2366
+ "step": 3420
2367
+ },
2368
+ {
2369
+ "epoch": 37.69,
2370
+ "learning_rate": 7.4043956043956046e-06,
2371
+ "loss": 0.1188,
2372
+ "step": 3430
2373
+ },
2374
+ {
2375
+ "epoch": 37.8,
2376
+ "learning_rate": 7.338461538461539e-06,
2377
+ "loss": 0.1788,
2378
+ "step": 3440
2379
+ },
2380
+ {
2381
+ "epoch": 37.91,
2382
+ "learning_rate": 7.272527472527472e-06,
2383
+ "loss": 0.1241,
2384
+ "step": 3450
2385
+ },
2386
+ {
2387
+ "epoch": 38.02,
2388
+ "learning_rate": 7.2065934065934065e-06,
2389
+ "loss": 0.1308,
2390
+ "step": 3460
2391
+ },
2392
+ {
2393
+ "epoch": 38.13,
2394
+ "learning_rate": 7.140659340659341e-06,
2395
+ "loss": 0.1154,
2396
+ "step": 3470
2397
+ },
2398
+ {
2399
+ "epoch": 38.24,
2400
+ "learning_rate": 7.0747252747252756e-06,
2401
+ "loss": 0.1787,
2402
+ "step": 3480
2403
+ },
2404
+ {
2405
+ "epoch": 38.35,
2406
+ "learning_rate": 7.008791208791208e-06,
2407
+ "loss": 0.1262,
2408
+ "step": 3490
2409
+ },
2410
+ {
2411
+ "epoch": 38.46,
2412
+ "learning_rate": 6.942857142857143e-06,
2413
+ "loss": 0.0801,
2414
+ "step": 3500
2415
+ },
2416
+ {
2417
+ "epoch": 38.46,
2418
+ "eval_accuracy": 0.7529411764705882,
2419
+ "eval_loss": 0.8170506358146667,
2420
+ "eval_runtime": 1.6607,
2421
+ "eval_samples_per_second": 102.365,
2422
+ "eval_steps_per_second": 13.247,
2423
+ "step": 3500
2424
+ },
2425
+ {
2426
+ "epoch": 38.57,
2427
+ "learning_rate": 6.8769230769230775e-06,
2428
+ "loss": 0.098,
2429
+ "step": 3510
2430
+ },
2431
+ {
2432
+ "epoch": 38.68,
2433
+ "learning_rate": 6.810989010989011e-06,
2434
+ "loss": 0.1109,
2435
+ "step": 3520
2436
+ },
2437
+ {
2438
+ "epoch": 38.79,
2439
+ "learning_rate": 6.745054945054945e-06,
2440
+ "loss": 0.1078,
2441
+ "step": 3530
2442
+ },
2443
+ {
2444
+ "epoch": 38.9,
2445
+ "learning_rate": 6.679120879120879e-06,
2446
+ "loss": 0.0767,
2447
+ "step": 3540
2448
+ },
2449
+ {
2450
+ "epoch": 39.01,
2451
+ "learning_rate": 6.613186813186814e-06,
2452
+ "loss": 0.0886,
2453
+ "step": 3550
2454
+ },
2455
+ {
2456
+ "epoch": 39.12,
2457
+ "learning_rate": 6.547252747252747e-06,
2458
+ "loss": 0.0631,
2459
+ "step": 3560
2460
+ },
2461
+ {
2462
+ "epoch": 39.23,
2463
+ "learning_rate": 6.481318681318681e-06,
2464
+ "loss": 0.0737,
2465
+ "step": 3570
2466
+ },
2467
+ {
2468
+ "epoch": 39.34,
2469
+ "learning_rate": 6.415384615384616e-06,
2470
+ "loss": 0.1207,
2471
+ "step": 3580
2472
+ },
2473
+ {
2474
+ "epoch": 39.45,
2475
+ "learning_rate": 6.3494505494505496e-06,
2476
+ "loss": 0.0831,
2477
+ "step": 3590
2478
+ },
2479
+ {
2480
+ "epoch": 39.56,
2481
+ "learning_rate": 6.283516483516483e-06,
2482
+ "loss": 0.1038,
2483
+ "step": 3600
2484
+ },
2485
+ {
2486
+ "epoch": 39.56,
2487
+ "eval_accuracy": 0.7058823529411765,
2488
+ "eval_loss": 0.9464375376701355,
2489
+ "eval_runtime": 1.6231,
2490
+ "eval_samples_per_second": 104.738,
2491
+ "eval_steps_per_second": 13.554,
2492
+ "step": 3600
2493
+ },
2494
+ {
2495
+ "epoch": 39.67,
2496
+ "learning_rate": 6.217582417582418e-06,
2497
+ "loss": 0.1151,
2498
+ "step": 3610
2499
+ },
2500
+ {
2501
+ "epoch": 39.78,
2502
+ "learning_rate": 6.151648351648352e-06,
2503
+ "loss": 0.113,
2504
+ "step": 3620
2505
+ },
2506
+ {
2507
+ "epoch": 39.89,
2508
+ "learning_rate": 6.085714285714285e-06,
2509
+ "loss": 0.1226,
2510
+ "step": 3630
2511
+ },
2512
+ {
2513
+ "epoch": 40.0,
2514
+ "learning_rate": 6.01978021978022e-06,
2515
+ "loss": 0.0645,
2516
+ "step": 3640
2517
+ },
2518
+ {
2519
+ "epoch": 40.11,
2520
+ "learning_rate": 5.953846153846154e-06,
2521
+ "loss": 0.1398,
2522
+ "step": 3650
2523
+ },
2524
+ {
2525
+ "epoch": 40.22,
2526
+ "learning_rate": 5.887912087912089e-06,
2527
+ "loss": 0.0801,
2528
+ "step": 3660
2529
+ },
2530
+ {
2531
+ "epoch": 40.33,
2532
+ "learning_rate": 5.821978021978022e-06,
2533
+ "loss": 0.0787,
2534
+ "step": 3670
2535
+ },
2536
+ {
2537
+ "epoch": 40.44,
2538
+ "learning_rate": 5.756043956043956e-06,
2539
+ "loss": 0.1388,
2540
+ "step": 3680
2541
+ },
2542
+ {
2543
+ "epoch": 40.55,
2544
+ "learning_rate": 5.690109890109891e-06,
2545
+ "loss": 0.0356,
2546
+ "step": 3690
2547
+ },
2548
+ {
2549
+ "epoch": 40.66,
2550
+ "learning_rate": 5.624175824175824e-06,
2551
+ "loss": 0.16,
2552
+ "step": 3700
2553
+ },
2554
+ {
2555
+ "epoch": 40.66,
2556
+ "eval_accuracy": 0.7705882352941177,
2557
+ "eval_loss": 0.8004665374755859,
2558
+ "eval_runtime": 1.6262,
2559
+ "eval_samples_per_second": 104.537,
2560
+ "eval_steps_per_second": 13.528,
2561
+ "step": 3700
2562
+ },
2563
+ {
2564
+ "epoch": 40.77,
2565
+ "learning_rate": 5.558241758241758e-06,
2566
+ "loss": 0.1363,
2567
+ "step": 3710
2568
+ },
2569
+ {
2570
+ "epoch": 40.88,
2571
+ "learning_rate": 5.492307692307693e-06,
2572
+ "loss": 0.0803,
2573
+ "step": 3720
2574
+ },
2575
+ {
2576
+ "epoch": 40.99,
2577
+ "learning_rate": 5.426373626373627e-06,
2578
+ "loss": 0.077,
2579
+ "step": 3730
2580
+ },
2581
+ {
2582
+ "epoch": 41.1,
2583
+ "learning_rate": 5.36043956043956e-06,
2584
+ "loss": 0.0912,
2585
+ "step": 3740
2586
+ },
2587
+ {
2588
+ "epoch": 41.21,
2589
+ "learning_rate": 5.2945054945054946e-06,
2590
+ "loss": 0.1425,
2591
+ "step": 3750
2592
+ },
2593
+ {
2594
+ "epoch": 41.32,
2595
+ "learning_rate": 5.228571428571429e-06,
2596
+ "loss": 0.0815,
2597
+ "step": 3760
2598
+ },
2599
+ {
2600
+ "epoch": 41.43,
2601
+ "learning_rate": 5.162637362637363e-06,
2602
+ "loss": 0.0987,
2603
+ "step": 3770
2604
+ },
2605
+ {
2606
+ "epoch": 41.54,
2607
+ "learning_rate": 5.0967032967032965e-06,
2608
+ "loss": 0.0457,
2609
+ "step": 3780
2610
+ },
2611
+ {
2612
+ "epoch": 41.65,
2613
+ "learning_rate": 5.030769230769231e-06,
2614
+ "loss": 0.1181,
2615
+ "step": 3790
2616
+ },
2617
+ {
2618
+ "epoch": 41.76,
2619
+ "learning_rate": 4.9648351648351656e-06,
2620
+ "loss": 0.1151,
2621
+ "step": 3800
2622
+ },
2623
+ {
2624
+ "epoch": 41.76,
2625
+ "eval_accuracy": 0.7470588235294118,
2626
+ "eval_loss": 0.8784206509590149,
2627
+ "eval_runtime": 1.6555,
2628
+ "eval_samples_per_second": 102.688,
2629
+ "eval_steps_per_second": 13.289,
2630
+ "step": 3800
2631
+ },
2632
+ {
2633
+ "epoch": 41.87,
2634
+ "learning_rate": 4.898901098901098e-06,
2635
+ "loss": 0.1219,
2636
+ "step": 3810
2637
+ },
2638
+ {
2639
+ "epoch": 41.98,
2640
+ "learning_rate": 4.832967032967033e-06,
2641
+ "loss": 0.0719,
2642
+ "step": 3820
2643
+ },
2644
+ {
2645
+ "epoch": 42.09,
2646
+ "learning_rate": 4.7670329670329675e-06,
2647
+ "loss": 0.0647,
2648
+ "step": 3830
2649
+ },
2650
+ {
2651
+ "epoch": 42.2,
2652
+ "learning_rate": 4.701098901098901e-06,
2653
+ "loss": 0.1066,
2654
+ "step": 3840
2655
+ },
2656
+ {
2657
+ "epoch": 42.31,
2658
+ "learning_rate": 4.635164835164835e-06,
2659
+ "loss": 0.052,
2660
+ "step": 3850
2661
+ },
2662
+ {
2663
+ "epoch": 42.42,
2664
+ "learning_rate": 4.569230769230769e-06,
2665
+ "loss": 0.0658,
2666
+ "step": 3860
2667
+ },
2668
+ {
2669
+ "epoch": 42.53,
2670
+ "learning_rate": 4.503296703296704e-06,
2671
+ "loss": 0.0851,
2672
+ "step": 3870
2673
+ },
2674
+ {
2675
+ "epoch": 42.64,
2676
+ "learning_rate": 4.437362637362637e-06,
2677
+ "loss": 0.071,
2678
+ "step": 3880
2679
+ },
2680
+ {
2681
+ "epoch": 42.75,
2682
+ "learning_rate": 4.371428571428571e-06,
2683
+ "loss": 0.0922,
2684
+ "step": 3890
2685
+ },
2686
+ {
2687
+ "epoch": 42.86,
2688
+ "learning_rate": 4.305494505494506e-06,
2689
+ "loss": 0.1159,
2690
+ "step": 3900
2691
+ },
2692
+ {
2693
+ "epoch": 42.86,
2694
+ "eval_accuracy": 0.7470588235294118,
2695
+ "eval_loss": 0.85979163646698,
2696
+ "eval_runtime": 1.6327,
2697
+ "eval_samples_per_second": 104.124,
2698
+ "eval_steps_per_second": 13.475,
2699
+ "step": 3900
2700
+ },
2701
+ {
2702
+ "epoch": 42.97,
2703
+ "learning_rate": 4.2395604395604396e-06,
2704
+ "loss": 0.1347,
2705
+ "step": 3910
2706
+ },
2707
+ {
2708
+ "epoch": 43.08,
2709
+ "learning_rate": 4.173626373626373e-06,
2710
+ "loss": 0.0455,
2711
+ "step": 3920
2712
+ },
2713
+ {
2714
+ "epoch": 43.19,
2715
+ "learning_rate": 4.107692307692308e-06,
2716
+ "loss": 0.054,
2717
+ "step": 3930
2718
+ },
2719
+ {
2720
+ "epoch": 43.3,
2721
+ "learning_rate": 4.041758241758242e-06,
2722
+ "loss": 0.1078,
2723
+ "step": 3940
2724
+ },
2725
+ {
2726
+ "epoch": 43.41,
2727
+ "learning_rate": 3.975824175824176e-06,
2728
+ "loss": 0.0738,
2729
+ "step": 3950
2730
+ },
2731
+ {
2732
+ "epoch": 43.52,
2733
+ "learning_rate": 3.90989010989011e-06,
2734
+ "loss": 0.0672,
2735
+ "step": 3960
2736
+ },
2737
+ {
2738
+ "epoch": 43.63,
2739
+ "learning_rate": 3.843956043956044e-06,
2740
+ "loss": 0.0476,
2741
+ "step": 3970
2742
+ },
2743
+ {
2744
+ "epoch": 43.74,
2745
+ "learning_rate": 3.7780219780219784e-06,
2746
+ "loss": 0.1283,
2747
+ "step": 3980
2748
+ },
2749
+ {
2750
+ "epoch": 43.85,
2751
+ "learning_rate": 3.712087912087912e-06,
2752
+ "loss": 0.1116,
2753
+ "step": 3990
2754
+ },
2755
+ {
2756
+ "epoch": 43.96,
2757
+ "learning_rate": 3.646153846153846e-06,
2758
+ "loss": 0.0575,
2759
+ "step": 4000
2760
+ },
2761
+ {
2762
+ "epoch": 43.96,
2763
+ "eval_accuracy": 0.7529411764705882,
2764
+ "eval_loss": 0.8543334007263184,
2765
+ "eval_runtime": 1.6405,
2766
+ "eval_samples_per_second": 103.629,
2767
+ "eval_steps_per_second": 13.411,
2768
+ "step": 4000
2769
+ },
2770
+ {
2771
+ "epoch": 44.07,
2772
+ "learning_rate": 3.5802197802197803e-06,
2773
+ "loss": 0.079,
2774
+ "step": 4010
2775
+ },
2776
+ {
2777
+ "epoch": 44.18,
2778
+ "learning_rate": 3.5142857142857144e-06,
2779
+ "loss": 0.0763,
2780
+ "step": 4020
2781
+ },
2782
+ {
2783
+ "epoch": 44.29,
2784
+ "learning_rate": 3.4483516483516485e-06,
2785
+ "loss": 0.0445,
2786
+ "step": 4030
2787
+ },
2788
+ {
2789
+ "epoch": 44.4,
2790
+ "learning_rate": 3.3824175824175826e-06,
2791
+ "loss": 0.0939,
2792
+ "step": 4040
2793
+ },
2794
+ {
2795
+ "epoch": 44.51,
2796
+ "learning_rate": 3.3164835164835163e-06,
2797
+ "loss": 0.1475,
2798
+ "step": 4050
2799
+ },
2800
+ {
2801
+ "epoch": 44.62,
2802
+ "learning_rate": 3.250549450549451e-06,
2803
+ "loss": 0.1596,
2804
+ "step": 4060
2805
+ },
2806
+ {
2807
+ "epoch": 44.73,
2808
+ "learning_rate": 3.1846153846153846e-06,
2809
+ "loss": 0.1322,
2810
+ "step": 4070
2811
+ },
2812
+ {
2813
+ "epoch": 44.84,
2814
+ "learning_rate": 3.1186813186813187e-06,
2815
+ "loss": 0.1158,
2816
+ "step": 4080
2817
+ },
2818
+ {
2819
+ "epoch": 44.95,
2820
+ "learning_rate": 3.0527472527472528e-06,
2821
+ "loss": 0.075,
2822
+ "step": 4090
2823
+ },
2824
+ {
2825
+ "epoch": 45.05,
2826
+ "learning_rate": 2.986813186813187e-06,
2827
+ "loss": 0.164,
2828
+ "step": 4100
2829
+ },
2830
+ {
2831
+ "epoch": 45.05,
2832
+ "eval_accuracy": 0.7588235294117647,
2833
+ "eval_loss": 0.8658636808395386,
2834
+ "eval_runtime": 1.518,
2835
+ "eval_samples_per_second": 111.989,
2836
+ "eval_steps_per_second": 14.493,
2837
+ "step": 4100
2838
+ },
2839
+ {
2840
+ "epoch": 45.16,
2841
+ "learning_rate": 2.920879120879121e-06,
2842
+ "loss": 0.0847,
2843
+ "step": 4110
2844
+ },
2845
+ {
2846
+ "epoch": 45.27,
2847
+ "learning_rate": 2.854945054945055e-06,
2848
+ "loss": 0.1098,
2849
+ "step": 4120
2850
+ },
2851
+ {
2852
+ "epoch": 45.38,
2853
+ "learning_rate": 2.7890109890109892e-06,
2854
+ "loss": 0.1282,
2855
+ "step": 4130
2856
+ },
2857
+ {
2858
+ "epoch": 45.49,
2859
+ "learning_rate": 2.723076923076923e-06,
2860
+ "loss": 0.0736,
2861
+ "step": 4140
2862
+ },
2863
+ {
2864
+ "epoch": 45.6,
2865
+ "learning_rate": 2.657142857142857e-06,
2866
+ "loss": 0.1344,
2867
+ "step": 4150
2868
+ },
2869
+ {
2870
+ "epoch": 45.71,
2871
+ "learning_rate": 2.591208791208791e-06,
2872
+ "loss": 0.1191,
2873
+ "step": 4160
2874
+ },
2875
+ {
2876
+ "epoch": 45.82,
2877
+ "learning_rate": 2.5252747252747253e-06,
2878
+ "loss": 0.0721,
2879
+ "step": 4170
2880
+ },
2881
+ {
2882
+ "epoch": 45.93,
2883
+ "learning_rate": 2.4593406593406594e-06,
2884
+ "loss": 0.0371,
2885
+ "step": 4180
2886
+ },
2887
+ {
2888
+ "epoch": 46.04,
2889
+ "learning_rate": 2.3934065934065935e-06,
2890
+ "loss": 0.0407,
2891
+ "step": 4190
2892
+ },
2893
+ {
2894
+ "epoch": 46.15,
2895
+ "learning_rate": 2.3274725274725276e-06,
2896
+ "loss": 0.1319,
2897
+ "step": 4200
2898
+ },
2899
+ {
2900
+ "epoch": 46.15,
2901
+ "eval_accuracy": 0.7411764705882353,
2902
+ "eval_loss": 0.885351300239563,
2903
+ "eval_runtime": 1.4998,
2904
+ "eval_samples_per_second": 113.347,
2905
+ "eval_steps_per_second": 14.668,
2906
+ "step": 4200
2907
+ },
2908
+ {
2909
+ "epoch": 46.26,
2910
+ "learning_rate": 2.2615384615384617e-06,
2911
+ "loss": 0.1159,
2912
+ "step": 4210
2913
+ },
2914
+ {
2915
+ "epoch": 46.37,
2916
+ "learning_rate": 2.195604395604396e-06,
2917
+ "loss": 0.0659,
2918
+ "step": 4220
2919
+ },
2920
+ {
2921
+ "epoch": 46.48,
2922
+ "learning_rate": 2.1296703296703296e-06,
2923
+ "loss": 0.0948,
2924
+ "step": 4230
2925
+ },
2926
+ {
2927
+ "epoch": 46.59,
2928
+ "learning_rate": 2.0637362637362637e-06,
2929
+ "loss": 0.0886,
2930
+ "step": 4240
2931
+ },
2932
+ {
2933
+ "epoch": 46.7,
2934
+ "learning_rate": 1.9978021978021978e-06,
2935
+ "loss": 0.0889,
2936
+ "step": 4250
2937
+ },
2938
+ {
2939
+ "epoch": 46.81,
2940
+ "learning_rate": 1.931868131868132e-06,
2941
+ "loss": 0.0339,
2942
+ "step": 4260
2943
+ },
2944
+ {
2945
+ "epoch": 46.92,
2946
+ "learning_rate": 1.8659340659340658e-06,
2947
+ "loss": 0.0697,
2948
+ "step": 4270
2949
+ },
2950
+ {
2951
+ "epoch": 47.03,
2952
+ "learning_rate": 1.8e-06,
2953
+ "loss": 0.0777,
2954
+ "step": 4280
2955
+ },
2956
+ {
2957
+ "epoch": 47.14,
2958
+ "learning_rate": 1.734065934065934e-06,
2959
+ "loss": 0.0546,
2960
+ "step": 4290
2961
+ },
2962
+ {
2963
+ "epoch": 47.25,
2964
+ "learning_rate": 1.6681318681318681e-06,
2965
+ "loss": 0.0489,
2966
+ "step": 4300
2967
+ },
2968
+ {
2969
+ "epoch": 47.25,
2970
+ "eval_accuracy": 0.7588235294117647,
2971
+ "eval_loss": 0.7507675886154175,
2972
+ "eval_runtime": 1.5139,
2973
+ "eval_samples_per_second": 112.29,
2974
+ "eval_steps_per_second": 14.532,
2975
+ "step": 4300
2976
+ },
2977
+ {
2978
+ "epoch": 47.36,
2979
+ "learning_rate": 1.6021978021978023e-06,
2980
+ "loss": 0.087,
2981
+ "step": 4310
2982
+ },
2983
+ {
2984
+ "epoch": 47.47,
2985
+ "learning_rate": 1.5362637362637364e-06,
2986
+ "loss": 0.0452,
2987
+ "step": 4320
2988
+ },
2989
+ {
2990
+ "epoch": 47.58,
2991
+ "learning_rate": 1.4703296703296705e-06,
2992
+ "loss": 0.0508,
2993
+ "step": 4330
2994
+ },
2995
+ {
2996
+ "epoch": 47.69,
2997
+ "learning_rate": 1.4043956043956046e-06,
2998
+ "loss": 0.0812,
2999
+ "step": 4340
3000
+ },
3001
+ {
3002
+ "epoch": 47.8,
3003
+ "learning_rate": 1.3384615384615383e-06,
3004
+ "loss": 0.0721,
3005
+ "step": 4350
3006
+ },
3007
+ {
3008
+ "epoch": 47.91,
3009
+ "learning_rate": 1.2725274725274724e-06,
3010
+ "loss": 0.0835,
3011
+ "step": 4360
3012
+ },
3013
+ {
3014
+ "epoch": 48.02,
3015
+ "learning_rate": 1.2065934065934065e-06,
3016
+ "loss": 0.1077,
3017
+ "step": 4370
3018
+ },
3019
+ {
3020
+ "epoch": 48.13,
3021
+ "learning_rate": 1.1406593406593406e-06,
3022
+ "loss": 0.0636,
3023
+ "step": 4380
3024
+ },
3025
+ {
3026
+ "epoch": 48.24,
3027
+ "learning_rate": 1.0747252747252748e-06,
3028
+ "loss": 0.1026,
3029
+ "step": 4390
3030
+ },
3031
+ {
3032
+ "epoch": 48.35,
3033
+ "learning_rate": 1.0087912087912089e-06,
3034
+ "loss": 0.0678,
3035
+ "step": 4400
3036
+ },
3037
+ {
3038
+ "epoch": 48.35,
3039
+ "eval_accuracy": 0.7352941176470589,
3040
+ "eval_loss": 0.8784447312355042,
3041
+ "eval_runtime": 1.5075,
3042
+ "eval_samples_per_second": 112.768,
3043
+ "eval_steps_per_second": 14.594,
3044
+ "step": 4400
3045
+ },
3046
+ {
3047
+ "epoch": 48.46,
3048
+ "learning_rate": 9.42857142857143e-07,
3049
+ "loss": 0.072,
3050
+ "step": 4410
3051
+ },
3052
+ {
3053
+ "epoch": 48.57,
3054
+ "learning_rate": 8.769230769230769e-07,
3055
+ "loss": 0.0526,
3056
+ "step": 4420
3057
+ },
3058
+ {
3059
+ "epoch": 48.68,
3060
+ "learning_rate": 8.10989010989011e-07,
3061
+ "loss": 0.0802,
3062
+ "step": 4430
3063
+ },
3064
+ {
3065
+ "epoch": 48.79,
3066
+ "learning_rate": 7.450549450549451e-07,
3067
+ "loss": 0.046,
3068
+ "step": 4440
3069
+ },
3070
+ {
3071
+ "epoch": 48.9,
3072
+ "learning_rate": 6.791208791208791e-07,
3073
+ "loss": 0.0656,
3074
+ "step": 4450
3075
+ },
3076
+ {
3077
+ "epoch": 49.01,
3078
+ "learning_rate": 6.131868131868131e-07,
3079
+ "loss": 0.139,
3080
+ "step": 4460
3081
+ },
3082
+ {
3083
+ "epoch": 49.12,
3084
+ "learning_rate": 5.472527472527473e-07,
3085
+ "loss": 0.0576,
3086
+ "step": 4470
3087
+ },
3088
+ {
3089
+ "epoch": 49.23,
3090
+ "learning_rate": 4.813186813186814e-07,
3091
+ "loss": 0.0686,
3092
+ "step": 4480
3093
+ },
3094
+ {
3095
+ "epoch": 49.34,
3096
+ "learning_rate": 4.153846153846154e-07,
3097
+ "loss": 0.0658,
3098
+ "step": 4490
3099
+ },
3100
+ {
3101
+ "epoch": 49.45,
3102
+ "learning_rate": 3.4945054945054945e-07,
3103
+ "loss": 0.0832,
3104
+ "step": 4500
3105
+ },
3106
+ {
3107
+ "epoch": 49.45,
3108
+ "eval_accuracy": 0.7764705882352941,
3109
+ "eval_loss": 0.7247602343559265,
3110
+ "eval_runtime": 1.5084,
3111
+ "eval_samples_per_second": 112.699,
3112
+ "eval_steps_per_second": 14.585,
3113
+ "step": 4500
3114
+ },
3115
+ {
3116
+ "epoch": 49.56,
3117
+ "learning_rate": 2.835164835164835e-07,
3118
+ "loss": 0.0692,
3119
+ "step": 4510
3120
+ },
3121
+ {
3122
+ "epoch": 49.67,
3123
+ "learning_rate": 2.175824175824176e-07,
3124
+ "loss": 0.1135,
3125
+ "step": 4520
3126
+ },
3127
+ {
3128
+ "epoch": 49.78,
3129
+ "learning_rate": 1.5164835164835167e-07,
3130
+ "loss": 0.1512,
3131
+ "step": 4530
3132
+ },
3133
+ {
3134
+ "epoch": 49.89,
3135
+ "learning_rate": 8.571428571428572e-08,
3136
+ "loss": 0.1339,
3137
+ "step": 4540
3138
+ },
3139
+ {
3140
+ "epoch": 50.0,
3141
+ "learning_rate": 1.9780219780219782e-08,
3142
+ "loss": 0.1045,
3143
+ "step": 4550
3144
+ },
3145
+ {
3146
+ "epoch": 50.0,
3147
+ "step": 4550,
3148
+ "total_flos": 5.622144623740109e+18,
3149
+ "train_loss": 0.2358846340533141,
3150
+ "train_runtime": 1979.1189,
3151
+ "train_samples_per_second": 36.658,
3152
+ "train_steps_per_second": 2.299
3153
  }
3154
  ],
3155
+ "max_steps": 4550,
3156
+ "num_train_epochs": 50,
3157
+ "total_flos": 5.622144623740109e+18,
3158
  "trial_name": null,
3159
  "trial_params": null
3160
  }