Soulaimen commited on
Commit
209fa63
1 Parent(s): f04433a

End of training

Browse files
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 9.93,
3
- "eval_accuracy": 0.9053803339517625,
4
- "eval_loss": 0.28196877241134644,
5
- "eval_runtime": 9.3286,
6
- "eval_samples_per_second": 57.78,
7
- "eval_steps_per_second": 7.289,
8
- "total_flos": 1.0221236731922227e+18,
9
- "train_loss": 0.4434790275817694,
10
- "train_runtime": 1771.1031,
11
- "train_samples_per_second": 27.356,
12
- "train_steps_per_second": 0.486
13
  }
 
1
  {
2
+ "epoch": 49.67,
3
+ "eval_accuracy": 0.974025974025974,
4
+ "eval_loss": 0.10644800215959549,
5
+ "eval_runtime": 9.2052,
6
+ "eval_samples_per_second": 58.554,
7
+ "eval_steps_per_second": 7.387,
8
+ "total_flos": 5.11036354111998e+18,
9
+ "train_loss": 0.23105980243793753,
10
+ "train_runtime": 6101.6743,
11
+ "train_samples_per_second": 39.702,
12
+ "train_steps_per_second": 0.705
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 9.93,
3
- "eval_accuracy": 0.9053803339517625,
4
- "eval_loss": 0.28196877241134644,
5
- "eval_runtime": 9.3286,
6
- "eval_samples_per_second": 57.78,
7
- "eval_steps_per_second": 7.289
8
  }
 
1
  {
2
+ "epoch": 49.67,
3
+ "eval_accuracy": 0.974025974025974,
4
+ "eval_loss": 0.10644800215959549,
5
+ "eval_runtime": 9.2052,
6
+ "eval_samples_per_second": 58.554,
7
+ "eval_steps_per_second": 7.387
8
  }
runs/Apr25_14-52-06_91b1909d3306/events.out.tfevents.1682440461.91b1909d3306.614.7 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3654ab3359b30ec650bd54c237bbbdf7e7e8c0e1ce8f63eddc7e87d5de2be77
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 9.93,
3
- "total_flos": 1.0221236731922227e+18,
4
- "train_loss": 0.4434790275817694,
5
- "train_runtime": 1771.1031,
6
- "train_samples_per_second": 27.356,
7
- "train_steps_per_second": 0.486
8
  }
 
1
  {
2
+ "epoch": 49.67,
3
+ "total_flos": 5.11036354111998e+18,
4
+ "train_loss": 0.23105980243793753,
5
+ "train_runtime": 6101.6743,
6
+ "train_samples_per_second": 39.702,
7
+ "train_steps_per_second": 0.705
8
  }
trainer_state.json CHANGED
@@ -1,631 +1,3055 @@
1
  {
2
- "best_metric": 0.9053803339517625,
3
- "best_model_checkpoint": "resnet-50-resnet50_fashion/checkpoint-692",
4
- "epoch": 9.933993399339935,
5
- "global_step": 860,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.12,
12
- "learning_rate": 4.994124559341951e-05,
13
- "loss": 0.6959,
14
  "step": 10
15
  },
16
  {
17
  "epoch": 0.23,
18
- "learning_rate": 4.935370152761457e-05,
19
- "loss": 0.6903,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 0.35,
24
- "learning_rate": 4.876615746180964e-05,
25
- "loss": 0.6815,
26
  "step": 30
27
  },
28
  {
29
  "epoch": 0.46,
30
- "learning_rate": 4.81786133960047e-05,
31
- "loss": 0.6753,
32
  "step": 40
33
  },
34
  {
35
  "epoch": 0.58,
36
- "learning_rate": 4.759106933019977e-05,
37
- "loss": 0.6688,
38
  "step": 50
39
  },
40
  {
41
  "epoch": 0.69,
42
- "learning_rate": 4.700352526439483e-05,
43
- "loss": 0.6613,
44
  "step": 60
45
  },
46
  {
47
  "epoch": 0.81,
48
- "learning_rate": 4.6415981198589895e-05,
49
- "loss": 0.6599,
50
  "step": 70
51
  },
52
  {
53
  "epoch": 0.92,
54
- "learning_rate": 4.582843713278496e-05,
55
- "loss": 0.6396,
56
  "step": 80
57
  },
58
  {
59
  "epoch": 0.99,
60
- "eval_accuracy": 0.7346938775510204,
61
- "eval_loss": 0.7625104188919067,
62
- "eval_runtime": 58.978,
63
- "eval_samples_per_second": 9.139,
64
- "eval_steps_per_second": 1.153,
65
  "step": 86
66
  },
67
  {
68
  "epoch": 1.04,
69
- "learning_rate": 4.524089306698003e-05,
70
- "loss": 0.6247,
71
  "step": 90
72
  },
73
  {
74
  "epoch": 1.16,
75
- "learning_rate": 4.465334900117509e-05,
76
- "loss": 0.628,
77
  "step": 100
78
  },
79
  {
80
  "epoch": 1.27,
81
- "learning_rate": 4.4065804935370154e-05,
82
- "loss": 0.632,
83
  "step": 110
84
  },
85
  {
86
  "epoch": 1.39,
87
- "learning_rate": 4.347826086956522e-05,
88
- "loss": 0.6004,
89
  "step": 120
90
  },
91
  {
92
  "epoch": 1.5,
93
- "learning_rate": 4.289071680376029e-05,
94
- "loss": 0.5901,
95
  "step": 130
96
  },
97
  {
98
  "epoch": 1.62,
99
- "learning_rate": 4.2303172737955346e-05,
100
- "loss": 0.5947,
101
  "step": 140
102
  },
103
  {
104
  "epoch": 1.73,
105
- "learning_rate": 4.171562867215041e-05,
106
- "loss": 0.5753,
107
  "step": 150
108
  },
109
  {
110
  "epoch": 1.85,
111
- "learning_rate": 4.112808460634548e-05,
112
- "loss": 0.5851,
113
  "step": 160
114
  },
115
  {
116
  "epoch": 1.96,
117
- "learning_rate": 4.0540540540540545e-05,
118
- "loss": 0.5646,
119
  "step": 170
120
  },
121
  {
122
  "epoch": 2.0,
123
- "eval_accuracy": 0.8348794063079777,
124
- "eval_loss": 0.5780991911888123,
125
- "eval_runtime": 9.3608,
126
- "eval_samples_per_second": 57.581,
127
- "eval_steps_per_second": 7.264,
128
  "step": 173
129
  },
130
  {
131
  "epoch": 2.08,
132
- "learning_rate": 3.9952996474735605e-05,
133
- "loss": 0.547,
134
  "step": 180
135
  },
136
  {
137
  "epoch": 2.19,
138
- "learning_rate": 3.936545240893067e-05,
139
- "loss": 0.5548,
140
  "step": 190
141
  },
142
  {
143
  "epoch": 2.31,
144
- "learning_rate": 3.877790834312574e-05,
145
- "loss": 0.5498,
146
  "step": 200
147
  },
148
  {
149
  "epoch": 2.43,
150
- "learning_rate": 3.81903642773208e-05,
151
- "loss": 0.5291,
152
  "step": 210
153
  },
154
  {
155
  "epoch": 2.54,
156
- "learning_rate": 3.760282021151586e-05,
157
- "loss": 0.5145,
158
  "step": 220
159
  },
160
  {
161
  "epoch": 2.66,
162
- "learning_rate": 3.701527614571093e-05,
163
- "loss": 0.4981,
164
  "step": 230
165
  },
166
  {
167
  "epoch": 2.77,
168
- "learning_rate": 3.6427732079905996e-05,
169
- "loss": 0.5022,
170
  "step": 240
171
  },
172
  {
173
  "epoch": 2.89,
174
- "learning_rate": 3.584018801410106e-05,
175
- "loss": 0.4768,
176
  "step": 250
177
  },
178
  {
179
  "epoch": 2.99,
180
- "eval_accuracy": 0.8571428571428571,
181
- "eval_loss": 0.47913119196891785,
182
- "eval_runtime": 8.4707,
183
- "eval_samples_per_second": 63.631,
184
- "eval_steps_per_second": 8.028,
185
  "step": 259
186
  },
187
  {
188
  "epoch": 3.0,
189
- "learning_rate": 3.525264394829612e-05,
190
- "loss": 0.4758,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 3.12,
195
- "learning_rate": 3.466509988249119e-05,
196
- "loss": 0.4699,
197
  "step": 270
198
  },
199
  {
200
  "epoch": 3.23,
201
- "learning_rate": 3.4077555816686255e-05,
202
- "loss": 0.4638,
203
  "step": 280
204
  },
205
  {
206
  "epoch": 3.35,
207
- "learning_rate": 3.3490011750881314e-05,
208
- "loss": 0.4378,
209
  "step": 290
210
  },
211
  {
212
  "epoch": 3.47,
213
- "learning_rate": 3.290246768507638e-05,
214
- "loss": 0.4795,
215
  "step": 300
216
  },
217
  {
218
  "epoch": 3.58,
219
- "learning_rate": 3.231492361927145e-05,
220
- "loss": 0.4522,
221
  "step": 310
222
  },
223
  {
224
  "epoch": 3.7,
225
- "learning_rate": 3.172737955346651e-05,
226
- "loss": 0.4456,
227
  "step": 320
228
  },
229
  {
230
  "epoch": 3.81,
231
- "learning_rate": 3.113983548766158e-05,
232
- "loss": 0.4489,
233
  "step": 330
234
  },
235
  {
236
  "epoch": 3.93,
237
- "learning_rate": 3.055229142185664e-05,
238
- "loss": 0.4161,
239
  "step": 340
240
  },
241
  {
242
  "epoch": 4.0,
243
- "eval_accuracy": 0.8905380333951762,
244
- "eval_loss": 0.38660189509391785,
245
- "eval_runtime": 8.0364,
246
- "eval_samples_per_second": 67.07,
247
- "eval_steps_per_second": 8.461,
248
  "step": 346
249
  },
250
  {
251
  "epoch": 4.04,
252
- "learning_rate": 2.9964747356051702e-05,
253
- "loss": 0.415,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 4.16,
258
- "learning_rate": 2.9377203290246768e-05,
259
- "loss": 0.4333,
260
  "step": 360
261
  },
262
  {
263
  "epoch": 4.27,
264
- "learning_rate": 2.8789659224441835e-05,
265
- "loss": 0.4304,
266
  "step": 370
267
  },
268
  {
269
  "epoch": 4.39,
270
- "learning_rate": 2.82021151586369e-05,
271
- "loss": 0.4151,
272
  "step": 380
273
  },
274
  {
275
  "epoch": 4.5,
276
- "learning_rate": 2.7614571092831964e-05,
277
- "loss": 0.3846,
278
  "step": 390
279
  },
280
  {
281
  "epoch": 4.62,
282
- "learning_rate": 2.702702702702703e-05,
283
- "loss": 0.4447,
284
  "step": 400
285
  },
286
  {
287
  "epoch": 4.74,
288
- "learning_rate": 2.6439482961222096e-05,
289
- "loss": 0.4253,
290
  "step": 410
291
  },
292
  {
293
  "epoch": 4.85,
294
- "learning_rate": 2.5851938895417156e-05,
295
- "loss": 0.3669,
296
  "step": 420
297
  },
298
  {
299
  "epoch": 4.97,
300
- "learning_rate": 2.526439482961222e-05,
301
- "loss": 0.402,
302
  "step": 430
303
  },
304
  {
305
  "epoch": 4.99,
306
- "eval_accuracy": 0.9035250463821892,
307
- "eval_loss": 0.3293728232383728,
308
- "eval_runtime": 7.9465,
309
- "eval_samples_per_second": 67.829,
310
- "eval_steps_per_second": 8.557,
311
  "step": 432
312
  },
313
  {
314
  "epoch": 5.08,
315
- "learning_rate": 2.4676850763807285e-05,
316
- "loss": 0.4157,
317
  "step": 440
318
  },
319
  {
320
  "epoch": 5.2,
321
- "learning_rate": 2.408930669800235e-05,
322
- "loss": 0.372,
323
  "step": 450
324
  },
325
  {
326
  "epoch": 5.31,
327
- "learning_rate": 2.3501762632197415e-05,
328
- "loss": 0.4115,
329
  "step": 460
330
  },
331
  {
332
  "epoch": 5.43,
333
- "learning_rate": 2.291421856639248e-05,
334
- "loss": 0.3954,
335
  "step": 470
336
  },
337
  {
338
  "epoch": 5.54,
339
- "learning_rate": 2.2326674500587544e-05,
340
- "loss": 0.395,
341
  "step": 480
342
  },
343
  {
344
  "epoch": 5.66,
345
- "learning_rate": 2.173913043478261e-05,
346
- "loss": 0.378,
347
  "step": 490
348
  },
349
  {
350
  "epoch": 5.78,
351
- "learning_rate": 2.1151586368977673e-05,
352
- "loss": 0.354,
353
  "step": 500
354
  },
355
  {
356
  "epoch": 5.89,
357
- "learning_rate": 2.056404230317274e-05,
358
- "loss": 0.369,
359
  "step": 510
360
  },
361
  {
362
  "epoch": 6.0,
363
- "eval_accuracy": 0.8923933209647495,
364
- "eval_loss": 1.0405044555664062,
365
- "eval_runtime": 7.8895,
366
- "eval_samples_per_second": 68.318,
367
- "eval_steps_per_second": 8.619,
368
  "step": 519
369
  },
370
  {
371
  "epoch": 6.01,
372
- "learning_rate": 1.9976498237367802e-05,
373
- "loss": 0.3544,
374
  "step": 520
375
  },
376
  {
377
  "epoch": 6.12,
378
- "learning_rate": 1.938895417156287e-05,
379
- "loss": 0.3565,
380
  "step": 530
381
  },
382
  {
383
  "epoch": 6.24,
384
- "learning_rate": 1.880141010575793e-05,
385
- "loss": 0.3868,
386
  "step": 540
387
  },
388
  {
389
  "epoch": 6.35,
390
- "learning_rate": 1.8213866039952998e-05,
391
- "loss": 0.4005,
392
  "step": 550
393
  },
394
  {
395
  "epoch": 6.47,
396
- "learning_rate": 1.762632197414806e-05,
397
- "loss": 0.3524,
398
  "step": 560
399
  },
400
  {
401
  "epoch": 6.58,
402
- "learning_rate": 1.7038777908343127e-05,
403
- "loss": 0.4008,
404
  "step": 570
405
  },
406
  {
407
  "epoch": 6.7,
408
- "learning_rate": 1.645123384253819e-05,
409
- "loss": 0.3571,
410
  "step": 580
411
  },
412
  {
413
  "epoch": 6.82,
414
- "learning_rate": 1.5863689776733257e-05,
415
- "loss": 0.3398,
416
  "step": 590
417
  },
418
  {
419
  "epoch": 6.93,
420
- "learning_rate": 1.527614571092832e-05,
421
- "loss": 0.3512,
422
  "step": 600
423
  },
424
  {
425
  "epoch": 7.0,
426
- "eval_accuracy": 0.8905380333951762,
427
- "eval_loss": 1.4846545457839966,
428
- "eval_runtime": 8.0342,
429
- "eval_samples_per_second": 67.089,
430
- "eval_steps_per_second": 8.464,
431
  "step": 606
432
  },
433
  {
434
  "epoch": 7.05,
435
- "learning_rate": 1.4688601645123384e-05,
436
- "loss": 0.3451,
437
  "step": 610
438
  },
439
  {
440
  "epoch": 7.16,
441
- "learning_rate": 1.410105757931845e-05,
442
- "loss": 0.3489,
443
  "step": 620
444
  },
445
  {
446
  "epoch": 7.28,
447
- "learning_rate": 1.3513513513513515e-05,
448
- "loss": 0.3414,
449
  "step": 630
450
  },
451
  {
452
  "epoch": 7.39,
453
- "learning_rate": 1.2925969447708578e-05,
454
- "loss": 0.3867,
455
  "step": 640
456
  },
457
  {
458
  "epoch": 7.51,
459
- "learning_rate": 1.2338425381903643e-05,
460
- "loss": 0.3509,
461
  "step": 650
462
  },
463
  {
464
  "epoch": 7.62,
465
- "learning_rate": 1.1750881316098707e-05,
466
- "loss": 0.33,
467
  "step": 660
468
  },
469
  {
470
  "epoch": 7.74,
471
- "learning_rate": 1.1163337250293772e-05,
472
- "loss": 0.3678,
473
  "step": 670
474
  },
475
  {
476
  "epoch": 7.85,
477
- "learning_rate": 1.0575793184488837e-05,
478
- "loss": 0.3481,
479
  "step": 680
480
  },
481
  {
482
  "epoch": 7.97,
483
- "learning_rate": 9.988249118683901e-06,
484
- "loss": 0.3439,
485
  "step": 690
486
  },
487
  {
488
  "epoch": 7.99,
489
- "eval_accuracy": 0.9053803339517625,
490
- "eval_loss": 0.28196877241134644,
491
- "eval_runtime": 8.0187,
492
- "eval_samples_per_second": 67.218,
493
- "eval_steps_per_second": 8.48,
494
  "step": 692
495
  },
496
  {
497
  "epoch": 8.09,
498
- "learning_rate": 9.400705052878966e-06,
499
- "loss": 0.3763,
500
  "step": 700
501
  },
502
  {
503
  "epoch": 8.2,
504
- "learning_rate": 8.81316098707403e-06,
505
- "loss": 0.3435,
506
  "step": 710
507
  },
508
  {
509
  "epoch": 8.32,
510
- "learning_rate": 8.225616921269095e-06,
511
- "loss": 0.3455,
512
  "step": 720
513
  },
514
  {
515
  "epoch": 8.43,
516
- "learning_rate": 7.63807285546416e-06,
517
- "loss": 0.3422,
518
  "step": 730
519
  },
520
  {
521
  "epoch": 8.55,
522
- "learning_rate": 7.050528789659225e-06,
523
- "loss": 0.3484,
524
  "step": 740
525
  },
526
  {
527
  "epoch": 8.66,
528
- "learning_rate": 6.462984723854289e-06,
529
- "loss": 0.3342,
530
  "step": 750
531
  },
532
  {
533
  "epoch": 8.78,
534
- "learning_rate": 5.875440658049354e-06,
535
- "loss": 0.3657,
536
  "step": 760
537
  },
538
  {
539
  "epoch": 8.89,
540
- "learning_rate": 5.287896592244418e-06,
541
- "loss": 0.3306,
542
  "step": 770
543
  },
544
  {
545
  "epoch": 9.0,
546
- "eval_accuracy": 0.8849721706864564,
547
- "eval_loss": 0.3021787703037262,
548
- "eval_runtime": 8.1523,
549
- "eval_samples_per_second": 66.116,
550
- "eval_steps_per_second": 8.341,
551
  "step": 779
552
  },
553
  {
554
  "epoch": 9.01,
555
- "learning_rate": 4.700352526439483e-06,
556
- "loss": 0.3541,
557
  "step": 780
558
  },
559
  {
560
  "epoch": 9.13,
561
- "learning_rate": 4.1128084606345476e-06,
562
- "loss": 0.3233,
563
  "step": 790
564
  },
565
  {
566
  "epoch": 9.24,
567
- "learning_rate": 3.5252643948296126e-06,
568
- "loss": 0.3345,
569
  "step": 800
570
  },
571
  {
572
  "epoch": 9.36,
573
- "learning_rate": 2.937720329024677e-06,
574
- "loss": 0.3387,
575
  "step": 810
576
  },
577
  {
578
  "epoch": 9.47,
579
- "learning_rate": 2.3501762632197415e-06,
580
- "loss": 0.3477,
581
  "step": 820
582
  },
583
  {
584
  "epoch": 9.59,
585
- "learning_rate": 1.7626321974148063e-06,
586
- "loss": 0.3148,
587
  "step": 830
588
  },
589
  {
590
  "epoch": 9.7,
591
- "learning_rate": 1.1750881316098707e-06,
592
- "loss": 0.337,
593
  "step": 840
594
  },
595
  {
596
  "epoch": 9.82,
597
- "learning_rate": 5.875440658049354e-07,
598
- "loss": 0.3339,
599
  "step": 850
600
  },
601
  {
602
  "epoch": 9.93,
603
- "learning_rate": 0.0,
604
- "loss": 0.3691,
605
  "step": 860
606
  },
607
  {
608
- "epoch": 9.93,
609
- "eval_accuracy": 0.8163265306122449,
610
- "eval_loss": 0.7981617450714111,
611
- "eval_runtime": 9.506,
612
- "eval_samples_per_second": 56.701,
613
- "eval_steps_per_second": 7.153,
614
- "step": 860
615
  },
616
  {
617
- "epoch": 9.93,
618
- "step": 860,
619
- "total_flos": 1.0221236731922227e+18,
620
- "train_loss": 0.4434790275817694,
621
- "train_runtime": 1771.1031,
622
- "train_samples_per_second": 27.356,
623
- "train_steps_per_second": 0.486
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
624
  }
625
  ],
626
- "max_steps": 860,
627
- "num_train_epochs": 10,
628
- "total_flos": 1.0221236731922227e+18,
629
  "trial_name": null,
630
  "trial_params": null
631
  }
 
1
  {
2
+ "best_metric": 0.974025974025974,
3
+ "best_model_checkpoint": "resnet-50-resnet50_fashion/checkpoint-2077",
4
+ "epoch": 49.66996699669967,
5
+ "global_step": 4300,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.12,
12
+ "learning_rate": 1.1627906976744187e-05,
13
+ "loss": 0.6908,
14
  "step": 10
15
  },
16
  {
17
  "epoch": 0.23,
18
+ "learning_rate": 2.3255813953488374e-05,
19
+ "loss": 0.6904,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 0.35,
24
+ "learning_rate": 3.488372093023256e-05,
25
+ "loss": 0.6864,
26
  "step": 30
27
  },
28
  {
29
  "epoch": 0.46,
30
+ "learning_rate": 4.651162790697675e-05,
31
+ "loss": 0.6821,
32
  "step": 40
33
  },
34
  {
35
  "epoch": 0.58,
36
+ "learning_rate": 4.991778247592201e-05,
37
+ "loss": 0.6771,
38
  "step": 50
39
  },
40
  {
41
  "epoch": 0.69,
42
+ "learning_rate": 4.9800328870096314e-05,
43
+ "loss": 0.6719,
44
  "step": 60
45
  },
46
  {
47
  "epoch": 0.81,
48
+ "learning_rate": 4.968287526427062e-05,
49
+ "loss": 0.6696,
50
  "step": 70
51
  },
52
  {
53
  "epoch": 0.92,
54
+ "learning_rate": 4.956542165844491e-05,
55
+ "loss": 0.6532,
56
  "step": 80
57
  },
58
  {
59
  "epoch": 0.99,
60
+ "eval_accuracy": 0.634508348794063,
61
+ "eval_loss": 0.6780841946601868,
62
+ "eval_runtime": 8.8727,
63
+ "eval_samples_per_second": 60.748,
64
+ "eval_steps_per_second": 7.664,
65
  "step": 86
66
  },
67
  {
68
  "epoch": 1.04,
69
+ "learning_rate": 4.944796805261922e-05,
70
+ "loss": 0.6442,
71
  "step": 90
72
  },
73
  {
74
  "epoch": 1.16,
75
+ "learning_rate": 4.933051444679352e-05,
76
+ "loss": 0.6432,
77
  "step": 100
78
  },
79
  {
80
  "epoch": 1.27,
81
+ "learning_rate": 4.9213060840967814e-05,
82
+ "loss": 0.64,
83
  "step": 110
84
  },
85
  {
86
  "epoch": 1.39,
87
+ "learning_rate": 4.9095607235142123e-05,
88
+ "loss": 0.6133,
89
  "step": 120
90
  },
91
  {
92
  "epoch": 1.5,
93
+ "learning_rate": 4.897815362931642e-05,
94
+ "loss": 0.5955,
95
  "step": 130
96
  },
97
  {
98
  "epoch": 1.62,
99
+ "learning_rate": 4.886070002349072e-05,
100
+ "loss": 0.5998,
101
  "step": 140
102
  },
103
  {
104
  "epoch": 1.73,
105
+ "learning_rate": 4.8743246417665025e-05,
106
+ "loss": 0.5724,
107
  "step": 150
108
  },
109
  {
110
  "epoch": 1.85,
111
+ "learning_rate": 4.862579281183933e-05,
112
+ "loss": 0.573,
113
  "step": 160
114
  },
115
  {
116
  "epoch": 1.96,
117
+ "learning_rate": 4.850833920601362e-05,
118
+ "loss": 0.5407,
119
  "step": 170
120
  },
121
  {
122
  "epoch": 2.0,
123
+ "eval_accuracy": 0.8589981447124304,
124
+ "eval_loss": 0.5222358107566833,
125
+ "eval_runtime": 8.0811,
126
+ "eval_samples_per_second": 66.699,
127
+ "eval_steps_per_second": 8.415,
128
  "step": 173
129
  },
130
  {
131
  "epoch": 2.08,
132
+ "learning_rate": 4.839088560018793e-05,
133
+ "loss": 0.5141,
134
  "step": 180
135
  },
136
  {
137
  "epoch": 2.19,
138
+ "learning_rate": 4.827343199436223e-05,
139
+ "loss": 0.5092,
140
  "step": 190
141
  },
142
  {
143
  "epoch": 2.31,
144
+ "learning_rate": 4.815597838853653e-05,
145
+ "loss": 0.5005,
146
  "step": 200
147
  },
148
  {
149
  "epoch": 2.43,
150
+ "learning_rate": 4.8038524782710834e-05,
151
+ "loss": 0.4718,
152
  "step": 210
153
  },
154
  {
155
  "epoch": 2.54,
156
+ "learning_rate": 4.7921071176885137e-05,
157
+ "loss": 0.4495,
158
  "step": 220
159
  },
160
  {
161
  "epoch": 2.66,
162
+ "learning_rate": 4.780361757105943e-05,
163
+ "loss": 0.4403,
164
  "step": 230
165
  },
166
  {
167
  "epoch": 2.77,
168
+ "learning_rate": 4.7686163965233735e-05,
169
+ "loss": 0.438,
170
  "step": 240
171
  },
172
  {
173
  "epoch": 2.89,
174
+ "learning_rate": 4.756871035940804e-05,
175
+ "loss": 0.4086,
176
  "step": 250
177
  },
178
  {
179
  "epoch": 2.99,
180
+ "eval_accuracy": 0.8923933209647495,
181
+ "eval_loss": 0.3594764173030853,
182
+ "eval_runtime": 8.0206,
183
+ "eval_samples_per_second": 67.202,
184
+ "eval_steps_per_second": 8.478,
185
  "step": 259
186
  },
187
  {
188
  "epoch": 3.0,
189
+ "learning_rate": 4.7451256753582334e-05,
190
+ "loss": 0.4199,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 3.12,
195
+ "learning_rate": 4.7333803147756636e-05,
196
+ "loss": 0.3936,
197
  "step": 270
198
  },
199
  {
200
  "epoch": 3.23,
201
+ "learning_rate": 4.721634954193094e-05,
202
+ "loss": 0.3951,
203
  "step": 280
204
  },
205
  {
206
  "epoch": 3.35,
207
+ "learning_rate": 4.709889593610524e-05,
208
+ "loss": 0.365,
209
  "step": 290
210
  },
211
  {
212
  "epoch": 3.47,
213
+ "learning_rate": 4.698144233027954e-05,
214
+ "loss": 0.4232,
215
  "step": 300
216
  },
217
  {
218
  "epoch": 3.58,
219
+ "learning_rate": 4.686398872445385e-05,
220
+ "loss": 0.3703,
221
  "step": 310
222
  },
223
  {
224
  "epoch": 3.7,
225
+ "learning_rate": 4.674653511862814e-05,
226
+ "loss": 0.3714,
227
  "step": 320
228
  },
229
  {
230
  "epoch": 3.81,
231
+ "learning_rate": 4.6629081512802445e-05,
232
+ "loss": 0.3859,
233
  "step": 330
234
  },
235
  {
236
  "epoch": 3.93,
237
+ "learning_rate": 4.651162790697675e-05,
238
+ "loss": 0.3449,
239
  "step": 340
240
  },
241
  {
242
  "epoch": 4.0,
243
+ "eval_accuracy": 0.9183673469387755,
244
+ "eval_loss": 0.2615828514099121,
245
+ "eval_runtime": 7.9757,
246
+ "eval_samples_per_second": 67.58,
247
+ "eval_steps_per_second": 8.526,
248
  "step": 346
249
  },
250
  {
251
  "epoch": 4.04,
252
+ "learning_rate": 4.639417430115105e-05,
253
+ "loss": 0.356,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 4.16,
258
+ "learning_rate": 4.627672069532535e-05,
259
+ "loss": 0.3595,
260
  "step": 360
261
  },
262
  {
263
  "epoch": 4.27,
264
+ "learning_rate": 4.615926708949965e-05,
265
+ "loss": 0.3519,
266
  "step": 370
267
  },
268
  {
269
  "epoch": 4.39,
270
+ "learning_rate": 4.604181348367395e-05,
271
+ "loss": 0.3428,
272
  "step": 380
273
  },
274
  {
275
  "epoch": 4.5,
276
+ "learning_rate": 4.592435987784825e-05,
277
+ "loss": 0.332,
278
  "step": 390
279
  },
280
  {
281
  "epoch": 4.62,
282
+ "learning_rate": 4.580690627202255e-05,
283
+ "loss": 0.3621,
284
  "step": 400
285
  },
286
  {
287
  "epoch": 4.74,
288
+ "learning_rate": 4.568945266619685e-05,
289
+ "loss": 0.3565,
290
  "step": 410
291
  },
292
  {
293
  "epoch": 4.85,
294
+ "learning_rate": 4.5571999060371156e-05,
295
+ "loss": 0.3075,
296
  "step": 420
297
  },
298
  {
299
  "epoch": 4.97,
300
+ "learning_rate": 4.545454545454546e-05,
301
+ "loss": 0.3518,
302
  "step": 430
303
  },
304
  {
305
  "epoch": 4.99,
306
+ "eval_accuracy": 0.9443413729128015,
307
+ "eval_loss": 0.22880208492279053,
308
+ "eval_runtime": 7.8304,
309
+ "eval_samples_per_second": 68.834,
310
+ "eval_steps_per_second": 8.684,
311
  "step": 432
312
  },
313
  {
314
  "epoch": 5.08,
315
+ "learning_rate": 4.533709184871976e-05,
316
+ "loss": 0.3537,
317
  "step": 440
318
  },
319
  {
320
  "epoch": 5.2,
321
+ "learning_rate": 4.521963824289406e-05,
322
+ "loss": 0.2907,
323
  "step": 450
324
  },
325
  {
326
  "epoch": 5.31,
327
+ "learning_rate": 4.510218463706836e-05,
328
+ "loss": 0.3318,
329
  "step": 460
330
  },
331
  {
332
  "epoch": 5.43,
333
+ "learning_rate": 4.498473103124266e-05,
334
+ "loss": 0.3502,
335
  "step": 470
336
  },
337
  {
338
  "epoch": 5.54,
339
+ "learning_rate": 4.4867277425416965e-05,
340
+ "loss": 0.3319,
341
  "step": 480
342
  },
343
  {
344
  "epoch": 5.66,
345
+ "learning_rate": 4.474982381959126e-05,
346
+ "loss": 0.3241,
347
  "step": 490
348
  },
349
  {
350
  "epoch": 5.78,
351
+ "learning_rate": 4.463237021376557e-05,
352
+ "loss": 0.2762,
353
  "step": 500
354
  },
355
  {
356
  "epoch": 5.89,
357
+ "learning_rate": 4.4514916607939866e-05,
358
+ "loss": 0.308,
359
  "step": 510
360
  },
361
  {
362
  "epoch": 6.0,
363
+ "eval_accuracy": 0.9424860853432282,
364
+ "eval_loss": 0.2758006155490875,
365
+ "eval_runtime": 7.9236,
366
+ "eval_samples_per_second": 68.024,
367
+ "eval_steps_per_second": 8.582,
368
  "step": 519
369
  },
370
  {
371
  "epoch": 6.01,
372
+ "learning_rate": 4.439746300211416e-05,
373
+ "loss": 0.3129,
374
  "step": 520
375
  },
376
  {
377
  "epoch": 6.12,
378
+ "learning_rate": 4.428000939628847e-05,
379
+ "loss": 0.2942,
380
  "step": 530
381
  },
382
  {
383
  "epoch": 6.24,
384
+ "learning_rate": 4.416255579046277e-05,
385
+ "loss": 0.3346,
386
  "step": 540
387
  },
388
  {
389
  "epoch": 6.35,
390
+ "learning_rate": 4.404510218463707e-05,
391
+ "loss": 0.3333,
392
  "step": 550
393
  },
394
  {
395
  "epoch": 6.47,
396
+ "learning_rate": 4.392764857881137e-05,
397
+ "loss": 0.2897,
398
  "step": 560
399
  },
400
  {
401
  "epoch": 6.58,
402
+ "learning_rate": 4.3810194972985676e-05,
403
+ "loss": 0.3428,
404
  "step": 570
405
  },
406
  {
407
  "epoch": 6.7,
408
+ "learning_rate": 4.369274136715997e-05,
409
+ "loss": 0.2877,
410
  "step": 580
411
  },
412
  {
413
  "epoch": 6.82,
414
+ "learning_rate": 4.3575287761334274e-05,
415
+ "loss": 0.2512,
416
  "step": 590
417
  },
418
  {
419
  "epoch": 6.93,
420
+ "learning_rate": 4.345783415550858e-05,
421
+ "loss": 0.3209,
422
  "step": 600
423
  },
424
  {
425
  "epoch": 7.0,
426
+ "eval_accuracy": 0.9369202226345084,
427
+ "eval_loss": 0.3777436316013336,
428
+ "eval_runtime": 7.9087,
429
+ "eval_samples_per_second": 68.153,
430
+ "eval_steps_per_second": 8.598,
431
  "step": 606
432
  },
433
  {
434
  "epoch": 7.05,
435
+ "learning_rate": 4.334038054968288e-05,
436
+ "loss": 0.2756,
437
  "step": 610
438
  },
439
  {
440
  "epoch": 7.16,
441
+ "learning_rate": 4.3222926943857175e-05,
442
+ "loss": 0.2645,
443
  "step": 620
444
  },
445
  {
446
  "epoch": 7.28,
447
+ "learning_rate": 4.3105473338031485e-05,
448
+ "loss": 0.2753,
449
  "step": 630
450
  },
451
  {
452
  "epoch": 7.39,
453
+ "learning_rate": 4.298801973220578e-05,
454
+ "loss": 0.3309,
455
  "step": 640
456
  },
457
  {
458
  "epoch": 7.51,
459
+ "learning_rate": 4.2870566126380077e-05,
460
+ "loss": 0.2937,
461
  "step": 650
462
  },
463
  {
464
  "epoch": 7.62,
465
+ "learning_rate": 4.2753112520554386e-05,
466
+ "loss": 0.246,
467
  "step": 660
468
  },
469
  {
470
  "epoch": 7.74,
471
+ "learning_rate": 4.263565891472868e-05,
472
+ "loss": 0.3071,
473
  "step": 670
474
  },
475
  {
476
  "epoch": 7.85,
477
+ "learning_rate": 4.2518205308902985e-05,
478
+ "loss": 0.2831,
479
  "step": 680
480
  },
481
  {
482
  "epoch": 7.97,
483
+ "learning_rate": 4.240075170307729e-05,
484
+ "loss": 0.284,
485
  "step": 690
486
  },
487
  {
488
  "epoch": 7.99,
489
+ "eval_accuracy": 0.9554730983302412,
490
+ "eval_loss": 0.1704244613647461,
491
+ "eval_runtime": 7.9276,
492
+ "eval_samples_per_second": 67.99,
493
+ "eval_steps_per_second": 8.578,
494
  "step": 692
495
  },
496
  {
497
  "epoch": 8.09,
498
+ "learning_rate": 4.228329809725159e-05,
499
+ "loss": 0.3047,
500
  "step": 700
501
  },
502
  {
503
  "epoch": 8.2,
504
+ "learning_rate": 4.2165844491425886e-05,
505
+ "loss": 0.2757,
506
  "step": 710
507
  },
508
  {
509
  "epoch": 8.32,
510
+ "learning_rate": 4.2048390885600195e-05,
511
+ "loss": 0.2666,
512
  "step": 720
513
  },
514
  {
515
  "epoch": 8.43,
516
+ "learning_rate": 4.193093727977449e-05,
517
+ "loss": 0.2711,
518
  "step": 730
519
  },
520
  {
521
  "epoch": 8.55,
522
+ "learning_rate": 4.1813483673948794e-05,
523
+ "loss": 0.2843,
524
  "step": 740
525
  },
526
  {
527
  "epoch": 8.66,
528
+ "learning_rate": 4.1696030068123096e-05,
529
+ "loss": 0.2664,
530
  "step": 750
531
  },
532
  {
533
  "epoch": 8.78,
534
+ "learning_rate": 4.15785764622974e-05,
535
+ "loss": 0.2963,
536
  "step": 760
537
  },
538
  {
539
  "epoch": 8.89,
540
+ "learning_rate": 4.1461122856471695e-05,
541
+ "loss": 0.2466,
542
  "step": 770
543
  },
544
  {
545
  "epoch": 9.0,
546
+ "eval_accuracy": 0.9461966604823747,
547
+ "eval_loss": 0.15713872015476227,
548
+ "eval_runtime": 7.9343,
549
+ "eval_samples_per_second": 67.933,
550
+ "eval_steps_per_second": 8.57,
551
  "step": 779
552
  },
553
  {
554
  "epoch": 9.01,
555
+ "learning_rate": 4.1343669250646e-05,
556
+ "loss": 0.2721,
557
  "step": 780
558
  },
559
  {
560
  "epoch": 9.13,
561
+ "learning_rate": 4.12262156448203e-05,
562
+ "loss": 0.2391,
563
  "step": 790
564
  },
565
  {
566
  "epoch": 9.24,
567
+ "learning_rate": 4.1108762038994596e-05,
568
+ "loss": 0.2502,
569
  "step": 800
570
  },
571
  {
572
  "epoch": 9.36,
573
+ "learning_rate": 4.09913084331689e-05,
574
+ "loss": 0.2653,
575
  "step": 810
576
  },
577
  {
578
  "epoch": 9.47,
579
+ "learning_rate": 4.08738548273432e-05,
580
+ "loss": 0.2557,
581
  "step": 820
582
  },
583
  {
584
  "epoch": 9.59,
585
+ "learning_rate": 4.0756401221517504e-05,
586
+ "loss": 0.2148,
587
  "step": 830
588
  },
589
  {
590
  "epoch": 9.7,
591
+ "learning_rate": 4.06389476156918e-05,
592
+ "loss": 0.2495,
593
  "step": 840
594
  },
595
  {
596
  "epoch": 9.82,
597
+ "learning_rate": 4.052149400986611e-05,
598
+ "loss": 0.2678,
599
  "step": 850
600
  },
601
  {
602
  "epoch": 9.93,
603
+ "learning_rate": 4.0404040404040405e-05,
604
+ "loss": 0.3123,
605
  "step": 860
606
  },
607
  {
608
+ "epoch": 9.99,
609
+ "eval_accuracy": 0.9406307977736549,
610
+ "eval_loss": 0.6491873860359192,
611
+ "eval_runtime": 8.122,
612
+ "eval_samples_per_second": 66.363,
613
+ "eval_steps_per_second": 8.372,
614
+ "step": 865
615
  },
616
  {
617
+ "epoch": 10.05,
618
+ "learning_rate": 4.028658679821471e-05,
619
+ "loss": 0.256,
620
+ "step": 870
621
+ },
622
+ {
623
+ "epoch": 10.17,
624
+ "learning_rate": 4.016913319238901e-05,
625
+ "loss": 0.2439,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 10.28,
630
+ "learning_rate": 4.005167958656331e-05,
631
+ "loss": 0.2592,
632
+ "step": 890
633
+ },
634
+ {
635
+ "epoch": 10.4,
636
+ "learning_rate": 3.993422598073761e-05,
637
+ "loss": 0.2539,
638
+ "step": 900
639
+ },
640
+ {
641
+ "epoch": 10.51,
642
+ "learning_rate": 3.981677237491191e-05,
643
+ "loss": 0.2771,
644
+ "step": 910
645
+ },
646
+ {
647
+ "epoch": 10.63,
648
+ "learning_rate": 3.9699318769086215e-05,
649
+ "loss": 0.3149,
650
+ "step": 920
651
+ },
652
+ {
653
+ "epoch": 10.74,
654
+ "learning_rate": 3.958186516326051e-05,
655
+ "loss": 0.2281,
656
+ "step": 930
657
+ },
658
+ {
659
+ "epoch": 10.86,
660
+ "learning_rate": 3.946441155743481e-05,
661
+ "loss": 0.2264,
662
+ "step": 940
663
+ },
664
+ {
665
+ "epoch": 10.97,
666
+ "learning_rate": 3.9346957951609116e-05,
667
+ "loss": 0.2827,
668
+ "step": 950
669
+ },
670
+ {
671
+ "epoch": 11.0,
672
+ "eval_accuracy": 0.9406307977736549,
673
+ "eval_loss": 0.49678388237953186,
674
+ "eval_runtime": 8.0733,
675
+ "eval_samples_per_second": 66.763,
676
+ "eval_steps_per_second": 8.423,
677
+ "step": 952
678
+ },
679
+ {
680
+ "epoch": 11.09,
681
+ "learning_rate": 3.922950434578342e-05,
682
+ "loss": 0.2954,
683
+ "step": 960
684
+ },
685
+ {
686
+ "epoch": 11.2,
687
+ "learning_rate": 3.9112050739957714e-05,
688
+ "loss": 0.2484,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 11.32,
693
+ "learning_rate": 3.8994597134132024e-05,
694
+ "loss": 0.2382,
695
+ "step": 980
696
+ },
697
+ {
698
+ "epoch": 11.44,
699
+ "learning_rate": 3.887714352830632e-05,
700
+ "loss": 0.2572,
701
+ "step": 990
702
+ },
703
+ {
704
+ "epoch": 11.55,
705
+ "learning_rate": 3.875968992248062e-05,
706
+ "loss": 0.243,
707
+ "step": 1000
708
+ },
709
+ {
710
+ "epoch": 11.67,
711
+ "learning_rate": 3.8642236316654925e-05,
712
+ "loss": 0.2569,
713
+ "step": 1010
714
+ },
715
+ {
716
+ "epoch": 11.78,
717
+ "learning_rate": 3.852478271082923e-05,
718
+ "loss": 0.2302,
719
+ "step": 1020
720
+ },
721
+ {
722
+ "epoch": 11.9,
723
+ "learning_rate": 3.8407329105003524e-05,
724
+ "loss": 0.2736,
725
+ "step": 1030
726
+ },
727
+ {
728
+ "epoch": 11.99,
729
+ "eval_accuracy": 0.9591836734693877,
730
+ "eval_loss": 0.13702794909477234,
731
+ "eval_runtime": 8.0983,
732
+ "eval_samples_per_second": 66.557,
733
+ "eval_steps_per_second": 8.397,
734
+ "step": 1038
735
+ },
736
+ {
737
+ "epoch": 12.01,
738
+ "learning_rate": 3.8289875499177826e-05,
739
+ "loss": 0.2202,
740
+ "step": 1040
741
+ },
742
+ {
743
+ "epoch": 12.13,
744
+ "learning_rate": 3.817242189335213e-05,
745
+ "loss": 0.2247,
746
+ "step": 1050
747
+ },
748
+ {
749
+ "epoch": 12.24,
750
+ "learning_rate": 3.8054968287526425e-05,
751
+ "loss": 0.2449,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 12.36,
756
+ "learning_rate": 3.7937514681700734e-05,
757
+ "loss": 0.249,
758
+ "step": 1070
759
+ },
760
+ {
761
+ "epoch": 12.48,
762
+ "learning_rate": 3.782006107587503e-05,
763
+ "loss": 0.2809,
764
+ "step": 1080
765
+ },
766
+ {
767
+ "epoch": 12.59,
768
+ "learning_rate": 3.770260747004933e-05,
769
+ "loss": 0.2439,
770
+ "step": 1090
771
+ },
772
+ {
773
+ "epoch": 12.71,
774
+ "learning_rate": 3.7585153864223635e-05,
775
+ "loss": 0.2326,
776
+ "step": 1100
777
+ },
778
+ {
779
+ "epoch": 12.82,
780
+ "learning_rate": 3.746770025839794e-05,
781
+ "loss": 0.2725,
782
+ "step": 1110
783
+ },
784
+ {
785
+ "epoch": 12.94,
786
+ "learning_rate": 3.7350246652572234e-05,
787
+ "loss": 0.2476,
788
+ "step": 1120
789
+ },
790
+ {
791
+ "epoch": 13.0,
792
+ "eval_accuracy": 0.9499072356215214,
793
+ "eval_loss": 0.16156192123889923,
794
+ "eval_runtime": 8.2723,
795
+ "eval_samples_per_second": 65.157,
796
+ "eval_steps_per_second": 8.22,
797
+ "step": 1125
798
+ },
799
+ {
800
+ "epoch": 13.05,
801
+ "learning_rate": 3.723279304674654e-05,
802
+ "loss": 0.2495,
803
+ "step": 1130
804
+ },
805
+ {
806
+ "epoch": 13.17,
807
+ "learning_rate": 3.711533944092084e-05,
808
+ "loss": 0.2815,
809
+ "step": 1140
810
+ },
811
+ {
812
+ "epoch": 13.28,
813
+ "learning_rate": 3.699788583509514e-05,
814
+ "loss": 0.27,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 13.4,
819
+ "learning_rate": 3.688043222926944e-05,
820
+ "loss": 0.2174,
821
+ "step": 1160
822
+ },
823
+ {
824
+ "epoch": 13.51,
825
+ "learning_rate": 3.676297862344374e-05,
826
+ "loss": 0.2503,
827
+ "step": 1170
828
+ },
829
+ {
830
+ "epoch": 13.63,
831
+ "learning_rate": 3.664552501761804e-05,
832
+ "loss": 0.2209,
833
+ "step": 1180
834
+ },
835
+ {
836
+ "epoch": 13.75,
837
+ "learning_rate": 3.652807141179234e-05,
838
+ "loss": 0.1989,
839
+ "step": 1190
840
+ },
841
+ {
842
+ "epoch": 13.86,
843
+ "learning_rate": 3.641061780596665e-05,
844
+ "loss": 0.2473,
845
+ "step": 1200
846
+ },
847
+ {
848
+ "epoch": 13.98,
849
+ "learning_rate": 3.6293164200140944e-05,
850
+ "loss": 0.195,
851
+ "step": 1210
852
+ },
853
+ {
854
+ "epoch": 14.0,
855
+ "eval_accuracy": 0.961038961038961,
856
+ "eval_loss": 0.1361682415008545,
857
+ "eval_runtime": 8.4028,
858
+ "eval_samples_per_second": 64.145,
859
+ "eval_steps_per_second": 8.093,
860
+ "step": 1212
861
+ },
862
+ {
863
+ "epoch": 14.09,
864
+ "learning_rate": 3.617571059431525e-05,
865
+ "loss": 0.2768,
866
+ "step": 1220
867
+ },
868
+ {
869
+ "epoch": 14.21,
870
+ "learning_rate": 3.605825698848955e-05,
871
+ "loss": 0.2251,
872
+ "step": 1230
873
+ },
874
+ {
875
+ "epoch": 14.32,
876
+ "learning_rate": 3.594080338266385e-05,
877
+ "loss": 0.1665,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 14.44,
882
+ "learning_rate": 3.582334977683815e-05,
883
+ "loss": 0.2384,
884
+ "step": 1250
885
+ },
886
+ {
887
+ "epoch": 14.55,
888
+ "learning_rate": 3.570589617101245e-05,
889
+ "loss": 0.2133,
890
+ "step": 1260
891
+ },
892
+ {
893
+ "epoch": 14.67,
894
+ "learning_rate": 3.5588442565186754e-05,
895
+ "loss": 0.2234,
896
+ "step": 1270
897
+ },
898
+ {
899
+ "epoch": 14.79,
900
+ "learning_rate": 3.5470988959361056e-05,
901
+ "loss": 0.2373,
902
+ "step": 1280
903
+ },
904
+ {
905
+ "epoch": 14.9,
906
+ "learning_rate": 3.535353535353535e-05,
907
+ "loss": 0.2536,
908
+ "step": 1290
909
+ },
910
+ {
911
+ "epoch": 14.99,
912
+ "eval_accuracy": 0.9536178107606679,
913
+ "eval_loss": 0.12982788681983948,
914
+ "eval_runtime": 8.2944,
915
+ "eval_samples_per_second": 64.984,
916
+ "eval_steps_per_second": 8.198,
917
+ "step": 1298
918
+ },
919
+ {
920
+ "epoch": 15.02,
921
+ "learning_rate": 3.5236081747709655e-05,
922
+ "loss": 0.2483,
923
+ "step": 1300
924
+ },
925
+ {
926
+ "epoch": 15.13,
927
+ "learning_rate": 3.511862814188396e-05,
928
+ "loss": 0.2263,
929
+ "step": 1310
930
+ },
931
+ {
932
+ "epoch": 15.25,
933
+ "learning_rate": 3.500117453605825e-05,
934
+ "loss": 0.2542,
935
+ "step": 1320
936
+ },
937
+ {
938
+ "epoch": 15.36,
939
+ "learning_rate": 3.488372093023256e-05,
940
+ "loss": 0.2009,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 15.48,
945
+ "learning_rate": 3.476626732440686e-05,
946
+ "loss": 0.2383,
947
+ "step": 1340
948
+ },
949
+ {
950
+ "epoch": 15.59,
951
+ "learning_rate": 3.464881371858116e-05,
952
+ "loss": 0.2116,
953
+ "step": 1350
954
+ },
955
+ {
956
+ "epoch": 15.71,
957
+ "learning_rate": 3.4531360112755464e-05,
958
+ "loss": 0.2653,
959
+ "step": 1360
960
+ },
961
+ {
962
+ "epoch": 15.83,
963
+ "learning_rate": 3.441390650692977e-05,
964
+ "loss": 0.2447,
965
+ "step": 1370
966
+ },
967
+ {
968
+ "epoch": 15.94,
969
+ "learning_rate": 3.429645290110406e-05,
970
+ "loss": 0.2022,
971
+ "step": 1380
972
+ },
973
+ {
974
+ "epoch": 16.0,
975
+ "eval_accuracy": 0.9517625231910947,
976
+ "eval_loss": 0.7470229268074036,
977
+ "eval_runtime": 8.0594,
978
+ "eval_samples_per_second": 66.878,
979
+ "eval_steps_per_second": 8.437,
980
+ "step": 1385
981
+ },
982
+ {
983
+ "epoch": 16.06,
984
+ "learning_rate": 3.417899929527837e-05,
985
+ "loss": 0.2249,
986
+ "step": 1390
987
+ },
988
+ {
989
+ "epoch": 16.17,
990
+ "learning_rate": 3.406154568945267e-05,
991
+ "loss": 0.2498,
992
+ "step": 1400
993
+ },
994
+ {
995
+ "epoch": 16.29,
996
+ "learning_rate": 3.394409208362697e-05,
997
+ "loss": 0.215,
998
+ "step": 1410
999
+ },
1000
+ {
1001
+ "epoch": 16.4,
1002
+ "learning_rate": 3.382663847780127e-05,
1003
+ "loss": 0.2512,
1004
+ "step": 1420
1005
+ },
1006
+ {
1007
+ "epoch": 16.52,
1008
+ "learning_rate": 3.370918487197557e-05,
1009
+ "loss": 0.2096,
1010
+ "step": 1430
1011
+ },
1012
+ {
1013
+ "epoch": 16.63,
1014
+ "learning_rate": 3.359173126614987e-05,
1015
+ "loss": 0.2097,
1016
+ "step": 1440
1017
+ },
1018
+ {
1019
+ "epoch": 16.75,
1020
+ "learning_rate": 3.3474277660324174e-05,
1021
+ "loss": 0.2158,
1022
+ "step": 1450
1023
+ },
1024
+ {
1025
+ "epoch": 16.86,
1026
+ "learning_rate": 3.335682405449848e-05,
1027
+ "loss": 0.2303,
1028
+ "step": 1460
1029
+ },
1030
+ {
1031
+ "epoch": 16.98,
1032
+ "learning_rate": 3.323937044867277e-05,
1033
+ "loss": 0.2406,
1034
+ "step": 1470
1035
+ },
1036
+ {
1037
+ "epoch": 16.99,
1038
+ "eval_accuracy": 0.9647495361781077,
1039
+ "eval_loss": 0.12411854416131973,
1040
+ "eval_runtime": 7.9838,
1041
+ "eval_samples_per_second": 67.512,
1042
+ "eval_steps_per_second": 8.517,
1043
+ "step": 1471
1044
+ },
1045
+ {
1046
+ "epoch": 17.1,
1047
+ "learning_rate": 3.3121916842847076e-05,
1048
+ "loss": 0.2697,
1049
+ "step": 1480
1050
+ },
1051
+ {
1052
+ "epoch": 17.21,
1053
+ "learning_rate": 3.300446323702138e-05,
1054
+ "loss": 0.2567,
1055
+ "step": 1490
1056
+ },
1057
+ {
1058
+ "epoch": 17.33,
1059
+ "learning_rate": 3.288700963119568e-05,
1060
+ "loss": 0.2745,
1061
+ "step": 1500
1062
+ },
1063
+ {
1064
+ "epoch": 17.44,
1065
+ "learning_rate": 3.276955602536998e-05,
1066
+ "loss": 0.2269,
1067
+ "step": 1510
1068
+ },
1069
+ {
1070
+ "epoch": 17.56,
1071
+ "learning_rate": 3.2652102419544286e-05,
1072
+ "loss": 0.1893,
1073
+ "step": 1520
1074
+ },
1075
+ {
1076
+ "epoch": 17.67,
1077
+ "learning_rate": 3.253464881371858e-05,
1078
+ "loss": 0.1827,
1079
+ "step": 1530
1080
+ },
1081
+ {
1082
+ "epoch": 17.79,
1083
+ "learning_rate": 3.2417195207892885e-05,
1084
+ "loss": 0.2579,
1085
+ "step": 1540
1086
+ },
1087
+ {
1088
+ "epoch": 17.9,
1089
+ "learning_rate": 3.229974160206719e-05,
1090
+ "loss": 0.2019,
1091
+ "step": 1550
1092
+ },
1093
+ {
1094
+ "epoch": 18.0,
1095
+ "eval_accuracy": 0.9536178107606679,
1096
+ "eval_loss": 0.12778125703334808,
1097
+ "eval_runtime": 8.0655,
1098
+ "eval_samples_per_second": 66.828,
1099
+ "eval_steps_per_second": 8.431,
1100
+ "step": 1558
1101
+ },
1102
+ {
1103
+ "epoch": 18.02,
1104
+ "learning_rate": 3.2182287996241483e-05,
1105
+ "loss": 0.184,
1106
+ "step": 1560
1107
+ },
1108
+ {
1109
+ "epoch": 18.14,
1110
+ "learning_rate": 3.2064834390415786e-05,
1111
+ "loss": 0.2261,
1112
+ "step": 1570
1113
+ },
1114
+ {
1115
+ "epoch": 18.25,
1116
+ "learning_rate": 3.194738078459009e-05,
1117
+ "loss": 0.2155,
1118
+ "step": 1580
1119
+ },
1120
+ {
1121
+ "epoch": 18.37,
1122
+ "learning_rate": 3.182992717876439e-05,
1123
+ "loss": 0.1857,
1124
+ "step": 1590
1125
+ },
1126
+ {
1127
+ "epoch": 18.48,
1128
+ "learning_rate": 3.171247357293869e-05,
1129
+ "loss": 0.2009,
1130
+ "step": 1600
1131
+ },
1132
+ {
1133
+ "epoch": 18.6,
1134
+ "learning_rate": 3.159501996711299e-05,
1135
+ "loss": 0.213,
1136
+ "step": 1610
1137
+ },
1138
+ {
1139
+ "epoch": 18.71,
1140
+ "learning_rate": 3.147756636128729e-05,
1141
+ "loss": 0.2299,
1142
+ "step": 1620
1143
+ },
1144
+ {
1145
+ "epoch": 18.83,
1146
+ "learning_rate": 3.1360112755461595e-05,
1147
+ "loss": 0.1956,
1148
+ "step": 1630
1149
+ },
1150
+ {
1151
+ "epoch": 18.94,
1152
+ "learning_rate": 3.124265914963589e-05,
1153
+ "loss": 0.2073,
1154
+ "step": 1640
1155
+ },
1156
+ {
1157
+ "epoch": 18.99,
1158
+ "eval_accuracy": 0.9684601113172542,
1159
+ "eval_loss": 0.11341895163059235,
1160
+ "eval_runtime": 8.1079,
1161
+ "eval_samples_per_second": 66.478,
1162
+ "eval_steps_per_second": 8.387,
1163
+ "step": 1644
1164
+ },
1165
+ {
1166
+ "epoch": 19.06,
1167
+ "learning_rate": 3.11252055438102e-05,
1168
+ "loss": 0.1869,
1169
+ "step": 1650
1170
+ },
1171
+ {
1172
+ "epoch": 19.17,
1173
+ "learning_rate": 3.1007751937984497e-05,
1174
+ "loss": 0.1941,
1175
+ "step": 1660
1176
+ },
1177
+ {
1178
+ "epoch": 19.29,
1179
+ "learning_rate": 3.08902983321588e-05,
1180
+ "loss": 0.2218,
1181
+ "step": 1670
1182
+ },
1183
+ {
1184
+ "epoch": 19.41,
1185
+ "learning_rate": 3.07728447263331e-05,
1186
+ "loss": 0.196,
1187
+ "step": 1680
1188
+ },
1189
+ {
1190
+ "epoch": 19.52,
1191
+ "learning_rate": 3.06553911205074e-05,
1192
+ "loss": 0.2339,
1193
+ "step": 1690
1194
+ },
1195
+ {
1196
+ "epoch": 19.64,
1197
+ "learning_rate": 3.05379375146817e-05,
1198
+ "loss": 0.2045,
1199
+ "step": 1700
1200
+ },
1201
+ {
1202
+ "epoch": 19.75,
1203
+ "learning_rate": 3.0420483908856e-05,
1204
+ "loss": 0.2057,
1205
+ "step": 1710
1206
+ },
1207
+ {
1208
+ "epoch": 19.87,
1209
+ "learning_rate": 3.0303030303030306e-05,
1210
+ "loss": 0.2136,
1211
+ "step": 1720
1212
+ },
1213
+ {
1214
+ "epoch": 19.98,
1215
+ "learning_rate": 3.0185576697204605e-05,
1216
+ "loss": 0.1873,
1217
+ "step": 1730
1218
+ },
1219
+ {
1220
+ "epoch": 20.0,
1221
+ "eval_accuracy": 0.9628942486085343,
1222
+ "eval_loss": 0.6738272309303284,
1223
+ "eval_runtime": 8.4568,
1224
+ "eval_samples_per_second": 63.736,
1225
+ "eval_steps_per_second": 8.041,
1226
+ "step": 1731
1227
+ },
1228
+ {
1229
+ "epoch": 20.1,
1230
+ "learning_rate": 3.0068123091378908e-05,
1231
+ "loss": 0.2643,
1232
+ "step": 1740
1233
+ },
1234
+ {
1235
+ "epoch": 20.21,
1236
+ "learning_rate": 2.9950669485553207e-05,
1237
+ "loss": 0.1854,
1238
+ "step": 1750
1239
+ },
1240
+ {
1241
+ "epoch": 20.33,
1242
+ "learning_rate": 2.983321587972751e-05,
1243
+ "loss": 0.2382,
1244
+ "step": 1760
1245
+ },
1246
+ {
1247
+ "epoch": 20.45,
1248
+ "learning_rate": 2.971576227390181e-05,
1249
+ "loss": 0.18,
1250
+ "step": 1770
1251
+ },
1252
+ {
1253
+ "epoch": 20.56,
1254
+ "learning_rate": 2.9598308668076115e-05,
1255
+ "loss": 0.1763,
1256
+ "step": 1780
1257
+ },
1258
+ {
1259
+ "epoch": 20.68,
1260
+ "learning_rate": 2.948085506225041e-05,
1261
+ "loss": 0.2399,
1262
+ "step": 1790
1263
+ },
1264
+ {
1265
+ "epoch": 20.79,
1266
+ "learning_rate": 2.9363401456424717e-05,
1267
+ "loss": 0.2275,
1268
+ "step": 1800
1269
+ },
1270
+ {
1271
+ "epoch": 20.91,
1272
+ "learning_rate": 2.9245947850599016e-05,
1273
+ "loss": 0.2446,
1274
+ "step": 1810
1275
+ },
1276
+ {
1277
+ "epoch": 21.0,
1278
+ "eval_accuracy": 0.9684601113172542,
1279
+ "eval_loss": 0.1033068299293518,
1280
+ "eval_runtime": 8.3041,
1281
+ "eval_samples_per_second": 64.908,
1282
+ "eval_steps_per_second": 8.189,
1283
+ "step": 1818
1284
+ },
1285
+ {
1286
+ "epoch": 21.02,
1287
+ "learning_rate": 2.9128494244773312e-05,
1288
+ "loss": 0.2423,
1289
+ "step": 1820
1290
+ },
1291
+ {
1292
+ "epoch": 21.14,
1293
+ "learning_rate": 2.9011040638947618e-05,
1294
+ "loss": 0.2239,
1295
+ "step": 1830
1296
+ },
1297
+ {
1298
+ "epoch": 21.25,
1299
+ "learning_rate": 2.8893587033121917e-05,
1300
+ "loss": 0.2212,
1301
+ "step": 1840
1302
+ },
1303
+ {
1304
+ "epoch": 21.37,
1305
+ "learning_rate": 2.877613342729622e-05,
1306
+ "loss": 0.1594,
1307
+ "step": 1850
1308
+ },
1309
+ {
1310
+ "epoch": 21.49,
1311
+ "learning_rate": 2.865867982147052e-05,
1312
+ "loss": 0.1878,
1313
+ "step": 1860
1314
+ },
1315
+ {
1316
+ "epoch": 21.6,
1317
+ "learning_rate": 2.8541226215644822e-05,
1318
+ "loss": 0.1965,
1319
+ "step": 1870
1320
+ },
1321
+ {
1322
+ "epoch": 21.72,
1323
+ "learning_rate": 2.842377260981912e-05,
1324
+ "loss": 0.2186,
1325
+ "step": 1880
1326
+ },
1327
+ {
1328
+ "epoch": 21.83,
1329
+ "learning_rate": 2.8306319003993427e-05,
1330
+ "loss": 0.1904,
1331
+ "step": 1890
1332
+ },
1333
+ {
1334
+ "epoch": 21.95,
1335
+ "learning_rate": 2.8188865398167723e-05,
1336
+ "loss": 0.1999,
1337
+ "step": 1900
1338
+ },
1339
+ {
1340
+ "epoch": 21.99,
1341
+ "eval_accuracy": 0.9647495361781077,
1342
+ "eval_loss": 0.11812406778335571,
1343
+ "eval_runtime": 8.2822,
1344
+ "eval_samples_per_second": 65.079,
1345
+ "eval_steps_per_second": 8.21,
1346
+ "step": 1904
1347
+ },
1348
+ {
1349
+ "epoch": 22.06,
1350
+ "learning_rate": 2.807141179234203e-05,
1351
+ "loss": 0.2147,
1352
+ "step": 1910
1353
+ },
1354
+ {
1355
+ "epoch": 22.18,
1356
+ "learning_rate": 2.795395818651633e-05,
1357
+ "loss": 0.2158,
1358
+ "step": 1920
1359
+ },
1360
+ {
1361
+ "epoch": 22.29,
1362
+ "learning_rate": 2.783650458069063e-05,
1363
+ "loss": 0.1937,
1364
+ "step": 1930
1365
+ },
1366
+ {
1367
+ "epoch": 22.41,
1368
+ "learning_rate": 2.771905097486493e-05,
1369
+ "loss": 0.2208,
1370
+ "step": 1940
1371
+ },
1372
+ {
1373
+ "epoch": 22.52,
1374
+ "learning_rate": 2.760159736903923e-05,
1375
+ "loss": 0.155,
1376
+ "step": 1950
1377
+ },
1378
+ {
1379
+ "epoch": 22.64,
1380
+ "learning_rate": 2.7484143763213532e-05,
1381
+ "loss": 0.1793,
1382
+ "step": 1960
1383
+ },
1384
+ {
1385
+ "epoch": 22.76,
1386
+ "learning_rate": 2.736669015738783e-05,
1387
+ "loss": 0.1794,
1388
+ "step": 1970
1389
+ },
1390
+ {
1391
+ "epoch": 22.87,
1392
+ "learning_rate": 2.7249236551562134e-05,
1393
+ "loss": 0.22,
1394
+ "step": 1980
1395
+ },
1396
+ {
1397
+ "epoch": 22.99,
1398
+ "learning_rate": 2.7131782945736434e-05,
1399
+ "loss": 0.1716,
1400
+ "step": 1990
1401
+ },
1402
+ {
1403
+ "epoch": 23.0,
1404
+ "eval_accuracy": 0.961038961038961,
1405
+ "eval_loss": 0.10991629213094711,
1406
+ "eval_runtime": 8.221,
1407
+ "eval_samples_per_second": 65.564,
1408
+ "eval_steps_per_second": 8.272,
1409
+ "step": 1991
1410
+ },
1411
+ {
1412
+ "epoch": 23.1,
1413
+ "learning_rate": 2.7014329339910736e-05,
1414
+ "loss": 0.1692,
1415
+ "step": 2000
1416
+ },
1417
+ {
1418
+ "epoch": 23.22,
1419
+ "learning_rate": 2.6896875734085036e-05,
1420
+ "loss": 0.1931,
1421
+ "step": 2010
1422
+ },
1423
+ {
1424
+ "epoch": 23.33,
1425
+ "learning_rate": 2.677942212825934e-05,
1426
+ "loss": 0.1663,
1427
+ "step": 2020
1428
+ },
1429
+ {
1430
+ "epoch": 23.45,
1431
+ "learning_rate": 2.6661968522433637e-05,
1432
+ "loss": 0.2102,
1433
+ "step": 2030
1434
+ },
1435
+ {
1436
+ "epoch": 23.56,
1437
+ "learning_rate": 2.6544514916607944e-05,
1438
+ "loss": 0.1822,
1439
+ "step": 2040
1440
+ },
1441
+ {
1442
+ "epoch": 23.68,
1443
+ "learning_rate": 2.6427061310782243e-05,
1444
+ "loss": 0.2216,
1445
+ "step": 2050
1446
+ },
1447
+ {
1448
+ "epoch": 23.8,
1449
+ "learning_rate": 2.6309607704956545e-05,
1450
+ "loss": 0.1731,
1451
+ "step": 2060
1452
+ },
1453
+ {
1454
+ "epoch": 23.91,
1455
+ "learning_rate": 2.6192154099130845e-05,
1456
+ "loss": 0.175,
1457
+ "step": 2070
1458
+ },
1459
+ {
1460
+ "epoch": 23.99,
1461
+ "eval_accuracy": 0.974025974025974,
1462
+ "eval_loss": 0.10644800215959549,
1463
+ "eval_runtime": 8.1046,
1464
+ "eval_samples_per_second": 66.505,
1465
+ "eval_steps_per_second": 8.39,
1466
+ "step": 2077
1467
+ },
1468
+ {
1469
+ "epoch": 24.03,
1470
+ "learning_rate": 2.6074700493305144e-05,
1471
+ "loss": 0.1845,
1472
+ "step": 2080
1473
+ },
1474
+ {
1475
+ "epoch": 24.14,
1476
+ "learning_rate": 2.5957246887479447e-05,
1477
+ "loss": 0.2091,
1478
+ "step": 2090
1479
+ },
1480
+ {
1481
+ "epoch": 24.26,
1482
+ "learning_rate": 2.5839793281653746e-05,
1483
+ "loss": 0.1783,
1484
+ "step": 2100
1485
+ },
1486
+ {
1487
+ "epoch": 24.37,
1488
+ "learning_rate": 2.572233967582805e-05,
1489
+ "loss": 0.1822,
1490
+ "step": 2110
1491
+ },
1492
+ {
1493
+ "epoch": 24.49,
1494
+ "learning_rate": 2.5604886070002348e-05,
1495
+ "loss": 0.2239,
1496
+ "step": 2120
1497
+ },
1498
+ {
1499
+ "epoch": 24.6,
1500
+ "learning_rate": 2.5487432464176654e-05,
1501
+ "loss": 0.1639,
1502
+ "step": 2130
1503
+ },
1504
+ {
1505
+ "epoch": 24.72,
1506
+ "learning_rate": 2.536997885835095e-05,
1507
+ "loss": 0.1839,
1508
+ "step": 2140
1509
+ },
1510
+ {
1511
+ "epoch": 24.83,
1512
+ "learning_rate": 2.5252525252525256e-05,
1513
+ "loss": 0.2114,
1514
+ "step": 2150
1515
+ },
1516
+ {
1517
+ "epoch": 24.95,
1518
+ "learning_rate": 2.5135071646699555e-05,
1519
+ "loss": 0.1962,
1520
+ "step": 2160
1521
+ },
1522
+ {
1523
+ "epoch": 25.0,
1524
+ "eval_accuracy": 0.9721706864564007,
1525
+ "eval_loss": 0.11735469102859497,
1526
+ "eval_runtime": 8.2081,
1527
+ "eval_samples_per_second": 65.667,
1528
+ "eval_steps_per_second": 8.284,
1529
+ "step": 2164
1530
+ },
1531
+ {
1532
+ "epoch": 25.07,
1533
+ "learning_rate": 2.5017618040873858e-05,
1534
+ "loss": 0.1658,
1535
+ "step": 2170
1536
+ },
1537
+ {
1538
+ "epoch": 25.18,
1539
+ "learning_rate": 2.4900164435048157e-05,
1540
+ "loss": 0.1979,
1541
+ "step": 2180
1542
+ },
1543
+ {
1544
+ "epoch": 25.3,
1545
+ "learning_rate": 2.4782710829222456e-05,
1546
+ "loss": 0.1707,
1547
+ "step": 2190
1548
+ },
1549
+ {
1550
+ "epoch": 25.41,
1551
+ "learning_rate": 2.466525722339676e-05,
1552
+ "loss": 0.1932,
1553
+ "step": 2200
1554
+ },
1555
+ {
1556
+ "epoch": 25.53,
1557
+ "learning_rate": 2.4547803617571062e-05,
1558
+ "loss": 0.2355,
1559
+ "step": 2210
1560
+ },
1561
+ {
1562
+ "epoch": 25.64,
1563
+ "learning_rate": 2.443035001174536e-05,
1564
+ "loss": 0.2246,
1565
+ "step": 2220
1566
+ },
1567
+ {
1568
+ "epoch": 25.76,
1569
+ "learning_rate": 2.4312896405919664e-05,
1570
+ "loss": 0.2031,
1571
+ "step": 2230
1572
+ },
1573
+ {
1574
+ "epoch": 25.87,
1575
+ "learning_rate": 2.4195442800093966e-05,
1576
+ "loss": 0.2062,
1577
+ "step": 2240
1578
+ },
1579
+ {
1580
+ "epoch": 25.99,
1581
+ "learning_rate": 2.4077989194268266e-05,
1582
+ "loss": 0.1943,
1583
+ "step": 2250
1584
+ },
1585
+ {
1586
+ "epoch": 25.99,
1587
+ "eval_accuracy": 0.9517625231910947,
1588
+ "eval_loss": 1.0624566078186035,
1589
+ "eval_runtime": 8.1518,
1590
+ "eval_samples_per_second": 66.12,
1591
+ "eval_steps_per_second": 8.342,
1592
+ "step": 2250
1593
+ },
1594
+ {
1595
+ "epoch": 26.11,
1596
+ "learning_rate": 2.3960535588442568e-05,
1597
+ "loss": 0.2071,
1598
+ "step": 2260
1599
+ },
1600
+ {
1601
+ "epoch": 26.22,
1602
+ "learning_rate": 2.3843081982616868e-05,
1603
+ "loss": 0.1872,
1604
+ "step": 2270
1605
+ },
1606
+ {
1607
+ "epoch": 26.34,
1608
+ "learning_rate": 2.3725628376791167e-05,
1609
+ "loss": 0.1835,
1610
+ "step": 2280
1611
+ },
1612
+ {
1613
+ "epoch": 26.45,
1614
+ "learning_rate": 2.360817477096547e-05,
1615
+ "loss": 0.171,
1616
+ "step": 2290
1617
+ },
1618
+ {
1619
+ "epoch": 26.57,
1620
+ "learning_rate": 2.349072116513977e-05,
1621
+ "loss": 0.2028,
1622
+ "step": 2300
1623
+ },
1624
+ {
1625
+ "epoch": 26.68,
1626
+ "learning_rate": 2.337326755931407e-05,
1627
+ "loss": 0.2108,
1628
+ "step": 2310
1629
+ },
1630
+ {
1631
+ "epoch": 26.8,
1632
+ "learning_rate": 2.3255813953488374e-05,
1633
+ "loss": 0.2046,
1634
+ "step": 2320
1635
+ },
1636
+ {
1637
+ "epoch": 26.91,
1638
+ "learning_rate": 2.3138360347662673e-05,
1639
+ "loss": 0.2044,
1640
+ "step": 2330
1641
+ },
1642
+ {
1643
+ "epoch": 27.0,
1644
+ "eval_accuracy": 0.9573283858998145,
1645
+ "eval_loss": 0.8419390916824341,
1646
+ "eval_runtime": 7.9375,
1647
+ "eval_samples_per_second": 67.906,
1648
+ "eval_steps_per_second": 8.567,
1649
+ "step": 2337
1650
+ },
1651
+ {
1652
+ "epoch": 27.03,
1653
+ "learning_rate": 2.3020906741836976e-05,
1654
+ "loss": 0.2017,
1655
+ "step": 2340
1656
+ },
1657
+ {
1658
+ "epoch": 27.15,
1659
+ "learning_rate": 2.2903453136011275e-05,
1660
+ "loss": 0.1923,
1661
+ "step": 2350
1662
+ },
1663
+ {
1664
+ "epoch": 27.26,
1665
+ "learning_rate": 2.2785999530185578e-05,
1666
+ "loss": 0.171,
1667
+ "step": 2360
1668
+ },
1669
+ {
1670
+ "epoch": 27.38,
1671
+ "learning_rate": 2.266854592435988e-05,
1672
+ "loss": 0.1912,
1673
+ "step": 2370
1674
+ },
1675
+ {
1676
+ "epoch": 27.49,
1677
+ "learning_rate": 2.255109231853418e-05,
1678
+ "loss": 0.2001,
1679
+ "step": 2380
1680
+ },
1681
+ {
1682
+ "epoch": 27.61,
1683
+ "learning_rate": 2.2433638712708483e-05,
1684
+ "loss": 0.1668,
1685
+ "step": 2390
1686
+ },
1687
+ {
1688
+ "epoch": 27.72,
1689
+ "learning_rate": 2.2316185106882785e-05,
1690
+ "loss": 0.1699,
1691
+ "step": 2400
1692
+ },
1693
+ {
1694
+ "epoch": 27.84,
1695
+ "learning_rate": 2.219873150105708e-05,
1696
+ "loss": 0.1944,
1697
+ "step": 2410
1698
+ },
1699
+ {
1700
+ "epoch": 27.95,
1701
+ "learning_rate": 2.2081277895231384e-05,
1702
+ "loss": 0.1835,
1703
+ "step": 2420
1704
+ },
1705
+ {
1706
+ "epoch": 28.0,
1707
+ "eval_accuracy": 0.9703153988868275,
1708
+ "eval_loss": 0.11119159311056137,
1709
+ "eval_runtime": 7.8965,
1710
+ "eval_samples_per_second": 68.258,
1711
+ "eval_steps_per_second": 8.611,
1712
+ "step": 2424
1713
+ },
1714
+ {
1715
+ "epoch": 28.07,
1716
+ "learning_rate": 2.1963824289405686e-05,
1717
+ "loss": 0.1724,
1718
+ "step": 2430
1719
+ },
1720
+ {
1721
+ "epoch": 28.18,
1722
+ "learning_rate": 2.1846370683579986e-05,
1723
+ "loss": 0.2228,
1724
+ "step": 2440
1725
+ },
1726
+ {
1727
+ "epoch": 28.3,
1728
+ "learning_rate": 2.172891707775429e-05,
1729
+ "loss": 0.1672,
1730
+ "step": 2450
1731
+ },
1732
+ {
1733
+ "epoch": 28.42,
1734
+ "learning_rate": 2.1611463471928588e-05,
1735
+ "loss": 0.2021,
1736
+ "step": 2460
1737
+ },
1738
+ {
1739
+ "epoch": 28.53,
1740
+ "learning_rate": 2.149400986610289e-05,
1741
+ "loss": 0.1893,
1742
+ "step": 2470
1743
+ },
1744
+ {
1745
+ "epoch": 28.65,
1746
+ "learning_rate": 2.1376556260277193e-05,
1747
+ "loss": 0.2024,
1748
+ "step": 2480
1749
+ },
1750
+ {
1751
+ "epoch": 28.76,
1752
+ "learning_rate": 2.1259102654451492e-05,
1753
+ "loss": 0.1508,
1754
+ "step": 2490
1755
+ },
1756
+ {
1757
+ "epoch": 28.88,
1758
+ "learning_rate": 2.1141649048625795e-05,
1759
+ "loss": 0.1562,
1760
+ "step": 2500
1761
+ },
1762
+ {
1763
+ "epoch": 28.99,
1764
+ "learning_rate": 2.1024195442800098e-05,
1765
+ "loss": 0.191,
1766
+ "step": 2510
1767
+ },
1768
+ {
1769
+ "epoch": 28.99,
1770
+ "eval_accuracy": 0.9684601113172542,
1771
+ "eval_loss": 0.11420014500617981,
1772
+ "eval_runtime": 7.9348,
1773
+ "eval_samples_per_second": 67.929,
1774
+ "eval_steps_per_second": 8.57,
1775
+ "step": 2510
1776
+ },
1777
+ {
1778
+ "epoch": 29.11,
1779
+ "learning_rate": 2.0906741836974397e-05,
1780
+ "loss": 0.1925,
1781
+ "step": 2520
1782
+ },
1783
+ {
1784
+ "epoch": 29.22,
1785
+ "learning_rate": 2.07892882311487e-05,
1786
+ "loss": 0.1513,
1787
+ "step": 2530
1788
+ },
1789
+ {
1790
+ "epoch": 29.34,
1791
+ "learning_rate": 2.0671834625323e-05,
1792
+ "loss": 0.2406,
1793
+ "step": 2540
1794
+ },
1795
+ {
1796
+ "epoch": 29.46,
1797
+ "learning_rate": 2.0554381019497298e-05,
1798
+ "loss": 0.1809,
1799
+ "step": 2550
1800
+ },
1801
+ {
1802
+ "epoch": 29.57,
1803
+ "learning_rate": 2.04369274136716e-05,
1804
+ "loss": 0.1641,
1805
+ "step": 2560
1806
+ },
1807
+ {
1808
+ "epoch": 29.69,
1809
+ "learning_rate": 2.03194738078459e-05,
1810
+ "loss": 0.1805,
1811
+ "step": 2570
1812
+ },
1813
+ {
1814
+ "epoch": 29.8,
1815
+ "learning_rate": 2.0202020202020203e-05,
1816
+ "loss": 0.1702,
1817
+ "step": 2580
1818
+ },
1819
+ {
1820
+ "epoch": 29.92,
1821
+ "learning_rate": 2.0084566596194505e-05,
1822
+ "loss": 0.1676,
1823
+ "step": 2590
1824
+ },
1825
+ {
1826
+ "epoch": 30.0,
1827
+ "eval_accuracy": 0.9647495361781077,
1828
+ "eval_loss": 0.10803297162055969,
1829
+ "eval_runtime": 7.8199,
1830
+ "eval_samples_per_second": 68.927,
1831
+ "eval_steps_per_second": 8.696,
1832
+ "step": 2597
1833
+ },
1834
+ {
1835
+ "epoch": 30.03,
1836
+ "learning_rate": 1.9967112990368805e-05,
1837
+ "loss": 0.1554,
1838
+ "step": 2600
1839
+ },
1840
+ {
1841
+ "epoch": 30.15,
1842
+ "learning_rate": 1.9849659384543107e-05,
1843
+ "loss": 0.2092,
1844
+ "step": 2610
1845
+ },
1846
+ {
1847
+ "epoch": 30.26,
1848
+ "learning_rate": 1.9732205778717407e-05,
1849
+ "loss": 0.16,
1850
+ "step": 2620
1851
+ },
1852
+ {
1853
+ "epoch": 30.38,
1854
+ "learning_rate": 1.961475217289171e-05,
1855
+ "loss": 0.1932,
1856
+ "step": 2630
1857
+ },
1858
+ {
1859
+ "epoch": 30.5,
1860
+ "learning_rate": 1.9497298567066012e-05,
1861
+ "loss": 0.1742,
1862
+ "step": 2640
1863
+ },
1864
+ {
1865
+ "epoch": 30.61,
1866
+ "learning_rate": 1.937984496124031e-05,
1867
+ "loss": 0.1714,
1868
+ "step": 2650
1869
+ },
1870
+ {
1871
+ "epoch": 30.73,
1872
+ "learning_rate": 1.9262391355414614e-05,
1873
+ "loss": 0.1668,
1874
+ "step": 2660
1875
+ },
1876
+ {
1877
+ "epoch": 30.84,
1878
+ "learning_rate": 1.9144937749588913e-05,
1879
+ "loss": 0.2284,
1880
+ "step": 2670
1881
+ },
1882
+ {
1883
+ "epoch": 30.96,
1884
+ "learning_rate": 1.9027484143763212e-05,
1885
+ "loss": 0.1533,
1886
+ "step": 2680
1887
+ },
1888
+ {
1889
+ "epoch": 30.99,
1890
+ "eval_accuracy": 0.9647495361781077,
1891
+ "eval_loss": 0.14941494166851044,
1892
+ "eval_runtime": 7.9882,
1893
+ "eval_samples_per_second": 67.474,
1894
+ "eval_steps_per_second": 8.513,
1895
+ "step": 2683
1896
+ },
1897
+ {
1898
+ "epoch": 31.07,
1899
+ "learning_rate": 1.8910030537937515e-05,
1900
+ "loss": 0.179,
1901
+ "step": 2690
1902
+ },
1903
+ {
1904
+ "epoch": 31.19,
1905
+ "learning_rate": 1.8792576932111818e-05,
1906
+ "loss": 0.1589,
1907
+ "step": 2700
1908
+ },
1909
+ {
1910
+ "epoch": 31.3,
1911
+ "learning_rate": 1.8675123326286117e-05,
1912
+ "loss": 0.2133,
1913
+ "step": 2710
1914
+ },
1915
+ {
1916
+ "epoch": 31.42,
1917
+ "learning_rate": 1.855766972046042e-05,
1918
+ "loss": 0.173,
1919
+ "step": 2720
1920
+ },
1921
+ {
1922
+ "epoch": 31.53,
1923
+ "learning_rate": 1.844021611463472e-05,
1924
+ "loss": 0.1998,
1925
+ "step": 2730
1926
+ },
1927
+ {
1928
+ "epoch": 31.65,
1929
+ "learning_rate": 1.832276250880902e-05,
1930
+ "loss": 0.2054,
1931
+ "step": 2740
1932
+ },
1933
+ {
1934
+ "epoch": 31.77,
1935
+ "learning_rate": 1.8205308902983324e-05,
1936
+ "loss": 0.1739,
1937
+ "step": 2750
1938
+ },
1939
+ {
1940
+ "epoch": 31.88,
1941
+ "learning_rate": 1.8087855297157624e-05,
1942
+ "loss": 0.1581,
1943
+ "step": 2760
1944
+ },
1945
+ {
1946
+ "epoch": 32.0,
1947
+ "learning_rate": 1.7970401691331926e-05,
1948
+ "loss": 0.1991,
1949
+ "step": 2770
1950
+ },
1951
+ {
1952
+ "epoch": 32.0,
1953
+ "eval_accuracy": 0.9703153988868275,
1954
+ "eval_loss": 0.10002347081899643,
1955
+ "eval_runtime": 8.066,
1956
+ "eval_samples_per_second": 66.823,
1957
+ "eval_steps_per_second": 8.43,
1958
+ "step": 2770
1959
+ },
1960
+ {
1961
+ "epoch": 32.11,
1962
+ "learning_rate": 1.7852948085506225e-05,
1963
+ "loss": 0.1573,
1964
+ "step": 2780
1965
+ },
1966
+ {
1967
+ "epoch": 32.23,
1968
+ "learning_rate": 1.7735494479680528e-05,
1969
+ "loss": 0.1641,
1970
+ "step": 2790
1971
+ },
1972
+ {
1973
+ "epoch": 32.34,
1974
+ "learning_rate": 1.7618040873854827e-05,
1975
+ "loss": 0.1656,
1976
+ "step": 2800
1977
+ },
1978
+ {
1979
+ "epoch": 32.46,
1980
+ "learning_rate": 1.7500587268029127e-05,
1981
+ "loss": 0.2127,
1982
+ "step": 2810
1983
+ },
1984
+ {
1985
+ "epoch": 32.57,
1986
+ "learning_rate": 1.738313366220343e-05,
1987
+ "loss": 0.1756,
1988
+ "step": 2820
1989
+ },
1990
+ {
1991
+ "epoch": 32.69,
1992
+ "learning_rate": 1.7265680056377732e-05,
1993
+ "loss": 0.1754,
1994
+ "step": 2830
1995
+ },
1996
+ {
1997
+ "epoch": 32.81,
1998
+ "learning_rate": 1.714822645055203e-05,
1999
+ "loss": 0.1605,
2000
+ "step": 2840
2001
+ },
2002
+ {
2003
+ "epoch": 32.92,
2004
+ "learning_rate": 1.7030772844726334e-05,
2005
+ "loss": 0.1845,
2006
+ "step": 2850
2007
+ },
2008
+ {
2009
+ "epoch": 32.99,
2010
+ "eval_accuracy": 0.974025974025974,
2011
+ "eval_loss": 0.09888846427202225,
2012
+ "eval_runtime": 7.9989,
2013
+ "eval_samples_per_second": 67.385,
2014
+ "eval_steps_per_second": 8.501,
2015
+ "step": 2856
2016
+ },
2017
+ {
2018
+ "epoch": 33.04,
2019
+ "learning_rate": 1.6913319238900637e-05,
2020
+ "loss": 0.1855,
2021
+ "step": 2860
2022
+ },
2023
+ {
2024
+ "epoch": 33.15,
2025
+ "learning_rate": 1.6795865633074936e-05,
2026
+ "loss": 0.211,
2027
+ "step": 2870
2028
+ },
2029
+ {
2030
+ "epoch": 33.27,
2031
+ "learning_rate": 1.667841202724924e-05,
2032
+ "loss": 0.2067,
2033
+ "step": 2880
2034
+ },
2035
+ {
2036
+ "epoch": 33.38,
2037
+ "learning_rate": 1.6560958421423538e-05,
2038
+ "loss": 0.1738,
2039
+ "step": 2890
2040
+ },
2041
+ {
2042
+ "epoch": 33.5,
2043
+ "learning_rate": 1.644350481559784e-05,
2044
+ "loss": 0.1725,
2045
+ "step": 2900
2046
+ },
2047
+ {
2048
+ "epoch": 33.61,
2049
+ "learning_rate": 1.6326051209772143e-05,
2050
+ "loss": 0.1686,
2051
+ "step": 2910
2052
+ },
2053
+ {
2054
+ "epoch": 33.73,
2055
+ "learning_rate": 1.6208597603946442e-05,
2056
+ "loss": 0.1642,
2057
+ "step": 2920
2058
+ },
2059
+ {
2060
+ "epoch": 33.84,
2061
+ "learning_rate": 1.6091143998120742e-05,
2062
+ "loss": 0.1527,
2063
+ "step": 2930
2064
+ },
2065
+ {
2066
+ "epoch": 33.96,
2067
+ "learning_rate": 1.5973690392295044e-05,
2068
+ "loss": 0.1605,
2069
+ "step": 2940
2070
+ },
2071
+ {
2072
+ "epoch": 34.0,
2073
+ "eval_accuracy": 0.9684601113172542,
2074
+ "eval_loss": 0.09749138355255127,
2075
+ "eval_runtime": 8.1236,
2076
+ "eval_samples_per_second": 66.35,
2077
+ "eval_steps_per_second": 8.371,
2078
+ "step": 2943
2079
+ },
2080
+ {
2081
+ "epoch": 34.08,
2082
+ "learning_rate": 1.5856236786469344e-05,
2083
+ "loss": 0.1699,
2084
+ "step": 2950
2085
+ },
2086
+ {
2087
+ "epoch": 34.19,
2088
+ "learning_rate": 1.5738783180643646e-05,
2089
+ "loss": 0.1646,
2090
+ "step": 2960
2091
+ },
2092
+ {
2093
+ "epoch": 34.31,
2094
+ "learning_rate": 1.5621329574817946e-05,
2095
+ "loss": 0.1779,
2096
+ "step": 2970
2097
+ },
2098
+ {
2099
+ "epoch": 34.42,
2100
+ "learning_rate": 1.5503875968992248e-05,
2101
+ "loss": 0.1908,
2102
+ "step": 2980
2103
+ },
2104
+ {
2105
+ "epoch": 34.54,
2106
+ "learning_rate": 1.538642236316655e-05,
2107
+ "loss": 0.182,
2108
+ "step": 2990
2109
+ },
2110
+ {
2111
+ "epoch": 34.65,
2112
+ "learning_rate": 1.526896875734085e-05,
2113
+ "loss": 0.2004,
2114
+ "step": 3000
2115
+ },
2116
+ {
2117
+ "epoch": 34.77,
2118
+ "learning_rate": 1.5151515151515153e-05,
2119
+ "loss": 0.1426,
2120
+ "step": 3010
2121
+ },
2122
+ {
2123
+ "epoch": 34.88,
2124
+ "learning_rate": 1.5034061545689454e-05,
2125
+ "loss": 0.1614,
2126
+ "step": 3020
2127
+ },
2128
+ {
2129
+ "epoch": 35.0,
2130
+ "learning_rate": 1.4916607939863755e-05,
2131
+ "loss": 0.1928,
2132
+ "step": 3030
2133
+ },
2134
+ {
2135
+ "epoch": 35.0,
2136
+ "eval_accuracy": 0.9628942486085343,
2137
+ "eval_loss": 0.4555383026599884,
2138
+ "eval_runtime": 7.8835,
2139
+ "eval_samples_per_second": 68.371,
2140
+ "eval_steps_per_second": 8.626,
2141
+ "step": 3030
2142
+ },
2143
+ {
2144
+ "epoch": 35.12,
2145
+ "learning_rate": 1.4799154334038057e-05,
2146
+ "loss": 0.1884,
2147
+ "step": 3040
2148
+ },
2149
+ {
2150
+ "epoch": 35.23,
2151
+ "learning_rate": 1.4681700728212358e-05,
2152
+ "loss": 0.1651,
2153
+ "step": 3050
2154
+ },
2155
+ {
2156
+ "epoch": 35.35,
2157
+ "learning_rate": 1.4564247122386656e-05,
2158
+ "loss": 0.1602,
2159
+ "step": 3060
2160
+ },
2161
+ {
2162
+ "epoch": 35.46,
2163
+ "learning_rate": 1.4446793516560959e-05,
2164
+ "loss": 0.1688,
2165
+ "step": 3070
2166
+ },
2167
+ {
2168
+ "epoch": 35.58,
2169
+ "learning_rate": 1.432933991073526e-05,
2170
+ "loss": 0.1719,
2171
+ "step": 3080
2172
+ },
2173
+ {
2174
+ "epoch": 35.69,
2175
+ "learning_rate": 1.421188630490956e-05,
2176
+ "loss": 0.1608,
2177
+ "step": 3090
2178
+ },
2179
+ {
2180
+ "epoch": 35.81,
2181
+ "learning_rate": 1.4094432699083862e-05,
2182
+ "loss": 0.163,
2183
+ "step": 3100
2184
+ },
2185
+ {
2186
+ "epoch": 35.92,
2187
+ "learning_rate": 1.3976979093258164e-05,
2188
+ "loss": 0.1506,
2189
+ "step": 3110
2190
+ },
2191
+ {
2192
+ "epoch": 35.99,
2193
+ "eval_accuracy": 0.9703153988868275,
2194
+ "eval_loss": 0.1059107631444931,
2195
+ "eval_runtime": 8.0108,
2196
+ "eval_samples_per_second": 67.284,
2197
+ "eval_steps_per_second": 8.489,
2198
+ "step": 3116
2199
+ },
2200
+ {
2201
+ "epoch": 36.04,
2202
+ "learning_rate": 1.3859525487432465e-05,
2203
+ "loss": 0.1802,
2204
+ "step": 3120
2205
+ },
2206
+ {
2207
+ "epoch": 36.16,
2208
+ "learning_rate": 1.3742071881606766e-05,
2209
+ "loss": 0.1332,
2210
+ "step": 3130
2211
+ },
2212
+ {
2213
+ "epoch": 36.27,
2214
+ "learning_rate": 1.3624618275781067e-05,
2215
+ "loss": 0.1298,
2216
+ "step": 3140
2217
+ },
2218
+ {
2219
+ "epoch": 36.39,
2220
+ "learning_rate": 1.3507164669955368e-05,
2221
+ "loss": 0.1705,
2222
+ "step": 3150
2223
+ },
2224
+ {
2225
+ "epoch": 36.5,
2226
+ "learning_rate": 1.338971106412967e-05,
2227
+ "loss": 0.1431,
2228
+ "step": 3160
2229
+ },
2230
+ {
2231
+ "epoch": 36.62,
2232
+ "learning_rate": 1.3272257458303972e-05,
2233
+ "loss": 0.1582,
2234
+ "step": 3170
2235
+ },
2236
+ {
2237
+ "epoch": 36.73,
2238
+ "learning_rate": 1.3154803852478273e-05,
2239
+ "loss": 0.1544,
2240
+ "step": 3180
2241
+ },
2242
+ {
2243
+ "epoch": 36.85,
2244
+ "learning_rate": 1.3037350246652572e-05,
2245
+ "loss": 0.1966,
2246
+ "step": 3190
2247
+ },
2248
+ {
2249
+ "epoch": 36.96,
2250
+ "learning_rate": 1.2919896640826873e-05,
2251
+ "loss": 0.1912,
2252
+ "step": 3200
2253
+ },
2254
+ {
2255
+ "epoch": 37.0,
2256
+ "eval_accuracy": 0.9647495361781077,
2257
+ "eval_loss": 0.10163893550634384,
2258
+ "eval_runtime": 8.0326,
2259
+ "eval_samples_per_second": 67.102,
2260
+ "eval_steps_per_second": 8.466,
2261
+ "step": 3203
2262
+ },
2263
+ {
2264
+ "epoch": 37.08,
2265
+ "learning_rate": 1.2802443035001174e-05,
2266
+ "loss": 0.1956,
2267
+ "step": 3210
2268
+ },
2269
+ {
2270
+ "epoch": 37.19,
2271
+ "learning_rate": 1.2684989429175475e-05,
2272
+ "loss": 0.1705,
2273
+ "step": 3220
2274
+ },
2275
+ {
2276
+ "epoch": 37.31,
2277
+ "learning_rate": 1.2567535823349778e-05,
2278
+ "loss": 0.1559,
2279
+ "step": 3230
2280
+ },
2281
+ {
2282
+ "epoch": 37.43,
2283
+ "learning_rate": 1.2450082217524079e-05,
2284
+ "loss": 0.1684,
2285
+ "step": 3240
2286
+ },
2287
+ {
2288
+ "epoch": 37.54,
2289
+ "learning_rate": 1.233262861169838e-05,
2290
+ "loss": 0.1962,
2291
+ "step": 3250
2292
+ },
2293
+ {
2294
+ "epoch": 37.66,
2295
+ "learning_rate": 1.221517500587268e-05,
2296
+ "loss": 0.1528,
2297
+ "step": 3260
2298
+ },
2299
+ {
2300
+ "epoch": 37.77,
2301
+ "learning_rate": 1.2097721400046983e-05,
2302
+ "loss": 0.1788,
2303
+ "step": 3270
2304
+ },
2305
+ {
2306
+ "epoch": 37.89,
2307
+ "learning_rate": 1.1980267794221284e-05,
2308
+ "loss": 0.1689,
2309
+ "step": 3280
2310
+ },
2311
+ {
2312
+ "epoch": 37.99,
2313
+ "eval_accuracy": 0.9666048237476809,
2314
+ "eval_loss": 0.5420700907707214,
2315
+ "eval_runtime": 8.2102,
2316
+ "eval_samples_per_second": 65.65,
2317
+ "eval_steps_per_second": 8.282,
2318
+ "step": 3289
2319
+ },
2320
+ {
2321
+ "epoch": 38.0,
2322
+ "learning_rate": 1.1862814188395583e-05,
2323
+ "loss": 0.1739,
2324
+ "step": 3290
2325
+ },
2326
+ {
2327
+ "epoch": 38.12,
2328
+ "learning_rate": 1.1745360582569884e-05,
2329
+ "loss": 0.1396,
2330
+ "step": 3300
2331
+ },
2332
+ {
2333
+ "epoch": 38.23,
2334
+ "learning_rate": 1.1627906976744187e-05,
2335
+ "loss": 0.1871,
2336
+ "step": 3310
2337
+ },
2338
+ {
2339
+ "epoch": 38.35,
2340
+ "learning_rate": 1.1510453370918488e-05,
2341
+ "loss": 0.1947,
2342
+ "step": 3320
2343
+ },
2344
+ {
2345
+ "epoch": 38.47,
2346
+ "learning_rate": 1.1392999765092789e-05,
2347
+ "loss": 0.1823,
2348
+ "step": 3330
2349
+ },
2350
+ {
2351
+ "epoch": 38.58,
2352
+ "learning_rate": 1.127554615926709e-05,
2353
+ "loss": 0.1816,
2354
+ "step": 3340
2355
+ },
2356
+ {
2357
+ "epoch": 38.7,
2358
+ "learning_rate": 1.1158092553441393e-05,
2359
+ "loss": 0.2031,
2360
+ "step": 3350
2361
+ },
2362
+ {
2363
+ "epoch": 38.81,
2364
+ "learning_rate": 1.1040638947615692e-05,
2365
+ "loss": 0.1764,
2366
+ "step": 3360
2367
+ },
2368
+ {
2369
+ "epoch": 38.93,
2370
+ "learning_rate": 1.0923185341789993e-05,
2371
+ "loss": 0.1467,
2372
+ "step": 3370
2373
+ },
2374
+ {
2375
+ "epoch": 39.0,
2376
+ "eval_accuracy": 0.9647495361781077,
2377
+ "eval_loss": 0.10951773822307587,
2378
+ "eval_runtime": 8.0526,
2379
+ "eval_samples_per_second": 66.935,
2380
+ "eval_steps_per_second": 8.444,
2381
+ "step": 3376
2382
+ },
2383
+ {
2384
+ "epoch": 39.04,
2385
+ "learning_rate": 1.0805731735964294e-05,
2386
+ "loss": 0.1615,
2387
+ "step": 3380
2388
+ },
2389
+ {
2390
+ "epoch": 39.16,
2391
+ "learning_rate": 1.0688278130138596e-05,
2392
+ "loss": 0.1797,
2393
+ "step": 3390
2394
+ },
2395
+ {
2396
+ "epoch": 39.27,
2397
+ "learning_rate": 1.0570824524312897e-05,
2398
+ "loss": 0.1314,
2399
+ "step": 3400
2400
+ },
2401
+ {
2402
+ "epoch": 39.39,
2403
+ "learning_rate": 1.0453370918487198e-05,
2404
+ "loss": 0.19,
2405
+ "step": 3410
2406
+ },
2407
+ {
2408
+ "epoch": 39.5,
2409
+ "learning_rate": 1.03359173126615e-05,
2410
+ "loss": 0.1955,
2411
+ "step": 3420
2412
+ },
2413
+ {
2414
+ "epoch": 39.62,
2415
+ "learning_rate": 1.02184637068358e-05,
2416
+ "loss": 0.1635,
2417
+ "step": 3430
2418
+ },
2419
+ {
2420
+ "epoch": 39.74,
2421
+ "learning_rate": 1.0101010101010101e-05,
2422
+ "loss": 0.1544,
2423
+ "step": 3440
2424
+ },
2425
+ {
2426
+ "epoch": 39.85,
2427
+ "learning_rate": 9.983556495184402e-06,
2428
+ "loss": 0.1604,
2429
+ "step": 3450
2430
+ },
2431
+ {
2432
+ "epoch": 39.97,
2433
+ "learning_rate": 9.866102889358703e-06,
2434
+ "loss": 0.1513,
2435
+ "step": 3460
2436
+ },
2437
+ {
2438
+ "epoch": 39.99,
2439
+ "eval_accuracy": 0.9703153988868275,
2440
+ "eval_loss": 0.3827688992023468,
2441
+ "eval_runtime": 7.9622,
2442
+ "eval_samples_per_second": 67.695,
2443
+ "eval_steps_per_second": 8.54,
2444
+ "step": 3462
2445
+ },
2446
+ {
2447
+ "epoch": 40.08,
2448
+ "learning_rate": 9.748649283533006e-06,
2449
+ "loss": 0.1578,
2450
+ "step": 3470
2451
+ },
2452
+ {
2453
+ "epoch": 40.2,
2454
+ "learning_rate": 9.631195677707307e-06,
2455
+ "loss": 0.1633,
2456
+ "step": 3480
2457
+ },
2458
+ {
2459
+ "epoch": 40.31,
2460
+ "learning_rate": 9.513742071881606e-06,
2461
+ "loss": 0.1273,
2462
+ "step": 3490
2463
+ },
2464
+ {
2465
+ "epoch": 40.43,
2466
+ "learning_rate": 9.396288466055909e-06,
2467
+ "loss": 0.1535,
2468
+ "step": 3500
2469
+ },
2470
+ {
2471
+ "epoch": 40.54,
2472
+ "learning_rate": 9.27883486023021e-06,
2473
+ "loss": 0.2011,
2474
+ "step": 3510
2475
+ },
2476
+ {
2477
+ "epoch": 40.66,
2478
+ "learning_rate": 9.16138125440451e-06,
2479
+ "loss": 0.1801,
2480
+ "step": 3520
2481
+ },
2482
+ {
2483
+ "epoch": 40.78,
2484
+ "learning_rate": 9.043927648578812e-06,
2485
+ "loss": 0.1341,
2486
+ "step": 3530
2487
+ },
2488
+ {
2489
+ "epoch": 40.89,
2490
+ "learning_rate": 8.926474042753113e-06,
2491
+ "loss": 0.1768,
2492
+ "step": 3540
2493
+ },
2494
+ {
2495
+ "epoch": 41.0,
2496
+ "eval_accuracy": 0.9703153988868275,
2497
+ "eval_loss": 0.09445924311876297,
2498
+ "eval_runtime": 8.0713,
2499
+ "eval_samples_per_second": 66.78,
2500
+ "eval_steps_per_second": 8.425,
2501
+ "step": 3549
2502
+ },
2503
+ {
2504
+ "epoch": 41.01,
2505
+ "learning_rate": 8.809020436927414e-06,
2506
+ "loss": 0.1797,
2507
+ "step": 3550
2508
+ },
2509
+ {
2510
+ "epoch": 41.12,
2511
+ "learning_rate": 8.691566831101715e-06,
2512
+ "loss": 0.1782,
2513
+ "step": 3560
2514
+ },
2515
+ {
2516
+ "epoch": 41.24,
2517
+ "learning_rate": 8.574113225276016e-06,
2518
+ "loss": 0.164,
2519
+ "step": 3570
2520
+ },
2521
+ {
2522
+ "epoch": 41.35,
2523
+ "learning_rate": 8.456659619450318e-06,
2524
+ "loss": 0.156,
2525
+ "step": 3580
2526
+ },
2527
+ {
2528
+ "epoch": 41.47,
2529
+ "learning_rate": 8.33920601362462e-06,
2530
+ "loss": 0.1603,
2531
+ "step": 3590
2532
+ },
2533
+ {
2534
+ "epoch": 41.58,
2535
+ "learning_rate": 8.22175240779892e-06,
2536
+ "loss": 0.1663,
2537
+ "step": 3600
2538
+ },
2539
+ {
2540
+ "epoch": 41.7,
2541
+ "learning_rate": 8.104298801973221e-06,
2542
+ "loss": 0.1433,
2543
+ "step": 3610
2544
+ },
2545
+ {
2546
+ "epoch": 41.82,
2547
+ "learning_rate": 7.986845196147522e-06,
2548
+ "loss": 0.1769,
2549
+ "step": 3620
2550
+ },
2551
+ {
2552
+ "epoch": 41.93,
2553
+ "learning_rate": 7.869391590321823e-06,
2554
+ "loss": 0.1633,
2555
+ "step": 3630
2556
+ },
2557
+ {
2558
+ "epoch": 42.0,
2559
+ "eval_accuracy": 0.9591836734693877,
2560
+ "eval_loss": 0.22497127950191498,
2561
+ "eval_runtime": 8.2319,
2562
+ "eval_samples_per_second": 65.477,
2563
+ "eval_steps_per_second": 8.261,
2564
+ "step": 3636
2565
+ },
2566
+ {
2567
+ "epoch": 42.05,
2568
+ "learning_rate": 7.751937984496124e-06,
2569
+ "loss": 0.1682,
2570
+ "step": 3640
2571
+ },
2572
+ {
2573
+ "epoch": 42.16,
2574
+ "learning_rate": 7.634484378670425e-06,
2575
+ "loss": 0.1527,
2576
+ "step": 3650
2577
+ },
2578
+ {
2579
+ "epoch": 42.28,
2580
+ "learning_rate": 7.517030772844727e-06,
2581
+ "loss": 0.1615,
2582
+ "step": 3660
2583
+ },
2584
+ {
2585
+ "epoch": 42.39,
2586
+ "learning_rate": 7.399577167019029e-06,
2587
+ "loss": 0.1867,
2588
+ "step": 3670
2589
+ },
2590
+ {
2591
+ "epoch": 42.51,
2592
+ "learning_rate": 7.282123561193328e-06,
2593
+ "loss": 0.1696,
2594
+ "step": 3680
2595
+ },
2596
+ {
2597
+ "epoch": 42.62,
2598
+ "learning_rate": 7.16466995536763e-06,
2599
+ "loss": 0.1257,
2600
+ "step": 3690
2601
+ },
2602
+ {
2603
+ "epoch": 42.74,
2604
+ "learning_rate": 7.047216349541931e-06,
2605
+ "loss": 0.1549,
2606
+ "step": 3700
2607
+ },
2608
+ {
2609
+ "epoch": 42.85,
2610
+ "learning_rate": 6.929762743716233e-06,
2611
+ "loss": 0.1604,
2612
+ "step": 3710
2613
+ },
2614
+ {
2615
+ "epoch": 42.97,
2616
+ "learning_rate": 6.812309137890534e-06,
2617
+ "loss": 0.1945,
2618
+ "step": 3720
2619
+ },
2620
+ {
2621
+ "epoch": 42.99,
2622
+ "eval_accuracy": 0.9684601113172542,
2623
+ "eval_loss": 0.2014760673046112,
2624
+ "eval_runtime": 8.2434,
2625
+ "eval_samples_per_second": 65.386,
2626
+ "eval_steps_per_second": 8.249,
2627
+ "step": 3722
2628
+ },
2629
+ {
2630
+ "epoch": 43.09,
2631
+ "learning_rate": 6.694855532064835e-06,
2632
+ "loss": 0.1757,
2633
+ "step": 3730
2634
+ },
2635
+ {
2636
+ "epoch": 43.2,
2637
+ "learning_rate": 6.577401926239136e-06,
2638
+ "loss": 0.1401,
2639
+ "step": 3740
2640
+ },
2641
+ {
2642
+ "epoch": 43.32,
2643
+ "learning_rate": 6.4599483204134365e-06,
2644
+ "loss": 0.1655,
2645
+ "step": 3750
2646
+ },
2647
+ {
2648
+ "epoch": 43.43,
2649
+ "learning_rate": 6.3424947145877375e-06,
2650
+ "loss": 0.178,
2651
+ "step": 3760
2652
+ },
2653
+ {
2654
+ "epoch": 43.55,
2655
+ "learning_rate": 6.225041108762039e-06,
2656
+ "loss": 0.1277,
2657
+ "step": 3770
2658
+ },
2659
+ {
2660
+ "epoch": 43.66,
2661
+ "learning_rate": 6.10758750293634e-06,
2662
+ "loss": 0.1861,
2663
+ "step": 3780
2664
+ },
2665
+ {
2666
+ "epoch": 43.78,
2667
+ "learning_rate": 5.990133897110642e-06,
2668
+ "loss": 0.1634,
2669
+ "step": 3790
2670
+ },
2671
+ {
2672
+ "epoch": 43.89,
2673
+ "learning_rate": 5.872680291284942e-06,
2674
+ "loss": 0.1896,
2675
+ "step": 3800
2676
+ },
2677
+ {
2678
+ "epoch": 44.0,
2679
+ "eval_accuracy": 0.9666048237476809,
2680
+ "eval_loss": 0.11137495934963226,
2681
+ "eval_runtime": 8.1155,
2682
+ "eval_samples_per_second": 66.416,
2683
+ "eval_steps_per_second": 8.379,
2684
+ "step": 3809
2685
+ },
2686
+ {
2687
+ "epoch": 44.01,
2688
+ "learning_rate": 5.755226685459244e-06,
2689
+ "loss": 0.1602,
2690
+ "step": 3810
2691
+ },
2692
+ {
2693
+ "epoch": 44.13,
2694
+ "learning_rate": 5.637773079633545e-06,
2695
+ "loss": 0.1184,
2696
+ "step": 3820
2697
+ },
2698
+ {
2699
+ "epoch": 44.24,
2700
+ "learning_rate": 5.520319473807846e-06,
2701
+ "loss": 0.144,
2702
+ "step": 3830
2703
+ },
2704
+ {
2705
+ "epoch": 44.36,
2706
+ "learning_rate": 5.402865867982147e-06,
2707
+ "loss": 0.1956,
2708
+ "step": 3840
2709
+ },
2710
+ {
2711
+ "epoch": 44.47,
2712
+ "learning_rate": 5.285412262156449e-06,
2713
+ "loss": 0.1654,
2714
+ "step": 3850
2715
+ },
2716
+ {
2717
+ "epoch": 44.59,
2718
+ "learning_rate": 5.16795865633075e-06,
2719
+ "loss": 0.1889,
2720
+ "step": 3860
2721
+ },
2722
+ {
2723
+ "epoch": 44.7,
2724
+ "learning_rate": 5.050505050505051e-06,
2725
+ "loss": 0.108,
2726
+ "step": 3870
2727
+ },
2728
+ {
2729
+ "epoch": 44.82,
2730
+ "learning_rate": 4.933051444679352e-06,
2731
+ "loss": 0.1702,
2732
+ "step": 3880
2733
+ },
2734
+ {
2735
+ "epoch": 44.93,
2736
+ "learning_rate": 4.8155978388536535e-06,
2737
+ "loss": 0.1629,
2738
+ "step": 3890
2739
+ },
2740
+ {
2741
+ "epoch": 44.99,
2742
+ "eval_accuracy": 0.9666048237476809,
2743
+ "eval_loss": 0.0954408049583435,
2744
+ "eval_runtime": 8.1596,
2745
+ "eval_samples_per_second": 66.057,
2746
+ "eval_steps_per_second": 8.334,
2747
+ "step": 3895
2748
+ },
2749
+ {
2750
+ "epoch": 45.05,
2751
+ "learning_rate": 4.6981442330279544e-06,
2752
+ "loss": 0.1729,
2753
+ "step": 3900
2754
+ },
2755
+ {
2756
+ "epoch": 45.17,
2757
+ "learning_rate": 4.580690627202255e-06,
2758
+ "loss": 0.1781,
2759
+ "step": 3910
2760
+ },
2761
+ {
2762
+ "epoch": 45.28,
2763
+ "learning_rate": 4.463237021376556e-06,
2764
+ "loss": 0.1565,
2765
+ "step": 3920
2766
+ },
2767
+ {
2768
+ "epoch": 45.4,
2769
+ "learning_rate": 4.345783415550857e-06,
2770
+ "loss": 0.1445,
2771
+ "step": 3930
2772
+ },
2773
+ {
2774
+ "epoch": 45.51,
2775
+ "learning_rate": 4.228329809725159e-06,
2776
+ "loss": 0.1315,
2777
+ "step": 3940
2778
+ },
2779
+ {
2780
+ "epoch": 45.63,
2781
+ "learning_rate": 4.11087620389946e-06,
2782
+ "loss": 0.1578,
2783
+ "step": 3950
2784
+ },
2785
+ {
2786
+ "epoch": 45.74,
2787
+ "learning_rate": 3.993422598073761e-06,
2788
+ "loss": 0.1875,
2789
+ "step": 3960
2790
+ },
2791
+ {
2792
+ "epoch": 45.86,
2793
+ "learning_rate": 3.875968992248062e-06,
2794
+ "loss": 0.2048,
2795
+ "step": 3970
2796
+ },
2797
+ {
2798
+ "epoch": 45.97,
2799
+ "learning_rate": 3.7585153864223635e-06,
2800
+ "loss": 0.1825,
2801
+ "step": 3980
2802
+ },
2803
+ {
2804
+ "epoch": 46.0,
2805
+ "eval_accuracy": 0.974025974025974,
2806
+ "eval_loss": 0.09737637639045715,
2807
+ "eval_runtime": 8.2567,
2808
+ "eval_samples_per_second": 65.28,
2809
+ "eval_steps_per_second": 8.236,
2810
+ "step": 3982
2811
+ },
2812
+ {
2813
+ "epoch": 46.09,
2814
+ "learning_rate": 3.641061780596664e-06,
2815
+ "loss": 0.1715,
2816
+ "step": 3990
2817
+ },
2818
+ {
2819
+ "epoch": 46.2,
2820
+ "learning_rate": 3.5236081747709654e-06,
2821
+ "loss": 0.1679,
2822
+ "step": 4000
2823
+ },
2824
+ {
2825
+ "epoch": 46.32,
2826
+ "learning_rate": 3.406154568945267e-06,
2827
+ "loss": 0.1809,
2828
+ "step": 4010
2829
+ },
2830
+ {
2831
+ "epoch": 46.44,
2832
+ "learning_rate": 3.288700963119568e-06,
2833
+ "loss": 0.1582,
2834
+ "step": 4020
2835
+ },
2836
+ {
2837
+ "epoch": 46.55,
2838
+ "learning_rate": 3.1712473572938687e-06,
2839
+ "loss": 0.1497,
2840
+ "step": 4030
2841
+ },
2842
+ {
2843
+ "epoch": 46.67,
2844
+ "learning_rate": 3.05379375146817e-06,
2845
+ "loss": 0.1748,
2846
+ "step": 4040
2847
+ },
2848
+ {
2849
+ "epoch": 46.78,
2850
+ "learning_rate": 2.936340145642471e-06,
2851
+ "loss": 0.1893,
2852
+ "step": 4050
2853
+ },
2854
+ {
2855
+ "epoch": 46.9,
2856
+ "learning_rate": 2.8188865398167725e-06,
2857
+ "loss": 0.1664,
2858
+ "step": 4060
2859
+ },
2860
+ {
2861
+ "epoch": 46.99,
2862
+ "eval_accuracy": 0.9703153988868275,
2863
+ "eval_loss": 0.09385673701763153,
2864
+ "eval_runtime": 8.1532,
2865
+ "eval_samples_per_second": 66.109,
2866
+ "eval_steps_per_second": 8.34,
2867
+ "step": 4068
2868
+ },
2869
+ {
2870
+ "epoch": 47.01,
2871
+ "learning_rate": 2.7014329339910735e-06,
2872
+ "loss": 0.1701,
2873
+ "step": 4070
2874
+ },
2875
+ {
2876
+ "epoch": 47.13,
2877
+ "learning_rate": 2.583979328165375e-06,
2878
+ "loss": 0.1427,
2879
+ "step": 4080
2880
+ },
2881
+ {
2882
+ "epoch": 47.24,
2883
+ "learning_rate": 2.466525722339676e-06,
2884
+ "loss": 0.1303,
2885
+ "step": 4090
2886
+ },
2887
+ {
2888
+ "epoch": 47.36,
2889
+ "learning_rate": 2.3490721165139772e-06,
2890
+ "loss": 0.1459,
2891
+ "step": 4100
2892
+ },
2893
+ {
2894
+ "epoch": 47.48,
2895
+ "learning_rate": 2.231618510688278e-06,
2896
+ "loss": 0.1548,
2897
+ "step": 4110
2898
+ },
2899
+ {
2900
+ "epoch": 47.59,
2901
+ "learning_rate": 2.1141649048625796e-06,
2902
+ "loss": 0.1562,
2903
+ "step": 4120
2904
+ },
2905
+ {
2906
+ "epoch": 47.71,
2907
+ "learning_rate": 1.9967112990368805e-06,
2908
+ "loss": 0.1668,
2909
+ "step": 4130
2910
+ },
2911
+ {
2912
+ "epoch": 47.82,
2913
+ "learning_rate": 1.8792576932111817e-06,
2914
+ "loss": 0.1512,
2915
+ "step": 4140
2916
+ },
2917
+ {
2918
+ "epoch": 47.94,
2919
+ "learning_rate": 1.7618040873854827e-06,
2920
+ "loss": 0.1535,
2921
+ "step": 4150
2922
+ },
2923
+ {
2924
+ "epoch": 48.0,
2925
+ "eval_accuracy": 0.9721706864564007,
2926
+ "eval_loss": 0.09351829439401627,
2927
+ "eval_runtime": 8.1415,
2928
+ "eval_samples_per_second": 66.204,
2929
+ "eval_steps_per_second": 8.352,
2930
+ "step": 4155
2931
+ },
2932
+ {
2933
+ "epoch": 48.05,
2934
+ "learning_rate": 1.644350481559784e-06,
2935
+ "loss": 0.1624,
2936
+ "step": 4160
2937
+ },
2938
+ {
2939
+ "epoch": 48.17,
2940
+ "learning_rate": 1.526896875734085e-06,
2941
+ "loss": 0.1332,
2942
+ "step": 4170
2943
+ },
2944
+ {
2945
+ "epoch": 48.28,
2946
+ "learning_rate": 1.4094432699083862e-06,
2947
+ "loss": 0.1957,
2948
+ "step": 4180
2949
+ },
2950
+ {
2951
+ "epoch": 48.4,
2952
+ "learning_rate": 1.2919896640826874e-06,
2953
+ "loss": 0.139,
2954
+ "step": 4190
2955
+ },
2956
+ {
2957
+ "epoch": 48.51,
2958
+ "learning_rate": 1.1745360582569886e-06,
2959
+ "loss": 0.1589,
2960
+ "step": 4200
2961
+ },
2962
+ {
2963
+ "epoch": 48.63,
2964
+ "learning_rate": 1.0570824524312898e-06,
2965
+ "loss": 0.1696,
2966
+ "step": 4210
2967
+ },
2968
+ {
2969
+ "epoch": 48.75,
2970
+ "learning_rate": 9.396288466055909e-07,
2971
+ "loss": 0.1712,
2972
+ "step": 4220
2973
+ },
2974
+ {
2975
+ "epoch": 48.86,
2976
+ "learning_rate": 8.22175240779892e-07,
2977
+ "loss": 0.1562,
2978
+ "step": 4230
2979
+ },
2980
+ {
2981
+ "epoch": 48.98,
2982
+ "learning_rate": 7.047216349541931e-07,
2983
+ "loss": 0.1801,
2984
+ "step": 4240
2985
+ },
2986
+ {
2987
+ "epoch": 49.0,
2988
+ "eval_accuracy": 0.9703153988868275,
2989
+ "eval_loss": 0.09990726411342621,
2990
+ "eval_runtime": 8.1674,
2991
+ "eval_samples_per_second": 65.994,
2992
+ "eval_steps_per_second": 8.326,
2993
+ "step": 4242
2994
+ },
2995
+ {
2996
+ "epoch": 49.09,
2997
+ "learning_rate": 5.872680291284943e-07,
2998
+ "loss": 0.173,
2999
+ "step": 4250
3000
+ },
3001
+ {
3002
+ "epoch": 49.21,
3003
+ "learning_rate": 4.6981442330279543e-07,
3004
+ "loss": 0.1455,
3005
+ "step": 4260
3006
+ },
3007
+ {
3008
+ "epoch": 49.32,
3009
+ "learning_rate": 3.5236081747709656e-07,
3010
+ "loss": 0.1765,
3011
+ "step": 4270
3012
+ },
3013
+ {
3014
+ "epoch": 49.44,
3015
+ "learning_rate": 2.3490721165139772e-07,
3016
+ "loss": 0.1866,
3017
+ "step": 4280
3018
+ },
3019
+ {
3020
+ "epoch": 49.55,
3021
+ "learning_rate": 1.1745360582569886e-07,
3022
+ "loss": 0.1522,
3023
+ "step": 4290
3024
+ },
3025
+ {
3026
+ "epoch": 49.67,
3027
+ "learning_rate": 0.0,
3028
+ "loss": 0.1502,
3029
+ "step": 4300
3030
+ },
3031
+ {
3032
+ "epoch": 49.67,
3033
+ "eval_accuracy": 0.9703153988868275,
3034
+ "eval_loss": 0.19585400819778442,
3035
+ "eval_runtime": 8.3207,
3036
+ "eval_samples_per_second": 64.779,
3037
+ "eval_steps_per_second": 8.172,
3038
+ "step": 4300
3039
+ },
3040
+ {
3041
+ "epoch": 49.67,
3042
+ "step": 4300,
3043
+ "total_flos": 5.11036354111998e+18,
3044
+ "train_loss": 0.23105980243793753,
3045
+ "train_runtime": 6101.6743,
3046
+ "train_samples_per_second": 39.702,
3047
+ "train_steps_per_second": 0.705
3048
  }
3049
  ],
3050
+ "max_steps": 4300,
3051
+ "num_train_epochs": 50,
3052
+ "total_flos": 5.11036354111998e+18,
3053
  "trial_name": null,
3054
  "trial_params": null
3055
  }