jalaneunos commited on
Commit
09d6b7d
1 Parent(s): 71af103

End of training

Browse files
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_accuracy": 0.8683385579937304,
4
- "eval_loss": 0.3320719301700592,
5
- "eval_runtime": 19.3598,
6
- "eval_samples_per_second": 148.297,
7
- "eval_steps_per_second": 4.649,
8
  "total_flos": 2.0021605356722135e+19,
9
- "train_loss": 0.3826778081384036,
10
- "train_runtime": 3330.0466,
11
- "train_samples_per_second": 77.588,
12
- "train_steps_per_second": 0.607
13
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_accuracy": 0.8732149076976663,
4
+ "eval_loss": 0.3263641893863678,
5
+ "eval_runtime": 15.9042,
6
+ "eval_samples_per_second": 180.518,
7
+ "eval_steps_per_second": 5.659,
8
  "total_flos": 2.0021605356722135e+19,
9
+ "train_loss": 0.37925267927717454,
10
+ "train_runtime": 3266.1275,
11
+ "train_samples_per_second": 79.106,
12
+ "train_steps_per_second": 0.618
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_accuracy": 0.8683385579937304,
4
- "eval_loss": 0.3320719301700592,
5
- "eval_runtime": 19.3598,
6
- "eval_samples_per_second": 148.297,
7
- "eval_steps_per_second": 4.649
8
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_accuracy": 0.8732149076976663,
4
+ "eval_loss": 0.3263641893863678,
5
+ "eval_runtime": 15.9042,
6
+ "eval_samples_per_second": 180.518,
7
+ "eval_steps_per_second": 5.659
8
  }
runs/Mar16_07-56-12_9065e78e46e3/events.out.tfevents.1710579312.9065e78e46e3.418.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:514b9e06e795d1a44bc748b203fb89bde681035259920a64218a64963429dd9a
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
  "total_flos": 2.0021605356722135e+19,
4
- "train_loss": 0.3826778081384036,
5
- "train_runtime": 3330.0466,
6
- "train_samples_per_second": 77.588,
7
- "train_steps_per_second": 0.607
8
  }
 
1
  {
2
  "epoch": 10.0,
3
  "total_flos": 2.0021605356722135e+19,
4
+ "train_loss": 0.37925267927717454,
5
+ "train_runtime": 3266.1275,
6
+ "train_samples_per_second": 79.106,
7
+ "train_steps_per_second": 0.618
8
  }
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 0.8683385579937304,
3
- "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned-FER2013/checkpoint-1616",
4
  "epoch": 10.0,
5
  "eval_steps": 500,
6
  "global_step": 2020,
@@ -10,1314 +10,1516 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.05,
 
13
  "learning_rate": 2.4752475247524753e-06,
14
- "loss": 0.7062,
15
  "step": 10
16
  },
17
  {
18
  "epoch": 0.1,
 
19
  "learning_rate": 4.950495049504951e-06,
20
- "loss": 0.6984,
21
  "step": 20
22
  },
23
  {
24
  "epoch": 0.15,
 
25
  "learning_rate": 7.4257425742574256e-06,
26
  "loss": 0.6891,
27
  "step": 30
28
  },
29
  {
30
  "epoch": 0.2,
 
31
  "learning_rate": 9.900990099009901e-06,
32
- "loss": 0.6814,
33
  "step": 40
34
  },
35
  {
36
  "epoch": 0.25,
 
37
  "learning_rate": 1.2376237623762377e-05,
38
- "loss": 0.6688,
39
  "step": 50
40
  },
41
  {
42
  "epoch": 0.3,
 
43
  "learning_rate": 1.4851485148514851e-05,
44
- "loss": 0.6468,
45
  "step": 60
46
  },
47
  {
48
  "epoch": 0.35,
 
49
  "learning_rate": 1.7326732673267325e-05,
50
- "loss": 0.6276,
51
  "step": 70
52
  },
53
  {
54
  "epoch": 0.4,
 
55
  "learning_rate": 1.9801980198019803e-05,
56
- "loss": 0.6003,
57
  "step": 80
58
  },
59
  {
60
  "epoch": 0.45,
 
61
  "learning_rate": 2.227722772277228e-05,
62
- "loss": 0.5689,
63
  "step": 90
64
  },
65
  {
66
  "epoch": 0.5,
 
67
  "learning_rate": 2.4752475247524754e-05,
68
- "loss": 0.5499,
69
  "step": 100
70
  },
71
  {
72
  "epoch": 0.54,
 
73
  "learning_rate": 2.722772277227723e-05,
74
- "loss": 0.5432,
75
  "step": 110
76
  },
77
  {
78
  "epoch": 0.59,
 
79
  "learning_rate": 2.9702970297029702e-05,
80
- "loss": 0.5408,
81
  "step": 120
82
  },
83
  {
84
  "epoch": 0.64,
 
85
  "learning_rate": 3.217821782178218e-05,
86
- "loss": 0.535,
87
  "step": 130
88
  },
89
  {
90
  "epoch": 0.69,
 
91
  "learning_rate": 3.465346534653465e-05,
92
- "loss": 0.5331,
93
  "step": 140
94
  },
95
  {
96
  "epoch": 0.74,
 
97
  "learning_rate": 3.712871287128713e-05,
98
- "loss": 0.4929,
99
  "step": 150
100
  },
101
  {
102
  "epoch": 0.79,
 
103
  "learning_rate": 3.9603960396039605e-05,
104
- "loss": 0.5054,
105
  "step": 160
106
  },
107
  {
108
  "epoch": 0.84,
 
109
  "learning_rate": 4.207920792079208e-05,
110
- "loss": 0.4882,
111
  "step": 170
112
  },
113
  {
114
  "epoch": 0.89,
 
115
  "learning_rate": 4.455445544554456e-05,
116
- "loss": 0.4844,
117
  "step": 180
118
  },
119
  {
120
  "epoch": 0.94,
 
121
  "learning_rate": 4.702970297029703e-05,
122
- "loss": 0.5134,
123
  "step": 190
124
  },
125
  {
126
  "epoch": 0.99,
 
127
  "learning_rate": 4.950495049504951e-05,
128
- "loss": 0.495,
129
  "step": 200
130
  },
131
  {
132
  "epoch": 1.0,
133
- "eval_accuracy": 0.7739463601532567,
134
- "eval_loss": 0.46603715419769287,
135
- "eval_runtime": 15.7268,
136
- "eval_samples_per_second": 182.555,
137
- "eval_steps_per_second": 5.723,
138
  "step": 202
139
  },
140
  {
141
  "epoch": 1.04,
 
142
  "learning_rate": 4.977997799779978e-05,
143
- "loss": 0.5214,
144
  "step": 210
145
  },
146
  {
147
  "epoch": 1.09,
 
148
  "learning_rate": 4.950495049504951e-05,
149
- "loss": 0.5009,
150
  "step": 220
151
  },
152
  {
153
  "epoch": 1.14,
 
154
  "learning_rate": 4.9229922992299234e-05,
155
- "loss": 0.4632,
156
  "step": 230
157
  },
158
  {
159
  "epoch": 1.19,
 
160
  "learning_rate": 4.895489548954896e-05,
161
- "loss": 0.4585,
162
  "step": 240
163
  },
164
  {
165
  "epoch": 1.24,
 
166
  "learning_rate": 4.867986798679868e-05,
167
- "loss": 0.4339,
168
  "step": 250
169
  },
170
  {
171
  "epoch": 1.29,
 
172
  "learning_rate": 4.8404840484048406e-05,
173
- "loss": 0.4703,
174
  "step": 260
175
  },
176
  {
177
  "epoch": 1.34,
 
178
  "learning_rate": 4.812981298129813e-05,
179
- "loss": 0.4748,
180
  "step": 270
181
  },
182
  {
183
  "epoch": 1.39,
 
184
  "learning_rate": 4.785478547854786e-05,
185
- "loss": 0.4688,
186
  "step": 280
187
  },
188
  {
189
  "epoch": 1.44,
 
190
  "learning_rate": 4.7579757975797585e-05,
191
- "loss": 0.4613,
192
  "step": 290
193
  },
194
  {
195
  "epoch": 1.49,
 
196
  "learning_rate": 4.730473047304731e-05,
197
- "loss": 0.4633,
198
  "step": 300
199
  },
200
  {
201
  "epoch": 1.53,
 
202
  "learning_rate": 4.702970297029703e-05,
203
- "loss": 0.4661,
204
  "step": 310
205
  },
206
  {
207
  "epoch": 1.58,
 
208
  "learning_rate": 4.675467546754676e-05,
209
- "loss": 0.4476,
210
  "step": 320
211
  },
212
  {
213
  "epoch": 1.63,
 
214
  "learning_rate": 4.647964796479648e-05,
215
- "loss": 0.458,
216
  "step": 330
217
  },
218
  {
219
  "epoch": 1.68,
 
220
  "learning_rate": 4.62046204620462e-05,
221
- "loss": 0.4606,
222
  "step": 340
223
  },
224
  {
225
  "epoch": 1.73,
 
226
  "learning_rate": 4.592959295929593e-05,
227
- "loss": 0.4433,
228
  "step": 350
229
  },
230
  {
231
  "epoch": 1.78,
 
232
  "learning_rate": 4.5654565456545655e-05,
233
- "loss": 0.4568,
234
  "step": 360
235
  },
236
  {
237
  "epoch": 1.83,
 
238
  "learning_rate": 4.537953795379538e-05,
239
- "loss": 0.4395,
240
  "step": 370
241
  },
242
  {
243
  "epoch": 1.88,
 
244
  "learning_rate": 4.510451045104511e-05,
245
- "loss": 0.448,
246
  "step": 380
247
  },
248
  {
249
  "epoch": 1.93,
 
250
  "learning_rate": 4.4829482948294834e-05,
251
- "loss": 0.4722,
252
  "step": 390
253
  },
254
  {
255
  "epoch": 1.98,
 
256
  "learning_rate": 4.455445544554456e-05,
257
- "loss": 0.4632,
258
  "step": 400
259
  },
260
  {
261
  "epoch": 2.0,
262
- "eval_accuracy": 0.8286311389759665,
263
- "eval_loss": 0.3819791376590729,
264
- "eval_runtime": 15.8456,
265
- "eval_samples_per_second": 181.185,
266
- "eval_steps_per_second": 5.68,
267
  "step": 404
268
  },
269
  {
270
  "epoch": 2.03,
 
271
  "learning_rate": 4.427942794279428e-05,
272
- "loss": 0.4614,
273
  "step": 410
274
  },
275
  {
276
  "epoch": 2.08,
 
277
  "learning_rate": 4.4004400440044006e-05,
278
- "loss": 0.4644,
279
  "step": 420
280
  },
281
  {
282
  "epoch": 2.13,
 
283
  "learning_rate": 4.372937293729373e-05,
284
- "loss": 0.4325,
285
  "step": 430
286
  },
287
  {
288
  "epoch": 2.18,
 
289
  "learning_rate": 4.345434543454346e-05,
290
- "loss": 0.4297,
291
  "step": 440
292
  },
293
  {
294
  "epoch": 2.23,
 
295
  "learning_rate": 4.3179317931793185e-05,
296
- "loss": 0.456,
297
  "step": 450
298
  },
299
  {
300
  "epoch": 2.28,
 
301
  "learning_rate": 4.2904290429042904e-05,
302
- "loss": 0.4119,
303
  "step": 460
304
  },
305
  {
306
  "epoch": 2.33,
 
307
  "learning_rate": 4.262926292629263e-05,
308
- "loss": 0.4101,
309
  "step": 470
310
  },
311
  {
312
  "epoch": 2.38,
 
313
  "learning_rate": 4.2354235423542356e-05,
314
- "loss": 0.3827,
315
  "step": 480
316
  },
317
  {
318
  "epoch": 2.43,
 
319
  "learning_rate": 4.207920792079208e-05,
320
- "loss": 0.4177,
321
  "step": 490
322
  },
323
  {
324
  "epoch": 2.48,
 
325
  "learning_rate": 4.18041804180418e-05,
326
- "loss": 0.4286,
327
  "step": 500
328
  },
329
  {
330
  "epoch": 2.52,
 
331
  "learning_rate": 4.152915291529153e-05,
332
- "loss": 0.4471,
333
  "step": 510
334
  },
335
  {
336
  "epoch": 2.57,
 
337
  "learning_rate": 4.1254125412541255e-05,
338
- "loss": 0.4304,
339
  "step": 520
340
  },
341
  {
342
  "epoch": 2.62,
 
343
  "learning_rate": 4.097909790979098e-05,
344
- "loss": 0.4043,
345
  "step": 530
346
  },
347
  {
348
  "epoch": 2.67,
 
349
  "learning_rate": 4.070407040704071e-05,
350
- "loss": 0.4277,
351
  "step": 540
352
  },
353
  {
354
  "epoch": 2.72,
 
355
  "learning_rate": 4.042904290429043e-05,
356
- "loss": 0.403,
357
  "step": 550
358
  },
359
  {
360
  "epoch": 2.77,
 
361
  "learning_rate": 4.015401540154016e-05,
362
- "loss": 0.4138,
363
  "step": 560
364
  },
365
  {
366
  "epoch": 2.82,
 
367
  "learning_rate": 3.987898789878988e-05,
368
- "loss": 0.4005,
369
  "step": 570
370
  },
371
  {
372
  "epoch": 2.87,
 
373
  "learning_rate": 3.9603960396039605e-05,
374
- "loss": 0.407,
375
  "step": 580
376
  },
377
  {
378
  "epoch": 2.92,
 
379
  "learning_rate": 3.932893289328933e-05,
380
- "loss": 0.4124,
381
  "step": 590
382
  },
383
  {
384
  "epoch": 2.97,
 
385
  "learning_rate": 3.905390539053906e-05,
386
- "loss": 0.4013,
387
  "step": 600
388
  },
389
  {
390
  "epoch": 3.0,
391
- "eval_accuracy": 0.8446534308603274,
392
- "eval_loss": 0.35617050528526306,
393
- "eval_runtime": 16.2713,
394
- "eval_samples_per_second": 176.445,
395
- "eval_steps_per_second": 5.531,
396
  "step": 606
397
  },
398
  {
399
  "epoch": 3.02,
 
400
  "learning_rate": 3.877887788778878e-05,
401
- "loss": 0.3881,
402
  "step": 610
403
  },
404
  {
405
  "epoch": 3.07,
 
406
  "learning_rate": 3.8503850385038503e-05,
407
- "loss": 0.3761,
408
  "step": 620
409
  },
410
  {
411
  "epoch": 3.12,
 
412
  "learning_rate": 3.822882288228823e-05,
413
- "loss": 0.4161,
414
  "step": 630
415
  },
416
  {
417
  "epoch": 3.17,
 
418
  "learning_rate": 3.7953795379537956e-05,
419
- "loss": 0.3975,
420
  "step": 640
421
  },
422
  {
423
  "epoch": 3.22,
 
424
  "learning_rate": 3.767876787678768e-05,
425
- "loss": 0.4224,
426
  "step": 650
427
  },
428
  {
429
  "epoch": 3.27,
 
430
  "learning_rate": 3.74037403740374e-05,
431
- "loss": 0.3849,
432
  "step": 660
433
  },
434
  {
435
  "epoch": 3.32,
 
436
  "learning_rate": 3.712871287128713e-05,
437
- "loss": 0.4178,
438
  "step": 670
439
  },
440
  {
441
  "epoch": 3.37,
 
442
  "learning_rate": 3.6853685368536854e-05,
443
- "loss": 0.3966,
444
  "step": 680
445
  },
446
  {
447
  "epoch": 3.42,
 
448
  "learning_rate": 3.657865786578658e-05,
449
- "loss": 0.3886,
450
  "step": 690
451
  },
452
  {
453
  "epoch": 3.47,
 
454
  "learning_rate": 3.6303630363036307e-05,
455
- "loss": 0.4021,
456
  "step": 700
457
  },
458
  {
459
  "epoch": 3.51,
 
460
  "learning_rate": 3.602860286028603e-05,
461
- "loss": 0.3913,
462
  "step": 710
463
  },
464
  {
465
  "epoch": 3.56,
 
466
  "learning_rate": 3.575357535753576e-05,
467
- "loss": 0.3886,
468
  "step": 720
469
  },
470
  {
471
  "epoch": 3.61,
 
472
  "learning_rate": 3.5478547854785485e-05,
473
- "loss": 0.4066,
474
  "step": 730
475
  },
476
  {
477
  "epoch": 3.66,
 
478
  "learning_rate": 3.5203520352035205e-05,
479
- "loss": 0.401,
480
  "step": 740
481
  },
482
  {
483
  "epoch": 3.71,
 
484
  "learning_rate": 3.492849284928493e-05,
485
- "loss": 0.4002,
486
  "step": 750
487
  },
488
  {
489
  "epoch": 3.76,
 
490
  "learning_rate": 3.465346534653465e-05,
491
- "loss": 0.3738,
492
  "step": 760
493
  },
494
  {
495
  "epoch": 3.81,
 
496
  "learning_rate": 3.4378437843784377e-05,
497
- "loss": 0.4218,
498
  "step": 770
499
  },
500
  {
501
  "epoch": 3.86,
 
502
  "learning_rate": 3.41034103410341e-05,
503
- "loss": 0.368,
504
  "step": 780
505
  },
506
  {
507
  "epoch": 3.91,
 
508
  "learning_rate": 3.382838283828383e-05,
509
- "loss": 0.3969,
510
  "step": 790
511
  },
512
  {
513
  "epoch": 3.96,
 
514
  "learning_rate": 3.3553355335533555e-05,
515
- "loss": 0.3883,
516
  "step": 800
517
  },
518
  {
519
  "epoch": 4.0,
520
- "eval_accuracy": 0.851619644723093,
521
- "eval_loss": 0.3426118791103363,
522
- "eval_runtime": 15.7929,
523
- "eval_samples_per_second": 181.79,
524
- "eval_steps_per_second": 5.699,
525
  "step": 808
526
  },
527
  {
528
  "epoch": 4.01,
 
529
  "learning_rate": 3.327832783278328e-05,
530
- "loss": 0.3605,
531
  "step": 810
532
  },
533
  {
534
  "epoch": 4.06,
 
535
  "learning_rate": 3.300330033003301e-05,
536
- "loss": 0.3797,
537
  "step": 820
538
  },
539
  {
540
  "epoch": 4.11,
 
541
  "learning_rate": 3.272827282728273e-05,
542
- "loss": 0.3701,
543
  "step": 830
544
  },
545
  {
546
  "epoch": 4.16,
 
547
  "learning_rate": 3.2453245324532453e-05,
548
- "loss": 0.3968,
549
  "step": 840
550
  },
551
  {
552
  "epoch": 4.21,
 
553
  "learning_rate": 3.217821782178218e-05,
554
- "loss": 0.3754,
555
  "step": 850
556
  },
557
  {
558
  "epoch": 4.26,
 
559
  "learning_rate": 3.1903190319031906e-05,
560
- "loss": 0.3962,
561
  "step": 860
562
  },
563
  {
564
  "epoch": 4.31,
 
565
  "learning_rate": 3.162816281628163e-05,
566
- "loss": 0.3723,
567
  "step": 870
568
  },
569
  {
570
  "epoch": 4.36,
 
571
  "learning_rate": 3.135313531353136e-05,
572
- "loss": 0.3743,
573
  "step": 880
574
  },
575
  {
576
  "epoch": 4.41,
 
577
  "learning_rate": 3.1078107810781085e-05,
578
- "loss": 0.4,
579
  "step": 890
580
  },
581
  {
582
  "epoch": 4.46,
 
583
  "learning_rate": 3.0803080308030804e-05,
584
- "loss": 0.3827,
585
  "step": 900
586
  },
587
  {
588
  "epoch": 4.5,
 
589
  "learning_rate": 3.052805280528053e-05,
590
- "loss": 0.3488,
591
  "step": 910
592
  },
593
  {
594
  "epoch": 4.55,
 
595
  "learning_rate": 3.0253025302530253e-05,
596
- "loss": 0.3755,
597
  "step": 920
598
  },
599
  {
600
  "epoch": 4.6,
 
601
  "learning_rate": 2.9977997799779976e-05,
602
- "loss": 0.3695,
603
  "step": 930
604
  },
605
  {
606
  "epoch": 4.65,
 
607
  "learning_rate": 2.9702970297029702e-05,
608
- "loss": 0.3803,
609
  "step": 940
610
  },
611
  {
612
  "epoch": 4.7,
 
613
  "learning_rate": 2.942794279427943e-05,
614
- "loss": 0.404,
615
  "step": 950
616
  },
617
  {
618
  "epoch": 4.75,
 
619
  "learning_rate": 2.9152915291529155e-05,
620
- "loss": 0.3711,
621
  "step": 960
622
  },
623
  {
624
  "epoch": 4.8,
 
625
  "learning_rate": 2.8877887788778878e-05,
626
- "loss": 0.3495,
627
  "step": 970
628
  },
629
  {
630
  "epoch": 4.85,
 
631
  "learning_rate": 2.8602860286028604e-05,
632
- "loss": 0.3644,
633
  "step": 980
634
  },
635
  {
636
  "epoch": 4.9,
 
637
  "learning_rate": 2.832783278327833e-05,
638
- "loss": 0.3673,
639
  "step": 990
640
  },
641
  {
642
  "epoch": 4.95,
 
643
  "learning_rate": 2.8052805280528056e-05,
644
- "loss": 0.3597,
645
  "step": 1000
646
  },
647
  {
648
  "epoch": 5.0,
 
649
  "learning_rate": 2.777777777777778e-05,
650
- "loss": 0.3801,
651
  "step": 1010
652
  },
653
  {
654
  "epoch": 5.0,
655
- "eval_accuracy": 0.8561476837338906,
656
- "eval_loss": 0.33033424615859985,
657
- "eval_runtime": 16.0262,
658
- "eval_samples_per_second": 179.144,
659
- "eval_steps_per_second": 5.616,
660
  "step": 1010
661
  },
662
  {
663
  "epoch": 5.05,
 
664
  "learning_rate": 2.7502750275027505e-05,
665
- "loss": 0.3717,
666
  "step": 1020
667
  },
668
  {
669
  "epoch": 5.1,
 
670
  "learning_rate": 2.722772277227723e-05,
671
- "loss": 0.3409,
672
  "step": 1030
673
  },
674
  {
675
  "epoch": 5.15,
 
676
  "learning_rate": 2.6952695269526958e-05,
677
- "loss": 0.3647,
678
  "step": 1040
679
  },
680
  {
681
  "epoch": 5.2,
 
682
  "learning_rate": 2.667766776677668e-05,
683
- "loss": 0.3454,
684
  "step": 1050
685
  },
686
  {
687
  "epoch": 5.25,
 
688
  "learning_rate": 2.64026402640264e-05,
689
- "loss": 0.3592,
690
  "step": 1060
691
  },
692
  {
693
  "epoch": 5.3,
 
694
  "learning_rate": 2.6127612761276126e-05,
695
- "loss": 0.3529,
696
  "step": 1070
697
  },
698
  {
699
  "epoch": 5.35,
 
700
  "learning_rate": 2.5852585258525853e-05,
701
- "loss": 0.3598,
702
  "step": 1080
703
  },
704
  {
705
  "epoch": 5.4,
 
706
  "learning_rate": 2.557755775577558e-05,
707
- "loss": 0.3922,
708
  "step": 1090
709
  },
710
  {
711
  "epoch": 5.45,
 
712
  "learning_rate": 2.53025302530253e-05,
713
- "loss": 0.3547,
714
  "step": 1100
715
  },
716
  {
717
  "epoch": 5.5,
 
718
  "learning_rate": 2.5027502750275028e-05,
719
- "loss": 0.3432,
720
  "step": 1110
721
  },
722
  {
723
  "epoch": 5.54,
 
724
  "learning_rate": 2.4752475247524754e-05,
725
- "loss": 0.3542,
726
  "step": 1120
727
  },
728
  {
729
  "epoch": 5.59,
 
730
  "learning_rate": 2.447744774477448e-05,
731
- "loss": 0.3541,
732
  "step": 1130
733
  },
734
  {
735
  "epoch": 5.64,
 
736
  "learning_rate": 2.4202420242024203e-05,
737
- "loss": 0.3475,
738
  "step": 1140
739
  },
740
  {
741
  "epoch": 5.69,
 
742
  "learning_rate": 2.392739273927393e-05,
743
- "loss": 0.3528,
744
  "step": 1150
745
  },
746
  {
747
  "epoch": 5.74,
 
748
  "learning_rate": 2.3652365236523656e-05,
749
- "loss": 0.3446,
750
  "step": 1160
751
  },
752
  {
753
  "epoch": 5.79,
 
754
  "learning_rate": 2.337733773377338e-05,
755
- "loss": 0.3264,
756
  "step": 1170
757
  },
758
  {
759
  "epoch": 5.84,
 
760
  "learning_rate": 2.31023102310231e-05,
761
- "loss": 0.3327,
762
  "step": 1180
763
  },
764
  {
765
  "epoch": 5.89,
 
766
  "learning_rate": 2.2827282728272828e-05,
767
- "loss": 0.3529,
768
  "step": 1190
769
  },
770
  {
771
  "epoch": 5.94,
 
772
  "learning_rate": 2.2552255225522554e-05,
773
- "loss": 0.3473,
774
  "step": 1200
775
  },
776
  {
777
  "epoch": 5.99,
 
778
  "learning_rate": 2.227722772277228e-05,
779
- "loss": 0.3612,
780
  "step": 1210
781
  },
782
  {
783
  "epoch": 6.0,
784
- "eval_accuracy": 0.8557993730407524,
785
- "eval_loss": 0.3362487852573395,
786
- "eval_runtime": 15.9082,
787
- "eval_samples_per_second": 180.474,
788
- "eval_steps_per_second": 5.657,
789
  "step": 1212
790
  },
791
  {
792
  "epoch": 6.04,
 
793
  "learning_rate": 2.2002200220022003e-05,
794
- "loss": 0.3324,
795
  "step": 1220
796
  },
797
  {
798
  "epoch": 6.09,
 
799
  "learning_rate": 2.172717271727173e-05,
800
- "loss": 0.3365,
801
  "step": 1230
802
  },
803
  {
804
  "epoch": 6.14,
 
805
  "learning_rate": 2.1452145214521452e-05,
806
- "loss": 0.3212,
807
  "step": 1240
808
  },
809
  {
810
  "epoch": 6.19,
 
811
  "learning_rate": 2.1177117711771178e-05,
812
- "loss": 0.33,
813
  "step": 1250
814
  },
815
  {
816
  "epoch": 6.24,
 
817
  "learning_rate": 2.09020902090209e-05,
818
- "loss": 0.3627,
819
  "step": 1260
820
  },
821
  {
822
  "epoch": 6.29,
 
823
  "learning_rate": 2.0627062706270627e-05,
824
- "loss": 0.3322,
825
  "step": 1270
826
  },
827
  {
828
  "epoch": 6.34,
 
829
  "learning_rate": 2.0352035203520354e-05,
830
- "loss": 0.3236,
831
  "step": 1280
832
  },
833
  {
834
  "epoch": 6.39,
 
835
  "learning_rate": 2.007700770077008e-05,
836
- "loss": 0.3398,
837
  "step": 1290
838
  },
839
  {
840
  "epoch": 6.44,
 
841
  "learning_rate": 1.9801980198019803e-05,
842
- "loss": 0.3481,
843
  "step": 1300
844
  },
845
  {
846
  "epoch": 6.49,
 
847
  "learning_rate": 1.952695269526953e-05,
848
- "loss": 0.3421,
849
  "step": 1310
850
  },
851
  {
852
  "epoch": 6.53,
 
853
  "learning_rate": 1.9251925192519252e-05,
854
- "loss": 0.3136,
855
  "step": 1320
856
  },
857
  {
858
  "epoch": 6.58,
 
859
  "learning_rate": 1.8976897689768978e-05,
860
- "loss": 0.3432,
861
  "step": 1330
862
  },
863
  {
864
  "epoch": 6.63,
 
865
  "learning_rate": 1.87018701870187e-05,
866
- "loss": 0.3277,
867
  "step": 1340
868
  },
869
  {
870
  "epoch": 6.68,
 
871
  "learning_rate": 1.8426842684268427e-05,
872
- "loss": 0.3285,
873
  "step": 1350
874
  },
875
  {
876
  "epoch": 6.73,
 
877
  "learning_rate": 1.8151815181518153e-05,
878
- "loss": 0.3449,
879
  "step": 1360
880
  },
881
  {
882
  "epoch": 6.78,
 
883
  "learning_rate": 1.787678767876788e-05,
884
- "loss": 0.3433,
885
  "step": 1370
886
  },
887
  {
888
  "epoch": 6.83,
 
889
  "learning_rate": 1.7601760176017602e-05,
890
- "loss": 0.3275,
891
  "step": 1380
892
  },
893
  {
894
  "epoch": 6.88,
 
895
  "learning_rate": 1.7326732673267325e-05,
896
- "loss": 0.3351,
897
  "step": 1390
898
  },
899
  {
900
  "epoch": 6.93,
 
901
  "learning_rate": 1.705170517051705e-05,
902
- "loss": 0.2946,
903
  "step": 1400
904
  },
905
  {
906
  "epoch": 6.98,
 
907
  "learning_rate": 1.6776677667766778e-05,
908
- "loss": 0.3504,
909
  "step": 1410
910
  },
911
  {
912
  "epoch": 7.0,
913
- "eval_accuracy": 0.8652037617554859,
914
- "eval_loss": 0.330201655626297,
915
- "eval_runtime": 15.6907,
916
- "eval_samples_per_second": 182.974,
917
- "eval_steps_per_second": 5.736,
918
  "step": 1414
919
  },
920
  {
921
  "epoch": 7.03,
 
922
  "learning_rate": 1.6501650165016504e-05,
923
- "loss": 0.3424,
924
  "step": 1420
925
  },
926
  {
927
  "epoch": 7.08,
 
928
  "learning_rate": 1.6226622662266227e-05,
929
- "loss": 0.3244,
930
  "step": 1430
931
  },
932
  {
933
  "epoch": 7.13,
 
934
  "learning_rate": 1.5951595159515953e-05,
935
- "loss": 0.3208,
936
  "step": 1440
937
  },
938
  {
939
  "epoch": 7.18,
 
940
  "learning_rate": 1.567656765676568e-05,
941
- "loss": 0.3002,
942
  "step": 1450
943
  },
944
  {
945
  "epoch": 7.23,
 
946
  "learning_rate": 1.5401540154015402e-05,
947
- "loss": 0.3007,
948
  "step": 1460
949
  },
950
  {
951
  "epoch": 7.28,
 
952
  "learning_rate": 1.5126512651265127e-05,
953
- "loss": 0.3443,
954
  "step": 1470
955
  },
956
  {
957
  "epoch": 7.33,
 
958
  "learning_rate": 1.4851485148514851e-05,
959
- "loss": 0.3192,
960
  "step": 1480
961
  },
962
  {
963
  "epoch": 7.38,
 
964
  "learning_rate": 1.4576457645764577e-05,
965
- "loss": 0.3146,
966
  "step": 1490
967
  },
968
  {
969
  "epoch": 7.43,
 
970
  "learning_rate": 1.4301430143014302e-05,
971
- "loss": 0.3147,
972
  "step": 1500
973
  },
974
  {
975
  "epoch": 7.48,
 
976
  "learning_rate": 1.4026402640264028e-05,
977
- "loss": 0.3041,
978
  "step": 1510
979
  },
980
  {
981
  "epoch": 7.52,
 
982
  "learning_rate": 1.3751375137513753e-05,
983
- "loss": 0.2958,
984
  "step": 1520
985
  },
986
  {
987
  "epoch": 7.57,
 
988
  "learning_rate": 1.3476347634763479e-05,
989
- "loss": 0.3235,
990
  "step": 1530
991
  },
992
  {
993
  "epoch": 7.62,
 
994
  "learning_rate": 1.32013201320132e-05,
995
- "loss": 0.3137,
996
  "step": 1540
997
  },
998
  {
999
  "epoch": 7.67,
 
1000
  "learning_rate": 1.2926292629262926e-05,
1001
- "loss": 0.3222,
1002
  "step": 1550
1003
  },
1004
  {
1005
  "epoch": 7.72,
 
1006
  "learning_rate": 1.265126512651265e-05,
1007
- "loss": 0.3087,
1008
  "step": 1560
1009
  },
1010
  {
1011
  "epoch": 7.77,
 
1012
  "learning_rate": 1.2376237623762377e-05,
1013
- "loss": 0.2969,
1014
  "step": 1570
1015
  },
1016
  {
1017
  "epoch": 7.82,
 
1018
  "learning_rate": 1.2101210121012102e-05,
1019
- "loss": 0.3091,
1020
  "step": 1580
1021
  },
1022
  {
1023
  "epoch": 7.87,
 
1024
  "learning_rate": 1.1826182618261828e-05,
1025
- "loss": 0.3356,
1026
  "step": 1590
1027
  },
1028
  {
1029
  "epoch": 7.92,
 
1030
  "learning_rate": 1.155115511551155e-05,
1031
- "loss": 0.2903,
1032
  "step": 1600
1033
  },
1034
  {
1035
  "epoch": 7.97,
 
1036
  "learning_rate": 1.1276127612761277e-05,
1037
- "loss": 0.3366,
1038
  "step": 1610
1039
  },
1040
  {
1041
  "epoch": 8.0,
1042
- "eval_accuracy": 0.8683385579937304,
1043
- "eval_loss": 0.3320719301700592,
1044
- "eval_runtime": 15.9857,
1045
- "eval_samples_per_second": 179.598,
1046
- "eval_steps_per_second": 5.63,
1047
  "step": 1616
1048
  },
1049
  {
1050
  "epoch": 8.02,
 
1051
  "learning_rate": 1.1001100110011001e-05,
1052
- "loss": 0.2956,
1053
  "step": 1620
1054
  },
1055
  {
1056
  "epoch": 8.07,
 
1057
  "learning_rate": 1.0726072607260726e-05,
1058
- "loss": 0.2987,
1059
  "step": 1630
1060
  },
1061
  {
1062
  "epoch": 8.12,
 
1063
  "learning_rate": 1.045104510451045e-05,
1064
- "loss": 0.2965,
1065
  "step": 1640
1066
  },
1067
  {
1068
  "epoch": 8.17,
 
1069
  "learning_rate": 1.0176017601760177e-05,
1070
- "loss": 0.3014,
1071
  "step": 1650
1072
  },
1073
  {
1074
  "epoch": 8.22,
 
1075
  "learning_rate": 9.900990099009901e-06,
1076
- "loss": 0.3267,
1077
  "step": 1660
1078
  },
1079
  {
1080
  "epoch": 8.27,
 
1081
  "learning_rate": 9.625962596259626e-06,
1082
- "loss": 0.3243,
1083
  "step": 1670
1084
  },
1085
  {
1086
  "epoch": 8.32,
 
1087
  "learning_rate": 9.35093509350935e-06,
1088
- "loss": 0.2841,
1089
  "step": 1680
1090
  },
1091
  {
1092
  "epoch": 8.37,
 
1093
  "learning_rate": 9.075907590759077e-06,
1094
- "loss": 0.2949,
1095
  "step": 1690
1096
  },
1097
  {
1098
  "epoch": 8.42,
 
1099
  "learning_rate": 8.800880088008801e-06,
1100
- "loss": 0.2944,
1101
  "step": 1700
1102
  },
1103
  {
1104
  "epoch": 8.47,
 
1105
  "learning_rate": 8.525852585258526e-06,
1106
- "loss": 0.3137,
1107
  "step": 1710
1108
  },
1109
  {
1110
  "epoch": 8.51,
 
1111
  "learning_rate": 8.250825082508252e-06,
1112
- "loss": 0.2825,
1113
  "step": 1720
1114
  },
1115
  {
1116
  "epoch": 8.56,
 
1117
  "learning_rate": 7.975797579757976e-06,
1118
- "loss": 0.3012,
1119
  "step": 1730
1120
  },
1121
  {
1122
  "epoch": 8.61,
 
1123
  "learning_rate": 7.700770077007701e-06,
1124
- "loss": 0.3221,
1125
  "step": 1740
1126
  },
1127
  {
1128
  "epoch": 8.66,
 
1129
  "learning_rate": 7.4257425742574256e-06,
1130
- "loss": 0.2977,
1131
  "step": 1750
1132
  },
1133
  {
1134
  "epoch": 8.71,
 
1135
  "learning_rate": 7.150715071507151e-06,
1136
- "loss": 0.316,
1137
  "step": 1760
1138
  },
1139
  {
1140
  "epoch": 8.76,
 
1141
  "learning_rate": 6.875687568756876e-06,
1142
- "loss": 0.3023,
1143
  "step": 1770
1144
  },
1145
  {
1146
  "epoch": 8.81,
 
1147
  "learning_rate": 6.6006600660066e-06,
1148
- "loss": 0.2937,
1149
  "step": 1780
1150
  },
1151
  {
1152
  "epoch": 8.86,
 
1153
  "learning_rate": 6.325632563256325e-06,
1154
- "loss": 0.2825,
1155
  "step": 1790
1156
  },
1157
  {
1158
  "epoch": 8.91,
 
1159
  "learning_rate": 6.050605060506051e-06,
1160
- "loss": 0.2907,
1161
  "step": 1800
1162
  },
1163
  {
1164
  "epoch": 8.96,
 
1165
  "learning_rate": 5.775577557755775e-06,
1166
- "loss": 0.3007,
1167
  "step": 1810
1168
  },
1169
  {
1170
  "epoch": 9.0,
1171
- "eval_accuracy": 0.866597004528039,
1172
- "eval_loss": 0.33304381370544434,
1173
- "eval_runtime": 16.241,
1174
- "eval_samples_per_second": 176.775,
1175
- "eval_steps_per_second": 5.542,
1176
  "step": 1818
1177
  },
1178
  {
1179
  "epoch": 9.01,
 
1180
  "learning_rate": 5.500550055005501e-06,
1181
- "loss": 0.297,
1182
  "step": 1820
1183
  },
1184
  {
1185
  "epoch": 9.06,
 
1186
  "learning_rate": 5.225522552255225e-06,
1187
- "loss": 0.2894,
1188
  "step": 1830
1189
  },
1190
  {
1191
  "epoch": 9.11,
 
1192
  "learning_rate": 4.950495049504951e-06,
1193
- "loss": 0.2927,
1194
  "step": 1840
1195
  },
1196
  {
1197
  "epoch": 9.16,
 
1198
  "learning_rate": 4.675467546754675e-06,
1199
- "loss": 0.2884,
1200
  "step": 1850
1201
  },
1202
  {
1203
  "epoch": 9.21,
 
1204
  "learning_rate": 4.400440044004401e-06,
1205
- "loss": 0.3217,
1206
  "step": 1860
1207
  },
1208
  {
1209
  "epoch": 9.26,
 
1210
  "learning_rate": 4.125412541254126e-06,
1211
- "loss": 0.3278,
1212
  "step": 1870
1213
  },
1214
  {
1215
  "epoch": 9.31,
 
1216
  "learning_rate": 3.8503850385038505e-06,
1217
- "loss": 0.3126,
1218
  "step": 1880
1219
  },
1220
  {
1221
  "epoch": 9.36,
 
1222
  "learning_rate": 3.5753575357535755e-06,
1223
- "loss": 0.2997,
1224
  "step": 1890
1225
  },
1226
  {
1227
  "epoch": 9.41,
 
1228
  "learning_rate": 3.3003300330033e-06,
1229
- "loss": 0.279,
1230
  "step": 1900
1231
  },
1232
  {
1233
  "epoch": 9.46,
 
1234
  "learning_rate": 3.0253025302530254e-06,
1235
- "loss": 0.2686,
1236
  "step": 1910
1237
  },
1238
  {
1239
  "epoch": 9.5,
 
1240
  "learning_rate": 2.7502750275027504e-06,
1241
- "loss": 0.2811,
1242
  "step": 1920
1243
  },
1244
  {
1245
  "epoch": 9.55,
 
1246
  "learning_rate": 2.4752475247524753e-06,
1247
- "loss": 0.2866,
1248
  "step": 1930
1249
  },
1250
  {
1251
  "epoch": 9.6,
 
1252
  "learning_rate": 2.2002200220022003e-06,
1253
- "loss": 0.2746,
1254
  "step": 1940
1255
  },
1256
  {
1257
  "epoch": 9.65,
 
1258
  "learning_rate": 1.9251925192519253e-06,
1259
- "loss": 0.2565,
1260
  "step": 1950
1261
  },
1262
  {
1263
  "epoch": 9.7,
 
1264
  "learning_rate": 1.65016501650165e-06,
1265
- "loss": 0.3004,
1266
  "step": 1960
1267
  },
1268
  {
1269
  "epoch": 9.75,
 
1270
  "learning_rate": 1.3751375137513752e-06,
1271
- "loss": 0.3053,
1272
  "step": 1970
1273
  },
1274
  {
1275
  "epoch": 9.8,
 
1276
  "learning_rate": 1.1001100110011001e-06,
1277
- "loss": 0.2805,
1278
  "step": 1980
1279
  },
1280
  {
1281
  "epoch": 9.85,
 
1282
  "learning_rate": 8.25082508250825e-07,
1283
- "loss": 0.2817,
1284
  "step": 1990
1285
  },
1286
  {
1287
  "epoch": 9.9,
 
1288
  "learning_rate": 5.500550055005501e-07,
1289
- "loss": 0.3138,
1290
  "step": 2000
1291
  },
1292
  {
1293
  "epoch": 9.95,
 
1294
  "learning_rate": 2.7502750275027504e-07,
1295
- "loss": 0.2733,
1296
  "step": 2010
1297
  },
1298
  {
1299
  "epoch": 10.0,
 
1300
  "learning_rate": 0.0,
1301
- "loss": 0.3089,
1302
  "step": 2020
1303
  },
1304
  {
1305
  "epoch": 10.0,
1306
- "eval_accuracy": 0.8655520724486242,
1307
- "eval_loss": 0.3326851427555084,
1308
- "eval_runtime": 15.8375,
1309
- "eval_samples_per_second": 181.279,
1310
- "eval_steps_per_second": 5.683,
1311
  "step": 2020
1312
  },
1313
  {
1314
  "epoch": 10.0,
1315
  "step": 2020,
1316
  "total_flos": 2.0021605356722135e+19,
1317
- "train_loss": 0.3826778081384036,
1318
- "train_runtime": 3330.0466,
1319
- "train_samples_per_second": 77.588,
1320
- "train_steps_per_second": 0.607
1321
  }
1322
  ],
1323
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.8732149076976663,
3
+ "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned-FER2013/checkpoint-2020",
4
  "epoch": 10.0,
5
  "eval_steps": 500,
6
  "global_step": 2020,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.05,
13
+ "grad_norm": 0.699901819229126,
14
  "learning_rate": 2.4752475247524753e-06,
15
+ "loss": 0.7003,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.1,
20
+ "grad_norm": 0.40010973811149597,
21
  "learning_rate": 4.950495049504951e-06,
22
+ "loss": 0.6978,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.15,
27
+ "grad_norm": 0.3860187530517578,
28
  "learning_rate": 7.4257425742574256e-06,
29
  "loss": 0.6891,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.2,
34
+ "grad_norm": 0.38579830527305603,
35
  "learning_rate": 9.900990099009901e-06,
36
+ "loss": 0.6837,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.25,
41
+ "grad_norm": 0.5694485306739807,
42
  "learning_rate": 1.2376237623762377e-05,
43
+ "loss": 0.6643,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.3,
48
+ "grad_norm": 0.5950794816017151,
49
  "learning_rate": 1.4851485148514851e-05,
50
+ "loss": 0.642,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.35,
55
+ "grad_norm": 0.7590372562408447,
56
  "learning_rate": 1.7326732673267325e-05,
57
+ "loss": 0.6213,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.4,
62
+ "grad_norm": 0.7452055215835571,
63
  "learning_rate": 1.9801980198019803e-05,
64
+ "loss": 0.6079,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.45,
69
+ "grad_norm": 1.1108556985855103,
70
  "learning_rate": 2.227722772277228e-05,
71
+ "loss": 0.568,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.5,
76
+ "grad_norm": 2.0815961360931396,
77
  "learning_rate": 2.4752475247524754e-05,
78
+ "loss": 0.5555,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.54,
83
+ "grad_norm": 2.206498146057129,
84
  "learning_rate": 2.722772277227723e-05,
85
+ "loss": 0.5587,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.59,
90
+ "grad_norm": 0.8447802066802979,
91
  "learning_rate": 2.9702970297029702e-05,
92
+ "loss": 0.5502,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.64,
97
+ "grad_norm": 0.9129859805107117,
98
  "learning_rate": 3.217821782178218e-05,
99
+ "loss": 0.5442,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.69,
104
+ "grad_norm": 1.3158477544784546,
105
  "learning_rate": 3.465346534653465e-05,
106
+ "loss": 0.509,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.74,
111
+ "grad_norm": 1.6786607503890991,
112
  "learning_rate": 3.712871287128713e-05,
113
+ "loss": 0.5129,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.79,
118
+ "grad_norm": 1.0716958045959473,
119
  "learning_rate": 3.9603960396039605e-05,
120
+ "loss": 0.5043,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.84,
125
+ "grad_norm": 1.8416863679885864,
126
  "learning_rate": 4.207920792079208e-05,
127
+ "loss": 0.4766,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.89,
132
+ "grad_norm": 2.109766721725464,
133
  "learning_rate": 4.455445544554456e-05,
134
+ "loss": 0.4746,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.94,
139
+ "grad_norm": 1.7481091022491455,
140
  "learning_rate": 4.702970297029703e-05,
141
+ "loss": 0.5162,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.99,
146
+ "grad_norm": 1.037062406539917,
147
  "learning_rate": 4.950495049504951e-05,
148
+ "loss": 0.4811,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 1.0,
153
+ "eval_accuracy": 0.800417972831766,
154
+ "eval_loss": 0.43151769042015076,
155
+ "eval_runtime": 15.3075,
156
+ "eval_samples_per_second": 187.555,
157
+ "eval_steps_per_second": 5.879,
158
  "step": 202
159
  },
160
  {
161
  "epoch": 1.04,
162
+ "grad_norm": 1.0516308546066284,
163
  "learning_rate": 4.977997799779978e-05,
164
+ "loss": 0.4625,
165
  "step": 210
166
  },
167
  {
168
  "epoch": 1.09,
169
+ "grad_norm": 1.0761890411376953,
170
  "learning_rate": 4.950495049504951e-05,
171
+ "loss": 0.4659,
172
  "step": 220
173
  },
174
  {
175
  "epoch": 1.14,
176
+ "grad_norm": 1.9692820310592651,
177
  "learning_rate": 4.9229922992299234e-05,
178
+ "loss": 0.4893,
179
  "step": 230
180
  },
181
  {
182
  "epoch": 1.19,
183
+ "grad_norm": 0.9031820893287659,
184
  "learning_rate": 4.895489548954896e-05,
185
+ "loss": 0.4801,
186
  "step": 240
187
  },
188
  {
189
  "epoch": 1.24,
190
+ "grad_norm": 1.241443395614624,
191
  "learning_rate": 4.867986798679868e-05,
192
+ "loss": 0.465,
193
  "step": 250
194
  },
195
  {
196
  "epoch": 1.29,
197
+ "grad_norm": 1.1679638624191284,
198
  "learning_rate": 4.8404840484048406e-05,
199
+ "loss": 0.4396,
200
  "step": 260
201
  },
202
  {
203
  "epoch": 1.34,
204
+ "grad_norm": 1.4820034503936768,
205
  "learning_rate": 4.812981298129813e-05,
206
+ "loss": 0.4573,
207
  "step": 270
208
  },
209
  {
210
  "epoch": 1.39,
211
+ "grad_norm": 1.0649698972702026,
212
  "learning_rate": 4.785478547854786e-05,
213
+ "loss": 0.4616,
214
  "step": 280
215
  },
216
  {
217
  "epoch": 1.44,
218
+ "grad_norm": 1.2591346502304077,
219
  "learning_rate": 4.7579757975797585e-05,
220
+ "loss": 0.4661,
221
  "step": 290
222
  },
223
  {
224
  "epoch": 1.49,
225
+ "grad_norm": 1.9309344291687012,
226
  "learning_rate": 4.730473047304731e-05,
227
+ "loss": 0.4792,
228
  "step": 300
229
  },
230
  {
231
  "epoch": 1.53,
232
+ "grad_norm": 0.9727767109870911,
233
  "learning_rate": 4.702970297029703e-05,
234
+ "loss": 0.471,
235
  "step": 310
236
  },
237
  {
238
  "epoch": 1.58,
239
+ "grad_norm": 1.0227768421173096,
240
  "learning_rate": 4.675467546754676e-05,
241
+ "loss": 0.4779,
242
  "step": 320
243
  },
244
  {
245
  "epoch": 1.63,
246
+ "grad_norm": 1.7914155721664429,
247
  "learning_rate": 4.647964796479648e-05,
248
+ "loss": 0.4552,
249
  "step": 330
250
  },
251
  {
252
  "epoch": 1.68,
253
+ "grad_norm": 1.1211057901382446,
254
  "learning_rate": 4.62046204620462e-05,
255
+ "loss": 0.4542,
256
  "step": 340
257
  },
258
  {
259
  "epoch": 1.73,
260
+ "grad_norm": 0.8994653820991516,
261
  "learning_rate": 4.592959295929593e-05,
262
+ "loss": 0.469,
263
  "step": 350
264
  },
265
  {
266
  "epoch": 1.78,
267
+ "grad_norm": 1.6254159212112427,
268
  "learning_rate": 4.5654565456545655e-05,
269
+ "loss": 0.4209,
270
  "step": 360
271
  },
272
  {
273
  "epoch": 1.83,
274
+ "grad_norm": 1.175451397895813,
275
  "learning_rate": 4.537953795379538e-05,
276
+ "loss": 0.4707,
277
  "step": 370
278
  },
279
  {
280
  "epoch": 1.88,
281
+ "grad_norm": 1.6570440530776978,
282
  "learning_rate": 4.510451045104511e-05,
283
+ "loss": 0.4587,
284
  "step": 380
285
  },
286
  {
287
  "epoch": 1.93,
288
+ "grad_norm": 1.0916353464126587,
289
  "learning_rate": 4.4829482948294834e-05,
290
+ "loss": 0.4507,
291
  "step": 390
292
  },
293
  {
294
  "epoch": 1.98,
295
+ "grad_norm": 1.0758109092712402,
296
  "learning_rate": 4.455445544554456e-05,
297
+ "loss": 0.4287,
298
  "step": 400
299
  },
300
  {
301
  "epoch": 2.0,
302
+ "eval_accuracy": 0.8432601880877743,
303
+ "eval_loss": 0.357921838760376,
304
+ "eval_runtime": 15.3987,
305
+ "eval_samples_per_second": 186.444,
306
+ "eval_steps_per_second": 5.845,
307
  "step": 404
308
  },
309
  {
310
  "epoch": 2.03,
311
+ "grad_norm": 0.7765569686889648,
312
  "learning_rate": 4.427942794279428e-05,
313
+ "loss": 0.4157,
314
  "step": 410
315
  },
316
  {
317
  "epoch": 2.08,
318
+ "grad_norm": 1.1473923921585083,
319
  "learning_rate": 4.4004400440044006e-05,
320
+ "loss": 0.4227,
321
  "step": 420
322
  },
323
  {
324
  "epoch": 2.13,
325
+ "grad_norm": 7.452829837799072,
326
  "learning_rate": 4.372937293729373e-05,
327
+ "loss": 0.4033,
328
  "step": 430
329
  },
330
  {
331
  "epoch": 2.18,
332
+ "grad_norm": 1.391101360321045,
333
  "learning_rate": 4.345434543454346e-05,
334
+ "loss": 0.4563,
335
  "step": 440
336
  },
337
  {
338
  "epoch": 2.23,
339
+ "grad_norm": 1.0371557474136353,
340
  "learning_rate": 4.3179317931793185e-05,
341
+ "loss": 0.4487,
342
  "step": 450
343
  },
344
  {
345
  "epoch": 2.28,
346
+ "grad_norm": 0.998115062713623,
347
  "learning_rate": 4.2904290429042904e-05,
348
+ "loss": 0.4178,
349
  "step": 460
350
  },
351
  {
352
  "epoch": 2.33,
353
+ "grad_norm": 1.1916583776474,
354
  "learning_rate": 4.262926292629263e-05,
355
+ "loss": 0.4361,
356
  "step": 470
357
  },
358
  {
359
  "epoch": 2.38,
360
+ "grad_norm": 1.2483221292495728,
361
  "learning_rate": 4.2354235423542356e-05,
362
+ "loss": 0.4317,
363
  "step": 480
364
  },
365
  {
366
  "epoch": 2.43,
367
+ "grad_norm": 1.2603436708450317,
368
  "learning_rate": 4.207920792079208e-05,
369
+ "loss": 0.4252,
370
  "step": 490
371
  },
372
  {
373
  "epoch": 2.48,
374
+ "grad_norm": 1.614512324333191,
375
  "learning_rate": 4.18041804180418e-05,
376
+ "loss": 0.4213,
377
  "step": 500
378
  },
379
  {
380
  "epoch": 2.52,
381
+ "grad_norm": 1.5845041275024414,
382
  "learning_rate": 4.152915291529153e-05,
383
+ "loss": 0.4276,
384
  "step": 510
385
  },
386
  {
387
  "epoch": 2.57,
388
+ "grad_norm": 0.9629144072532654,
389
  "learning_rate": 4.1254125412541255e-05,
390
+ "loss": 0.4168,
391
  "step": 520
392
  },
393
  {
394
  "epoch": 2.62,
395
+ "grad_norm": 1.0476891994476318,
396
  "learning_rate": 4.097909790979098e-05,
397
+ "loss": 0.4166,
398
  "step": 530
399
  },
400
  {
401
  "epoch": 2.67,
402
+ "grad_norm": 1.2000118494033813,
403
  "learning_rate": 4.070407040704071e-05,
404
+ "loss": 0.4238,
405
  "step": 540
406
  },
407
  {
408
  "epoch": 2.72,
409
+ "grad_norm": 1.608628511428833,
410
  "learning_rate": 4.042904290429043e-05,
411
+ "loss": 0.3974,
412
  "step": 550
413
  },
414
  {
415
  "epoch": 2.77,
416
+ "grad_norm": 2.0195207595825195,
417
  "learning_rate": 4.015401540154016e-05,
418
+ "loss": 0.4087,
419
  "step": 560
420
  },
421
  {
422
  "epoch": 2.82,
423
+ "grad_norm": 1.2935571670532227,
424
  "learning_rate": 3.987898789878988e-05,
425
+ "loss": 0.396,
426
  "step": 570
427
  },
428
  {
429
  "epoch": 2.87,
430
+ "grad_norm": 1.2056093215942383,
431
  "learning_rate": 3.9603960396039605e-05,
432
+ "loss": 0.4191,
433
  "step": 580
434
  },
435
  {
436
  "epoch": 2.92,
437
+ "grad_norm": 0.9370687007904053,
438
  "learning_rate": 3.932893289328933e-05,
439
+ "loss": 0.4323,
440
  "step": 590
441
  },
442
  {
443
  "epoch": 2.97,
444
+ "grad_norm": 0.9331934452056885,
445
  "learning_rate": 3.905390539053906e-05,
446
+ "loss": 0.4184,
447
  "step": 600
448
  },
449
  {
450
  "epoch": 3.0,
451
+ "eval_accuracy": 0.8467432950191571,
452
+ "eval_loss": 0.35166114568710327,
453
+ "eval_runtime": 15.3286,
454
+ "eval_samples_per_second": 187.297,
455
+ "eval_steps_per_second": 5.871,
456
  "step": 606
457
  },
458
  {
459
  "epoch": 3.02,
460
+ "grad_norm": 1.35804283618927,
461
  "learning_rate": 3.877887788778878e-05,
462
+ "loss": 0.4034,
463
  "step": 610
464
  },
465
  {
466
  "epoch": 3.07,
467
+ "grad_norm": 1.5728434324264526,
468
  "learning_rate": 3.8503850385038503e-05,
469
+ "loss": 0.3917,
470
  "step": 620
471
  },
472
  {
473
  "epoch": 3.12,
474
+ "grad_norm": 1.0493104457855225,
475
  "learning_rate": 3.822882288228823e-05,
476
+ "loss": 0.416,
477
  "step": 630
478
  },
479
  {
480
  "epoch": 3.17,
481
+ "grad_norm": 1.2043938636779785,
482
  "learning_rate": 3.7953795379537956e-05,
483
+ "loss": 0.3913,
484
  "step": 640
485
  },
486
  {
487
  "epoch": 3.22,
488
+ "grad_norm": 1.6263302564620972,
489
  "learning_rate": 3.767876787678768e-05,
490
+ "loss": 0.3815,
491
  "step": 650
492
  },
493
  {
494
  "epoch": 3.27,
495
+ "grad_norm": 1.8161449432373047,
496
  "learning_rate": 3.74037403740374e-05,
497
+ "loss": 0.3802,
498
  "step": 660
499
  },
500
  {
501
  "epoch": 3.32,
502
+ "grad_norm": 1.2450302839279175,
503
  "learning_rate": 3.712871287128713e-05,
504
+ "loss": 0.387,
505
  "step": 670
506
  },
507
  {
508
  "epoch": 3.37,
509
+ "grad_norm": 1.1230440139770508,
510
  "learning_rate": 3.6853685368536854e-05,
511
+ "loss": 0.4023,
512
  "step": 680
513
  },
514
  {
515
  "epoch": 3.42,
516
+ "grad_norm": 1.3349334001541138,
517
  "learning_rate": 3.657865786578658e-05,
518
+ "loss": 0.4092,
519
  "step": 690
520
  },
521
  {
522
  "epoch": 3.47,
523
+ "grad_norm": 1.1348090171813965,
524
  "learning_rate": 3.6303630363036307e-05,
525
+ "loss": 0.3724,
526
  "step": 700
527
  },
528
  {
529
  "epoch": 3.51,
530
+ "grad_norm": 1.1959893703460693,
531
  "learning_rate": 3.602860286028603e-05,
532
+ "loss": 0.415,
533
  "step": 710
534
  },
535
  {
536
  "epoch": 3.56,
537
+ "grad_norm": 1.0938156843185425,
538
  "learning_rate": 3.575357535753576e-05,
539
+ "loss": 0.3931,
540
  "step": 720
541
  },
542
  {
543
  "epoch": 3.61,
544
+ "grad_norm": 1.199876308441162,
545
  "learning_rate": 3.5478547854785485e-05,
546
+ "loss": 0.3817,
547
  "step": 730
548
  },
549
  {
550
  "epoch": 3.66,
551
+ "grad_norm": 1.3452043533325195,
552
  "learning_rate": 3.5203520352035205e-05,
553
+ "loss": 0.3897,
554
  "step": 740
555
  },
556
  {
557
  "epoch": 3.71,
558
+ "grad_norm": 1.3791470527648926,
559
  "learning_rate": 3.492849284928493e-05,
560
+ "loss": 0.3828,
561
  "step": 750
562
  },
563
  {
564
  "epoch": 3.76,
565
+ "grad_norm": 1.7758994102478027,
566
  "learning_rate": 3.465346534653465e-05,
567
+ "loss": 0.382,
568
  "step": 760
569
  },
570
  {
571
  "epoch": 3.81,
572
+ "grad_norm": 1.0924941301345825,
573
  "learning_rate": 3.4378437843784377e-05,
574
+ "loss": 0.4048,
575
  "step": 770
576
  },
577
  {
578
  "epoch": 3.86,
579
+ "grad_norm": 1.1729379892349243,
580
  "learning_rate": 3.41034103410341e-05,
581
+ "loss": 0.3716,
582
  "step": 780
583
  },
584
  {
585
  "epoch": 3.91,
586
+ "grad_norm": 1.1679847240447998,
587
  "learning_rate": 3.382838283828383e-05,
588
+ "loss": 0.3726,
589
  "step": 790
590
  },
591
  {
592
  "epoch": 3.96,
593
+ "grad_norm": 1.166401743888855,
594
  "learning_rate": 3.3553355335533555e-05,
595
+ "loss": 0.3931,
596
  "step": 800
597
  },
598
  {
599
  "epoch": 4.0,
600
+ "eval_accuracy": 0.855451062347614,
601
+ "eval_loss": 0.33083683252334595,
602
+ "eval_runtime": 15.5188,
603
+ "eval_samples_per_second": 185.001,
604
+ "eval_steps_per_second": 5.799,
605
  "step": 808
606
  },
607
  {
608
  "epoch": 4.01,
609
+ "grad_norm": 1.225820541381836,
610
  "learning_rate": 3.327832783278328e-05,
611
+ "loss": 0.3976,
612
  "step": 810
613
  },
614
  {
615
  "epoch": 4.06,
616
+ "grad_norm": 1.0681790113449097,
617
  "learning_rate": 3.300330033003301e-05,
618
+ "loss": 0.3641,
619
  "step": 820
620
  },
621
  {
622
  "epoch": 4.11,
623
+ "grad_norm": 1.3162754774093628,
624
  "learning_rate": 3.272827282728273e-05,
625
+ "loss": 0.3601,
626
  "step": 830
627
  },
628
  {
629
  "epoch": 4.16,
630
+ "grad_norm": 1.5509825944900513,
631
  "learning_rate": 3.2453245324532453e-05,
632
+ "loss": 0.3777,
633
  "step": 840
634
  },
635
  {
636
  "epoch": 4.21,
637
+ "grad_norm": 1.4965535402297974,
638
  "learning_rate": 3.217821782178218e-05,
639
+ "loss": 0.3662,
640
  "step": 850
641
  },
642
  {
643
  "epoch": 4.26,
644
+ "grad_norm": 1.846308946609497,
645
  "learning_rate": 3.1903190319031906e-05,
646
+ "loss": 0.3755,
647
  "step": 860
648
  },
649
  {
650
  "epoch": 4.31,
651
+ "grad_norm": 1.008911371231079,
652
  "learning_rate": 3.162816281628163e-05,
653
+ "loss": 0.3283,
654
  "step": 870
655
  },
656
  {
657
  "epoch": 4.36,
658
+ "grad_norm": 1.7026519775390625,
659
  "learning_rate": 3.135313531353136e-05,
660
+ "loss": 0.3782,
661
  "step": 880
662
  },
663
  {
664
  "epoch": 4.41,
665
+ "grad_norm": 1.3682829141616821,
666
  "learning_rate": 3.1078107810781085e-05,
667
+ "loss": 0.3943,
668
  "step": 890
669
  },
670
  {
671
  "epoch": 4.46,
672
+ "grad_norm": 1.2787247896194458,
673
  "learning_rate": 3.0803080308030804e-05,
674
+ "loss": 0.3651,
675
  "step": 900
676
  },
677
  {
678
  "epoch": 4.5,
679
+ "grad_norm": 1.2230846881866455,
680
  "learning_rate": 3.052805280528053e-05,
681
+ "loss": 0.3489,
682
  "step": 910
683
  },
684
  {
685
  "epoch": 4.55,
686
+ "grad_norm": 1.4894435405731201,
687
  "learning_rate": 3.0253025302530253e-05,
688
+ "loss": 0.372,
689
  "step": 920
690
  },
691
  {
692
  "epoch": 4.6,
693
+ "grad_norm": 1.6204265356063843,
694
  "learning_rate": 2.9977997799779976e-05,
695
+ "loss": 0.3637,
696
  "step": 930
697
  },
698
  {
699
  "epoch": 4.65,
700
+ "grad_norm": 1.4866546392440796,
701
  "learning_rate": 2.9702970297029702e-05,
702
+ "loss": 0.3657,
703
  "step": 940
704
  },
705
  {
706
  "epoch": 4.7,
707
+ "grad_norm": 1.8092739582061768,
708
  "learning_rate": 2.942794279427943e-05,
709
+ "loss": 0.375,
710
  "step": 950
711
  },
712
  {
713
  "epoch": 4.75,
714
+ "grad_norm": 1.3936327695846558,
715
  "learning_rate": 2.9152915291529155e-05,
716
+ "loss": 0.3737,
717
  "step": 960
718
  },
719
  {
720
  "epoch": 4.8,
721
+ "grad_norm": 1.3176568746566772,
722
  "learning_rate": 2.8877887788778878e-05,
723
+ "loss": 0.3565,
724
  "step": 970
725
  },
726
  {
727
  "epoch": 4.85,
728
+ "grad_norm": 1.4575417041778564,
729
  "learning_rate": 2.8602860286028604e-05,
730
+ "loss": 0.3611,
731
  "step": 980
732
  },
733
  {
734
  "epoch": 4.9,
735
+ "grad_norm": 1.5857540369033813,
736
  "learning_rate": 2.832783278327833e-05,
737
+ "loss": 0.3656,
738
  "step": 990
739
  },
740
  {
741
  "epoch": 4.95,
742
+ "grad_norm": 1.0636128187179565,
743
  "learning_rate": 2.8052805280528056e-05,
744
+ "loss": 0.3396,
745
  "step": 1000
746
  },
747
  {
748
  "epoch": 5.0,
749
+ "grad_norm": 1.4022269248962402,
750
  "learning_rate": 2.777777777777778e-05,
751
+ "loss": 0.3667,
752
  "step": 1010
753
  },
754
  {
755
  "epoch": 5.0,
756
+ "eval_accuracy": 0.8610240334378265,
757
+ "eval_loss": 0.3203551471233368,
758
+ "eval_runtime": 15.3956,
759
+ "eval_samples_per_second": 186.482,
760
+ "eval_steps_per_second": 5.846,
761
  "step": 1010
762
  },
763
  {
764
  "epoch": 5.05,
765
+ "grad_norm": 1.3242789506912231,
766
  "learning_rate": 2.7502750275027505e-05,
767
+ "loss": 0.3563,
768
  "step": 1020
769
  },
770
  {
771
  "epoch": 5.1,
772
+ "grad_norm": 1.8484835624694824,
773
  "learning_rate": 2.722772277227723e-05,
774
+ "loss": 0.3295,
775
  "step": 1030
776
  },
777
  {
778
  "epoch": 5.15,
779
+ "grad_norm": 1.2119253873825073,
780
  "learning_rate": 2.6952695269526958e-05,
781
+ "loss": 0.3789,
782
  "step": 1040
783
  },
784
  {
785
  "epoch": 5.2,
786
+ "grad_norm": 1.6589637994766235,
787
  "learning_rate": 2.667766776677668e-05,
788
+ "loss": 0.3379,
789
  "step": 1050
790
  },
791
  {
792
  "epoch": 5.25,
793
+ "grad_norm": 1.5714747905731201,
794
  "learning_rate": 2.64026402640264e-05,
795
+ "loss": 0.3675,
796
  "step": 1060
797
  },
798
  {
799
  "epoch": 5.3,
800
+ "grad_norm": 1.3527103662490845,
801
  "learning_rate": 2.6127612761276126e-05,
802
+ "loss": 0.3573,
803
  "step": 1070
804
  },
805
  {
806
  "epoch": 5.35,
807
+ "grad_norm": 1.055513620376587,
808
  "learning_rate": 2.5852585258525853e-05,
809
+ "loss": 0.3478,
810
  "step": 1080
811
  },
812
  {
813
  "epoch": 5.4,
814
+ "grad_norm": 1.6614227294921875,
815
  "learning_rate": 2.557755775577558e-05,
816
+ "loss": 0.3692,
817
  "step": 1090
818
  },
819
  {
820
  "epoch": 5.45,
821
+ "grad_norm": 1.5745677947998047,
822
  "learning_rate": 2.53025302530253e-05,
823
+ "loss": 0.331,
824
  "step": 1100
825
  },
826
  {
827
  "epoch": 5.5,
828
+ "grad_norm": 1.4897499084472656,
829
  "learning_rate": 2.5027502750275028e-05,
830
+ "loss": 0.352,
831
  "step": 1110
832
  },
833
  {
834
  "epoch": 5.54,
835
+ "grad_norm": 1.4070786237716675,
836
  "learning_rate": 2.4752475247524754e-05,
837
+ "loss": 0.345,
838
  "step": 1120
839
  },
840
  {
841
  "epoch": 5.59,
842
+ "grad_norm": 2.2010486125946045,
843
  "learning_rate": 2.447744774477448e-05,
844
+ "loss": 0.3213,
845
  "step": 1130
846
  },
847
  {
848
  "epoch": 5.64,
849
+ "grad_norm": 1.4076497554779053,
850
  "learning_rate": 2.4202420242024203e-05,
851
+ "loss": 0.3414,
852
  "step": 1140
853
  },
854
  {
855
  "epoch": 5.69,
856
+ "grad_norm": 1.168713092803955,
857
  "learning_rate": 2.392739273927393e-05,
858
+ "loss": 0.3521,
859
  "step": 1150
860
  },
861
  {
862
  "epoch": 5.74,
863
+ "grad_norm": 1.8131704330444336,
864
  "learning_rate": 2.3652365236523656e-05,
865
+ "loss": 0.3576,
866
  "step": 1160
867
  },
868
  {
869
  "epoch": 5.79,
870
+ "grad_norm": 1.2432703971862793,
871
  "learning_rate": 2.337733773377338e-05,
872
+ "loss": 0.3416,
873
  "step": 1170
874
  },
875
  {
876
  "epoch": 5.84,
877
+ "grad_norm": 2.248685598373413,
878
  "learning_rate": 2.31023102310231e-05,
879
+ "loss": 0.3396,
880
  "step": 1180
881
  },
882
  {
883
  "epoch": 5.89,
884
+ "grad_norm": 1.8308826684951782,
885
  "learning_rate": 2.2827282728272828e-05,
886
+ "loss": 0.3466,
887
  "step": 1190
888
  },
889
  {
890
  "epoch": 5.94,
891
+ "grad_norm": 2.5257883071899414,
892
  "learning_rate": 2.2552255225522554e-05,
893
+ "loss": 0.3186,
894
  "step": 1200
895
  },
896
  {
897
  "epoch": 5.99,
898
+ "grad_norm": 1.7196460962295532,
899
  "learning_rate": 2.227722772277228e-05,
900
+ "loss": 0.3545,
901
  "step": 1210
902
  },
903
  {
904
  "epoch": 6.0,
905
+ "eval_accuracy": 0.8659003831417624,
906
+ "eval_loss": 0.31437239050865173,
907
+ "eval_runtime": 15.7136,
908
+ "eval_samples_per_second": 182.707,
909
+ "eval_steps_per_second": 5.728,
910
  "step": 1212
911
  },
912
  {
913
  "epoch": 6.04,
914
+ "grad_norm": 1.643741250038147,
915
  "learning_rate": 2.2002200220022003e-05,
916
+ "loss": 0.3356,
917
  "step": 1220
918
  },
919
  {
920
  "epoch": 6.09,
921
+ "grad_norm": 1.5010825395584106,
922
  "learning_rate": 2.172717271727173e-05,
923
+ "loss": 0.3288,
924
  "step": 1230
925
  },
926
  {
927
  "epoch": 6.14,
928
+ "grad_norm": 1.2441235780715942,
929
  "learning_rate": 2.1452145214521452e-05,
930
+ "loss": 0.3395,
931
  "step": 1240
932
  },
933
  {
934
  "epoch": 6.19,
935
+ "grad_norm": 1.351552128791809,
936
  "learning_rate": 2.1177117711771178e-05,
937
+ "loss": 0.3011,
938
  "step": 1250
939
  },
940
  {
941
  "epoch": 6.24,
942
+ "grad_norm": 1.3512260913848877,
943
  "learning_rate": 2.09020902090209e-05,
944
+ "loss": 0.3416,
945
  "step": 1260
946
  },
947
  {
948
  "epoch": 6.29,
949
+ "grad_norm": 1.5813019275665283,
950
  "learning_rate": 2.0627062706270627e-05,
951
+ "loss": 0.3523,
952
  "step": 1270
953
  },
954
  {
955
  "epoch": 6.34,
956
+ "grad_norm": 1.5249056816101074,
957
  "learning_rate": 2.0352035203520354e-05,
958
+ "loss": 0.3288,
959
  "step": 1280
960
  },
961
  {
962
  "epoch": 6.39,
963
+ "grad_norm": 1.9175376892089844,
964
  "learning_rate": 2.007700770077008e-05,
965
+ "loss": 0.3527,
966
  "step": 1290
967
  },
968
  {
969
  "epoch": 6.44,
970
+ "grad_norm": 1.5546538829803467,
971
  "learning_rate": 1.9801980198019803e-05,
972
+ "loss": 0.3076,
973
  "step": 1300
974
  },
975
  {
976
  "epoch": 6.49,
977
+ "grad_norm": 1.5470608472824097,
978
  "learning_rate": 1.952695269526953e-05,
979
+ "loss": 0.3303,
980
  "step": 1310
981
  },
982
  {
983
  "epoch": 6.53,
984
+ "grad_norm": 1.3036069869995117,
985
  "learning_rate": 1.9251925192519252e-05,
986
+ "loss": 0.3527,
987
  "step": 1320
988
  },
989
  {
990
  "epoch": 6.58,
991
+ "grad_norm": 1.524294137954712,
992
  "learning_rate": 1.8976897689768978e-05,
993
+ "loss": 0.3243,
994
  "step": 1330
995
  },
996
  {
997
  "epoch": 6.63,
998
+ "grad_norm": 1.5366973876953125,
999
  "learning_rate": 1.87018701870187e-05,
1000
+ "loss": 0.3299,
1001
  "step": 1340
1002
  },
1003
  {
1004
  "epoch": 6.68,
1005
+ "grad_norm": 1.2275673151016235,
1006
  "learning_rate": 1.8426842684268427e-05,
1007
+ "loss": 0.2928,
1008
  "step": 1350
1009
  },
1010
  {
1011
  "epoch": 6.73,
1012
+ "grad_norm": 1.4205721616744995,
1013
  "learning_rate": 1.8151815181518153e-05,
1014
+ "loss": 0.3215,
1015
  "step": 1360
1016
  },
1017
  {
1018
  "epoch": 6.78,
1019
+ "grad_norm": 1.4984817504882812,
1020
  "learning_rate": 1.787678767876788e-05,
1021
+ "loss": 0.3008,
1022
  "step": 1370
1023
  },
1024
  {
1025
  "epoch": 6.83,
1026
+ "grad_norm": 1.6792049407958984,
1027
  "learning_rate": 1.7601760176017602e-05,
1028
+ "loss": 0.3403,
1029
  "step": 1380
1030
  },
1031
  {
1032
  "epoch": 6.88,
1033
+ "grad_norm": 1.5373748540878296,
1034
  "learning_rate": 1.7326732673267325e-05,
1035
+ "loss": 0.3353,
1036
  "step": 1390
1037
  },
1038
  {
1039
  "epoch": 6.93,
1040
+ "grad_norm": 2.501495361328125,
1041
  "learning_rate": 1.705170517051705e-05,
1042
+ "loss": 0.3219,
1043
  "step": 1400
1044
  },
1045
  {
1046
  "epoch": 6.98,
1047
+ "grad_norm": 1.3856743574142456,
1048
  "learning_rate": 1.6776677667766778e-05,
1049
+ "loss": 0.3137,
1050
  "step": 1410
1051
  },
1052
  {
1053
  "epoch": 7.0,
1054
+ "eval_accuracy": 0.864158829676071,
1055
+ "eval_loss": 0.3307534158229828,
1056
+ "eval_runtime": 15.9606,
1057
+ "eval_samples_per_second": 179.88,
1058
+ "eval_steps_per_second": 5.639,
1059
  "step": 1414
1060
  },
1061
  {
1062
  "epoch": 7.03,
1063
+ "grad_norm": 1.5321968793869019,
1064
  "learning_rate": 1.6501650165016504e-05,
1065
+ "loss": 0.308,
1066
  "step": 1420
1067
  },
1068
  {
1069
  "epoch": 7.08,
1070
+ "grad_norm": 1.4582158327102661,
1071
  "learning_rate": 1.6226622662266227e-05,
1072
+ "loss": 0.3168,
1073
  "step": 1430
1074
  },
1075
  {
1076
  "epoch": 7.13,
1077
+ "grad_norm": 1.4425067901611328,
1078
  "learning_rate": 1.5951595159515953e-05,
1079
+ "loss": 0.3213,
1080
  "step": 1440
1081
  },
1082
  {
1083
  "epoch": 7.18,
1084
+ "grad_norm": 1.933104395866394,
1085
  "learning_rate": 1.567656765676568e-05,
1086
+ "loss": 0.2978,
1087
  "step": 1450
1088
  },
1089
  {
1090
  "epoch": 7.23,
1091
+ "grad_norm": 1.5549027919769287,
1092
  "learning_rate": 1.5401540154015402e-05,
1093
+ "loss": 0.3281,
1094
  "step": 1460
1095
  },
1096
  {
1097
  "epoch": 7.28,
1098
+ "grad_norm": 1.4376338720321655,
1099
  "learning_rate": 1.5126512651265127e-05,
1100
+ "loss": 0.2896,
1101
  "step": 1470
1102
  },
1103
  {
1104
  "epoch": 7.33,
1105
+ "grad_norm": 1.414781093597412,
1106
  "learning_rate": 1.4851485148514851e-05,
1107
+ "loss": 0.3135,
1108
  "step": 1480
1109
  },
1110
  {
1111
  "epoch": 7.38,
1112
+ "grad_norm": 1.3260300159454346,
1113
  "learning_rate": 1.4576457645764577e-05,
1114
+ "loss": 0.2991,
1115
  "step": 1490
1116
  },
1117
  {
1118
  "epoch": 7.43,
1119
+ "grad_norm": 1.8632981777191162,
1120
  "learning_rate": 1.4301430143014302e-05,
1121
+ "loss": 0.3135,
1122
  "step": 1500
1123
  },
1124
  {
1125
  "epoch": 7.48,
1126
+ "grad_norm": 1.6400821208953857,
1127
  "learning_rate": 1.4026402640264028e-05,
1128
+ "loss": 0.2882,
1129
  "step": 1510
1130
  },
1131
  {
1132
  "epoch": 7.52,
1133
+ "grad_norm": 1.8777371644973755,
1134
  "learning_rate": 1.3751375137513753e-05,
1135
+ "loss": 0.3539,
1136
  "step": 1520
1137
  },
1138
  {
1139
  "epoch": 7.57,
1140
+ "grad_norm": 1.431818962097168,
1141
  "learning_rate": 1.3476347634763479e-05,
1142
+ "loss": 0.3065,
1143
  "step": 1530
1144
  },
1145
  {
1146
  "epoch": 7.62,
1147
+ "grad_norm": 2.002371311187744,
1148
  "learning_rate": 1.32013201320132e-05,
1149
+ "loss": 0.3362,
1150
  "step": 1540
1151
  },
1152
  {
1153
  "epoch": 7.67,
1154
+ "grad_norm": 1.3172950744628906,
1155
  "learning_rate": 1.2926292629262926e-05,
1156
+ "loss": 0.3062,
1157
  "step": 1550
1158
  },
1159
  {
1160
  "epoch": 7.72,
1161
+ "grad_norm": 2.155853271484375,
1162
  "learning_rate": 1.265126512651265e-05,
1163
+ "loss": 0.3151,
1164
  "step": 1560
1165
  },
1166
  {
1167
  "epoch": 7.77,
1168
+ "grad_norm": 1.8984310626983643,
1169
  "learning_rate": 1.2376237623762377e-05,
1170
+ "loss": 0.3488,
1171
  "step": 1570
1172
  },
1173
  {
1174
  "epoch": 7.82,
1175
+ "grad_norm": 1.3027973175048828,
1176
  "learning_rate": 1.2101210121012102e-05,
1177
+ "loss": 0.3197,
1178
  "step": 1580
1179
  },
1180
  {
1181
  "epoch": 7.87,
1182
+ "grad_norm": 1.9755516052246094,
1183
  "learning_rate": 1.1826182618261828e-05,
1184
+ "loss": 0.3287,
1185
  "step": 1590
1186
  },
1187
  {
1188
  "epoch": 7.92,
1189
+ "grad_norm": 1.6631439924240112,
1190
  "learning_rate": 1.155115511551155e-05,
1191
+ "loss": 0.303,
1192
  "step": 1600
1193
  },
1194
  {
1195
  "epoch": 7.97,
1196
+ "grad_norm": 1.8642737865447998,
1197
  "learning_rate": 1.1276127612761277e-05,
1198
+ "loss": 0.3178,
1199
  "step": 1610
1200
  },
1201
  {
1202
  "epoch": 8.0,
1203
+ "eval_accuracy": 0.8645071403692093,
1204
+ "eval_loss": 0.32295528054237366,
1205
+ "eval_runtime": 15.3242,
1206
+ "eval_samples_per_second": 187.351,
1207
+ "eval_steps_per_second": 5.873,
1208
  "step": 1616
1209
  },
1210
  {
1211
  "epoch": 8.02,
1212
+ "grad_norm": 1.4315767288208008,
1213
  "learning_rate": 1.1001100110011001e-05,
1214
+ "loss": 0.3078,
1215
  "step": 1620
1216
  },
1217
  {
1218
  "epoch": 8.07,
1219
+ "grad_norm": 1.6558310985565186,
1220
  "learning_rate": 1.0726072607260726e-05,
1221
+ "loss": 0.2853,
1222
  "step": 1630
1223
  },
1224
  {
1225
  "epoch": 8.12,
1226
+ "grad_norm": 1.906076431274414,
1227
  "learning_rate": 1.045104510451045e-05,
1228
+ "loss": 0.2814,
1229
  "step": 1640
1230
  },
1231
  {
1232
  "epoch": 8.17,
1233
+ "grad_norm": 2.086242914199829,
1234
  "learning_rate": 1.0176017601760177e-05,
1235
+ "loss": 0.2847,
1236
  "step": 1650
1237
  },
1238
  {
1239
  "epoch": 8.22,
1240
+ "grad_norm": 1.793742299079895,
1241
  "learning_rate": 9.900990099009901e-06,
1242
+ "loss": 0.2945,
1243
  "step": 1660
1244
  },
1245
  {
1246
  "epoch": 8.27,
1247
+ "grad_norm": 1.7942755222320557,
1248
  "learning_rate": 9.625962596259626e-06,
1249
+ "loss": 0.296,
1250
  "step": 1670
1251
  },
1252
  {
1253
  "epoch": 8.32,
1254
+ "grad_norm": 1.7433489561080933,
1255
  "learning_rate": 9.35093509350935e-06,
1256
+ "loss": 0.2885,
1257
  "step": 1680
1258
  },
1259
  {
1260
  "epoch": 8.37,
1261
+ "grad_norm": 1.6308104991912842,
1262
  "learning_rate": 9.075907590759077e-06,
1263
+ "loss": 0.3134,
1264
  "step": 1690
1265
  },
1266
  {
1267
  "epoch": 8.42,
1268
+ "grad_norm": 1.6005682945251465,
1269
  "learning_rate": 8.800880088008801e-06,
1270
+ "loss": 0.3206,
1271
  "step": 1700
1272
  },
1273
  {
1274
  "epoch": 8.47,
1275
+ "grad_norm": 1.563693642616272,
1276
  "learning_rate": 8.525852585258526e-06,
1277
+ "loss": 0.3112,
1278
  "step": 1710
1279
  },
1280
  {
1281
  "epoch": 8.51,
1282
+ "grad_norm": 1.796925663948059,
1283
  "learning_rate": 8.250825082508252e-06,
1284
+ "loss": 0.3076,
1285
  "step": 1720
1286
  },
1287
  {
1288
  "epoch": 8.56,
1289
+ "grad_norm": 1.5874933004379272,
1290
  "learning_rate": 7.975797579757976e-06,
1291
+ "loss": 0.3197,
1292
  "step": 1730
1293
  },
1294
  {
1295
  "epoch": 8.61,
1296
+ "grad_norm": 1.5310964584350586,
1297
  "learning_rate": 7.700770077007701e-06,
1298
+ "loss": 0.3026,
1299
  "step": 1740
1300
  },
1301
  {
1302
  "epoch": 8.66,
1303
+ "grad_norm": 1.9441354274749756,
1304
  "learning_rate": 7.4257425742574256e-06,
1305
+ "loss": 0.3051,
1306
  "step": 1750
1307
  },
1308
  {
1309
  "epoch": 8.71,
1310
+ "grad_norm": 1.8038005828857422,
1311
  "learning_rate": 7.150715071507151e-06,
1312
+ "loss": 0.3107,
1313
  "step": 1760
1314
  },
1315
  {
1316
  "epoch": 8.76,
1317
+ "grad_norm": 4.733364582061768,
1318
  "learning_rate": 6.875687568756876e-06,
1319
+ "loss": 0.2939,
1320
  "step": 1770
1321
  },
1322
  {
1323
  "epoch": 8.81,
1324
+ "grad_norm": 1.5854134559631348,
1325
  "learning_rate": 6.6006600660066e-06,
1326
+ "loss": 0.3049,
1327
  "step": 1780
1328
  },
1329
  {
1330
  "epoch": 8.86,
1331
+ "grad_norm": 1.7545024156570435,
1332
  "learning_rate": 6.325632563256325e-06,
1333
+ "loss": 0.2936,
1334
  "step": 1790
1335
  },
1336
  {
1337
  "epoch": 8.91,
1338
+ "grad_norm": 1.6307064294815063,
1339
  "learning_rate": 6.050605060506051e-06,
1340
+ "loss": 0.2855,
1341
  "step": 1800
1342
  },
1343
  {
1344
  "epoch": 8.96,
1345
+ "grad_norm": 1.6208785772323608,
1346
  "learning_rate": 5.775577557755775e-06,
1347
+ "loss": 0.2998,
1348
  "step": 1810
1349
  },
1350
  {
1351
  "epoch": 9.0,
1352
+ "eval_accuracy": 0.8707767328456983,
1353
+ "eval_loss": 0.32057538628578186,
1354
+ "eval_runtime": 15.4893,
1355
+ "eval_samples_per_second": 185.354,
1356
+ "eval_steps_per_second": 5.81,
1357
  "step": 1818
1358
  },
1359
  {
1360
  "epoch": 9.01,
1361
+ "grad_norm": 1.7324903011322021,
1362
  "learning_rate": 5.500550055005501e-06,
1363
+ "loss": 0.2916,
1364
  "step": 1820
1365
  },
1366
  {
1367
  "epoch": 9.06,
1368
+ "grad_norm": 1.4792495965957642,
1369
  "learning_rate": 5.225522552255225e-06,
1370
+ "loss": 0.2771,
1371
  "step": 1830
1372
  },
1373
  {
1374
  "epoch": 9.11,
1375
+ "grad_norm": 2.207402229309082,
1376
  "learning_rate": 4.950495049504951e-06,
1377
+ "loss": 0.2859,
1378
  "step": 1840
1379
  },
1380
  {
1381
  "epoch": 9.16,
1382
+ "grad_norm": 1.9077599048614502,
1383
  "learning_rate": 4.675467546754675e-06,
1384
+ "loss": 0.2833,
1385
  "step": 1850
1386
  },
1387
  {
1388
  "epoch": 9.21,
1389
+ "grad_norm": 1.7705243825912476,
1390
  "learning_rate": 4.400440044004401e-06,
1391
+ "loss": 0.3007,
1392
  "step": 1860
1393
  },
1394
  {
1395
  "epoch": 9.26,
1396
+ "grad_norm": 1.74075448513031,
1397
  "learning_rate": 4.125412541254126e-06,
1398
+ "loss": 0.2912,
1399
  "step": 1870
1400
  },
1401
  {
1402
  "epoch": 9.31,
1403
+ "grad_norm": 1.8058630228042603,
1404
  "learning_rate": 3.8503850385038505e-06,
1405
+ "loss": 0.297,
1406
  "step": 1880
1407
  },
1408
  {
1409
  "epoch": 9.36,
1410
+ "grad_norm": 1.7161786556243896,
1411
  "learning_rate": 3.5753575357535755e-06,
1412
+ "loss": 0.2789,
1413
  "step": 1890
1414
  },
1415
  {
1416
  "epoch": 9.41,
1417
+ "grad_norm": 2.8628289699554443,
1418
  "learning_rate": 3.3003300330033e-06,
1419
+ "loss": 0.3018,
1420
  "step": 1900
1421
  },
1422
  {
1423
  "epoch": 9.46,
1424
+ "grad_norm": 1.9037891626358032,
1425
  "learning_rate": 3.0253025302530254e-06,
1426
+ "loss": 0.3043,
1427
  "step": 1910
1428
  },
1429
  {
1430
  "epoch": 9.5,
1431
+ "grad_norm": 1.5443955659866333,
1432
  "learning_rate": 2.7502750275027504e-06,
1433
+ "loss": 0.2813,
1434
  "step": 1920
1435
  },
1436
  {
1437
  "epoch": 9.55,
1438
+ "grad_norm": 1.6395975351333618,
1439
  "learning_rate": 2.4752475247524753e-06,
1440
+ "loss": 0.2942,
1441
  "step": 1930
1442
  },
1443
  {
1444
  "epoch": 9.6,
1445
+ "grad_norm": 1.4861618280410767,
1446
  "learning_rate": 2.2002200220022003e-06,
1447
+ "loss": 0.2769,
1448
  "step": 1940
1449
  },
1450
  {
1451
  "epoch": 9.65,
1452
+ "grad_norm": 1.9940297603607178,
1453
  "learning_rate": 1.9251925192519253e-06,
1454
+ "loss": 0.2833,
1455
  "step": 1950
1456
  },
1457
  {
1458
  "epoch": 9.7,
1459
+ "grad_norm": 1.800776720046997,
1460
  "learning_rate": 1.65016501650165e-06,
1461
+ "loss": 0.2948,
1462
  "step": 1960
1463
  },
1464
  {
1465
  "epoch": 9.75,
1466
+ "grad_norm": 1.4160261154174805,
1467
  "learning_rate": 1.3751375137513752e-06,
1468
+ "loss": 0.2932,
1469
  "step": 1970
1470
  },
1471
  {
1472
  "epoch": 9.8,
1473
+ "grad_norm": 1.8922903537750244,
1474
  "learning_rate": 1.1001100110011001e-06,
1475
+ "loss": 0.3189,
1476
  "step": 1980
1477
  },
1478
  {
1479
  "epoch": 9.85,
1480
+ "grad_norm": 1.754544734954834,
1481
  "learning_rate": 8.25082508250825e-07,
1482
+ "loss": 0.2907,
1483
  "step": 1990
1484
  },
1485
  {
1486
  "epoch": 9.9,
1487
+ "grad_norm": 1.8808295726776123,
1488
  "learning_rate": 5.500550055005501e-07,
1489
+ "loss": 0.2763,
1490
  "step": 2000
1491
  },
1492
  {
1493
  "epoch": 9.95,
1494
+ "grad_norm": 2.0124707221984863,
1495
  "learning_rate": 2.7502750275027504e-07,
1496
+ "loss": 0.2611,
1497
  "step": 2010
1498
  },
1499
  {
1500
  "epoch": 10.0,
1501
+ "grad_norm": 1.8302078247070312,
1502
  "learning_rate": 0.0,
1503
+ "loss": 0.2773,
1504
  "step": 2020
1505
  },
1506
  {
1507
  "epoch": 10.0,
1508
+ "eval_accuracy": 0.8732149076976663,
1509
+ "eval_loss": 0.3263641893863678,
1510
+ "eval_runtime": 15.9034,
1511
+ "eval_samples_per_second": 180.527,
1512
+ "eval_steps_per_second": 5.659,
1513
  "step": 2020
1514
  },
1515
  {
1516
  "epoch": 10.0,
1517
  "step": 2020,
1518
  "total_flos": 2.0021605356722135e+19,
1519
+ "train_loss": 0.37925267927717454,
1520
+ "train_runtime": 3266.1275,
1521
+ "train_samples_per_second": 79.106,
1522
+ "train_steps_per_second": 0.618
1523
  }
1524
  ],
1525
  "logging_steps": 10,