raks87 commited on
Commit
838b61f
1 Parent(s): 5655d8c

End of training

Browse files
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 2.99,
3
- "eval_accuracy": 0.9841333333333333,
4
- "eval_loss": 0.0466647744178772,
5
- "eval_runtime": 44.3114,
6
- "eval_samples_per_second": 338.513,
7
- "eval_steps_per_second": 10.584,
8
- "total_flos": 2.0513761171988152e+18,
9
- "train_loss": 0.37578185836037437,
10
- "train_runtime": 710.1831,
11
- "train_samples_per_second": 147.849,
12
- "train_steps_per_second": 1.153
13
  }
 
1
  {
2
  "epoch": 2.99,
3
+ "eval_accuracy": 0.9844,
4
+ "eval_loss": 0.04590694606304169,
5
+ "eval_runtime": 32.7189,
6
+ "eval_samples_per_second": 305.634,
7
+ "eval_steps_per_second": 9.566,
8
+ "total_flos": 2.930358373492064e+18,
9
+ "train_loss": 0.33610059461023056,
10
+ "train_runtime": 935.657,
11
+ "train_samples_per_second": 160.315,
12
+ "train_steps_per_second": 1.25
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.99,
3
- "eval_accuracy": 0.9841333333333333,
4
- "eval_loss": 0.0466647744178772,
5
- "eval_runtime": 44.3114,
6
- "eval_samples_per_second": 338.513,
7
- "eval_steps_per_second": 10.584
8
  }
 
1
  {
2
  "epoch": 2.99,
3
+ "eval_accuracy": 0.9844,
4
+ "eval_loss": 0.04590694606304169,
5
+ "eval_runtime": 32.7189,
6
+ "eval_samples_per_second": 305.634,
7
+ "eval_steps_per_second": 9.566
8
  }
runs/Apr13_14-54-16_67071951de9d/events.out.tfevents.1713021034.67071951de9d.6427.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b142f82300c3b85bcbfdc744203b64d047b3b7d888be2ef12e6a2ec6e995de5
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.99,
3
- "total_flos": 2.0513761171988152e+18,
4
- "train_loss": 0.37578185836037437,
5
- "train_runtime": 710.1831,
6
- "train_samples_per_second": 147.849,
7
- "train_steps_per_second": 1.153
8
  }
 
1
  {
2
  "epoch": 2.99,
3
+ "total_flos": 2.930358373492064e+18,
4
+ "train_loss": 0.33610059461023056,
5
+ "train_runtime": 935.657,
6
+ "train_samples_per_second": 160.315,
7
+ "train_steps_per_second": 1.25
8
  }
trainer_state.json CHANGED
@@ -1,623 +1,875 @@
1
  {
2
- "best_metric": 0.9841333333333333,
3
- "best_model_checkpoint": "vit-small-patch16-224-finetuned-cifar10/checkpoint-819",
4
- "epoch": 2.9945155393053016,
5
  "eval_steps": 500,
6
- "global_step": 819,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.04,
13
- "grad_norm": 10.646742820739746,
14
- "learning_rate": 6.0975609756097564e-06,
15
- "loss": 2.6319,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.07,
20
- "grad_norm": 7.98280668258667,
21
- "learning_rate": 1.2195121951219513e-05,
22
- "loss": 2.3516,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.11,
27
- "grad_norm": 7.600331783294678,
28
- "learning_rate": 1.8292682926829268e-05,
29
- "loss": 1.8968,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.15,
34
- "grad_norm": 6.469565391540527,
35
- "learning_rate": 2.4390243902439026e-05,
36
- "loss": 1.3456,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.18,
41
- "grad_norm": 6.272436618804932,
42
- "learning_rate": 3.048780487804878e-05,
43
- "loss": 0.9244,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.22,
48
- "grad_norm": 5.537989616394043,
49
- "learning_rate": 3.6585365853658535e-05,
50
- "loss": 0.5842,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.26,
55
- "grad_norm": 5.8809590339660645,
56
- "learning_rate": 4.26829268292683e-05,
57
- "loss": 0.4952,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.29,
62
- "grad_norm": 8.118090629577637,
63
- "learning_rate": 4.878048780487805e-05,
64
- "loss": 0.498,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.33,
69
- "grad_norm": 5.70691442489624,
70
- "learning_rate": 4.94572591587517e-05,
71
- "loss": 0.4449,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.37,
76
- "grad_norm": 6.3202314376831055,
77
- "learning_rate": 4.877883310719132e-05,
78
- "loss": 0.4748,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.4,
83
- "grad_norm": 5.919207572937012,
84
- "learning_rate": 4.810040705563094e-05,
85
- "loss": 0.411,
86
  "step": 110
87
  },
88
  {
89
- "epoch": 0.44,
90
- "grad_norm": 5.670413494110107,
91
- "learning_rate": 4.742198100407056e-05,
92
- "loss": 0.355,
93
  "step": 120
94
  },
95
  {
96
- "epoch": 0.48,
97
- "grad_norm": 5.455657958984375,
98
- "learning_rate": 4.674355495251018e-05,
99
- "loss": 0.3883,
100
  "step": 130
101
  },
102
  {
103
- "epoch": 0.51,
104
- "grad_norm": 5.554341793060303,
105
- "learning_rate": 4.60651289009498e-05,
106
- "loss": 0.4351,
107
  "step": 140
108
  },
109
  {
110
- "epoch": 0.55,
111
- "grad_norm": 5.045084476470947,
112
- "learning_rate": 4.5386702849389416e-05,
113
- "loss": 0.3472,
114
  "step": 150
115
  },
116
  {
117
- "epoch": 0.59,
118
- "grad_norm": 4.392239093780518,
119
- "learning_rate": 4.470827679782904e-05,
120
- "loss": 0.3645,
121
  "step": 160
122
  },
123
  {
124
- "epoch": 0.62,
125
- "grad_norm": 5.654812335968018,
126
- "learning_rate": 4.402985074626866e-05,
127
- "loss": 0.3316,
128
  "step": 170
129
  },
130
  {
131
- "epoch": 0.66,
132
- "grad_norm": 5.683121204376221,
133
- "learning_rate": 4.335142469470828e-05,
134
- "loss": 0.3252,
135
  "step": 180
136
  },
137
  {
138
- "epoch": 0.69,
139
- "grad_norm": 7.785736083984375,
140
- "learning_rate": 4.26729986431479e-05,
141
- "loss": 0.3363,
142
  "step": 190
143
  },
144
  {
145
- "epoch": 0.73,
146
- "grad_norm": 5.187464714050293,
147
- "learning_rate": 4.199457259158752e-05,
148
- "loss": 0.3216,
149
  "step": 200
150
  },
151
  {
152
- "epoch": 0.77,
153
- "grad_norm": 4.926880836486816,
154
- "learning_rate": 4.131614654002714e-05,
155
- "loss": 0.2756,
156
  "step": 210
157
  },
158
  {
159
- "epoch": 0.8,
160
- "grad_norm": 4.24468994140625,
161
- "learning_rate": 4.063772048846676e-05,
162
- "loss": 0.2944,
163
  "step": 220
164
  },
165
  {
166
- "epoch": 0.84,
167
- "grad_norm": 6.090999126434326,
168
- "learning_rate": 3.995929443690638e-05,
169
- "loss": 0.3404,
170
  "step": 230
171
  },
172
  {
173
- "epoch": 0.88,
174
- "grad_norm": 4.666919708251953,
175
- "learning_rate": 3.9280868385345995e-05,
176
- "loss": 0.3581,
177
  "step": 240
178
  },
179
  {
180
- "epoch": 0.91,
181
- "grad_norm": 5.284679412841797,
182
- "learning_rate": 3.860244233378562e-05,
183
- "loss": 0.3318,
184
  "step": 250
185
  },
186
  {
187
- "epoch": 0.95,
188
- "grad_norm": 5.953047275543213,
189
- "learning_rate": 3.792401628222524e-05,
190
- "loss": 0.3297,
191
  "step": 260
192
  },
193
  {
194
- "epoch": 0.99,
195
- "grad_norm": 6.444422245025635,
196
- "learning_rate": 3.724559023066486e-05,
197
- "loss": 0.2894,
198
  "step": 270
199
  },
200
  {
201
- "epoch": 1.0,
202
- "eval_accuracy": 0.9740666666666666,
203
- "eval_loss": 0.08225423842668533,
204
- "eval_runtime": 44.5616,
205
- "eval_samples_per_second": 336.613,
206
- "eval_steps_per_second": 10.525,
207
- "step": 273
208
- },
209
- {
210
- "epoch": 1.02,
211
- "grad_norm": 5.443094253540039,
212
- "learning_rate": 3.656716417910448e-05,
213
- "loss": 0.292,
214
  "step": 280
215
  },
216
  {
217
- "epoch": 1.06,
218
- "grad_norm": 6.843343734741211,
219
- "learning_rate": 3.58887381275441e-05,
220
- "loss": 0.2819,
221
  "step": 290
222
  },
223
  {
224
- "epoch": 1.1,
225
- "grad_norm": 5.8328142166137695,
226
- "learning_rate": 3.521031207598372e-05,
227
- "loss": 0.2971,
228
  "step": 300
229
  },
230
  {
231
- "epoch": 1.13,
232
- "grad_norm": 4.482433319091797,
233
- "learning_rate": 3.453188602442334e-05,
234
- "loss": 0.26,
235
  "step": 310
236
  },
237
  {
238
- "epoch": 1.17,
239
- "grad_norm": 5.714442729949951,
240
- "learning_rate": 3.385345997286296e-05,
241
- "loss": 0.2873,
242
  "step": 320
243
  },
244
  {
245
- "epoch": 1.21,
246
- "grad_norm": 5.785560607910156,
247
- "learning_rate": 3.3175033921302575e-05,
248
- "loss": 0.2367,
249
  "step": 330
250
  },
251
  {
252
- "epoch": 1.24,
253
- "grad_norm": 4.186683177947998,
254
- "learning_rate": 3.24966078697422e-05,
255
- "loss": 0.2415,
256
  "step": 340
257
  },
258
  {
259
- "epoch": 1.28,
260
- "grad_norm": 6.123615741729736,
261
- "learning_rate": 3.181818181818182e-05,
262
- "loss": 0.3272,
263
  "step": 350
264
  },
265
  {
266
- "epoch": 1.32,
267
- "grad_norm": 4.757399082183838,
268
- "learning_rate": 3.113975576662144e-05,
269
- "loss": 0.2609,
270
  "step": 360
271
  },
272
  {
273
- "epoch": 1.35,
274
- "grad_norm": 5.43366003036499,
275
- "learning_rate": 3.046132971506106e-05,
276
- "loss": 0.2718,
277
  "step": 370
278
  },
279
  {
280
- "epoch": 1.39,
281
- "grad_norm": 4.398933410644531,
282
- "learning_rate": 2.9782903663500678e-05,
283
- "loss": 0.2644,
284
  "step": 380
285
  },
286
  {
287
- "epoch": 1.43,
288
- "grad_norm": 5.111433506011963,
289
- "learning_rate": 2.91044776119403e-05,
290
- "loss": 0.2689,
291
  "step": 390
292
  },
293
  {
294
- "epoch": 1.46,
295
- "grad_norm": 5.859113693237305,
296
- "learning_rate": 2.842605156037992e-05,
297
- "loss": 0.2379,
 
 
 
 
 
 
 
 
 
298
  "step": 400
299
  },
300
  {
301
- "epoch": 1.5,
302
- "grad_norm": 3.6556389331817627,
303
- "learning_rate": 2.7747625508819542e-05,
304
- "loss": 0.2641,
305
  "step": 410
306
  },
307
  {
308
- "epoch": 1.54,
309
- "grad_norm": 6.068279266357422,
310
- "learning_rate": 2.7069199457259158e-05,
311
- "loss": 0.2393,
312
  "step": 420
313
  },
314
  {
315
- "epoch": 1.57,
316
- "grad_norm": 4.939550876617432,
317
- "learning_rate": 2.639077340569878e-05,
318
- "loss": 0.2518,
319
  "step": 430
320
  },
321
  {
322
- "epoch": 1.61,
323
- "grad_norm": 5.30012321472168,
324
- "learning_rate": 2.57123473541384e-05,
325
- "loss": 0.2538,
326
  "step": 440
327
  },
328
  {
329
- "epoch": 1.65,
330
- "grad_norm": 5.058879852294922,
331
- "learning_rate": 2.5033921302578023e-05,
332
- "loss": 0.2497,
333
  "step": 450
334
  },
335
  {
336
- "epoch": 1.68,
337
- "grad_norm": 3.878206729888916,
338
- "learning_rate": 2.4355495251017642e-05,
339
- "loss": 0.2441,
340
  "step": 460
341
  },
342
  {
343
- "epoch": 1.72,
344
- "grad_norm": 5.299980163574219,
345
- "learning_rate": 2.367706919945726e-05,
346
- "loss": 0.2498,
347
  "step": 470
348
  },
349
  {
350
- "epoch": 1.76,
351
- "grad_norm": 6.087621688842773,
352
- "learning_rate": 2.299864314789688e-05,
353
- "loss": 0.2806,
354
  "step": 480
355
  },
356
  {
357
- "epoch": 1.79,
358
- "grad_norm": 4.022277355194092,
359
- "learning_rate": 2.2320217096336503e-05,
360
- "loss": 0.2765,
361
  "step": 490
362
  },
363
  {
364
- "epoch": 1.83,
365
- "grad_norm": 4.6718220710754395,
366
- "learning_rate": 2.164179104477612e-05,
367
- "loss": 0.239,
368
  "step": 500
369
  },
370
  {
371
- "epoch": 1.86,
372
- "grad_norm": 4.384699821472168,
373
- "learning_rate": 2.0963364993215738e-05,
374
- "loss": 0.2757,
375
  "step": 510
376
  },
377
  {
378
- "epoch": 1.9,
379
- "grad_norm": 4.581112861633301,
380
- "learning_rate": 2.028493894165536e-05,
381
- "loss": 0.2678,
382
  "step": 520
383
  },
384
  {
385
- "epoch": 1.94,
386
- "grad_norm": 3.6300454139709473,
387
- "learning_rate": 1.960651289009498e-05,
388
- "loss": 0.2458,
389
  "step": 530
390
  },
391
  {
392
- "epoch": 1.97,
393
- "grad_norm": 5.09318733215332,
394
- "learning_rate": 1.89280868385346e-05,
395
- "loss": 0.2451,
396
  "step": 540
397
  },
398
  {
399
- "epoch": 2.0,
400
- "eval_accuracy": 0.9793333333333333,
401
- "eval_loss": 0.061371468007564545,
402
- "eval_runtime": 44.9848,
403
- "eval_samples_per_second": 333.446,
404
- "eval_steps_per_second": 10.426,
405
- "step": 547
406
- },
407
- {
408
- "epoch": 2.01,
409
- "grad_norm": 4.121983528137207,
410
- "learning_rate": 1.824966078697422e-05,
411
- "loss": 0.2469,
412
  "step": 550
413
  },
414
  {
415
- "epoch": 2.05,
416
- "grad_norm": 3.6604321002960205,
417
- "learning_rate": 1.757123473541384e-05,
418
- "loss": 0.24,
419
  "step": 560
420
  },
421
  {
422
- "epoch": 2.08,
423
- "grad_norm": 5.3272385597229,
424
- "learning_rate": 1.689280868385346e-05,
425
- "loss": 0.2585,
426
  "step": 570
427
  },
428
  {
429
- "epoch": 2.12,
430
- "grad_norm": 3.9364449977874756,
431
- "learning_rate": 1.6214382632293083e-05,
432
- "loss": 0.2234,
433
  "step": 580
434
  },
435
  {
436
- "epoch": 2.16,
437
- "grad_norm": 4.854574203491211,
438
- "learning_rate": 1.55359565807327e-05,
439
- "loss": 0.232,
440
  "step": 590
441
  },
442
  {
443
- "epoch": 2.19,
444
- "grad_norm": 3.7035410404205322,
445
- "learning_rate": 1.485753052917232e-05,
446
- "loss": 0.2095,
447
  "step": 600
448
  },
449
  {
450
- "epoch": 2.23,
451
- "grad_norm": 4.301865577697754,
452
- "learning_rate": 1.417910447761194e-05,
453
- "loss": 0.1954,
454
  "step": 610
455
  },
456
  {
457
- "epoch": 2.27,
458
- "grad_norm": 4.957614421844482,
459
- "learning_rate": 1.3500678426051561e-05,
460
- "loss": 0.2287,
461
  "step": 620
462
  },
463
  {
464
- "epoch": 2.3,
465
- "grad_norm": 4.7505645751953125,
466
- "learning_rate": 1.282225237449118e-05,
467
- "loss": 0.2181,
468
  "step": 630
469
  },
470
  {
471
- "epoch": 2.34,
472
- "grad_norm": 5.2432050704956055,
473
- "learning_rate": 1.2143826322930801e-05,
474
- "loss": 0.2306,
475
  "step": 640
476
  },
477
  {
478
- "epoch": 2.38,
479
- "grad_norm": 3.746467113494873,
480
- "learning_rate": 1.1465400271370422e-05,
481
- "loss": 0.2406,
482
  "step": 650
483
  },
484
  {
485
- "epoch": 2.41,
486
- "grad_norm": 6.041552543640137,
487
- "learning_rate": 1.0786974219810041e-05,
488
- "loss": 0.2681,
489
  "step": 660
490
  },
491
  {
492
- "epoch": 2.45,
493
- "grad_norm": 6.18747091293335,
494
- "learning_rate": 1.010854816824966e-05,
495
- "loss": 0.1891,
496
  "step": 670
497
  },
498
  {
499
- "epoch": 2.49,
500
- "grad_norm": 4.8129472732543945,
501
- "learning_rate": 9.430122116689281e-06,
502
- "loss": 0.2047,
503
  "step": 680
504
  },
505
  {
506
- "epoch": 2.52,
507
- "grad_norm": 3.5734217166900635,
508
- "learning_rate": 8.751696065128902e-06,
509
- "loss": 0.2172,
510
  "step": 690
511
  },
512
  {
513
- "epoch": 2.56,
514
- "grad_norm": 3.77048659324646,
515
- "learning_rate": 8.073270013568522e-06,
516
- "loss": 0.2085,
517
  "step": 700
518
  },
519
  {
520
- "epoch": 2.6,
521
- "grad_norm": 3.4081170558929443,
522
- "learning_rate": 7.394843962008141e-06,
523
- "loss": 0.2176,
524
  "step": 710
525
  },
526
  {
527
- "epoch": 2.63,
528
- "grad_norm": 4.99780797958374,
529
- "learning_rate": 6.716417910447762e-06,
530
- "loss": 0.1924,
531
  "step": 720
532
  },
533
  {
534
- "epoch": 2.67,
535
- "grad_norm": 4.378239631652832,
536
- "learning_rate": 6.037991858887382e-06,
537
- "loss": 0.2172,
538
  "step": 730
539
  },
540
  {
541
- "epoch": 2.71,
542
- "grad_norm": 4.286013126373291,
543
- "learning_rate": 5.359565807327002e-06,
544
- "loss": 0.2281,
545
  "step": 740
546
  },
547
  {
548
- "epoch": 2.74,
549
- "grad_norm": 4.865882396697998,
550
- "learning_rate": 4.681139755766622e-06,
551
- "loss": 0.2241,
552
  "step": 750
553
  },
554
  {
555
- "epoch": 2.78,
556
- "grad_norm": 3.976633071899414,
557
- "learning_rate": 4.002713704206242e-06,
558
- "loss": 0.2064,
559
  "step": 760
560
  },
561
  {
562
- "epoch": 2.82,
563
- "grad_norm": 4.488178253173828,
564
- "learning_rate": 3.324287652645862e-06,
565
- "loss": 0.1979,
566
  "step": 770
567
  },
568
  {
569
- "epoch": 2.85,
570
- "grad_norm": 3.1040143966674805,
571
- "learning_rate": 2.645861601085482e-06,
572
- "loss": 0.2191,
573
  "step": 780
574
  },
575
  {
576
- "epoch": 2.89,
577
- "grad_norm": 4.020033836364746,
578
- "learning_rate": 1.967435549525102e-06,
579
- "loss": 0.2144,
 
 
 
 
 
 
 
 
 
580
  "step": 790
581
  },
582
  {
583
- "epoch": 2.93,
584
- "grad_norm": 4.7591142654418945,
585
- "learning_rate": 1.289009497964722e-06,
586
- "loss": 0.2052,
587
  "step": 800
588
  },
589
  {
590
- "epoch": 2.96,
591
- "grad_norm": 4.364315986633301,
592
- "learning_rate": 6.10583446404342e-07,
593
- "loss": 0.2428,
594
  "step": 810
595
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
596
  {
597
  "epoch": 2.99,
598
- "eval_accuracy": 0.9841333333333333,
599
- "eval_loss": 0.0466647744178772,
600
- "eval_runtime": 44.9184,
601
- "eval_samples_per_second": 333.939,
602
- "eval_steps_per_second": 10.441,
603
- "step": 819
604
  },
605
  {
606
  "epoch": 2.99,
607
- "step": 819,
608
- "total_flos": 2.0513761171988152e+18,
609
- "train_loss": 0.37578185836037437,
610
- "train_runtime": 710.1831,
611
- "train_samples_per_second": 147.849,
612
- "train_steps_per_second": 1.153
613
  }
614
  ],
615
  "logging_steps": 10,
616
- "max_steps": 819,
617
  "num_input_tokens_seen": 0,
618
  "num_train_epochs": 3,
619
  "save_steps": 500,
620
- "total_flos": 2.0513761171988152e+18,
621
  "train_batch_size": 32,
622
  "trial_name": null,
623
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.9844,
3
+ "best_model_checkpoint": "vit-small-patch16-224-finetuned-cifar10/checkpoint-1170",
4
+ "epoch": 2.9942418426103647,
5
  "eval_steps": 500,
6
+ "global_step": 1170,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.03,
13
+ "grad_norm": 8.970051765441895,
14
+ "learning_rate": 4.273504273504274e-06,
15
+ "loss": 2.432,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.05,
20
+ "grad_norm": 7.947530746459961,
21
+ "learning_rate": 8.547008547008548e-06,
22
+ "loss": 2.2376,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.08,
27
+ "grad_norm": 7.60875129699707,
28
+ "learning_rate": 1.282051282051282e-05,
29
+ "loss": 1.9698,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.1,
34
+ "grad_norm": 6.434678554534912,
35
+ "learning_rate": 1.7094017094017095e-05,
36
+ "loss": 1.5489,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.13,
41
+ "grad_norm": 6.238937854766846,
42
+ "learning_rate": 2.1367521367521368e-05,
43
+ "loss": 1.1375,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.15,
48
+ "grad_norm": 5.423764228820801,
49
+ "learning_rate": 2.564102564102564e-05,
50
+ "loss": 0.7332,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.18,
55
+ "grad_norm": 7.145791530609131,
56
+ "learning_rate": 2.9914529914529915e-05,
57
+ "loss": 0.5841,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.2,
62
+ "grad_norm": 6.430414199829102,
63
+ "learning_rate": 3.418803418803419e-05,
64
+ "loss": 0.4957,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.23,
69
+ "grad_norm": 6.975243091583252,
70
+ "learning_rate": 3.846153846153846e-05,
71
+ "loss": 0.4523,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.26,
76
+ "grad_norm": 6.09722900390625,
77
+ "learning_rate": 4.2735042735042735e-05,
78
+ "loss": 0.4462,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.28,
83
+ "grad_norm": 5.052177429199219,
84
+ "learning_rate": 4.700854700854701e-05,
85
+ "loss": 0.3624,
86
  "step": 110
87
  },
88
  {
89
+ "epoch": 0.31,
90
+ "grad_norm": 5.480886459350586,
91
+ "learning_rate": 4.985754985754986e-05,
92
+ "loss": 0.3895,
93
  "step": 120
94
  },
95
  {
96
+ "epoch": 0.33,
97
+ "grad_norm": 4.691812992095947,
98
+ "learning_rate": 4.938271604938271e-05,
99
+ "loss": 0.3446,
100
  "step": 130
101
  },
102
  {
103
+ "epoch": 0.36,
104
+ "grad_norm": 6.404294013977051,
105
+ "learning_rate": 4.890788224121557e-05,
106
+ "loss": 0.4006,
107
  "step": 140
108
  },
109
  {
110
+ "epoch": 0.38,
111
+ "grad_norm": 5.33477258682251,
112
+ "learning_rate": 4.8433048433048433e-05,
113
+ "loss": 0.3532,
114
  "step": 150
115
  },
116
  {
117
+ "epoch": 0.41,
118
+ "grad_norm": 5.476822853088379,
119
+ "learning_rate": 4.7958214624881294e-05,
120
+ "loss": 0.3421,
121
  "step": 160
122
  },
123
  {
124
+ "epoch": 0.44,
125
+ "grad_norm": 6.005227565765381,
126
+ "learning_rate": 4.7483380816714154e-05,
127
+ "loss": 0.3692,
128
  "step": 170
129
  },
130
  {
131
+ "epoch": 0.46,
132
+ "grad_norm": 4.84128999710083,
133
+ "learning_rate": 4.700854700854701e-05,
134
+ "loss": 0.3457,
135
  "step": 180
136
  },
137
  {
138
+ "epoch": 0.49,
139
+ "grad_norm": 4.650127410888672,
140
+ "learning_rate": 4.653371320037987e-05,
141
+ "loss": 0.3685,
142
  "step": 190
143
  },
144
  {
145
+ "epoch": 0.51,
146
+ "grad_norm": 5.779717445373535,
147
+ "learning_rate": 4.605887939221273e-05,
148
+ "loss": 0.3451,
149
  "step": 200
150
  },
151
  {
152
+ "epoch": 0.54,
153
+ "grad_norm": 6.38178825378418,
154
+ "learning_rate": 4.558404558404559e-05,
155
+ "loss": 0.3394,
156
  "step": 210
157
  },
158
  {
159
+ "epoch": 0.56,
160
+ "grad_norm": 5.148958206176758,
161
+ "learning_rate": 4.510921177587845e-05,
162
+ "loss": 0.3641,
163
  "step": 220
164
  },
165
  {
166
+ "epoch": 0.59,
167
+ "grad_norm": 3.9889109134674072,
168
+ "learning_rate": 4.463437796771131e-05,
169
+ "loss": 0.3108,
170
  "step": 230
171
  },
172
  {
173
+ "epoch": 0.61,
174
+ "grad_norm": 6.51384162902832,
175
+ "learning_rate": 4.415954415954416e-05,
176
+ "loss": 0.2949,
177
  "step": 240
178
  },
179
  {
180
+ "epoch": 0.64,
181
+ "grad_norm": 4.950154781341553,
182
+ "learning_rate": 4.368471035137702e-05,
183
+ "loss": 0.3352,
184
  "step": 250
185
  },
186
  {
187
+ "epoch": 0.67,
188
+ "grad_norm": 6.234986782073975,
189
+ "learning_rate": 4.3209876543209875e-05,
190
+ "loss": 0.309,
191
  "step": 260
192
  },
193
  {
194
+ "epoch": 0.69,
195
+ "grad_norm": 5.57274055480957,
196
+ "learning_rate": 4.2735042735042735e-05,
197
+ "loss": 0.3161,
198
  "step": 270
199
  },
200
  {
201
+ "epoch": 0.72,
202
+ "grad_norm": 5.511316776275635,
203
+ "learning_rate": 4.2260208926875595e-05,
204
+ "loss": 0.3018,
 
 
 
 
 
 
 
 
 
205
  "step": 280
206
  },
207
  {
208
+ "epoch": 0.74,
209
+ "grad_norm": 3.934835910797119,
210
+ "learning_rate": 4.1785375118708455e-05,
211
+ "loss": 0.3116,
212
  "step": 290
213
  },
214
  {
215
+ "epoch": 0.77,
216
+ "grad_norm": 5.323722839355469,
217
+ "learning_rate": 4.131054131054131e-05,
218
+ "loss": 0.3105,
219
  "step": 300
220
  },
221
  {
222
+ "epoch": 0.79,
223
+ "grad_norm": 4.1425909996032715,
224
+ "learning_rate": 4.083570750237417e-05,
225
+ "loss": 0.296,
226
  "step": 310
227
  },
228
  {
229
+ "epoch": 0.82,
230
+ "grad_norm": 5.026642799377441,
231
+ "learning_rate": 4.036087369420703e-05,
232
+ "loss": 0.3594,
233
  "step": 320
234
  },
235
  {
236
+ "epoch": 0.84,
237
+ "grad_norm": 3.949415922164917,
238
+ "learning_rate": 3.988603988603989e-05,
239
+ "loss": 0.2849,
240
  "step": 330
241
  },
242
  {
243
+ "epoch": 0.87,
244
+ "grad_norm": 4.380434989929199,
245
+ "learning_rate": 3.941120607787275e-05,
246
+ "loss": 0.2935,
247
  "step": 340
248
  },
249
  {
250
+ "epoch": 0.9,
251
+ "grad_norm": 4.884699821472168,
252
+ "learning_rate": 3.893637226970561e-05,
253
+ "loss": 0.3128,
254
  "step": 350
255
  },
256
  {
257
+ "epoch": 0.92,
258
+ "grad_norm": 6.950473308563232,
259
+ "learning_rate": 3.846153846153846e-05,
260
+ "loss": 0.3073,
261
  "step": 360
262
  },
263
  {
264
+ "epoch": 0.95,
265
+ "grad_norm": 5.410139560699463,
266
+ "learning_rate": 3.798670465337132e-05,
267
+ "loss": 0.2998,
268
  "step": 370
269
  },
270
  {
271
+ "epoch": 0.97,
272
+ "grad_norm": 4.638174057006836,
273
+ "learning_rate": 3.7511870845204176e-05,
274
+ "loss": 0.3124,
275
  "step": 380
276
  },
277
  {
278
+ "epoch": 1.0,
279
+ "grad_norm": 4.711712837219238,
280
+ "learning_rate": 3.7037037037037037e-05,
281
+ "loss": 0.2682,
282
  "step": 390
283
  },
284
  {
285
+ "epoch": 1.0,
286
+ "eval_accuracy": 0.9713,
287
+ "eval_loss": 0.0821285992860794,
288
+ "eval_runtime": 32.3489,
289
+ "eval_samples_per_second": 309.13,
290
+ "eval_steps_per_second": 9.676,
291
+ "step": 390
292
+ },
293
+ {
294
+ "epoch": 1.02,
295
+ "grad_norm": 4.979706764221191,
296
+ "learning_rate": 3.65622032288699e-05,
297
+ "loss": 0.2836,
298
  "step": 400
299
  },
300
  {
301
+ "epoch": 1.05,
302
+ "grad_norm": 3.807408332824707,
303
+ "learning_rate": 3.608736942070276e-05,
304
+ "loss": 0.2521,
305
  "step": 410
306
  },
307
  {
308
+ "epoch": 1.07,
309
+ "grad_norm": 4.402036190032959,
310
+ "learning_rate": 3.561253561253561e-05,
311
+ "loss": 0.2469,
312
  "step": 420
313
  },
314
  {
315
+ "epoch": 1.1,
316
+ "grad_norm": 3.0667805671691895,
317
+ "learning_rate": 3.513770180436847e-05,
318
+ "loss": 0.2383,
319
  "step": 430
320
  },
321
  {
322
+ "epoch": 1.13,
323
+ "grad_norm": 6.403406620025635,
324
+ "learning_rate": 3.466286799620133e-05,
325
+ "loss": 0.277,
326
  "step": 440
327
  },
328
  {
329
+ "epoch": 1.15,
330
+ "grad_norm": 5.481479167938232,
331
+ "learning_rate": 3.418803418803419e-05,
332
+ "loss": 0.2533,
333
  "step": 450
334
  },
335
  {
336
+ "epoch": 1.18,
337
+ "grad_norm": 5.324038028717041,
338
+ "learning_rate": 3.371320037986705e-05,
339
+ "loss": 0.2813,
340
  "step": 460
341
  },
342
  {
343
+ "epoch": 1.2,
344
+ "grad_norm": 2.4851675033569336,
345
+ "learning_rate": 3.323836657169991e-05,
346
+ "loss": 0.3055,
347
  "step": 470
348
  },
349
  {
350
+ "epoch": 1.23,
351
+ "grad_norm": 4.365708827972412,
352
+ "learning_rate": 3.2763532763532764e-05,
353
+ "loss": 0.2797,
354
  "step": 480
355
  },
356
  {
357
+ "epoch": 1.25,
358
+ "grad_norm": 3.6004509925842285,
359
+ "learning_rate": 3.2288698955365625e-05,
360
+ "loss": 0.2776,
361
  "step": 490
362
  },
363
  {
364
+ "epoch": 1.28,
365
+ "grad_norm": 3.7064690589904785,
366
+ "learning_rate": 3.181386514719848e-05,
367
+ "loss": 0.2605,
368
  "step": 500
369
  },
370
  {
371
+ "epoch": 1.31,
372
+ "grad_norm": 4.883594036102295,
373
+ "learning_rate": 3.133903133903134e-05,
374
+ "loss": 0.2403,
375
  "step": 510
376
  },
377
  {
378
+ "epoch": 1.33,
379
+ "grad_norm": 5.014550685882568,
380
+ "learning_rate": 3.08641975308642e-05,
381
+ "loss": 0.2436,
382
  "step": 520
383
  },
384
  {
385
+ "epoch": 1.36,
386
+ "grad_norm": 4.304734230041504,
387
+ "learning_rate": 3.0389363722697055e-05,
388
+ "loss": 0.2639,
389
  "step": 530
390
  },
391
  {
392
+ "epoch": 1.38,
393
+ "grad_norm": 3.2759273052215576,
394
+ "learning_rate": 2.9914529914529915e-05,
395
+ "loss": 0.2651,
396
  "step": 540
397
  },
398
  {
399
+ "epoch": 1.41,
400
+ "grad_norm": 3.556528091430664,
401
+ "learning_rate": 2.9439696106362775e-05,
402
+ "loss": 0.2359,
 
 
 
 
 
 
 
 
 
403
  "step": 550
404
  },
405
  {
406
+ "epoch": 1.43,
407
+ "grad_norm": 4.672804355621338,
408
+ "learning_rate": 2.8964862298195632e-05,
409
+ "loss": 0.2376,
410
  "step": 560
411
  },
412
  {
413
+ "epoch": 1.46,
414
+ "grad_norm": 4.527768611907959,
415
+ "learning_rate": 2.8490028490028492e-05,
416
+ "loss": 0.2504,
417
  "step": 570
418
  },
419
  {
420
+ "epoch": 1.48,
421
+ "grad_norm": 5.102674961090088,
422
+ "learning_rate": 2.8015194681861352e-05,
423
+ "loss": 0.2604,
424
  "step": 580
425
  },
426
  {
427
+ "epoch": 1.51,
428
+ "grad_norm": 4.18637228012085,
429
+ "learning_rate": 2.754036087369421e-05,
430
+ "loss": 0.2746,
431
  "step": 590
432
  },
433
  {
434
+ "epoch": 1.54,
435
+ "grad_norm": 3.3811545372009277,
436
+ "learning_rate": 2.706552706552707e-05,
437
+ "loss": 0.2548,
438
  "step": 600
439
  },
440
  {
441
+ "epoch": 1.56,
442
+ "grad_norm": 3.1672608852386475,
443
+ "learning_rate": 2.6590693257359926e-05,
444
+ "loss": 0.2423,
445
  "step": 610
446
  },
447
  {
448
+ "epoch": 1.59,
449
+ "grad_norm": 3.609534740447998,
450
+ "learning_rate": 2.611585944919278e-05,
451
+ "loss": 0.1994,
452
  "step": 620
453
  },
454
  {
455
+ "epoch": 1.61,
456
+ "grad_norm": 5.083642482757568,
457
+ "learning_rate": 2.564102564102564e-05,
458
+ "loss": 0.292,
459
  "step": 630
460
  },
461
  {
462
+ "epoch": 1.64,
463
+ "grad_norm": 4.716630935668945,
464
+ "learning_rate": 2.51661918328585e-05,
465
+ "loss": 0.2333,
466
  "step": 640
467
  },
468
  {
469
+ "epoch": 1.66,
470
+ "grad_norm": 4.915971755981445,
471
+ "learning_rate": 2.4691358024691357e-05,
472
+ "loss": 0.2364,
473
  "step": 650
474
  },
475
  {
476
+ "epoch": 1.69,
477
+ "grad_norm": 4.216696739196777,
478
+ "learning_rate": 2.4216524216524217e-05,
479
+ "loss": 0.2503,
480
  "step": 660
481
  },
482
  {
483
+ "epoch": 1.71,
484
+ "grad_norm": 4.966453552246094,
485
+ "learning_rate": 2.3741690408357077e-05,
486
+ "loss": 0.224,
487
  "step": 670
488
  },
489
  {
490
+ "epoch": 1.74,
491
+ "grad_norm": 4.153652191162109,
492
+ "learning_rate": 2.3266856600189934e-05,
493
+ "loss": 0.2109,
494
  "step": 680
495
  },
496
  {
497
+ "epoch": 1.77,
498
+ "grad_norm": 2.9919214248657227,
499
+ "learning_rate": 2.2792022792022794e-05,
500
+ "loss": 0.2371,
501
  "step": 690
502
  },
503
  {
504
+ "epoch": 1.79,
505
+ "grad_norm": 5.105522155761719,
506
+ "learning_rate": 2.2317188983855654e-05,
507
+ "loss": 0.2187,
508
  "step": 700
509
  },
510
  {
511
+ "epoch": 1.82,
512
+ "grad_norm": 3.9936702251434326,
513
+ "learning_rate": 2.184235517568851e-05,
514
+ "loss": 0.2403,
515
  "step": 710
516
  },
517
  {
518
+ "epoch": 1.84,
519
+ "grad_norm": 5.220417022705078,
520
+ "learning_rate": 2.1367521367521368e-05,
521
+ "loss": 0.2383,
522
  "step": 720
523
  },
524
  {
525
+ "epoch": 1.87,
526
+ "grad_norm": 3.134110450744629,
527
+ "learning_rate": 2.0892687559354228e-05,
528
+ "loss": 0.2197,
529
  "step": 730
530
  },
531
  {
532
+ "epoch": 1.89,
533
+ "grad_norm": 5.406284809112549,
534
+ "learning_rate": 2.0417853751187084e-05,
535
+ "loss": 0.224,
536
  "step": 740
537
  },
538
  {
539
+ "epoch": 1.92,
540
+ "grad_norm": 5.273613452911377,
541
+ "learning_rate": 1.9943019943019945e-05,
542
+ "loss": 0.1951,
543
  "step": 750
544
  },
545
  {
546
+ "epoch": 1.94,
547
+ "grad_norm": 4.695704936981201,
548
+ "learning_rate": 1.9468186134852805e-05,
549
+ "loss": 0.2324,
550
  "step": 760
551
  },
552
  {
553
+ "epoch": 1.97,
554
+ "grad_norm": 5.734136581420898,
555
+ "learning_rate": 1.899335232668566e-05,
556
+ "loss": 0.248,
557
  "step": 770
558
  },
559
  {
560
+ "epoch": 2.0,
561
+ "grad_norm": 4.757730960845947,
562
+ "learning_rate": 1.8518518518518518e-05,
563
+ "loss": 0.252,
564
  "step": 780
565
  },
566
  {
567
+ "epoch": 2.0,
568
+ "eval_accuracy": 0.9842,
569
+ "eval_loss": 0.050203289836645126,
570
+ "eval_runtime": 31.7681,
571
+ "eval_samples_per_second": 314.781,
572
+ "eval_steps_per_second": 9.853,
573
+ "step": 781
574
+ },
575
+ {
576
+ "epoch": 2.02,
577
+ "grad_norm": 4.400415897369385,
578
+ "learning_rate": 1.804368471035138e-05,
579
+ "loss": 0.2364,
580
  "step": 790
581
  },
582
  {
583
+ "epoch": 2.05,
584
+ "grad_norm": 4.02318000793457,
585
+ "learning_rate": 1.7568850902184235e-05,
586
+ "loss": 0.2164,
587
  "step": 800
588
  },
589
  {
590
+ "epoch": 2.07,
591
+ "grad_norm": 5.289691925048828,
592
+ "learning_rate": 1.7094017094017095e-05,
593
+ "loss": 0.2472,
594
  "step": 810
595
  },
596
+ {
597
+ "epoch": 2.1,
598
+ "grad_norm": 3.842559337615967,
599
+ "learning_rate": 1.6619183285849956e-05,
600
+ "loss": 0.2209,
601
+ "step": 820
602
+ },
603
+ {
604
+ "epoch": 2.12,
605
+ "grad_norm": 5.64124870300293,
606
+ "learning_rate": 1.6144349477682812e-05,
607
+ "loss": 0.2473,
608
+ "step": 830
609
+ },
610
+ {
611
+ "epoch": 2.15,
612
+ "grad_norm": 4.473005771636963,
613
+ "learning_rate": 1.566951566951567e-05,
614
+ "loss": 0.2287,
615
+ "step": 840
616
+ },
617
+ {
618
+ "epoch": 2.18,
619
+ "grad_norm": 4.476832389831543,
620
+ "learning_rate": 1.5194681861348528e-05,
621
+ "loss": 0.2208,
622
+ "step": 850
623
+ },
624
+ {
625
+ "epoch": 2.2,
626
+ "grad_norm": 4.89029598236084,
627
+ "learning_rate": 1.4719848053181388e-05,
628
+ "loss": 0.2332,
629
+ "step": 860
630
+ },
631
+ {
632
+ "epoch": 2.23,
633
+ "grad_norm": 5.0262627601623535,
634
+ "learning_rate": 1.4245014245014246e-05,
635
+ "loss": 0.2001,
636
+ "step": 870
637
+ },
638
+ {
639
+ "epoch": 2.25,
640
+ "grad_norm": 4.80778169631958,
641
+ "learning_rate": 1.3770180436847105e-05,
642
+ "loss": 0.2018,
643
+ "step": 880
644
+ },
645
+ {
646
+ "epoch": 2.28,
647
+ "grad_norm": 3.203700304031372,
648
+ "learning_rate": 1.3295346628679963e-05,
649
+ "loss": 0.2018,
650
+ "step": 890
651
+ },
652
+ {
653
+ "epoch": 2.3,
654
+ "grad_norm": 4.840007305145264,
655
+ "learning_rate": 1.282051282051282e-05,
656
+ "loss": 0.2754,
657
+ "step": 900
658
+ },
659
+ {
660
+ "epoch": 2.33,
661
+ "grad_norm": 4.501500129699707,
662
+ "learning_rate": 1.2345679012345678e-05,
663
+ "loss": 0.1986,
664
+ "step": 910
665
+ },
666
+ {
667
+ "epoch": 2.35,
668
+ "grad_norm": 3.4367220401763916,
669
+ "learning_rate": 1.1870845204178538e-05,
670
+ "loss": 0.2145,
671
+ "step": 920
672
+ },
673
+ {
674
+ "epoch": 2.38,
675
+ "grad_norm": 5.076148509979248,
676
+ "learning_rate": 1.1396011396011397e-05,
677
+ "loss": 0.2001,
678
+ "step": 930
679
+ },
680
+ {
681
+ "epoch": 2.41,
682
+ "grad_norm": 2.299694299697876,
683
+ "learning_rate": 1.0921177587844255e-05,
684
+ "loss": 0.1935,
685
+ "step": 940
686
+ },
687
+ {
688
+ "epoch": 2.43,
689
+ "grad_norm": 3.9149887561798096,
690
+ "learning_rate": 1.0446343779677114e-05,
691
+ "loss": 0.2158,
692
+ "step": 950
693
+ },
694
+ {
695
+ "epoch": 2.46,
696
+ "grad_norm": 5.087822437286377,
697
+ "learning_rate": 9.971509971509972e-06,
698
+ "loss": 0.2052,
699
+ "step": 960
700
+ },
701
+ {
702
+ "epoch": 2.48,
703
+ "grad_norm": 3.8664252758026123,
704
+ "learning_rate": 9.49667616334283e-06,
705
+ "loss": 0.2056,
706
+ "step": 970
707
+ },
708
+ {
709
+ "epoch": 2.51,
710
+ "grad_norm": 4.961308479309082,
711
+ "learning_rate": 9.02184235517569e-06,
712
+ "loss": 0.2157,
713
+ "step": 980
714
+ },
715
+ {
716
+ "epoch": 2.53,
717
+ "grad_norm": 4.553085803985596,
718
+ "learning_rate": 8.547008547008548e-06,
719
+ "loss": 0.2079,
720
+ "step": 990
721
+ },
722
+ {
723
+ "epoch": 2.56,
724
+ "grad_norm": 3.396073818206787,
725
+ "learning_rate": 8.072174738841406e-06,
726
+ "loss": 0.1919,
727
+ "step": 1000
728
+ },
729
+ {
730
+ "epoch": 2.58,
731
+ "grad_norm": 5.101813316345215,
732
+ "learning_rate": 7.597340930674264e-06,
733
+ "loss": 0.1812,
734
+ "step": 1010
735
+ },
736
+ {
737
+ "epoch": 2.61,
738
+ "grad_norm": 5.189543724060059,
739
+ "learning_rate": 7.122507122507123e-06,
740
+ "loss": 0.2123,
741
+ "step": 1020
742
+ },
743
+ {
744
+ "epoch": 2.64,
745
+ "grad_norm": 4.240951061248779,
746
+ "learning_rate": 6.6476733143399815e-06,
747
+ "loss": 0.218,
748
+ "step": 1030
749
+ },
750
+ {
751
+ "epoch": 2.66,
752
+ "grad_norm": 3.139678716659546,
753
+ "learning_rate": 6.172839506172839e-06,
754
+ "loss": 0.1824,
755
+ "step": 1040
756
+ },
757
+ {
758
+ "epoch": 2.69,
759
+ "grad_norm": 2.84495210647583,
760
+ "learning_rate": 5.6980056980056985e-06,
761
+ "loss": 0.1916,
762
+ "step": 1050
763
+ },
764
+ {
765
+ "epoch": 2.71,
766
+ "grad_norm": 3.4231679439544678,
767
+ "learning_rate": 5.223171889838557e-06,
768
+ "loss": 0.1953,
769
+ "step": 1060
770
+ },
771
+ {
772
+ "epoch": 2.74,
773
+ "grad_norm": 4.120250225067139,
774
+ "learning_rate": 4.748338081671415e-06,
775
+ "loss": 0.2046,
776
+ "step": 1070
777
+ },
778
+ {
779
+ "epoch": 2.76,
780
+ "grad_norm": 5.0515055656433105,
781
+ "learning_rate": 4.273504273504274e-06,
782
+ "loss": 0.1941,
783
+ "step": 1080
784
+ },
785
+ {
786
+ "epoch": 2.79,
787
+ "grad_norm": 4.197494029998779,
788
+ "learning_rate": 3.798670465337132e-06,
789
+ "loss": 0.1908,
790
+ "step": 1090
791
+ },
792
+ {
793
+ "epoch": 2.82,
794
+ "grad_norm": 5.316411972045898,
795
+ "learning_rate": 3.3238366571699908e-06,
796
+ "loss": 0.1954,
797
+ "step": 1100
798
+ },
799
+ {
800
+ "epoch": 2.84,
801
+ "grad_norm": 2.8527472019195557,
802
+ "learning_rate": 2.8490028490028492e-06,
803
+ "loss": 0.1466,
804
+ "step": 1110
805
+ },
806
+ {
807
+ "epoch": 2.87,
808
+ "grad_norm": 4.074756622314453,
809
+ "learning_rate": 2.3741690408357077e-06,
810
+ "loss": 0.2092,
811
+ "step": 1120
812
+ },
813
+ {
814
+ "epoch": 2.89,
815
+ "grad_norm": 4.336794853210449,
816
+ "learning_rate": 1.899335232668566e-06,
817
+ "loss": 0.1742,
818
+ "step": 1130
819
+ },
820
+ {
821
+ "epoch": 2.92,
822
+ "grad_norm": 3.8528594970703125,
823
+ "learning_rate": 1.4245014245014246e-06,
824
+ "loss": 0.2002,
825
+ "step": 1140
826
+ },
827
+ {
828
+ "epoch": 2.94,
829
+ "grad_norm": 4.159782409667969,
830
+ "learning_rate": 9.49667616334283e-07,
831
+ "loss": 0.1847,
832
+ "step": 1150
833
+ },
834
+ {
835
+ "epoch": 2.97,
836
+ "grad_norm": 5.317773342132568,
837
+ "learning_rate": 4.748338081671415e-07,
838
+ "loss": 0.2293,
839
+ "step": 1160
840
+ },
841
+ {
842
+ "epoch": 2.99,
843
+ "grad_norm": 4.241806983947754,
844
+ "learning_rate": 0.0,
845
+ "loss": 0.173,
846
+ "step": 1170
847
+ },
848
  {
849
  "epoch": 2.99,
850
+ "eval_accuracy": 0.9844,
851
+ "eval_loss": 0.04590694606304169,
852
+ "eval_runtime": 32.5039,
853
+ "eval_samples_per_second": 307.655,
854
+ "eval_steps_per_second": 9.63,
855
+ "step": 1170
856
  },
857
  {
858
  "epoch": 2.99,
859
+ "step": 1170,
860
+ "total_flos": 2.930358373492064e+18,
861
+ "train_loss": 0.33610059461023056,
862
+ "train_runtime": 935.657,
863
+ "train_samples_per_second": 160.315,
864
+ "train_steps_per_second": 1.25
865
  }
866
  ],
867
  "logging_steps": 10,
868
+ "max_steps": 1170,
869
  "num_input_tokens_seen": 0,
870
  "num_train_epochs": 3,
871
  "save_steps": 500,
872
+ "total_flos": 2.930358373492064e+18,
873
  "train_batch_size": 32,
874
  "trial_name": null,
875
  "trial_params": null