Mariofm02 committed on
Commit
5aa1fd4
1 Parent(s): b397482

🍻 cheers

Browse files
README.md CHANGED
@@ -2,6 +2,7 @@
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21K
4
  tags:
 
5
  - generated_from_trainer
6
  metrics:
7
  - accuracy
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # finetuned-cards-blackjack
17
 
18
- This model is a fine-tuned version of [google/vit-base-patch16-224-in21K](https://huggingface.co/google/vit-base-patch16-224-in21K) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
  - Loss: 0.5081
21
  - Accuracy: 0.8696
 
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21K
4
  tags:
5
+ - image-classification
6
  - generated_from_trainer
7
  metrics:
8
  - accuracy
 
16
 
17
  # finetuned-cards-blackjack
18
 
19
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21K](https://huggingface.co/google/vit-base-patch16-224-in21K) on the card_images dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 0.5081
22
  - Accuracy: 0.8696
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 4.0,
3
- "eval_accuracy": 0.7710120068610634,
4
- "eval_loss": 0.912144124507904,
5
- "eval_runtime": 6.7842,
6
- "eval_samples_per_second": 171.871,
7
- "eval_steps_per_second": 21.521,
8
- "total_flos": 2.047635634195759e+18,
9
- "train_loss": 1.8191430680543978,
10
- "train_runtime": 594.0822,
11
- "train_samples_per_second": 44.458,
12
- "train_steps_per_second": 2.781
13
  }
 
1
  {
2
+ "epoch": 7.0,
3
+ "eval_accuracy": 0.869639794168096,
4
+ "eval_loss": 0.5081329345703125,
5
+ "eval_runtime": 6.0627,
6
+ "eval_samples_per_second": 192.325,
7
+ "eval_steps_per_second": 24.082,
8
+ "total_flos": 3.5833623598425784e+18,
9
+ "train_loss": 0.7298227465279536,
10
+ "train_runtime": 1041.6701,
11
+ "train_samples_per_second": 44.372,
12
+ "train_steps_per_second": 2.775
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 4.0,
3
- "eval_accuracy": 0.7710120068610634,
4
- "eval_loss": 0.912144124507904,
5
- "eval_runtime": 6.7842,
6
- "eval_samples_per_second": 171.871,
7
- "eval_steps_per_second": 21.521
8
  }
 
1
  {
2
+ "epoch": 7.0,
3
+ "eval_accuracy": 0.869639794168096,
4
+ "eval_loss": 0.5081329345703125,
5
+ "eval_runtime": 6.0627,
6
+ "eval_samples_per_second": 192.325,
7
+ "eval_steps_per_second": 24.082
8
  }
runs/Mar29_20-03-14_44990517b672/events.out.tfevents.1711743718.44990517b672.3784.4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a7ba40417f5c4964f1cd809b096f6c87c8d62f1075104134c45c2c03723fd3b
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 4.0,
3
- "total_flos": 2.047635634195759e+18,
4
- "train_loss": 1.8191430680543978,
5
- "train_runtime": 594.0822,
6
- "train_samples_per_second": 44.458,
7
- "train_steps_per_second": 2.781
8
  }
 
1
  {
2
+ "epoch": 7.0,
3
+ "total_flos": 3.5833623598425784e+18,
4
+ "train_loss": 0.7298227465279536,
5
+ "train_runtime": 1041.6701,
6
+ "train_samples_per_second": 44.372,
7
+ "train_steps_per_second": 2.775
8
  }
trainer_state.json CHANGED
@@ -1,1328 +1,2304 @@
1
  {
2
- "best_metric": 0.912144124507904,
3
- "best_model_checkpoint": "finetuned-cards-blackjack/checkpoint-1600",
4
- "epoch": 4.0,
5
  "eval_steps": 100,
6
- "global_step": 1652,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.02,
13
- "grad_norm": 1.466597557067871,
14
- "learning_rate": 0.0001987893462469734,
15
- "loss": 3.9543,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.05,
20
- "grad_norm": 1.9476360082626343,
21
- "learning_rate": 0.00019757869249394675,
22
- "loss": 3.8868,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.07,
27
- "grad_norm": 1.6487232446670532,
28
- "learning_rate": 0.0001963680387409201,
29
- "loss": 3.8185,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.1,
34
- "grad_norm": 1.8101606369018555,
35
- "learning_rate": 0.00019515738498789345,
36
- "loss": 3.6559,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.12,
41
- "grad_norm": 1.7900973558425903,
42
- "learning_rate": 0.00019394673123486684,
43
- "loss": 3.559,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.15,
48
- "grad_norm": 1.7922214269638062,
49
- "learning_rate": 0.0001927360774818402,
50
- "loss": 3.4135,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.17,
55
- "grad_norm": 1.9818700551986694,
56
- "learning_rate": 0.00019152542372881357,
57
- "loss": 3.3906,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.19,
62
- "grad_norm": 1.9315565824508667,
63
- "learning_rate": 0.00019031476997578695,
64
- "loss": 3.3191,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.22,
69
- "grad_norm": 1.9850099086761475,
70
- "learning_rate": 0.0001891041162227603,
71
- "loss": 3.2122,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.24,
76
- "grad_norm": 1.9584887027740479,
77
- "learning_rate": 0.00018789346246973366,
78
- "loss": 3.1307,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.24,
83
- "eval_accuracy": 0.20926243567753003,
84
- "eval_loss": 3.017998456954956,
85
- "eval_runtime": 6.19,
86
- "eval_samples_per_second": 188.367,
87
- "eval_steps_per_second": 23.586,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 0.27,
92
- "grad_norm": 2.39744234085083,
93
- "learning_rate": 0.00018668280871670702,
94
- "loss": 3.0667,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 0.29,
99
- "grad_norm": 1.891518473625183,
100
- "learning_rate": 0.0001854721549636804,
101
- "loss": 2.984,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 0.31,
106
- "grad_norm": 1.9065883159637451,
107
- "learning_rate": 0.00018426150121065375,
108
- "loss": 2.8457,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 0.34,
113
- "grad_norm": 2.126429796218872,
114
- "learning_rate": 0.00018305084745762714,
115
- "loss": 2.9638,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 0.36,
120
- "grad_norm": 1.9387011528015137,
121
- "learning_rate": 0.00018184019370460052,
122
- "loss": 2.7348,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 0.39,
127
- "grad_norm": 1.923202633857727,
128
- "learning_rate": 0.00018062953995157384,
129
- "loss": 2.8489,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 0.41,
134
- "grad_norm": 2.581446409225464,
135
- "learning_rate": 0.00017941888619854723,
136
- "loss": 2.7744,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 0.44,
141
- "grad_norm": 1.6987770795822144,
142
- "learning_rate": 0.00017820823244552058,
143
- "loss": 2.6428,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 0.46,
148
- "grad_norm": 1.9667104482650757,
149
- "learning_rate": 0.00017699757869249396,
150
- "loss": 2.6952,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 0.48,
155
- "grad_norm": 3.4282023906707764,
156
- "learning_rate": 0.00017578692493946732,
157
- "loss": 2.7372,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 0.48,
162
- "eval_accuracy": 0.27615780445969124,
163
- "eval_loss": 2.530054807662964,
164
- "eval_runtime": 6.2338,
165
- "eval_samples_per_second": 187.045,
166
- "eval_steps_per_second": 23.421,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 0.51,
171
- "grad_norm": 1.9124583005905151,
172
- "learning_rate": 0.0001745762711864407,
173
- "loss": 2.6423,
174
  "step": 210
175
  },
176
  {
177
  "epoch": 0.53,
178
- "grad_norm": 2.4269683361053467,
179
- "learning_rate": 0.00017336561743341405,
180
- "loss": 2.668,
181
  "step": 220
182
  },
183
  {
184
  "epoch": 0.56,
185
- "grad_norm": 1.9838333129882812,
186
- "learning_rate": 0.0001721549636803874,
187
- "loss": 2.5786,
188
  "step": 230
189
  },
190
  {
191
  "epoch": 0.58,
192
- "grad_norm": 3.200087070465088,
193
- "learning_rate": 0.0001709443099273608,
194
- "loss": 2.5938,
195
  "step": 240
196
  },
197
  {
198
  "epoch": 0.61,
199
- "grad_norm": 2.93118953704834,
200
- "learning_rate": 0.00016973365617433414,
201
- "loss": 2.4526,
202
  "step": 250
203
  },
204
  {
205
  "epoch": 0.63,
206
- "grad_norm": 2.555947780609131,
207
- "learning_rate": 0.00016852300242130752,
208
- "loss": 2.41,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 0.65,
213
- "grad_norm": 2.9446065425872803,
214
- "learning_rate": 0.00016731234866828088,
215
- "loss": 2.4537,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 0.68,
220
- "grad_norm": 3.393993377685547,
221
- "learning_rate": 0.00016610169491525423,
222
- "loss": 2.4256,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 0.7,
227
- "grad_norm": 2.721825122833252,
228
- "learning_rate": 0.00016489104116222762,
229
- "loss": 2.4719,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 0.73,
234
- "grad_norm": 3.2610368728637695,
235
- "learning_rate": 0.00016368038740920097,
236
- "loss": 2.4969,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 0.73,
241
- "eval_accuracy": 0.3439108061749571,
242
- "eval_loss": 2.175961971282959,
243
- "eval_runtime": 6.2327,
244
- "eval_samples_per_second": 187.079,
245
- "eval_steps_per_second": 23.425,
246
  "step": 300
247
  },
248
  {
249
  "epoch": 0.75,
250
- "grad_norm": 3.067995309829712,
251
- "learning_rate": 0.00016246973365617435,
252
- "loss": 2.4904,
253
  "step": 310
254
  },
255
  {
256
  "epoch": 0.77,
257
- "grad_norm": 2.7957141399383545,
258
- "learning_rate": 0.0001612590799031477,
259
- "loss": 2.3913,
260
  "step": 320
261
  },
262
  {
263
  "epoch": 0.8,
264
- "grad_norm": 2.281586170196533,
265
- "learning_rate": 0.0001600484261501211,
266
- "loss": 2.1749,
267
  "step": 330
268
  },
269
  {
270
  "epoch": 0.82,
271
- "grad_norm": 2.4833972454071045,
272
- "learning_rate": 0.00015883777239709444,
273
- "loss": 2.4058,
274
  "step": 340
275
  },
276
  {
277
  "epoch": 0.85,
278
- "grad_norm": 2.5052073001861572,
279
- "learning_rate": 0.0001576271186440678,
280
- "loss": 2.3236,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 0.87,
285
- "grad_norm": 2.479684352874756,
286
- "learning_rate": 0.00015641646489104115,
287
- "loss": 2.373,
288
  "step": 360
289
  },
290
  {
291
  "epoch": 0.9,
292
- "grad_norm": 3.6352992057800293,
293
- "learning_rate": 0.00015520581113801453,
294
- "loss": 2.3282,
295
  "step": 370
296
  },
297
  {
298
  "epoch": 0.92,
299
- "grad_norm": 2.748934030532837,
300
- "learning_rate": 0.00015399515738498791,
301
- "loss": 2.2062,
302
  "step": 380
303
  },
304
  {
305
  "epoch": 0.94,
306
- "grad_norm": 2.0645978450775146,
307
- "learning_rate": 0.00015278450363196127,
308
- "loss": 2.1253,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 0.97,
313
- "grad_norm": 2.2856009006500244,
314
- "learning_rate": 0.00015157384987893465,
315
- "loss": 2.1973,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 0.97,
320
- "eval_accuracy": 0.37564322469982847,
321
- "eval_loss": 2.0102577209472656,
322
- "eval_runtime": 5.9741,
323
- "eval_samples_per_second": 195.175,
324
- "eval_steps_per_second": 24.439,
325
  "step": 400
326
  },
327
  {
328
  "epoch": 0.99,
329
- "grad_norm": 2.866960048675537,
330
- "learning_rate": 0.00015036319612590798,
331
- "loss": 2.214,
332
  "step": 410
333
  },
334
  {
335
  "epoch": 1.02,
336
- "grad_norm": 3.171844482421875,
337
- "learning_rate": 0.00014915254237288136,
338
- "loss": 2.0948,
339
  "step": 420
340
  },
341
  {
342
  "epoch": 1.04,
343
- "grad_norm": 3.6916253566741943,
344
- "learning_rate": 0.00014794188861985471,
345
- "loss": 2.0649,
346
  "step": 430
347
  },
348
  {
349
  "epoch": 1.07,
350
- "grad_norm": 2.3281314373016357,
351
- "learning_rate": 0.0001467312348668281,
352
- "loss": 2.0633,
353
  "step": 440
354
  },
355
  {
356
  "epoch": 1.09,
357
- "grad_norm": 3.370180368423462,
358
- "learning_rate": 0.00014552058111380148,
359
- "loss": 1.9949,
360
  "step": 450
361
  },
362
  {
363
  "epoch": 1.11,
364
- "grad_norm": 2.5389626026153564,
365
- "learning_rate": 0.00014430992736077483,
366
- "loss": 2.086,
367
  "step": 460
368
  },
369
  {
370
  "epoch": 1.14,
371
- "grad_norm": 2.47526216506958,
372
- "learning_rate": 0.00014309927360774819,
373
- "loss": 2.0443,
374
  "step": 470
375
  },
376
  {
377
  "epoch": 1.16,
378
- "grad_norm": 2.821577548980713,
379
- "learning_rate": 0.00014188861985472154,
380
- "loss": 2.0808,
381
  "step": 480
382
  },
383
  {
384
  "epoch": 1.19,
385
- "grad_norm": 2.978994369506836,
386
- "learning_rate": 0.00014067796610169492,
387
- "loss": 2.1278,
388
  "step": 490
389
  },
390
  {
391
  "epoch": 1.21,
392
- "grad_norm": 3.1431379318237305,
393
- "learning_rate": 0.00013946731234866828,
394
- "loss": 1.8847,
395
  "step": 500
396
  },
397
  {
398
  "epoch": 1.21,
399
- "eval_accuracy": 0.41080617495711835,
400
- "eval_loss": 1.8402307033538818,
401
- "eval_runtime": 6.2119,
402
- "eval_samples_per_second": 187.705,
403
- "eval_steps_per_second": 23.503,
404
  "step": 500
405
  },
406
  {
407
  "epoch": 1.23,
408
- "grad_norm": 3.1350502967834473,
409
- "learning_rate": 0.00013825665859564166,
410
- "loss": 2.02,
411
  "step": 510
412
  },
413
  {
414
  "epoch": 1.26,
415
- "grad_norm": 2.63952374458313,
416
- "learning_rate": 0.00013704600484261504,
417
- "loss": 2.1684,
418
  "step": 520
419
  },
420
  {
421
  "epoch": 1.28,
422
- "grad_norm": 2.7914199829101562,
423
- "learning_rate": 0.00013583535108958837,
424
- "loss": 1.8532,
425
  "step": 530
426
  },
427
  {
428
  "epoch": 1.31,
429
- "grad_norm": 4.124698638916016,
430
- "learning_rate": 0.00013462469733656175,
431
- "loss": 1.9593,
432
  "step": 540
433
  },
434
  {
435
  "epoch": 1.33,
436
- "grad_norm": 3.0953214168548584,
437
- "learning_rate": 0.0001334140435835351,
438
- "loss": 2.0143,
439
  "step": 550
440
  },
441
  {
442
  "epoch": 1.36,
443
- "grad_norm": 3.626241683959961,
444
- "learning_rate": 0.00013220338983050849,
445
- "loss": 2.0349,
446
  "step": 560
447
  },
448
  {
449
  "epoch": 1.38,
450
- "grad_norm": 3.22306752204895,
451
- "learning_rate": 0.00013099273607748184,
452
- "loss": 1.9283,
453
  "step": 570
454
  },
455
  {
456
  "epoch": 1.4,
457
- "grad_norm": 2.6860299110412598,
458
- "learning_rate": 0.00012978208232445522,
459
- "loss": 1.9022,
460
  "step": 580
461
  },
462
  {
463
  "epoch": 1.43,
464
- "grad_norm": 3.2099533081054688,
465
- "learning_rate": 0.00012857142857142858,
466
- "loss": 1.9102,
467
  "step": 590
468
  },
469
  {
470
  "epoch": 1.45,
471
- "grad_norm": 2.5889129638671875,
472
- "learning_rate": 0.00012736077481840193,
473
- "loss": 1.746,
474
  "step": 600
475
  },
476
  {
477
  "epoch": 1.45,
478
- "eval_accuracy": 0.48027444253859347,
479
- "eval_loss": 1.7051318883895874,
480
- "eval_runtime": 6.0914,
481
- "eval_samples_per_second": 191.418,
482
- "eval_steps_per_second": 23.968,
483
  "step": 600
484
  },
485
  {
486
  "epoch": 1.48,
487
- "grad_norm": 2.6496353149414062,
488
- "learning_rate": 0.0001261501210653753,
489
- "loss": 1.8087,
490
  "step": 610
491
  },
492
  {
493
  "epoch": 1.5,
494
- "grad_norm": 2.2695322036743164,
495
- "learning_rate": 0.00012493946731234867,
496
- "loss": 1.9172,
497
  "step": 620
498
  },
499
  {
500
  "epoch": 1.53,
501
- "grad_norm": 3.144073724746704,
502
- "learning_rate": 0.00012372881355932205,
503
- "loss": 1.8943,
504
  "step": 630
505
  },
506
  {
507
  "epoch": 1.55,
508
- "grad_norm": 2.9001333713531494,
509
- "learning_rate": 0.0001225181598062954,
510
- "loss": 1.9463,
511
  "step": 640
512
  },
513
  {
514
  "epoch": 1.57,
515
- "grad_norm": 2.5096278190612793,
516
- "learning_rate": 0.00012130750605326877,
517
- "loss": 1.8045,
518
  "step": 650
519
  },
520
  {
521
  "epoch": 1.6,
522
- "grad_norm": 2.2238059043884277,
523
- "learning_rate": 0.00012009685230024215,
524
- "loss": 1.9322,
525
  "step": 660
526
  },
527
  {
528
  "epoch": 1.62,
529
- "grad_norm": 2.7545368671417236,
530
- "learning_rate": 0.00011888619854721549,
531
- "loss": 1.7305,
532
  "step": 670
533
  },
534
  {
535
  "epoch": 1.65,
536
- "grad_norm": 2.8309366703033447,
537
- "learning_rate": 0.00011767554479418887,
538
- "loss": 1.8587,
539
  "step": 680
540
  },
541
  {
542
  "epoch": 1.67,
543
- "grad_norm": 5.093832492828369,
544
- "learning_rate": 0.00011646489104116223,
545
- "loss": 1.8362,
546
  "step": 690
547
  },
548
  {
549
  "epoch": 1.69,
550
- "grad_norm": 2.4374847412109375,
551
- "learning_rate": 0.0001152542372881356,
552
- "loss": 1.8698,
553
  "step": 700
554
  },
555
  {
556
  "epoch": 1.69,
557
- "eval_accuracy": 0.4888507718696398,
558
- "eval_loss": 1.5985045433044434,
559
- "eval_runtime": 6.4332,
560
- "eval_samples_per_second": 181.249,
561
- "eval_steps_per_second": 22.695,
562
  "step": 700
563
  },
564
  {
565
  "epoch": 1.72,
566
- "grad_norm": 2.8519837856292725,
567
- "learning_rate": 0.00011404358353510895,
568
- "loss": 1.8736,
569
  "step": 710
570
  },
571
  {
572
  "epoch": 1.74,
573
- "grad_norm": 2.8379719257354736,
574
- "learning_rate": 0.00011283292978208233,
575
- "loss": 1.6395,
576
  "step": 720
577
  },
578
  {
579
  "epoch": 1.77,
580
- "grad_norm": 3.884648323059082,
581
- "learning_rate": 0.00011174334140435836,
582
- "loss": 1.7938,
583
  "step": 730
584
  },
585
  {
586
  "epoch": 1.79,
587
- "grad_norm": 3.2592883110046387,
588
- "learning_rate": 0.00011053268765133173,
589
- "loss": 1.6813,
590
  "step": 740
591
  },
592
  {
593
  "epoch": 1.82,
594
- "grad_norm": 5.118261337280273,
595
- "learning_rate": 0.00010932203389830508,
596
- "loss": 1.9414,
597
  "step": 750
598
  },
599
  {
600
  "epoch": 1.84,
601
- "grad_norm": 2.822026491165161,
602
- "learning_rate": 0.00010811138014527846,
603
- "loss": 1.7598,
604
  "step": 760
605
  },
606
  {
607
  "epoch": 1.86,
608
- "grad_norm": 2.8540070056915283,
609
- "learning_rate": 0.00010690072639225182,
610
- "loss": 1.7024,
611
  "step": 770
612
  },
613
  {
614
  "epoch": 1.89,
615
- "grad_norm": 4.354470252990723,
616
- "learning_rate": 0.00010569007263922519,
617
- "loss": 1.8987,
618
  "step": 780
619
  },
620
  {
621
  "epoch": 1.91,
622
- "grad_norm": 3.528857707977295,
623
- "learning_rate": 0.00010447941888619854,
624
- "loss": 1.7933,
625
  "step": 790
626
  },
627
  {
628
  "epoch": 1.94,
629
- "grad_norm": 2.76985764503479,
630
- "learning_rate": 0.00010326876513317192,
631
- "loss": 1.7261,
632
  "step": 800
633
  },
634
  {
635
  "epoch": 1.94,
636
- "eval_accuracy": 0.5840480274442539,
637
- "eval_loss": 1.4311938285827637,
638
- "eval_runtime": 6.2955,
639
- "eval_samples_per_second": 185.213,
640
- "eval_steps_per_second": 23.191,
641
  "step": 800
642
  },
643
  {
644
  "epoch": 1.96,
645
- "grad_norm": 3.15104079246521,
646
- "learning_rate": 0.00010205811138014529,
647
- "loss": 1.8079,
648
  "step": 810
649
  },
650
  {
651
  "epoch": 1.99,
652
- "grad_norm": 3.0211942195892334,
653
- "learning_rate": 0.00010084745762711865,
654
- "loss": 1.611,
655
  "step": 820
656
  },
657
  {
658
  "epoch": 2.01,
659
- "grad_norm": 2.527198076248169,
660
- "learning_rate": 9.963680387409201e-05,
661
- "loss": 1.7344,
662
  "step": 830
663
  },
664
  {
665
  "epoch": 2.03,
666
- "grad_norm": 3.654705762863159,
667
- "learning_rate": 9.842615012106537e-05,
668
- "loss": 1.5921,
669
  "step": 840
670
  },
671
  {
672
  "epoch": 2.06,
673
- "grad_norm": 2.6901042461395264,
674
- "learning_rate": 9.721549636803875e-05,
675
- "loss": 1.5688,
676
  "step": 850
677
  },
678
  {
679
  "epoch": 2.08,
680
- "grad_norm": 2.830200672149658,
681
- "learning_rate": 9.600484261501212e-05,
682
- "loss": 1.5546,
683
  "step": 860
684
  },
685
  {
686
  "epoch": 2.11,
687
- "grad_norm": 3.2287344932556152,
688
- "learning_rate": 9.479418886198547e-05,
689
- "loss": 1.5714,
690
  "step": 870
691
  },
692
  {
693
  "epoch": 2.13,
694
- "grad_norm": 3.661449432373047,
695
- "learning_rate": 9.358353510895884e-05,
696
- "loss": 1.4973,
697
  "step": 880
698
  },
699
  {
700
  "epoch": 2.15,
701
- "grad_norm": 6.353243827819824,
702
- "learning_rate": 9.237288135593221e-05,
703
- "loss": 1.5294,
704
  "step": 890
705
  },
706
  {
707
  "epoch": 2.18,
708
- "grad_norm": 3.703733444213867,
709
- "learning_rate": 9.116222760290558e-05,
710
- "loss": 1.7385,
711
  "step": 900
712
  },
713
  {
714
  "epoch": 2.18,
715
- "eval_accuracy": 0.6286449399656947,
716
- "eval_loss": 1.3585376739501953,
717
- "eval_runtime": 5.9781,
718
- "eval_samples_per_second": 195.046,
719
- "eval_steps_per_second": 24.423,
720
  "step": 900
721
  },
722
  {
723
  "epoch": 2.2,
724
- "grad_norm": 2.730365514755249,
725
- "learning_rate": 8.995157384987893e-05,
726
- "loss": 1.626,
727
  "step": 910
728
  },
729
  {
730
  "epoch": 2.23,
731
- "grad_norm": 4.335669040679932,
732
- "learning_rate": 8.874092009685231e-05,
733
- "loss": 1.5823,
734
  "step": 920
735
  },
736
  {
737
  "epoch": 2.25,
738
- "grad_norm": 2.272915840148926,
739
- "learning_rate": 8.753026634382567e-05,
740
- "loss": 1.47,
741
  "step": 930
742
  },
743
  {
744
  "epoch": 2.28,
745
- "grad_norm": 3.335453510284424,
746
- "learning_rate": 8.631961259079904e-05,
747
- "loss": 1.4733,
748
  "step": 940
749
  },
750
  {
751
  "epoch": 2.3,
752
- "grad_norm": 5.18184232711792,
753
- "learning_rate": 8.51089588377724e-05,
754
- "loss": 1.3798,
755
  "step": 950
756
  },
757
  {
758
  "epoch": 2.32,
759
- "grad_norm": 3.79761004447937,
760
- "learning_rate": 8.389830508474577e-05,
761
- "loss": 1.5103,
762
  "step": 960
763
  },
764
  {
765
  "epoch": 2.35,
766
- "grad_norm": 2.568056344985962,
767
- "learning_rate": 8.268765133171913e-05,
768
- "loss": 1.5016,
769
  "step": 970
770
  },
771
  {
772
  "epoch": 2.37,
773
- "grad_norm": 4.231459140777588,
774
- "learning_rate": 8.14769975786925e-05,
775
- "loss": 1.4617,
776
  "step": 980
777
  },
778
  {
779
  "epoch": 2.4,
780
- "grad_norm": 3.2914044857025146,
781
- "learning_rate": 8.026634382566586e-05,
782
- "loss": 1.5527,
783
  "step": 990
784
  },
785
  {
786
  "epoch": 2.42,
787
- "grad_norm": 2.967702627182007,
788
- "learning_rate": 7.905569007263923e-05,
789
- "loss": 1.5873,
790
  "step": 1000
791
  },
792
  {
793
  "epoch": 2.42,
794
- "eval_accuracy": 0.6758147512864494,
795
- "eval_loss": 1.2374264001846313,
796
- "eval_runtime": 6.2974,
797
- "eval_samples_per_second": 185.155,
798
- "eval_steps_per_second": 23.184,
799
  "step": 1000
800
  },
801
  {
802
  "epoch": 2.45,
803
- "grad_norm": 2.7834739685058594,
804
- "learning_rate": 7.78450363196126e-05,
805
- "loss": 1.4255,
806
  "step": 1010
807
  },
808
  {
809
  "epoch": 2.47,
810
- "grad_norm": 3.380810260772705,
811
- "learning_rate": 7.663438256658597e-05,
812
- "loss": 1.4528,
813
  "step": 1020
814
  },
815
  {
816
  "epoch": 2.49,
817
- "grad_norm": 3.3973748683929443,
818
- "learning_rate": 7.542372881355932e-05,
819
- "loss": 1.5726,
820
  "step": 1030
821
  },
822
  {
823
  "epoch": 2.52,
824
- "grad_norm": 2.9069502353668213,
825
- "learning_rate": 7.421307506053269e-05,
826
- "loss": 1.2987,
827
  "step": 1040
828
  },
829
  {
830
  "epoch": 2.54,
831
- "grad_norm": 2.8832297325134277,
832
- "learning_rate": 7.300242130750606e-05,
833
- "loss": 1.437,
834
  "step": 1050
835
  },
836
  {
837
  "epoch": 2.57,
838
- "grad_norm": 3.137310743331909,
839
- "learning_rate": 7.179176755447942e-05,
840
- "loss": 1.5,
841
  "step": 1060
842
  },
843
  {
844
  "epoch": 2.59,
845
- "grad_norm": 3.156430244445801,
846
- "learning_rate": 7.058111380145279e-05,
847
- "loss": 1.341,
848
  "step": 1070
849
  },
850
  {
851
  "epoch": 2.62,
852
- "grad_norm": 3.470303535461426,
853
- "learning_rate": 6.937046004842616e-05,
854
- "loss": 1.3986,
855
  "step": 1080
856
  },
857
  {
858
  "epoch": 2.64,
859
- "grad_norm": 3.426010847091675,
860
- "learning_rate": 6.815980629539952e-05,
861
- "loss": 1.3874,
862
  "step": 1090
863
  },
864
  {
865
  "epoch": 2.66,
866
- "grad_norm": 3.8181042671203613,
867
- "learning_rate": 6.694915254237288e-05,
868
- "loss": 1.4775,
869
  "step": 1100
870
  },
871
  {
872
  "epoch": 2.66,
873
- "eval_accuracy": 0.7024013722126929,
874
- "eval_loss": 1.1351556777954102,
875
- "eval_runtime": 6.2887,
876
- "eval_samples_per_second": 185.412,
877
- "eval_steps_per_second": 23.216,
878
  "step": 1100
879
  },
880
  {
881
  "epoch": 2.69,
882
- "grad_norm": 3.4228086471557617,
883
- "learning_rate": 6.573849878934625e-05,
884
- "loss": 1.4804,
885
  "step": 1110
886
  },
887
  {
888
  "epoch": 2.71,
889
- "grad_norm": 4.945833206176758,
890
- "learning_rate": 6.45278450363196e-05,
891
- "loss": 1.2617,
892
  "step": 1120
893
  },
894
  {
895
  "epoch": 2.74,
896
- "grad_norm": 2.712095022201538,
897
- "learning_rate": 6.331719128329297e-05,
898
- "loss": 1.4254,
899
  "step": 1130
900
  },
901
  {
902
  "epoch": 2.76,
903
- "grad_norm": 3.2312748432159424,
904
- "learning_rate": 6.210653753026636e-05,
905
- "loss": 1.4141,
906
  "step": 1140
907
  },
908
  {
909
  "epoch": 2.78,
910
- "grad_norm": 2.4630300998687744,
911
- "learning_rate": 6.089588377723972e-05,
912
- "loss": 1.3438,
913
  "step": 1150
914
  },
915
  {
916
  "epoch": 2.81,
917
- "grad_norm": 2.9009976387023926,
918
- "learning_rate": 5.968523002421308e-05,
919
- "loss": 1.3625,
920
  "step": 1160
921
  },
922
  {
923
  "epoch": 2.83,
924
- "grad_norm": 5.364362716674805,
925
- "learning_rate": 5.8474576271186446e-05,
926
- "loss": 1.4056,
927
  "step": 1170
928
  },
929
  {
930
  "epoch": 2.86,
931
- "grad_norm": 3.0310747623443604,
932
- "learning_rate": 5.726392251815981e-05,
933
- "loss": 1.2943,
934
  "step": 1180
935
  },
936
  {
937
  "epoch": 2.88,
938
- "grad_norm": 2.7472984790802,
939
- "learning_rate": 5.605326876513317e-05,
940
- "loss": 1.4934,
941
  "step": 1190
942
  },
943
  {
944
  "epoch": 2.91,
945
- "grad_norm": 2.9528918266296387,
946
- "learning_rate": 5.484261501210654e-05,
947
- "loss": 1.2697,
948
  "step": 1200
949
  },
950
  {
951
  "epoch": 2.91,
952
- "eval_accuracy": 0.70926243567753,
953
- "eval_loss": 1.104396104812622,
954
- "eval_runtime": 6.0071,
955
- "eval_samples_per_second": 194.105,
956
- "eval_steps_per_second": 24.305,
957
  "step": 1200
958
  },
959
  {
960
  "epoch": 2.93,
961
- "grad_norm": 2.5816805362701416,
962
- "learning_rate": 5.363196125907991e-05,
963
- "loss": 1.3362,
964
  "step": 1210
965
  },
966
  {
967
  "epoch": 2.95,
968
- "grad_norm": 3.5116188526153564,
969
- "learning_rate": 5.242130750605327e-05,
970
- "loss": 1.3128,
971
  "step": 1220
972
  },
973
  {
974
  "epoch": 2.98,
975
- "grad_norm": 2.873042583465576,
976
- "learning_rate": 5.121065375302664e-05,
977
- "loss": 1.3257,
978
  "step": 1230
979
  },
980
  {
981
  "epoch": 3.0,
982
- "grad_norm": 6.232132434844971,
983
- "learning_rate": 5e-05,
984
- "loss": 1.256,
985
  "step": 1240
986
  },
987
  {
988
  "epoch": 3.03,
989
- "grad_norm": 2.3054957389831543,
990
- "learning_rate": 4.8789346246973364e-05,
991
- "loss": 1.1805,
992
  "step": 1250
993
  },
994
  {
995
  "epoch": 3.05,
996
- "grad_norm": 3.0687952041625977,
997
- "learning_rate": 4.757869249394674e-05,
998
- "loss": 1.0767,
999
  "step": 1260
1000
  },
1001
  {
1002
  "epoch": 3.08,
1003
- "grad_norm": 3.774822235107422,
1004
- "learning_rate": 4.63680387409201e-05,
1005
- "loss": 1.311,
1006
  "step": 1270
1007
  },
1008
  {
1009
  "epoch": 3.1,
1010
- "grad_norm": 4.785544395446777,
1011
- "learning_rate": 4.515738498789346e-05,
1012
- "loss": 1.2997,
1013
  "step": 1280
1014
  },
1015
  {
1016
  "epoch": 3.12,
1017
- "grad_norm": 3.4525294303894043,
1018
- "learning_rate": 4.394673123486683e-05,
1019
- "loss": 1.2039,
1020
  "step": 1290
1021
  },
1022
  {
1023
  "epoch": 3.15,
1024
- "grad_norm": 3.312502861022949,
1025
- "learning_rate": 4.27360774818402e-05,
1026
- "loss": 1.2137,
1027
  "step": 1300
1028
  },
1029
  {
1030
  "epoch": 3.15,
1031
- "eval_accuracy": 0.7615780445969125,
1032
- "eval_loss": 1.0005759000778198,
1033
- "eval_runtime": 6.3563,
1034
- "eval_samples_per_second": 183.44,
1035
- "eval_steps_per_second": 22.969,
1036
  "step": 1300
1037
  },
1038
  {
1039
  "epoch": 3.17,
1040
- "grad_norm": 3.375433921813965,
1041
- "learning_rate": 4.152542372881356e-05,
1042
- "loss": 1.2714,
1043
  "step": 1310
1044
  },
1045
  {
1046
  "epoch": 3.2,
1047
- "grad_norm": 2.5909006595611572,
1048
- "learning_rate": 4.0314769975786926e-05,
1049
- "loss": 1.3154,
1050
  "step": 1320
1051
  },
1052
  {
1053
  "epoch": 3.22,
1054
- "grad_norm": 3.0990185737609863,
1055
- "learning_rate": 3.910411622276029e-05,
1056
- "loss": 1.144,
1057
  "step": 1330
1058
  },
1059
  {
1060
  "epoch": 3.24,
1061
- "grad_norm": 1.911260962486267,
1062
- "learning_rate": 3.789346246973366e-05,
1063
- "loss": 1.0008,
1064
  "step": 1340
1065
  },
1066
  {
1067
  "epoch": 3.27,
1068
- "grad_norm": 2.93192458152771,
1069
- "learning_rate": 3.6682808716707024e-05,
1070
- "loss": 1.0603,
1071
  "step": 1350
1072
  },
1073
  {
1074
  "epoch": 3.29,
1075
- "grad_norm": 3.3576924800872803,
1076
- "learning_rate": 3.5472154963680385e-05,
1077
- "loss": 1.2791,
1078
  "step": 1360
1079
  },
1080
  {
1081
  "epoch": 3.32,
1082
- "grad_norm": 2.8567686080932617,
1083
- "learning_rate": 3.426150121065376e-05,
1084
- "loss": 1.1223,
1085
  "step": 1370
1086
  },
1087
  {
1088
  "epoch": 3.34,
1089
- "grad_norm": 2.735358953475952,
1090
- "learning_rate": 3.305084745762712e-05,
1091
- "loss": 1.2043,
1092
  "step": 1380
1093
  },
1094
  {
1095
  "epoch": 3.37,
1096
- "grad_norm": 3.374582529067993,
1097
- "learning_rate": 3.184019370460048e-05,
1098
- "loss": 1.0495,
1099
  "step": 1390
1100
  },
1101
  {
1102
  "epoch": 3.39,
1103
- "grad_norm": 4.9084792137146,
1104
- "learning_rate": 3.062953995157385e-05,
1105
- "loss": 1.423,
1106
  "step": 1400
1107
  },
1108
  {
1109
  "epoch": 3.39,
1110
- "eval_accuracy": 0.774442538593482,
1111
- "eval_loss": 0.9588848352432251,
1112
- "eval_runtime": 6.2494,
1113
- "eval_samples_per_second": 186.579,
1114
- "eval_steps_per_second": 23.362,
1115
  "step": 1400
1116
  },
1117
  {
1118
  "epoch": 3.41,
1119
- "grad_norm": 4.47416353225708,
1120
- "learning_rate": 2.941888619854722e-05,
1121
- "loss": 1.2965,
1122
  "step": 1410
1123
  },
1124
  {
1125
  "epoch": 3.44,
1126
- "grad_norm": 2.692729949951172,
1127
- "learning_rate": 2.8208232445520583e-05,
1128
- "loss": 1.1812,
1129
  "step": 1420
1130
  },
1131
  {
1132
  "epoch": 3.46,
1133
- "grad_norm": 3.5278244018554688,
1134
- "learning_rate": 2.6997578692493948e-05,
1135
- "loss": 1.2515,
1136
  "step": 1430
1137
  },
1138
  {
1139
  "epoch": 3.49,
1140
- "grad_norm": 2.9056203365325928,
1141
- "learning_rate": 2.5786924939467316e-05,
1142
- "loss": 1.0617,
1143
  "step": 1440
1144
  },
1145
  {
1146
  "epoch": 3.51,
1147
- "grad_norm": 2.6366896629333496,
1148
- "learning_rate": 2.457627118644068e-05,
1149
- "loss": 1.0449,
1150
  "step": 1450
1151
  },
1152
  {
1153
  "epoch": 3.54,
1154
- "grad_norm": 3.593003034591675,
1155
- "learning_rate": 2.3365617433414045e-05,
1156
- "loss": 1.1273,
1157
  "step": 1460
1158
  },
1159
  {
1160
  "epoch": 3.56,
1161
- "grad_norm": 3.5506863594055176,
1162
- "learning_rate": 2.215496368038741e-05,
1163
- "loss": 1.2625,
1164
  "step": 1470
1165
  },
1166
  {
1167
  "epoch": 3.58,
1168
- "grad_norm": 4.686192989349365,
1169
- "learning_rate": 2.0944309927360775e-05,
1170
- "loss": 1.1439,
1171
  "step": 1480
1172
  },
1173
  {
1174
  "epoch": 3.61,
1175
- "grad_norm": 3.072838068008423,
1176
- "learning_rate": 1.9733656174334143e-05,
1177
- "loss": 1.2008,
1178
  "step": 1490
1179
  },
1180
  {
1181
  "epoch": 3.63,
1182
- "grad_norm": 4.130647659301758,
1183
- "learning_rate": 1.8523002421307507e-05,
1184
- "loss": 1.0098,
1185
  "step": 1500
1186
  },
1187
  {
1188
  "epoch": 3.63,
1189
- "eval_accuracy": 0.7684391080617495,
1190
- "eval_loss": 0.9360153675079346,
1191
- "eval_runtime": 5.9954,
1192
- "eval_samples_per_second": 194.481,
1193
- "eval_steps_per_second": 24.352,
1194
  "step": 1500
1195
  },
1196
  {
1197
  "epoch": 3.66,
1198
- "grad_norm": 2.432633638381958,
1199
- "learning_rate": 1.7312348668280872e-05,
1200
- "loss": 1.0802,
1201
  "step": 1510
1202
  },
1203
  {
1204
  "epoch": 3.68,
1205
- "grad_norm": 3.6661131381988525,
1206
- "learning_rate": 1.6101694915254237e-05,
1207
- "loss": 1.0655,
1208
  "step": 1520
1209
  },
1210
  {
1211
  "epoch": 3.7,
1212
- "grad_norm": 3.967733860015869,
1213
- "learning_rate": 1.4891041162227603e-05,
1214
- "loss": 1.1482,
1215
  "step": 1530
1216
  },
1217
  {
1218
  "epoch": 3.73,
1219
- "grad_norm": 3.776456832885742,
1220
- "learning_rate": 1.3680387409200971e-05,
1221
- "loss": 1.2236,
1222
  "step": 1540
1223
  },
1224
  {
1225
  "epoch": 3.75,
1226
- "grad_norm": 3.1570096015930176,
1227
- "learning_rate": 1.2469733656174334e-05,
1228
- "loss": 1.0433,
1229
  "step": 1550
1230
  },
1231
  {
1232
  "epoch": 3.78,
1233
- "grad_norm": 3.3112399578094482,
1234
- "learning_rate": 1.12590799031477e-05,
1235
- "loss": 1.1766,
1236
  "step": 1560
1237
  },
1238
  {
1239
  "epoch": 3.8,
1240
- "grad_norm": 3.405649185180664,
1241
- "learning_rate": 1.0048426150121065e-05,
1242
- "loss": 1.1755,
1243
  "step": 1570
1244
  },
1245
  {
1246
  "epoch": 3.83,
1247
- "grad_norm": 2.6833651065826416,
1248
- "learning_rate": 8.837772397094432e-06,
1249
- "loss": 1.0593,
1250
  "step": 1580
1251
  },
1252
  {
1253
  "epoch": 3.85,
1254
- "grad_norm": 3.3236443996429443,
1255
- "learning_rate": 7.627118644067798e-06,
1256
- "loss": 1.1001,
1257
  "step": 1590
1258
  },
1259
  {
1260
  "epoch": 3.87,
1261
- "grad_norm": 3.5733933448791504,
1262
- "learning_rate": 6.4164648910411625e-06,
1263
- "loss": 1.1325,
1264
  "step": 1600
1265
  },
1266
  {
1267
  "epoch": 3.87,
1268
- "eval_accuracy": 0.7710120068610634,
1269
- "eval_loss": 0.912144124507904,
1270
- "eval_runtime": 6.3188,
1271
- "eval_samples_per_second": 184.528,
1272
- "eval_steps_per_second": 23.106,
1273
  "step": 1600
1274
  },
1275
  {
1276
  "epoch": 3.9,
1277
- "grad_norm": 3.3235766887664795,
1278
- "learning_rate": 5.205811138014528e-06,
1279
- "loss": 1.1434,
1280
  "step": 1610
1281
  },
1282
  {
1283
  "epoch": 3.92,
1284
- "grad_norm": 5.47670841217041,
1285
- "learning_rate": 3.9951573849878936e-06,
1286
- "loss": 1.0415,
1287
  "step": 1620
1288
  },
1289
  {
1290
  "epoch": 3.95,
1291
- "grad_norm": 2.83181095123291,
1292
- "learning_rate": 2.784503631961259e-06,
1293
- "loss": 1.0888,
1294
  "step": 1630
1295
  },
1296
  {
1297
  "epoch": 3.97,
1298
- "grad_norm": 4.57571268081665,
1299
- "learning_rate": 1.5738498789346248e-06,
1300
- "loss": 1.0908,
1301
  "step": 1640
1302
  },
1303
  {
1304
  "epoch": 4.0,
1305
- "grad_norm": 3.1416895389556885,
1306
- "learning_rate": 3.6319612590799036e-07,
1307
- "loss": 0.9855,
1308
  "step": 1650
1309
  },
1310
  {
1311
- "epoch": 4.0,
1312
- "step": 1652,
1313
- "total_flos": 2.047635634195759e+18,
1314
- "train_loss": 1.8191430680543978,
1315
- "train_runtime": 594.0822,
1316
- "train_samples_per_second": 44.458,
1317
- "train_steps_per_second": 2.781
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1318
  }
1319
  ],
1320
  "logging_steps": 10,
1321
- "max_steps": 1652,
1322
  "num_input_tokens_seen": 0,
1323
- "num_train_epochs": 4,
1324
  "save_steps": 100,
1325
- "total_flos": 2.047635634195759e+18,
1326
  "train_batch_size": 16,
1327
  "trial_name": null,
1328
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.5081329345703125,
3
+ "best_model_checkpoint": "finetuned-cards-blackjack/checkpoint-2800",
4
+ "epoch": 7.0,
5
  "eval_steps": 100,
6
+ "global_step": 2891,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.02,
13
+ "grad_norm": 3.301379680633545,
14
+ "learning_rate": 0.00019930819785541338,
15
+ "loss": 1.3967,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.05,
20
+ "grad_norm": 3.8909592628479004,
21
+ "learning_rate": 0.00019861639571082672,
22
+ "loss": 1.3469,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.07,
27
+ "grad_norm": 3.4435207843780518,
28
+ "learning_rate": 0.00019792459356624006,
29
+ "loss": 1.321,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.1,
34
+ "grad_norm": 4.026681900024414,
35
+ "learning_rate": 0.0001972327914216534,
36
+ "loss": 1.0812,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.12,
41
+ "grad_norm": 5.660863399505615,
42
+ "learning_rate": 0.00019654098927706677,
43
+ "loss": 1.2255,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.15,
48
+ "grad_norm": 4.201864719390869,
49
+ "learning_rate": 0.00019584918713248014,
50
+ "loss": 1.2845,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.17,
55
+ "grad_norm": 3.0525405406951904,
56
+ "learning_rate": 0.00019515738498789345,
57
+ "loss": 1.3223,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.19,
62
+ "grad_norm": 4.4655351638793945,
63
+ "learning_rate": 0.00019446558284330682,
64
+ "loss": 1.444,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.22,
69
+ "grad_norm": 5.199573993682861,
70
+ "learning_rate": 0.00019377378069872016,
71
+ "loss": 1.4201,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.24,
76
+ "grad_norm": 4.03210973739624,
77
+ "learning_rate": 0.00019308197855413353,
78
+ "loss": 1.3563,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.24,
83
+ "eval_accuracy": 0.6749571183533448,
84
+ "eval_loss": 1.1494646072387695,
85
+ "eval_runtime": 6.2584,
86
+ "eval_samples_per_second": 186.311,
87
+ "eval_steps_per_second": 23.329,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 0.27,
92
+ "grad_norm": 4.278244495391846,
93
+ "learning_rate": 0.00019239017640954688,
94
+ "loss": 1.3488,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 0.29,
99
+ "grad_norm": 3.920788049697876,
100
+ "learning_rate": 0.00019169837426496022,
101
+ "loss": 1.451,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 0.31,
106
+ "grad_norm": 3.261601686477661,
107
+ "learning_rate": 0.00019100657212037359,
108
+ "loss": 1.2141,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 0.34,
113
+ "grad_norm": 5.404760837554932,
114
+ "learning_rate": 0.00019031476997578695,
115
+ "loss": 1.5746,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 0.36,
120
+ "grad_norm": 2.8406295776367188,
121
+ "learning_rate": 0.0001896229678312003,
122
+ "loss": 1.2777,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 0.39,
127
+ "grad_norm": 3.7745213508605957,
128
+ "learning_rate": 0.00018893116568661364,
129
+ "loss": 1.2075,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 0.41,
134
+ "grad_norm": 3.8692853450775146,
135
+ "learning_rate": 0.00018823936354202698,
136
+ "loss": 1.2082,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 0.44,
141
+ "grad_norm": 3.2764315605163574,
142
+ "learning_rate": 0.00018754756139744035,
143
+ "loss": 1.1009,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 0.46,
148
+ "grad_norm": 2.6145033836364746,
149
+ "learning_rate": 0.0001868557592528537,
150
+ "loss": 1.2236,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 0.48,
155
+ "grad_norm": 4.715363502502441,
156
+ "learning_rate": 0.00018616395710826703,
157
+ "loss": 1.3393,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 0.48,
162
+ "eval_accuracy": 0.7204116638078902,
163
+ "eval_loss": 1.0388233661651611,
164
+ "eval_runtime": 6.0054,
165
+ "eval_samples_per_second": 194.159,
166
+ "eval_steps_per_second": 24.311,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 0.51,
171
+ "grad_norm": 3.9448986053466797,
172
+ "learning_rate": 0.0001854721549636804,
173
+ "loss": 1.1597,
174
  "step": 210
175
  },
176
  {
177
  "epoch": 0.53,
178
+ "grad_norm": 3.357956886291504,
179
+ "learning_rate": 0.00018478035281909374,
180
+ "loss": 1.1734,
181
  "step": 220
182
  },
183
  {
184
  "epoch": 0.56,
185
+ "grad_norm": 5.5605244636535645,
186
+ "learning_rate": 0.0001840885506745071,
187
+ "loss": 1.165,
188
  "step": 230
189
  },
190
  {
191
  "epoch": 0.58,
192
+ "grad_norm": 4.48176908493042,
193
+ "learning_rate": 0.00018339674852992045,
194
+ "loss": 1.4025,
195
  "step": 240
196
  },
197
  {
198
  "epoch": 0.61,
199
+ "grad_norm": 3.6814768314361572,
200
+ "learning_rate": 0.0001827049463853338,
201
+ "loss": 1.198,
202
  "step": 250
203
  },
204
  {
205
  "epoch": 0.63,
206
+ "grad_norm": 4.112949848175049,
207
+ "learning_rate": 0.00018201314424074716,
208
+ "loss": 1.1062,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 0.65,
213
+ "grad_norm": 5.755402565002441,
214
+ "learning_rate": 0.0001813213420961605,
215
+ "loss": 1.144,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 0.68,
220
+ "grad_norm": 3.6004717350006104,
221
+ "learning_rate": 0.00018062953995157384,
222
+ "loss": 1.2527,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 0.7,
227
+ "grad_norm": 2.4746742248535156,
228
+ "learning_rate": 0.0001799377378069872,
229
+ "loss": 1.1316,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 0.73,
234
+ "grad_norm": 2.231992483139038,
235
+ "learning_rate": 0.00017924593566240055,
236
+ "loss": 1.2033,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 0.73,
241
+ "eval_accuracy": 0.7547169811320755,
242
+ "eval_loss": 0.9323562979698181,
243
+ "eval_runtime": 7.9011,
244
+ "eval_samples_per_second": 147.574,
245
+ "eval_steps_per_second": 18.478,
246
  "step": 300
247
  },
248
  {
249
  "epoch": 0.75,
250
+ "grad_norm": 4.073417663574219,
251
+ "learning_rate": 0.00017855413351781392,
252
+ "loss": 1.3702,
253
  "step": 310
254
  },
255
  {
256
  "epoch": 0.77,
257
+ "grad_norm": 5.040902137756348,
258
+ "learning_rate": 0.00017786233137322726,
259
+ "loss": 1.1423,
260
  "step": 320
261
  },
262
  {
263
  "epoch": 0.8,
264
+ "grad_norm": 3.7068464756011963,
265
+ "learning_rate": 0.0001771705292286406,
266
+ "loss": 1.0609,
267
  "step": 330
268
  },
269
  {
270
  "epoch": 0.82,
271
+ "grad_norm": 4.864231586456299,
272
+ "learning_rate": 0.00017647872708405397,
273
+ "loss": 1.201,
274
  "step": 340
275
  },
276
  {
277
  "epoch": 0.85,
278
+ "grad_norm": 6.007138252258301,
279
+ "learning_rate": 0.00017578692493946732,
280
+ "loss": 1.0687,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 0.87,
285
+ "grad_norm": 4.0837225914001465,
286
+ "learning_rate": 0.00017509512279488069,
287
+ "loss": 1.1311,
288
  "step": 360
289
  },
290
  {
291
  "epoch": 0.9,
292
+ "grad_norm": 4.566812992095947,
293
+ "learning_rate": 0.00017440332065029403,
294
+ "loss": 1.3071,
295
  "step": 370
296
  },
297
  {
298
  "epoch": 0.92,
299
+ "grad_norm": 3.3199901580810547,
300
+ "learning_rate": 0.00017371151850570737,
301
+ "loss": 1.0246,
302
  "step": 380
303
  },
304
  {
305
  "epoch": 0.94,
306
+ "grad_norm": 2.4883534908294678,
307
+ "learning_rate": 0.00017301971636112074,
308
+ "loss": 1.0215,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 0.97,
313
+ "grad_norm": 5.232284069061279,
314
+ "learning_rate": 0.00017232791421653408,
315
+ "loss": 0.9672,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 0.97,
320
+ "eval_accuracy": 0.7658662092624356,
321
+ "eval_loss": 0.8557726144790649,
322
+ "eval_runtime": 6.2462,
323
+ "eval_samples_per_second": 186.674,
324
+ "eval_steps_per_second": 23.374,
325
  "step": 400
326
  },
327
  {
328
  "epoch": 0.99,
329
+ "grad_norm": 3.8225362300872803,
330
+ "learning_rate": 0.00017163611207194742,
331
+ "loss": 1.0908,
332
  "step": 410
333
  },
334
  {
335
  "epoch": 1.02,
336
+ "grad_norm": 4.098091125488281,
337
+ "learning_rate": 0.0001709443099273608,
338
+ "loss": 0.8621,
339
  "step": 420
340
  },
341
  {
342
  "epoch": 1.04,
343
+ "grad_norm": 4.027368068695068,
344
+ "learning_rate": 0.00017025250778277413,
345
+ "loss": 0.9868,
346
  "step": 430
347
  },
348
  {
349
  "epoch": 1.07,
350
+ "grad_norm": 4.375247478485107,
351
+ "learning_rate": 0.0001695607056381875,
352
+ "loss": 1.0179,
353
  "step": 440
354
  },
355
  {
356
  "epoch": 1.09,
357
+ "grad_norm": 8.204839706420898,
358
+ "learning_rate": 0.00016886890349360084,
359
+ "loss": 0.902,
360
  "step": 450
361
  },
362
  {
363
  "epoch": 1.11,
364
+ "grad_norm": 3.1056785583496094,
365
+ "learning_rate": 0.00016817710134901418,
366
+ "loss": 0.9873,
367
  "step": 460
368
  },
369
  {
370
  "epoch": 1.14,
371
+ "grad_norm": 6.811554908752441,
372
+ "learning_rate": 0.00016748529920442755,
373
+ "loss": 0.9035,
374
  "step": 470
375
  },
376
  {
377
  "epoch": 1.16,
378
+ "grad_norm": 4.715181350708008,
379
+ "learning_rate": 0.0001667934970598409,
380
+ "loss": 1.0024,
381
  "step": 480
382
  },
383
  {
384
  "epoch": 1.19,
385
+ "grad_norm": 5.355204105377197,
386
+ "learning_rate": 0.00016610169491525423,
387
+ "loss": 1.0384,
388
  "step": 490
389
  },
390
  {
391
  "epoch": 1.21,
392
+ "grad_norm": 8.26843547821045,
393
+ "learning_rate": 0.0001654098927706676,
394
+ "loss": 0.8674,
395
  "step": 500
396
  },
397
  {
398
  "epoch": 1.21,
399
+ "eval_accuracy": 0.7615780445969125,
400
+ "eval_loss": 0.8456417322158813,
401
+ "eval_runtime": 5.9836,
402
+ "eval_samples_per_second": 194.865,
403
+ "eval_steps_per_second": 24.4,
404
  "step": 500
405
  },
406
  {
407
  "epoch": 1.23,
408
+ "grad_norm": 3.612718343734741,
409
+ "learning_rate": 0.00016471809062608094,
410
+ "loss": 1.035,
411
  "step": 510
412
  },
413
  {
414
  "epoch": 1.26,
415
+ "grad_norm": 3.2531259059906006,
416
+ "learning_rate": 0.0001640262884814943,
417
+ "loss": 0.9591,
418
  "step": 520
419
  },
420
  {
421
  "epoch": 1.28,
422
+ "grad_norm": 2.1132917404174805,
423
+ "learning_rate": 0.00016333448633690765,
424
+ "loss": 0.7013,
425
  "step": 530
426
  },
427
  {
428
  "epoch": 1.31,
429
+ "grad_norm": 5.840766906738281,
430
+ "learning_rate": 0.000162642684192321,
431
+ "loss": 1.1066,
432
  "step": 540
433
  },
434
  {
435
  "epoch": 1.33,
436
+ "grad_norm": 2.8128092288970947,
437
+ "learning_rate": 0.00016195088204773436,
438
+ "loss": 0.8851,
439
  "step": 550
440
  },
441
  {
442
  "epoch": 1.36,
443
+ "grad_norm": 5.935888290405273,
444
+ "learning_rate": 0.0001612590799031477,
445
+ "loss": 0.9738,
446
  "step": 560
447
  },
448
  {
449
  "epoch": 1.38,
450
+ "grad_norm": 4.2558488845825195,
451
+ "learning_rate": 0.00016056727775856107,
452
+ "loss": 1.1094,
453
  "step": 570
454
  },
455
  {
456
  "epoch": 1.4,
457
+ "grad_norm": 3.7361583709716797,
458
+ "learning_rate": 0.0001598754756139744,
459
+ "loss": 1.0376,
460
  "step": 580
461
  },
462
  {
463
  "epoch": 1.43,
464
+ "grad_norm": 3.6672043800354004,
465
+ "learning_rate": 0.00015918367346938776,
466
+ "loss": 0.9765,
467
  "step": 590
468
  },
469
  {
470
  "epoch": 1.45,
471
+ "grad_norm": 2.8976941108703613,
472
+ "learning_rate": 0.00015849187132480113,
473
+ "loss": 0.8277,
474
  "step": 600
475
  },
476
  {
477
  "epoch": 1.45,
478
+ "eval_accuracy": 0.7958833619210978,
479
+ "eval_loss": 0.7562589049339294,
480
+ "eval_runtime": 6.7504,
481
+ "eval_samples_per_second": 172.731,
482
+ "eval_steps_per_second": 21.628,
483
  "step": 600
484
  },
485
  {
486
  "epoch": 1.48,
487
+ "grad_norm": 4.665554046630859,
488
+ "learning_rate": 0.00015780006918021447,
489
+ "loss": 0.8139,
490
  "step": 610
491
  },
492
  {
493
  "epoch": 1.5,
494
+ "grad_norm": 4.166018486022949,
495
+ "learning_rate": 0.0001571082670356278,
496
+ "loss": 1.1314,
497
  "step": 620
498
  },
499
  {
500
  "epoch": 1.53,
501
+ "grad_norm": 3.610258102416992,
502
+ "learning_rate": 0.00015641646489104115,
503
+ "loss": 0.9497,
504
  "step": 630
505
  },
506
  {
507
  "epoch": 1.55,
508
+ "grad_norm": 4.610332489013672,
509
+ "learning_rate": 0.00015572466274645452,
510
+ "loss": 1.0767,
511
  "step": 640
512
  },
513
  {
514
  "epoch": 1.57,
515
+ "grad_norm": 3.796252965927124,
516
+ "learning_rate": 0.0001550328606018679,
517
+ "loss": 0.8486,
518
  "step": 650
519
  },
520
  {
521
  "epoch": 1.6,
522
+ "grad_norm": 3.9809694290161133,
523
+ "learning_rate": 0.00015434105845728123,
524
+ "loss": 0.9211,
525
  "step": 660
526
  },
527
  {
528
  "epoch": 1.62,
529
+ "grad_norm": 2.5232605934143066,
530
+ "learning_rate": 0.00015364925631269457,
531
+ "loss": 0.8843,
532
  "step": 670
533
  },
534
  {
535
  "epoch": 1.65,
536
+ "grad_norm": 4.975670337677002,
537
+ "learning_rate": 0.00015295745416810794,
538
+ "loss": 0.9494,
539
  "step": 680
540
  },
541
  {
542
  "epoch": 1.67,
543
+ "grad_norm": 5.420626163482666,
544
+ "learning_rate": 0.00015226565202352128,
545
+ "loss": 0.9786,
546
  "step": 690
547
  },
548
  {
549
  "epoch": 1.69,
550
+ "grad_norm": 3.364365339279175,
551
+ "learning_rate": 0.00015157384987893465,
552
+ "loss": 0.8703,
553
  "step": 700
554
  },
555
  {
556
  "epoch": 1.69,
557
+ "eval_accuracy": 0.7538593481989708,
558
+ "eval_loss": 0.8465284109115601,
559
+ "eval_runtime": 6.2814,
560
+ "eval_samples_per_second": 185.628,
561
+ "eval_steps_per_second": 23.243,
562
  "step": 700
563
  },
564
  {
565
  "epoch": 1.72,
566
+ "grad_norm": 3.51340913772583,
567
+ "learning_rate": 0.00015088204773434796,
568
+ "loss": 0.9032,
569
  "step": 710
570
  },
571
  {
572
  "epoch": 1.74,
573
+ "grad_norm": 3.7203245162963867,
574
+ "learning_rate": 0.00015019024558976133,
575
+ "loss": 0.7729,
576
  "step": 720
577
  },
578
  {
579
  "epoch": 1.77,
580
+ "grad_norm": 3.596214771270752,
581
+ "learning_rate": 0.0001494984434451747,
582
+ "loss": 0.8151,
583
  "step": 730
584
  },
585
  {
586
  "epoch": 1.79,
587
+ "grad_norm": 3.2724595069885254,
588
+ "learning_rate": 0.00014880664130058804,
589
+ "loss": 0.8064,
590
  "step": 740
591
  },
592
  {
593
  "epoch": 1.82,
594
+ "grad_norm": 3.5748846530914307,
595
+ "learning_rate": 0.00014811483915600139,
596
+ "loss": 0.8419,
597
  "step": 750
598
  },
599
  {
600
  "epoch": 1.84,
601
+ "grad_norm": 5.998478412628174,
602
+ "learning_rate": 0.00014742303701141473,
603
+ "loss": 0.8494,
604
  "step": 760
605
  },
606
  {
607
  "epoch": 1.86,
608
+ "grad_norm": 3.545043706893921,
609
+ "learning_rate": 0.0001467312348668281,
610
+ "loss": 0.7695,
611
  "step": 770
612
  },
613
  {
614
  "epoch": 1.89,
615
+ "grad_norm": 3.9944069385528564,
616
+ "learning_rate": 0.00014603943272224146,
617
+ "loss": 0.9405,
618
  "step": 780
619
  },
620
  {
621
  "epoch": 1.91,
622
+ "grad_norm": 5.435621738433838,
623
+ "learning_rate": 0.00014534763057765478,
624
+ "loss": 0.8863,
625
  "step": 790
626
  },
627
  {
628
  "epoch": 1.94,
629
+ "grad_norm": 7.365724086761475,
630
+ "learning_rate": 0.00014465582843306815,
631
+ "loss": 0.893,
632
  "step": 800
633
  },
634
  {
635
  "epoch": 1.94,
636
+ "eval_accuracy": 0.8001715265866209,
637
+ "eval_loss": 0.688121497631073,
638
+ "eval_runtime": 6.0254,
639
+ "eval_samples_per_second": 193.514,
640
+ "eval_steps_per_second": 24.231,
641
  "step": 800
642
  },
643
  {
644
  "epoch": 1.96,
645
+ "grad_norm": 6.192987442016602,
646
+ "learning_rate": 0.0001439640262884815,
647
+ "loss": 0.8819,
648
  "step": 810
649
  },
650
  {
651
  "epoch": 1.99,
652
+ "grad_norm": 3.021066188812256,
653
+ "learning_rate": 0.00014327222414389486,
654
+ "loss": 0.6413,
655
  "step": 820
656
  },
657
  {
658
  "epoch": 2.01,
659
+ "grad_norm": 4.522083759307861,
660
+ "learning_rate": 0.0001425804219993082,
661
+ "loss": 0.9185,
662
  "step": 830
663
  },
664
  {
665
  "epoch": 2.03,
666
+ "grad_norm": 3.089639186859131,
667
+ "learning_rate": 0.00014188861985472154,
668
+ "loss": 0.7384,
669
  "step": 840
670
  },
671
  {
672
  "epoch": 2.06,
673
+ "grad_norm": 4.491950988769531,
674
+ "learning_rate": 0.0001411968177101349,
675
+ "loss": 0.765,
676
  "step": 850
677
  },
678
  {
679
  "epoch": 2.08,
680
+ "grad_norm": 3.618821144104004,
681
+ "learning_rate": 0.00014050501556554828,
682
+ "loss": 0.8237,
683
  "step": 860
684
  },
685
  {
686
  "epoch": 2.11,
687
+ "grad_norm": 4.773688793182373,
688
+ "learning_rate": 0.00013981321342096162,
689
+ "loss": 0.7171,
690
  "step": 870
691
  },
692
  {
693
  "epoch": 2.13,
694
+ "grad_norm": 1.607408881187439,
695
+ "learning_rate": 0.00013912141127637496,
696
+ "loss": 0.6438,
697
  "step": 880
698
  },
699
  {
700
  "epoch": 2.15,
701
+ "grad_norm": 4.511462211608887,
702
+ "learning_rate": 0.0001384296091317883,
703
+ "loss": 0.7983,
704
  "step": 890
705
  },
706
  {
707
  "epoch": 2.18,
708
+ "grad_norm": 4.259463787078857,
709
+ "learning_rate": 0.00013773780698720167,
710
+ "loss": 0.9454,
711
  "step": 900
712
  },
713
  {
714
  "epoch": 2.18,
715
+ "eval_accuracy": 0.8027444253859348,
716
+ "eval_loss": 0.7210972905158997,
717
+ "eval_runtime": 5.9658,
718
+ "eval_samples_per_second": 195.449,
719
+ "eval_steps_per_second": 24.473,
720
  "step": 900
721
  },
722
  {
723
  "epoch": 2.2,
724
+ "grad_norm": 3.810264825820923,
725
+ "learning_rate": 0.00013704600484261504,
726
+ "loss": 0.7729,
727
  "step": 910
728
  },
729
  {
730
  "epoch": 2.23,
731
+ "grad_norm": 5.475677967071533,
732
+ "learning_rate": 0.00013635420269802835,
733
+ "loss": 0.7531,
734
  "step": 920
735
  },
736
  {
737
  "epoch": 2.25,
738
+ "grad_norm": 2.9276745319366455,
739
+ "learning_rate": 0.00013566240055344172,
740
+ "loss": 0.725,
741
  "step": 930
742
  },
743
  {
744
  "epoch": 2.28,
745
+ "grad_norm": 4.840962886810303,
746
+ "learning_rate": 0.00013497059840885506,
747
+ "loss": 0.6938,
748
  "step": 940
749
  },
750
  {
751
  "epoch": 2.3,
752
+ "grad_norm": 5.3595194816589355,
753
+ "learning_rate": 0.00013427879626426843,
754
+ "loss": 0.6863,
755
  "step": 950
756
  },
757
  {
758
  "epoch": 2.32,
759
+ "grad_norm": 7.755936145782471,
760
+ "learning_rate": 0.00013358699411968177,
761
+ "loss": 0.7146,
762
  "step": 960
763
  },
764
  {
765
  "epoch": 2.35,
766
+ "grad_norm": 3.4426372051239014,
767
+ "learning_rate": 0.00013289519197509512,
768
+ "loss": 0.7144,
769
  "step": 970
770
  },
771
  {
772
  "epoch": 2.37,
773
+ "grad_norm": 4.554823398590088,
774
+ "learning_rate": 0.00013220338983050849,
775
+ "loss": 0.6512,
776
  "step": 980
777
  },
778
  {
779
  "epoch": 2.4,
780
+ "grad_norm": 2.8689632415771484,
781
+ "learning_rate": 0.00013151158768592183,
782
+ "loss": 0.7966,
783
  "step": 990
784
  },
785
  {
786
  "epoch": 2.42,
787
+ "grad_norm": 3.4381957054138184,
788
+ "learning_rate": 0.0001308197855413352,
789
+ "loss": 0.8109,
790
  "step": 1000
791
  },
792
  {
793
  "epoch": 2.42,
794
+ "eval_accuracy": 0.8284734133790738,
795
+ "eval_loss": 0.6368530988693237,
796
+ "eval_runtime": 6.2777,
797
+ "eval_samples_per_second": 185.738,
798
+ "eval_steps_per_second": 23.257,
799
  "step": 1000
800
  },
801
  {
802
  "epoch": 2.45,
803
+ "grad_norm": 2.984152317047119,
804
+ "learning_rate": 0.00013012798339674854,
805
+ "loss": 0.6477,
806
  "step": 1010
807
  },
808
  {
809
  "epoch": 2.47,
810
+ "grad_norm": 5.486266613006592,
811
+ "learning_rate": 0.00012943618125216188,
812
+ "loss": 0.6834,
813
  "step": 1020
814
  },
815
  {
816
  "epoch": 2.49,
817
+ "grad_norm": 2.2987334728240967,
818
+ "learning_rate": 0.00012874437910757525,
819
+ "loss": 0.6415,
820
  "step": 1030
821
  },
822
  {
823
  "epoch": 2.52,
824
+ "grad_norm": 7.007256507873535,
825
+ "learning_rate": 0.0001280525769629886,
826
+ "loss": 0.6464,
827
  "step": 1040
828
  },
829
  {
830
  "epoch": 2.54,
831
+ "grad_norm": 1.817421555519104,
832
+ "learning_rate": 0.00012736077481840193,
833
+ "loss": 0.774,
834
  "step": 1050
835
  },
836
  {
837
  "epoch": 2.57,
838
+ "grad_norm": 4.492701530456543,
839
+ "learning_rate": 0.0001266689726738153,
840
+ "loss": 0.9331,
841
  "step": 1060
842
  },
843
  {
844
  "epoch": 2.59,
845
+ "grad_norm": 3.884744644165039,
846
+ "learning_rate": 0.00012597717052922864,
847
+ "loss": 0.6107,
848
  "step": 1070
849
  },
850
  {
851
  "epoch": 2.62,
852
+ "grad_norm": 4.274733066558838,
853
+ "learning_rate": 0.000125285368384642,
854
+ "loss": 0.6268,
855
  "step": 1080
856
  },
857
  {
858
  "epoch": 2.64,
859
+ "grad_norm": 4.432763576507568,
860
+ "learning_rate": 0.00012459356624005535,
861
+ "loss": 0.6326,
862
  "step": 1090
863
  },
864
  {
865
  "epoch": 2.66,
866
+ "grad_norm": 4.16074275970459,
867
+ "learning_rate": 0.0001239017640954687,
868
+ "loss": 0.8762,
869
  "step": 1100
870
  },
871
  {
872
  "epoch": 2.66,
873
+ "eval_accuracy": 0.839622641509434,
874
+ "eval_loss": 0.6335619688034058,
875
+ "eval_runtime": 6.1128,
876
+ "eval_samples_per_second": 190.748,
877
+ "eval_steps_per_second": 23.884,
878
  "step": 1100
879
  },
880
  {
881
  "epoch": 2.69,
882
+ "grad_norm": 4.018909931182861,
883
+ "learning_rate": 0.00012320996195088206,
884
+ "loss": 0.8039,
885
  "step": 1110
886
  },
887
  {
888
  "epoch": 2.71,
889
+ "grad_norm": 8.111551284790039,
890
+ "learning_rate": 0.0001225181598062954,
891
+ "loss": 0.6436,
892
  "step": 1120
893
  },
894
  {
895
  "epoch": 2.74,
896
+ "grad_norm": 4.22373628616333,
897
+ "learning_rate": 0.00012182635766170876,
898
+ "loss": 0.6228,
899
  "step": 1130
900
  },
901
  {
902
  "epoch": 2.76,
903
+ "grad_norm": 4.817978858947754,
904
+ "learning_rate": 0.00012113455551712211,
905
+ "loss": 0.7047,
906
  "step": 1140
907
  },
908
  {
909
  "epoch": 2.78,
910
+ "grad_norm": 5.471624851226807,
911
+ "learning_rate": 0.00012044275337253545,
912
+ "loss": 0.8293,
913
  "step": 1150
914
  },
915
  {
916
  "epoch": 2.81,
917
+ "grad_norm": 3.491068124771118,
918
+ "learning_rate": 0.00011975095122794881,
919
+ "loss": 0.7128,
920
  "step": 1160
921
  },
922
  {
923
  "epoch": 2.83,
924
+ "grad_norm": 4.463800430297852,
925
+ "learning_rate": 0.00011905914908336215,
926
+ "loss": 0.7569,
927
  "step": 1170
928
  },
929
  {
930
  "epoch": 2.86,
931
+ "grad_norm": 2.7582342624664307,
932
+ "learning_rate": 0.00011836734693877552,
933
+ "loss": 0.6774,
934
  "step": 1180
935
  },
936
  {
937
  "epoch": 2.88,
938
+ "grad_norm": 4.606247901916504,
939
+ "learning_rate": 0.00011767554479418887,
940
+ "loss": 0.7384,
941
  "step": 1190
942
  },
943
  {
944
  "epoch": 2.91,
945
+ "grad_norm": 4.3657660484313965,
946
+ "learning_rate": 0.00011698374264960222,
947
+ "loss": 0.8034,
948
  "step": 1200
949
  },
950
  {
951
  "epoch": 2.91,
952
+ "eval_accuracy": 0.8164665523156089,
953
+ "eval_loss": 0.657957911491394,
954
+ "eval_runtime": 6.0796,
955
+ "eval_samples_per_second": 191.79,
956
+ "eval_steps_per_second": 24.015,
957
  "step": 1200
958
  },
959
  {
960
  "epoch": 2.93,
961
+ "grad_norm": 3.4329655170440674,
962
+ "learning_rate": 0.00011629194050501557,
963
+ "loss": 0.733,
964
  "step": 1210
965
  },
966
  {
967
  "epoch": 2.95,
968
+ "grad_norm": 4.565850257873535,
969
+ "learning_rate": 0.00011560013836042894,
970
+ "loss": 0.7215,
971
  "step": 1220
972
  },
973
  {
974
  "epoch": 2.98,
975
+ "grad_norm": 3.225835084915161,
976
+ "learning_rate": 0.00011490833621584227,
977
+ "loss": 0.6895,
978
  "step": 1230
979
  },
980
  {
981
  "epoch": 3.0,
982
+ "grad_norm": 5.188159942626953,
983
+ "learning_rate": 0.00011421653407125564,
984
+ "loss": 0.6821,
985
  "step": 1240
986
  },
987
  {
988
  "epoch": 3.03,
989
+ "grad_norm": 1.3620127439498901,
990
+ "learning_rate": 0.00011352473192666896,
991
+ "loss": 0.5604,
992
  "step": 1250
993
  },
994
  {
995
  "epoch": 3.05,
996
+ "grad_norm": 3.446960687637329,
997
+ "learning_rate": 0.00011283292978208233,
998
+ "loss": 0.5554,
999
  "step": 1260
1000
  },
1001
  {
1002
  "epoch": 3.08,
1003
+ "grad_norm": 5.016846656799316,
1004
+ "learning_rate": 0.00011214112763749569,
1005
+ "loss": 0.7156,
1006
  "step": 1270
1007
  },
1008
  {
1009
  "epoch": 3.1,
1010
+ "grad_norm": 5.600184440612793,
1011
+ "learning_rate": 0.00011144932549290903,
1012
+ "loss": 0.745,
1013
  "step": 1280
1014
  },
1015
  {
1016
  "epoch": 3.12,
1017
+ "grad_norm": 3.7640082836151123,
1018
+ "learning_rate": 0.00011075752334832239,
1019
+ "loss": 0.6316,
1020
  "step": 1290
1021
  },
1022
  {
1023
  "epoch": 3.15,
1024
+ "grad_norm": 4.837277889251709,
1025
+ "learning_rate": 0.00011006572120373573,
1026
+ "loss": 0.5833,
1027
  "step": 1300
1028
  },
1029
  {
1030
  "epoch": 3.15,
1031
+ "eval_accuracy": 0.8439108061749572,
1032
+ "eval_loss": 0.5827564597129822,
1033
+ "eval_runtime": 6.3009,
1034
+ "eval_samples_per_second": 185.052,
1035
+ "eval_steps_per_second": 23.171,
1036
  "step": 1300
1037
  },
1038
  {
1039
  "epoch": 3.17,
1040
+ "grad_norm": 5.6047797203063965,
1041
+ "learning_rate": 0.00010937391905914908,
1042
+ "loss": 0.7184,
1043
  "step": 1310
1044
  },
1045
  {
1046
  "epoch": 3.2,
1047
+ "grad_norm": 4.177833080291748,
1048
+ "learning_rate": 0.0001087512971290211,
1049
+ "loss": 0.7094,
1050
  "step": 1320
1051
  },
1052
  {
1053
  "epoch": 3.22,
1054
+ "grad_norm": 2.0531811714172363,
1055
+ "learning_rate": 0.00010805949498443447,
1056
+ "loss": 0.6679,
1057
  "step": 1330
1058
  },
1059
  {
1060
  "epoch": 3.24,
1061
+ "grad_norm": 2.919313430786133,
1062
+ "learning_rate": 0.0001073676928398478,
1063
+ "loss": 0.4368,
1064
  "step": 1340
1065
  },
1066
  {
1067
  "epoch": 3.27,
1068
+ "grad_norm": 5.47185754776001,
1069
+ "learning_rate": 0.00010667589069526116,
1070
+ "loss": 0.5666,
1071
  "step": 1350
1072
  },
1073
  {
1074
  "epoch": 3.29,
1075
+ "grad_norm": 5.082462310791016,
1076
+ "learning_rate": 0.00010598408855067452,
1077
+ "loss": 0.7758,
1078
  "step": 1360
1079
  },
1080
  {
1081
  "epoch": 3.32,
1082
+ "grad_norm": 3.3282408714294434,
1083
+ "learning_rate": 0.00010529228640608786,
1084
+ "loss": 0.6055,
1085
  "step": 1370
1086
  },
1087
  {
1088
  "epoch": 3.34,
1089
+ "grad_norm": 5.19661808013916,
1090
+ "learning_rate": 0.00010460048426150121,
1091
+ "loss": 0.6678,
1092
  "step": 1380
1093
  },
1094
  {
1095
  "epoch": 3.37,
1096
+ "grad_norm": 2.5423412322998047,
1097
+ "learning_rate": 0.00010390868211691456,
1098
+ "loss": 0.5333,
1099
  "step": 1390
1100
  },
1101
  {
1102
  "epoch": 3.39,
1103
+ "grad_norm": 9.068185806274414,
1104
+ "learning_rate": 0.00010321687997232792,
1105
+ "loss": 0.8811,
1106
  "step": 1400
1107
  },
1108
  {
1109
  "epoch": 3.39,
1110
+ "eval_accuracy": 0.8259005145797599,
1111
+ "eval_loss": 0.6564387679100037,
1112
+ "eval_runtime": 5.9782,
1113
+ "eval_samples_per_second": 195.041,
1114
+ "eval_steps_per_second": 24.422,
1115
  "step": 1400
1116
  },
1117
  {
1118
  "epoch": 3.41,
1119
+ "grad_norm": 5.794499397277832,
1120
+ "learning_rate": 0.00010252507782774128,
1121
+ "loss": 0.8535,
1122
  "step": 1410
1123
  },
1124
  {
1125
  "epoch": 3.44,
1126
+ "grad_norm": 4.731594562530518,
1127
+ "learning_rate": 0.00010183327568315462,
1128
+ "loss": 0.6556,
1129
  "step": 1420
1130
  },
1131
  {
1132
  "epoch": 3.46,
1133
+ "grad_norm": 3.823868751525879,
1134
+ "learning_rate": 0.00010114147353856798,
1135
+ "loss": 0.687,
1136
  "step": 1430
1137
  },
1138
  {
1139
  "epoch": 3.49,
1140
+ "grad_norm": 7.72351598739624,
1141
+ "learning_rate": 0.00010044967139398133,
1142
+ "loss": 0.6479,
1143
  "step": 1440
1144
  },
1145
  {
1146
  "epoch": 3.51,
1147
+ "grad_norm": 5.026217937469482,
1148
+ "learning_rate": 9.975786924939467e-05,
1149
+ "loss": 0.64,
1150
  "step": 1450
1151
  },
1152
  {
1153
  "epoch": 3.54,
1154
+ "grad_norm": 2.873476028442383,
1155
+ "learning_rate": 9.906606710480803e-05,
1156
+ "loss": 0.6692,
1157
  "step": 1460
1158
  },
1159
  {
1160
  "epoch": 3.56,
1161
+ "grad_norm": 3.9772098064422607,
1162
+ "learning_rate": 9.837426496022138e-05,
1163
+ "loss": 0.8156,
1164
  "step": 1470
1165
  },
1166
  {
1167
  "epoch": 3.58,
1168
+ "grad_norm": 5.044854164123535,
1169
+ "learning_rate": 9.768246281563474e-05,
1170
+ "loss": 0.7261,
1171
  "step": 1480
1172
  },
1173
  {
1174
  "epoch": 3.61,
1175
+ "grad_norm": 2.8663127422332764,
1176
+ "learning_rate": 9.699066067104808e-05,
1177
+ "loss": 0.7608,
1178
  "step": 1490
1179
  },
1180
  {
1181
  "epoch": 3.63,
1182
+ "grad_norm": 7.623239040374756,
1183
+ "learning_rate": 9.629885852646143e-05,
1184
+ "loss": 0.5639,
1185
  "step": 1500
1186
  },
1187
  {
1188
  "epoch": 3.63,
1189
+ "eval_accuracy": 0.8439108061749572,
1190
+ "eval_loss": 0.5736597180366516,
1191
+ "eval_runtime": 6.1394,
1192
+ "eval_samples_per_second": 189.92,
1193
+ "eval_steps_per_second": 23.781,
1194
  "step": 1500
1195
  },
1196
  {
1197
  "epoch": 3.66,
1198
+ "grad_norm": 4.9297966957092285,
1199
+ "learning_rate": 9.560705638187479e-05,
1200
+ "loss": 0.6672,
1201
  "step": 1510
1202
  },
1203
  {
1204
  "epoch": 3.68,
1205
+ "grad_norm": 2.0875468254089355,
1206
+ "learning_rate": 9.491525423728815e-05,
1207
+ "loss": 0.5335,
1208
  "step": 1520
1209
  },
1210
  {
1211
  "epoch": 3.7,
1212
+ "grad_norm": 10.350793838500977,
1213
+ "learning_rate": 9.422345209270149e-05,
1214
+ "loss": 0.7163,
1215
  "step": 1530
1216
  },
1217
  {
1218
  "epoch": 3.73,
1219
+ "grad_norm": 4.162230014801025,
1220
+ "learning_rate": 9.353164994811484e-05,
1221
+ "loss": 0.7528,
1222
  "step": 1540
1223
  },
1224
  {
1225
  "epoch": 3.75,
1226
+ "grad_norm": 5.249913215637207,
1227
+ "learning_rate": 9.28398478035282e-05,
1228
+ "loss": 0.7946,
1229
  "step": 1550
1230
  },
1231
  {
1232
  "epoch": 3.78,
1233
+ "grad_norm": 3.5188651084899902,
1234
+ "learning_rate": 9.214804565894155e-05,
1235
+ "loss": 0.7108,
1236
  "step": 1560
1237
  },
1238
  {
1239
  "epoch": 3.8,
1240
+ "grad_norm": 5.497143268585205,
1241
+ "learning_rate": 9.14562435143549e-05,
1242
+ "loss": 0.8685,
1243
  "step": 1570
1244
  },
1245
  {
1246
  "epoch": 3.83,
1247
+ "grad_norm": 4.383511066436768,
1248
+ "learning_rate": 9.076444136976825e-05,
1249
+ "loss": 0.7093,
1250
  "step": 1580
1251
  },
1252
  {
1253
  "epoch": 3.85,
1254
+ "grad_norm": 4.953444957733154,
1255
+ "learning_rate": 9.00726392251816e-05,
1256
+ "loss": 0.7253,
1257
  "step": 1590
1258
  },
1259
  {
1260
  "epoch": 3.87,
1261
+ "grad_norm": 3.61757230758667,
1262
+ "learning_rate": 8.938083708059496e-05,
1263
+ "loss": 0.639,
1264
  "step": 1600
1265
  },
1266
  {
1267
  "epoch": 3.87,
1268
+ "eval_accuracy": 0.8379073756432247,
1269
+ "eval_loss": 0.560886561870575,
1270
+ "eval_runtime": 5.9489,
1271
+ "eval_samples_per_second": 196.002,
1272
+ "eval_steps_per_second": 24.542,
1273
  "step": 1600
1274
  },
1275
  {
1276
  "epoch": 3.9,
1277
+ "grad_norm": 5.650317668914795,
1278
+ "learning_rate": 8.868903493600831e-05,
1279
+ "loss": 0.8287,
1280
  "step": 1610
1281
  },
1282
  {
1283
  "epoch": 3.92,
1284
+ "grad_norm": 6.012249946594238,
1285
+ "learning_rate": 8.799723279142166e-05,
1286
+ "loss": 0.6976,
1287
  "step": 1620
1288
  },
1289
  {
1290
  "epoch": 3.95,
1291
+ "grad_norm": 5.186240196228027,
1292
+ "learning_rate": 8.730543064683501e-05,
1293
+ "loss": 0.7452,
1294
  "step": 1630
1295
  },
1296
  {
1297
  "epoch": 3.97,
1298
+ "grad_norm": 5.2669572830200195,
1299
+ "learning_rate": 8.661362850224835e-05,
1300
+ "loss": 0.8586,
1301
  "step": 1640
1302
  },
1303
  {
1304
  "epoch": 4.0,
1305
+ "grad_norm": 2.4105117321014404,
1306
+ "learning_rate": 8.592182635766172e-05,
1307
+ "loss": 0.6194,
1308
  "step": 1650
1309
  },
1310
  {
1311
+ "epoch": 4.02,
1312
+ "grad_norm": 1.2886375188827515,
1313
+ "learning_rate": 8.523002421307506e-05,
1314
+ "loss": 0.4969,
1315
+ "step": 1660
1316
+ },
1317
+ {
1318
+ "epoch": 4.04,
1319
+ "grad_norm": 1.915207862854004,
1320
+ "learning_rate": 8.453822206848842e-05,
1321
+ "loss": 0.571,
1322
+ "step": 1670
1323
+ },
1324
+ {
1325
+ "epoch": 4.07,
1326
+ "grad_norm": 3.7422375679016113,
1327
+ "learning_rate": 8.384641992390176e-05,
1328
+ "loss": 0.6791,
1329
+ "step": 1680
1330
+ },
1331
+ {
1332
+ "epoch": 4.09,
1333
+ "grad_norm": 5.4421467781066895,
1334
+ "learning_rate": 8.315461777931513e-05,
1335
+ "loss": 0.6829,
1336
+ "step": 1690
1337
+ },
1338
+ {
1339
+ "epoch": 4.12,
1340
+ "grad_norm": 1.9872852563858032,
1341
+ "learning_rate": 8.246281563472847e-05,
1342
+ "loss": 0.6455,
1343
+ "step": 1700
1344
+ },
1345
+ {
1346
+ "epoch": 4.12,
1347
+ "eval_accuracy": 0.8370497427101201,
1348
+ "eval_loss": 0.5820054411888123,
1349
+ "eval_runtime": 6.2231,
1350
+ "eval_samples_per_second": 187.366,
1351
+ "eval_steps_per_second": 23.461,
1352
+ "step": 1700
1353
+ },
1354
+ {
1355
+ "epoch": 4.14,
1356
+ "grad_norm": 3.1257195472717285,
1357
+ "learning_rate": 8.177101349014182e-05,
1358
+ "loss": 0.6619,
1359
+ "step": 1710
1360
+ },
1361
+ {
1362
+ "epoch": 4.16,
1363
+ "grad_norm": 3.6743292808532715,
1364
+ "learning_rate": 8.107921134555517e-05,
1365
+ "loss": 0.8357,
1366
+ "step": 1720
1367
+ },
1368
+ {
1369
+ "epoch": 4.19,
1370
+ "grad_norm": 3.7856836318969727,
1371
+ "learning_rate": 8.038740920096852e-05,
1372
+ "loss": 0.6017,
1373
+ "step": 1730
1374
+ },
1375
+ {
1376
+ "epoch": 4.21,
1377
+ "grad_norm": 4.6526970863342285,
1378
+ "learning_rate": 7.969560705638188e-05,
1379
+ "loss": 0.6805,
1380
+ "step": 1740
1381
+ },
1382
+ {
1383
+ "epoch": 4.24,
1384
+ "grad_norm": 3.4002761840820312,
1385
+ "learning_rate": 7.900380491179523e-05,
1386
+ "loss": 0.5558,
1387
+ "step": 1750
1388
+ },
1389
+ {
1390
+ "epoch": 4.26,
1391
+ "grad_norm": 3.9795327186584473,
1392
+ "learning_rate": 7.831200276720859e-05,
1393
+ "loss": 0.6921,
1394
+ "step": 1760
1395
+ },
1396
+ {
1397
+ "epoch": 4.29,
1398
+ "grad_norm": 3.5085155963897705,
1399
+ "learning_rate": 7.762020062262193e-05,
1400
+ "loss": 0.5476,
1401
+ "step": 1770
1402
+ },
1403
+ {
1404
+ "epoch": 4.31,
1405
+ "grad_norm": 5.0314412117004395,
1406
+ "learning_rate": 7.69283984780353e-05,
1407
+ "loss": 0.8566,
1408
+ "step": 1780
1409
+ },
1410
+ {
1411
+ "epoch": 4.33,
1412
+ "grad_norm": 2.536855697631836,
1413
+ "learning_rate": 7.623659633344864e-05,
1414
+ "loss": 0.5743,
1415
+ "step": 1790
1416
+ },
1417
+ {
1418
+ "epoch": 4.36,
1419
+ "grad_norm": 4.995050430297852,
1420
+ "learning_rate": 7.5544794188862e-05,
1421
+ "loss": 0.5402,
1422
+ "step": 1800
1423
+ },
1424
+ {
1425
+ "epoch": 4.36,
1426
+ "eval_accuracy": 0.8344768439108061,
1427
+ "eval_loss": 0.5796906352043152,
1428
+ "eval_runtime": 6.1279,
1429
+ "eval_samples_per_second": 190.278,
1430
+ "eval_steps_per_second": 23.826,
1431
+ "step": 1800
1432
+ },
1433
+ {
1434
+ "epoch": 4.38,
1435
+ "grad_norm": 4.143467903137207,
1436
+ "learning_rate": 7.485299204427533e-05,
1437
+ "loss": 0.6715,
1438
+ "step": 1810
1439
+ },
1440
+ {
1441
+ "epoch": 4.41,
1442
+ "grad_norm": 1.8152028322219849,
1443
+ "learning_rate": 7.416118989968869e-05,
1444
+ "loss": 0.6965,
1445
+ "step": 1820
1446
+ },
1447
+ {
1448
+ "epoch": 4.43,
1449
+ "grad_norm": 3.699620485305786,
1450
+ "learning_rate": 7.346938775510205e-05,
1451
+ "loss": 0.5758,
1452
+ "step": 1830
1453
+ },
1454
+ {
1455
+ "epoch": 4.46,
1456
+ "grad_norm": 2.2266180515289307,
1457
+ "learning_rate": 7.27775856105154e-05,
1458
+ "loss": 0.6802,
1459
+ "step": 1840
1460
+ },
1461
+ {
1462
+ "epoch": 4.48,
1463
+ "grad_norm": 4.586669445037842,
1464
+ "learning_rate": 7.208578346592874e-05,
1465
+ "loss": 0.5885,
1466
+ "step": 1850
1467
+ },
1468
+ {
1469
+ "epoch": 4.5,
1470
+ "grad_norm": 4.72069787979126,
1471
+ "learning_rate": 7.13939813213421e-05,
1472
+ "loss": 0.6404,
1473
+ "step": 1860
1474
+ },
1475
+ {
1476
+ "epoch": 4.53,
1477
+ "grad_norm": 5.436990261077881,
1478
+ "learning_rate": 7.070217917675545e-05,
1479
+ "loss": 0.7781,
1480
+ "step": 1870
1481
+ },
1482
+ {
1483
+ "epoch": 4.55,
1484
+ "grad_norm": 4.715204238891602,
1485
+ "learning_rate": 7.001037703216881e-05,
1486
+ "loss": 0.7109,
1487
+ "step": 1880
1488
+ },
1489
+ {
1490
+ "epoch": 4.58,
1491
+ "grad_norm": 3.261801242828369,
1492
+ "learning_rate": 6.931857488758215e-05,
1493
+ "loss": 0.5707,
1494
+ "step": 1890
1495
+ },
1496
+ {
1497
+ "epoch": 4.6,
1498
+ "grad_norm": 3.516954183578491,
1499
+ "learning_rate": 6.86267727429955e-05,
1500
+ "loss": 0.5311,
1501
+ "step": 1900
1502
+ },
1503
+ {
1504
+ "epoch": 4.6,
1505
+ "eval_accuracy": 0.8456260720411664,
1506
+ "eval_loss": 0.55106520652771,
1507
+ "eval_runtime": 6.3501,
1508
+ "eval_samples_per_second": 183.618,
1509
+ "eval_steps_per_second": 22.992,
1510
+ "step": 1900
1511
+ },
1512
+ {
1513
+ "epoch": 4.62,
1514
+ "grad_norm": 4.697694778442383,
1515
+ "learning_rate": 6.793497059840886e-05,
1516
+ "loss": 0.6169,
1517
+ "step": 1910
1518
+ },
1519
+ {
1520
+ "epoch": 4.65,
1521
+ "grad_norm": 4.627555847167969,
1522
+ "learning_rate": 6.724316845382221e-05,
1523
+ "loss": 0.649,
1524
+ "step": 1920
1525
+ },
1526
+ {
1527
+ "epoch": 4.67,
1528
+ "grad_norm": 3.16441011428833,
1529
+ "learning_rate": 6.662054652369423e-05,
1530
+ "loss": 0.6538,
1531
+ "step": 1930
1532
+ },
1533
+ {
1534
+ "epoch": 4.7,
1535
+ "grad_norm": 3.6413562297821045,
1536
+ "learning_rate": 6.592874437910757e-05,
1537
+ "loss": 0.5963,
1538
+ "step": 1940
1539
+ },
1540
+ {
1541
+ "epoch": 4.72,
1542
+ "grad_norm": 4.7628960609436035,
1543
+ "learning_rate": 6.523694223452093e-05,
1544
+ "loss": 0.5106,
1545
+ "step": 1950
1546
+ },
1547
+ {
1548
+ "epoch": 4.75,
1549
+ "grad_norm": 3.3812217712402344,
1550
+ "learning_rate": 6.454514008993428e-05,
1551
+ "loss": 0.6908,
1552
+ "step": 1960
1553
+ },
1554
+ {
1555
+ "epoch": 4.77,
1556
+ "grad_norm": 3.9284725189208984,
1557
+ "learning_rate": 6.385333794534764e-05,
1558
+ "loss": 0.4863,
1559
+ "step": 1970
1560
+ },
1561
+ {
1562
+ "epoch": 4.79,
1563
+ "grad_norm": 3.633194923400879,
1564
+ "learning_rate": 6.316153580076098e-05,
1565
+ "loss": 0.4383,
1566
+ "step": 1980
1567
+ },
1568
+ {
1569
+ "epoch": 4.82,
1570
+ "grad_norm": 6.324495792388916,
1571
+ "learning_rate": 6.246973365617433e-05,
1572
+ "loss": 0.8217,
1573
+ "step": 1990
1574
+ },
1575
+ {
1576
+ "epoch": 4.84,
1577
+ "grad_norm": 5.055554389953613,
1578
+ "learning_rate": 6.177793151158769e-05,
1579
+ "loss": 0.5734,
1580
+ "step": 2000
1581
+ },
1582
+ {
1583
+ "epoch": 4.84,
1584
+ "eval_accuracy": 0.8507718696397941,
1585
+ "eval_loss": 0.5443547964096069,
1586
+ "eval_runtime": 5.9845,
1587
+ "eval_samples_per_second": 194.837,
1588
+ "eval_steps_per_second": 24.396,
1589
+ "step": 2000
1590
+ },
1591
+ {
1592
+ "epoch": 4.87,
1593
+ "grad_norm": 3.0936367511749268,
1594
+ "learning_rate": 6.108612936700104e-05,
1595
+ "loss": 0.6563,
1596
+ "step": 2010
1597
+ },
1598
+ {
1599
+ "epoch": 4.89,
1600
+ "grad_norm": 3.478715181350708,
1601
+ "learning_rate": 6.039432722241439e-05,
1602
+ "loss": 0.595,
1603
+ "step": 2020
1604
+ },
1605
+ {
1606
+ "epoch": 4.92,
1607
+ "grad_norm": 3.4817001819610596,
1608
+ "learning_rate": 5.970252507782774e-05,
1609
+ "loss": 0.3782,
1610
+ "step": 2030
1611
+ },
1612
+ {
1613
+ "epoch": 4.94,
1614
+ "grad_norm": 6.603343963623047,
1615
+ "learning_rate": 5.901072293324109e-05,
1616
+ "loss": 0.5735,
1617
+ "step": 2040
1618
+ },
1619
+ {
1620
+ "epoch": 4.96,
1621
+ "grad_norm": 1.6107616424560547,
1622
+ "learning_rate": 5.831892078865445e-05,
1623
+ "loss": 0.6045,
1624
+ "step": 2050
1625
+ },
1626
+ {
1627
+ "epoch": 4.99,
1628
+ "grad_norm": 4.367840766906738,
1629
+ "learning_rate": 5.76271186440678e-05,
1630
+ "loss": 0.5533,
1631
+ "step": 2060
1632
+ },
1633
+ {
1634
+ "epoch": 5.01,
1635
+ "grad_norm": 6.6618266105651855,
1636
+ "learning_rate": 5.6935316499481154e-05,
1637
+ "loss": 0.4114,
1638
+ "step": 2070
1639
+ },
1640
+ {
1641
+ "epoch": 5.04,
1642
+ "grad_norm": 7.852776050567627,
1643
+ "learning_rate": 5.62435143548945e-05,
1644
+ "loss": 0.642,
1645
+ "step": 2080
1646
+ },
1647
+ {
1648
+ "epoch": 5.06,
1649
+ "grad_norm": 5.769771099090576,
1650
+ "learning_rate": 5.555171221030785e-05,
1651
+ "loss": 0.5934,
1652
+ "step": 2090
1653
+ },
1654
+ {
1655
+ "epoch": 5.08,
1656
+ "grad_norm": 2.4635396003723145,
1657
+ "learning_rate": 5.485991006572121e-05,
1658
+ "loss": 0.5206,
1659
+ "step": 2100
1660
+ },
1661
+ {
1662
+ "epoch": 5.08,
1663
+ "eval_accuracy": 0.8636363636363636,
1664
+ "eval_loss": 0.5326434969902039,
1665
+ "eval_runtime": 6.1606,
1666
+ "eval_samples_per_second": 189.267,
1667
+ "eval_steps_per_second": 23.699,
1668
+ "step": 2100
1669
+ },
1670
+ {
1671
+ "epoch": 5.11,
1672
+ "grad_norm": 4.699713706970215,
1673
+ "learning_rate": 5.416810792113456e-05,
1674
+ "loss": 0.6311,
1675
+ "step": 2110
1676
+ },
1677
+ {
1678
+ "epoch": 5.13,
1679
+ "grad_norm": 3.2119288444519043,
1680
+ "learning_rate": 5.347630577654791e-05,
1681
+ "loss": 0.4781,
1682
+ "step": 2120
1683
+ },
1684
+ {
1685
+ "epoch": 5.16,
1686
+ "grad_norm": 5.083879470825195,
1687
+ "learning_rate": 5.278450363196126e-05,
1688
+ "loss": 0.5123,
1689
+ "step": 2130
1690
+ },
1691
+ {
1692
+ "epoch": 5.18,
1693
+ "grad_norm": 3.2444283962249756,
1694
+ "learning_rate": 5.209270148737462e-05,
1695
+ "loss": 0.39,
1696
+ "step": 2140
1697
+ },
1698
+ {
1699
+ "epoch": 5.21,
1700
+ "grad_norm": 2.9540326595306396,
1701
+ "learning_rate": 5.140089934278797e-05,
1702
+ "loss": 0.5081,
1703
+ "step": 2150
1704
+ },
1705
+ {
1706
+ "epoch": 5.23,
1707
+ "grad_norm": 4.055675029754639,
1708
+ "learning_rate": 5.0709097198201316e-05,
1709
+ "loss": 0.5884,
1710
+ "step": 2160
1711
+ },
1712
+ {
1713
+ "epoch": 5.25,
1714
+ "grad_norm": 2.7214150428771973,
1715
+ "learning_rate": 5.0017295053614664e-05,
1716
+ "loss": 0.5241,
1717
+ "step": 2170
1718
+ },
1719
+ {
1720
+ "epoch": 5.28,
1721
+ "grad_norm": 1.249835729598999,
1722
+ "learning_rate": 4.932549290902802e-05,
1723
+ "loss": 0.3994,
1724
+ "step": 2180
1725
+ },
1726
+ {
1727
+ "epoch": 5.3,
1728
+ "grad_norm": 5.9494829177856445,
1729
+ "learning_rate": 4.863369076444137e-05,
1730
+ "loss": 0.5295,
1731
+ "step": 2190
1732
+ },
1733
+ {
1734
+ "epoch": 5.33,
1735
+ "grad_norm": 3.318251371383667,
1736
+ "learning_rate": 4.794188861985472e-05,
1737
+ "loss": 0.6272,
1738
+ "step": 2200
1739
+ },
1740
+ {
1741
+ "epoch": 5.33,
1742
+ "eval_accuracy": 0.8524871355060034,
1743
+ "eval_loss": 0.5477628707885742,
1744
+ "eval_runtime": 6.3218,
1745
+ "eval_samples_per_second": 184.442,
1746
+ "eval_steps_per_second": 23.095,
1747
+ "step": 2200
1748
+ },
1749
+ {
1750
+ "epoch": 5.35,
1751
+ "grad_norm": 4.580955505371094,
1752
+ "learning_rate": 4.725008647526807e-05,
1753
+ "loss": 0.5204,
1754
+ "step": 2210
1755
+ },
1756
+ {
1757
+ "epoch": 5.38,
1758
+ "grad_norm": 1.7544004917144775,
1759
+ "learning_rate": 4.6558284330681426e-05,
1760
+ "loss": 0.5388,
1761
+ "step": 2220
1762
+ },
1763
+ {
1764
+ "epoch": 5.4,
1765
+ "grad_norm": 4.454672336578369,
1766
+ "learning_rate": 4.586648218609478e-05,
1767
+ "loss": 0.4773,
1768
+ "step": 2230
1769
+ },
1770
+ {
1771
+ "epoch": 5.42,
1772
+ "grad_norm": 7.039458274841309,
1773
+ "learning_rate": 4.517468004150813e-05,
1774
+ "loss": 0.5821,
1775
+ "step": 2240
1776
+ },
1777
+ {
1778
+ "epoch": 5.45,
1779
+ "grad_norm": 4.6715006828308105,
1780
+ "learning_rate": 4.4482877896921485e-05,
1781
+ "loss": 0.5881,
1782
+ "step": 2250
1783
+ },
1784
+ {
1785
+ "epoch": 5.47,
1786
+ "grad_norm": 3.3161721229553223,
1787
+ "learning_rate": 4.379107575233484e-05,
1788
+ "loss": 0.5587,
1789
+ "step": 2260
1790
+ },
1791
+ {
1792
+ "epoch": 5.5,
1793
+ "grad_norm": 5.245398044586182,
1794
+ "learning_rate": 4.309927360774819e-05,
1795
+ "loss": 0.4973,
1796
+ "step": 2270
1797
+ },
1798
+ {
1799
+ "epoch": 5.52,
1800
+ "grad_norm": 3.1016721725463867,
1801
+ "learning_rate": 4.2407471463161536e-05,
1802
+ "loss": 0.4004,
1803
+ "step": 2280
1804
+ },
1805
+ {
1806
+ "epoch": 5.54,
1807
+ "grad_norm": 4.883015155792236,
1808
+ "learning_rate": 4.171566931857489e-05,
1809
+ "loss": 0.4267,
1810
+ "step": 2290
1811
+ },
1812
+ {
1813
+ "epoch": 5.57,
1814
+ "grad_norm": 4.550380229949951,
1815
+ "learning_rate": 4.102386717398824e-05,
1816
+ "loss": 0.5124,
1817
+ "step": 2300
1818
+ },
1819
+ {
1820
+ "epoch": 5.57,
1821
+ "eval_accuracy": 0.8687821612349914,
1822
+ "eval_loss": 0.5295912623405457,
1823
+ "eval_runtime": 6.0432,
1824
+ "eval_samples_per_second": 192.944,
1825
+ "eval_steps_per_second": 24.159,
1826
+ "step": 2300
1827
+ },
1828
+ {
1829
+ "epoch": 5.59,
1830
+ "grad_norm": 3.2469303607940674,
1831
+ "learning_rate": 4.0332065029401595e-05,
1832
+ "loss": 0.4733,
1833
+ "step": 2310
1834
+ },
1835
+ {
1836
+ "epoch": 5.62,
1837
+ "grad_norm": 5.656401634216309,
1838
+ "learning_rate": 3.964026288481494e-05,
1839
+ "loss": 0.5978,
1840
+ "step": 2320
1841
+ },
1842
+ {
1843
+ "epoch": 5.64,
1844
+ "grad_norm": 2.7476541996002197,
1845
+ "learning_rate": 3.89484607402283e-05,
1846
+ "loss": 0.3513,
1847
+ "step": 2330
1848
+ },
1849
+ {
1850
+ "epoch": 5.67,
1851
+ "grad_norm": 4.047815322875977,
1852
+ "learning_rate": 3.825665859564165e-05,
1853
+ "loss": 0.5287,
1854
+ "step": 2340
1855
+ },
1856
+ {
1857
+ "epoch": 5.69,
1858
+ "grad_norm": 3.4885923862457275,
1859
+ "learning_rate": 3.7564856451055e-05,
1860
+ "loss": 0.4946,
1861
+ "step": 2350
1862
+ },
1863
+ {
1864
+ "epoch": 5.71,
1865
+ "grad_norm": 7.513520240783691,
1866
+ "learning_rate": 3.687305430646835e-05,
1867
+ "loss": 0.6233,
1868
+ "step": 2360
1869
+ },
1870
+ {
1871
+ "epoch": 5.74,
1872
+ "grad_norm": 2.3985989093780518,
1873
+ "learning_rate": 3.61812521618817e-05,
1874
+ "loss": 0.5149,
1875
+ "step": 2370
1876
+ },
1877
+ {
1878
+ "epoch": 5.76,
1879
+ "grad_norm": 5.046018123626709,
1880
+ "learning_rate": 3.5489450017295054e-05,
1881
+ "loss": 0.4948,
1882
+ "step": 2380
1883
+ },
1884
+ {
1885
+ "epoch": 5.79,
1886
+ "grad_norm": 2.6082875728607178,
1887
+ "learning_rate": 3.479764787270841e-05,
1888
+ "loss": 0.6084,
1889
+ "step": 2390
1890
+ },
1891
+ {
1892
+ "epoch": 5.81,
1893
+ "grad_norm": 2.541283369064331,
1894
+ "learning_rate": 3.410584572812176e-05,
1895
+ "loss": 0.5659,
1896
+ "step": 2400
1897
+ },
1898
+ {
1899
+ "epoch": 5.81,
1900
+ "eval_accuracy": 0.8704974271012007,
1901
+ "eval_loss": 0.5180826783180237,
1902
+ "eval_runtime": 6.1391,
1903
+ "eval_samples_per_second": 189.929,
1904
+ "eval_steps_per_second": 23.782,
1905
+ "step": 2400
1906
+ },
1907
+ {
1908
+ "epoch": 5.84,
1909
+ "grad_norm": 2.853994846343994,
1910
+ "learning_rate": 3.341404358353511e-05,
1911
+ "loss": 0.6081,
1912
+ "step": 2410
1913
+ },
1914
+ {
1915
+ "epoch": 5.86,
1916
+ "grad_norm": 4.628828525543213,
1917
+ "learning_rate": 3.272224143894847e-05,
1918
+ "loss": 0.4588,
1919
+ "step": 2420
1920
+ },
1921
+ {
1922
+ "epoch": 5.88,
1923
+ "grad_norm": 3.1006319522857666,
1924
+ "learning_rate": 3.2030439294361816e-05,
1925
+ "loss": 0.4341,
1926
+ "step": 2430
1927
+ },
1928
+ {
1929
+ "epoch": 5.91,
1930
+ "grad_norm": 2.395719528198242,
1931
+ "learning_rate": 3.133863714977517e-05,
1932
+ "loss": 0.442,
1933
+ "step": 2440
1934
+ },
1935
+ {
1936
+ "epoch": 5.93,
1937
+ "grad_norm": 3.238839864730835,
1938
+ "learning_rate": 3.064683500518852e-05,
1939
+ "loss": 0.4359,
1940
+ "step": 2450
1941
+ },
1942
+ {
1943
+ "epoch": 5.96,
1944
+ "grad_norm": 5.706843852996826,
1945
+ "learning_rate": 2.9955032860601867e-05,
1946
+ "loss": 0.5139,
1947
+ "step": 2460
1948
+ },
1949
+ {
1950
+ "epoch": 5.98,
1951
+ "grad_norm": 6.059083461761475,
1952
+ "learning_rate": 2.9263230716015223e-05,
1953
+ "loss": 0.4459,
1954
+ "step": 2470
1955
+ },
1956
+ {
1957
+ "epoch": 6.0,
1958
+ "grad_norm": 4.164783954620361,
1959
+ "learning_rate": 2.857142857142857e-05,
1960
+ "loss": 0.5037,
1961
+ "step": 2480
1962
+ },
1963
+ {
1964
+ "epoch": 6.03,
1965
+ "grad_norm": 3.230203151702881,
1966
+ "learning_rate": 2.7879626426841926e-05,
1967
+ "loss": 0.4225,
1968
+ "step": 2490
1969
+ },
1970
+ {
1971
+ "epoch": 6.05,
1972
+ "grad_norm": 5.467190742492676,
1973
+ "learning_rate": 2.7187824282255274e-05,
1974
+ "loss": 0.4212,
1975
+ "step": 2500
1976
+ },
1977
+ {
1978
+ "epoch": 6.05,
1979
+ "eval_accuracy": 0.8610634648370498,
1980
+ "eval_loss": 0.5200443267822266,
1981
+ "eval_runtime": 6.2608,
1982
+ "eval_samples_per_second": 186.239,
1983
+ "eval_steps_per_second": 23.32,
1984
+ "step": 2500
1985
+ },
1986
+ {
1987
+ "epoch": 6.08,
1988
+ "grad_norm": 3.7668442726135254,
1989
+ "learning_rate": 2.649602213766863e-05,
1990
+ "loss": 0.4042,
1991
+ "step": 2510
1992
+ },
1993
+ {
1994
+ "epoch": 6.1,
1995
+ "grad_norm": 3.094477415084839,
1996
+ "learning_rate": 2.580421999308198e-05,
1997
+ "loss": 0.4338,
1998
+ "step": 2520
1999
+ },
2000
+ {
2001
+ "epoch": 6.13,
2002
+ "grad_norm": 5.538024425506592,
2003
+ "learning_rate": 2.5112417848495333e-05,
2004
+ "loss": 0.3269,
2005
+ "step": 2530
2006
+ },
2007
+ {
2008
+ "epoch": 6.15,
2009
+ "grad_norm": 5.658746719360352,
2010
+ "learning_rate": 2.4420615703908685e-05,
2011
+ "loss": 0.4719,
2012
+ "step": 2540
2013
+ },
2014
+ {
2015
+ "epoch": 6.17,
2016
+ "grad_norm": 1.6886987686157227,
2017
+ "learning_rate": 2.3728813559322036e-05,
2018
+ "loss": 0.395,
2019
+ "step": 2550
2020
+ },
2021
+ {
2022
+ "epoch": 6.2,
2023
+ "grad_norm": 3.538180112838745,
2024
+ "learning_rate": 2.3037011414735388e-05,
2025
+ "loss": 0.2877,
2026
+ "step": 2560
2027
+ },
2028
+ {
2029
+ "epoch": 6.22,
2030
+ "grad_norm": 2.9912898540496826,
2031
+ "learning_rate": 2.234520927014874e-05,
2032
+ "loss": 0.4797,
2033
+ "step": 2570
2034
+ },
2035
+ {
2036
+ "epoch": 6.25,
2037
+ "grad_norm": 2.68037748336792,
2038
+ "learning_rate": 2.1653407125562088e-05,
2039
+ "loss": 0.5114,
2040
+ "step": 2580
2041
+ },
2042
+ {
2043
+ "epoch": 6.27,
2044
+ "grad_norm": 5.079796314239502,
2045
+ "learning_rate": 2.096160498097544e-05,
2046
+ "loss": 0.3604,
2047
+ "step": 2590
2048
+ },
2049
+ {
2050
+ "epoch": 6.3,
2051
+ "grad_norm": 3.052543878555298,
2052
+ "learning_rate": 2.026980283638879e-05,
2053
+ "loss": 0.4338,
2054
+ "step": 2600
2055
+ },
2056
+ {
2057
+ "epoch": 6.3,
2058
+ "eval_accuracy": 0.8730703259005146,
2059
+ "eval_loss": 0.5135151743888855,
2060
+ "eval_runtime": 5.9846,
2061
+ "eval_samples_per_second": 194.834,
2062
+ "eval_steps_per_second": 24.396,
2063
+ "step": 2600
2064
+ },
2065
+ {
2066
+ "epoch": 6.32,
2067
+ "grad_norm": 5.780861854553223,
2068
+ "learning_rate": 1.9578000691802147e-05,
2069
+ "loss": 0.3725,
2070
+ "step": 2610
2071
+ },
2072
+ {
2073
+ "epoch": 6.34,
2074
+ "grad_norm": 4.87053108215332,
2075
+ "learning_rate": 1.88861985472155e-05,
2076
+ "loss": 0.2491,
2077
+ "step": 2620
2078
+ },
2079
+ {
2080
+ "epoch": 6.37,
2081
+ "grad_norm": 2.2995293140411377,
2082
+ "learning_rate": 1.819439640262885e-05,
2083
+ "loss": 0.2911,
2084
+ "step": 2630
2085
+ },
2086
+ {
2087
+ "epoch": 6.39,
2088
+ "grad_norm": 1.6383118629455566,
2089
+ "learning_rate": 1.7502594258042202e-05,
2090
+ "loss": 0.2562,
2091
+ "step": 2640
2092
+ },
2093
+ {
2094
+ "epoch": 6.42,
2095
+ "grad_norm": 4.9596991539001465,
2096
+ "learning_rate": 1.6810792113455554e-05,
2097
+ "loss": 0.5795,
2098
+ "step": 2650
2099
+ },
2100
+ {
2101
+ "epoch": 6.44,
2102
+ "grad_norm": 2.922712802886963,
2103
+ "learning_rate": 1.6118989968868905e-05,
2104
+ "loss": 0.421,
2105
+ "step": 2660
2106
+ },
2107
+ {
2108
+ "epoch": 6.46,
2109
+ "grad_norm": 2.0401623249053955,
2110
+ "learning_rate": 1.5427187824282254e-05,
2111
+ "loss": 0.4283,
2112
+ "step": 2670
2113
+ },
2114
+ {
2115
+ "epoch": 6.49,
2116
+ "grad_norm": 0.9165148735046387,
2117
+ "learning_rate": 1.4735385679695607e-05,
2118
+ "loss": 0.4512,
2119
+ "step": 2680
2120
+ },
2121
+ {
2122
+ "epoch": 6.51,
2123
+ "grad_norm": 4.587483882904053,
2124
+ "learning_rate": 1.4043583535108959e-05,
2125
+ "loss": 0.4664,
2126
+ "step": 2690
2127
+ },
2128
+ {
2129
+ "epoch": 6.54,
2130
+ "grad_norm": 4.216481685638428,
2131
+ "learning_rate": 1.335178139052231e-05,
2132
+ "loss": 0.3407,
2133
+ "step": 2700
2134
+ },
2135
+ {
2136
+ "epoch": 6.54,
2137
+ "eval_accuracy": 0.87221269296741,
2138
+ "eval_loss": 0.5147121548652649,
2139
+ "eval_runtime": 6.1635,
2140
+ "eval_samples_per_second": 189.179,
2141
+ "eval_steps_per_second": 23.688,
2142
+ "step": 2700
2143
+ },
2144
+ {
2145
+ "epoch": 6.56,
2146
+ "grad_norm": 1.7551047801971436,
2147
+ "learning_rate": 1.2659979245935664e-05,
2148
+ "loss": 0.4725,
2149
+ "step": 2710
2150
+ },
2151
+ {
2152
+ "epoch": 6.59,
2153
+ "grad_norm": 4.851523399353027,
2154
+ "learning_rate": 1.1968177101349016e-05,
2155
+ "loss": 0.4639,
2156
+ "step": 2720
2157
+ },
2158
+ {
2159
+ "epoch": 6.61,
2160
+ "grad_norm": 6.040704727172852,
2161
+ "learning_rate": 1.1276374956762366e-05,
2162
+ "loss": 0.3146,
2163
+ "step": 2730
2164
+ },
2165
+ {
2166
+ "epoch": 6.63,
2167
+ "grad_norm": 1.6925532817840576,
2168
+ "learning_rate": 1.0584572812175717e-05,
2169
+ "loss": 0.3665,
2170
+ "step": 2740
2171
+ },
2172
+ {
2173
+ "epoch": 6.66,
2174
+ "grad_norm": 2.9491493701934814,
2175
+ "learning_rate": 9.89277066758907e-06,
2176
+ "loss": 0.467,
2177
+ "step": 2750
2178
+ },
2179
+ {
2180
+ "epoch": 6.68,
2181
+ "grad_norm": 2.1744699478149414,
2182
+ "learning_rate": 9.200968523002422e-06,
2183
+ "loss": 0.3542,
2184
+ "step": 2760
2185
+ },
2186
+ {
2187
+ "epoch": 6.71,
2188
+ "grad_norm": 3.170931577682495,
2189
+ "learning_rate": 8.509166378415774e-06,
2190
+ "loss": 0.5874,
2191
+ "step": 2770
2192
+ },
2193
+ {
2194
+ "epoch": 6.73,
2195
+ "grad_norm": 3.2446773052215576,
2196
+ "learning_rate": 7.817364233829124e-06,
2197
+ "loss": 0.3705,
2198
+ "step": 2780
2199
+ },
2200
+ {
2201
+ "epoch": 6.76,
2202
+ "grad_norm": 3.8055498600006104,
2203
+ "learning_rate": 7.125562089242477e-06,
2204
+ "loss": 0.3164,
2205
+ "step": 2790
2206
+ },
2207
+ {
2208
+ "epoch": 6.78,
2209
+ "grad_norm": 2.3979437351226807,
2210
+ "learning_rate": 6.4337599446558285e-06,
2211
+ "loss": 0.4043,
2212
+ "step": 2800
2213
+ },
2214
+ {
2215
+ "epoch": 6.78,
2216
+ "eval_accuracy": 0.869639794168096,
2217
+ "eval_loss": 0.5081329345703125,
2218
+ "eval_runtime": 6.6143,
2219
+ "eval_samples_per_second": 176.285,
2220
+ "eval_steps_per_second": 22.073,
2221
+ "step": 2800
2222
+ },
2223
+ {
2224
+ "epoch": 6.8,
2225
+ "grad_norm": 1.7395985126495361,
2226
+ "learning_rate": 5.74195780006918e-06,
2227
+ "loss": 0.3624,
2228
+ "step": 2810
2229
+ },
2230
+ {
2231
+ "epoch": 6.83,
2232
+ "grad_norm": 2.924905300140381,
2233
+ "learning_rate": 5.050155655482532e-06,
2234
+ "loss": 0.4046,
2235
+ "step": 2820
2236
+ },
2237
+ {
2238
+ "epoch": 6.85,
2239
+ "grad_norm": 11.709400177001953,
2240
+ "learning_rate": 4.358353510895884e-06,
2241
+ "loss": 0.4807,
2242
+ "step": 2830
2243
+ },
2244
+ {
2245
+ "epoch": 6.88,
2246
+ "grad_norm": 6.416582107543945,
2247
+ "learning_rate": 3.666551366309236e-06,
2248
+ "loss": 0.4782,
2249
+ "step": 2840
2250
+ },
2251
+ {
2252
+ "epoch": 6.9,
2253
+ "grad_norm": 6.1391448974609375,
2254
+ "learning_rate": 2.9747492217225875e-06,
2255
+ "loss": 0.4852,
2256
+ "step": 2850
2257
+ },
2258
+ {
2259
+ "epoch": 6.92,
2260
+ "grad_norm": 3.525520086288452,
2261
+ "learning_rate": 2.2829470771359392e-06,
2262
+ "loss": 0.4282,
2263
+ "step": 2860
2264
+ },
2265
+ {
2266
+ "epoch": 6.95,
2267
+ "grad_norm": 1.4197200536727905,
2268
+ "learning_rate": 1.591144932549291e-06,
2269
+ "loss": 0.4337,
2270
+ "step": 2870
2271
+ },
2272
+ {
2273
+ "epoch": 6.97,
2274
+ "grad_norm": 4.016748905181885,
2275
+ "learning_rate": 8.993427879626428e-07,
2276
+ "loss": 0.3915,
2277
+ "step": 2880
2278
+ },
2279
+ {
2280
+ "epoch": 7.0,
2281
+ "grad_norm": 2.1515309810638428,
2282
+ "learning_rate": 2.0754064337599448e-07,
2283
+ "loss": 0.4095,
2284
+ "step": 2890
2285
+ },
2286
+ {
2287
+ "epoch": 7.0,
2288
+ "step": 2891,
2289
+ "total_flos": 3.5833623598425784e+18,
2290
+ "train_loss": 0.7298227465279536,
2291
+ "train_runtime": 1041.6701,
2292
+ "train_samples_per_second": 44.372,
2293
+ "train_steps_per_second": 2.775
2294
  }
2295
  ],
2296
  "logging_steps": 10,
2297
+ "max_steps": 2891,
2298
  "num_input_tokens_seen": 0,
2299
+ "num_train_epochs": 7,
2300
  "save_steps": 100,
2301
+ "total_flos": 3.5833623598425784e+18,
2302
  "train_batch_size": 16,
2303
  "trial_name": null,
2304
  "trial_params": null