qubvel-hf HF staff commited on
Commit
85f72e4
1 Parent(s): 614858b

End of training

Browse files
README.md CHANGED
@@ -3,6 +3,8 @@ library_name: transformers
3
  license: apache-2.0
4
  base_model: timm/resnet18.a1_in1k
5
  tags:
 
 
6
  - generated_from_trainer
7
  metrics:
8
  - accuracy
@@ -16,7 +18,7 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # vit-base-beans
18
 
19
- This model is a fine-tuned version of [timm/resnet18.a1_in1k](https://huggingface.co/timm/resnet18.a1_in1k) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 0.7412
22
  - Accuracy: 0.8120
 
3
  license: apache-2.0
4
  base_model: timm/resnet18.a1_in1k
5
  tags:
6
+ - image-classification
7
+ - vision
8
  - generated_from_trainer
9
  metrics:
10
  - accuracy
 
18
 
19
  # vit-base-beans
20
 
21
+ This model is a fine-tuned version of [timm/resnet18.a1_in1k](https://huggingface.co/timm/resnet18.a1_in1k) on the beans dataset.
22
  It achieves the following results on the evaluation set:
23
  - Loss: 0.7412
24
  - Accuracy: 0.8120
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 15.0,
3
- "eval_accuracy": 0.3308270676691729,
4
- "eval_loss": 1.1282602548599243,
5
- "eval_runtime": 0.9945,
6
- "eval_samples_per_second": 133.736,
7
- "eval_steps_per_second": 17.094,
8
  "total_flos": 1.5658365504595968e+17,
9
- "train_loss": 0.0,
10
- "train_runtime": 1.3792,
11
- "train_samples_per_second": 11245.899,
12
- "train_steps_per_second": 1413.894
13
  }
 
1
  {
2
  "epoch": 15.0,
3
+ "eval_accuracy": 0.8120300751879699,
4
+ "eval_loss": 0.7412300109863281,
5
+ "eval_runtime": 0.7631,
6
+ "eval_samples_per_second": 174.289,
7
+ "eval_steps_per_second": 22.278,
8
  "total_flos": 1.5658365504595968e+17,
9
+ "train_loss": 0.9236146088135548,
10
+ "train_runtime": 142.953,
11
+ "train_samples_per_second": 108.497,
12
+ "train_steps_per_second": 13.641
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 15.0,
3
- "eval_accuracy": 0.3308270676691729,
4
- "eval_loss": 1.1282602548599243,
5
- "eval_runtime": 0.9945,
6
- "eval_samples_per_second": 133.736,
7
- "eval_steps_per_second": 17.094
8
  }
 
1
  {
2
  "epoch": 15.0,
3
+ "eval_accuracy": 0.8120300751879699,
4
+ "eval_loss": 0.7412300109863281,
5
+ "eval_runtime": 0.7631,
6
+ "eval_samples_per_second": 174.289,
7
+ "eval_steps_per_second": 22.278
8
  }
runs/Nov19_22-41-36_ip-10-90-1-182/events.out.tfevents.1732056245.ip-10-90-1-182.3353433.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52a138c5c6c4abe6b692847b5659e465f472b55c50418466ebd24a00a9f14a8d
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 15.0,
3
  "total_flos": 1.5658365504595968e+17,
4
- "train_loss": 0.0,
5
- "train_runtime": 1.3792,
6
- "train_samples_per_second": 11245.899,
7
- "train_steps_per_second": 1413.894
8
  }
 
1
  {
2
  "epoch": 15.0,
3
  "total_flos": 1.5658365504595968e+17,
4
+ "train_loss": 0.9236146088135548,
5
+ "train_runtime": 142.953,
6
+ "train_samples_per_second": 108.497,
7
+ "train_steps_per_second": 13.641
8
  }
trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 0.738013505935669,
3
  "best_model_checkpoint": "./beans_outputs/checkpoint-1950",
4
  "epoch": 15.0,
5
  "eval_steps": 500,
@@ -10,1512 +10,1512 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.07692307692307693,
13
- "grad_norm": 2.099634885787964,
14
  "learning_rate": 1.98974358974359e-05,
15
  "loss": 1.1239,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.15384615384615385,
20
- "grad_norm": 1.7766097784042358,
21
  "learning_rate": 1.9794871794871798e-05,
22
  "loss": 1.1221,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.23076923076923078,
27
- "grad_norm": 1.9635370969772339,
28
  "learning_rate": 1.9692307692307696e-05,
29
  "loss": 1.1164,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.3076923076923077,
34
- "grad_norm": 2.459880828857422,
35
  "learning_rate": 1.958974358974359e-05,
36
- "loss": 1.1045,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.38461538461538464,
41
- "grad_norm": 1.7044235467910767,
42
  "learning_rate": 1.9487179487179488e-05,
43
- "loss": 1.1083,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.46153846153846156,
48
- "grad_norm": 1.4914859533309937,
49
  "learning_rate": 1.9384615384615386e-05,
50
- "loss": 1.1044,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.5384615384615384,
55
- "grad_norm": 2.6082077026367188,
56
  "learning_rate": 1.9282051282051284e-05,
57
- "loss": 1.1027,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.6153846153846154,
62
- "grad_norm": 2.722012996673584,
63
  "learning_rate": 1.9179487179487182e-05,
64
  "loss": 1.0908,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.6923076923076923,
69
- "grad_norm": 2.029851198196411,
70
  "learning_rate": 1.907692307692308e-05,
71
- "loss": 1.0941,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.7692307692307693,
76
- "grad_norm": 2.4984819889068604,
77
  "learning_rate": 1.8974358974358975e-05,
78
- "loss": 1.0913,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.8461538461538461,
83
- "grad_norm": 1.7019662857055664,
84
  "learning_rate": 1.8871794871794873e-05,
85
- "loss": 1.0945,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.9230769230769231,
90
- "grad_norm": 2.2935667037963867,
91
  "learning_rate": 1.876923076923077e-05,
92
- "loss": 1.0994,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 1.0,
97
- "grad_norm": 3.9286272525787354,
98
  "learning_rate": 1.866666666666667e-05,
99
- "loss": 1.0861,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 1.0,
104
  "eval_accuracy": 0.42857142857142855,
105
- "eval_loss": 1.0880533456802368,
106
- "eval_runtime": 0.7946,
107
- "eval_samples_per_second": 167.381,
108
- "eval_steps_per_second": 21.395,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 1.0769230769230769,
113
- "grad_norm": 2.083320140838623,
114
  "learning_rate": 1.8564102564102567e-05,
115
- "loss": 1.0772,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 1.1538461538461537,
120
- "grad_norm": 2.711829423904419,
121
  "learning_rate": 1.8461538461538465e-05,
122
- "loss": 1.072,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.2307692307692308,
127
- "grad_norm": 1.5575768947601318,
128
  "learning_rate": 1.835897435897436e-05,
129
- "loss": 1.0787,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 1.3076923076923077,
134
- "grad_norm": 2.3275976181030273,
135
  "learning_rate": 1.8256410256410257e-05,
136
- "loss": 1.0935,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 1.3846153846153846,
141
- "grad_norm": 2.1124396324157715,
142
  "learning_rate": 1.8153846153846155e-05,
143
- "loss": 1.0806,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 1.4615384615384617,
148
- "grad_norm": 2.1334290504455566,
149
  "learning_rate": 1.8051282051282053e-05,
150
- "loss": 1.0796,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 1.5384615384615383,
155
- "grad_norm": 1.7488656044006348,
156
  "learning_rate": 1.794871794871795e-05,
157
- "loss": 1.0762,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 1.6153846153846154,
162
- "grad_norm": 1.7498658895492554,
163
  "learning_rate": 1.784615384615385e-05,
164
- "loss": 1.0682,
165
  "step": 210
166
  },
167
  {
168
  "epoch": 1.6923076923076923,
169
- "grad_norm": 3.5284996032714844,
170
  "learning_rate": 1.7743589743589744e-05,
171
- "loss": 1.0699,
172
  "step": 220
173
  },
174
  {
175
  "epoch": 1.7692307692307692,
176
- "grad_norm": 2.088862895965576,
177
  "learning_rate": 1.7641025641025642e-05,
178
- "loss": 1.0707,
179
  "step": 230
180
  },
181
  {
182
  "epoch": 1.8461538461538463,
183
- "grad_norm": 1.9373514652252197,
184
  "learning_rate": 1.753846153846154e-05,
185
- "loss": 1.0864,
186
  "step": 240
187
  },
188
  {
189
  "epoch": 1.9230769230769231,
190
- "grad_norm": 2.00813889503479,
191
  "learning_rate": 1.7435897435897438e-05,
192
- "loss": 1.0681,
193
  "step": 250
194
  },
195
  {
196
  "epoch": 2.0,
197
- "grad_norm": 3.0624523162841797,
198
  "learning_rate": 1.7333333333333336e-05,
199
- "loss": 1.0631,
200
  "step": 260
201
  },
202
  {
203
  "epoch": 2.0,
204
- "eval_accuracy": 0.5413533834586466,
205
- "eval_loss": 1.0597343444824219,
206
- "eval_runtime": 0.7394,
207
- "eval_samples_per_second": 179.866,
208
- "eval_steps_per_second": 22.99,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 2.076923076923077,
213
- "grad_norm": 2.6268935203552246,
214
  "learning_rate": 1.7230769230769234e-05,
215
- "loss": 1.0716,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 2.1538461538461537,
220
- "grad_norm": 2.3874154090881348,
221
  "learning_rate": 1.7128205128205128e-05,
222
- "loss": 1.0599,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 2.230769230769231,
227
- "grad_norm": 2.3503990173339844,
228
  "learning_rate": 1.7025641025641026e-05,
229
- "loss": 1.054,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 2.3076923076923075,
234
- "grad_norm": 2.3312108516693115,
235
  "learning_rate": 1.6923076923076924e-05,
236
- "loss": 1.0436,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 2.3846153846153846,
241
- "grad_norm": 1.9090198278427124,
242
  "learning_rate": 1.6820512820512822e-05,
243
- "loss": 1.0569,
244
  "step": 310
245
  },
246
  {
247
  "epoch": 2.4615384615384617,
248
- "grad_norm": 1.9505618810653687,
249
  "learning_rate": 1.671794871794872e-05,
250
- "loss": 1.0479,
251
  "step": 320
252
  },
253
  {
254
  "epoch": 2.5384615384615383,
255
- "grad_norm": 1.4548966884613037,
256
  "learning_rate": 1.6615384615384618e-05,
257
- "loss": 1.0304,
258
  "step": 330
259
  },
260
  {
261
  "epoch": 2.6153846153846154,
262
- "grad_norm": 2.1505134105682373,
263
  "learning_rate": 1.6512820512820513e-05,
264
- "loss": 1.0467,
265
  "step": 340
266
  },
267
  {
268
  "epoch": 2.6923076923076925,
269
- "grad_norm": 2.8420169353485107,
270
  "learning_rate": 1.641025641025641e-05,
271
- "loss": 1.0509,
272
  "step": 350
273
  },
274
  {
275
  "epoch": 2.769230769230769,
276
- "grad_norm": 1.8315626382827759,
277
  "learning_rate": 1.630769230769231e-05,
278
  "loss": 1.0407,
279
  "step": 360
280
  },
281
  {
282
  "epoch": 2.8461538461538463,
283
- "grad_norm": 1.8499083518981934,
284
  "learning_rate": 1.6205128205128207e-05,
285
- "loss": 1.0344,
286
  "step": 370
287
  },
288
  {
289
  "epoch": 2.9230769230769234,
290
- "grad_norm": 1.9010175466537476,
291
  "learning_rate": 1.6102564102564105e-05,
292
- "loss": 1.0219,
293
  "step": 380
294
  },
295
  {
296
  "epoch": 3.0,
297
- "grad_norm": 4.809950828552246,
298
  "learning_rate": 1.6000000000000003e-05,
299
- "loss": 1.0443,
300
  "step": 390
301
  },
302
  {
303
  "epoch": 3.0,
304
- "eval_accuracy": 0.6691729323308271,
305
- "eval_loss": 1.0225275754928589,
306
- "eval_runtime": 0.7405,
307
- "eval_samples_per_second": 179.608,
308
- "eval_steps_per_second": 22.957,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 3.076923076923077,
313
- "grad_norm": 2.195730686187744,
314
  "learning_rate": 1.5897435897435897e-05,
315
- "loss": 1.0414,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 3.1538461538461537,
320
- "grad_norm": 2.1667442321777344,
321
  "learning_rate": 1.5794871794871795e-05,
322
- "loss": 1.0302,
323
  "step": 410
324
  },
325
  {
326
  "epoch": 3.230769230769231,
327
- "grad_norm": 2.149664878845215,
328
  "learning_rate": 1.5692307692307693e-05,
329
- "loss": 1.0365,
330
  "step": 420
331
  },
332
  {
333
  "epoch": 3.3076923076923075,
334
- "grad_norm": 2.2559187412261963,
335
  "learning_rate": 1.558974358974359e-05,
336
  "loss": 1.0159,
337
  "step": 430
338
  },
339
  {
340
  "epoch": 3.3846153846153846,
341
- "grad_norm": 2.165260076522827,
342
  "learning_rate": 1.548717948717949e-05,
343
- "loss": 1.0274,
344
  "step": 440
345
  },
346
  {
347
  "epoch": 3.4615384615384617,
348
- "grad_norm": 1.799578309059143,
349
  "learning_rate": 1.5384615384615387e-05,
350
- "loss": 1.0371,
351
  "step": 450
352
  },
353
  {
354
  "epoch": 3.5384615384615383,
355
- "grad_norm": 2.441862106323242,
356
  "learning_rate": 1.5282051282051282e-05,
357
  "loss": 1.0322,
358
  "step": 460
359
  },
360
  {
361
  "epoch": 3.6153846153846154,
362
- "grad_norm": 1.571476936340332,
363
  "learning_rate": 1.517948717948718e-05,
364
- "loss": 1.0046,
365
  "step": 470
366
  },
367
  {
368
  "epoch": 3.6923076923076925,
369
- "grad_norm": 2.725184440612793,
370
  "learning_rate": 1.5076923076923078e-05,
371
- "loss": 1.0081,
372
  "step": 480
373
  },
374
  {
375
  "epoch": 3.769230769230769,
376
- "grad_norm": 1.5710386037826538,
377
  "learning_rate": 1.4974358974358976e-05,
378
- "loss": 0.9977,
379
  "step": 490
380
  },
381
  {
382
  "epoch": 3.8461538461538463,
383
- "grad_norm": 1.6186603307724,
384
  "learning_rate": 1.4871794871794874e-05,
385
- "loss": 1.0217,
386
  "step": 500
387
  },
388
  {
389
  "epoch": 3.9230769230769234,
390
- "grad_norm": 1.8194940090179443,
391
  "learning_rate": 1.4769230769230772e-05,
392
- "loss": 0.997,
393
  "step": 510
394
  },
395
  {
396
  "epoch": 4.0,
397
- "grad_norm": 5.478014945983887,
398
  "learning_rate": 1.4666666666666666e-05,
399
- "loss": 1.0218,
400
  "step": 520
401
  },
402
  {
403
  "epoch": 4.0,
404
- "eval_accuracy": 0.6842105263157895,
405
- "eval_loss": 0.9959980845451355,
406
- "eval_runtime": 0.7638,
407
- "eval_samples_per_second": 174.124,
408
- "eval_steps_per_second": 22.256,
409
  "step": 520
410
  },
411
  {
412
  "epoch": 4.076923076923077,
413
- "grad_norm": 2.7807815074920654,
414
  "learning_rate": 1.4564102564102564e-05,
415
- "loss": 1.0126,
416
  "step": 530
417
  },
418
  {
419
  "epoch": 4.153846153846154,
420
- "grad_norm": 1.8810123205184937,
421
  "learning_rate": 1.4461538461538462e-05,
422
- "loss": 1.0142,
423
  "step": 540
424
  },
425
  {
426
  "epoch": 4.230769230769231,
427
- "grad_norm": 2.737741708755493,
428
  "learning_rate": 1.435897435897436e-05,
429
- "loss": 1.0241,
430
  "step": 550
431
  },
432
  {
433
  "epoch": 4.3076923076923075,
434
- "grad_norm": 2.183462381362915,
435
  "learning_rate": 1.4256410256410258e-05,
436
- "loss": 1.0324,
437
  "step": 560
438
  },
439
  {
440
  "epoch": 4.384615384615385,
441
- "grad_norm": 1.9445362091064453,
442
  "learning_rate": 1.4153846153846156e-05,
443
- "loss": 0.9823,
444
  "step": 570
445
  },
446
  {
447
  "epoch": 4.461538461538462,
448
- "grad_norm": 2.8362057209014893,
449
  "learning_rate": 1.405128205128205e-05,
450
- "loss": 0.9919,
451
  "step": 580
452
  },
453
  {
454
  "epoch": 4.538461538461538,
455
- "grad_norm": 2.5588510036468506,
456
  "learning_rate": 1.3948717948717949e-05,
457
- "loss": 0.987,
458
  "step": 590
459
  },
460
  {
461
  "epoch": 4.615384615384615,
462
- "grad_norm": 1.740641474723816,
463
  "learning_rate": 1.3846153846153847e-05,
464
- "loss": 0.9778,
465
  "step": 600
466
  },
467
  {
468
  "epoch": 4.6923076923076925,
469
- "grad_norm": 3.247788429260254,
470
  "learning_rate": 1.3743589743589745e-05,
471
- "loss": 0.9978,
472
  "step": 610
473
  },
474
  {
475
  "epoch": 4.769230769230769,
476
- "grad_norm": 2.136213779449463,
477
  "learning_rate": 1.3641025641025643e-05,
478
- "loss": 1.0067,
479
  "step": 620
480
  },
481
  {
482
  "epoch": 4.846153846153846,
483
- "grad_norm": 1.8435287475585938,
484
  "learning_rate": 1.353846153846154e-05,
485
- "loss": 0.9691,
486
  "step": 630
487
  },
488
  {
489
  "epoch": 4.923076923076923,
490
- "grad_norm": 2.3936328887939453,
491
  "learning_rate": 1.3435897435897435e-05,
492
- "loss": 0.9692,
493
  "step": 640
494
  },
495
  {
496
  "epoch": 5.0,
497
- "grad_norm": 4.921462535858154,
498
  "learning_rate": 1.3333333333333333e-05,
499
- "loss": 1.0028,
500
  "step": 650
501
  },
502
  {
503
  "epoch": 5.0,
504
- "eval_accuracy": 0.7293233082706767,
505
- "eval_loss": 0.9568173289299011,
506
- "eval_runtime": 0.7715,
507
- "eval_samples_per_second": 172.394,
508
- "eval_steps_per_second": 22.035,
509
  "step": 650
510
  },
511
  {
512
  "epoch": 5.076923076923077,
513
- "grad_norm": 2.051286458969116,
514
  "learning_rate": 1.3230769230769231e-05,
515
- "loss": 0.9638,
516
  "step": 660
517
  },
518
  {
519
  "epoch": 5.153846153846154,
520
- "grad_norm": 2.7295546531677246,
521
  "learning_rate": 1.312820512820513e-05,
522
- "loss": 1.0056,
523
  "step": 670
524
  },
525
  {
526
  "epoch": 5.230769230769231,
527
- "grad_norm": 2.3378870487213135,
528
  "learning_rate": 1.3025641025641027e-05,
529
- "loss": 0.9916,
530
  "step": 680
531
  },
532
  {
533
  "epoch": 5.3076923076923075,
534
- "grad_norm": 2.754603862762451,
535
  "learning_rate": 1.2923076923076925e-05,
536
- "loss": 0.9869,
537
  "step": 690
538
  },
539
  {
540
  "epoch": 5.384615384615385,
541
- "grad_norm": 2.6367554664611816,
542
  "learning_rate": 1.2820512820512823e-05,
543
- "loss": 0.9743,
544
  "step": 700
545
  },
546
  {
547
  "epoch": 5.461538461538462,
548
- "grad_norm": 2.152855396270752,
549
  "learning_rate": 1.2717948717948718e-05,
550
- "loss": 0.9669,
551
  "step": 710
552
  },
553
  {
554
  "epoch": 5.538461538461538,
555
- "grad_norm": 1.970173954963684,
556
  "learning_rate": 1.2615384615384616e-05,
557
- "loss": 0.9496,
558
  "step": 720
559
  },
560
  {
561
  "epoch": 5.615384615384615,
562
- "grad_norm": 2.49542498588562,
563
  "learning_rate": 1.2512820512820514e-05,
564
- "loss": 0.9644,
565
  "step": 730
566
  },
567
  {
568
  "epoch": 5.6923076923076925,
569
- "grad_norm": 2.5210750102996826,
570
  "learning_rate": 1.2410256410256412e-05,
571
- "loss": 0.9453,
572
  "step": 740
573
  },
574
  {
575
  "epoch": 5.769230769230769,
576
- "grad_norm": 2.0949254035949707,
577
  "learning_rate": 1.230769230769231e-05,
578
- "loss": 0.95,
579
  "step": 750
580
  },
581
  {
582
  "epoch": 5.846153846153846,
583
- "grad_norm": 2.167081594467163,
584
  "learning_rate": 1.2205128205128208e-05,
585
- "loss": 0.9519,
586
  "step": 760
587
  },
588
  {
589
  "epoch": 5.923076923076923,
590
- "grad_norm": 3.6734519004821777,
591
  "learning_rate": 1.2102564102564102e-05,
592
- "loss": 0.9376,
593
  "step": 770
594
  },
595
  {
596
  "epoch": 6.0,
597
- "grad_norm": 4.938449859619141,
598
  "learning_rate": 1.2e-05,
599
- "loss": 0.9752,
600
  "step": 780
601
  },
602
  {
603
  "epoch": 6.0,
604
- "eval_accuracy": 0.7669172932330827,
605
- "eval_loss": 0.9278604984283447,
606
- "eval_runtime": 0.7739,
607
- "eval_samples_per_second": 171.864,
608
- "eval_steps_per_second": 21.968,
609
  "step": 780
610
  },
611
  {
612
  "epoch": 6.076923076923077,
613
- "grad_norm": 2.8171768188476562,
614
  "learning_rate": 1.1897435897435898e-05,
615
- "loss": 0.9423,
616
  "step": 790
617
  },
618
  {
619
  "epoch": 6.153846153846154,
620
- "grad_norm": 2.5053982734680176,
621
  "learning_rate": 1.1794871794871796e-05,
622
- "loss": 0.9382,
623
  "step": 800
624
  },
625
  {
626
  "epoch": 6.230769230769231,
627
- "grad_norm": 1.875543475151062,
628
  "learning_rate": 1.1692307692307694e-05,
629
- "loss": 0.9318,
630
  "step": 810
631
  },
632
  {
633
  "epoch": 6.3076923076923075,
634
- "grad_norm": 2.222604513168335,
635
  "learning_rate": 1.1589743589743592e-05,
636
- "loss": 0.9313,
637
  "step": 820
638
  },
639
  {
640
  "epoch": 6.384615384615385,
641
- "grad_norm": 2.5009353160858154,
642
  "learning_rate": 1.1487179487179487e-05,
643
- "loss": 0.9354,
644
  "step": 830
645
  },
646
  {
647
  "epoch": 6.461538461538462,
648
- "grad_norm": 3.2472548484802246,
649
  "learning_rate": 1.1384615384615385e-05,
650
- "loss": 0.9335,
651
  "step": 840
652
  },
653
  {
654
  "epoch": 6.538461538461538,
655
- "grad_norm": 2.944819927215576,
656
  "learning_rate": 1.1282051282051283e-05,
657
- "loss": 0.9403,
658
  "step": 850
659
  },
660
  {
661
  "epoch": 6.615384615384615,
662
- "grad_norm": 1.9217369556427002,
663
  "learning_rate": 1.117948717948718e-05,
664
- "loss": 0.9109,
665
  "step": 860
666
  },
667
  {
668
  "epoch": 6.6923076923076925,
669
- "grad_norm": 2.2056405544281006,
670
  "learning_rate": 1.1076923076923079e-05,
671
- "loss": 0.9578,
672
  "step": 870
673
  },
674
  {
675
  "epoch": 6.769230769230769,
676
- "grad_norm": 2.6266028881073,
677
  "learning_rate": 1.0974358974358977e-05,
678
- "loss": 0.9249,
679
  "step": 880
680
  },
681
  {
682
  "epoch": 6.846153846153846,
683
- "grad_norm": 2.48388409614563,
684
  "learning_rate": 1.0871794871794871e-05,
685
- "loss": 0.9198,
686
  "step": 890
687
  },
688
  {
689
  "epoch": 6.923076923076923,
690
- "grad_norm": 3.0685908794403076,
691
  "learning_rate": 1.076923076923077e-05,
692
- "loss": 0.9424,
693
  "step": 900
694
  },
695
  {
696
  "epoch": 7.0,
697
- "grad_norm": 4.163443088531494,
698
  "learning_rate": 1.0666666666666667e-05,
699
- "loss": 0.924,
700
  "step": 910
701
  },
702
  {
703
  "epoch": 7.0,
704
- "eval_accuracy": 0.8045112781954887,
705
- "eval_loss": 0.8851932287216187,
706
- "eval_runtime": 0.7436,
707
- "eval_samples_per_second": 178.849,
708
- "eval_steps_per_second": 22.86,
709
  "step": 910
710
  },
711
  {
712
  "epoch": 7.076923076923077,
713
- "grad_norm": 2.746605634689331,
714
  "learning_rate": 1.0564102564102565e-05,
715
- "loss": 0.9102,
716
  "step": 920
717
  },
718
  {
719
  "epoch": 7.153846153846154,
720
- "grad_norm": 3.5535519123077393,
721
  "learning_rate": 1.0461538461538463e-05,
722
- "loss": 0.9097,
723
  "step": 930
724
  },
725
  {
726
  "epoch": 7.230769230769231,
727
- "grad_norm": 3.44745135307312,
728
  "learning_rate": 1.0358974358974361e-05,
729
- "loss": 0.899,
730
  "step": 940
731
  },
732
  {
733
  "epoch": 7.3076923076923075,
734
- "grad_norm": 2.7519822120666504,
735
  "learning_rate": 1.0256410256410256e-05,
736
- "loss": 0.9072,
737
  "step": 950
738
  },
739
  {
740
  "epoch": 7.384615384615385,
741
- "grad_norm": 2.7004168033599854,
742
  "learning_rate": 1.0153846153846154e-05,
743
- "loss": 0.9231,
744
  "step": 960
745
  },
746
  {
747
  "epoch": 7.461538461538462,
748
- "grad_norm": 2.188138961791992,
749
  "learning_rate": 1.0051282051282052e-05,
750
- "loss": 0.8897,
751
  "step": 970
752
  },
753
  {
754
  "epoch": 7.538461538461538,
755
- "grad_norm": 2.8544998168945312,
756
  "learning_rate": 9.94871794871795e-06,
757
- "loss": 0.9233,
758
  "step": 980
759
  },
760
  {
761
  "epoch": 7.615384615384615,
762
- "grad_norm": 2.217107057571411,
763
  "learning_rate": 9.846153846153848e-06,
764
- "loss": 0.8854,
765
  "step": 990
766
  },
767
  {
768
  "epoch": 7.6923076923076925,
769
- "grad_norm": 2.083819627761841,
770
  "learning_rate": 9.743589743589744e-06,
771
- "loss": 0.8808,
772
  "step": 1000
773
  },
774
  {
775
  "epoch": 7.769230769230769,
776
- "grad_norm": 2.864044427871704,
777
  "learning_rate": 9.641025641025642e-06,
778
- "loss": 0.8818,
779
  "step": 1010
780
  },
781
  {
782
  "epoch": 7.846153846153846,
783
- "grad_norm": 2.2901501655578613,
784
  "learning_rate": 9.53846153846154e-06,
785
- "loss": 0.9066,
786
  "step": 1020
787
  },
788
  {
789
  "epoch": 7.923076923076923,
790
- "grad_norm": 2.5113565921783447,
791
  "learning_rate": 9.435897435897436e-06,
792
- "loss": 0.8577,
793
  "step": 1030
794
  },
795
  {
796
  "epoch": 8.0,
797
- "grad_norm": 4.092830181121826,
798
  "learning_rate": 9.333333333333334e-06,
799
- "loss": 0.9179,
800
  "step": 1040
801
  },
802
  {
803
  "epoch": 8.0,
804
  "eval_accuracy": 0.8120300751879699,
805
- "eval_loss": 0.8504595160484314,
806
- "eval_runtime": 0.7701,
807
- "eval_samples_per_second": 172.703,
808
- "eval_steps_per_second": 22.075,
809
  "step": 1040
810
  },
811
  {
812
  "epoch": 8.076923076923077,
813
- "grad_norm": 1.9615790843963623,
814
  "learning_rate": 9.230769230769232e-06,
815
- "loss": 0.8931,
816
  "step": 1050
817
  },
818
  {
819
  "epoch": 8.153846153846153,
820
- "grad_norm": 2.1320254802703857,
821
  "learning_rate": 9.128205128205129e-06,
822
- "loss": 0.8652,
823
  "step": 1060
824
  },
825
  {
826
  "epoch": 8.23076923076923,
827
- "grad_norm": 2.0207359790802,
828
  "learning_rate": 9.025641025641027e-06,
829
- "loss": 0.8992,
830
  "step": 1070
831
  },
832
  {
833
  "epoch": 8.307692307692308,
834
- "grad_norm": 1.9664427042007446,
835
  "learning_rate": 8.923076923076925e-06,
836
- "loss": 0.8552,
837
  "step": 1080
838
  },
839
  {
840
  "epoch": 8.384615384615385,
841
- "grad_norm": 2.321737766265869,
842
  "learning_rate": 8.820512820512821e-06,
843
- "loss": 0.8973,
844
  "step": 1090
845
  },
846
  {
847
  "epoch": 8.461538461538462,
848
- "grad_norm": 2.780322313308716,
849
  "learning_rate": 8.717948717948719e-06,
850
- "loss": 0.8695,
851
  "step": 1100
852
  },
853
  {
854
  "epoch": 8.538461538461538,
855
- "grad_norm": 1.664422631263733,
856
  "learning_rate": 8.615384615384617e-06,
857
- "loss": 0.8711,
858
  "step": 1110
859
  },
860
  {
861
  "epoch": 8.615384615384615,
862
- "grad_norm": 4.4292097091674805,
863
  "learning_rate": 8.512820512820513e-06,
864
- "loss": 0.8983,
865
  "step": 1120
866
  },
867
  {
868
  "epoch": 8.692307692307692,
869
- "grad_norm": 3.7464237213134766,
870
  "learning_rate": 8.410256410256411e-06,
871
- "loss": 0.8831,
872
  "step": 1130
873
  },
874
  {
875
  "epoch": 8.76923076923077,
876
- "grad_norm": 1.9225133657455444,
877
  "learning_rate": 8.307692307692309e-06,
878
- "loss": 0.8851,
879
  "step": 1140
880
  },
881
  {
882
  "epoch": 8.846153846153847,
883
- "grad_norm": 2.6193883419036865,
884
  "learning_rate": 8.205128205128205e-06,
885
- "loss": 0.8718,
886
  "step": 1150
887
  },
888
  {
889
  "epoch": 8.923076923076923,
890
- "grad_norm": 3.2045023441314697,
891
  "learning_rate": 8.102564102564103e-06,
892
- "loss": 0.8843,
893
  "step": 1160
894
  },
895
  {
896
  "epoch": 9.0,
897
- "grad_norm": 4.040080547332764,
898
  "learning_rate": 8.000000000000001e-06,
899
- "loss": 0.9,
900
  "step": 1170
901
  },
902
  {
903
  "epoch": 9.0,
904
  "eval_accuracy": 0.8045112781954887,
905
- "eval_loss": 0.8327888250350952,
906
- "eval_runtime": 0.764,
907
- "eval_samples_per_second": 174.095,
908
- "eval_steps_per_second": 22.253,
909
  "step": 1170
910
  },
911
  {
912
  "epoch": 9.076923076923077,
913
- "grad_norm": 1.9053655862808228,
914
  "learning_rate": 7.897435897435898e-06,
915
- "loss": 0.8961,
916
  "step": 1180
917
  },
918
  {
919
  "epoch": 9.153846153846153,
920
- "grad_norm": 2.056864023208618,
921
  "learning_rate": 7.794871794871796e-06,
922
- "loss": 0.8369,
923
  "step": 1190
924
  },
925
  {
926
  "epoch": 9.23076923076923,
927
- "grad_norm": 2.1695327758789062,
928
  "learning_rate": 7.692307692307694e-06,
929
- "loss": 0.8496,
930
  "step": 1200
931
  },
932
  {
933
  "epoch": 9.307692307692308,
934
- "grad_norm": 2.1930296421051025,
935
  "learning_rate": 7.58974358974359e-06,
936
- "loss": 0.8527,
937
  "step": 1210
938
  },
939
  {
940
  "epoch": 9.384615384615385,
941
- "grad_norm": 2.0710272789001465,
942
  "learning_rate": 7.487179487179488e-06,
943
- "loss": 0.867,
944
  "step": 1220
945
  },
946
  {
947
  "epoch": 9.461538461538462,
948
- "grad_norm": 3.325305938720703,
949
  "learning_rate": 7.384615384615386e-06,
950
- "loss": 0.8541,
951
  "step": 1230
952
  },
953
  {
954
  "epoch": 9.538461538461538,
955
- "grad_norm": 2.5524134635925293,
956
  "learning_rate": 7.282051282051282e-06,
957
- "loss": 0.8711,
958
  "step": 1240
959
  },
960
  {
961
  "epoch": 9.615384615384615,
962
- "grad_norm": 3.508930206298828,
963
  "learning_rate": 7.17948717948718e-06,
964
- "loss": 0.8263,
965
  "step": 1250
966
  },
967
  {
968
  "epoch": 9.692307692307692,
969
- "grad_norm": 2.657597303390503,
970
  "learning_rate": 7.076923076923078e-06,
971
- "loss": 0.8519,
972
  "step": 1260
973
  },
974
  {
975
  "epoch": 9.76923076923077,
976
- "grad_norm": 1.858414888381958,
977
  "learning_rate": 6.974358974358974e-06,
978
- "loss": 0.8306,
979
  "step": 1270
980
  },
981
  {
982
  "epoch": 9.846153846153847,
983
- "grad_norm": 2.1380136013031006,
984
  "learning_rate": 6.871794871794872e-06,
985
- "loss": 0.8115,
986
  "step": 1280
987
  },
988
  {
989
  "epoch": 9.923076923076923,
990
- "grad_norm": 2.1686654090881348,
991
  "learning_rate": 6.76923076923077e-06,
992
- "loss": 0.8366,
993
  "step": 1290
994
  },
995
  {
996
  "epoch": 10.0,
997
- "grad_norm": 4.619501113891602,
998
  "learning_rate": 6.666666666666667e-06,
999
- "loss": 0.8084,
1000
  "step": 1300
1001
  },
1002
  {
1003
  "epoch": 10.0,
1004
  "eval_accuracy": 0.8421052631578947,
1005
- "eval_loss": 0.807080864906311,
1006
- "eval_runtime": 0.7501,
1007
- "eval_samples_per_second": 177.313,
1008
- "eval_steps_per_second": 22.664,
1009
  "step": 1300
1010
  },
1011
  {
1012
  "epoch": 10.076923076923077,
1013
- "grad_norm": 2.3092093467712402,
1014
  "learning_rate": 6.564102564102565e-06,
1015
- "loss": 0.8366,
1016
  "step": 1310
1017
  },
1018
  {
1019
  "epoch": 10.153846153846153,
1020
- "grad_norm": 2.2793080806732178,
1021
  "learning_rate": 6.461538461538463e-06,
1022
- "loss": 0.89,
1023
  "step": 1320
1024
  },
1025
  {
1026
  "epoch": 10.23076923076923,
1027
- "grad_norm": 2.369584560394287,
1028
  "learning_rate": 6.358974358974359e-06,
1029
- "loss": 0.8718,
1030
  "step": 1330
1031
  },
1032
  {
1033
  "epoch": 10.307692307692308,
1034
- "grad_norm": 2.307018995285034,
1035
  "learning_rate": 6.256410256410257e-06,
1036
- "loss": 0.8299,
1037
  "step": 1340
1038
  },
1039
  {
1040
  "epoch": 10.384615384615385,
1041
- "grad_norm": 2.086519718170166,
1042
  "learning_rate": 6.153846153846155e-06,
1043
- "loss": 0.8096,
1044
  "step": 1350
1045
  },
1046
  {
1047
  "epoch": 10.461538461538462,
1048
- "grad_norm": 2.7159907817840576,
1049
  "learning_rate": 6.051282051282051e-06,
1050
- "loss": 0.8442,
1051
  "step": 1360
1052
  },
1053
  {
1054
  "epoch": 10.538461538461538,
1055
- "grad_norm": 2.477699041366577,
1056
  "learning_rate": 5.948717948717949e-06,
1057
- "loss": 0.8133,
1058
  "step": 1370
1059
  },
1060
  {
1061
  "epoch": 10.615384615384615,
1062
- "grad_norm": 4.113278388977051,
1063
  "learning_rate": 5.846153846153847e-06,
1064
- "loss": 0.8159,
1065
  "step": 1380
1066
  },
1067
  {
1068
  "epoch": 10.692307692307692,
1069
- "grad_norm": 2.264570951461792,
1070
  "learning_rate": 5.743589743589743e-06,
1071
- "loss": 0.8086,
1072
  "step": 1390
1073
  },
1074
  {
1075
  "epoch": 10.76923076923077,
1076
- "grad_norm": 3.312915802001953,
1077
  "learning_rate": 5.641025641025641e-06,
1078
- "loss": 0.8668,
1079
  "step": 1400
1080
  },
1081
  {
1082
  "epoch": 10.846153846153847,
1083
- "grad_norm": 2.189178228378296,
1084
  "learning_rate": 5.538461538461539e-06,
1085
- "loss": 0.8298,
1086
  "step": 1410
1087
  },
1088
  {
1089
  "epoch": 10.923076923076923,
1090
- "grad_norm": 1.8542472124099731,
1091
  "learning_rate": 5.435897435897436e-06,
1092
- "loss": 0.8329,
1093
  "step": 1420
1094
  },
1095
  {
1096
  "epoch": 11.0,
1097
- "grad_norm": 4.028012752532959,
1098
  "learning_rate": 5.333333333333334e-06,
1099
- "loss": 0.8306,
1100
  "step": 1430
1101
  },
1102
  {
1103
  "epoch": 11.0,
1104
  "eval_accuracy": 0.8345864661654135,
1105
- "eval_loss": 0.7760081887245178,
1106
- "eval_runtime": 0.7954,
1107
- "eval_samples_per_second": 167.205,
1108
- "eval_steps_per_second": 21.372,
1109
  "step": 1430
1110
  },
1111
  {
1112
  "epoch": 11.076923076923077,
1113
- "grad_norm": 1.9505723714828491,
1114
  "learning_rate": 5.230769230769232e-06,
1115
- "loss": 0.8461,
1116
  "step": 1440
1117
  },
1118
  {
1119
  "epoch": 11.153846153846153,
1120
- "grad_norm": 3.543419122695923,
1121
  "learning_rate": 5.128205128205128e-06,
1122
- "loss": 0.8287,
1123
  "step": 1450
1124
  },
1125
  {
1126
  "epoch": 11.23076923076923,
1127
- "grad_norm": 2.6018764972686768,
1128
  "learning_rate": 5.025641025641026e-06,
1129
- "loss": 0.7996,
1130
  "step": 1460
1131
  },
1132
  {
1133
  "epoch": 11.307692307692308,
1134
- "grad_norm": 2.7204878330230713,
1135
  "learning_rate": 4.923076923076924e-06,
1136
- "loss": 0.8218,
1137
  "step": 1470
1138
  },
1139
  {
1140
  "epoch": 11.384615384615385,
1141
- "grad_norm": 2.6924397945404053,
1142
  "learning_rate": 4.820512820512821e-06,
1143
- "loss": 0.8152,
1144
  "step": 1480
1145
  },
1146
  {
1147
  "epoch": 11.461538461538462,
1148
- "grad_norm": 2.1294894218444824,
1149
  "learning_rate": 4.717948717948718e-06,
1150
- "loss": 0.8452,
1151
  "step": 1490
1152
  },
1153
  {
1154
  "epoch": 11.538461538461538,
1155
- "grad_norm": 2.45246958732605,
1156
  "learning_rate": 4.615384615384616e-06,
1157
- "loss": 0.8316,
1158
  "step": 1500
1159
  },
1160
  {
1161
  "epoch": 11.615384615384615,
1162
- "grad_norm": 3.2710139751434326,
1163
  "learning_rate": 4.512820512820513e-06,
1164
- "loss": 0.8272,
1165
  "step": 1510
1166
  },
1167
  {
1168
  "epoch": 11.692307692307692,
1169
- "grad_norm": 3.3108620643615723,
1170
  "learning_rate": 4.4102564102564104e-06,
1171
- "loss": 0.8539,
1172
  "step": 1520
1173
  },
1174
  {
1175
  "epoch": 11.76923076923077,
1176
- "grad_norm": 2.079916000366211,
1177
  "learning_rate": 4.307692307692308e-06,
1178
- "loss": 0.8067,
1179
  "step": 1530
1180
  },
1181
  {
1182
  "epoch": 11.846153846153847,
1183
- "grad_norm": 2.2175967693328857,
1184
  "learning_rate": 4.2051282051282055e-06,
1185
- "loss": 0.7943,
1186
  "step": 1540
1187
  },
1188
  {
1189
  "epoch": 11.923076923076923,
1190
- "grad_norm": 2.064258098602295,
1191
  "learning_rate": 4.102564102564103e-06,
1192
- "loss": 0.8268,
1193
  "step": 1550
1194
  },
1195
  {
1196
  "epoch": 12.0,
1197
- "grad_norm": 3.4313971996307373,
1198
  "learning_rate": 4.000000000000001e-06,
1199
- "loss": 0.8031,
1200
  "step": 1560
1201
  },
1202
  {
1203
  "epoch": 12.0,
1204
  "eval_accuracy": 0.8345864661654135,
1205
- "eval_loss": 0.7563135623931885,
1206
- "eval_runtime": 0.7817,
1207
- "eval_samples_per_second": 170.134,
1208
- "eval_steps_per_second": 21.746,
1209
  "step": 1560
1210
  },
1211
  {
1212
  "epoch": 12.076923076923077,
1213
- "grad_norm": 2.1071786880493164,
1214
  "learning_rate": 3.897435897435898e-06,
1215
- "loss": 0.7877,
1216
  "step": 1570
1217
  },
1218
  {
1219
  "epoch": 12.153846153846153,
1220
- "grad_norm": 2.6646640300750732,
1221
  "learning_rate": 3.794871794871795e-06,
1222
- "loss": 0.8223,
1223
  "step": 1580
1224
  },
1225
  {
1226
  "epoch": 12.23076923076923,
1227
- "grad_norm": 1.7268288135528564,
1228
  "learning_rate": 3.692307692307693e-06,
1229
- "loss": 0.8204,
1230
  "step": 1590
1231
  },
1232
  {
1233
  "epoch": 12.307692307692308,
1234
- "grad_norm": 2.982988119125366,
1235
  "learning_rate": 3.58974358974359e-06,
1236
- "loss": 0.8336,
1237
  "step": 1600
1238
  },
1239
  {
1240
  "epoch": 12.384615384615385,
1241
- "grad_norm": 2.559271812438965,
1242
  "learning_rate": 3.487179487179487e-06,
1243
- "loss": 0.8064,
1244
  "step": 1610
1245
  },
1246
  {
1247
  "epoch": 12.461538461538462,
1248
- "grad_norm": 2.528869390487671,
1249
  "learning_rate": 3.384615384615385e-06,
1250
- "loss": 0.793,
1251
  "step": 1620
1252
  },
1253
  {
1254
  "epoch": 12.538461538461538,
1255
- "grad_norm": 2.1543517112731934,
1256
  "learning_rate": 3.2820512820512823e-06,
1257
- "loss": 0.8031,
1258
  "step": 1630
1259
  },
1260
  {
1261
  "epoch": 12.615384615384615,
1262
- "grad_norm": 1.7778912782669067,
1263
  "learning_rate": 3.1794871794871795e-06,
1264
- "loss": 0.8086,
1265
  "step": 1640
1266
  },
1267
  {
1268
  "epoch": 12.692307692307692,
1269
- "grad_norm": 3.2962191104888916,
1270
  "learning_rate": 3.0769230769230774e-06,
1271
- "loss": 0.7617,
1272
  "step": 1650
1273
  },
1274
  {
1275
  "epoch": 12.76923076923077,
1276
- "grad_norm": 2.336732864379883,
1277
  "learning_rate": 2.9743589743589746e-06,
1278
- "loss": 0.8541,
1279
  "step": 1660
1280
  },
1281
  {
1282
  "epoch": 12.846153846153847,
1283
- "grad_norm": 1.9721437692642212,
1284
  "learning_rate": 2.8717948717948717e-06,
1285
- "loss": 0.8249,
1286
  "step": 1670
1287
  },
1288
  {
1289
  "epoch": 12.923076923076923,
1290
- "grad_norm": 2.0368008613586426,
1291
  "learning_rate": 2.7692307692307697e-06,
1292
- "loss": 0.795,
1293
  "step": 1680
1294
  },
1295
  {
1296
  "epoch": 13.0,
1297
- "grad_norm": 3.8102405071258545,
1298
  "learning_rate": 2.666666666666667e-06,
1299
- "loss": 0.8138,
1300
  "step": 1690
1301
  },
1302
  {
1303
  "epoch": 13.0,
1304
  "eval_accuracy": 0.8421052631578947,
1305
- "eval_loss": 0.753366231918335,
1306
- "eval_runtime": 0.7711,
1307
- "eval_samples_per_second": 172.483,
1308
- "eval_steps_per_second": 22.047,
1309
  "step": 1690
1310
  },
1311
  {
1312
  "epoch": 13.076923076923077,
1313
- "grad_norm": 3.0844614505767822,
1314
  "learning_rate": 2.564102564102564e-06,
1315
- "loss": 0.7901,
1316
  "step": 1700
1317
  },
1318
  {
1319
  "epoch": 13.153846153846153,
1320
- "grad_norm": 2.517000913619995,
1321
  "learning_rate": 2.461538461538462e-06,
1322
- "loss": 0.8205,
1323
  "step": 1710
1324
  },
1325
  {
1326
  "epoch": 13.23076923076923,
1327
- "grad_norm": 3.047574520111084,
1328
  "learning_rate": 2.358974358974359e-06,
1329
- "loss": 0.8113,
1330
  "step": 1720
1331
  },
1332
  {
1333
  "epoch": 13.307692307692308,
1334
- "grad_norm": 1.9097496271133423,
1335
  "learning_rate": 2.2564102564102566e-06,
1336
- "loss": 0.817,
1337
  "step": 1730
1338
  },
1339
  {
1340
  "epoch": 13.384615384615385,
1341
- "grad_norm": 2.553558826446533,
1342
  "learning_rate": 2.153846153846154e-06,
1343
- "loss": 0.7794,
1344
  "step": 1740
1345
  },
1346
  {
1347
  "epoch": 13.461538461538462,
1348
- "grad_norm": 3.915072202682495,
1349
  "learning_rate": 2.0512820512820513e-06,
1350
- "loss": 0.8238,
1351
  "step": 1750
1352
  },
1353
  {
1354
  "epoch": 13.538461538461538,
1355
- "grad_norm": 2.7563774585723877,
1356
  "learning_rate": 1.948717948717949e-06,
1357
- "loss": 0.8464,
1358
  "step": 1760
1359
  },
1360
  {
1361
  "epoch": 13.615384615384615,
1362
- "grad_norm": 1.9687329530715942,
1363
  "learning_rate": 1.8461538461538465e-06,
1364
- "loss": 0.7998,
1365
  "step": 1770
1366
  },
1367
  {
1368
  "epoch": 13.692307692307692,
1369
- "grad_norm": 1.9707388877868652,
1370
  "learning_rate": 1.7435897435897436e-06,
1371
- "loss": 0.7827,
1372
  "step": 1780
1373
  },
1374
  {
1375
  "epoch": 13.76923076923077,
1376
- "grad_norm": 1.8745958805084229,
1377
  "learning_rate": 1.6410256410256412e-06,
1378
- "loss": 0.8211,
1379
  "step": 1790
1380
  },
1381
  {
1382
  "epoch": 13.846153846153847,
1383
- "grad_norm": 3.193314552307129,
1384
  "learning_rate": 1.5384615384615387e-06,
1385
- "loss": 0.7874,
1386
  "step": 1800
1387
  },
1388
  {
1389
  "epoch": 13.923076923076923,
1390
- "grad_norm": 2.4726743698120117,
1391
  "learning_rate": 1.4358974358974359e-06,
1392
- "loss": 0.7323,
1393
  "step": 1810
1394
  },
1395
  {
1396
  "epoch": 14.0,
1397
- "grad_norm": 4.2939653396606445,
1398
  "learning_rate": 1.3333333333333334e-06,
1399
- "loss": 0.8178,
1400
  "step": 1820
1401
  },
1402
  {
1403
  "epoch": 14.0,
1404
  "eval_accuracy": 0.8270676691729323,
1405
- "eval_loss": 0.7507623434066772,
1406
- "eval_runtime": 0.7401,
1407
- "eval_samples_per_second": 179.698,
1408
- "eval_steps_per_second": 22.969,
1409
  "step": 1820
1410
  },
1411
  {
1412
  "epoch": 14.076923076923077,
1413
- "grad_norm": 2.51151967048645,
1414
  "learning_rate": 1.230769230769231e-06,
1415
- "loss": 0.8201,
1416
  "step": 1830
1417
  },
1418
  {
1419
  "epoch": 14.153846153846153,
1420
- "grad_norm": 2.035071849822998,
1421
  "learning_rate": 1.1282051282051283e-06,
1422
- "loss": 0.7885,
1423
  "step": 1840
1424
  },
1425
  {
1426
  "epoch": 14.23076923076923,
1427
- "grad_norm": 2.924006938934326,
1428
  "learning_rate": 1.0256410256410257e-06,
1429
- "loss": 0.7802,
1430
  "step": 1850
1431
  },
1432
  {
1433
  "epoch": 14.307692307692308,
1434
- "grad_norm": 3.3788881301879883,
1435
  "learning_rate": 9.230769230769232e-07,
1436
- "loss": 0.7717,
1437
  "step": 1860
1438
  },
1439
  {
1440
  "epoch": 14.384615384615385,
1441
- "grad_norm": 2.6425819396972656,
1442
  "learning_rate": 8.205128205128206e-07,
1443
- "loss": 0.7871,
1444
  "step": 1870
1445
  },
1446
  {
1447
  "epoch": 14.461538461538462,
1448
- "grad_norm": 2.781729221343994,
1449
  "learning_rate": 7.179487179487179e-07,
1450
- "loss": 0.7859,
1451
  "step": 1880
1452
  },
1453
  {
1454
  "epoch": 14.538461538461538,
1455
- "grad_norm": 3.0325913429260254,
1456
  "learning_rate": 6.153846153846155e-07,
1457
- "loss": 0.7783,
1458
  "step": 1890
1459
  },
1460
  {
1461
  "epoch": 14.615384615384615,
1462
- "grad_norm": 2.7311503887176514,
1463
  "learning_rate": 5.128205128205128e-07,
1464
- "loss": 0.8273,
1465
  "step": 1900
1466
  },
1467
  {
1468
  "epoch": 14.692307692307692,
1469
- "grad_norm": 2.3128809928894043,
1470
  "learning_rate": 4.102564102564103e-07,
1471
- "loss": 0.7977,
1472
  "step": 1910
1473
  },
1474
  {
1475
  "epoch": 14.76923076923077,
1476
- "grad_norm": 3.4510748386383057,
1477
  "learning_rate": 3.0769230769230774e-07,
1478
- "loss": 0.8029,
1479
  "step": 1920
1480
  },
1481
  {
1482
  "epoch": 14.846153846153847,
1483
- "grad_norm": 3.1088345050811768,
1484
  "learning_rate": 2.0512820512820514e-07,
1485
- "loss": 0.8509,
1486
  "step": 1930
1487
  },
1488
  {
1489
  "epoch": 14.923076923076923,
1490
- "grad_norm": 2.3365509510040283,
1491
  "learning_rate": 1.0256410256410257e-07,
1492
- "loss": 0.7569,
1493
  "step": 1940
1494
  },
1495
  {
1496
  "epoch": 15.0,
1497
- "grad_norm": 4.560791015625,
1498
  "learning_rate": 0.0,
1499
- "loss": 0.7901,
1500
  "step": 1950
1501
  },
1502
  {
1503
  "epoch": 15.0,
1504
- "eval_accuracy": 0.8195488721804511,
1505
- "eval_loss": 0.738013505935669,
1506
- "eval_runtime": 0.796,
1507
- "eval_samples_per_second": 167.085,
1508
- "eval_steps_per_second": 21.357,
1509
  "step": 1950
1510
  },
1511
  {
1512
  "epoch": 15.0,
1513
  "step": 1950,
1514
  "total_flos": 1.5658365504595968e+17,
1515
- "train_loss": 0.0,
1516
- "train_runtime": 1.3792,
1517
- "train_samples_per_second": 11245.899,
1518
- "train_steps_per_second": 1413.894
1519
  }
1520
  ],
1521
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.7412300109863281,
3
  "best_model_checkpoint": "./beans_outputs/checkpoint-1950",
4
  "epoch": 15.0,
5
  "eval_steps": 500,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.07692307692307693,
13
+ "grad_norm": 2.136049747467041,
14
  "learning_rate": 1.98974358974359e-05,
15
  "loss": 1.1239,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.15384615384615385,
20
+ "grad_norm": 1.8187670707702637,
21
  "learning_rate": 1.9794871794871798e-05,
22
  "loss": 1.1221,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.23076923076923078,
27
+ "grad_norm": 2.0219993591308594,
28
  "learning_rate": 1.9692307692307696e-05,
29
  "loss": 1.1164,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.3076923076923077,
34
+ "grad_norm": 2.4619803428649902,
35
  "learning_rate": 1.958974358974359e-05,
36
+ "loss": 1.1044,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.38461538461538464,
41
+ "grad_norm": 1.6733014583587646,
42
  "learning_rate": 1.9487179487179488e-05,
43
+ "loss": 1.1082,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.46153846153846156,
48
+ "grad_norm": 1.4969494342803955,
49
  "learning_rate": 1.9384615384615386e-05,
50
+ "loss": 1.1043,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.5384615384615384,
55
+ "grad_norm": 2.6347556114196777,
56
  "learning_rate": 1.9282051282051284e-05,
57
+ "loss": 1.1028,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.6153846153846154,
62
+ "grad_norm": 2.5843420028686523,
63
  "learning_rate": 1.9179487179487182e-05,
64
  "loss": 1.0908,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.6923076923076923,
69
+ "grad_norm": 2.0522916316986084,
70
  "learning_rate": 1.907692307692308e-05,
71
+ "loss": 1.094,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.7692307692307693,
76
+ "grad_norm": 2.4885082244873047,
77
  "learning_rate": 1.8974358974358975e-05,
78
+ "loss": 1.0912,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.8461538461538461,
83
+ "grad_norm": 1.7014166116714478,
84
  "learning_rate": 1.8871794871794873e-05,
85
+ "loss": 1.0949,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.9230769230769231,
90
+ "grad_norm": 2.294283866882324,
91
  "learning_rate": 1.876923076923077e-05,
92
+ "loss": 1.0992,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 1.0,
97
+ "grad_norm": 4.129885673522949,
98
  "learning_rate": 1.866666666666667e-05,
99
+ "loss": 1.0864,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 1.0,
104
  "eval_accuracy": 0.42857142857142855,
105
+ "eval_loss": 1.0877832174301147,
106
+ "eval_runtime": 0.7833,
107
+ "eval_samples_per_second": 169.792,
108
+ "eval_steps_per_second": 21.703,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 1.0769230769230769,
113
+ "grad_norm": 2.331717014312744,
114
  "learning_rate": 1.8564102564102567e-05,
115
+ "loss": 1.0774,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 1.1538461538461537,
120
+ "grad_norm": 2.5262138843536377,
121
  "learning_rate": 1.8461538461538465e-05,
122
+ "loss": 1.0719,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.2307692307692308,
127
+ "grad_norm": 1.5971320867538452,
128
  "learning_rate": 1.835897435897436e-05,
129
+ "loss": 1.0781,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 1.3076923076923077,
134
+ "grad_norm": 2.383288860321045,
135
  "learning_rate": 1.8256410256410257e-05,
136
+ "loss": 1.0929,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 1.3846153846153846,
141
+ "grad_norm": 2.169706106185913,
142
  "learning_rate": 1.8153846153846155e-05,
143
+ "loss": 1.0805,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 1.4615384615384617,
148
+ "grad_norm": 2.1174418926239014,
149
  "learning_rate": 1.8051282051282053e-05,
150
+ "loss": 1.08,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 1.5384615384615383,
155
+ "grad_norm": 1.7236179113388062,
156
  "learning_rate": 1.794871794871795e-05,
157
+ "loss": 1.0766,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 1.6153846153846154,
162
+ "grad_norm": 1.7772722244262695,
163
  "learning_rate": 1.784615384615385e-05,
164
+ "loss": 1.0676,
165
  "step": 210
166
  },
167
  {
168
  "epoch": 1.6923076923076923,
169
+ "grad_norm": 3.53834867477417,
170
  "learning_rate": 1.7743589743589744e-05,
171
+ "loss": 1.0695,
172
  "step": 220
173
  },
174
  {
175
  "epoch": 1.7692307692307692,
176
+ "grad_norm": 2.0417070388793945,
177
  "learning_rate": 1.7641025641025642e-05,
178
+ "loss": 1.0706,
179
  "step": 230
180
  },
181
  {
182
  "epoch": 1.8461538461538463,
183
+ "grad_norm": 1.9734611511230469,
184
  "learning_rate": 1.753846153846154e-05,
185
+ "loss": 1.0863,
186
  "step": 240
187
  },
188
  {
189
  "epoch": 1.9230769230769231,
190
+ "grad_norm": 1.9997600317001343,
191
  "learning_rate": 1.7435897435897438e-05,
192
+ "loss": 1.068,
193
  "step": 250
194
  },
195
  {
196
  "epoch": 2.0,
197
+ "grad_norm": 3.0024373531341553,
198
  "learning_rate": 1.7333333333333336e-05,
199
+ "loss": 1.0629,
200
  "step": 260
201
  },
202
  {
203
  "epoch": 2.0,
204
+ "eval_accuracy": 0.5488721804511278,
205
+ "eval_loss": 1.0593525171279907,
206
+ "eval_runtime": 0.7442,
207
+ "eval_samples_per_second": 178.706,
208
+ "eval_steps_per_second": 22.842,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 2.076923076923077,
213
+ "grad_norm": 1.977807641029358,
214
  "learning_rate": 1.7230769230769234e-05,
215
+ "loss": 1.0711,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 2.1538461538461537,
220
+ "grad_norm": 2.3906102180480957,
221
  "learning_rate": 1.7128205128205128e-05,
222
+ "loss": 1.0597,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 2.230769230769231,
227
+ "grad_norm": 2.3670897483825684,
228
  "learning_rate": 1.7025641025641026e-05,
229
+ "loss": 1.0576,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 2.3076923076923075,
234
+ "grad_norm": 3.026155948638916,
235
  "learning_rate": 1.6923076923076924e-05,
236
+ "loss": 1.0434,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 2.3846153846153846,
241
+ "grad_norm": 1.9082350730895996,
242
  "learning_rate": 1.6820512820512822e-05,
243
+ "loss": 1.0566,
244
  "step": 310
245
  },
246
  {
247
  "epoch": 2.4615384615384617,
248
+ "grad_norm": 1.9187153577804565,
249
  "learning_rate": 1.671794871794872e-05,
250
+ "loss": 1.0476,
251
  "step": 320
252
  },
253
  {
254
  "epoch": 2.5384615384615383,
255
+ "grad_norm": 1.4435549974441528,
256
  "learning_rate": 1.6615384615384618e-05,
257
+ "loss": 1.032,
258
  "step": 330
259
  },
260
  {
261
  "epoch": 2.6153846153846154,
262
+ "grad_norm": 2.1457245349884033,
263
  "learning_rate": 1.6512820512820513e-05,
264
+ "loss": 1.0475,
265
  "step": 340
266
  },
267
  {
268
  "epoch": 2.6923076923076925,
269
+ "grad_norm": 1.9391709566116333,
270
  "learning_rate": 1.641025641025641e-05,
271
+ "loss": 1.0486,
272
  "step": 350
273
  },
274
  {
275
  "epoch": 2.769230769230769,
276
+ "grad_norm": 1.8148127794265747,
277
  "learning_rate": 1.630769230769231e-05,
278
  "loss": 1.0407,
279
  "step": 360
280
  },
281
  {
282
  "epoch": 2.8461538461538463,
283
+ "grad_norm": 2.444157123565674,
284
  "learning_rate": 1.6205128205128207e-05,
285
+ "loss": 1.0356,
286
  "step": 370
287
  },
288
  {
289
  "epoch": 2.9230769230769234,
290
+ "grad_norm": 1.9061695337295532,
291
  "learning_rate": 1.6102564102564105e-05,
292
+ "loss": 1.0239,
293
  "step": 380
294
  },
295
  {
296
  "epoch": 3.0,
297
+ "grad_norm": 4.859686851501465,
298
  "learning_rate": 1.6000000000000003e-05,
299
+ "loss": 1.0434,
300
  "step": 390
301
  },
302
  {
303
  "epoch": 3.0,
304
+ "eval_accuracy": 0.6766917293233082,
305
+ "eval_loss": 1.0230107307434082,
306
+ "eval_runtime": 0.7471,
307
+ "eval_samples_per_second": 178.027,
308
+ "eval_steps_per_second": 22.755,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 3.076923076923077,
313
+ "grad_norm": 2.2021689414978027,
314
  "learning_rate": 1.5897435897435897e-05,
315
+ "loss": 1.0424,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 3.1538461538461537,
320
+ "grad_norm": 1.8670283555984497,
321
  "learning_rate": 1.5794871794871795e-05,
322
+ "loss": 1.0299,
323
  "step": 410
324
  },
325
  {
326
  "epoch": 3.230769230769231,
327
+ "grad_norm": 2.193986654281616,
328
  "learning_rate": 1.5692307692307693e-05,
329
+ "loss": 1.0369,
330
  "step": 420
331
  },
332
  {
333
  "epoch": 3.3076923076923075,
334
+ "grad_norm": 2.26470685005188,
335
  "learning_rate": 1.558974358974359e-05,
336
  "loss": 1.0159,
337
  "step": 430
338
  },
339
  {
340
  "epoch": 3.3846153846153846,
341
+ "grad_norm": 2.18507981300354,
342
  "learning_rate": 1.548717948717949e-05,
343
+ "loss": 1.0282,
344
  "step": 440
345
  },
346
  {
347
  "epoch": 3.4615384615384617,
348
+ "grad_norm": 1.8047341108322144,
349
  "learning_rate": 1.5384615384615387e-05,
350
+ "loss": 1.0381,
351
  "step": 450
352
  },
353
  {
354
  "epoch": 3.5384615384615383,
355
+ "grad_norm": 2.6463704109191895,
356
  "learning_rate": 1.5282051282051282e-05,
357
  "loss": 1.0322,
358
  "step": 460
359
  },
360
  {
361
  "epoch": 3.6153846153846154,
362
+ "grad_norm": 1.6456215381622314,
363
  "learning_rate": 1.517948717948718e-05,
364
+ "loss": 1.0049,
365
  "step": 470
366
  },
367
  {
368
  "epoch": 3.6923076923076925,
369
+ "grad_norm": 2.774256706237793,
370
  "learning_rate": 1.5076923076923078e-05,
371
+ "loss": 1.0091,
372
  "step": 480
373
  },
374
  {
375
  "epoch": 3.769230769230769,
376
+ "grad_norm": 1.572251319885254,
377
  "learning_rate": 1.4974358974358976e-05,
378
+ "loss": 0.998,
379
  "step": 490
380
  },
381
  {
382
  "epoch": 3.8461538461538463,
383
+ "grad_norm": 1.5640805959701538,
384
  "learning_rate": 1.4871794871794874e-05,
385
+ "loss": 1.0222,
386
  "step": 500
387
  },
388
  {
389
  "epoch": 3.9230769230769234,
390
+ "grad_norm": 1.9231537580490112,
391
  "learning_rate": 1.4769230769230772e-05,
392
+ "loss": 0.9979,
393
  "step": 510
394
  },
395
  {
396
  "epoch": 4.0,
397
+ "grad_norm": 5.481942176818848,
398
  "learning_rate": 1.4666666666666666e-05,
399
+ "loss": 1.0214,
400
  "step": 520
401
  },
402
  {
403
  "epoch": 4.0,
404
+ "eval_accuracy": 0.6766917293233082,
405
+ "eval_loss": 0.9964542388916016,
406
+ "eval_runtime": 0.7616,
407
+ "eval_samples_per_second": 174.638,
408
+ "eval_steps_per_second": 22.322,
409
  "step": 520
410
  },
411
  {
412
  "epoch": 4.076923076923077,
413
+ "grad_norm": 2.7514402866363525,
414
  "learning_rate": 1.4564102564102564e-05,
415
+ "loss": 1.0128,
416
  "step": 530
417
  },
418
  {
419
  "epoch": 4.153846153846154,
420
+ "grad_norm": 1.8411396741867065,
421
  "learning_rate": 1.4461538461538462e-05,
422
+ "loss": 1.0145,
423
  "step": 540
424
  },
425
  {
426
  "epoch": 4.230769230769231,
427
+ "grad_norm": 2.670154571533203,
428
  "learning_rate": 1.435897435897436e-05,
429
+ "loss": 1.0227,
430
  "step": 550
431
  },
432
  {
433
  "epoch": 4.3076923076923075,
434
+ "grad_norm": 2.1951498985290527,
435
  "learning_rate": 1.4256410256410258e-05,
436
+ "loss": 1.0321,
437
  "step": 560
438
  },
439
  {
440
  "epoch": 4.384615384615385,
441
+ "grad_norm": 1.9692825078964233,
442
  "learning_rate": 1.4153846153846156e-05,
443
+ "loss": 0.9829,
444
  "step": 570
445
  },
446
  {
447
  "epoch": 4.461538461538462,
448
+ "grad_norm": 2.611340284347534,
449
  "learning_rate": 1.405128205128205e-05,
450
+ "loss": 0.9918,
451
  "step": 580
452
  },
453
  {
454
  "epoch": 4.538461538461538,
455
+ "grad_norm": 2.4288899898529053,
456
  "learning_rate": 1.3948717948717949e-05,
457
+ "loss": 0.9879,
458
  "step": 590
459
  },
460
  {
461
  "epoch": 4.615384615384615,
462
+ "grad_norm": 1.7537823915481567,
463
  "learning_rate": 1.3846153846153847e-05,
464
+ "loss": 0.9793,
465
  "step": 600
466
  },
467
  {
468
  "epoch": 4.6923076923076925,
469
+ "grad_norm": 3.1588003635406494,
470
  "learning_rate": 1.3743589743589745e-05,
471
+ "loss": 1.0002,
472
  "step": 610
473
  },
474
  {
475
  "epoch": 4.769230769230769,
476
+ "grad_norm": 2.2472622394561768,
477
  "learning_rate": 1.3641025641025643e-05,
478
+ "loss": 1.0094,
479
  "step": 620
480
  },
481
  {
482
  "epoch": 4.846153846153846,
483
+ "grad_norm": 1.7958937883377075,
484
  "learning_rate": 1.353846153846154e-05,
485
+ "loss": 0.9703,
486
  "step": 630
487
  },
488
  {
489
  "epoch": 4.923076923076923,
490
+ "grad_norm": 2.415766477584839,
491
  "learning_rate": 1.3435897435897435e-05,
492
+ "loss": 0.9703,
493
  "step": 640
494
  },
495
  {
496
  "epoch": 5.0,
497
+ "grad_norm": 4.948933124542236,
498
  "learning_rate": 1.3333333333333333e-05,
499
+ "loss": 1.0026,
500
  "step": 650
501
  },
502
  {
503
  "epoch": 5.0,
504
+ "eval_accuracy": 0.7443609022556391,
505
+ "eval_loss": 0.9569369554519653,
506
+ "eval_runtime": 0.7647,
507
+ "eval_samples_per_second": 173.928,
508
+ "eval_steps_per_second": 22.231,
509
  "step": 650
510
  },
511
  {
512
  "epoch": 5.076923076923077,
513
+ "grad_norm": 2.1397032737731934,
514
  "learning_rate": 1.3230769230769231e-05,
515
+ "loss": 0.9645,
516
  "step": 660
517
  },
518
  {
519
  "epoch": 5.153846153846154,
520
+ "grad_norm": 2.7277321815490723,
521
  "learning_rate": 1.312820512820513e-05,
522
+ "loss": 1.0063,
523
  "step": 670
524
  },
525
  {
526
  "epoch": 5.230769230769231,
527
+ "grad_norm": 2.391350030899048,
528
  "learning_rate": 1.3025641025641027e-05,
529
+ "loss": 0.9918,
530
  "step": 680
531
  },
532
  {
533
  "epoch": 5.3076923076923075,
534
+ "grad_norm": 2.751174211502075,
535
  "learning_rate": 1.2923076923076925e-05,
536
+ "loss": 0.9849,
537
  "step": 690
538
  },
539
  {
540
  "epoch": 5.384615384615385,
541
+ "grad_norm": 2.77424693107605,
542
  "learning_rate": 1.2820512820512823e-05,
543
+ "loss": 0.9745,
544
  "step": 700
545
  },
546
  {
547
  "epoch": 5.461538461538462,
548
+ "grad_norm": 1.9156702756881714,
549
  "learning_rate": 1.2717948717948718e-05,
550
+ "loss": 0.9684,
551
  "step": 710
552
  },
553
  {
554
  "epoch": 5.538461538461538,
555
+ "grad_norm": 1.9521454572677612,
556
  "learning_rate": 1.2615384615384616e-05,
557
+ "loss": 0.9503,
558
  "step": 720
559
  },
560
  {
561
  "epoch": 5.615384615384615,
562
+ "grad_norm": 2.468419313430786,
563
  "learning_rate": 1.2512820512820514e-05,
564
+ "loss": 0.9641,
565
  "step": 730
566
  },
567
  {
568
  "epoch": 5.6923076923076925,
569
+ "grad_norm": 2.520923614501953,
570
  "learning_rate": 1.2410256410256412e-05,
571
+ "loss": 0.9471,
572
  "step": 740
573
  },
574
  {
575
  "epoch": 5.769230769230769,
576
+ "grad_norm": 2.1003758907318115,
577
  "learning_rate": 1.230769230769231e-05,
578
+ "loss": 0.9513,
579
  "step": 750
580
  },
581
  {
582
  "epoch": 5.846153846153846,
583
+ "grad_norm": 2.192279100418091,
584
  "learning_rate": 1.2205128205128208e-05,
585
+ "loss": 0.9527,
586
  "step": 760
587
  },
588
  {
589
  "epoch": 5.923076923076923,
590
+ "grad_norm": 3.8428618907928467,
591
  "learning_rate": 1.2102564102564102e-05,
592
+ "loss": 0.938,
593
  "step": 770
594
  },
595
  {
596
  "epoch": 6.0,
597
+ "grad_norm": 4.9151530265808105,
598
  "learning_rate": 1.2e-05,
599
+ "loss": 0.9753,
600
  "step": 780
601
  },
602
  {
603
  "epoch": 6.0,
604
+ "eval_accuracy": 0.7819548872180451,
605
+ "eval_loss": 0.9288201332092285,
606
+ "eval_runtime": 0.7499,
607
+ "eval_samples_per_second": 177.349,
608
+ "eval_steps_per_second": 22.669,
609
  "step": 780
610
  },
611
  {
612
  "epoch": 6.076923076923077,
613
+ "grad_norm": 2.7967398166656494,
614
  "learning_rate": 1.1897435897435898e-05,
615
+ "loss": 0.9428,
616
  "step": 790
617
  },
618
  {
619
  "epoch": 6.153846153846154,
620
+ "grad_norm": 2.5342345237731934,
621
  "learning_rate": 1.1794871794871796e-05,
622
+ "loss": 0.9406,
623
  "step": 800
624
  },
625
  {
626
  "epoch": 6.230769230769231,
627
+ "grad_norm": 1.877543330192566,
628
  "learning_rate": 1.1692307692307694e-05,
629
+ "loss": 0.9319,
630
  "step": 810
631
  },
632
  {
633
  "epoch": 6.3076923076923075,
634
+ "grad_norm": 2.4524621963500977,
635
  "learning_rate": 1.1589743589743592e-05,
636
+ "loss": 0.9332,
637
  "step": 820
638
  },
639
  {
640
  "epoch": 6.384615384615385,
641
+ "grad_norm": 2.4967362880706787,
642
  "learning_rate": 1.1487179487179487e-05,
643
+ "loss": 0.9367,
644
  "step": 830
645
  },
646
  {
647
  "epoch": 6.461538461538462,
648
+ "grad_norm": 3.2078776359558105,
649
  "learning_rate": 1.1384615384615385e-05,
650
+ "loss": 0.9339,
651
  "step": 840
652
  },
653
  {
654
  "epoch": 6.538461538461538,
655
+ "grad_norm": 2.926706075668335,
656
  "learning_rate": 1.1282051282051283e-05,
657
+ "loss": 0.9416,
658
  "step": 850
659
  },
660
  {
661
  "epoch": 6.615384615384615,
662
+ "grad_norm": 1.8625017404556274,
663
  "learning_rate": 1.117948717948718e-05,
664
+ "loss": 0.9111,
665
  "step": 860
666
  },
667
  {
668
  "epoch": 6.6923076923076925,
669
+ "grad_norm": 2.7141189575195312,
670
  "learning_rate": 1.1076923076923079e-05,
671
+ "loss": 0.9574,
672
  "step": 870
673
  },
674
  {
675
  "epoch": 6.769230769230769,
676
+ "grad_norm": 2.307347536087036,
677
  "learning_rate": 1.0974358974358977e-05,
678
+ "loss": 0.9259,
679
  "step": 880
680
  },
681
  {
682
  "epoch": 6.846153846153846,
683
+ "grad_norm": 2.3937132358551025,
684
  "learning_rate": 1.0871794871794871e-05,
685
+ "loss": 0.9207,
686
  "step": 890
687
  },
688
  {
689
  "epoch": 6.923076923076923,
690
+ "grad_norm": 3.0794668197631836,
691
  "learning_rate": 1.076923076923077e-05,
692
+ "loss": 0.9418,
693
  "step": 900
694
  },
695
  {
696
  "epoch": 7.0,
697
+ "grad_norm": 4.111669063568115,
698
  "learning_rate": 1.0666666666666667e-05,
699
+ "loss": 0.9252,
700
  "step": 910
701
  },
702
  {
703
  "epoch": 7.0,
704
+ "eval_accuracy": 0.7969924812030075,
705
+ "eval_loss": 0.8874692916870117,
706
+ "eval_runtime": 0.7823,
707
+ "eval_samples_per_second": 170.013,
708
+ "eval_steps_per_second": 21.731,
709
  "step": 910
710
  },
711
  {
712
  "epoch": 7.076923076923077,
713
+ "grad_norm": 2.7561662197113037,
714
  "learning_rate": 1.0564102564102565e-05,
715
+ "loss": 0.911,
716
  "step": 920
717
  },
718
  {
719
  "epoch": 7.153846153846154,
720
+ "grad_norm": 3.2020223140716553,
721
  "learning_rate": 1.0461538461538463e-05,
722
+ "loss": 0.912,
723
  "step": 930
724
  },
725
  {
726
  "epoch": 7.230769230769231,
727
+ "grad_norm": 3.459304094314575,
728
  "learning_rate": 1.0358974358974361e-05,
729
+ "loss": 0.8994,
730
  "step": 940
731
  },
732
  {
733
  "epoch": 7.3076923076923075,
734
+ "grad_norm": 2.774078369140625,
735
  "learning_rate": 1.0256410256410256e-05,
736
+ "loss": 0.9079,
737
  "step": 950
738
  },
739
  {
740
  "epoch": 7.384615384615385,
741
+ "grad_norm": 2.7169668674468994,
742
  "learning_rate": 1.0153846153846154e-05,
743
+ "loss": 0.9256,
744
  "step": 960
745
  },
746
  {
747
  "epoch": 7.461538461538462,
748
+ "grad_norm": 2.171323299407959,
749
  "learning_rate": 1.0051282051282052e-05,
750
+ "loss": 0.8898,
751
  "step": 970
752
  },
753
  {
754
  "epoch": 7.538461538461538,
755
+ "grad_norm": 2.7350351810455322,
756
  "learning_rate": 9.94871794871795e-06,
757
+ "loss": 0.9243,
758
  "step": 980
759
  },
760
  {
761
  "epoch": 7.615384615384615,
762
+ "grad_norm": 2.3926539421081543,
763
  "learning_rate": 9.846153846153848e-06,
764
+ "loss": 0.8868,
765
  "step": 990
766
  },
767
  {
768
  "epoch": 7.6923076923076925,
769
+ "grad_norm": 2.0602715015411377,
770
  "learning_rate": 9.743589743589744e-06,
771
+ "loss": 0.8837,
772
  "step": 1000
773
  },
774
  {
775
  "epoch": 7.769230769230769,
776
+ "grad_norm": 2.885303497314453,
777
  "learning_rate": 9.641025641025642e-06,
778
+ "loss": 0.8827,
779
  "step": 1010
780
  },
781
  {
782
  "epoch": 7.846153846153846,
783
+ "grad_norm": 2.261361837387085,
784
  "learning_rate": 9.53846153846154e-06,
785
+ "loss": 0.9047,
786
  "step": 1020
787
  },
788
  {
789
  "epoch": 7.923076923076923,
790
+ "grad_norm": 2.6180179119110107,
791
  "learning_rate": 9.435897435897436e-06,
792
+ "loss": 0.861,
793
  "step": 1030
794
  },
795
  {
796
  "epoch": 8.0,
797
+ "grad_norm": 4.225304126739502,
798
  "learning_rate": 9.333333333333334e-06,
799
+ "loss": 0.9192,
800
  "step": 1040
801
  },
802
  {
803
  "epoch": 8.0,
804
  "eval_accuracy": 0.8120300751879699,
805
+ "eval_loss": 0.850643515586853,
806
+ "eval_runtime": 0.756,
807
+ "eval_samples_per_second": 175.92,
808
+ "eval_steps_per_second": 22.486,
809
  "step": 1040
810
  },
811
  {
812
  "epoch": 8.076923076923077,
813
+ "grad_norm": 2.1875813007354736,
814
  "learning_rate": 9.230769230769232e-06,
815
+ "loss": 0.8953,
816
  "step": 1050
817
  },
818
  {
819
  "epoch": 8.153846153846153,
820
+ "grad_norm": 2.1640567779541016,
821
  "learning_rate": 9.128205128205129e-06,
822
+ "loss": 0.8658,
823
  "step": 1060
824
  },
825
  {
826
  "epoch": 8.23076923076923,
827
+ "grad_norm": 2.660614490509033,
828
  "learning_rate": 9.025641025641027e-06,
829
+ "loss": 0.8995,
830
  "step": 1070
831
  },
832
  {
833
  "epoch": 8.307692307692308,
834
+ "grad_norm": 2.104029417037964,
835
  "learning_rate": 8.923076923076925e-06,
836
+ "loss": 0.8569,
837
  "step": 1080
838
  },
839
  {
840
  "epoch": 8.384615384615385,
841
+ "grad_norm": 2.2643303871154785,
842
  "learning_rate": 8.820512820512821e-06,
843
+ "loss": 0.8972,
844
  "step": 1090
845
  },
846
  {
847
  "epoch": 8.461538461538462,
848
+ "grad_norm": 2.632410764694214,
849
  "learning_rate": 8.717948717948719e-06,
850
+ "loss": 0.8715,
851
  "step": 1100
852
  },
853
  {
854
  "epoch": 8.538461538461538,
855
+ "grad_norm": 1.6500084400177002,
856
  "learning_rate": 8.615384615384617e-06,
857
+ "loss": 0.8716,
858
  "step": 1110
859
  },
860
  {
861
  "epoch": 8.615384615384615,
862
+ "grad_norm": 6.204855442047119,
863
  "learning_rate": 8.512820512820513e-06,
864
+ "loss": 0.8985,
865
  "step": 1120
866
  },
867
  {
868
  "epoch": 8.692307692307692,
869
+ "grad_norm": 3.729611873626709,
870
  "learning_rate": 8.410256410256411e-06,
871
+ "loss": 0.8837,
872
  "step": 1130
873
  },
874
  {
875
  "epoch": 8.76923076923077,
876
+ "grad_norm": 3.685739278793335,
877
  "learning_rate": 8.307692307692309e-06,
878
+ "loss": 0.8865,
879
  "step": 1140
880
  },
881
  {
882
  "epoch": 8.846153846153847,
883
+ "grad_norm": 2.7028560638427734,
884
  "learning_rate": 8.205128205128205e-06,
885
+ "loss": 0.875,
886
  "step": 1150
887
  },
888
  {
889
  "epoch": 8.923076923076923,
890
+ "grad_norm": 2.7692482471466064,
891
  "learning_rate": 8.102564102564103e-06,
892
+ "loss": 0.8867,
893
  "step": 1160
894
  },
895
  {
896
  "epoch": 9.0,
897
+ "grad_norm": 3.9854462146759033,
898
  "learning_rate": 8.000000000000001e-06,
899
+ "loss": 0.9008,
900
  "step": 1170
901
  },
902
  {
903
  "epoch": 9.0,
904
  "eval_accuracy": 0.8045112781954887,
905
+ "eval_loss": 0.8337866067886353,
906
+ "eval_runtime": 0.7963,
907
+ "eval_samples_per_second": 167.03,
908
+ "eval_steps_per_second": 21.35,
909
  "step": 1170
910
  },
911
  {
912
  "epoch": 9.076923076923077,
913
+ "grad_norm": 1.9381572008132935,
914
  "learning_rate": 7.897435897435898e-06,
915
+ "loss": 0.8969,
916
  "step": 1180
917
  },
918
  {
919
  "epoch": 9.153846153846153,
920
+ "grad_norm": 2.219219446182251,
921
  "learning_rate": 7.794871794871796e-06,
922
+ "loss": 0.8412,
923
  "step": 1190
924
  },
925
  {
926
  "epoch": 9.23076923076923,
927
+ "grad_norm": 2.1302294731140137,
928
  "learning_rate": 7.692307692307694e-06,
929
+ "loss": 0.8483,
930
  "step": 1200
931
  },
932
  {
933
  "epoch": 9.307692307692308,
934
+ "grad_norm": 2.541210174560547,
935
  "learning_rate": 7.58974358974359e-06,
936
+ "loss": 0.8536,
937
  "step": 1210
938
  },
939
  {
940
  "epoch": 9.384615384615385,
941
+ "grad_norm": 1.952871322631836,
942
  "learning_rate": 7.487179487179488e-06,
943
+ "loss": 0.8707,
944
  "step": 1220
945
  },
946
  {
947
  "epoch": 9.461538461538462,
948
+ "grad_norm": 3.273028612136841,
949
  "learning_rate": 7.384615384615386e-06,
950
+ "loss": 0.8547,
951
  "step": 1230
952
  },
953
  {
954
  "epoch": 9.538461538461538,
955
+ "grad_norm": 2.6495628356933594,
956
  "learning_rate": 7.282051282051282e-06,
957
+ "loss": 0.8709,
958
  "step": 1240
959
  },
960
  {
961
  "epoch": 9.615384615384615,
962
+ "grad_norm": 1.998024582862854,
963
  "learning_rate": 7.17948717948718e-06,
964
+ "loss": 0.8278,
965
  "step": 1250
966
  },
967
  {
968
  "epoch": 9.692307692307692,
969
+ "grad_norm": 2.7621707916259766,
970
  "learning_rate": 7.076923076923078e-06,
971
+ "loss": 0.8544,
972
  "step": 1260
973
  },
974
  {
975
  "epoch": 9.76923076923077,
976
+ "grad_norm": 1.844375491142273,
977
  "learning_rate": 6.974358974358974e-06,
978
+ "loss": 0.8324,
979
  "step": 1270
980
  },
981
  {
982
  "epoch": 9.846153846153847,
983
+ "grad_norm": 2.149479866027832,
984
  "learning_rate": 6.871794871794872e-06,
985
+ "loss": 0.8146,
986
  "step": 1280
987
  },
988
  {
989
  "epoch": 9.923076923076923,
990
+ "grad_norm": 2.2224795818328857,
991
  "learning_rate": 6.76923076923077e-06,
992
+ "loss": 0.8367,
993
  "step": 1290
994
  },
995
  {
996
  "epoch": 10.0,
997
+ "grad_norm": 3.8497843742370605,
998
  "learning_rate": 6.666666666666667e-06,
999
+ "loss": 0.8079,
1000
  "step": 1300
1001
  },
1002
  {
1003
  "epoch": 10.0,
1004
  "eval_accuracy": 0.8421052631578947,
1005
+ "eval_loss": 0.8103837370872498,
1006
+ "eval_runtime": 0.7593,
1007
+ "eval_samples_per_second": 175.164,
1008
+ "eval_steps_per_second": 22.389,
1009
  "step": 1300
1010
  },
1011
  {
1012
  "epoch": 10.076923076923077,
1013
+ "grad_norm": 2.0343823432922363,
1014
  "learning_rate": 6.564102564102565e-06,
1015
+ "loss": 0.8408,
1016
  "step": 1310
1017
  },
1018
  {
1019
  "epoch": 10.153846153846153,
1020
+ "grad_norm": 2.4245193004608154,
1021
  "learning_rate": 6.461538461538463e-06,
1022
+ "loss": 0.899,
1023
  "step": 1320
1024
  },
1025
  {
1026
  "epoch": 10.23076923076923,
1027
+ "grad_norm": 2.3912925720214844,
1028
  "learning_rate": 6.358974358974359e-06,
1029
+ "loss": 0.8758,
1030
  "step": 1330
1031
  },
1032
  {
1033
  "epoch": 10.307692307692308,
1034
+ "grad_norm": 2.1387076377868652,
1035
  "learning_rate": 6.256410256410257e-06,
1036
+ "loss": 0.8295,
1037
  "step": 1340
1038
  },
1039
  {
1040
  "epoch": 10.384615384615385,
1041
+ "grad_norm": 2.142160415649414,
1042
  "learning_rate": 6.153846153846155e-06,
1043
+ "loss": 0.8075,
1044
  "step": 1350
1045
  },
1046
  {
1047
  "epoch": 10.461538461538462,
1048
+ "grad_norm": 2.6838831901550293,
1049
  "learning_rate": 6.051282051282051e-06,
1050
+ "loss": 0.8448,
1051
  "step": 1360
1052
  },
1053
  {
1054
  "epoch": 10.538461538461538,
1055
+ "grad_norm": 2.476369857788086,
1056
  "learning_rate": 5.948717948717949e-06,
1057
+ "loss": 0.817,
1058
  "step": 1370
1059
  },
1060
  {
1061
  "epoch": 10.615384615384615,
1062
+ "grad_norm": 3.031463861465454,
1063
  "learning_rate": 5.846153846153847e-06,
1064
+ "loss": 0.8177,
1065
  "step": 1380
1066
  },
1067
  {
1068
  "epoch": 10.692307692307692,
1069
+ "grad_norm": 2.2818636894226074,
1070
  "learning_rate": 5.743589743589743e-06,
1071
+ "loss": 0.8124,
1072
  "step": 1390
1073
  },
1074
  {
1075
  "epoch": 10.76923076923077,
1076
+ "grad_norm": 3.245805263519287,
1077
  "learning_rate": 5.641025641025641e-06,
1078
+ "loss": 0.8674,
1079
  "step": 1400
1080
  },
1081
  {
1082
  "epoch": 10.846153846153847,
1083
+ "grad_norm": 2.194627046585083,
1084
  "learning_rate": 5.538461538461539e-06,
1085
+ "loss": 0.831,
1086
  "step": 1410
1087
  },
1088
  {
1089
  "epoch": 10.923076923076923,
1090
+ "grad_norm": 1.8149436712265015,
1091
  "learning_rate": 5.435897435897436e-06,
1092
+ "loss": 0.8391,
1093
  "step": 1420
1094
  },
1095
  {
1096
  "epoch": 11.0,
1097
+ "grad_norm": 4.0584821701049805,
1098
  "learning_rate": 5.333333333333334e-06,
1099
+ "loss": 0.8332,
1100
  "step": 1430
1101
  },
1102
  {
1103
  "epoch": 11.0,
1104
  "eval_accuracy": 0.8345864661654135,
1105
+ "eval_loss": 0.7806060314178467,
1106
+ "eval_runtime": 0.742,
1107
+ "eval_samples_per_second": 179.256,
1108
+ "eval_steps_per_second": 22.912,
1109
  "step": 1430
1110
  },
1111
  {
1112
  "epoch": 11.076923076923077,
1113
+ "grad_norm": 1.9833248853683472,
1114
  "learning_rate": 5.230769230769232e-06,
1115
+ "loss": 0.8484,
1116
  "step": 1440
1117
  },
1118
  {
1119
  "epoch": 11.153846153846153,
1120
+ "grad_norm": 5.478232383728027,
1121
  "learning_rate": 5.128205128205128e-06,
1122
+ "loss": 0.8308,
1123
  "step": 1450
1124
  },
1125
  {
1126
  "epoch": 11.23076923076923,
1127
+ "grad_norm": 2.5792922973632812,
1128
  "learning_rate": 5.025641025641026e-06,
1129
+ "loss": 0.802,
1130
  "step": 1460
1131
  },
1132
  {
1133
  "epoch": 11.307692307692308,
1134
+ "grad_norm": 2.730989694595337,
1135
  "learning_rate": 4.923076923076924e-06,
1136
+ "loss": 0.8225,
1137
  "step": 1470
1138
  },
1139
  {
1140
  "epoch": 11.384615384615385,
1141
+ "grad_norm": 2.7447853088378906,
1142
  "learning_rate": 4.820512820512821e-06,
1143
+ "loss": 0.8176,
1144
  "step": 1480
1145
  },
1146
  {
1147
  "epoch": 11.461538461538462,
1148
+ "grad_norm": 2.6465837955474854,
1149
  "learning_rate": 4.717948717948718e-06,
1150
+ "loss": 0.8471,
1151
  "step": 1490
1152
  },
1153
  {
1154
  "epoch": 11.538461538461538,
1155
+ "grad_norm": 2.4876015186309814,
1156
  "learning_rate": 4.615384615384616e-06,
1157
+ "loss": 0.8349,
1158
  "step": 1500
1159
  },
1160
  {
1161
  "epoch": 11.615384615384615,
1162
+ "grad_norm": 3.2605788707733154,
1163
  "learning_rate": 4.512820512820513e-06,
1164
+ "loss": 0.8285,
1165
  "step": 1510
1166
  },
1167
  {
1168
  "epoch": 11.692307692307692,
1169
+ "grad_norm": 3.278341293334961,
1170
  "learning_rate": 4.4102564102564104e-06,
1171
+ "loss": 0.8546,
1172
  "step": 1520
1173
  },
1174
  {
1175
  "epoch": 11.76923076923077,
1176
+ "grad_norm": 2.0945637226104736,
1177
  "learning_rate": 4.307692307692308e-06,
1178
+ "loss": 0.8096,
1179
  "step": 1530
1180
  },
1181
  {
1182
  "epoch": 11.846153846153847,
1183
+ "grad_norm": 2.161726474761963,
1184
  "learning_rate": 4.2051282051282055e-06,
1185
+ "loss": 0.7938,
1186
  "step": 1540
1187
  },
1188
  {
1189
  "epoch": 11.923076923076923,
1190
+ "grad_norm": 2.1052703857421875,
1191
  "learning_rate": 4.102564102564103e-06,
1192
+ "loss": 0.8295,
1193
  "step": 1550
1194
  },
1195
  {
1196
  "epoch": 12.0,
1197
+ "grad_norm": 3.460094451904297,
1198
  "learning_rate": 4.000000000000001e-06,
1199
+ "loss": 0.8103,
1200
  "step": 1560
1201
  },
1202
  {
1203
  "epoch": 12.0,
1204
  "eval_accuracy": 0.8345864661654135,
1205
+ "eval_loss": 0.7585543990135193,
1206
+ "eval_runtime": 0.7508,
1207
+ "eval_samples_per_second": 177.133,
1208
+ "eval_steps_per_second": 22.641,
1209
  "step": 1560
1210
  },
1211
  {
1212
  "epoch": 12.076923076923077,
1213
+ "grad_norm": 2.943866014480591,
1214
  "learning_rate": 3.897435897435898e-06,
1215
+ "loss": 0.7903,
1216
  "step": 1570
1217
  },
1218
  {
1219
  "epoch": 12.153846153846153,
1220
+ "grad_norm": 2.6185402870178223,
1221
  "learning_rate": 3.794871794871795e-06,
1222
+ "loss": 0.8229,
1223
  "step": 1580
1224
  },
1225
  {
1226
  "epoch": 12.23076923076923,
1227
+ "grad_norm": 1.6378310918807983,
1228
  "learning_rate": 3.692307692307693e-06,
1229
+ "loss": 0.8246,
1230
  "step": 1590
1231
  },
1232
  {
1233
  "epoch": 12.307692307692308,
1234
+ "grad_norm": 2.3109569549560547,
1235
  "learning_rate": 3.58974358974359e-06,
1236
+ "loss": 0.8363,
1237
  "step": 1600
1238
  },
1239
  {
1240
  "epoch": 12.384615384615385,
1241
+ "grad_norm": 2.3602941036224365,
1242
  "learning_rate": 3.487179487179487e-06,
1243
+ "loss": 0.8078,
1244
  "step": 1610
1245
  },
1246
  {
1247
  "epoch": 12.461538461538462,
1248
+ "grad_norm": 3.0623390674591064,
1249
  "learning_rate": 3.384615384615385e-06,
1250
+ "loss": 0.794,
1251
  "step": 1620
1252
  },
1253
  {
1254
  "epoch": 12.538461538461538,
1255
+ "grad_norm": 2.947983741760254,
1256
  "learning_rate": 3.2820512820512823e-06,
1257
+ "loss": 0.8033,
1258
  "step": 1630
1259
  },
1260
  {
1261
  "epoch": 12.615384615384615,
1262
+ "grad_norm": 1.8083330392837524,
1263
  "learning_rate": 3.1794871794871795e-06,
1264
+ "loss": 0.8158,
1265
  "step": 1640
1266
  },
1267
  {
1268
  "epoch": 12.692307692307692,
1269
+ "grad_norm": 3.2873637676239014,
1270
  "learning_rate": 3.0769230769230774e-06,
1271
+ "loss": 0.7651,
1272
  "step": 1650
1273
  },
1274
  {
1275
  "epoch": 12.76923076923077,
1276
+ "grad_norm": 2.3777670860290527,
1277
  "learning_rate": 2.9743589743589746e-06,
1278
+ "loss": 0.8566,
1279
  "step": 1660
1280
  },
1281
  {
1282
  "epoch": 12.846153846153847,
1283
+ "grad_norm": 1.8692084550857544,
1284
  "learning_rate": 2.8717948717948717e-06,
1285
+ "loss": 0.8218,
1286
  "step": 1670
1287
  },
1288
  {
1289
  "epoch": 12.923076923076923,
1290
+ "grad_norm": 2.2379138469696045,
1291
  "learning_rate": 2.7692307692307697e-06,
1292
+ "loss": 0.7984,
1293
  "step": 1680
1294
  },
1295
  {
1296
  "epoch": 13.0,
1297
+ "grad_norm": 4.131476879119873,
1298
  "learning_rate": 2.666666666666667e-06,
1299
+ "loss": 0.8149,
1300
  "step": 1690
1301
  },
1302
  {
1303
  "epoch": 13.0,
1304
  "eval_accuracy": 0.8421052631578947,
1305
+ "eval_loss": 0.757113516330719,
1306
+ "eval_runtime": 0.7762,
1307
+ "eval_samples_per_second": 171.337,
1308
+ "eval_steps_per_second": 21.9,
1309
  "step": 1690
1310
  },
1311
  {
1312
  "epoch": 13.076923076923077,
1313
+ "grad_norm": 2.9936656951904297,
1314
  "learning_rate": 2.564102564102564e-06,
1315
+ "loss": 0.7917,
1316
  "step": 1700
1317
  },
1318
  {
1319
  "epoch": 13.153846153846153,
1320
+ "grad_norm": 2.5392699241638184,
1321
  "learning_rate": 2.461538461538462e-06,
1322
+ "loss": 0.8241,
1323
  "step": 1710
1324
  },
1325
  {
1326
  "epoch": 13.23076923076923,
1327
+ "grad_norm": 3.0166265964508057,
1328
  "learning_rate": 2.358974358974359e-06,
1329
+ "loss": 0.8117,
1330
  "step": 1720
1331
  },
1332
  {
1333
  "epoch": 13.307692307692308,
1334
+ "grad_norm": 1.8728867769241333,
1335
  "learning_rate": 2.2564102564102566e-06,
1336
+ "loss": 0.8155,
1337
  "step": 1730
1338
  },
1339
  {
1340
  "epoch": 13.384615384615385,
1341
+ "grad_norm": 2.50715708732605,
1342
  "learning_rate": 2.153846153846154e-06,
1343
+ "loss": 0.7814,
1344
  "step": 1740
1345
  },
1346
  {
1347
  "epoch": 13.461538461538462,
1348
+ "grad_norm": 5.447348594665527,
1349
  "learning_rate": 2.0512820512820513e-06,
1350
+ "loss": 0.8253,
1351
  "step": 1750
1352
  },
1353
  {
1354
  "epoch": 13.538461538461538,
1355
+ "grad_norm": 2.6522035598754883,
1356
  "learning_rate": 1.948717948717949e-06,
1357
+ "loss": 0.8486,
1358
  "step": 1760
1359
  },
1360
  {
1361
  "epoch": 13.615384615384615,
1362
+ "grad_norm": 2.1300199031829834,
1363
  "learning_rate": 1.8461538461538465e-06,
1364
+ "loss": 0.8027,
1365
  "step": 1770
1366
  },
1367
  {
1368
  "epoch": 13.692307692307692,
1369
+ "grad_norm": 2.1135923862457275,
1370
  "learning_rate": 1.7435897435897436e-06,
1371
+ "loss": 0.7852,
1372
  "step": 1780
1373
  },
1374
  {
1375
  "epoch": 13.76923076923077,
1376
+ "grad_norm": 1.871300220489502,
1377
  "learning_rate": 1.6410256410256412e-06,
1378
+ "loss": 0.8224,
1379
  "step": 1790
1380
  },
1381
  {
1382
  "epoch": 13.846153846153847,
1383
+ "grad_norm": 3.240356206893921,
1384
  "learning_rate": 1.5384615384615387e-06,
1385
+ "loss": 0.7895,
1386
  "step": 1800
1387
  },
1388
  {
1389
  "epoch": 13.923076923076923,
1390
+ "grad_norm": 2.5182340145111084,
1391
  "learning_rate": 1.4358974358974359e-06,
1392
+ "loss": 0.7316,
1393
  "step": 1810
1394
  },
1395
  {
1396
  "epoch": 14.0,
1397
+ "grad_norm": 4.281803607940674,
1398
  "learning_rate": 1.3333333333333334e-06,
1399
+ "loss": 0.8186,
1400
  "step": 1820
1401
  },
1402
  {
1403
  "epoch": 14.0,
1404
  "eval_accuracy": 0.8270676691729323,
1405
+ "eval_loss": 0.7540305852890015,
1406
+ "eval_runtime": 0.7703,
1407
+ "eval_samples_per_second": 172.654,
1408
+ "eval_steps_per_second": 22.069,
1409
  "step": 1820
1410
  },
1411
  {
1412
  "epoch": 14.076923076923077,
1413
+ "grad_norm": 2.050518751144409,
1414
  "learning_rate": 1.230769230769231e-06,
1415
+ "loss": 0.8222,
1416
  "step": 1830
1417
  },
1418
  {
1419
  "epoch": 14.153846153846153,
1420
+ "grad_norm": 2.051259994506836,
1421
  "learning_rate": 1.1282051282051283e-06,
1422
+ "loss": 0.7878,
1423
  "step": 1840
1424
  },
1425
  {
1426
  "epoch": 14.23076923076923,
1427
+ "grad_norm": 2.8861193656921387,
1428
  "learning_rate": 1.0256410256410257e-06,
1429
+ "loss": 0.78,
1430
  "step": 1850
1431
  },
1432
  {
1433
  "epoch": 14.307692307692308,
1434
+ "grad_norm": 4.159270763397217,
1435
  "learning_rate": 9.230769230769232e-07,
1436
+ "loss": 0.774,
1437
  "step": 1860
1438
  },
1439
  {
1440
  "epoch": 14.384615384615385,
1441
+ "grad_norm": 2.8624985218048096,
1442
  "learning_rate": 8.205128205128206e-07,
1443
+ "loss": 0.7882,
1444
  "step": 1870
1445
  },
1446
  {
1447
  "epoch": 14.461538461538462,
1448
+ "grad_norm": 2.5051703453063965,
1449
  "learning_rate": 7.179487179487179e-07,
1450
+ "loss": 0.7883,
1451
  "step": 1880
1452
  },
1453
  {
1454
  "epoch": 14.538461538461538,
1455
+ "grad_norm": 3.003545045852661,
1456
  "learning_rate": 6.153846153846155e-07,
1457
+ "loss": 0.7817,
1458
  "step": 1890
1459
  },
1460
  {
1461
  "epoch": 14.615384615384615,
1462
+ "grad_norm": 2.8403878211975098,
1463
  "learning_rate": 5.128205128205128e-07,
1464
+ "loss": 0.8294,
1465
  "step": 1900
1466
  },
1467
  {
1468
  "epoch": 14.692307692307692,
1469
+ "grad_norm": 2.124030590057373,
1470
  "learning_rate": 4.102564102564103e-07,
1471
+ "loss": 0.7978,
1472
  "step": 1910
1473
  },
1474
  {
1475
  "epoch": 14.76923076923077,
1476
+ "grad_norm": 4.762181758880615,
1477
  "learning_rate": 3.0769230769230774e-07,
1478
+ "loss": 0.8038,
1479
  "step": 1920
1480
  },
1481
  {
1482
  "epoch": 14.846153846153847,
1483
+ "grad_norm": 3.256133794784546,
1484
  "learning_rate": 2.0512820512820514e-07,
1485
+ "loss": 0.8535,
1486
  "step": 1930
1487
  },
1488
  {
1489
  "epoch": 14.923076923076923,
1490
+ "grad_norm": 2.355344772338867,
1491
  "learning_rate": 1.0256410256410257e-07,
1492
+ "loss": 0.7587,
1493
  "step": 1940
1494
  },
1495
  {
1496
  "epoch": 15.0,
1497
+ "grad_norm": 4.202574729919434,
1498
  "learning_rate": 0.0,
1499
+ "loss": 0.7929,
1500
  "step": 1950
1501
  },
1502
  {
1503
  "epoch": 15.0,
1504
+ "eval_accuracy": 0.8120300751879699,
1505
+ "eval_loss": 0.7412300109863281,
1506
+ "eval_runtime": 0.8087,
1507
+ "eval_samples_per_second": 164.47,
1508
+ "eval_steps_per_second": 21.022,
1509
  "step": 1950
1510
  },
1511
  {
1512
  "epoch": 15.0,
1513
  "step": 1950,
1514
  "total_flos": 1.5658365504595968e+17,
1515
+ "train_loss": 0.9236146088135548,
1516
+ "train_runtime": 142.953,
1517
+ "train_samples_per_second": 108.497,
1518
+ "train_steps_per_second": 13.641
1519
  }
1520
  ],
1521
  "logging_steps": 10,