Raihan004 commited on
Commit
3838257
1 Parent(s): 07db0d1

🍻 cheers

Browse files
README.md CHANGED
@@ -1,7 +1,8 @@
1
  ---
2
  license: apache-2.0
3
- base_model: google/vit-base-patch16-224-in21k
4
  tags:
 
5
  - generated_from_trainer
6
  datasets:
7
  - imagefolder
@@ -14,7 +15,7 @@ model-index:
14
  name: Image Classification
15
  type: image-classification
16
  dataset:
17
- name: imagefolder
18
  type: imagefolder
19
  config: default
20
  split: train
@@ -22,7 +23,7 @@ model-index:
22
  metrics:
23
  - name: Accuracy
24
  type: accuracy
25
- value: 0.8347978910369068
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -30,10 +31,10 @@ should probably proofread and complete it, then remove this comment. -->
30
 
31
  # Action_model
32
 
33
- This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
34
  It achieves the following results on the evaluation set:
35
- - Loss: 0.7013
36
- - Accuracy: 0.8348
37
 
38
  ## Model description
39
 
 
1
  ---
2
  license: apache-2.0
3
+ base_model: Raihan004/Action_model
4
  tags:
5
+ - image-classification
6
  - generated_from_trainer
7
  datasets:
8
  - imagefolder
 
15
  name: Image Classification
16
  type: image-classification
17
  dataset:
18
+ name: action_class
19
  type: imagefolder
20
  config: default
21
  split: train
 
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
+ value: 0.8330404217926186
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
31
 
32
  # Action_model
33
 
34
+ This model is a fine-tuned version of [Raihan004/Action_model](https://huggingface.co/Raihan004/Action_model) on the action_class dataset.
35
  It achieves the following results on the evaluation set:
36
+ - Loss: 0.6130
37
+ - Accuracy: 0.8330
38
 
39
  ## Model description
40
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_accuracy": 0.8541300527240774,
4
- "eval_loss": 0.49219682812690735,
5
- "eval_runtime": 8.6475,
6
- "eval_samples_per_second": 65.799,
7
- "eval_steps_per_second": 8.326,
8
  "total_flos": 3.3230947683690086e+18,
9
- "train_loss": 0.45721185349706395,
10
- "train_runtime": 1410.6493,
11
- "train_samples_per_second": 30.397,
12
- "train_steps_per_second": 1.9
13
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_accuracy": 0.8330404217926186,
4
+ "eval_loss": 0.6129801869392395,
5
+ "eval_runtime": 8.4281,
6
+ "eval_samples_per_second": 67.512,
7
+ "eval_steps_per_second": 8.543,
8
  "total_flos": 3.3230947683690086e+18,
9
+ "train_loss": 0.23535207314277762,
10
+ "train_runtime": 1371.8304,
11
+ "train_samples_per_second": 31.258,
12
+ "train_steps_per_second": 1.954
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_accuracy": 0.8541300527240774,
4
- "eval_loss": 0.49219682812690735,
5
- "eval_runtime": 8.6475,
6
- "eval_samples_per_second": 65.799,
7
- "eval_steps_per_second": 8.326
8
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_accuracy": 0.8330404217926186,
4
+ "eval_loss": 0.6129801869392395,
5
+ "eval_runtime": 8.4281,
6
+ "eval_samples_per_second": 67.512,
7
+ "eval_steps_per_second": 8.543
8
  }
runs/May01_07-50-39_5c48e9aaec37/events.out.tfevents.1714573726.5c48e9aaec37.34.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c05609fc31780b43695335d82ef6de8e87bacae435e254f4e8089af320af667
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
  "total_flos": 3.3230947683690086e+18,
4
- "train_loss": 0.45721185349706395,
5
- "train_runtime": 1410.6493,
6
- "train_samples_per_second": 30.397,
7
- "train_steps_per_second": 1.9
8
  }
 
1
  {
2
  "epoch": 10.0,
3
  "total_flos": 3.3230947683690086e+18,
4
+ "train_loss": 0.23535207314277762,
5
+ "train_runtime": 1371.8304,
6
+ "train_samples_per_second": 31.258,
7
+ "train_steps_per_second": 1.954
8
  }
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 0.49219682812690735,
3
- "best_model_checkpoint": "Action_model/checkpoint-1900",
4
  "epoch": 10.0,
5
  "eval_steps": 100,
6
  "global_step": 2680,
@@ -10,2122 +10,2122 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.04,
13
- "grad_norm": 1.7867449522018433,
14
  "learning_rate": 9.96268656716418e-05,
15
- "loss": 2.2615,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.07,
20
- "grad_norm": 1.8689260482788086,
21
  "learning_rate": 9.925373134328359e-05,
22
- "loss": 2.1747,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.11,
27
- "grad_norm": 1.9384838342666626,
28
  "learning_rate": 9.888059701492539e-05,
29
- "loss": 2.041,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.15,
34
- "grad_norm": 2.1394729614257812,
35
  "learning_rate": 9.850746268656717e-05,
36
- "loss": 1.8498,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.19,
41
- "grad_norm": 2.4759299755096436,
42
  "learning_rate": 9.813432835820896e-05,
43
- "loss": 1.7391,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.22,
48
- "grad_norm": 2.863851547241211,
49
  "learning_rate": 9.776119402985075e-05,
50
- "loss": 1.5671,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.26,
55
- "grad_norm": 2.2633190155029297,
56
  "learning_rate": 9.738805970149254e-05,
57
- "loss": 1.5166,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.3,
62
- "grad_norm": 2.2081799507141113,
63
  "learning_rate": 9.701492537313434e-05,
64
- "loss": 1.4131,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.34,
69
- "grad_norm": 2.7711575031280518,
70
  "learning_rate": 9.664179104477612e-05,
71
- "loss": 1.2985,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.37,
76
- "grad_norm": 2.807891607284546,
77
  "learning_rate": 9.626865671641792e-05,
78
- "loss": 1.2738,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.37,
83
- "eval_accuracy": 0.7346221441124781,
84
- "eval_loss": 1.130598545074463,
85
- "eval_runtime": 12.2164,
86
- "eval_samples_per_second": 46.577,
87
- "eval_steps_per_second": 5.894,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 0.41,
92
- "grad_norm": 3.494056224822998,
93
  "learning_rate": 9.58955223880597e-05,
94
- "loss": 1.2577,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 0.45,
99
- "grad_norm": 3.508563756942749,
100
  "learning_rate": 9.552238805970149e-05,
101
- "loss": 1.1644,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 0.49,
106
- "grad_norm": 5.29508638381958,
107
  "learning_rate": 9.514925373134329e-05,
108
- "loss": 1.2097,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 0.52,
113
- "grad_norm": 3.179622173309326,
114
  "learning_rate": 9.477611940298507e-05,
115
- "loss": 1.1636,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 0.56,
120
- "grad_norm": 5.1542253494262695,
121
  "learning_rate": 9.440298507462687e-05,
122
- "loss": 1.1178,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 0.6,
127
- "grad_norm": 2.3797709941864014,
128
  "learning_rate": 9.402985074626867e-05,
129
- "loss": 0.9977,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 0.63,
134
- "grad_norm": 4.6406474113464355,
135
  "learning_rate": 9.365671641791045e-05,
136
- "loss": 1.1188,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 0.67,
141
- "grad_norm": 3.9354639053344727,
142
- "learning_rate": 9.332089552238806e-05,
143
- "loss": 1.0444,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 0.71,
148
- "grad_norm": 5.296183109283447,
149
- "learning_rate": 9.294776119402986e-05,
150
- "loss": 1.0577,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 0.75,
155
- "grad_norm": 3.7493107318878174,
156
- "learning_rate": 9.257462686567165e-05,
157
- "loss": 0.9277,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 0.75,
162
- "eval_accuracy": 0.7644991212653779,
163
- "eval_loss": 0.8469392657279968,
164
- "eval_runtime": 8.5143,
165
- "eval_samples_per_second": 66.829,
166
- "eval_steps_per_second": 8.456,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 0.78,
171
- "grad_norm": 3.4060637950897217,
172
- "learning_rate": 9.220149253731343e-05,
173
- "loss": 0.8143,
174
  "step": 210
175
  },
176
  {
177
  "epoch": 0.82,
178
- "grad_norm": 5.105949878692627,
179
- "learning_rate": 9.182835820895522e-05,
180
- "loss": 1.0109,
181
  "step": 220
182
  },
183
  {
184
  "epoch": 0.86,
185
- "grad_norm": 3.1793041229248047,
186
- "learning_rate": 9.145522388059702e-05,
187
- "loss": 0.8406,
188
  "step": 230
189
  },
190
  {
191
  "epoch": 0.9,
192
- "grad_norm": 3.1578280925750732,
193
- "learning_rate": 9.108208955223881e-05,
194
- "loss": 1.0428,
195
  "step": 240
196
  },
197
  {
198
  "epoch": 0.93,
199
- "grad_norm": 4.253862380981445,
200
- "learning_rate": 9.07089552238806e-05,
201
- "loss": 0.8443,
202
  "step": 250
203
  },
204
  {
205
  "epoch": 0.97,
206
- "grad_norm": 5.269837856292725,
207
- "learning_rate": 9.03358208955224e-05,
208
- "loss": 0.9181,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 1.01,
213
- "grad_norm": 4.203462600708008,
214
- "learning_rate": 8.996268656716418e-05,
215
- "loss": 0.8862,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 1.04,
220
- "grad_norm": 3.3668694496154785,
221
- "learning_rate": 8.958955223880598e-05,
222
- "loss": 0.6877,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 1.08,
227
- "grad_norm": 4.250879764556885,
228
- "learning_rate": 8.921641791044776e-05,
229
- "loss": 0.7136,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 1.12,
234
- "grad_norm": 4.63535213470459,
235
- "learning_rate": 8.884328358208955e-05,
236
- "loss": 0.9232,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 1.12,
241
- "eval_accuracy": 0.789103690685413,
242
- "eval_loss": 0.7195268273353577,
243
- "eval_runtime": 8.5181,
244
- "eval_samples_per_second": 66.799,
245
- "eval_steps_per_second": 8.453,
246
  "step": 300
247
  },
248
  {
249
  "epoch": 1.16,
250
- "grad_norm": 4.166705131530762,
251
- "learning_rate": 8.847014925373135e-05,
252
- "loss": 0.778,
253
  "step": 310
254
  },
255
  {
256
  "epoch": 1.19,
257
- "grad_norm": 4.43386173248291,
258
- "learning_rate": 8.809701492537313e-05,
259
- "loss": 0.7922,
260
  "step": 320
261
  },
262
  {
263
  "epoch": 1.23,
264
- "grad_norm": 5.09395170211792,
265
- "learning_rate": 8.772388059701493e-05,
266
- "loss": 0.7687,
267
  "step": 330
268
  },
269
  {
270
  "epoch": 1.27,
271
- "grad_norm": 3.8679046630859375,
272
- "learning_rate": 8.735074626865673e-05,
273
- "loss": 0.8119,
274
  "step": 340
275
  },
276
  {
277
  "epoch": 1.31,
278
- "grad_norm": 7.185283184051514,
279
- "learning_rate": 8.697761194029851e-05,
280
- "loss": 0.6383,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 1.34,
285
- "grad_norm": 4.295166969299316,
286
- "learning_rate": 8.66044776119403e-05,
287
- "loss": 0.8093,
288
  "step": 360
289
  },
290
  {
291
  "epoch": 1.38,
292
- "grad_norm": 4.563631057739258,
293
- "learning_rate": 8.623134328358208e-05,
294
- "loss": 0.7776,
295
  "step": 370
296
  },
297
  {
298
  "epoch": 1.42,
299
- "grad_norm": 3.190004825592041,
300
- "learning_rate": 8.585820895522388e-05,
301
- "loss": 0.6576,
302
  "step": 380
303
  },
304
  {
305
  "epoch": 1.46,
306
- "grad_norm": 6.8861083984375,
307
  "learning_rate": 8.548507462686568e-05,
308
- "loss": 0.7661,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 1.49,
313
- "grad_norm": 3.9371914863586426,
314
  "learning_rate": 8.511194029850747e-05,
315
- "loss": 0.7631,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 1.49,
320
- "eval_accuracy": 0.8172231985940246,
321
- "eval_loss": 0.6575105786323547,
322
- "eval_runtime": 8.5652,
323
- "eval_samples_per_second": 66.432,
324
- "eval_steps_per_second": 8.406,
325
  "step": 400
326
  },
327
  {
328
  "epoch": 1.53,
329
- "grad_norm": 3.310469150543213,
330
  "learning_rate": 8.473880597014926e-05,
331
- "loss": 0.6434,
332
  "step": 410
333
  },
334
  {
335
  "epoch": 1.57,
336
- "grad_norm": 4.5913591384887695,
337
  "learning_rate": 8.436567164179105e-05,
338
- "loss": 0.8034,
339
  "step": 420
340
  },
341
  {
342
  "epoch": 1.6,
343
- "grad_norm": 4.726842403411865,
344
  "learning_rate": 8.399253731343283e-05,
345
- "loss": 0.6168,
346
  "step": 430
347
  },
348
  {
349
  "epoch": 1.64,
350
- "grad_norm": 6.282873630523682,
351
  "learning_rate": 8.361940298507463e-05,
352
- "loss": 0.5843,
353
  "step": 440
354
  },
355
  {
356
  "epoch": 1.68,
357
- "grad_norm": 4.726373672485352,
358
  "learning_rate": 8.324626865671642e-05,
359
- "loss": 0.5252,
360
  "step": 450
361
  },
362
  {
363
  "epoch": 1.72,
364
- "grad_norm": 4.248327732086182,
365
- "learning_rate": 8.291044776119404e-05,
366
- "loss": 0.6661,
367
  "step": 460
368
  },
369
  {
370
  "epoch": 1.75,
371
- "grad_norm": 4.138558387756348,
372
- "learning_rate": 8.253731343283582e-05,
373
- "loss": 0.6494,
374
  "step": 470
375
  },
376
  {
377
  "epoch": 1.79,
378
- "grad_norm": 3.5934176445007324,
379
- "learning_rate": 8.216417910447761e-05,
380
- "loss": 0.6315,
381
  "step": 480
382
  },
383
  {
384
  "epoch": 1.83,
385
- "grad_norm": 4.674199104309082,
386
- "learning_rate": 8.179104477611941e-05,
387
- "loss": 0.6855,
388
  "step": 490
389
  },
390
  {
391
  "epoch": 1.87,
392
- "grad_norm": 6.128390312194824,
393
- "learning_rate": 8.14179104477612e-05,
394
- "loss": 0.6677,
395
  "step": 500
396
  },
397
  {
398
  "epoch": 1.87,
399
- "eval_accuracy": 0.8347978910369068,
400
- "eval_loss": 0.5807344913482666,
401
- "eval_runtime": 9.1606,
402
- "eval_samples_per_second": 62.114,
403
- "eval_steps_per_second": 7.86,
404
  "step": 500
405
  },
406
  {
407
  "epoch": 1.9,
408
- "grad_norm": 4.19622278213501,
409
- "learning_rate": 8.104477611940299e-05,
410
- "loss": 0.5657,
411
  "step": 510
412
  },
413
  {
414
  "epoch": 1.94,
415
- "grad_norm": 4.012423992156982,
416
- "learning_rate": 8.067164179104479e-05,
417
- "loss": 0.5634,
418
  "step": 520
419
  },
420
  {
421
  "epoch": 1.98,
422
- "grad_norm": 4.992753505706787,
423
- "learning_rate": 8.029850746268657e-05,
424
- "loss": 0.7189,
425
  "step": 530
426
  },
427
  {
428
  "epoch": 2.01,
429
- "grad_norm": 3.9965474605560303,
430
- "learning_rate": 7.992537313432836e-05,
431
- "loss": 0.6142,
432
  "step": 540
433
  },
434
  {
435
  "epoch": 2.05,
436
- "grad_norm": 8.174031257629395,
437
- "learning_rate": 7.955223880597016e-05,
438
- "loss": 0.6144,
439
  "step": 550
440
  },
441
  {
442
  "epoch": 2.09,
443
- "grad_norm": 6.298408508300781,
444
- "learning_rate": 7.917910447761194e-05,
445
- "loss": 0.6173,
446
  "step": 560
447
  },
448
  {
449
  "epoch": 2.13,
450
- "grad_norm": 4.2981157302856445,
451
- "learning_rate": 7.884328358208955e-05,
452
- "loss": 0.4996,
453
  "step": 570
454
  },
455
  {
456
  "epoch": 2.16,
457
- "grad_norm": 3.9534685611724854,
458
- "learning_rate": 7.847014925373135e-05,
459
- "loss": 0.6207,
460
  "step": 580
461
  },
462
  {
463
  "epoch": 2.2,
464
- "grad_norm": 5.578802585601807,
465
- "learning_rate": 7.809701492537313e-05,
466
- "loss": 0.4617,
467
  "step": 590
468
  },
469
  {
470
  "epoch": 2.24,
471
- "grad_norm": 6.687201023101807,
472
- "learning_rate": 7.772388059701493e-05,
473
- "loss": 0.5063,
474
  "step": 600
475
  },
476
  {
477
  "epoch": 2.24,
478
- "eval_accuracy": 0.843585237258348,
479
- "eval_loss": 0.5662876963615417,
480
- "eval_runtime": 8.5435,
481
- "eval_samples_per_second": 66.6,
482
- "eval_steps_per_second": 8.427,
483
  "step": 600
484
  },
485
  {
486
  "epoch": 2.28,
487
- "grad_norm": 8.274934768676758,
488
- "learning_rate": 7.735074626865672e-05,
489
- "loss": 0.5806,
490
  "step": 610
491
  },
492
  {
493
  "epoch": 2.31,
494
- "grad_norm": 2.605114221572876,
495
- "learning_rate": 7.697761194029851e-05,
496
- "loss": 0.4366,
497
  "step": 620
498
  },
499
  {
500
  "epoch": 2.35,
501
- "grad_norm": 7.271874904632568,
502
- "learning_rate": 7.66044776119403e-05,
503
- "loss": 0.5235,
504
  "step": 630
505
  },
506
  {
507
  "epoch": 2.39,
508
- "grad_norm": 9.129398345947266,
509
- "learning_rate": 7.623134328358208e-05,
510
- "loss": 0.5998,
511
  "step": 640
512
  },
513
  {
514
  "epoch": 2.43,
515
- "grad_norm": 4.308353900909424,
516
- "learning_rate": 7.585820895522388e-05,
517
- "loss": 0.4727,
518
  "step": 650
519
  },
520
  {
521
  "epoch": 2.46,
522
- "grad_norm": 2.917837381362915,
523
- "learning_rate": 7.548507462686567e-05,
524
- "loss": 0.5581,
525
  "step": 660
526
  },
527
  {
528
  "epoch": 2.5,
529
- "grad_norm": 2.3711228370666504,
530
- "learning_rate": 7.511194029850747e-05,
531
- "loss": 0.4911,
532
  "step": 670
533
  },
534
  {
535
  "epoch": 2.54,
536
- "grad_norm": 5.023218631744385,
537
- "learning_rate": 7.473880597014926e-05,
538
- "loss": 0.5413,
539
  "step": 680
540
  },
541
  {
542
  "epoch": 2.57,
543
- "grad_norm": 4.481651306152344,
544
- "learning_rate": 7.436567164179105e-05,
545
- "loss": 0.5846,
546
  "step": 690
547
  },
548
  {
549
  "epoch": 2.61,
550
- "grad_norm": 5.410364627838135,
551
- "learning_rate": 7.399253731343285e-05,
552
- "loss": 0.5071,
553
  "step": 700
554
  },
555
  {
556
  "epoch": 2.61,
557
- "eval_accuracy": 0.8347978910369068,
558
- "eval_loss": 0.5480624437332153,
559
- "eval_runtime": 8.3917,
560
- "eval_samples_per_second": 67.805,
561
- "eval_steps_per_second": 8.58,
562
  "step": 700
563
  },
564
  {
565
  "epoch": 2.65,
566
- "grad_norm": 7.889952659606934,
567
- "learning_rate": 7.361940298507463e-05,
568
- "loss": 0.5043,
569
  "step": 710
570
  },
571
  {
572
  "epoch": 2.69,
573
- "grad_norm": 5.345368385314941,
574
- "learning_rate": 7.324626865671642e-05,
575
- "loss": 0.4811,
576
  "step": 720
577
  },
578
  {
579
  "epoch": 2.72,
580
- "grad_norm": 5.252166748046875,
581
- "learning_rate": 7.287313432835822e-05,
582
- "loss": 0.557,
583
  "step": 730
584
  },
585
  {
586
  "epoch": 2.76,
587
- "grad_norm": 2.338393449783325,
588
- "learning_rate": 7.25e-05,
589
- "loss": 0.4324,
590
  "step": 740
591
  },
592
  {
593
  "epoch": 2.8,
594
- "grad_norm": 4.905025005340576,
595
- "learning_rate": 7.21268656716418e-05,
596
- "loss": 0.4682,
597
  "step": 750
598
  },
599
  {
600
  "epoch": 2.84,
601
- "grad_norm": 9.954655647277832,
602
- "learning_rate": 7.175373134328358e-05,
603
- "loss": 0.6284,
604
  "step": 760
605
  },
606
  {
607
  "epoch": 2.87,
608
- "grad_norm": 6.993457317352295,
609
- "learning_rate": 7.138059701492538e-05,
610
- "loss": 0.6658,
611
  "step": 770
612
  },
613
  {
614
  "epoch": 2.91,
615
- "grad_norm": 7.744531631469727,
616
- "learning_rate": 7.100746268656717e-05,
617
- "loss": 0.5796,
618
  "step": 780
619
  },
620
  {
621
  "epoch": 2.95,
622
- "grad_norm": 5.5365519523620605,
623
- "learning_rate": 7.063432835820895e-05,
624
- "loss": 0.4847,
625
  "step": 790
626
  },
627
  {
628
  "epoch": 2.99,
629
- "grad_norm": 2.869462251663208,
630
- "learning_rate": 7.026119402985075e-05,
631
- "loss": 0.579,
632
  "step": 800
633
  },
634
  {
635
  "epoch": 2.99,
636
- "eval_accuracy": 0.8260105448154658,
637
- "eval_loss": 0.5727049708366394,
638
- "eval_runtime": 8.478,
639
- "eval_samples_per_second": 67.115,
640
- "eval_steps_per_second": 8.493,
641
  "step": 800
642
  },
643
  {
644
  "epoch": 3.02,
645
- "grad_norm": 3.311018228530884,
646
- "learning_rate": 6.988805970149253e-05,
647
- "loss": 0.3958,
648
  "step": 810
649
  },
650
  {
651
  "epoch": 3.06,
652
- "grad_norm": 2.3017098903656006,
653
- "learning_rate": 6.951492537313433e-05,
654
- "loss": 0.3865,
655
  "step": 820
656
  },
657
  {
658
  "epoch": 3.1,
659
- "grad_norm": 6.653348445892334,
660
- "learning_rate": 6.914179104477613e-05,
661
- "loss": 0.4278,
662
  "step": 830
663
  },
664
  {
665
  "epoch": 3.13,
666
- "grad_norm": 6.431192874908447,
667
- "learning_rate": 6.876865671641792e-05,
668
- "loss": 0.515,
669
  "step": 840
670
  },
671
  {
672
  "epoch": 3.17,
673
- "grad_norm": 5.057152271270752,
674
- "learning_rate": 6.83955223880597e-05,
675
- "loss": 0.409,
676
  "step": 850
677
  },
678
  {
679
  "epoch": 3.21,
680
- "grad_norm": 7.399530410766602,
681
- "learning_rate": 6.802238805970149e-05,
682
- "loss": 0.5062,
683
  "step": 860
684
  },
685
  {
686
  "epoch": 3.25,
687
- "grad_norm": 13.14486026763916,
688
- "learning_rate": 6.764925373134328e-05,
689
- "loss": 0.536,
690
  "step": 870
691
  },
692
  {
693
  "epoch": 3.28,
694
- "grad_norm": 4.1537628173828125,
695
- "learning_rate": 6.727611940298508e-05,
696
- "loss": 0.4641,
697
  "step": 880
698
  },
699
  {
700
  "epoch": 3.32,
701
- "grad_norm": 6.63462495803833,
702
- "learning_rate": 6.690298507462687e-05,
703
- "loss": 0.5114,
704
  "step": 890
705
  },
706
  {
707
  "epoch": 3.36,
708
- "grad_norm": 2.6790735721588135,
709
- "learning_rate": 6.652985074626867e-05,
710
- "loss": 0.4298,
711
  "step": 900
712
  },
713
  {
714
  "epoch": 3.36,
715
- "eval_accuracy": 0.827768014059754,
716
- "eval_loss": 0.5905970931053162,
717
- "eval_runtime": 8.3784,
718
- "eval_samples_per_second": 67.913,
719
- "eval_steps_per_second": 8.594,
720
  "step": 900
721
  },
722
  {
723
  "epoch": 3.4,
724
- "grad_norm": 2.221060276031494,
725
- "learning_rate": 6.615671641791045e-05,
726
- "loss": 0.4404,
727
  "step": 910
728
  },
729
  {
730
  "epoch": 3.43,
731
- "grad_norm": 3.257672071456909,
732
- "learning_rate": 6.578358208955225e-05,
733
- "loss": 0.3023,
734
  "step": 920
735
  },
736
  {
737
  "epoch": 3.47,
738
- "grad_norm": 10.71721363067627,
739
- "learning_rate": 6.541044776119403e-05,
740
- "loss": 0.3513,
741
  "step": 930
742
  },
743
  {
744
  "epoch": 3.51,
745
- "grad_norm": 7.456138610839844,
746
- "learning_rate": 6.503731343283582e-05,
747
- "loss": 0.4613,
748
  "step": 940
749
  },
750
  {
751
  "epoch": 3.54,
752
- "grad_norm": 1.9394311904907227,
753
- "learning_rate": 6.466417910447762e-05,
754
- "loss": 0.4467,
755
  "step": 950
756
  },
757
  {
758
  "epoch": 3.58,
759
- "grad_norm": 4.59872579574585,
760
- "learning_rate": 6.42910447761194e-05,
761
- "loss": 0.3312,
762
  "step": 960
763
  },
764
  {
765
  "epoch": 3.62,
766
- "grad_norm": 4.213327407836914,
767
- "learning_rate": 6.39179104477612e-05,
768
- "loss": 0.4384,
769
  "step": 970
770
  },
771
  {
772
  "epoch": 3.66,
773
- "grad_norm": 6.511275768280029,
774
- "learning_rate": 6.3544776119403e-05,
775
- "loss": 0.3468,
776
  "step": 980
777
  },
778
  {
779
  "epoch": 3.69,
780
- "grad_norm": 6.56002950668335,
781
- "learning_rate": 6.317164179104478e-05,
782
- "loss": 0.3614,
783
  "step": 990
784
  },
785
  {
786
  "epoch": 3.73,
787
- "grad_norm": 8.825058937072754,
788
- "learning_rate": 6.279850746268657e-05,
789
- "loss": 0.3999,
790
  "step": 1000
791
  },
792
  {
793
  "epoch": 3.73,
794
- "eval_accuracy": 0.8347978910369068,
795
- "eval_loss": 0.5692991018295288,
796
- "eval_runtime": 8.4026,
797
- "eval_samples_per_second": 67.717,
798
- "eval_steps_per_second": 8.569,
799
  "step": 1000
800
  },
801
  {
802
  "epoch": 3.77,
803
- "grad_norm": 6.40085506439209,
804
- "learning_rate": 6.242537313432835e-05,
805
- "loss": 0.4494,
806
  "step": 1010
807
  },
808
  {
809
  "epoch": 3.81,
810
- "grad_norm": 2.9111902713775635,
811
- "learning_rate": 6.205223880597015e-05,
812
- "loss": 0.46,
813
  "step": 1020
814
  },
815
  {
816
  "epoch": 3.84,
817
- "grad_norm": 5.360230922698975,
818
- "learning_rate": 6.167910447761195e-05,
819
- "loss": 0.4846,
820
  "step": 1030
821
  },
822
  {
823
  "epoch": 3.88,
824
- "grad_norm": 5.617285251617432,
825
- "learning_rate": 6.130597014925373e-05,
826
- "loss": 0.5172,
827
  "step": 1040
828
  },
829
  {
830
  "epoch": 3.92,
831
- "grad_norm": 8.090892791748047,
832
- "learning_rate": 6.0932835820895526e-05,
833
- "loss": 0.479,
834
  "step": 1050
835
  },
836
  {
837
  "epoch": 3.96,
838
- "grad_norm": 7.056848526000977,
839
- "learning_rate": 6.055970149253731e-05,
840
- "loss": 0.529,
841
  "step": 1060
842
  },
843
  {
844
  "epoch": 3.99,
845
- "grad_norm": 1.4730439186096191,
846
- "learning_rate": 6.018656716417911e-05,
847
- "loss": 0.3414,
848
  "step": 1070
849
  },
850
  {
851
  "epoch": 4.03,
852
- "grad_norm": 1.0286493301391602,
853
- "learning_rate": 5.9813432835820894e-05,
854
- "loss": 0.3596,
855
  "step": 1080
856
  },
857
  {
858
  "epoch": 4.07,
859
- "grad_norm": 4.1192779541015625,
860
- "learning_rate": 5.9440298507462686e-05,
861
- "loss": 0.3072,
862
  "step": 1090
863
  },
864
  {
865
  "epoch": 4.1,
866
- "grad_norm": 7.781830787658691,
867
- "learning_rate": 5.9067164179104484e-05,
868
- "loss": 0.414,
869
  "step": 1100
870
  },
871
  {
872
  "epoch": 4.1,
873
- "eval_accuracy": 0.8400702987697716,
874
- "eval_loss": 0.5115455985069275,
875
- "eval_runtime": 8.3668,
876
- "eval_samples_per_second": 68.007,
877
- "eval_steps_per_second": 8.605,
878
  "step": 1100
879
  },
880
  {
881
  "epoch": 4.14,
882
- "grad_norm": 6.427892208099365,
883
- "learning_rate": 5.869402985074627e-05,
884
- "loss": 0.3508,
885
  "step": 1110
886
  },
887
  {
888
  "epoch": 4.18,
889
- "grad_norm": 6.545078277587891,
890
- "learning_rate": 5.832089552238807e-05,
891
- "loss": 0.3952,
892
  "step": 1120
893
  },
894
  {
895
  "epoch": 4.22,
896
- "grad_norm": 10.781569480895996,
897
- "learning_rate": 5.7947761194029845e-05,
898
- "loss": 0.2817,
899
  "step": 1130
900
  },
901
  {
902
  "epoch": 4.25,
903
- "grad_norm": 7.064284324645996,
904
- "learning_rate": 5.7574626865671643e-05,
905
- "loss": 0.4672,
906
  "step": 1140
907
  },
908
  {
909
  "epoch": 4.29,
910
- "grad_norm": 10.146831512451172,
911
- "learning_rate": 5.720149253731344e-05,
912
- "loss": 0.3896,
913
  "step": 1150
914
  },
915
  {
916
  "epoch": 4.33,
917
- "grad_norm": 3.460674285888672,
918
- "learning_rate": 5.6828358208955227e-05,
919
- "loss": 0.3863,
920
  "step": 1160
921
  },
922
  {
923
  "epoch": 4.37,
924
- "grad_norm": 1.625980257987976,
925
- "learning_rate": 5.645522388059702e-05,
926
- "loss": 0.3756,
927
  "step": 1170
928
  },
929
  {
930
  "epoch": 4.4,
931
- "grad_norm": 4.2397074699401855,
932
- "learning_rate": 5.60820895522388e-05,
933
- "loss": 0.317,
934
  "step": 1180
935
  },
936
  {
937
  "epoch": 4.44,
938
- "grad_norm": 7.672292232513428,
939
- "learning_rate": 5.57089552238806e-05,
940
- "loss": 0.3034,
941
  "step": 1190
942
  },
943
  {
944
  "epoch": 4.48,
945
- "grad_norm": 5.202517032623291,
946
- "learning_rate": 5.533582089552239e-05,
947
- "loss": 0.408,
948
  "step": 1200
949
  },
950
  {
951
  "epoch": 4.48,
952
- "eval_accuracy": 0.836555360281195,
953
- "eval_loss": 0.5280522704124451,
954
- "eval_runtime": 8.3443,
955
- "eval_samples_per_second": 68.19,
956
- "eval_steps_per_second": 8.629,
957
  "step": 1200
958
  },
959
  {
960
  "epoch": 4.51,
961
- "grad_norm": 4.860721111297607,
962
- "learning_rate": 5.496268656716418e-05,
963
- "loss": 0.4828,
964
  "step": 1210
965
  },
966
  {
967
  "epoch": 4.55,
968
- "grad_norm": 6.979416370391846,
969
- "learning_rate": 5.4589552238805976e-05,
970
- "loss": 0.3471,
971
  "step": 1220
972
  },
973
  {
974
  "epoch": 4.59,
975
- "grad_norm": 0.8708503246307373,
976
- "learning_rate": 5.421641791044776e-05,
977
- "loss": 0.3038,
978
  "step": 1230
979
  },
980
  {
981
  "epoch": 4.63,
982
- "grad_norm": 8.209038734436035,
983
- "learning_rate": 5.384328358208955e-05,
984
- "loss": 0.4865,
985
  "step": 1240
986
  },
987
  {
988
  "epoch": 4.66,
989
- "grad_norm": 2.992311716079712,
990
- "learning_rate": 5.347014925373135e-05,
991
- "loss": 0.2837,
992
  "step": 1250
993
  },
994
  {
995
  "epoch": 4.7,
996
- "grad_norm": 3.729401111602783,
997
- "learning_rate": 5.3097014925373136e-05,
998
- "loss": 0.2387,
999
  "step": 1260
1000
  },
1001
  {
1002
  "epoch": 4.74,
1003
- "grad_norm": 5.371230602264404,
1004
- "learning_rate": 5.272388059701493e-05,
1005
- "loss": 0.4279,
1006
  "step": 1270
1007
  },
1008
  {
1009
  "epoch": 4.78,
1010
- "grad_norm": 5.8051652908325195,
1011
- "learning_rate": 5.235074626865671e-05,
1012
- "loss": 0.4554,
1013
  "step": 1280
1014
  },
1015
  {
1016
  "epoch": 4.81,
1017
- "grad_norm": 4.095101833343506,
1018
- "learning_rate": 5.197761194029851e-05,
1019
- "loss": 0.345,
1020
  "step": 1290
1021
  },
1022
  {
1023
  "epoch": 4.85,
1024
- "grad_norm": 1.840154767036438,
1025
- "learning_rate": 5.16044776119403e-05,
1026
- "loss": 0.2794,
1027
  "step": 1300
1028
  },
1029
  {
1030
  "epoch": 4.85,
1031
- "eval_accuracy": 0.8453427065026362,
1032
- "eval_loss": 0.5268673300743103,
1033
- "eval_runtime": 8.2977,
1034
- "eval_samples_per_second": 68.573,
1035
- "eval_steps_per_second": 8.677,
1036
  "step": 1300
1037
  },
1038
  {
1039
  "epoch": 4.89,
1040
- "grad_norm": 2.5061440467834473,
1041
- "learning_rate": 5.123134328358209e-05,
1042
- "loss": 0.3043,
1043
  "step": 1310
1044
  },
1045
  {
1046
  "epoch": 4.93,
1047
- "grad_norm": 1.3759231567382812,
1048
- "learning_rate": 5.0858208955223885e-05,
1049
- "loss": 0.3049,
1050
  "step": 1320
1051
  },
1052
  {
1053
  "epoch": 4.96,
1054
- "grad_norm": 4.22188138961792,
1055
- "learning_rate": 5.048507462686567e-05,
1056
- "loss": 0.4139,
1057
  "step": 1330
1058
  },
1059
  {
1060
  "epoch": 5.0,
1061
- "grad_norm": 8.63315200805664,
1062
- "learning_rate": 5.011194029850746e-05,
1063
- "loss": 0.3967,
1064
  "step": 1340
1065
  },
1066
  {
1067
  "epoch": 5.04,
1068
- "grad_norm": 5.766853332519531,
1069
- "learning_rate": 4.973880597014925e-05,
1070
- "loss": 0.3292,
1071
  "step": 1350
1072
  },
1073
  {
1074
  "epoch": 5.07,
1075
- "grad_norm": 5.394406318664551,
1076
- "learning_rate": 4.9365671641791045e-05,
1077
- "loss": 0.3139,
1078
  "step": 1360
1079
  },
1080
  {
1081
  "epoch": 5.11,
1082
- "grad_norm": 2.7173523902893066,
1083
- "learning_rate": 4.899253731343284e-05,
1084
- "loss": 0.2619,
1085
  "step": 1370
1086
  },
1087
  {
1088
  "epoch": 5.15,
1089
- "grad_norm": 5.464639663696289,
1090
- "learning_rate": 4.861940298507463e-05,
1091
- "loss": 0.3413,
1092
  "step": 1380
1093
  },
1094
  {
1095
  "epoch": 5.19,
1096
- "grad_norm": 5.520565032958984,
1097
- "learning_rate": 4.824626865671642e-05,
1098
- "loss": 0.2433,
1099
  "step": 1390
1100
  },
1101
  {
1102
  "epoch": 5.22,
1103
- "grad_norm": 7.165010929107666,
1104
- "learning_rate": 4.787313432835821e-05,
1105
- "loss": 0.3881,
1106
  "step": 1400
1107
  },
1108
  {
1109
  "epoch": 5.22,
1110
- "eval_accuracy": 0.8471001757469244,
1111
- "eval_loss": 0.5245617032051086,
1112
- "eval_runtime": 8.3138,
1113
- "eval_samples_per_second": 68.44,
1114
- "eval_steps_per_second": 8.66,
1115
  "step": 1400
1116
  },
1117
  {
1118
  "epoch": 5.26,
1119
- "grad_norm": 9.019377708435059,
1120
- "learning_rate": 4.75e-05,
1121
- "loss": 0.3921,
1122
  "step": 1410
1123
  },
1124
  {
1125
  "epoch": 5.3,
1126
- "grad_norm": 2.8719711303710938,
1127
- "learning_rate": 4.7126865671641794e-05,
1128
- "loss": 0.2581,
1129
  "step": 1420
1130
  },
1131
  {
1132
  "epoch": 5.34,
1133
- "grad_norm": 1.599303126335144,
1134
- "learning_rate": 4.6753731343283586e-05,
1135
- "loss": 0.3145,
1136
  "step": 1430
1137
  },
1138
  {
1139
  "epoch": 5.37,
1140
- "grad_norm": 0.6838473677635193,
1141
- "learning_rate": 4.638059701492538e-05,
1142
- "loss": 0.2265,
1143
  "step": 1440
1144
  },
1145
  {
1146
  "epoch": 5.41,
1147
- "grad_norm": 4.16595983505249,
1148
- "learning_rate": 4.600746268656716e-05,
1149
- "loss": 0.2819,
1150
  "step": 1450
1151
  },
1152
  {
1153
  "epoch": 5.45,
1154
- "grad_norm": 4.727346897125244,
1155
- "learning_rate": 4.5634328358208954e-05,
1156
- "loss": 0.472,
1157
  "step": 1460
1158
  },
1159
  {
1160
  "epoch": 5.49,
1161
- "grad_norm": 5.769215106964111,
1162
- "learning_rate": 4.526119402985075e-05,
1163
- "loss": 0.2837,
1164
  "step": 1470
1165
  },
1166
  {
1167
  "epoch": 5.52,
1168
- "grad_norm": 2.133538246154785,
1169
- "learning_rate": 4.4888059701492544e-05,
1170
- "loss": 0.3245,
1171
  "step": 1480
1172
  },
1173
  {
1174
  "epoch": 5.56,
1175
- "grad_norm": 4.680506229400635,
1176
- "learning_rate": 4.451492537313433e-05,
1177
- "loss": 0.3059,
1178
  "step": 1490
1179
  },
1180
  {
1181
  "epoch": 5.6,
1182
- "grad_norm": 1.788087248802185,
1183
- "learning_rate": 4.414179104477612e-05,
1184
- "loss": 0.3097,
1185
  "step": 1500
1186
  },
1187
  {
1188
  "epoch": 5.6,
1189
- "eval_accuracy": 0.8523725834797891,
1190
- "eval_loss": 0.49264049530029297,
1191
- "eval_runtime": 8.432,
1192
- "eval_samples_per_second": 67.481,
1193
- "eval_steps_per_second": 8.539,
1194
  "step": 1500
1195
  },
1196
  {
1197
  "epoch": 5.63,
1198
- "grad_norm": 4.880031108856201,
1199
- "learning_rate": 4.376865671641791e-05,
1200
- "loss": 0.2706,
1201
  "step": 1510
1202
  },
1203
  {
1204
  "epoch": 5.67,
1205
- "grad_norm": 1.7767056226730347,
1206
- "learning_rate": 4.33955223880597e-05,
1207
- "loss": 0.1966,
1208
  "step": 1520
1209
  },
1210
  {
1211
  "epoch": 5.71,
1212
- "grad_norm": 3.718566656112671,
1213
- "learning_rate": 4.3022388059701495e-05,
1214
- "loss": 0.3677,
1215
  "step": 1530
1216
  },
1217
  {
1218
  "epoch": 5.75,
1219
- "grad_norm": 5.989944934844971,
1220
- "learning_rate": 4.2649253731343286e-05,
1221
- "loss": 0.3162,
1222
  "step": 1540
1223
  },
1224
  {
1225
  "epoch": 5.78,
1226
- "grad_norm": 2.0375638008117676,
1227
- "learning_rate": 4.227611940298508e-05,
1228
- "loss": 0.3895,
1229
  "step": 1550
1230
  },
1231
  {
1232
  "epoch": 5.82,
1233
- "grad_norm": 6.132637023925781,
1234
- "learning_rate": 4.190298507462686e-05,
1235
- "loss": 0.3279,
1236
  "step": 1560
1237
  },
1238
  {
1239
  "epoch": 5.86,
1240
- "grad_norm": 1.606240153312683,
1241
- "learning_rate": 4.152985074626866e-05,
1242
- "loss": 0.2464,
1243
  "step": 1570
1244
  },
1245
  {
1246
  "epoch": 5.9,
1247
- "grad_norm": 1.298972249031067,
1248
- "learning_rate": 4.115671641791045e-05,
1249
- "loss": 0.3232,
1250
  "step": 1580
1251
  },
1252
  {
1253
  "epoch": 5.93,
1254
- "grad_norm": 5.93351936340332,
1255
- "learning_rate": 4.0783582089552244e-05,
1256
- "loss": 0.4001,
1257
  "step": 1590
1258
  },
1259
  {
1260
  "epoch": 5.97,
1261
- "grad_norm": 7.276583194732666,
1262
- "learning_rate": 4.041044776119403e-05,
1263
- "loss": 0.2272,
1264
  "step": 1600
1265
  },
1266
  {
1267
  "epoch": 5.97,
1268
- "eval_accuracy": 0.8506151142355008,
1269
- "eval_loss": 0.5247848629951477,
1270
- "eval_runtime": 8.5315,
1271
- "eval_samples_per_second": 66.694,
1272
- "eval_steps_per_second": 8.439,
1273
  "step": 1600
1274
  },
1275
  {
1276
  "epoch": 6.01,
1277
- "grad_norm": 6.495110988616943,
1278
- "learning_rate": 4.003731343283582e-05,
1279
- "loss": 0.543,
1280
  "step": 1610
1281
  },
1282
  {
1283
  "epoch": 6.04,
1284
- "grad_norm": 4.471397399902344,
1285
- "learning_rate": 3.966417910447761e-05,
1286
- "loss": 0.3181,
1287
  "step": 1620
1288
  },
1289
  {
1290
  "epoch": 6.08,
1291
- "grad_norm": 1.866243600845337,
1292
- "learning_rate": 3.9291044776119404e-05,
1293
- "loss": 0.3089,
1294
  "step": 1630
1295
  },
1296
  {
1297
  "epoch": 6.12,
1298
- "grad_norm": 0.6558152437210083,
1299
- "learning_rate": 3.8917910447761195e-05,
1300
- "loss": 0.2184,
1301
  "step": 1640
1302
  },
1303
  {
1304
  "epoch": 6.16,
1305
- "grad_norm": 5.176995277404785,
1306
- "learning_rate": 3.854477611940299e-05,
1307
- "loss": 0.339,
1308
  "step": 1650
1309
  },
1310
  {
1311
  "epoch": 6.19,
1312
- "grad_norm": 4.923765659332275,
1313
- "learning_rate": 3.817164179104478e-05,
1314
- "loss": 0.2766,
1315
  "step": 1660
1316
  },
1317
  {
1318
  "epoch": 6.23,
1319
- "grad_norm": 7.87476921081543,
1320
- "learning_rate": 3.7798507462686563e-05,
1321
- "loss": 0.2416,
1322
  "step": 1670
1323
  },
1324
  {
1325
  "epoch": 6.27,
1326
- "grad_norm": 0.2711706757545471,
1327
- "learning_rate": 3.742537313432836e-05,
1328
- "loss": 0.2525,
1329
  "step": 1680
1330
  },
1331
  {
1332
  "epoch": 6.31,
1333
- "grad_norm": 3.922217607498169,
1334
- "learning_rate": 3.7052238805970153e-05,
1335
- "loss": 0.2029,
1336
  "step": 1690
1337
  },
1338
  {
1339
  "epoch": 6.34,
1340
- "grad_norm": 3.762434244155884,
1341
- "learning_rate": 3.6679104477611945e-05,
1342
- "loss": 0.2796,
1343
  "step": 1700
1344
  },
1345
  {
1346
  "epoch": 6.34,
1347
- "eval_accuracy": 0.8611599297012302,
1348
- "eval_loss": 0.5052834749221802,
1349
- "eval_runtime": 8.3793,
1350
- "eval_samples_per_second": 67.905,
1351
- "eval_steps_per_second": 8.593,
1352
  "step": 1700
1353
  },
1354
  {
1355
  "epoch": 6.38,
1356
- "grad_norm": 7.634174823760986,
1357
- "learning_rate": 3.630597014925373e-05,
1358
- "loss": 0.3526,
1359
  "step": 1710
1360
  },
1361
  {
1362
  "epoch": 6.42,
1363
- "grad_norm": 3.1712279319763184,
1364
- "learning_rate": 3.593283582089552e-05,
1365
- "loss": 0.352,
1366
  "step": 1720
1367
  },
1368
  {
1369
  "epoch": 6.46,
1370
- "grad_norm": 5.504647254943848,
1371
- "learning_rate": 3.555970149253732e-05,
1372
- "loss": 0.3803,
1373
  "step": 1730
1374
  },
1375
  {
1376
  "epoch": 6.49,
1377
- "grad_norm": 3.8011245727539062,
1378
- "learning_rate": 3.5186567164179105e-05,
1379
- "loss": 0.2704,
1380
  "step": 1740
1381
  },
1382
  {
1383
  "epoch": 6.53,
1384
- "grad_norm": 3.952202320098877,
1385
- "learning_rate": 3.4813432835820896e-05,
1386
- "loss": 0.2412,
1387
  "step": 1750
1388
  },
1389
  {
1390
  "epoch": 6.57,
1391
- "grad_norm": 8.292951583862305,
1392
- "learning_rate": 3.444029850746269e-05,
1393
- "loss": 0.4386,
1394
  "step": 1760
1395
  },
1396
  {
1397
  "epoch": 6.6,
1398
- "grad_norm": 7.3667683601379395,
1399
- "learning_rate": 3.406716417910448e-05,
1400
- "loss": 0.3602,
1401
  "step": 1770
1402
  },
1403
  {
1404
  "epoch": 6.64,
1405
- "grad_norm": 5.553083419799805,
1406
- "learning_rate": 3.369402985074627e-05,
1407
- "loss": 0.3948,
1408
  "step": 1780
1409
  },
1410
  {
1411
  "epoch": 6.68,
1412
- "grad_norm": 2.8751885890960693,
1413
- "learning_rate": 3.332089552238806e-05,
1414
- "loss": 0.3627,
1415
  "step": 1790
1416
  },
1417
  {
1418
  "epoch": 6.72,
1419
- "grad_norm": 1.8231642246246338,
1420
- "learning_rate": 3.2947761194029854e-05,
1421
- "loss": 0.3279,
1422
  "step": 1800
1423
  },
1424
  {
1425
  "epoch": 6.72,
1426
- "eval_accuracy": 0.8629173989455184,
1427
- "eval_loss": 0.5018983483314514,
1428
- "eval_runtime": 8.4577,
1429
- "eval_samples_per_second": 67.276,
1430
- "eval_steps_per_second": 8.513,
1431
  "step": 1800
1432
  },
1433
  {
1434
  "epoch": 6.75,
1435
- "grad_norm": 7.642894268035889,
1436
- "learning_rate": 3.2574626865671646e-05,
1437
- "loss": 0.2666,
1438
  "step": 1810
1439
  },
1440
  {
1441
  "epoch": 6.79,
1442
- "grad_norm": 6.284189701080322,
1443
- "learning_rate": 3.220149253731343e-05,
1444
- "loss": 0.2041,
1445
  "step": 1820
1446
  },
1447
  {
1448
  "epoch": 6.83,
1449
- "grad_norm": 0.28246691823005676,
1450
- "learning_rate": 3.182835820895523e-05,
1451
- "loss": 0.3354,
1452
  "step": 1830
1453
  },
1454
  {
1455
  "epoch": 6.87,
1456
- "grad_norm": 1.1345715522766113,
1457
- "learning_rate": 3.145522388059702e-05,
1458
- "loss": 0.2131,
1459
  "step": 1840
1460
  },
1461
  {
1462
  "epoch": 6.9,
1463
- "grad_norm": 3.8102524280548096,
1464
- "learning_rate": 3.1082089552238805e-05,
1465
- "loss": 0.2561,
1466
  "step": 1850
1467
  },
1468
  {
1469
  "epoch": 6.94,
1470
- "grad_norm": 2.3832194805145264,
1471
- "learning_rate": 3.07089552238806e-05,
1472
- "loss": 0.2473,
1473
  "step": 1860
1474
  },
1475
  {
1476
  "epoch": 6.98,
1477
- "grad_norm": 8.078363418579102,
1478
- "learning_rate": 3.033582089552239e-05,
1479
- "loss": 0.1613,
1480
  "step": 1870
1481
  },
1482
  {
1483
  "epoch": 7.01,
1484
- "grad_norm": 7.360495567321777,
1485
- "learning_rate": 2.9962686567164183e-05,
1486
- "loss": 0.2584,
1487
  "step": 1880
1488
  },
1489
  {
1490
  "epoch": 7.05,
1491
- "grad_norm": 3.4815495014190674,
1492
- "learning_rate": 2.958955223880597e-05,
1493
- "loss": 0.294,
1494
  "step": 1890
1495
  },
1496
  {
1497
  "epoch": 7.09,
1498
- "grad_norm": 2.735764980316162,
1499
- "learning_rate": 2.9216417910447763e-05,
1500
- "loss": 0.2674,
1501
  "step": 1900
1502
  },
1503
  {
1504
  "epoch": 7.09,
1505
- "eval_accuracy": 0.8541300527240774,
1506
- "eval_loss": 0.49219682812690735,
1507
- "eval_runtime": 8.6014,
1508
- "eval_samples_per_second": 66.152,
1509
- "eval_steps_per_second": 8.371,
1510
  "step": 1900
1511
  },
1512
  {
1513
  "epoch": 7.13,
1514
- "grad_norm": 7.633571624755859,
1515
- "learning_rate": 2.8843283582089555e-05,
1516
- "loss": 0.2381,
1517
  "step": 1910
1518
  },
1519
  {
1520
  "epoch": 7.16,
1521
- "grad_norm": 4.956969261169434,
1522
- "learning_rate": 2.8470149253731343e-05,
1523
- "loss": 0.2345,
1524
  "step": 1920
1525
  },
1526
  {
1527
  "epoch": 7.2,
1528
- "grad_norm": 4.375622272491455,
1529
- "learning_rate": 2.8097014925373134e-05,
1530
- "loss": 0.1311,
1531
  "step": 1930
1532
  },
1533
  {
1534
  "epoch": 7.24,
1535
- "grad_norm": 4.414794445037842,
1536
- "learning_rate": 2.772388059701493e-05,
1537
- "loss": 0.2952,
1538
  "step": 1940
1539
  },
1540
  {
1541
  "epoch": 7.28,
1542
- "grad_norm": 2.791107654571533,
1543
- "learning_rate": 2.7350746268656718e-05,
1544
- "loss": 0.2819,
1545
  "step": 1950
1546
  },
1547
  {
1548
  "epoch": 7.31,
1549
- "grad_norm": 1.6784512996673584,
1550
- "learning_rate": 2.697761194029851e-05,
1551
- "loss": 0.2262,
1552
  "step": 1960
1553
  },
1554
  {
1555
  "epoch": 7.35,
1556
- "grad_norm": 9.391459465026855,
1557
- "learning_rate": 2.6604477611940297e-05,
1558
- "loss": 0.2454,
1559
  "step": 1970
1560
  },
1561
  {
1562
  "epoch": 7.39,
1563
- "grad_norm": 2.932426929473877,
1564
- "learning_rate": 2.623134328358209e-05,
1565
- "loss": 0.3726,
1566
  "step": 1980
1567
  },
1568
  {
1569
  "epoch": 7.43,
1570
- "grad_norm": 4.181185245513916,
1571
- "learning_rate": 2.5858208955223884e-05,
1572
- "loss": 0.2476,
1573
  "step": 1990
1574
  },
1575
  {
1576
  "epoch": 7.46,
1577
- "grad_norm": 1.8798452615737915,
1578
- "learning_rate": 2.5485074626865672e-05,
1579
- "loss": 0.1587,
1580
  "step": 2000
1581
  },
1582
  {
1583
  "epoch": 7.46,
1584
- "eval_accuracy": 0.8453427065026362,
1585
- "eval_loss": 0.5443013906478882,
1586
- "eval_runtime": 8.3737,
1587
- "eval_samples_per_second": 67.951,
1588
- "eval_steps_per_second": 8.598,
1589
  "step": 2000
1590
  },
1591
  {
1592
  "epoch": 7.5,
1593
- "grad_norm": 3.390342950820923,
1594
- "learning_rate": 2.5111940298507464e-05,
1595
- "loss": 0.1547,
1596
  "step": 2010
1597
  },
1598
  {
1599
  "epoch": 7.54,
1600
- "grad_norm": 5.769399642944336,
1601
- "learning_rate": 2.4738805970149252e-05,
1602
- "loss": 0.3145,
1603
  "step": 2020
1604
  },
1605
  {
1606
  "epoch": 7.57,
1607
- "grad_norm": 2.5483851432800293,
1608
- "learning_rate": 2.4365671641791047e-05,
1609
- "loss": 0.1655,
1610
  "step": 2030
1611
  },
1612
  {
1613
  "epoch": 7.61,
1614
- "grad_norm": 1.7513008117675781,
1615
- "learning_rate": 2.3992537313432835e-05,
1616
- "loss": 0.2675,
1617
  "step": 2040
1618
  },
1619
  {
1620
  "epoch": 7.65,
1621
- "grad_norm": 1.5183019638061523,
1622
- "learning_rate": 2.361940298507463e-05,
1623
- "loss": 0.2594,
1624
  "step": 2050
1625
  },
1626
  {
1627
  "epoch": 7.69,
1628
- "grad_norm": 0.32409217953681946,
1629
- "learning_rate": 2.3246268656716418e-05,
1630
- "loss": 0.1883,
1631
  "step": 2060
1632
  },
1633
  {
1634
  "epoch": 7.72,
1635
- "grad_norm": 0.11139284074306488,
1636
- "learning_rate": 2.287313432835821e-05,
1637
- "loss": 0.3021,
1638
  "step": 2070
1639
  },
1640
  {
1641
  "epoch": 7.76,
1642
- "grad_norm": 7.34183406829834,
1643
- "learning_rate": 2.25e-05,
1644
- "loss": 0.2072,
1645
  "step": 2080
1646
  },
1647
  {
1648
  "epoch": 7.8,
1649
- "grad_norm": 5.418173789978027,
1650
- "learning_rate": 2.2126865671641793e-05,
1651
- "loss": 0.3153,
1652
  "step": 2090
1653
  },
1654
  {
1655
  "epoch": 7.84,
1656
- "grad_norm": 5.986533164978027,
1657
- "learning_rate": 2.1753731343283585e-05,
1658
- "loss": 0.1969,
1659
  "step": 2100
1660
  },
1661
  {
1662
  "epoch": 7.84,
1663
- "eval_accuracy": 0.8453427065026362,
1664
- "eval_loss": 0.49669986963272095,
1665
- "eval_runtime": 8.6875,
1666
- "eval_samples_per_second": 65.496,
1667
- "eval_steps_per_second": 8.288,
1668
  "step": 2100
1669
  },
1670
  {
1671
  "epoch": 7.87,
1672
- "grad_norm": 3.97404146194458,
1673
- "learning_rate": 2.1380597014925373e-05,
1674
- "loss": 0.226,
1675
  "step": 2110
1676
  },
1677
  {
1678
  "epoch": 7.91,
1679
- "grad_norm": 0.669471263885498,
1680
- "learning_rate": 2.1007462686567164e-05,
1681
- "loss": 0.1694,
1682
  "step": 2120
1683
  },
1684
  {
1685
  "epoch": 7.95,
1686
- "grad_norm": 2.947169303894043,
1687
- "learning_rate": 2.0634328358208956e-05,
1688
- "loss": 0.2161,
1689
  "step": 2130
1690
  },
1691
  {
1692
  "epoch": 7.99,
1693
- "grad_norm": 2.7200162410736084,
1694
- "learning_rate": 2.0261194029850748e-05,
1695
- "loss": 0.193,
1696
  "step": 2140
1697
  },
1698
  {
1699
  "epoch": 8.02,
1700
- "grad_norm": 5.843700885772705,
1701
- "learning_rate": 1.988805970149254e-05,
1702
- "loss": 0.1613,
1703
  "step": 2150
1704
  },
1705
  {
1706
  "epoch": 8.06,
1707
- "grad_norm": 4.60325288772583,
1708
- "learning_rate": 1.951492537313433e-05,
1709
- "loss": 0.1515,
1710
  "step": 2160
1711
  },
1712
  {
1713
  "epoch": 8.1,
1714
- "grad_norm": 2.5431127548217773,
1715
- "learning_rate": 1.914179104477612e-05,
1716
- "loss": 0.11,
1717
  "step": 2170
1718
  },
1719
  {
1720
  "epoch": 8.13,
1721
- "grad_norm": 6.271789073944092,
1722
- "learning_rate": 1.8768656716417914e-05,
1723
- "loss": 0.267,
1724
  "step": 2180
1725
  },
1726
  {
1727
  "epoch": 8.17,
1728
- "grad_norm": 5.760239601135254,
1729
- "learning_rate": 1.8395522388059702e-05,
1730
- "loss": 0.182,
1731
  "step": 2190
1732
  },
1733
  {
1734
  "epoch": 8.21,
1735
- "grad_norm": 6.8685407638549805,
1736
- "learning_rate": 1.8022388059701494e-05,
1737
- "loss": 0.2137,
1738
  "step": 2200
1739
  },
1740
  {
1741
  "epoch": 8.21,
1742
- "eval_accuracy": 0.8453427065026362,
1743
- "eval_loss": 0.5276018977165222,
1744
- "eval_runtime": 8.5437,
1745
- "eval_samples_per_second": 66.599,
1746
- "eval_steps_per_second": 8.427,
1747
  "step": 2200
1748
  },
1749
  {
1750
  "epoch": 8.25,
1751
- "grad_norm": 3.27811598777771,
1752
- "learning_rate": 1.7649253731343285e-05,
1753
- "loss": 0.2199,
1754
  "step": 2210
1755
  },
1756
  {
1757
  "epoch": 8.28,
1758
- "grad_norm": 6.934047698974609,
1759
- "learning_rate": 1.7276119402985073e-05,
1760
- "loss": 0.1897,
1761
  "step": 2220
1762
  },
1763
  {
1764
  "epoch": 8.32,
1765
- "grad_norm": 2.9394731521606445,
1766
- "learning_rate": 1.690298507462687e-05,
1767
- "loss": 0.1746,
1768
  "step": 2230
1769
  },
1770
  {
1771
  "epoch": 8.36,
1772
- "grad_norm": 3.681551694869995,
1773
- "learning_rate": 1.6529850746268657e-05,
1774
- "loss": 0.2328,
1775
  "step": 2240
1776
  },
1777
  {
1778
  "epoch": 8.4,
1779
- "grad_norm": 3.270867109298706,
1780
- "learning_rate": 1.6156716417910448e-05,
1781
- "loss": 0.1938,
1782
  "step": 2250
1783
  },
1784
  {
1785
  "epoch": 8.43,
1786
- "grad_norm": 1.7609107494354248,
1787
- "learning_rate": 1.578358208955224e-05,
1788
- "loss": 0.2182,
1789
  "step": 2260
1790
  },
1791
  {
1792
  "epoch": 8.47,
1793
- "grad_norm": 8.35375690460205,
1794
- "learning_rate": 1.541044776119403e-05,
1795
- "loss": 0.2743,
1796
  "step": 2270
1797
  },
1798
  {
1799
  "epoch": 8.51,
1800
- "grad_norm": 3.129824161529541,
1801
- "learning_rate": 1.5037313432835823e-05,
1802
- "loss": 0.2801,
1803
  "step": 2280
1804
  },
1805
  {
1806
  "epoch": 8.54,
1807
- "grad_norm": 6.506752014160156,
1808
- "learning_rate": 1.4664179104477613e-05,
1809
- "loss": 0.2695,
1810
  "step": 2290
1811
  },
1812
  {
1813
  "epoch": 8.58,
1814
- "grad_norm": 4.825931549072266,
1815
- "learning_rate": 1.4291044776119403e-05,
1816
- "loss": 0.1523,
1817
  "step": 2300
1818
  },
1819
  {
1820
  "epoch": 8.58,
1821
- "eval_accuracy": 0.8400702987697716,
1822
- "eval_loss": 0.557367742061615,
1823
- "eval_runtime": 8.4141,
1824
- "eval_samples_per_second": 67.625,
1825
- "eval_steps_per_second": 8.557,
1826
  "step": 2300
1827
  },
1828
  {
1829
  "epoch": 8.62,
1830
- "grad_norm": 2.8614978790283203,
1831
- "learning_rate": 1.3917910447761196e-05,
1832
- "loss": 0.3248,
1833
  "step": 2310
1834
  },
1835
  {
1836
  "epoch": 8.66,
1837
- "grad_norm": 4.763182163238525,
1838
- "learning_rate": 1.3544776119402986e-05,
1839
- "loss": 0.2438,
1840
  "step": 2320
1841
  },
1842
  {
1843
  "epoch": 8.69,
1844
- "grad_norm": 9.630255699157715,
1845
- "learning_rate": 1.3171641791044777e-05,
1846
- "loss": 0.1735,
1847
  "step": 2330
1848
  },
1849
  {
1850
  "epoch": 8.73,
1851
- "grad_norm": 0.24263332784175873,
1852
- "learning_rate": 1.2798507462686567e-05,
1853
- "loss": 0.178,
1854
  "step": 2340
1855
  },
1856
  {
1857
  "epoch": 8.77,
1858
- "grad_norm": 5.015183448791504,
1859
- "learning_rate": 1.2425373134328359e-05,
1860
- "loss": 0.2226,
1861
  "step": 2350
1862
  },
1863
  {
1864
  "epoch": 8.81,
1865
- "grad_norm": 3.0591747760772705,
1866
- "learning_rate": 1.2052238805970149e-05,
1867
- "loss": 0.1448,
1868
  "step": 2360
1869
  },
1870
  {
1871
  "epoch": 8.84,
1872
- "grad_norm": 2.034698247909546,
1873
- "learning_rate": 1.167910447761194e-05,
1874
- "loss": 0.2208,
1875
  "step": 2370
1876
  },
1877
  {
1878
  "epoch": 8.88,
1879
- "grad_norm": 11.01931381225586,
1880
- "learning_rate": 1.1305970149253732e-05,
1881
- "loss": 0.3189,
1882
  "step": 2380
1883
  },
1884
  {
1885
  "epoch": 8.92,
1886
- "grad_norm": 5.101518154144287,
1887
- "learning_rate": 1.0932835820895524e-05,
1888
- "loss": 0.2416,
1889
  "step": 2390
1890
  },
1891
  {
1892
  "epoch": 8.96,
1893
- "grad_norm": 9.501167297363281,
1894
- "learning_rate": 1.0559701492537313e-05,
1895
- "loss": 0.1046,
1896
  "step": 2400
1897
  },
1898
  {
1899
  "epoch": 8.96,
1900
- "eval_accuracy": 0.8506151142355008,
1901
- "eval_loss": 0.5301781892776489,
1902
- "eval_runtime": 8.5121,
1903
- "eval_samples_per_second": 66.846,
1904
- "eval_steps_per_second": 8.459,
1905
  "step": 2400
1906
  },
1907
  {
1908
  "epoch": 8.99,
1909
- "grad_norm": 5.100789546966553,
1910
- "learning_rate": 1.0186567164179105e-05,
1911
- "loss": 0.2397,
1912
  "step": 2410
1913
  },
1914
  {
1915
  "epoch": 9.03,
1916
- "grad_norm": 7.95840311050415,
1917
- "learning_rate": 9.813432835820897e-06,
1918
- "loss": 0.1868,
1919
  "step": 2420
1920
  },
1921
  {
1922
  "epoch": 9.07,
1923
- "grad_norm": 5.097079277038574,
1924
- "learning_rate": 9.440298507462688e-06,
1925
- "loss": 0.1372,
1926
  "step": 2430
1927
  },
1928
  {
1929
  "epoch": 9.1,
1930
- "grad_norm": 3.864272356033325,
1931
- "learning_rate": 9.067164179104478e-06,
1932
- "loss": 0.2585,
1933
  "step": 2440
1934
  },
1935
  {
1936
  "epoch": 9.14,
1937
- "grad_norm": 0.11230158805847168,
1938
- "learning_rate": 8.694029850746268e-06,
1939
- "loss": 0.0966,
1940
  "step": 2450
1941
  },
1942
  {
1943
  "epoch": 9.18,
1944
- "grad_norm": 7.959216594696045,
1945
- "learning_rate": 8.32089552238806e-06,
1946
- "loss": 0.2738,
1947
  "step": 2460
1948
  },
1949
  {
1950
  "epoch": 9.22,
1951
- "grad_norm": 2.576493263244629,
1952
- "learning_rate": 7.947761194029851e-06,
1953
- "loss": 0.1649,
1954
  "step": 2470
1955
  },
1956
  {
1957
  "epoch": 9.25,
1958
- "grad_norm": 0.5602326393127441,
1959
- "learning_rate": 7.574626865671643e-06,
1960
- "loss": 0.1362,
1961
  "step": 2480
1962
  },
1963
  {
1964
  "epoch": 9.29,
1965
- "grad_norm": 0.8377688527107239,
1966
- "learning_rate": 7.201492537313433e-06,
1967
- "loss": 0.1822,
1968
  "step": 2490
1969
  },
1970
  {
1971
  "epoch": 9.33,
1972
- "grad_norm": 9.408169746398926,
1973
- "learning_rate": 6.828358208955224e-06,
1974
- "loss": 0.2494,
1975
  "step": 2500
1976
  },
1977
  {
1978
  "epoch": 9.33,
1979
- "eval_accuracy": 0.859402460456942,
1980
- "eval_loss": 0.5221985578536987,
1981
- "eval_runtime": 8.4657,
1982
- "eval_samples_per_second": 67.212,
1983
- "eval_steps_per_second": 8.505,
1984
  "step": 2500
1985
  },
1986
  {
1987
  "epoch": 9.37,
1988
- "grad_norm": 4.556987762451172,
1989
- "learning_rate": 6.455223880597015e-06,
1990
- "loss": 0.1805,
1991
  "step": 2510
1992
  },
1993
  {
1994
  "epoch": 9.4,
1995
- "grad_norm": 3.8287057876586914,
1996
- "learning_rate": 6.082089552238806e-06,
1997
- "loss": 0.1475,
1998
  "step": 2520
1999
  },
2000
  {
2001
  "epoch": 9.44,
2002
- "grad_norm": 5.950246334075928,
2003
- "learning_rate": 5.708955223880597e-06,
2004
- "loss": 0.2726,
2005
  "step": 2530
2006
  },
2007
  {
2008
  "epoch": 9.48,
2009
- "grad_norm": 5.040480613708496,
2010
- "learning_rate": 5.335820895522389e-06,
2011
- "loss": 0.2486,
2012
  "step": 2540
2013
  },
2014
  {
2015
  "epoch": 9.51,
2016
- "grad_norm": 2.392062187194824,
2017
- "learning_rate": 4.9626865671641796e-06,
2018
- "loss": 0.1982,
2019
  "step": 2550
2020
  },
2021
  {
2022
  "epoch": 9.55,
2023
- "grad_norm": 7.3676886558532715,
2024
- "learning_rate": 4.58955223880597e-06,
2025
- "loss": 0.1572,
2026
  "step": 2560
2027
  },
2028
  {
2029
  "epoch": 9.59,
2030
- "grad_norm": 3.5289554595947266,
2031
- "learning_rate": 4.216417910447761e-06,
2032
- "loss": 0.2288,
2033
  "step": 2570
2034
  },
2035
  {
2036
  "epoch": 9.63,
2037
- "grad_norm": 2.5290260314941406,
2038
- "learning_rate": 3.843283582089553e-06,
2039
- "loss": 0.178,
2040
  "step": 2580
2041
  },
2042
  {
2043
  "epoch": 9.66,
2044
- "grad_norm": 4.0583271980285645,
2045
- "learning_rate": 3.4701492537313434e-06,
2046
- "loss": 0.1595,
2047
  "step": 2590
2048
  },
2049
  {
2050
  "epoch": 9.7,
2051
- "grad_norm": 12.406070709228516,
2052
- "learning_rate": 3.0970149253731345e-06,
2053
- "loss": 0.1924,
2054
  "step": 2600
2055
  },
2056
  {
2057
  "epoch": 9.7,
2058
- "eval_accuracy": 0.8506151142355008,
2059
- "eval_loss": 0.5271298885345459,
2060
- "eval_runtime": 8.6729,
2061
- "eval_samples_per_second": 65.607,
2062
- "eval_steps_per_second": 8.302,
2063
  "step": 2600
2064
  },
2065
  {
2066
  "epoch": 9.74,
2067
- "grad_norm": 0.0850793793797493,
2068
- "learning_rate": 2.7238805970149257e-06,
2069
- "loss": 0.1408,
2070
  "step": 2610
2071
  },
2072
  {
2073
  "epoch": 9.78,
2074
- "grad_norm": 0.3168705105781555,
2075
- "learning_rate": 2.3507462686567164e-06,
2076
- "loss": 0.1715,
2077
  "step": 2620
2078
  },
2079
  {
2080
  "epoch": 9.81,
2081
- "grad_norm": 8.39201545715332,
2082
- "learning_rate": 1.9776119402985076e-06,
2083
- "loss": 0.1584,
2084
  "step": 2630
2085
  },
2086
  {
2087
  "epoch": 9.85,
2088
- "grad_norm": 8.577099800109863,
2089
- "learning_rate": 1.6044776119402985e-06,
2090
- "loss": 0.1757,
2091
  "step": 2640
2092
  },
2093
  {
2094
  "epoch": 9.89,
2095
- "grad_norm": 2.9620418548583984,
2096
- "learning_rate": 1.2313432835820897e-06,
2097
- "loss": 0.2596,
2098
  "step": 2650
2099
  },
2100
  {
2101
  "epoch": 9.93,
2102
- "grad_norm": 1.613171100616455,
2103
- "learning_rate": 8.582089552238806e-07,
2104
- "loss": 0.2493,
2105
  "step": 2660
2106
  },
2107
  {
2108
  "epoch": 9.96,
2109
- "grad_norm": 8.745406150817871,
2110
- "learning_rate": 4.850746268656717e-07,
2111
- "loss": 0.2033,
2112
  "step": 2670
2113
  },
2114
  {
2115
  "epoch": 10.0,
2116
- "grad_norm": 4.619011878967285,
2117
- "learning_rate": 1.119402985074627e-07,
2118
- "loss": 0.2182,
2119
  "step": 2680
2120
  },
2121
  {
2122
  "epoch": 10.0,
2123
  "step": 2680,
2124
  "total_flos": 3.3230947683690086e+18,
2125
- "train_loss": 0.45721185349706395,
2126
- "train_runtime": 1410.6493,
2127
- "train_samples_per_second": 30.397,
2128
- "train_steps_per_second": 1.9
2129
  }
2130
  ],
2131
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.6129801869392395,
3
+ "best_model_checkpoint": "Action_model/checkpoint-300",
4
  "epoch": 10.0,
5
  "eval_steps": 100,
6
  "global_step": 2680,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.04,
13
+ "grad_norm": 2.570383071899414,
14
  "learning_rate": 9.96268656716418e-05,
15
+ "loss": 0.1841,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.07,
20
+ "grad_norm": 6.266295433044434,
21
  "learning_rate": 9.925373134328359e-05,
22
+ "loss": 0.2301,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.11,
27
+ "grad_norm": 8.001986503601074,
28
  "learning_rate": 9.888059701492539e-05,
29
+ "loss": 0.2533,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.15,
34
+ "grad_norm": 5.319194316864014,
35
  "learning_rate": 9.850746268656717e-05,
36
+ "loss": 0.2436,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.19,
41
+ "grad_norm": 0.9653372764587402,
42
  "learning_rate": 9.813432835820896e-05,
43
+ "loss": 0.3712,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.22,
48
+ "grad_norm": 7.348043441772461,
49
  "learning_rate": 9.776119402985075e-05,
50
+ "loss": 0.3645,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.26,
55
+ "grad_norm": 2.1969542503356934,
56
  "learning_rate": 9.738805970149254e-05,
57
+ "loss": 0.4609,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.3,
62
+ "grad_norm": 6.397550106048584,
63
  "learning_rate": 9.701492537313434e-05,
64
+ "loss": 0.4755,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.34,
69
+ "grad_norm": 6.923007488250732,
70
  "learning_rate": 9.664179104477612e-05,
71
+ "loss": 0.3901,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.37,
76
+ "grad_norm": 4.786198616027832,
77
  "learning_rate": 9.626865671641792e-05,
78
+ "loss": 0.255,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.37,
83
+ "eval_accuracy": 0.7926186291739895,
84
+ "eval_loss": 0.7616190314292908,
85
+ "eval_runtime": 8.7209,
86
+ "eval_samples_per_second": 65.245,
87
+ "eval_steps_per_second": 8.256,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 0.41,
92
+ "grad_norm": 8.368223190307617,
93
  "learning_rate": 9.58955223880597e-05,
94
+ "loss": 0.3784,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 0.45,
99
+ "grad_norm": 4.078306198120117,
100
  "learning_rate": 9.552238805970149e-05,
101
+ "loss": 0.4148,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 0.49,
106
+ "grad_norm": 7.815361022949219,
107
  "learning_rate": 9.514925373134329e-05,
108
+ "loss": 0.3621,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 0.52,
113
+ "grad_norm": 11.498431205749512,
114
  "learning_rate": 9.477611940298507e-05,
115
+ "loss": 0.3974,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 0.56,
120
+ "grad_norm": 7.946558952331543,
121
  "learning_rate": 9.440298507462687e-05,
122
+ "loss": 0.3856,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 0.6,
127
+ "grad_norm": 0.3486919403076172,
128
  "learning_rate": 9.402985074626867e-05,
129
+ "loss": 0.2435,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 0.63,
134
+ "grad_norm": 4.267444133758545,
135
  "learning_rate": 9.365671641791045e-05,
136
+ "loss": 0.3736,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 0.67,
141
+ "grad_norm": 3.022345542907715,
142
+ "learning_rate": 9.328358208955224e-05,
143
+ "loss": 0.439,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 0.71,
148
+ "grad_norm": 5.57196044921875,
149
+ "learning_rate": 9.291044776119402e-05,
150
+ "loss": 0.2996,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 0.75,
155
+ "grad_norm": 2.636216640472412,
156
+ "learning_rate": 9.253731343283582e-05,
157
+ "loss": 0.2048,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 0.75,
162
+ "eval_accuracy": 0.8084358523725835,
163
+ "eval_loss": 0.724670946598053,
164
+ "eval_runtime": 8.4461,
165
+ "eval_samples_per_second": 67.368,
166
+ "eval_steps_per_second": 8.525,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 0.78,
171
+ "grad_norm": 1.615098237991333,
172
+ "learning_rate": 9.216417910447762e-05,
173
+ "loss": 0.3594,
174
  "step": 210
175
  },
176
  {
177
  "epoch": 0.82,
178
+ "grad_norm": 9.315821647644043,
179
+ "learning_rate": 9.17910447761194e-05,
180
+ "loss": 0.3046,
181
  "step": 220
182
  },
183
  {
184
  "epoch": 0.86,
185
+ "grad_norm": 3.669430732727051,
186
+ "learning_rate": 9.14179104477612e-05,
187
+ "loss": 0.4158,
188
  "step": 230
189
  },
190
  {
191
  "epoch": 0.9,
192
+ "grad_norm": 7.0882978439331055,
193
+ "learning_rate": 9.104477611940299e-05,
194
+ "loss": 0.3477,
195
  "step": 240
196
  },
197
  {
198
  "epoch": 0.93,
199
+ "grad_norm": 1.1667325496673584,
200
+ "learning_rate": 9.067164179104479e-05,
201
+ "loss": 0.316,
202
  "step": 250
203
  },
204
  {
205
  "epoch": 0.97,
206
+ "grad_norm": 1.482625961303711,
207
+ "learning_rate": 9.029850746268657e-05,
208
+ "loss": 0.3922,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 1.01,
213
+ "grad_norm": 0.20793116092681885,
214
+ "learning_rate": 8.992537313432836e-05,
215
+ "loss": 0.3751,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 1.04,
220
+ "grad_norm": 6.772298812866211,
221
+ "learning_rate": 8.955223880597016e-05,
222
+ "loss": 0.3269,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 1.08,
227
+ "grad_norm": 5.833349227905273,
228
+ "learning_rate": 8.917910447761194e-05,
229
+ "loss": 0.3026,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 1.12,
234
+ "grad_norm": 6.349458694458008,
235
+ "learning_rate": 8.880597014925374e-05,
236
+ "loss": 0.3763,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 1.12,
241
+ "eval_accuracy": 0.8330404217926186,
242
+ "eval_loss": 0.6129801869392395,
243
+ "eval_runtime": 8.4095,
244
+ "eval_samples_per_second": 67.661,
245
+ "eval_steps_per_second": 8.562,
246
  "step": 300
247
  },
248
  {
249
  "epoch": 1.16,
250
+ "grad_norm": 4.767229080200195,
251
+ "learning_rate": 8.843283582089554e-05,
252
+ "loss": 0.3808,
253
  "step": 310
254
  },
255
  {
256
  "epoch": 1.19,
257
+ "grad_norm": 12.675297737121582,
258
+ "learning_rate": 8.805970149253732e-05,
259
+ "loss": 0.3766,
260
  "step": 320
261
  },
262
  {
263
  "epoch": 1.23,
264
+ "grad_norm": 3.8118245601654053,
265
+ "learning_rate": 8.76865671641791e-05,
266
+ "loss": 0.2642,
267
  "step": 330
268
  },
269
  {
270
  "epoch": 1.27,
271
+ "grad_norm": 8.736045837402344,
272
+ "learning_rate": 8.731343283582089e-05,
273
+ "loss": 0.3041,
274
  "step": 340
275
  },
276
  {
277
  "epoch": 1.31,
278
+ "grad_norm": 6.683359146118164,
279
+ "learning_rate": 8.694029850746269e-05,
280
+ "loss": 0.1352,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 1.34,
285
+ "grad_norm": 4.780521392822266,
286
+ "learning_rate": 8.656716417910447e-05,
287
+ "loss": 0.4005,
288
  "step": 360
289
  },
290
  {
291
  "epoch": 1.38,
292
+ "grad_norm": 9.654714584350586,
293
+ "learning_rate": 8.619402985074627e-05,
294
+ "loss": 0.3646,
295
  "step": 370
296
  },
297
  {
298
  "epoch": 1.42,
299
+ "grad_norm": 4.174666881561279,
300
+ "learning_rate": 8.582089552238807e-05,
301
+ "loss": 0.2353,
302
  "step": 380
303
  },
304
  {
305
  "epoch": 1.46,
306
+ "grad_norm": 7.596667289733887,
307
  "learning_rate": 8.548507462686568e-05,
308
+ "loss": 0.3991,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 1.49,
313
+ "grad_norm": 5.592709064483643,
314
  "learning_rate": 8.511194029850747e-05,
315
+ "loss": 0.307,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 1.49,
320
+ "eval_accuracy": 0.789103690685413,
321
+ "eval_loss": 0.8137023448944092,
322
+ "eval_runtime": 8.3292,
323
+ "eval_samples_per_second": 68.314,
324
+ "eval_steps_per_second": 8.644,
325
  "step": 400
326
  },
327
  {
328
  "epoch": 1.53,
329
+ "grad_norm": 2.232590675354004,
330
  "learning_rate": 8.473880597014926e-05,
331
+ "loss": 0.4669,
332
  "step": 410
333
  },
334
  {
335
  "epoch": 1.57,
336
+ "grad_norm": 4.276609897613525,
337
  "learning_rate": 8.436567164179105e-05,
338
+ "loss": 0.3831,
339
  "step": 420
340
  },
341
  {
342
  "epoch": 1.6,
343
+ "grad_norm": 7.262507915496826,
344
  "learning_rate": 8.399253731343283e-05,
345
+ "loss": 0.3472,
346
  "step": 430
347
  },
348
  {
349
  "epoch": 1.64,
350
+ "grad_norm": 7.258556365966797,
351
  "learning_rate": 8.361940298507463e-05,
352
+ "loss": 0.2396,
353
  "step": 440
354
  },
355
  {
356
  "epoch": 1.68,
357
+ "grad_norm": 4.945961952209473,
358
  "learning_rate": 8.324626865671642e-05,
359
+ "loss": 0.2433,
360
  "step": 450
361
  },
362
  {
363
  "epoch": 1.72,
364
+ "grad_norm": 5.138702392578125,
365
+ "learning_rate": 8.287313432835821e-05,
366
+ "loss": 0.2947,
367
  "step": 460
368
  },
369
  {
370
  "epoch": 1.75,
371
+ "grad_norm": 1.1640909910202026,
372
+ "learning_rate": 8.25e-05,
373
+ "loss": 0.4791,
374
  "step": 470
375
  },
376
  {
377
  "epoch": 1.79,
378
+ "grad_norm": 4.626485824584961,
379
+ "learning_rate": 8.21268656716418e-05,
380
+ "loss": 0.286,
381
  "step": 480
382
  },
383
  {
384
  "epoch": 1.83,
385
+ "grad_norm": 5.178492069244385,
386
+ "learning_rate": 8.17537313432836e-05,
387
+ "loss": 0.3202,
388
  "step": 490
389
  },
390
  {
391
  "epoch": 1.87,
392
+ "grad_norm": 7.854339122772217,
393
+ "learning_rate": 8.138059701492538e-05,
394
+ "loss": 0.3542,
395
  "step": 500
396
  },
397
  {
398
  "epoch": 1.87,
399
+ "eval_accuracy": 0.8014059753954306,
400
+ "eval_loss": 0.6611581444740295,
401
+ "eval_runtime": 8.5853,
402
+ "eval_samples_per_second": 66.276,
403
+ "eval_steps_per_second": 8.386,
404
  "step": 500
405
  },
406
  {
407
  "epoch": 1.9,
408
+ "grad_norm": 1.429740071296692,
409
+ "learning_rate": 8.100746268656717e-05,
410
+ "loss": 0.3039,
411
  "step": 510
412
  },
413
  {
414
  "epoch": 1.94,
415
+ "grad_norm": 2.9776551723480225,
416
+ "learning_rate": 8.063432835820895e-05,
417
+ "loss": 0.3825,
418
  "step": 520
419
  },
420
  {
421
  "epoch": 1.98,
422
+ "grad_norm": 10.557899475097656,
423
+ "learning_rate": 8.026119402985075e-05,
424
+ "loss": 0.5109,
425
  "step": 530
426
  },
427
  {
428
  "epoch": 2.01,
429
+ "grad_norm": 1.448002815246582,
430
+ "learning_rate": 7.988805970149255e-05,
431
+ "loss": 0.3421,
432
  "step": 540
433
  },
434
  {
435
  "epoch": 2.05,
436
+ "grad_norm": 4.500860691070557,
437
+ "learning_rate": 7.951492537313433e-05,
438
+ "loss": 0.3008,
439
  "step": 550
440
  },
441
  {
442
  "epoch": 2.09,
443
+ "grad_norm": 8.077374458312988,
444
+ "learning_rate": 7.914179104477613e-05,
445
+ "loss": 0.27,
446
  "step": 560
447
  },
448
  {
449
  "epoch": 2.13,
450
+ "grad_norm": 0.16809479892253876,
451
+ "learning_rate": 7.876865671641792e-05,
452
+ "loss": 0.2184,
453
  "step": 570
454
  },
455
  {
456
  "epoch": 2.16,
457
+ "grad_norm": 4.892763137817383,
458
+ "learning_rate": 7.83955223880597e-05,
459
+ "loss": 0.1479,
460
  "step": 580
461
  },
462
  {
463
  "epoch": 2.2,
464
+ "grad_norm": 8.35221004486084,
465
+ "learning_rate": 7.80223880597015e-05,
466
+ "loss": 0.3498,
467
  "step": 590
468
  },
469
  {
470
  "epoch": 2.24,
471
+ "grad_norm": 12.043429374694824,
472
+ "learning_rate": 7.764925373134328e-05,
473
+ "loss": 0.3518,
474
  "step": 600
475
  },
476
  {
477
  "epoch": 2.24,
478
+ "eval_accuracy": 0.8189806678383128,
479
+ "eval_loss": 0.6964564919471741,
480
+ "eval_runtime": 8.3878,
481
+ "eval_samples_per_second": 67.837,
482
+ "eval_steps_per_second": 8.584,
483
  "step": 600
484
  },
485
  {
486
  "epoch": 2.28,
487
+ "grad_norm": 3.7737715244293213,
488
+ "learning_rate": 7.727611940298508e-05,
489
+ "loss": 0.3532,
490
  "step": 610
491
  },
492
  {
493
  "epoch": 2.31,
494
+ "grad_norm": 4.282881736755371,
495
+ "learning_rate": 7.690298507462687e-05,
496
+ "loss": 0.2214,
497
  "step": 620
498
  },
499
  {
500
  "epoch": 2.35,
501
+ "grad_norm": 6.733531475067139,
502
+ "learning_rate": 7.652985074626866e-05,
503
+ "loss": 0.2709,
504
  "step": 630
505
  },
506
  {
507
  "epoch": 2.39,
508
+ "grad_norm": 2.567267417907715,
509
+ "learning_rate": 7.615671641791045e-05,
510
+ "loss": 0.3725,
511
  "step": 640
512
  },
513
  {
514
  "epoch": 2.43,
515
+ "grad_norm": 3.120966911315918,
516
+ "learning_rate": 7.578358208955223e-05,
517
+ "loss": 0.3036,
518
  "step": 650
519
  },
520
  {
521
  "epoch": 2.46,
522
+ "grad_norm": 6.505622386932373,
523
+ "learning_rate": 7.541044776119403e-05,
524
+ "loss": 0.2426,
525
  "step": 660
526
  },
527
  {
528
  "epoch": 2.5,
529
+ "grad_norm": 4.887637615203857,
530
+ "learning_rate": 7.503731343283582e-05,
531
+ "loss": 0.281,
532
  "step": 670
533
  },
534
  {
535
  "epoch": 2.54,
536
+ "grad_norm": 9.790969848632812,
537
+ "learning_rate": 7.466417910447762e-05,
538
+ "loss": 0.4504,
539
  "step": 680
540
  },
541
  {
542
  "epoch": 2.57,
543
+ "grad_norm": 4.354789733886719,
544
+ "learning_rate": 7.429104477611941e-05,
545
+ "loss": 0.4094,
546
  "step": 690
547
  },
548
  {
549
  "epoch": 2.61,
550
+ "grad_norm": 5.015912055969238,
551
+ "learning_rate": 7.39179104477612e-05,
552
+ "loss": 0.3706,
553
  "step": 700
554
  },
555
  {
556
  "epoch": 2.61,
557
+ "eval_accuracy": 0.804920913884007,
558
+ "eval_loss": 0.7254143357276917,
559
+ "eval_runtime": 8.3242,
560
+ "eval_samples_per_second": 68.355,
561
+ "eval_steps_per_second": 8.649,
562
  "step": 700
563
  },
564
  {
565
  "epoch": 2.65,
566
+ "grad_norm": 5.382541656494141,
567
+ "learning_rate": 7.3544776119403e-05,
568
+ "loss": 0.1722,
569
  "step": 710
570
  },
571
  {
572
  "epoch": 2.69,
573
+ "grad_norm": 5.573971748352051,
574
+ "learning_rate": 7.317164179104478e-05,
575
+ "loss": 0.327,
576
  "step": 720
577
  },
578
  {
579
  "epoch": 2.72,
580
+ "grad_norm": 3.5606117248535156,
581
+ "learning_rate": 7.279850746268657e-05,
582
+ "loss": 0.2702,
583
  "step": 730
584
  },
585
  {
586
  "epoch": 2.76,
587
+ "grad_norm": 1.7398028373718262,
588
+ "learning_rate": 7.242537313432837e-05,
589
+ "loss": 0.238,
590
  "step": 740
591
  },
592
  {
593
  "epoch": 2.8,
594
+ "grad_norm": 2.7511751651763916,
595
+ "learning_rate": 7.205223880597015e-05,
596
+ "loss": 0.1848,
597
  "step": 750
598
  },
599
  {
600
  "epoch": 2.84,
601
+ "grad_norm": 3.381510019302368,
602
+ "learning_rate": 7.167910447761195e-05,
603
+ "loss": 0.2261,
604
  "step": 760
605
  },
606
  {
607
  "epoch": 2.87,
608
+ "grad_norm": 4.65634298324585,
609
+ "learning_rate": 7.130597014925373e-05,
610
+ "loss": 0.237,
611
  "step": 770
612
  },
613
  {
614
  "epoch": 2.91,
615
+ "grad_norm": 10.35020923614502,
616
+ "learning_rate": 7.093283582089553e-05,
617
+ "loss": 0.3012,
618
  "step": 780
619
  },
620
  {
621
  "epoch": 2.95,
622
+ "grad_norm": 8.878485679626465,
623
+ "learning_rate": 7.055970149253732e-05,
624
+ "loss": 0.4094,
625
  "step": 790
626
  },
627
  {
628
  "epoch": 2.99,
629
+ "grad_norm": 2.9728074073791504,
630
+ "learning_rate": 7.01865671641791e-05,
631
+ "loss": 0.4084,
632
  "step": 800
633
  },
634
  {
635
  "epoch": 2.99,
636
+ "eval_accuracy": 0.8101933216168717,
637
+ "eval_loss": 0.6746156811714172,
638
+ "eval_runtime": 8.2718,
639
+ "eval_samples_per_second": 68.788,
640
+ "eval_steps_per_second": 8.704,
641
  "step": 800
642
  },
643
  {
644
  "epoch": 3.02,
645
+ "grad_norm": 4.835368633270264,
646
+ "learning_rate": 6.98134328358209e-05,
647
+ "loss": 0.3152,
648
  "step": 810
649
  },
650
  {
651
  "epoch": 3.06,
652
+ "grad_norm": 2.9197049140930176,
653
+ "learning_rate": 6.944029850746268e-05,
654
+ "loss": 0.3433,
655
  "step": 820
656
  },
657
  {
658
  "epoch": 3.1,
659
+ "grad_norm": 5.646128177642822,
660
+ "learning_rate": 6.906716417910448e-05,
661
+ "loss": 0.2604,
662
  "step": 830
663
  },
664
  {
665
  "epoch": 3.13,
666
+ "grad_norm": 3.860607862472534,
667
+ "learning_rate": 6.869402985074627e-05,
668
+ "loss": 0.2831,
669
  "step": 840
670
  },
671
  {
672
  "epoch": 3.17,
673
+ "grad_norm": 0.1358175426721573,
674
+ "learning_rate": 6.832089552238807e-05,
675
+ "loss": 0.242,
676
  "step": 850
677
  },
678
  {
679
  "epoch": 3.21,
680
+ "grad_norm": 1.1011104583740234,
681
+ "learning_rate": 6.794776119402985e-05,
682
+ "loss": 0.2621,
683
  "step": 860
684
  },
685
  {
686
  "epoch": 3.25,
687
+ "grad_norm": 7.837879180908203,
688
+ "learning_rate": 6.757462686567164e-05,
689
+ "loss": 0.249,
690
  "step": 870
691
  },
692
  {
693
  "epoch": 3.28,
694
+ "grad_norm": 6.8647613525390625,
695
+ "learning_rate": 6.720149253731343e-05,
696
+ "loss": 0.3398,
697
  "step": 880
698
  },
699
  {
700
  "epoch": 3.32,
701
+ "grad_norm": 2.8186678886413574,
702
+ "learning_rate": 6.682835820895522e-05,
703
+ "loss": 0.3092,
704
  "step": 890
705
  },
706
  {
707
  "epoch": 3.36,
708
+ "grad_norm": 4.623282432556152,
709
+ "learning_rate": 6.645522388059702e-05,
710
+ "loss": 0.2533,
711
  "step": 900
712
  },
713
  {
714
  "epoch": 3.36,
715
+ "eval_accuracy": 0.8189806678383128,
716
+ "eval_loss": 0.6866591572761536,
717
+ "eval_runtime": 8.3143,
718
+ "eval_samples_per_second": 68.436,
719
+ "eval_steps_per_second": 8.66,
720
  "step": 900
721
  },
722
  {
723
  "epoch": 3.4,
724
+ "grad_norm": 4.85120964050293,
725
+ "learning_rate": 6.608208955223882e-05,
726
+ "loss": 0.2279,
727
  "step": 910
728
  },
729
  {
730
  "epoch": 3.43,
731
+ "grad_norm": 0.7263774275779724,
732
+ "learning_rate": 6.57089552238806e-05,
733
+ "loss": 0.1725,
734
  "step": 920
735
  },
736
  {
737
  "epoch": 3.47,
738
+ "grad_norm": 6.813180923461914,
739
+ "learning_rate": 6.53358208955224e-05,
740
+ "loss": 0.3304,
741
  "step": 930
742
  },
743
  {
744
  "epoch": 3.51,
745
+ "grad_norm": 8.58501148223877,
746
+ "learning_rate": 6.496268656716418e-05,
747
+ "loss": 0.1864,
748
  "step": 940
749
  },
750
  {
751
  "epoch": 3.54,
752
+ "grad_norm": 2.814436435699463,
753
+ "learning_rate": 6.458955223880597e-05,
754
+ "loss": 0.1496,
755
  "step": 950
756
  },
757
  {
758
  "epoch": 3.58,
759
+ "grad_norm": 8.36603832244873,
760
+ "learning_rate": 6.421641791044777e-05,
761
+ "loss": 0.208,
762
  "step": 960
763
  },
764
  {
765
  "epoch": 3.62,
766
+ "grad_norm": 3.5715956687927246,
767
+ "learning_rate": 6.384328358208955e-05,
768
+ "loss": 0.2429,
769
  "step": 970
770
  },
771
  {
772
  "epoch": 3.66,
773
+ "grad_norm": 4.983556270599365,
774
+ "learning_rate": 6.347014925373135e-05,
775
+ "loss": 0.4053,
776
  "step": 980
777
  },
778
  {
779
  "epoch": 3.69,
780
+ "grad_norm": 4.936723232269287,
781
+ "learning_rate": 6.309701492537313e-05,
782
+ "loss": 0.1545,
783
  "step": 990
784
  },
785
  {
786
  "epoch": 3.73,
787
+ "grad_norm": 6.59185791015625,
788
+ "learning_rate": 6.272388059701493e-05,
789
+ "loss": 0.3147,
790
  "step": 1000
791
  },
792
  {
793
  "epoch": 3.73,
794
+ "eval_accuracy": 0.8189806678383128,
795
+ "eval_loss": 0.7077136635780334,
796
+ "eval_runtime": 8.3117,
797
+ "eval_samples_per_second": 68.457,
798
+ "eval_steps_per_second": 8.662,
799
  "step": 1000
800
  },
801
  {
802
  "epoch": 3.77,
803
+ "grad_norm": 9.348366737365723,
804
+ "learning_rate": 6.235074626865672e-05,
805
+ "loss": 0.3634,
806
  "step": 1010
807
  },
808
  {
809
  "epoch": 3.81,
810
+ "grad_norm": 9.918521881103516,
811
+ "learning_rate": 6.19776119402985e-05,
812
+ "loss": 0.3151,
813
  "step": 1020
814
  },
815
  {
816
  "epoch": 3.84,
817
+ "grad_norm": 5.687044143676758,
818
+ "learning_rate": 6.16044776119403e-05,
819
+ "loss": 0.3088,
820
  "step": 1030
821
  },
822
  {
823
  "epoch": 3.88,
824
+ "grad_norm": 3.8347887992858887,
825
+ "learning_rate": 6.123134328358209e-05,
826
+ "loss": 0.2128,
827
  "step": 1040
828
  },
829
  {
830
  "epoch": 3.92,
831
+ "grad_norm": 5.380050182342529,
832
+ "learning_rate": 6.0858208955223884e-05,
833
+ "loss": 0.255,
834
  "step": 1050
835
  },
836
  {
837
  "epoch": 3.96,
838
+ "grad_norm": 8.848828315734863,
839
+ "learning_rate": 6.0485074626865676e-05,
840
+ "loss": 0.2794,
841
  "step": 1060
842
  },
843
  {
844
  "epoch": 3.99,
845
+ "grad_norm": 3.9666404724121094,
846
+ "learning_rate": 6.011194029850746e-05,
847
+ "loss": 0.1954,
848
  "step": 1070
849
  },
850
  {
851
  "epoch": 4.03,
852
+ "grad_norm": 0.3369455635547638,
853
+ "learning_rate": 5.973880597014926e-05,
854
+ "loss": 0.2298,
855
  "step": 1080
856
  },
857
  {
858
  "epoch": 4.07,
859
+ "grad_norm": 16.327823638916016,
860
+ "learning_rate": 5.9365671641791044e-05,
861
+ "loss": 0.2504,
862
  "step": 1090
863
  },
864
  {
865
  "epoch": 4.1,
866
+ "grad_norm": 7.070168495178223,
867
+ "learning_rate": 5.8992537313432835e-05,
868
+ "loss": 0.3182,
869
  "step": 1100
870
  },
871
  {
872
  "epoch": 4.1,
873
+ "eval_accuracy": 0.8189806678383128,
874
+ "eval_loss": 0.6661401987075806,
875
+ "eval_runtime": 8.2263,
876
+ "eval_samples_per_second": 69.169,
877
+ "eval_steps_per_second": 8.752,
878
  "step": 1100
879
  },
880
  {
881
  "epoch": 4.14,
882
+ "grad_norm": 2.853975534439087,
883
+ "learning_rate": 5.8619402985074634e-05,
884
+ "loss": 0.201,
885
  "step": 1110
886
  },
887
  {
888
  "epoch": 4.18,
889
+ "grad_norm": 0.958690881729126,
890
+ "learning_rate": 5.824626865671642e-05,
891
+ "loss": 0.1833,
892
  "step": 1120
893
  },
894
  {
895
  "epoch": 4.22,
896
+ "grad_norm": 3.4794461727142334,
897
+ "learning_rate": 5.787313432835822e-05,
898
+ "loss": 0.2796,
899
  "step": 1130
900
  },
901
  {
902
  "epoch": 4.25,
903
+ "grad_norm": 4.793296813964844,
904
+ "learning_rate": 5.7499999999999995e-05,
905
+ "loss": 0.2281,
906
  "step": 1140
907
  },
908
  {
909
  "epoch": 4.29,
910
+ "grad_norm": 6.200154781341553,
911
+ "learning_rate": 5.712686567164179e-05,
912
+ "loss": 0.2814,
913
  "step": 1150
914
  },
915
  {
916
  "epoch": 4.33,
917
+ "grad_norm": 5.616389274597168,
918
+ "learning_rate": 5.675373134328359e-05,
919
+ "loss": 0.1656,
920
  "step": 1160
921
  },
922
  {
923
  "epoch": 4.37,
924
+ "grad_norm": 9.382554054260254,
925
+ "learning_rate": 5.6380597014925376e-05,
926
+ "loss": 0.19,
927
  "step": 1170
928
  },
929
  {
930
  "epoch": 4.4,
931
+ "grad_norm": 3.526240587234497,
932
+ "learning_rate": 5.600746268656717e-05,
933
+ "loss": 0.2063,
934
  "step": 1180
935
  },
936
  {
937
  "epoch": 4.44,
938
+ "grad_norm": 3.494896650314331,
939
+ "learning_rate": 5.563432835820895e-05,
940
+ "loss": 0.1681,
941
  "step": 1190
942
  },
943
  {
944
  "epoch": 4.48,
945
+ "grad_norm": 5.764057636260986,
946
+ "learning_rate": 5.526119402985075e-05,
947
+ "loss": 0.2248,
948
  "step": 1200
949
  },
950
  {
951
  "epoch": 4.48,
952
+ "eval_accuracy": 0.8418277680140598,
953
+ "eval_loss": 0.6632041335105896,
954
+ "eval_runtime": 8.1661,
955
+ "eval_samples_per_second": 69.679,
956
+ "eval_steps_per_second": 8.817,
957
  "step": 1200
958
  },
959
  {
960
  "epoch": 4.51,
961
+ "grad_norm": 4.680635452270508,
962
+ "learning_rate": 5.488805970149254e-05,
963
+ "loss": 0.2179,
964
  "step": 1210
965
  },
966
  {
967
  "epoch": 4.55,
968
+ "grad_norm": 10.24306869506836,
969
+ "learning_rate": 5.451492537313433e-05,
970
+ "loss": 0.2187,
971
  "step": 1220
972
  },
973
  {
974
  "epoch": 4.59,
975
+ "grad_norm": 3.054690361022949,
976
+ "learning_rate": 5.4141791044776126e-05,
977
+ "loss": 0.1729,
978
  "step": 1230
979
  },
980
  {
981
  "epoch": 4.63,
982
+ "grad_norm": 4.907272815704346,
983
+ "learning_rate": 5.376865671641791e-05,
984
+ "loss": 0.2762,
985
  "step": 1240
986
  },
987
  {
988
  "epoch": 4.66,
989
+ "grad_norm": 4.774748802185059,
990
+ "learning_rate": 5.33955223880597e-05,
991
+ "loss": 0.1965,
992
  "step": 1250
993
  },
994
  {
995
  "epoch": 4.7,
996
+ "grad_norm": 5.757875919342041,
997
+ "learning_rate": 5.30223880597015e-05,
998
+ "loss": 0.1564,
999
  "step": 1260
1000
  },
1001
  {
1002
  "epoch": 4.74,
1003
+ "grad_norm": 0.3608088791370392,
1004
+ "learning_rate": 5.2649253731343286e-05,
1005
+ "loss": 0.0946,
1006
  "step": 1270
1007
  },
1008
  {
1009
  "epoch": 4.78,
1010
+ "grad_norm": 3.6289939880371094,
1011
+ "learning_rate": 5.227611940298508e-05,
1012
+ "loss": 0.3364,
1013
  "step": 1280
1014
  },
1015
  {
1016
  "epoch": 4.81,
1017
+ "grad_norm": 5.132009029388428,
1018
+ "learning_rate": 5.190298507462686e-05,
1019
+ "loss": 0.231,
1020
  "step": 1290
1021
  },
1022
  {
1023
  "epoch": 4.85,
1024
+ "grad_norm": 1.0347099304199219,
1025
+ "learning_rate": 5.152985074626866e-05,
1026
+ "loss": 0.1617,
1027
  "step": 1300
1028
  },
1029
  {
1030
  "epoch": 4.85,
1031
+ "eval_accuracy": 0.8172231985940246,
1032
+ "eval_loss": 0.7277125716209412,
1033
+ "eval_runtime": 8.4693,
1034
+ "eval_samples_per_second": 67.184,
1035
+ "eval_steps_per_second": 8.501,
1036
  "step": 1300
1037
  },
1038
  {
1039
  "epoch": 4.89,
1040
+ "grad_norm": 2.5996298789978027,
1041
+ "learning_rate": 5.115671641791045e-05,
1042
+ "loss": 0.385,
1043
  "step": 1310
1044
  },
1045
  {
1046
  "epoch": 4.93,
1047
+ "grad_norm": 3.724181890487671,
1048
+ "learning_rate": 5.078358208955224e-05,
1049
+ "loss": 0.1786,
1050
  "step": 1320
1051
  },
1052
  {
1053
  "epoch": 4.96,
1054
+ "grad_norm": 2.150557518005371,
1055
+ "learning_rate": 5.0410447761194035e-05,
1056
+ "loss": 0.2122,
1057
  "step": 1330
1058
  },
1059
  {
1060
  "epoch": 5.0,
1061
+ "grad_norm": 3.8813323974609375,
1062
+ "learning_rate": 5.003731343283582e-05,
1063
+ "loss": 0.2425,
1064
  "step": 1340
1065
  },
1066
  {
1067
  "epoch": 5.04,
1068
+ "grad_norm": 0.896369457244873,
1069
+ "learning_rate": 4.966417910447762e-05,
1070
+ "loss": 0.2208,
1071
  "step": 1350
1072
  },
1073
  {
1074
  "epoch": 5.07,
1075
+ "grad_norm": 9.002110481262207,
1076
+ "learning_rate": 4.92910447761194e-05,
1077
+ "loss": 0.1432,
1078
  "step": 1360
1079
  },
1080
  {
1081
  "epoch": 5.11,
1082
+ "grad_norm": 9.619662284851074,
1083
+ "learning_rate": 4.8917910447761195e-05,
1084
+ "loss": 0.1347,
1085
  "step": 1370
1086
  },
1087
  {
1088
  "epoch": 5.15,
1089
+ "grad_norm": 3.5148773193359375,
1090
+ "learning_rate": 4.8544776119402986e-05,
1091
+ "loss": 0.2837,
1092
  "step": 1380
1093
  },
1094
  {
1095
  "epoch": 5.19,
1096
+ "grad_norm": 7.631669044494629,
1097
+ "learning_rate": 4.817164179104478e-05,
1098
+ "loss": 0.1887,
1099
  "step": 1390
1100
  },
1101
  {
1102
  "epoch": 5.22,
1103
+ "grad_norm": 11.738872528076172,
1104
+ "learning_rate": 4.779850746268657e-05,
1105
+ "loss": 0.2578,
1106
  "step": 1400
1107
  },
1108
  {
1109
  "epoch": 5.22,
1110
+ "eval_accuracy": 0.8189806678383128,
1111
+ "eval_loss": 0.7114442586898804,
1112
+ "eval_runtime": 8.2672,
1113
+ "eval_samples_per_second": 68.826,
1114
+ "eval_steps_per_second": 8.709,
1115
  "step": 1400
1116
  },
1117
  {
1118
  "epoch": 5.26,
1119
+ "grad_norm": 6.67802095413208,
1120
+ "learning_rate": 4.742537313432836e-05,
1121
+ "loss": 0.2527,
1122
  "step": 1410
1123
  },
1124
  {
1125
  "epoch": 5.3,
1126
+ "grad_norm": 4.491325378417969,
1127
+ "learning_rate": 4.705223880597015e-05,
1128
+ "loss": 0.2386,
1129
  "step": 1420
1130
  },
1131
  {
1132
  "epoch": 5.34,
1133
+ "grad_norm": 1.1810379028320312,
1134
+ "learning_rate": 4.667910447761194e-05,
1135
+ "loss": 0.1693,
1136
  "step": 1430
1137
  },
1138
  {
1139
  "epoch": 5.37,
1140
+ "grad_norm": 6.075868129730225,
1141
+ "learning_rate": 4.6305970149253736e-05,
1142
+ "loss": 0.167,
1143
  "step": 1440
1144
  },
1145
  {
1146
  "epoch": 5.41,
1147
+ "grad_norm": 2.315635919570923,
1148
+ "learning_rate": 4.593283582089553e-05,
1149
+ "loss": 0.2243,
1150
  "step": 1450
1151
  },
1152
  {
1153
  "epoch": 5.45,
1154
+ "grad_norm": 10.839255332946777,
1155
+ "learning_rate": 4.555970149253732e-05,
1156
+ "loss": 0.2414,
1157
  "step": 1460
1158
  },
1159
  {
1160
  "epoch": 5.49,
1161
+ "grad_norm": 4.562304496765137,
1162
+ "learning_rate": 4.5186567164179104e-05,
1163
+ "loss": 0.264,
1164
  "step": 1470
1165
  },
1166
  {
1167
  "epoch": 5.52,
1168
+ "grad_norm": 1.8821789026260376,
1169
+ "learning_rate": 4.4813432835820895e-05,
1170
+ "loss": 0.1407,
1171
  "step": 1480
1172
  },
1173
  {
1174
  "epoch": 5.56,
1175
+ "grad_norm": 8.406396865844727,
1176
+ "learning_rate": 4.4440298507462694e-05,
1177
+ "loss": 0.1454,
1178
  "step": 1490
1179
  },
1180
  {
1181
  "epoch": 5.6,
1182
+ "grad_norm": 0.2816010117530823,
1183
+ "learning_rate": 4.406716417910448e-05,
1184
+ "loss": 0.1864,
1185
  "step": 1500
1186
  },
1187
  {
1188
  "epoch": 5.6,
1189
+ "eval_accuracy": 0.8172231985940246,
1190
+ "eval_loss": 0.755394458770752,
1191
+ "eval_runtime": 8.2598,
1192
+ "eval_samples_per_second": 68.888,
1193
+ "eval_steps_per_second": 8.717,
1194
  "step": 1500
1195
  },
1196
  {
1197
  "epoch": 5.63,
1198
+ "grad_norm": 6.619854927062988,
1199
+ "learning_rate": 4.369402985074627e-05,
1200
+ "loss": 0.2806,
1201
  "step": 1510
1202
  },
1203
  {
1204
  "epoch": 5.67,
1205
+ "grad_norm": 2.056018829345703,
1206
+ "learning_rate": 4.332089552238806e-05,
1207
+ "loss": 0.2583,
1208
  "step": 1520
1209
  },
1210
  {
1211
  "epoch": 5.71,
1212
+ "grad_norm": 0.966521680355072,
1213
+ "learning_rate": 4.294776119402985e-05,
1214
+ "loss": 0.0997,
1215
  "step": 1530
1216
  },
1217
  {
1218
  "epoch": 5.75,
1219
+ "grad_norm": 2.8261241912841797,
1220
+ "learning_rate": 4.2574626865671645e-05,
1221
+ "loss": 0.1604,
1222
  "step": 1540
1223
  },
1224
  {
1225
  "epoch": 5.78,
1226
+ "grad_norm": 3.089912176132202,
1227
+ "learning_rate": 4.2201492537313436e-05,
1228
+ "loss": 0.2775,
1229
  "step": 1550
1230
  },
1231
  {
1232
  "epoch": 5.82,
1233
+ "grad_norm": 7.935690879821777,
1234
+ "learning_rate": 4.182835820895523e-05,
1235
+ "loss": 0.2522,
1236
  "step": 1560
1237
  },
1238
  {
1239
  "epoch": 5.86,
1240
+ "grad_norm": 0.7999266982078552,
1241
+ "learning_rate": 4.145522388059702e-05,
1242
+ "loss": 0.0752,
1243
  "step": 1570
1244
  },
1245
  {
1246
  "epoch": 5.9,
1247
+ "grad_norm": 6.0712480545043945,
1248
+ "learning_rate": 4.1082089552238804e-05,
1249
+ "loss": 0.1933,
1250
  "step": 1580
1251
  },
1252
  {
1253
  "epoch": 5.93,
1254
+ "grad_norm": 10.768308639526367,
1255
+ "learning_rate": 4.07089552238806e-05,
1256
+ "loss": 0.1664,
1257
  "step": 1590
1258
  },
1259
  {
1260
  "epoch": 5.97,
1261
+ "grad_norm": 9.641716003417969,
1262
+ "learning_rate": 4.0335820895522394e-05,
1263
+ "loss": 0.3134,
1264
  "step": 1600
1265
  },
1266
  {
1267
  "epoch": 5.97,
1268
+ "eval_accuracy": 0.8154657293497364,
1269
+ "eval_loss": 0.7593356966972351,
1270
+ "eval_runtime": 8.4455,
1271
+ "eval_samples_per_second": 67.373,
1272
+ "eval_steps_per_second": 8.525,
1273
  "step": 1600
1274
  },
1275
  {
1276
  "epoch": 6.01,
1277
+ "grad_norm": 6.7538838386535645,
1278
+ "learning_rate": 3.996268656716418e-05,
1279
+ "loss": 0.1747,
1280
  "step": 1610
1281
  },
1282
  {
1283
  "epoch": 6.04,
1284
+ "grad_norm": 6.237377166748047,
1285
+ "learning_rate": 3.958955223880597e-05,
1286
+ "loss": 0.2406,
1287
  "step": 1620
1288
  },
1289
  {
1290
  "epoch": 6.08,
1291
+ "grad_norm": 7.950930118560791,
1292
+ "learning_rate": 3.921641791044776e-05,
1293
+ "loss": 0.1884,
1294
  "step": 1630
1295
  },
1296
  {
1297
  "epoch": 6.12,
1298
+ "grad_norm": 4.41484260559082,
1299
+ "learning_rate": 3.8843283582089554e-05,
1300
+ "loss": 0.1445,
1301
  "step": 1640
1302
  },
1303
  {
1304
  "epoch": 6.16,
1305
+ "grad_norm": 6.339887619018555,
1306
+ "learning_rate": 3.8470149253731345e-05,
1307
+ "loss": 0.2906,
1308
  "step": 1650
1309
  },
1310
  {
1311
  "epoch": 6.19,
1312
+ "grad_norm": 7.597599983215332,
1313
+ "learning_rate": 3.809701492537314e-05,
1314
+ "loss": 0.1576,
1315
  "step": 1660
1316
  },
1317
  {
1318
  "epoch": 6.23,
1319
+ "grad_norm": 2.379629373550415,
1320
+ "learning_rate": 3.772388059701493e-05,
1321
+ "loss": 0.2016,
1322
  "step": 1670
1323
  },
1324
  {
1325
  "epoch": 6.27,
1326
+ "grad_norm": 2.7694478034973145,
1327
+ "learning_rate": 3.735074626865671e-05,
1328
+ "loss": 0.1188,
1329
  "step": 1680
1330
  },
1331
  {
1332
  "epoch": 6.31,
1333
+ "grad_norm": 2.1837210655212402,
1334
+ "learning_rate": 3.6977611940298505e-05,
1335
+ "loss": 0.1908,
1336
  "step": 1690
1337
  },
1338
  {
1339
  "epoch": 6.34,
1340
+ "grad_norm": 4.4665350914001465,
1341
+ "learning_rate": 3.66044776119403e-05,
1342
+ "loss": 0.24,
1343
  "step": 1700
1344
  },
1345
  {
1346
  "epoch": 6.34,
1347
+ "eval_accuracy": 0.8260105448154658,
1348
+ "eval_loss": 0.7510848641395569,
1349
+ "eval_runtime": 8.2044,
1350
+ "eval_samples_per_second": 69.353,
1351
+ "eval_steps_per_second": 8.776,
1352
  "step": 1700
1353
  },
1354
  {
1355
  "epoch": 6.38,
1356
+ "grad_norm": 4.6551995277404785,
1357
+ "learning_rate": 3.6231343283582095e-05,
1358
+ "loss": 0.1631,
1359
  "step": 1710
1360
  },
1361
  {
1362
  "epoch": 6.42,
1363
+ "grad_norm": 1.098407506942749,
1364
+ "learning_rate": 3.585820895522388e-05,
1365
+ "loss": 0.0912,
1366
  "step": 1720
1367
  },
1368
  {
1369
  "epoch": 6.46,
1370
+ "grad_norm": 0.37138649821281433,
1371
+ "learning_rate": 3.548507462686567e-05,
1372
+ "loss": 0.2621,
1373
  "step": 1730
1374
  },
1375
  {
1376
  "epoch": 6.49,
1377
+ "grad_norm": 7.4571757316589355,
1378
+ "learning_rate": 3.511194029850746e-05,
1379
+ "loss": 0.268,
1380
  "step": 1740
1381
  },
1382
  {
1383
  "epoch": 6.53,
1384
+ "grad_norm": 0.5180323123931885,
1385
+ "learning_rate": 3.4738805970149254e-05,
1386
+ "loss": 0.2135,
1387
  "step": 1750
1388
  },
1389
  {
1390
  "epoch": 6.57,
1391
+ "grad_norm": 1.0866820812225342,
1392
+ "learning_rate": 3.4365671641791046e-05,
1393
+ "loss": 0.1489,
1394
  "step": 1760
1395
  },
1396
  {
1397
  "epoch": 6.6,
1398
+ "grad_norm": 8.90451717376709,
1399
+ "learning_rate": 3.399253731343284e-05,
1400
+ "loss": 0.288,
1401
  "step": 1770
1402
  },
1403
  {
1404
  "epoch": 6.64,
1405
+ "grad_norm": 1.1608803272247314,
1406
+ "learning_rate": 3.361940298507463e-05,
1407
+ "loss": 0.18,
1408
  "step": 1780
1409
  },
1410
  {
1411
  "epoch": 6.68,
1412
+ "grad_norm": 2.9207170009613037,
1413
+ "learning_rate": 3.3246268656716414e-05,
1414
+ "loss": 0.2414,
1415
  "step": 1790
1416
  },
1417
  {
1418
  "epoch": 6.72,
1419
+ "grad_norm": 0.2674783170223236,
1420
+ "learning_rate": 3.287313432835821e-05,
1421
+ "loss": 0.2359,
1422
  "step": 1800
1423
  },
1424
  {
1425
  "epoch": 6.72,
1426
+ "eval_accuracy": 0.8137082601054482,
1427
+ "eval_loss": 0.7501537203788757,
1428
+ "eval_runtime": 8.1528,
1429
+ "eval_samples_per_second": 69.792,
1430
+ "eval_steps_per_second": 8.831,
1431
  "step": 1800
1432
  },
1433
  {
1434
  "epoch": 6.75,
1435
+ "grad_norm": 8.241676330566406,
1436
+ "learning_rate": 3.2500000000000004e-05,
1437
+ "loss": 0.1975,
1438
  "step": 1810
1439
  },
1440
  {
1441
  "epoch": 6.79,
1442
+ "grad_norm": 2.0347325801849365,
1443
+ "learning_rate": 3.2126865671641796e-05,
1444
+ "loss": 0.218,
1445
  "step": 1820
1446
  },
1447
  {
1448
  "epoch": 6.83,
1449
+ "grad_norm": 1.0338706970214844,
1450
+ "learning_rate": 3.175373134328358e-05,
1451
+ "loss": 0.1437,
1452
  "step": 1830
1453
  },
1454
  {
1455
  "epoch": 6.87,
1456
+ "grad_norm": 0.34902578592300415,
1457
+ "learning_rate": 3.138059701492537e-05,
1458
+ "loss": 0.1883,
1459
  "step": 1840
1460
  },
1461
  {
1462
  "epoch": 6.9,
1463
+ "grad_norm": 6.642534255981445,
1464
+ "learning_rate": 3.100746268656717e-05,
1465
+ "loss": 0.2513,
1466
  "step": 1850
1467
  },
1468
  {
1469
  "epoch": 6.94,
1470
+ "grad_norm": 4.432920455932617,
1471
+ "learning_rate": 3.0634328358208955e-05,
1472
+ "loss": 0.1058,
1473
  "step": 1860
1474
  },
1475
  {
1476
  "epoch": 6.98,
1477
+ "grad_norm": 4.381640434265137,
1478
+ "learning_rate": 3.0261194029850747e-05,
1479
+ "loss": 0.2114,
1480
  "step": 1870
1481
  },
1482
  {
1483
  "epoch": 7.01,
1484
+ "grad_norm": 7.730411529541016,
1485
+ "learning_rate": 2.9888059701492538e-05,
1486
+ "loss": 0.2542,
1487
  "step": 1880
1488
  },
1489
  {
1490
  "epoch": 7.05,
1491
+ "grad_norm": 7.122923851013184,
1492
+ "learning_rate": 2.9514925373134326e-05,
1493
+ "loss": 0.2594,
1494
  "step": 1890
1495
  },
1496
  {
1497
  "epoch": 7.09,
1498
+ "grad_norm": 1.411278486251831,
1499
+ "learning_rate": 2.9141791044776125e-05,
1500
+ "loss": 0.2322,
1501
  "step": 1900
1502
  },
1503
  {
1504
  "epoch": 7.09,
1505
+ "eval_accuracy": 0.8347978910369068,
1506
+ "eval_loss": 0.6952534317970276,
1507
+ "eval_runtime": 8.3769,
1508
+ "eval_samples_per_second": 67.925,
1509
+ "eval_steps_per_second": 8.595,
1510
  "step": 1900
1511
  },
1512
  {
1513
  "epoch": 7.13,
1514
+ "grad_norm": 2.219285011291504,
1515
+ "learning_rate": 2.8768656716417913e-05,
1516
+ "loss": 0.1344,
1517
  "step": 1910
1518
  },
1519
  {
1520
  "epoch": 7.16,
1521
+ "grad_norm": 6.302455902099609,
1522
+ "learning_rate": 2.8395522388059705e-05,
1523
+ "loss": 0.2098,
1524
  "step": 1920
1525
  },
1526
  {
1527
  "epoch": 7.2,
1528
+ "grad_norm": 1.2837783098220825,
1529
+ "learning_rate": 2.8022388059701493e-05,
1530
+ "loss": 0.0906,
1531
  "step": 1930
1532
  },
1533
  {
1534
  "epoch": 7.24,
1535
+ "grad_norm": 6.604355335235596,
1536
+ "learning_rate": 2.7649253731343284e-05,
1537
+ "loss": 0.2352,
1538
  "step": 1940
1539
  },
1540
  {
1541
  "epoch": 7.28,
1542
+ "grad_norm": 9.916419982910156,
1543
+ "learning_rate": 2.727611940298508e-05,
1544
+ "loss": 0.1422,
1545
  "step": 1950
1546
  },
1547
  {
1548
  "epoch": 7.31,
1549
+ "grad_norm": 2.7665014266967773,
1550
+ "learning_rate": 2.6902985074626868e-05,
1551
+ "loss": 0.1722,
1552
  "step": 1960
1553
  },
1554
  {
1555
  "epoch": 7.35,
1556
+ "grad_norm": 0.24231348931789398,
1557
+ "learning_rate": 2.652985074626866e-05,
1558
+ "loss": 0.2935,
1559
  "step": 1970
1560
  },
1561
  {
1562
  "epoch": 7.39,
1563
+ "grad_norm": 0.8025885224342346,
1564
+ "learning_rate": 2.6156716417910447e-05,
1565
+ "loss": 0.157,
1566
  "step": 1980
1567
  },
1568
  {
1569
  "epoch": 7.43,
1570
+ "grad_norm": 1.6752264499664307,
1571
+ "learning_rate": 2.578358208955224e-05,
1572
+ "loss": 0.1256,
1573
  "step": 1990
1574
  },
1575
  {
1576
  "epoch": 7.46,
1577
+ "grad_norm": 2.404883861541748,
1578
+ "learning_rate": 2.5410447761194027e-05,
1579
+ "loss": 0.1514,
1580
  "step": 2000
1581
  },
1582
  {
1583
  "epoch": 7.46,
1584
+ "eval_accuracy": 0.8260105448154658,
1585
+ "eval_loss": 0.7120960354804993,
1586
+ "eval_runtime": 8.1425,
1587
+ "eval_samples_per_second": 69.88,
1588
+ "eval_steps_per_second": 8.842,
1589
  "step": 2000
1590
  },
1591
  {
1592
  "epoch": 7.5,
1593
+ "grad_norm": 5.409728050231934,
1594
+ "learning_rate": 2.5037313432835825e-05,
1595
+ "loss": 0.222,
1596
  "step": 2010
1597
  },
1598
  {
1599
  "epoch": 7.54,
1600
+ "grad_norm": 3.949014663696289,
1601
+ "learning_rate": 2.4664179104477614e-05,
1602
+ "loss": 0.245,
1603
  "step": 2020
1604
  },
1605
  {
1606
  "epoch": 7.57,
1607
+ "grad_norm": 8.40086555480957,
1608
+ "learning_rate": 2.4291044776119405e-05,
1609
+ "loss": 0.1408,
1610
  "step": 2030
1611
  },
1612
  {
1613
  "epoch": 7.61,
1614
+ "grad_norm": 7.694955348968506,
1615
+ "learning_rate": 2.3917910447761197e-05,
1616
+ "loss": 0.2072,
1617
  "step": 2040
1618
  },
1619
  {
1620
  "epoch": 7.65,
1621
+ "grad_norm": 1.9109055995941162,
1622
+ "learning_rate": 2.3544776119402985e-05,
1623
+ "loss": 0.145,
1624
  "step": 2050
1625
  },
1626
  {
1627
  "epoch": 7.69,
1628
+ "grad_norm": 12.803776741027832,
1629
+ "learning_rate": 2.3171641791044777e-05,
1630
+ "loss": 0.1274,
1631
  "step": 2060
1632
  },
1633
  {
1634
  "epoch": 7.72,
1635
+ "grad_norm": 3.3325235843658447,
1636
+ "learning_rate": 2.2798507462686568e-05,
1637
+ "loss": 0.1564,
1638
  "step": 2070
1639
  },
1640
  {
1641
  "epoch": 7.76,
1642
+ "grad_norm": 1.105327844619751,
1643
+ "learning_rate": 2.242537313432836e-05,
1644
+ "loss": 0.2008,
1645
  "step": 2080
1646
  },
1647
  {
1648
  "epoch": 7.8,
1649
+ "grad_norm": 1.7592620849609375,
1650
+ "learning_rate": 2.2052238805970148e-05,
1651
+ "loss": 0.203,
1652
  "step": 2090
1653
  },
1654
  {
1655
  "epoch": 7.84,
1656
+ "grad_norm": 0.13264060020446777,
1657
+ "learning_rate": 2.1679104477611943e-05,
1658
+ "loss": 0.2089,
1659
  "step": 2100
1660
  },
1661
  {
1662
  "epoch": 7.84,
1663
+ "eval_accuracy": 0.827768014059754,
1664
+ "eval_loss": 0.693087637424469,
1665
+ "eval_runtime": 8.2375,
1666
+ "eval_samples_per_second": 69.074,
1667
+ "eval_steps_per_second": 8.741,
1668
  "step": 2100
1669
  },
1670
  {
1671
  "epoch": 7.87,
1672
+ "grad_norm": 5.904381275177002,
1673
+ "learning_rate": 2.130597014925373e-05,
1674
+ "loss": 0.1754,
1675
  "step": 2110
1676
  },
1677
  {
1678
  "epoch": 7.91,
1679
+ "grad_norm": 1.7469266653060913,
1680
+ "learning_rate": 2.0932835820895526e-05,
1681
+ "loss": 0.1322,
1682
  "step": 2120
1683
  },
1684
  {
1685
  "epoch": 7.95,
1686
+ "grad_norm": 4.313326835632324,
1687
+ "learning_rate": 2.0559701492537314e-05,
1688
+ "loss": 0.1418,
1689
  "step": 2130
1690
  },
1691
  {
1692
  "epoch": 7.99,
1693
+ "grad_norm": 0.14211903512477875,
1694
+ "learning_rate": 2.0186567164179106e-05,
1695
+ "loss": 0.1534,
1696
  "step": 2140
1697
  },
1698
  {
1699
  "epoch": 8.02,
1700
+ "grad_norm": 5.527184009552002,
1701
+ "learning_rate": 1.9813432835820897e-05,
1702
+ "loss": 0.2122,
1703
  "step": 2150
1704
  },
1705
  {
1706
  "epoch": 8.06,
1707
+ "grad_norm": 0.2312430739402771,
1708
+ "learning_rate": 1.9440298507462686e-05,
1709
+ "loss": 0.1617,
1710
  "step": 2160
1711
  },
1712
  {
1713
  "epoch": 8.1,
1714
+ "grad_norm": 0.23949085175991058,
1715
+ "learning_rate": 1.906716417910448e-05,
1716
+ "loss": 0.1286,
1717
  "step": 2170
1718
  },
1719
  {
1720
  "epoch": 8.13,
1721
+ "grad_norm": 0.1903185099363327,
1722
+ "learning_rate": 1.869402985074627e-05,
1723
+ "loss": 0.0846,
1724
  "step": 2180
1725
  },
1726
  {
1727
  "epoch": 8.17,
1728
+ "grad_norm": 0.08518023788928986,
1729
+ "learning_rate": 1.832089552238806e-05,
1730
+ "loss": 0.0801,
1731
  "step": 2190
1732
  },
1733
  {
1734
  "epoch": 8.21,
1735
+ "grad_norm": 4.424215793609619,
1736
+ "learning_rate": 1.7947761194029852e-05,
1737
+ "loss": 0.2245,
1738
  "step": 2200
1739
  },
1740
  {
1741
  "epoch": 8.21,
1742
+ "eval_accuracy": 0.8330404217926186,
1743
+ "eval_loss": 0.7087014317512512,
1744
+ "eval_runtime": 8.1117,
1745
+ "eval_samples_per_second": 70.145,
1746
+ "eval_steps_per_second": 8.876,
1747
  "step": 2200
1748
  },
1749
  {
1750
  "epoch": 8.25,
1751
+ "grad_norm": 7.247931480407715,
1752
+ "learning_rate": 1.7574626865671644e-05,
1753
+ "loss": 0.0722,
1754
  "step": 2210
1755
  },
1756
  {
1757
  "epoch": 8.28,
1758
+ "grad_norm": 4.80264949798584,
1759
+ "learning_rate": 1.7201492537313435e-05,
1760
+ "loss": 0.0844,
1761
  "step": 2220
1762
  },
1763
  {
1764
  "epoch": 8.32,
1765
+ "grad_norm": 8.001790046691895,
1766
+ "learning_rate": 1.6828358208955223e-05,
1767
+ "loss": 0.1077,
1768
  "step": 2230
1769
  },
1770
  {
1771
  "epoch": 8.36,
1772
+ "grad_norm": 5.419641017913818,
1773
+ "learning_rate": 1.6455223880597015e-05,
1774
+ "loss": 0.1627,
1775
  "step": 2240
1776
  },
1777
  {
1778
  "epoch": 8.4,
1779
+ "grad_norm": 0.031686268746852875,
1780
+ "learning_rate": 1.6082089552238806e-05,
1781
+ "loss": 0.0984,
1782
  "step": 2250
1783
  },
1784
  {
1785
  "epoch": 8.43,
1786
+ "grad_norm": 6.095193862915039,
1787
+ "learning_rate": 1.5708955223880598e-05,
1788
+ "loss": 0.1756,
1789
  "step": 2260
1790
  },
1791
  {
1792
  "epoch": 8.47,
1793
+ "grad_norm": 5.179446220397949,
1794
+ "learning_rate": 1.5335820895522386e-05,
1795
+ "loss": 0.1708,
1796
  "step": 2270
1797
  },
1798
  {
1799
  "epoch": 8.51,
1800
+ "grad_norm": 4.06497049331665,
1801
+ "learning_rate": 1.496268656716418e-05,
1802
+ "loss": 0.1493,
1803
  "step": 2280
1804
  },
1805
  {
1806
  "epoch": 8.54,
1807
+ "grad_norm": 1.4721342325210571,
1808
+ "learning_rate": 1.458955223880597e-05,
1809
+ "loss": 0.2587,
1810
  "step": 2290
1811
  },
1812
  {
1813
  "epoch": 8.58,
1814
+ "grad_norm": 4.418783664703369,
1815
+ "learning_rate": 1.4216417910447763e-05,
1816
+ "loss": 0.1328,
1817
  "step": 2300
1818
  },
1819
  {
1820
  "epoch": 8.58,
1821
+ "eval_accuracy": 0.8312829525483304,
1822
+ "eval_loss": 0.700339674949646,
1823
+ "eval_runtime": 8.481,
1824
+ "eval_samples_per_second": 67.091,
1825
+ "eval_steps_per_second": 8.49,
1826
  "step": 2300
1827
  },
1828
  {
1829
  "epoch": 8.62,
1830
+ "grad_norm": 1.5734038352966309,
1831
+ "learning_rate": 1.3843283582089553e-05,
1832
+ "loss": 0.165,
1833
  "step": 2310
1834
  },
1835
  {
1836
  "epoch": 8.66,
1837
+ "grad_norm": 2.624784231185913,
1838
+ "learning_rate": 1.3470149253731342e-05,
1839
+ "loss": 0.0837,
1840
  "step": 2320
1841
  },
1842
  {
1843
  "epoch": 8.69,
1844
+ "grad_norm": 2.7039573192596436,
1845
+ "learning_rate": 1.3097014925373136e-05,
1846
+ "loss": 0.2098,
1847
  "step": 2330
1848
  },
1849
  {
1850
  "epoch": 8.73,
1851
+ "grad_norm": 6.542816638946533,
1852
+ "learning_rate": 1.2723880597014926e-05,
1853
+ "loss": 0.129,
1854
  "step": 2340
1855
  },
1856
  {
1857
  "epoch": 8.77,
1858
+ "grad_norm": 2.9511120319366455,
1859
+ "learning_rate": 1.2350746268656717e-05,
1860
+ "loss": 0.1762,
1861
  "step": 2350
1862
  },
1863
  {
1864
  "epoch": 8.81,
1865
+ "grad_norm": 3.435502529144287,
1866
+ "learning_rate": 1.1977611940298509e-05,
1867
+ "loss": 0.1345,
1868
  "step": 2360
1869
  },
1870
  {
1871
  "epoch": 8.84,
1872
+ "grad_norm": 2.1689364910125732,
1873
+ "learning_rate": 1.1604477611940299e-05,
1874
+ "loss": 0.1011,
1875
  "step": 2370
1876
  },
1877
  {
1878
  "epoch": 8.88,
1879
+ "grad_norm": 2.3366479873657227,
1880
+ "learning_rate": 1.123134328358209e-05,
1881
+ "loss": 0.1733,
1882
  "step": 2380
1883
  },
1884
  {
1885
  "epoch": 8.92,
1886
+ "grad_norm": 5.928171634674072,
1887
+ "learning_rate": 1.085820895522388e-05,
1888
+ "loss": 0.1089,
1889
  "step": 2390
1890
  },
1891
  {
1892
  "epoch": 8.96,
1893
+ "grad_norm": 0.08636012673377991,
1894
+ "learning_rate": 1.0485074626865672e-05,
1895
+ "loss": 0.1304,
1896
  "step": 2400
1897
  },
1898
  {
1899
  "epoch": 8.96,
1900
+ "eval_accuracy": 0.8224956063268892,
1901
+ "eval_loss": 0.7306046485900879,
1902
+ "eval_runtime": 8.4262,
1903
+ "eval_samples_per_second": 67.528,
1904
+ "eval_steps_per_second": 8.545,
1905
  "step": 2400
1906
  },
1907
  {
1908
  "epoch": 8.99,
1909
+ "grad_norm": 0.14256200194358826,
1910
+ "learning_rate": 1.0111940298507463e-05,
1911
+ "loss": 0.1506,
1912
  "step": 2410
1913
  },
1914
  {
1915
  "epoch": 9.03,
1916
+ "grad_norm": 0.4166848659515381,
1917
+ "learning_rate": 9.738805970149255e-06,
1918
+ "loss": 0.2058,
1919
  "step": 2420
1920
  },
1921
  {
1922
  "epoch": 9.07,
1923
+ "grad_norm": 0.3997032344341278,
1924
+ "learning_rate": 9.365671641791045e-06,
1925
+ "loss": 0.0482,
1926
  "step": 2430
1927
  },
1928
  {
1929
  "epoch": 9.1,
1930
+ "grad_norm": 9.076058387756348,
1931
+ "learning_rate": 8.992537313432836e-06,
1932
+ "loss": 0.2201,
1933
  "step": 2440
1934
  },
1935
  {
1936
  "epoch": 9.14,
1937
+ "grad_norm": 4.368849277496338,
1938
+ "learning_rate": 8.619402985074628e-06,
1939
+ "loss": 0.1288,
1940
  "step": 2450
1941
  },
1942
  {
1943
  "epoch": 9.18,
1944
+ "grad_norm": 4.311466693878174,
1945
+ "learning_rate": 8.24626865671642e-06,
1946
+ "loss": 0.3058,
1947
  "step": 2460
1948
  },
1949
  {
1950
  "epoch": 9.22,
1951
+ "grad_norm": 0.2911408543586731,
1952
+ "learning_rate": 7.87313432835821e-06,
1953
+ "loss": 0.1303,
1954
  "step": 2470
1955
  },
1956
  {
1957
  "epoch": 9.25,
1958
+ "grad_norm": 5.493233680725098,
1959
+ "learning_rate": 7.5e-06,
1960
+ "loss": 0.0915,
1961
  "step": 2480
1962
  },
1963
  {
1964
  "epoch": 9.29,
1965
+ "grad_norm": 0.09431172162294388,
1966
+ "learning_rate": 7.126865671641792e-06,
1967
+ "loss": 0.0954,
1968
  "step": 2490
1969
  },
1970
  {
1971
  "epoch": 9.33,
1972
+ "grad_norm": 1.8603869676589966,
1973
+ "learning_rate": 6.7537313432835825e-06,
1974
+ "loss": 0.1514,
1975
  "step": 2500
1976
  },
1977
  {
1978
  "epoch": 9.33,
1979
+ "eval_accuracy": 0.8260105448154658,
1980
+ "eval_loss": 0.7162156701087952,
1981
+ "eval_runtime": 8.3201,
1982
+ "eval_samples_per_second": 68.389,
1983
+ "eval_steps_per_second": 8.654,
1984
  "step": 2500
1985
  },
1986
  {
1987
  "epoch": 9.37,
1988
+ "grad_norm": 4.870584964752197,
1989
+ "learning_rate": 6.380597014925374e-06,
1990
+ "loss": 0.1354,
1991
  "step": 2510
1992
  },
1993
  {
1994
  "epoch": 9.4,
1995
+ "grad_norm": 2.316840410232544,
1996
+ "learning_rate": 6.007462686567165e-06,
1997
+ "loss": 0.1348,
1998
  "step": 2520
1999
  },
2000
  {
2001
  "epoch": 9.44,
2002
+ "grad_norm": 1.9005101919174194,
2003
+ "learning_rate": 5.6343283582089556e-06,
2004
+ "loss": 0.1755,
2005
  "step": 2530
2006
  },
2007
  {
2008
  "epoch": 9.48,
2009
+ "grad_norm": 0.1674620360136032,
2010
+ "learning_rate": 5.261194029850746e-06,
2011
+ "loss": 0.0878,
2012
  "step": 2540
2013
  },
2014
  {
2015
  "epoch": 9.51,
2016
+ "grad_norm": 5.729959011077881,
2017
+ "learning_rate": 4.888059701492537e-06,
2018
+ "loss": 0.1637,
2019
  "step": 2550
2020
  },
2021
  {
2022
  "epoch": 9.55,
2023
+ "grad_norm": 0.02724504843354225,
2024
+ "learning_rate": 4.514925373134329e-06,
2025
+ "loss": 0.1603,
2026
  "step": 2560
2027
  },
2028
  {
2029
  "epoch": 9.59,
2030
+ "grad_norm": 2.728663921356201,
2031
+ "learning_rate": 4.141791044776119e-06,
2032
+ "loss": 0.1152,
2033
  "step": 2570
2034
  },
2035
  {
2036
  "epoch": 9.63,
2037
+ "grad_norm": 8.920695304870605,
2038
+ "learning_rate": 3.7686567164179105e-06,
2039
+ "loss": 0.1964,
2040
  "step": 2580
2041
  },
2042
  {
2043
  "epoch": 9.66,
2044
+ "grad_norm": 2.3974239826202393,
2045
+ "learning_rate": 3.3955223880597013e-06,
2046
+ "loss": 0.0842,
2047
  "step": 2590
2048
  },
2049
  {
2050
  "epoch": 9.7,
2051
+ "grad_norm": 1.6431355476379395,
2052
+ "learning_rate": 3.022388059701493e-06,
2053
+ "loss": 0.2571,
2054
  "step": 2600
2055
  },
2056
  {
2057
  "epoch": 9.7,
2058
+ "eval_accuracy": 0.8347978910369068,
2059
+ "eval_loss": 0.7012546062469482,
2060
+ "eval_runtime": 8.3265,
2061
+ "eval_samples_per_second": 68.336,
2062
+ "eval_steps_per_second": 8.647,
2063
  "step": 2600
2064
  },
2065
  {
2066
  "epoch": 9.74,
2067
+ "grad_norm": 0.10621854662895203,
2068
+ "learning_rate": 2.6492537313432836e-06,
2069
+ "loss": 0.2632,
2070
  "step": 2610
2071
  },
2072
  {
2073
  "epoch": 9.78,
2074
+ "grad_norm": 4.150152206420898,
2075
+ "learning_rate": 2.2761194029850747e-06,
2076
+ "loss": 0.2804,
2077
  "step": 2620
2078
  },
2079
  {
2080
  "epoch": 9.81,
2081
+ "grad_norm": 4.01139497756958,
2082
+ "learning_rate": 1.9029850746268657e-06,
2083
+ "loss": 0.1696,
2084
  "step": 2630
2085
  },
2086
  {
2087
  "epoch": 9.85,
2088
+ "grad_norm": 4.7402262687683105,
2089
+ "learning_rate": 1.5298507462686568e-06,
2090
+ "loss": 0.1891,
2091
  "step": 2640
2092
  },
2093
  {
2094
  "epoch": 9.89,
2095
+ "grad_norm": 4.460111141204834,
2096
+ "learning_rate": 1.1567164179104478e-06,
2097
+ "loss": 0.1178,
2098
  "step": 2650
2099
  },
2100
  {
2101
  "epoch": 9.93,
2102
+ "grad_norm": 5.822507858276367,
2103
+ "learning_rate": 7.835820895522387e-07,
2104
+ "loss": 0.089,
2105
  "step": 2660
2106
  },
2107
  {
2108
  "epoch": 9.96,
2109
+ "grad_norm": 2.4408085346221924,
2110
+ "learning_rate": 4.1044776119402984e-07,
2111
+ "loss": 0.158,
2112
  "step": 2670
2113
  },
2114
  {
2115
  "epoch": 10.0,
2116
+ "grad_norm": 10.792135238647461,
2117
+ "learning_rate": 3.7313432835820895e-08,
2118
+ "loss": 0.2038,
2119
  "step": 2680
2120
  },
2121
  {
2122
  "epoch": 10.0,
2123
  "step": 2680,
2124
  "total_flos": 3.3230947683690086e+18,
2125
+ "train_loss": 0.23535207314277762,
2126
+ "train_runtime": 1371.8304,
2127
+ "train_samples_per_second": 31.258,
2128
+ "train_steps_per_second": 1.954
2129
  }
2130
  ],
2131
  "logging_steps": 10,