sharren commited on
Commit
974c2a8
1 Parent(s): dd9342e

🍻 cheers

Browse files
README.md CHANGED
@@ -2,6 +2,7 @@
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224
4
  tags:
 
5
  - generated_from_trainer
6
  metrics:
7
  - accuracy
@@ -18,13 +19,13 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  # vit-lr-cosine-restarts
20
 
21
- This model is a fine-tuned version of [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) on an unknown dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 0.7405
24
- - Accuracy: 0.8533
25
- - Precision: 0.8523
26
- - Recall: 0.8533
27
- - F1: 0.8511
28
 
29
  ## Model description
30
 
 
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224
4
  tags:
5
+ - image-classification
6
  - generated_from_trainer
7
  metrics:
8
  - accuracy
 
19
 
20
  # vit-lr-cosine-restarts
21
 
22
+ This model is a fine-tuned version of [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) on the skin-cancer dataset.
23
  It achieves the following results on the evaluation set:
24
+ - Loss: 0.4453
25
+ - Accuracy: 0.8464
26
+ - Precision: 0.8464
27
+ - Recall: 0.8464
28
+ - F1: 0.8438
29
 
30
  ## Model description
31
 
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
  "epoch": 5.61,
3
- "eval_accuracy": 0.8349514563106796,
4
- "eval_f1": 0.8328726444655051,
5
- "eval_loss": 0.4524156153202057,
6
- "eval_precision": 0.8389745564359966,
7
- "eval_recall": 0.8349514563106796,
8
- "eval_runtime": 39.837,
9
- "eval_samples_per_second": 72.395,
10
- "eval_steps_per_second": 9.062,
11
  "total_flos": 2.2287694956200755e+18,
12
- "train_loss": 0.41900131742159524,
13
- "train_runtime": 1406.907,
14
- "train_samples_per_second": 364.487,
15
- "train_steps_per_second": 22.816
16
  }
 
1
  {
2
  "epoch": 5.61,
3
+ "eval_accuracy": 0.8463938973647711,
4
+ "eval_f1": 0.8437993661883203,
5
+ "eval_loss": 0.4453237056732178,
6
+ "eval_precision": 0.8463641738950213,
7
+ "eval_recall": 0.8463938973647711,
8
+ "eval_runtime": 38.0107,
9
+ "eval_samples_per_second": 75.873,
10
+ "eval_steps_per_second": 9.497,
11
  "total_flos": 2.2287694956200755e+18,
12
+ "train_loss": 0.2811500767639114,
13
+ "train_runtime": 1301.746,
14
+ "train_samples_per_second": 393.932,
15
+ "train_steps_per_second": 24.659
16
  }
eval_results.json CHANGED
@@ -1,11 +1,11 @@
1
  {
2
  "epoch": 5.61,
3
- "eval_accuracy": 0.8349514563106796,
4
- "eval_f1": 0.8328726444655051,
5
- "eval_loss": 0.4524156153202057,
6
- "eval_precision": 0.8389745564359966,
7
- "eval_recall": 0.8349514563106796,
8
- "eval_runtime": 39.837,
9
- "eval_samples_per_second": 72.395,
10
- "eval_steps_per_second": 9.062
11
  }
 
1
  {
2
  "epoch": 5.61,
3
+ "eval_accuracy": 0.8463938973647711,
4
+ "eval_f1": 0.8437993661883203,
5
+ "eval_loss": 0.4453237056732178,
6
+ "eval_precision": 0.8463641738950213,
7
+ "eval_recall": 0.8463938973647711,
8
+ "eval_runtime": 38.0107,
9
+ "eval_samples_per_second": 75.873,
10
+ "eval_steps_per_second": 9.497
11
  }
runs/Mar19_04-smaller_warmup/events.out.tfevents.1710821117.6492c5bf3fae.6515.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e9fd08d9e079be94e02274b35278b6d5a9bc04ec08c5f79c7f0ddc5729b786d
3
+ size 51502
runs/Mar19_04-smaller_warmup/events.out.tfevents.1710822491.6492c5bf3fae.6515.3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a0d8b2b7cfec210dbce4ba7c7bc1462a88ec2420f966f908f4a3ad08815e9df
3
+ size 560
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 5.61,
3
  "total_flos": 2.2287694956200755e+18,
4
- "train_loss": 0.41900131742159524,
5
- "train_runtime": 1406.907,
6
- "train_samples_per_second": 364.487,
7
- "train_steps_per_second": 22.816
8
  }
 
1
  {
2
  "epoch": 5.61,
3
  "total_flos": 2.2287694956200755e+18,
4
+ "train_loss": 0.2811500767639114,
5
+ "train_runtime": 1301.746,
6
+ "train_samples_per_second": 393.932,
7
+ "train_steps_per_second": 24.659
8
  }
trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 0.4524156153202057,
3
  "best_model_checkpoint": "./vit-lr-cosine-restarts/checkpoint-800",
4
  "epoch": 5.607476635514018,
5
  "eval_steps": 100,
@@ -10,1488 +10,1488 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.03,
13
- "grad_norm": Infinity,
14
- "learning_rate": 5.000000000000001e-07,
15
- "loss": 2.4884,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.06,
20
- "grad_norm": 19.190855026245117,
21
- "learning_rate": 1.125e-06,
22
- "loss": 2.411,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.09,
27
- "grad_norm": 16.122661590576172,
28
- "learning_rate": 1.7500000000000002e-06,
29
- "loss": 2.2596,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.12,
34
- "grad_norm": 11.49656867980957,
35
- "learning_rate": 2.375e-06,
36
- "loss": 2.1154,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.16,
41
- "grad_norm": 14.204788208007812,
42
- "learning_rate": 3e-06,
43
- "loss": 1.7711,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.19,
48
- "grad_norm": 8.722100257873535,
49
- "learning_rate": 3.625e-06,
50
- "loss": 1.4865,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.22,
55
- "grad_norm": 7.642684459686279,
56
- "learning_rate": 4.250000000000001e-06,
57
- "loss": 1.282,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.25,
62
- "grad_norm": 5.861067295074463,
63
- "learning_rate": 4.875000000000001e-06,
64
- "loss": 1.06,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.28,
69
- "grad_norm": 5.785488128662109,
70
- "learning_rate": 5.500000000000001e-06,
71
- "loss": 0.9867,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.31,
76
- "grad_norm": 5.735620021820068,
77
- "learning_rate": 6.125e-06,
78
- "loss": 0.9572,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.31,
83
- "eval_accuracy": 0.6785714285714286,
84
- "eval_f1": 0.576811217156952,
85
- "eval_loss": 0.9378232359886169,
86
- "eval_precision": 0.534687120046434,
87
- "eval_recall": 0.6785714285714286,
88
- "eval_runtime": 38.8342,
89
- "eval_samples_per_second": 74.265,
90
- "eval_steps_per_second": 9.296,
91
  "step": 100
92
  },
93
  {
94
  "epoch": 0.34,
95
- "grad_norm": 5.473198890686035,
96
- "learning_rate": 6.750000000000001e-06,
97
- "loss": 0.8799,
98
  "step": 110
99
  },
100
  {
101
  "epoch": 0.37,
102
- "grad_norm": 5.595673084259033,
103
- "learning_rate": 7.375e-06,
104
- "loss": 0.8161,
105
  "step": 120
106
  },
107
  {
108
  "epoch": 0.4,
109
- "grad_norm": 7.84308385848999,
110
- "learning_rate": 8.000000000000001e-06,
111
- "loss": 0.9048,
112
  "step": 130
113
  },
114
  {
115
  "epoch": 0.44,
116
- "grad_norm": 3.7328169345855713,
117
- "learning_rate": 8.625e-06,
118
- "loss": 0.7422,
119
  "step": 140
120
  },
121
  {
122
  "epoch": 0.47,
123
- "grad_norm": 5.750851631164551,
124
- "learning_rate": 9.25e-06,
125
- "loss": 0.8141,
126
  "step": 150
127
  },
128
  {
129
  "epoch": 0.5,
130
- "grad_norm": 5.3880295753479,
131
- "learning_rate": 9.875000000000001e-06,
132
- "loss": 0.7135,
133
  "step": 160
134
  },
135
  {
136
  "epoch": 0.53,
137
- "grad_norm": 5.539682388305664,
138
- "learning_rate": 1.05e-05,
139
- "loss": 0.8333,
140
  "step": 170
141
  },
142
  {
143
  "epoch": 0.56,
144
- "grad_norm": 6.282279968261719,
145
- "learning_rate": 1.1125000000000001e-05,
146
- "loss": 0.7989,
147
  "step": 180
148
  },
149
  {
150
  "epoch": 0.59,
151
- "grad_norm": 7.1601362228393555,
152
- "learning_rate": 1.175e-05,
153
- "loss": 0.9196,
154
  "step": 190
155
  },
156
  {
157
  "epoch": 0.62,
158
- "grad_norm": 6.748279571533203,
159
- "learning_rate": 1.2375000000000001e-05,
160
- "loss": 0.7576,
161
  "step": 200
162
  },
163
  {
164
  "epoch": 0.62,
165
- "eval_accuracy": 0.7170596393897365,
166
- "eval_f1": 0.6481913495629226,
167
- "eval_loss": 0.7868019342422485,
168
- "eval_precision": 0.6468741335888096,
169
- "eval_recall": 0.7170596393897365,
170
- "eval_runtime": 38.5481,
171
- "eval_samples_per_second": 74.816,
172
- "eval_steps_per_second": 9.365,
173
  "step": 200
174
  },
175
  {
176
  "epoch": 0.65,
177
- "grad_norm": 5.007500171661377,
178
- "learning_rate": 1.3000000000000001e-05,
179
- "loss": 0.7059,
180
  "step": 210
181
  },
182
  {
183
  "epoch": 0.69,
184
- "grad_norm": 7.129433631896973,
185
- "learning_rate": 1.3625e-05,
186
- "loss": 0.7989,
187
  "step": 220
188
  },
189
  {
190
  "epoch": 0.72,
191
- "grad_norm": 12.505253791809082,
192
- "learning_rate": 1.4249999999999999e-05,
193
- "loss": 0.5823,
194
  "step": 230
195
  },
196
  {
197
  "epoch": 0.75,
198
- "grad_norm": 4.6094865798950195,
199
- "learning_rate": 1.4875e-05,
200
- "loss": 0.6079,
201
  "step": 240
202
  },
203
  {
204
  "epoch": 0.78,
205
- "grad_norm": 8.153918266296387,
206
- "learning_rate": 1.55e-05,
207
- "loss": 0.7762,
208
  "step": 250
209
  },
210
  {
211
  "epoch": 0.81,
212
- "grad_norm": 6.411535263061523,
213
- "learning_rate": 1.6125000000000002e-05,
214
- "loss": 0.5625,
215
  "step": 260
216
  },
217
  {
218
  "epoch": 0.84,
219
- "grad_norm": 5.9518818855285645,
220
- "learning_rate": 1.675e-05,
221
- "loss": 0.6057,
222
  "step": 270
223
  },
224
  {
225
  "epoch": 0.87,
226
- "grad_norm": 5.535661697387695,
227
- "learning_rate": 1.7375e-05,
228
- "loss": 0.6431,
229
  "step": 280
230
  },
231
  {
232
  "epoch": 0.9,
233
- "grad_norm": 7.387240886688232,
234
- "learning_rate": 1.8e-05,
235
- "loss": 0.6604,
236
  "step": 290
237
  },
238
  {
239
  "epoch": 0.93,
240
- "grad_norm": 5.090869903564453,
241
- "learning_rate": 1.8625000000000002e-05,
242
- "loss": 0.793,
243
  "step": 300
244
  },
245
  {
246
  "epoch": 0.93,
247
- "eval_accuracy": 0.7812066574202496,
248
- "eval_f1": 0.7641221117942969,
249
- "eval_loss": 0.6202540993690491,
250
- "eval_precision": 0.7717978736248653,
251
- "eval_recall": 0.7812066574202496,
252
- "eval_runtime": 40.568,
253
- "eval_samples_per_second": 71.091,
254
- "eval_steps_per_second": 8.899,
255
  "step": 300
256
  },
257
  {
258
  "epoch": 0.97,
259
- "grad_norm": 4.731078147888184,
260
- "learning_rate": 1.925e-05,
261
- "loss": 0.6278,
262
  "step": 310
263
  },
264
  {
265
  "epoch": 1.0,
266
- "grad_norm": 7.773807048797607,
267
- "learning_rate": 1.9875000000000002e-05,
268
- "loss": 0.6535,
269
  "step": 320
270
  },
271
  {
272
  "epoch": 1.03,
273
- "grad_norm": 5.556732654571533,
274
- "learning_rate": 2.05e-05,
275
- "loss": 0.5634,
276
  "step": 330
277
  },
278
  {
279
  "epoch": 1.06,
280
- "grad_norm": 5.828562259674072,
281
- "learning_rate": 2.1125000000000002e-05,
282
- "loss": 0.4674,
283
  "step": 340
284
  },
285
  {
286
  "epoch": 1.09,
287
- "grad_norm": 4.718760967254639,
288
- "learning_rate": 2.175e-05,
289
- "loss": 0.4694,
290
  "step": 350
291
  },
292
  {
293
  "epoch": 1.12,
294
- "grad_norm": 3.8434319496154785,
295
- "learning_rate": 2.2375000000000002e-05,
296
- "loss": 0.503,
297
  "step": 360
298
  },
299
  {
300
  "epoch": 1.15,
301
- "grad_norm": 4.510343074798584,
302
- "learning_rate": 2.3000000000000003e-05,
303
- "loss": 0.4857,
304
  "step": 370
305
  },
306
  {
307
  "epoch": 1.18,
308
- "grad_norm": 8.198539733886719,
309
- "learning_rate": 2.3624999999999998e-05,
310
- "loss": 0.4871,
311
  "step": 380
312
  },
313
  {
314
  "epoch": 1.21,
315
- "grad_norm": 7.015860080718994,
316
- "learning_rate": 2.425e-05,
317
- "loss": 0.5578,
318
  "step": 390
319
  },
320
  {
321
  "epoch": 1.25,
322
- "grad_norm": 6.530871391296387,
323
- "learning_rate": 2.4875e-05,
324
- "loss": 0.4895,
325
  "step": 400
326
  },
327
  {
328
  "epoch": 1.25,
329
- "eval_accuracy": 0.7981969486823856,
330
- "eval_f1": 0.7908168076777717,
331
- "eval_loss": 0.544183611869812,
332
- "eval_precision": 0.7914661144847153,
333
- "eval_recall": 0.7981969486823856,
334
- "eval_runtime": 38.6563,
335
- "eval_samples_per_second": 74.606,
336
- "eval_steps_per_second": 9.339,
337
  "step": 400
338
  },
339
  {
340
  "epoch": 1.28,
341
- "grad_norm": 6.384176254272461,
342
- "learning_rate": 2.5500000000000003e-05,
343
- "loss": 0.489,
344
  "step": 410
345
  },
346
  {
347
  "epoch": 1.31,
348
- "grad_norm": 4.7396650314331055,
349
- "learning_rate": 2.6124999999999998e-05,
350
- "loss": 0.4549,
351
  "step": 420
352
  },
353
  {
354
  "epoch": 1.34,
355
- "grad_norm": 5.204756259918213,
356
- "learning_rate": 2.6750000000000003e-05,
357
- "loss": 0.6083,
358
  "step": 430
359
  },
360
  {
361
  "epoch": 1.37,
362
- "grad_norm": 6.31406831741333,
363
- "learning_rate": 2.7375e-05,
364
- "loss": 0.5462,
365
  "step": 440
366
  },
367
  {
368
  "epoch": 1.4,
369
- "grad_norm": 4.303152561187744,
370
- "learning_rate": 2.8000000000000003e-05,
371
- "loss": 0.5187,
372
  "step": 450
373
  },
374
  {
375
  "epoch": 1.43,
376
- "grad_norm": 8.125056266784668,
377
- "learning_rate": 2.8625e-05,
378
- "loss": 0.4627,
379
  "step": 460
380
  },
381
  {
382
  "epoch": 1.46,
383
- "grad_norm": 4.409249782562256,
384
- "learning_rate": 2.925e-05,
385
- "loss": 0.3547,
386
  "step": 470
387
  },
388
  {
389
  "epoch": 1.5,
390
- "grad_norm": 11.175352096557617,
391
- "learning_rate": 2.9875000000000004e-05,
392
- "loss": 0.4459,
393
  "step": 480
394
  },
395
  {
396
  "epoch": 1.53,
397
- "grad_norm": 7.291630744934082,
398
- "learning_rate": 3.05e-05,
399
- "loss": 0.5164,
400
  "step": 490
401
  },
402
  {
403
  "epoch": 1.56,
404
- "grad_norm": 3.5186641216278076,
405
- "learning_rate": 3.1125000000000004e-05,
406
- "loss": 0.416,
407
  "step": 500
408
  },
409
  {
410
  "epoch": 1.56,
411
- "eval_accuracy": 0.8110263522884882,
412
- "eval_f1": 0.795727098474235,
413
- "eval_loss": 0.5408079624176025,
414
- "eval_precision": 0.8222736826602907,
415
- "eval_recall": 0.8110263522884882,
416
- "eval_runtime": 38.9648,
417
- "eval_samples_per_second": 74.016,
418
- "eval_steps_per_second": 9.265,
419
  "step": 500
420
  },
421
  {
422
  "epoch": 1.59,
423
- "grad_norm": 4.688148498535156,
424
- "learning_rate": 3.175e-05,
425
- "loss": 0.5085,
426
  "step": 510
427
  },
428
  {
429
  "epoch": 1.62,
430
- "grad_norm": 4.2772216796875,
431
- "learning_rate": 3.2375e-05,
432
- "loss": 0.4654,
433
  "step": 520
434
  },
435
  {
436
  "epoch": 1.65,
437
- "grad_norm": 8.14700698852539,
438
- "learning_rate": 3.3e-05,
439
- "loss": 0.464,
440
  "step": 530
441
  },
442
  {
443
  "epoch": 1.68,
444
- "grad_norm": 4.734298229217529,
445
- "learning_rate": 3.3625000000000004e-05,
446
- "loss": 0.3713,
447
  "step": 540
448
  },
449
  {
450
  "epoch": 1.71,
451
- "grad_norm": 5.7068915367126465,
452
- "learning_rate": 3.4250000000000006e-05,
453
- "loss": 0.4957,
454
  "step": 550
455
  },
456
  {
457
  "epoch": 1.74,
458
- "grad_norm": 4.455660820007324,
459
- "learning_rate": 3.4875e-05,
460
- "loss": 0.4147,
461
  "step": 560
462
  },
463
  {
464
  "epoch": 1.78,
465
- "grad_norm": 5.888510227203369,
466
- "learning_rate": 3.55e-05,
467
- "loss": 0.4616,
468
  "step": 570
469
  },
470
  {
471
  "epoch": 1.81,
472
- "grad_norm": 4.628395080566406,
473
- "learning_rate": 3.6125000000000004e-05,
474
- "loss": 0.3903,
475
  "step": 580
476
  },
477
  {
478
  "epoch": 1.84,
479
- "grad_norm": 6.706481456756592,
480
- "learning_rate": 3.675e-05,
481
- "loss": 0.4902,
482
  "step": 590
483
  },
484
  {
485
  "epoch": 1.87,
486
- "grad_norm": 4.396645545959473,
487
- "learning_rate": 3.737500000000001e-05,
488
- "loss": 0.4463,
489
  "step": 600
490
  },
491
  {
492
  "epoch": 1.87,
493
- "eval_accuracy": 0.7656033287101248,
494
- "eval_f1": 0.7762896609298523,
495
- "eval_loss": 0.6204918026924133,
496
- "eval_precision": 0.8403991265428096,
497
- "eval_recall": 0.7656033287101248,
498
- "eval_runtime": 38.6646,
499
- "eval_samples_per_second": 74.59,
500
- "eval_steps_per_second": 9.337,
501
  "step": 600
502
  },
503
  {
504
  "epoch": 1.9,
505
- "grad_norm": 6.707110404968262,
506
- "learning_rate": 3.8e-05,
507
- "loss": 0.5598,
508
  "step": 610
509
  },
510
  {
511
  "epoch": 1.93,
512
- "grad_norm": 4.102792263031006,
513
- "learning_rate": 3.8625e-05,
514
- "loss": 0.3939,
515
  "step": 620
516
  },
517
  {
518
  "epoch": 1.96,
519
- "grad_norm": 5.277581691741943,
520
- "learning_rate": 3.9250000000000005e-05,
521
- "loss": 0.3826,
522
  "step": 630
523
  },
524
  {
525
  "epoch": 1.99,
526
- "grad_norm": 4.773111343383789,
527
- "learning_rate": 3.9875e-05,
528
- "loss": 0.4986,
529
  "step": 640
530
  },
531
  {
532
  "epoch": 2.02,
533
- "grad_norm": 5.68511438369751,
534
- "learning_rate": 4.05e-05,
535
- "loss": 0.2855,
536
  "step": 650
537
  },
538
  {
539
  "epoch": 2.06,
540
- "grad_norm": 5.505866527557373,
541
- "learning_rate": 4.1125000000000004e-05,
542
- "loss": 0.3395,
543
  "step": 660
544
  },
545
  {
546
  "epoch": 2.09,
547
- "grad_norm": 4.09190559387207,
548
- "learning_rate": 4.175e-05,
549
- "loss": 0.2798,
550
  "step": 670
551
  },
552
  {
553
  "epoch": 2.12,
554
- "grad_norm": 5.384827613830566,
555
- "learning_rate": 4.237500000000001e-05,
556
- "loss": 0.286,
557
  "step": 680
558
  },
559
  {
560
  "epoch": 2.15,
561
- "grad_norm": 4.030750751495361,
562
- "learning_rate": 4.3e-05,
563
- "loss": 0.2659,
564
  "step": 690
565
  },
566
  {
567
  "epoch": 2.18,
568
- "grad_norm": 6.827621936798096,
569
- "learning_rate": 4.3625e-05,
570
- "loss": 0.206,
571
  "step": 700
572
  },
573
  {
574
  "epoch": 2.18,
575
- "eval_accuracy": 0.8304438280166435,
576
- "eval_f1": 0.8244335746464728,
577
- "eval_loss": 0.4993675947189331,
578
- "eval_precision": 0.8347840150379366,
579
- "eval_recall": 0.8304438280166435,
580
- "eval_runtime": 38.7905,
581
- "eval_samples_per_second": 74.348,
582
- "eval_steps_per_second": 9.306,
583
  "step": 700
584
  },
585
  {
586
  "epoch": 2.21,
587
- "grad_norm": 2.023770809173584,
588
- "learning_rate": 4.4250000000000005e-05,
589
- "loss": 0.258,
590
  "step": 710
591
  },
592
  {
593
  "epoch": 2.24,
594
- "grad_norm": 9.621185302734375,
595
- "learning_rate": 4.4875e-05,
596
- "loss": 0.4616,
597
  "step": 720
598
  },
599
  {
600
  "epoch": 2.27,
601
- "grad_norm": 6.095046520233154,
602
- "learning_rate": 4.55e-05,
603
- "loss": 0.3927,
604
  "step": 730
605
  },
606
  {
607
  "epoch": 2.31,
608
- "grad_norm": 5.082390308380127,
609
- "learning_rate": 4.6125e-05,
610
- "loss": 0.2643,
611
  "step": 740
612
  },
613
  {
614
  "epoch": 2.34,
615
- "grad_norm": 5.96766996383667,
616
- "learning_rate": 4.6750000000000005e-05,
617
- "loss": 0.2964,
618
  "step": 750
619
  },
620
  {
621
  "epoch": 2.37,
622
- "grad_norm": 4.461874485015869,
623
- "learning_rate": 4.7375e-05,
624
- "loss": 0.2321,
625
  "step": 760
626
  },
627
  {
628
  "epoch": 2.4,
629
- "grad_norm": 7.7858171463012695,
630
- "learning_rate": 4.8e-05,
631
- "loss": 0.3646,
632
  "step": 770
633
  },
634
  {
635
  "epoch": 2.43,
636
- "grad_norm": 7.20878267288208,
637
- "learning_rate": 4.8625e-05,
638
- "loss": 0.4159,
639
  "step": 780
640
  },
641
  {
642
  "epoch": 2.46,
643
- "grad_norm": 8.424154281616211,
644
- "learning_rate": 4.9250000000000004e-05,
645
- "loss": 0.3511,
646
  "step": 790
647
  },
648
  {
649
  "epoch": 2.49,
650
- "grad_norm": 3.951247215270996,
651
- "learning_rate": 4.9875000000000006e-05,
652
- "loss": 0.4006,
653
  "step": 800
654
  },
655
  {
656
  "epoch": 2.49,
657
- "eval_accuracy": 0.8349514563106796,
658
- "eval_f1": 0.8328726444655051,
659
- "eval_loss": 0.4524156153202057,
660
- "eval_precision": 0.8389745564359966,
661
- "eval_recall": 0.8349514563106796,
662
- "eval_runtime": 39.502,
663
- "eval_samples_per_second": 73.009,
664
- "eval_steps_per_second": 9.139,
665
  "step": 800
666
  },
667
  {
668
  "epoch": 2.52,
669
- "grad_norm": 5.085056304931641,
670
- "learning_rate": 5.05e-05,
671
- "loss": 0.3792,
672
  "step": 810
673
  },
674
  {
675
  "epoch": 2.55,
676
- "grad_norm": 5.90117073059082,
677
- "learning_rate": 5.1125e-05,
678
- "loss": 0.2828,
679
  "step": 820
680
  },
681
  {
682
  "epoch": 2.59,
683
- "grad_norm": 6.909815311431885,
684
- "learning_rate": 5.175e-05,
685
- "loss": 0.3099,
686
  "step": 830
687
  },
688
  {
689
  "epoch": 2.62,
690
- "grad_norm": 10.736394882202148,
691
- "learning_rate": 5.2375000000000006e-05,
692
- "loss": 0.2778,
693
  "step": 840
694
  },
695
  {
696
  "epoch": 2.65,
697
- "grad_norm": 2.348313570022583,
698
- "learning_rate": 5.300000000000001e-05,
699
- "loss": 0.2706,
700
  "step": 850
701
  },
702
  {
703
  "epoch": 2.68,
704
- "grad_norm": 3.5794482231140137,
705
- "learning_rate": 5.3625e-05,
706
- "loss": 0.2288,
707
  "step": 860
708
  },
709
  {
710
  "epoch": 2.71,
711
- "grad_norm": 6.5003461837768555,
712
- "learning_rate": 5.4250000000000004e-05,
713
- "loss": 0.3601,
714
  "step": 870
715
  },
716
  {
717
  "epoch": 2.74,
718
- "grad_norm": 6.113778114318848,
719
- "learning_rate": 5.4875e-05,
720
- "loss": 0.3756,
721
  "step": 880
722
  },
723
  {
724
  "epoch": 2.77,
725
- "grad_norm": 2.0605404376983643,
726
- "learning_rate": 5.550000000000001e-05,
727
- "loss": 0.2754,
728
  "step": 890
729
  },
730
  {
731
  "epoch": 2.8,
732
- "grad_norm": 11.390885353088379,
733
- "learning_rate": 5.6125e-05,
734
- "loss": 0.3208,
735
  "step": 900
736
  },
737
  {
738
  "epoch": 2.8,
739
- "eval_accuracy": 0.8290568654646324,
740
- "eval_f1": 0.8356873412541177,
741
- "eval_loss": 0.5083692073822021,
742
- "eval_precision": 0.8510969316144597,
743
- "eval_recall": 0.8290568654646324,
744
- "eval_runtime": 39.6529,
745
- "eval_samples_per_second": 72.731,
746
- "eval_steps_per_second": 9.104,
747
  "step": 900
748
  },
749
  {
750
  "epoch": 2.83,
751
- "grad_norm": 6.347045421600342,
752
- "learning_rate": 5.6750000000000004e-05,
753
- "loss": 0.4116,
754
  "step": 910
755
  },
756
  {
757
  "epoch": 2.87,
758
- "grad_norm": 11.242640495300293,
759
- "learning_rate": 5.7375e-05,
760
- "loss": 0.4315,
761
  "step": 920
762
  },
763
  {
764
  "epoch": 2.9,
765
- "grad_norm": 1.8943805694580078,
766
- "learning_rate": 5.8e-05,
767
- "loss": 0.4917,
768
  "step": 930
769
  },
770
  {
771
  "epoch": 2.93,
772
- "grad_norm": 8.402441024780273,
773
- "learning_rate": 5.862500000000001e-05,
774
- "loss": 0.3745,
775
  "step": 940
776
  },
777
  {
778
  "epoch": 2.96,
779
- "grad_norm": 7.3500590324401855,
780
- "learning_rate": 5.9250000000000004e-05,
781
- "loss": 0.2392,
782
  "step": 950
783
  },
784
  {
785
  "epoch": 2.99,
786
- "grad_norm": 4.970364093780518,
787
- "learning_rate": 5.9875000000000005e-05,
788
- "loss": 0.3614,
789
  "step": 960
790
  },
791
  {
792
  "epoch": 3.02,
793
- "grad_norm": 3.213763475418091,
794
- "learning_rate": 6.05e-05,
795
- "loss": 0.2037,
796
  "step": 970
797
  },
798
  {
799
  "epoch": 3.05,
800
- "grad_norm": 5.403902053833008,
801
- "learning_rate": 6.1125e-05,
802
- "loss": 0.21,
803
  "step": 980
804
  },
805
  {
806
  "epoch": 3.08,
807
- "grad_norm": 3.3675849437713623,
808
- "learning_rate": 6.175000000000001e-05,
809
- "loss": 0.2107,
810
  "step": 990
811
  },
812
  {
813
  "epoch": 3.12,
814
- "grad_norm": 5.057342529296875,
815
- "learning_rate": 6.237500000000001e-05,
816
- "loss": 0.1916,
817
  "step": 1000
818
  },
819
  {
820
  "epoch": 3.12,
821
- "eval_accuracy": 0.8200416088765603,
822
- "eval_f1": 0.825491122448177,
823
- "eval_loss": 0.5119706988334656,
824
- "eval_precision": 0.8464650898509153,
825
- "eval_recall": 0.8200416088765603,
826
- "eval_runtime": 39.2368,
827
- "eval_samples_per_second": 73.503,
828
- "eval_steps_per_second": 9.201,
829
  "step": 1000
830
  },
831
  {
832
  "epoch": 3.15,
833
- "grad_norm": 4.15879487991333,
834
- "learning_rate": 6.3e-05,
835
- "loss": 0.1482,
836
  "step": 1010
837
  },
838
  {
839
  "epoch": 3.18,
840
- "grad_norm": 4.375223159790039,
841
- "learning_rate": 6.3625e-05,
842
- "loss": 0.215,
843
  "step": 1020
844
  },
845
  {
846
  "epoch": 3.21,
847
- "grad_norm": 5.720744609832764,
848
- "learning_rate": 6.425e-05,
849
- "loss": 0.1132,
850
  "step": 1030
851
  },
852
  {
853
  "epoch": 3.24,
854
- "grad_norm": 2.6465506553649902,
855
- "learning_rate": 6.4875e-05,
856
- "loss": 0.0885,
857
  "step": 1040
858
  },
859
  {
860
  "epoch": 3.27,
861
- "grad_norm": 5.789972305297852,
862
- "learning_rate": 6.55e-05,
863
- "loss": 0.206,
864
  "step": 1050
865
  },
866
  {
867
  "epoch": 3.3,
868
- "grad_norm": 1.6569145917892456,
869
- "learning_rate": 6.612500000000001e-05,
870
- "loss": 0.0962,
871
  "step": 1060
872
  },
873
  {
874
  "epoch": 3.33,
875
- "grad_norm": 1.817624568939209,
876
- "learning_rate": 6.675e-05,
877
- "loss": 0.1607,
878
  "step": 1070
879
  },
880
  {
881
  "epoch": 3.36,
882
- "grad_norm": 5.404047012329102,
883
- "learning_rate": 6.7375e-05,
884
- "loss": 0.2756,
885
  "step": 1080
886
  },
887
  {
888
  "epoch": 3.4,
889
- "grad_norm": 7.324627876281738,
890
- "learning_rate": 6.800000000000001e-05,
891
- "loss": 0.2549,
892
  "step": 1090
893
  },
894
  {
895
  "epoch": 3.43,
896
- "grad_norm": 4.070283889770508,
897
- "learning_rate": 6.8625e-05,
898
- "loss": 0.2015,
899
  "step": 1100
900
  },
901
  {
902
  "epoch": 3.43,
903
- "eval_accuracy": 0.8183079056865464,
904
- "eval_f1": 0.7983379954735971,
905
- "eval_loss": 0.6911566257476807,
906
- "eval_precision": 0.8453091952985208,
907
- "eval_recall": 0.8183079056865464,
908
- "eval_runtime": 38.7985,
909
- "eval_samples_per_second": 74.333,
910
- "eval_steps_per_second": 9.304,
911
  "step": 1100
912
  },
913
  {
914
  "epoch": 3.46,
915
- "grad_norm": 2.4123291969299316,
916
- "learning_rate": 6.925e-05,
917
- "loss": 0.2511,
918
  "step": 1110
919
  },
920
  {
921
  "epoch": 3.49,
922
- "grad_norm": 6.693827152252197,
923
- "learning_rate": 6.9875e-05,
924
- "loss": 0.2531,
925
  "step": 1120
926
  },
927
  {
928
  "epoch": 3.52,
929
- "grad_norm": 15.4666109085083,
930
- "learning_rate": 7.05e-05,
931
- "loss": 0.2476,
932
  "step": 1130
933
  },
934
  {
935
  "epoch": 3.55,
936
- "grad_norm": 8.200079917907715,
937
- "learning_rate": 7.112500000000001e-05,
938
- "loss": 0.2294,
939
  "step": 1140
940
  },
941
  {
942
  "epoch": 3.58,
943
- "grad_norm": 7.075741291046143,
944
- "learning_rate": 7.175000000000001e-05,
945
- "loss": 0.2854,
946
  "step": 1150
947
  },
948
  {
949
  "epoch": 3.61,
950
- "grad_norm": 3.2751991748809814,
951
- "learning_rate": 7.2375e-05,
952
- "loss": 0.2435,
953
  "step": 1160
954
  },
955
  {
956
  "epoch": 3.64,
957
- "grad_norm": 3.889462947845459,
958
- "learning_rate": 7.3e-05,
959
- "loss": 0.1425,
960
  "step": 1170
961
  },
962
  {
963
  "epoch": 3.68,
964
- "grad_norm": 1.6340276002883911,
965
- "learning_rate": 7.3625e-05,
966
- "loss": 0.1798,
967
  "step": 1180
968
  },
969
  {
970
  "epoch": 3.71,
971
- "grad_norm": 10.412519454956055,
972
- "learning_rate": 7.425e-05,
973
- "loss": 0.1961,
974
  "step": 1190
975
  },
976
  {
977
  "epoch": 3.74,
978
- "grad_norm": 2.985041618347168,
979
- "learning_rate": 7.4875e-05,
980
- "loss": 0.2384,
981
  "step": 1200
982
  },
983
  {
984
  "epoch": 3.74,
985
- "eval_accuracy": 0.8131067961165048,
986
- "eval_f1": 0.8045747086705156,
987
- "eval_loss": 0.7051995992660522,
988
- "eval_precision": 0.8164608366585469,
989
- "eval_recall": 0.8131067961165048,
990
- "eval_runtime": 38.8306,
991
- "eval_samples_per_second": 74.271,
992
- "eval_steps_per_second": 9.297,
993
  "step": 1200
994
  },
995
  {
996
  "epoch": 3.77,
997
- "grad_norm": 5.682165145874023,
998
- "learning_rate": 7.55e-05,
999
- "loss": 0.2674,
1000
  "step": 1210
1001
  },
1002
  {
1003
  "epoch": 3.8,
1004
- "grad_norm": 8.544529914855957,
1005
- "learning_rate": 7.612500000000001e-05,
1006
- "loss": 0.3004,
1007
  "step": 1220
1008
  },
1009
  {
1010
  "epoch": 3.83,
1011
- "grad_norm": 9.870763778686523,
1012
- "learning_rate": 7.675e-05,
1013
- "loss": 0.2638,
1014
  "step": 1230
1015
  },
1016
  {
1017
  "epoch": 3.86,
1018
- "grad_norm": 3.0678322315216064,
1019
- "learning_rate": 7.737500000000001e-05,
1020
- "loss": 0.2658,
1021
  "step": 1240
1022
  },
1023
  {
1024
  "epoch": 3.89,
1025
- "grad_norm": 5.901866912841797,
1026
- "learning_rate": 7.800000000000001e-05,
1027
- "loss": 0.3354,
1028
  "step": 1250
1029
  },
1030
  {
1031
  "epoch": 3.93,
1032
- "grad_norm": 4.357693195343018,
1033
- "learning_rate": 7.8625e-05,
1034
- "loss": 0.2983,
1035
  "step": 1260
1036
  },
1037
  {
1038
  "epoch": 3.96,
1039
- "grad_norm": 9.29904842376709,
1040
- "learning_rate": 7.925e-05,
1041
- "loss": 0.3396,
1042
  "step": 1270
1043
  },
1044
  {
1045
  "epoch": 3.99,
1046
- "grad_norm": 9.512259483337402,
1047
- "learning_rate": 7.9875e-05,
1048
- "loss": 0.2826,
1049
  "step": 1280
1050
  },
1051
  {
1052
  "epoch": 4.02,
1053
- "grad_norm": 6.716480731964111,
1054
- "learning_rate": 8.05e-05,
1055
- "loss": 0.1743,
1056
  "step": 1290
1057
  },
1058
  {
1059
  "epoch": 4.05,
1060
- "grad_norm": 0.9523041248321533,
1061
- "learning_rate": 8.112500000000001e-05,
1062
- "loss": 0.1694,
1063
  "step": 1300
1064
  },
1065
  {
1066
  "epoch": 4.05,
1067
- "eval_accuracy": 0.8307905686546463,
1068
- "eval_f1": 0.8345388679849918,
1069
- "eval_loss": 0.49232053756713867,
1070
- "eval_precision": 0.8424641126230031,
1071
- "eval_recall": 0.8307905686546463,
1072
- "eval_runtime": 39.5635,
1073
- "eval_samples_per_second": 72.895,
1074
- "eval_steps_per_second": 9.125,
1075
  "step": 1300
1076
  },
1077
  {
1078
  "epoch": 4.08,
1079
- "grad_norm": 0.645494818687439,
1080
- "learning_rate": 8.175000000000001e-05,
1081
- "loss": 0.0897,
1082
  "step": 1310
1083
  },
1084
  {
1085
  "epoch": 4.11,
1086
- "grad_norm": 2.347318172454834,
1087
- "learning_rate": 8.2375e-05,
1088
- "loss": 0.1952,
1089
  "step": 1320
1090
  },
1091
  {
1092
  "epoch": 4.14,
1093
- "grad_norm": 9.760616302490234,
1094
- "learning_rate": 8.3e-05,
1095
- "loss": 0.1367,
1096
  "step": 1330
1097
  },
1098
  {
1099
  "epoch": 4.17,
1100
- "grad_norm": 4.394073963165283,
1101
- "learning_rate": 8.362500000000001e-05,
1102
- "loss": 0.0696,
1103
  "step": 1340
1104
  },
1105
  {
1106
  "epoch": 4.21,
1107
- "grad_norm": 0.1434166580438614,
1108
- "learning_rate": 8.425e-05,
1109
- "loss": 0.3269,
1110
  "step": 1350
1111
  },
1112
  {
1113
  "epoch": 4.24,
1114
- "grad_norm": 2.8811986446380615,
1115
- "learning_rate": 8.4875e-05,
1116
- "loss": 0.075,
1117
  "step": 1360
1118
  },
1119
  {
1120
  "epoch": 4.27,
1121
- "grad_norm": 4.467238426208496,
1122
- "learning_rate": 8.55e-05,
1123
- "loss": 0.2605,
1124
  "step": 1370
1125
  },
1126
  {
1127
  "epoch": 4.3,
1128
- "grad_norm": 6.685047149658203,
1129
- "learning_rate": 8.6125e-05,
1130
- "loss": 0.227,
1131
  "step": 1380
1132
  },
1133
  {
1134
  "epoch": 4.33,
1135
- "grad_norm": 6.381806373596191,
1136
- "learning_rate": 8.675000000000001e-05,
1137
- "loss": 0.1426,
1138
  "step": 1390
1139
  },
1140
  {
1141
  "epoch": 4.36,
1142
- "grad_norm": 6.624675273895264,
1143
- "learning_rate": 8.737500000000001e-05,
1144
- "loss": 0.2445,
1145
  "step": 1400
1146
  },
1147
  {
1148
  "epoch": 4.36,
1149
- "eval_accuracy": 0.8307905686546463,
1150
- "eval_f1": 0.8343676685959436,
1151
- "eval_loss": 0.5868554711341858,
1152
- "eval_precision": 0.847191138756196,
1153
- "eval_recall": 0.8307905686546463,
1154
- "eval_runtime": 38.9105,
1155
- "eval_samples_per_second": 74.119,
1156
- "eval_steps_per_second": 9.278,
1157
  "step": 1400
1158
  },
1159
  {
1160
  "epoch": 4.39,
1161
- "grad_norm": 0.38240641355514526,
1162
- "learning_rate": 8.800000000000001e-05,
1163
- "loss": 0.1002,
1164
  "step": 1410
1165
  },
1166
  {
1167
  "epoch": 4.42,
1168
- "grad_norm": 2.0704329013824463,
1169
- "learning_rate": 8.8625e-05,
1170
- "loss": 0.1727,
1171
  "step": 1420
1172
  },
1173
  {
1174
  "epoch": 4.45,
1175
- "grad_norm": 11.727005004882812,
1176
- "learning_rate": 8.925e-05,
1177
- "loss": 0.1992,
1178
  "step": 1430
1179
  },
1180
  {
1181
  "epoch": 4.49,
1182
- "grad_norm": 2.252082347869873,
1183
- "learning_rate": 8.9875e-05,
1184
- "loss": 0.1169,
1185
  "step": 1440
1186
  },
1187
  {
1188
  "epoch": 4.52,
1189
- "grad_norm": 11.580313682556152,
1190
- "learning_rate": 9.05e-05,
1191
- "loss": 0.4091,
1192
  "step": 1450
1193
  },
1194
  {
1195
  "epoch": 4.55,
1196
- "grad_norm": 1.8475000858306885,
1197
- "learning_rate": 9.1125e-05,
1198
- "loss": 0.1869,
1199
  "step": 1460
1200
  },
1201
  {
1202
  "epoch": 4.58,
1203
- "grad_norm": 4.9334797859191895,
1204
- "learning_rate": 9.175000000000001e-05,
1205
- "loss": 0.1896,
1206
  "step": 1470
1207
  },
1208
  {
1209
  "epoch": 4.61,
1210
- "grad_norm": 11.026119232177734,
1211
- "learning_rate": 9.2375e-05,
1212
- "loss": 0.2189,
1213
  "step": 1480
1214
  },
1215
  {
1216
  "epoch": 4.64,
1217
- "grad_norm": 5.846218109130859,
1218
- "learning_rate": 9.300000000000001e-05,
1219
- "loss": 0.279,
1220
  "step": 1490
1221
  },
1222
  {
1223
  "epoch": 4.67,
1224
- "grad_norm": 5.4308295249938965,
1225
- "learning_rate": 9.362500000000001e-05,
1226
- "loss": 0.1757,
1227
  "step": 1500
1228
  },
1229
  {
1230
  "epoch": 4.67,
1231
- "eval_accuracy": 0.8259361997226075,
1232
- "eval_f1": 0.8235909038686198,
1233
- "eval_loss": 0.669946014881134,
1234
- "eval_precision": 0.8379744032947182,
1235
- "eval_recall": 0.8259361997226075,
1236
- "eval_runtime": 39.5798,
1237
- "eval_samples_per_second": 72.865,
1238
- "eval_steps_per_second": 9.121,
1239
  "step": 1500
1240
  },
1241
  {
1242
  "epoch": 4.7,
1243
- "grad_norm": 4.273800373077393,
1244
- "learning_rate": 9.425e-05,
1245
- "loss": 0.2125,
1246
  "step": 1510
1247
  },
1248
  {
1249
  "epoch": 4.74,
1250
- "grad_norm": 6.921429634094238,
1251
- "learning_rate": 9.4875e-05,
1252
- "loss": 0.2207,
1253
  "step": 1520
1254
  },
1255
  {
1256
  "epoch": 4.77,
1257
- "grad_norm": 5.302011966705322,
1258
- "learning_rate": 9.55e-05,
1259
- "loss": 0.1411,
1260
  "step": 1530
1261
  },
1262
  {
1263
  "epoch": 4.8,
1264
- "grad_norm": 6.112096786499023,
1265
- "learning_rate": 9.6125e-05,
1266
- "loss": 0.2486,
1267
  "step": 1540
1268
  },
1269
  {
1270
  "epoch": 4.83,
1271
- "grad_norm": 4.856971263885498,
1272
- "learning_rate": 9.675000000000001e-05,
1273
- "loss": 0.1348,
1274
  "step": 1550
1275
  },
1276
  {
1277
  "epoch": 4.86,
1278
- "grad_norm": 5.860950469970703,
1279
- "learning_rate": 9.737500000000001e-05,
1280
- "loss": 0.1904,
1281
  "step": 1560
1282
  },
1283
  {
1284
  "epoch": 4.89,
1285
- "grad_norm": 20.086708068847656,
1286
- "learning_rate": 9.8e-05,
1287
- "loss": 0.4989,
1288
  "step": 1570
1289
  },
1290
  {
1291
  "epoch": 4.92,
1292
- "grad_norm": 8.487300872802734,
1293
- "learning_rate": 9.8625e-05,
1294
- "loss": 0.3612,
1295
  "step": 1580
1296
  },
1297
  {
1298
  "epoch": 4.95,
1299
- "grad_norm": 7.535490036010742,
1300
- "learning_rate": 9.925000000000001e-05,
1301
- "loss": 0.213,
1302
  "step": 1590
1303
  },
1304
  {
1305
  "epoch": 4.98,
1306
- "grad_norm": 0.8333636522293091,
1307
- "learning_rate": 9.9875e-05,
1308
- "loss": 0.2443,
1309
  "step": 1600
1310
  },
1311
  {
1312
  "epoch": 4.98,
1313
- "eval_accuracy": 0.8030513176144244,
1314
- "eval_f1": 0.7857260281816361,
1315
- "eval_loss": 0.7500908970832825,
1316
- "eval_precision": 0.8171747232724846,
1317
- "eval_recall": 0.8030513176144244,
1318
- "eval_runtime": 39.1476,
1319
- "eval_samples_per_second": 73.67,
1320
- "eval_steps_per_second": 9.222,
1321
  "step": 1600
1322
  },
1323
  {
1324
  "epoch": 5.02,
1325
- "grad_norm": 2.681772470474243,
1326
- "learning_rate": 9.999957561556831e-05,
1327
- "loss": 0.3144,
1328
  "step": 1610
1329
  },
1330
  {
1331
  "epoch": 5.05,
1332
- "grad_norm": 9.32345962524414,
1333
- "learning_rate": 9.999785156616144e-05,
1334
- "loss": 0.2125,
1335
  "step": 1620
1336
  },
1337
  {
1338
  "epoch": 5.08,
1339
- "grad_norm": 5.002188205718994,
1340
- "learning_rate": 9.999480137344589e-05,
1341
- "loss": 0.0641,
1342
  "step": 1630
1343
  },
1344
  {
1345
  "epoch": 5.11,
1346
- "grad_norm": 0.3894753158092499,
1347
- "learning_rate": 9.999042511832502e-05,
1348
- "loss": 0.0477,
1349
  "step": 1640
1350
  },
1351
  {
1352
  "epoch": 5.14,
1353
- "grad_norm": 8.571377754211426,
1354
- "learning_rate": 9.998472291687463e-05,
1355
- "loss": 0.1252,
1356
  "step": 1650
1357
  },
1358
  {
1359
  "epoch": 5.17,
1360
- "grad_norm": 0.09371213614940643,
1361
- "learning_rate": 9.997769492033998e-05,
1362
- "loss": 0.1112,
1363
  "step": 1660
1364
  },
1365
  {
1366
  "epoch": 5.2,
1367
- "grad_norm": 0.7354293465614319,
1368
- "learning_rate": 9.996934131513163e-05,
1369
- "loss": 0.0693,
1370
  "step": 1670
1371
  },
1372
  {
1373
  "epoch": 5.23,
1374
- "grad_norm": 0.20948350429534912,
1375
- "learning_rate": 9.99596623228207e-05,
1376
- "loss": 0.0876,
1377
  "step": 1680
1378
  },
1379
  {
1380
  "epoch": 5.26,
1381
- "grad_norm": 13.952701568603516,
1382
- "learning_rate": 9.994865820013281e-05,
1383
- "loss": 0.0678,
1384
  "step": 1690
1385
  },
1386
  {
1387
  "epoch": 5.3,
1388
- "grad_norm": 8.131987571716309,
1389
- "learning_rate": 9.993632923894143e-05,
1390
- "loss": 0.1498,
1391
  "step": 1700
1392
  },
1393
  {
1394
  "epoch": 5.3,
1395
- "eval_accuracy": 0.7971567267683772,
1396
- "eval_f1": 0.8015651844028568,
1397
- "eval_loss": 0.8649423122406006,
1398
- "eval_precision": 0.8395248583520678,
1399
- "eval_recall": 0.7971567267683772,
1400
- "eval_runtime": 39.0539,
1401
- "eval_samples_per_second": 73.847,
1402
- "eval_steps_per_second": 9.244,
1403
  "step": 1700
1404
  },
1405
  {
1406
  "epoch": 5.33,
1407
- "grad_norm": 0.155848890542984,
1408
- "learning_rate": 9.992267576625994e-05,
1409
- "loss": 0.14,
1410
  "step": 1710
1411
  },
1412
  {
1413
  "epoch": 5.36,
1414
- "grad_norm": 9.884765625,
1415
- "learning_rate": 9.990769814423313e-05,
1416
- "loss": 0.3033,
1417
  "step": 1720
1418
  },
1419
  {
1420
  "epoch": 5.39,
1421
- "grad_norm": 1.2280668020248413,
1422
- "learning_rate": 9.989139677012757e-05,
1423
- "loss": 0.2559,
1424
  "step": 1730
1425
  },
1426
  {
1427
  "epoch": 5.42,
1428
- "grad_norm": 7.7530741691589355,
1429
- "learning_rate": 9.9873772076321e-05,
1430
- "loss": 0.1839,
1431
  "step": 1740
1432
  },
1433
  {
1434
  "epoch": 5.45,
1435
- "grad_norm": 3.283310651779175,
1436
- "learning_rate": 9.985482453029087e-05,
1437
- "loss": 0.1326,
1438
  "step": 1750
1439
  },
1440
  {
1441
  "epoch": 5.48,
1442
- "grad_norm": 6.871270656585693,
1443
- "learning_rate": 9.983455463460203e-05,
1444
- "loss": 0.1726,
1445
  "step": 1760
1446
  },
1447
  {
1448
  "epoch": 5.51,
1449
- "grad_norm": 4.748425483703613,
1450
- "learning_rate": 9.98129629268933e-05,
1451
- "loss": 0.0782,
1452
  "step": 1770
1453
  },
1454
  {
1455
  "epoch": 5.55,
1456
- "grad_norm": 6.772828578948975,
1457
- "learning_rate": 9.979004997986327e-05,
1458
- "loss": 0.1466,
1459
  "step": 1780
1460
  },
1461
  {
1462
  "epoch": 5.58,
1463
- "grad_norm": 0.8694224953651428,
1464
- "learning_rate": 9.976581640125509e-05,
1465
- "loss": 0.1442,
1466
  "step": 1790
1467
  },
1468
  {
1469
  "epoch": 5.61,
1470
- "grad_norm": 3.4737765789031982,
1471
- "learning_rate": 9.974026283384031e-05,
1472
- "loss": 0.1072,
1473
  "step": 1800
1474
  },
1475
  {
1476
  "epoch": 5.61,
1477
- "eval_accuracy": 0.8335644937586685,
1478
- "eval_f1": 0.8386159061812507,
1479
- "eval_loss": 0.6284000873565674,
1480
- "eval_precision": 0.851118280726807,
1481
- "eval_recall": 0.8335644937586685,
1482
- "eval_runtime": 39.291,
1483
- "eval_samples_per_second": 73.401,
1484
- "eval_steps_per_second": 9.188,
1485
  "step": 1800
1486
  },
1487
  {
1488
  "epoch": 5.61,
1489
  "step": 1800,
1490
  "total_flos": 2.2287694956200755e+18,
1491
- "train_loss": 0.41900131742159524,
1492
- "train_runtime": 1406.907,
1493
- "train_samples_per_second": 364.487,
1494
- "train_steps_per_second": 22.816
1495
  }
1496
  ],
1497
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.4453237056732178,
3
  "best_model_checkpoint": "./vit-lr-cosine-restarts/checkpoint-800",
4
  "epoch": 5.607476635514018,
5
  "eval_steps": 100,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.03,
13
+ "grad_norm": 17.940969467163086,
14
+ "learning_rate": 1.125e-05,
15
+ "loss": 2.0172,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.06,
20
+ "grad_norm": 4.308961391448975,
21
+ "learning_rate": 2.375e-05,
22
+ "loss": 1.1159,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.09,
27
+ "grad_norm": 5.38205099105835,
28
+ "learning_rate": 3.625e-05,
29
+ "loss": 1.1398,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.12,
34
+ "grad_norm": 5.569328308105469,
35
+ "learning_rate": 4.875e-05,
36
+ "loss": 1.0508,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.16,
41
+ "grad_norm": 5.870121002197266,
42
+ "learning_rate": 6.125000000000001e-05,
43
+ "loss": 0.8095,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.19,
48
+ "grad_norm": 6.100069046020508,
49
+ "learning_rate": 7.375e-05,
50
+ "loss": 0.8756,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.22,
55
+ "grad_norm": 4.655179023742676,
56
+ "learning_rate": 8.625000000000001e-05,
57
+ "loss": 0.9221,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.25,
62
+ "grad_norm": 4.762995719909668,
63
+ "learning_rate": 9.875000000000002e-05,
64
+ "loss": 0.6852,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.28,
69
+ "grad_norm": 5.309569358825684,
70
+ "learning_rate": 9.99995126719372e-05,
71
+ "loss": 0.6662,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.31,
76
+ "grad_norm": 4.177795886993408,
77
+ "learning_rate": 9.99978280932988e-05,
78
+ "loss": 0.686,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.31,
83
+ "eval_accuracy": 0.7517337031900139,
84
+ "eval_f1": 0.7444977301566197,
85
+ "eval_loss": 0.6707192063331604,
86
+ "eval_precision": 0.7624063091694634,
87
+ "eval_recall": 0.7517337031900139,
88
+ "eval_runtime": 36.8103,
89
+ "eval_samples_per_second": 78.348,
90
+ "eval_steps_per_second": 9.807,
91
  "step": 100
92
  },
93
  {
94
  "epoch": 0.34,
95
+ "grad_norm": 3.6934421062469482,
96
+ "learning_rate": 9.999494028821966e-05,
97
+ "loss": 0.6901,
98
  "step": 110
99
  },
100
  {
101
  "epoch": 0.37,
102
+ "grad_norm": 8.207216262817383,
103
+ "learning_rate": 9.999084932619647e-05,
104
+ "loss": 0.6638,
105
  "step": 120
106
  },
107
  {
108
  "epoch": 0.4,
109
+ "grad_norm": 8.520118713378906,
110
+ "learning_rate": 9.998555530568059e-05,
111
+ "loss": 0.6285,
112
  "step": 130
113
  },
114
  {
115
  "epoch": 0.44,
116
+ "grad_norm": 3.053748846054077,
117
+ "learning_rate": 9.997905835407567e-05,
118
+ "loss": 0.5225,
119
  "step": 140
120
  },
121
  {
122
  "epoch": 0.47,
123
+ "grad_norm": 4.431256294250488,
124
+ "learning_rate": 9.997135862773453e-05,
125
+ "loss": 0.6307,
126
  "step": 150
127
  },
128
  {
129
  "epoch": 0.5,
130
+ "grad_norm": 7.533073902130127,
131
+ "learning_rate": 9.996245631195555e-05,
132
+ "loss": 0.5959,
133
  "step": 160
134
  },
135
  {
136
  "epoch": 0.53,
137
+ "grad_norm": 5.364689350128174,
138
+ "learning_rate": 9.99523516209781e-05,
139
+ "loss": 0.6344,
140
  "step": 170
141
  },
142
  {
143
  "epoch": 0.56,
144
+ "grad_norm": 6.664125919342041,
145
+ "learning_rate": 9.994104479797728e-05,
146
+ "loss": 0.6724,
147
  "step": 180
148
  },
149
  {
150
  "epoch": 0.59,
151
+ "grad_norm": 4.114094257354736,
152
+ "learning_rate": 9.992853611505836e-05,
153
+ "loss": 0.6075,
154
  "step": 190
155
  },
156
  {
157
  "epoch": 0.62,
158
+ "grad_norm": 5.480118274688721,
159
+ "learning_rate": 9.991482587324993e-05,
160
+ "loss": 0.4852,
161
  "step": 200
162
  },
163
  {
164
  "epoch": 0.62,
165
+ "eval_accuracy": 0.7704576976421637,
166
+ "eval_f1": 0.7230801818181609,
167
+ "eval_loss": 0.7021857500076294,
168
+ "eval_precision": 0.7857677405887458,
169
+ "eval_recall": 0.7704576976421637,
170
+ "eval_runtime": 37.0615,
171
+ "eval_samples_per_second": 77.817,
172
+ "eval_steps_per_second": 9.741,
173
  "step": 200
174
  },
175
  {
176
  "epoch": 0.65,
177
+ "grad_norm": 4.710792064666748,
178
+ "learning_rate": 9.989991440249686e-05,
179
+ "loss": 0.6202,
180
  "step": 210
181
  },
182
  {
183
  "epoch": 0.69,
184
+ "grad_norm": 8.316452980041504,
185
+ "learning_rate": 9.988380206165225e-05,
186
+ "loss": 0.6816,
187
  "step": 220
188
  },
189
  {
190
  "epoch": 0.72,
191
+ "grad_norm": 3.131255626678467,
192
+ "learning_rate": 9.986648923846882e-05,
193
+ "loss": 0.4721,
194
  "step": 230
195
  },
196
  {
197
  "epoch": 0.75,
198
+ "grad_norm": 1.5796229839324951,
199
+ "learning_rate": 9.98479763495896e-05,
200
+ "loss": 0.5474,
201
  "step": 240
202
  },
203
  {
204
  "epoch": 0.78,
205
+ "grad_norm": 5.570051193237305,
206
+ "learning_rate": 9.98282638405379e-05,
207
+ "loss": 0.6763,
208
  "step": 250
209
  },
210
  {
211
  "epoch": 0.81,
212
+ "grad_norm": 4.160530090332031,
213
+ "learning_rate": 9.980735218570657e-05,
214
+ "loss": 0.5471,
215
  "step": 260
216
  },
217
  {
218
  "epoch": 0.84,
219
+ "grad_norm": 4.585910320281982,
220
+ "learning_rate": 9.978524188834659e-05,
221
+ "loss": 0.4856,
222
  "step": 270
223
  },
224
  {
225
  "epoch": 0.87,
226
+ "grad_norm": 2.5068447589874268,
227
+ "learning_rate": 9.976193348055496e-05,
228
+ "loss": 0.4724,
229
  "step": 280
230
  },
231
  {
232
  "epoch": 0.9,
233
+ "grad_norm": 7.588688373565674,
234
+ "learning_rate": 9.973742752326188e-05,
235
+ "loss": 0.7415,
236
  "step": 290
237
  },
238
  {
239
  "epoch": 0.93,
240
+ "grad_norm": 6.468356609344482,
241
+ "learning_rate": 9.971172460621732e-05,
242
+ "loss": 0.7098,
243
  "step": 300
244
  },
245
  {
246
  "epoch": 0.93,
247
+ "eval_accuracy": 0.7995839112343966,
248
+ "eval_f1": 0.7972999747438626,
249
+ "eval_loss": 0.563714325428009,
250
+ "eval_precision": 0.8181443293620774,
251
+ "eval_recall": 0.7995839112343966,
252
+ "eval_runtime": 37.0609,
253
+ "eval_samples_per_second": 77.818,
254
+ "eval_steps_per_second": 9.741,
255
  "step": 300
256
  },
257
  {
258
  "epoch": 0.97,
259
+ "grad_norm": 4.219545841217041,
260
+ "learning_rate": 9.968482534797669e-05,
261
+ "loss": 0.5762,
262
  "step": 310
263
  },
264
  {
265
  "epoch": 1.0,
266
+ "grad_norm": 5.357555866241455,
267
+ "learning_rate": 9.965673039588614e-05,
268
+ "loss": 0.5181,
269
  "step": 320
270
  },
271
  {
272
  "epoch": 1.03,
273
+ "grad_norm": 4.9291582107543945,
274
+ "learning_rate": 9.962744042606678e-05,
275
+ "loss": 0.422,
276
  "step": 330
277
  },
278
  {
279
  "epoch": 1.06,
280
+ "grad_norm": 5.231845855712891,
281
+ "learning_rate": 9.959695614339857e-05,
282
+ "loss": 0.3889,
283
  "step": 340
284
  },
285
  {
286
  "epoch": 1.09,
287
+ "grad_norm": 3.579317808151245,
288
+ "learning_rate": 9.956527828150326e-05,
289
+ "loss": 0.3912,
290
  "step": 350
291
  },
292
  {
293
  "epoch": 1.12,
294
+ "grad_norm": 3.0033373832702637,
295
+ "learning_rate": 9.95324076027268e-05,
296
+ "loss": 0.3439,
297
  "step": 360
298
  },
299
  {
300
  "epoch": 1.15,
301
+ "grad_norm": 2.9946236610412598,
302
+ "learning_rate": 9.949834489812094e-05,
303
+ "loss": 0.4737,
304
  "step": 370
305
  },
306
  {
307
  "epoch": 1.18,
308
+ "grad_norm": 7.183070182800293,
309
+ "learning_rate": 9.946309098742424e-05,
310
+ "loss": 0.4325,
311
  "step": 380
312
  },
313
  {
314
  "epoch": 1.21,
315
+ "grad_norm": 4.066940784454346,
316
+ "learning_rate": 9.942664671904227e-05,
317
+ "loss": 0.527,
318
  "step": 390
319
  },
320
  {
321
  "epoch": 1.25,
322
+ "grad_norm": 4.427377700805664,
323
+ "learning_rate": 9.938901297002732e-05,
324
+ "loss": 0.4226,
325
  "step": 400
326
  },
327
  {
328
  "epoch": 1.25,
329
+ "eval_accuracy": 0.7621359223300971,
330
+ "eval_f1": 0.7735141412800338,
331
+ "eval_loss": 0.6494001150131226,
332
+ "eval_precision": 0.8136939310866815,
333
+ "eval_recall": 0.7621359223300971,
334
+ "eval_runtime": 37.3436,
335
+ "eval_samples_per_second": 77.229,
336
+ "eval_steps_per_second": 9.667,
337
  "step": 400
338
  },
339
  {
340
  "epoch": 1.28,
341
+ "grad_norm": 3.8335494995117188,
342
+ "learning_rate": 9.935019064605713e-05,
343
+ "loss": 0.4727,
344
  "step": 410
345
  },
346
  {
347
  "epoch": 1.31,
348
+ "grad_norm": 3.420801877975464,
349
+ "learning_rate": 9.931018068141324e-05,
350
+ "loss": 0.4028,
351
  "step": 420
352
  },
353
  {
354
  "epoch": 1.34,
355
+ "grad_norm": 3.57491397857666,
356
+ "learning_rate": 9.926898403895842e-05,
357
+ "loss": 0.5144,
358
  "step": 430
359
  },
360
  {
361
  "epoch": 1.37,
362
+ "grad_norm": 2.4824776649475098,
363
+ "learning_rate": 9.92266017101135e-05,
364
+ "loss": 0.3917,
365
  "step": 440
366
  },
367
  {
368
  "epoch": 1.4,
369
+ "grad_norm": 3.9230153560638428,
370
+ "learning_rate": 9.918303471483359e-05,
371
+ "loss": 0.4286,
372
  "step": 450
373
  },
374
  {
375
  "epoch": 1.43,
376
+ "grad_norm": 3.334120512008667,
377
+ "learning_rate": 9.913828410158342e-05,
378
+ "loss": 0.4159,
379
  "step": 460
380
  },
381
  {
382
  "epoch": 1.46,
383
+ "grad_norm": 3.502173662185669,
384
+ "learning_rate": 9.909235094731222e-05,
385
+ "loss": 0.3367,
386
  "step": 470
387
  },
388
  {
389
  "epoch": 1.5,
390
+ "grad_norm": 6.164717674255371,
391
+ "learning_rate": 9.90452363574277e-05,
392
+ "loss": 0.3398,
393
  "step": 480
394
  },
395
  {
396
  "epoch": 1.53,
397
+ "grad_norm": 7.483583450317383,
398
+ "learning_rate": 9.899694146576952e-05,
399
+ "loss": 0.3941,
400
  "step": 490
401
  },
402
  {
403
  "epoch": 1.56,
404
+ "grad_norm": 2.83978009223938,
405
+ "learning_rate": 9.8947467434582e-05,
406
+ "loss": 0.3599,
407
  "step": 500
408
  },
409
  {
410
  "epoch": 1.56,
411
+ "eval_accuracy": 0.823509015256588,
412
+ "eval_f1": 0.8108853024782933,
413
+ "eval_loss": 0.5213786363601685,
414
+ "eval_precision": 0.8206648771819358,
415
+ "eval_recall": 0.823509015256588,
416
+ "eval_runtime": 37.1057,
417
+ "eval_samples_per_second": 77.724,
418
+ "eval_steps_per_second": 9.729,
419
  "step": 500
420
  },
421
  {
422
  "epoch": 1.59,
423
+ "grad_norm": 2.34218168258667,
424
+ "learning_rate": 9.889681545448608e-05,
425
+ "loss": 0.4614,
426
  "step": 510
427
  },
428
  {
429
  "epoch": 1.62,
430
+ "grad_norm": 4.667867660522461,
431
+ "learning_rate": 9.884498674445075e-05,
432
+ "loss": 0.4268,
433
  "step": 520
434
  },
435
  {
436
  "epoch": 1.65,
437
+ "grad_norm": 8.212017059326172,
438
+ "learning_rate": 9.87919825517637e-05,
439
+ "loss": 0.3964,
440
  "step": 530
441
  },
442
  {
443
  "epoch": 1.68,
444
+ "grad_norm": 2.64796781539917,
445
+ "learning_rate": 9.873780415200123e-05,
446
+ "loss": 0.3339,
447
  "step": 540
448
  },
449
  {
450
  "epoch": 1.71,
451
+ "grad_norm": 4.198647975921631,
452
+ "learning_rate": 9.868245284899764e-05,
453
+ "loss": 0.4548,
454
  "step": 550
455
  },
456
  {
457
  "epoch": 1.74,
458
+ "grad_norm": 4.222693920135498,
459
+ "learning_rate": 9.862592997481383e-05,
460
+ "loss": 0.3731,
461
  "step": 560
462
  },
463
  {
464
  "epoch": 1.78,
465
+ "grad_norm": 3.101301670074463,
466
+ "learning_rate": 9.856823688970525e-05,
467
+ "loss": 0.3133,
468
  "step": 570
469
  },
470
  {
471
  "epoch": 1.81,
472
+ "grad_norm": 3.361762523651123,
473
+ "learning_rate": 9.850937498208906e-05,
474
+ "loss": 0.3255,
475
  "step": 580
476
  },
477
  {
478
  "epoch": 1.84,
479
+ "grad_norm": 5.818238258361816,
480
+ "learning_rate": 9.84493456685109e-05,
481
+ "loss": 0.3941,
482
  "step": 590
483
  },
484
  {
485
  "epoch": 1.87,
486
+ "grad_norm": 2.1035287380218506,
487
+ "learning_rate": 9.838815039361066e-05,
488
+ "loss": 0.3533,
489
  "step": 600
490
  },
491
  {
492
  "epoch": 1.87,
493
+ "eval_accuracy": 0.8273231622746186,
494
+ "eval_f1": 0.8192634434149068,
495
+ "eval_loss": 0.534748911857605,
496
+ "eval_precision": 0.839212218456244,
497
+ "eval_recall": 0.8273231622746186,
498
+ "eval_runtime": 36.5566,
499
+ "eval_samples_per_second": 78.891,
500
+ "eval_steps_per_second": 9.875,
501
  "step": 600
502
  },
503
  {
504
  "epoch": 1.9,
505
+ "grad_norm": 5.71665620803833,
506
+ "learning_rate": 9.832579063008777e-05,
507
+ "loss": 0.4063,
508
  "step": 610
509
  },
510
  {
511
  "epoch": 1.93,
512
+ "grad_norm": 3.79347562789917,
513
+ "learning_rate": 9.826226787866574e-05,
514
+ "loss": 0.427,
515
  "step": 620
516
  },
517
  {
518
  "epoch": 1.96,
519
+ "grad_norm": 2.647468090057373,
520
+ "learning_rate": 9.819758366805607e-05,
521
+ "loss": 0.2353,
522
  "step": 630
523
  },
524
  {
525
  "epoch": 1.99,
526
+ "grad_norm": 3.3511950969696045,
527
+ "learning_rate": 9.813173955492141e-05,
528
+ "loss": 0.3889,
529
  "step": 640
530
  },
531
  {
532
  "epoch": 2.02,
533
+ "grad_norm": 4.286926746368408,
534
+ "learning_rate": 9.806473712383817e-05,
535
+ "loss": 0.2418,
536
  "step": 650
537
  },
538
  {
539
  "epoch": 2.06,
540
+ "grad_norm": 6.692331790924072,
541
+ "learning_rate": 9.79965779872583e-05,
542
+ "loss": 0.2296,
543
  "step": 660
544
  },
545
  {
546
  "epoch": 2.09,
547
+ "grad_norm": 3.424487590789795,
548
+ "learning_rate": 9.792726378547058e-05,
549
+ "loss": 0.1989,
550
  "step": 670
551
  },
552
  {
553
  "epoch": 2.12,
554
+ "grad_norm": 6.994248390197754,
555
+ "learning_rate": 9.785679618656106e-05,
556
+ "loss": 0.2232,
557
  "step": 680
558
  },
559
  {
560
  "epoch": 2.15,
561
+ "grad_norm": 5.3724365234375,
562
+ "learning_rate": 9.778517688637298e-05,
563
+ "loss": 0.2343,
564
  "step": 690
565
  },
566
  {
567
  "epoch": 2.18,
568
+ "grad_norm": 2.975245714187622,
569
+ "learning_rate": 9.77124076084659e-05,
570
+ "loss": 0.1178,
571
  "step": 700
572
  },
573
  {
574
  "epoch": 2.18,
575
+ "eval_accuracy": 0.8283633841886269,
576
+ "eval_f1": 0.8277021804405638,
577
+ "eval_loss": 0.5425286889076233,
578
+ "eval_precision": 0.838105973857857,
579
+ "eval_recall": 0.8283633841886269,
580
+ "eval_runtime": 36.4063,
581
+ "eval_samples_per_second": 79.217,
582
+ "eval_steps_per_second": 9.916,
583
  "step": 700
584
  },
585
  {
586
  "epoch": 2.21,
587
+ "grad_norm": 0.8108430504798889,
588
+ "learning_rate": 9.763849010407431e-05,
589
+ "loss": 0.2396,
590
  "step": 710
591
  },
592
  {
593
  "epoch": 2.24,
594
+ "grad_norm": 1.5715973377227783,
595
+ "learning_rate": 9.756342615206538e-05,
596
+ "loss": 0.3591,
597
  "step": 720
598
  },
599
  {
600
  "epoch": 2.27,
601
+ "grad_norm": 5.7021894454956055,
602
+ "learning_rate": 9.748721755889619e-05,
603
+ "loss": 0.3064,
604
  "step": 730
605
  },
606
  {
607
  "epoch": 2.31,
608
+ "grad_norm": 1.4883191585540771,
609
+ "learning_rate": 9.740986615857031e-05,
610
+ "loss": 0.1716,
611
  "step": 740
612
  },
613
  {
614
  "epoch": 2.34,
615
+ "grad_norm": 5.413182735443115,
616
+ "learning_rate": 9.733137381259363e-05,
617
+ "loss": 0.2045,
618
  "step": 750
619
  },
620
  {
621
  "epoch": 2.37,
622
+ "grad_norm": 2.8399062156677246,
623
+ "learning_rate": 9.725174240992947e-05,
624
+ "loss": 0.1557,
625
  "step": 760
626
  },
627
  {
628
  "epoch": 2.4,
629
+ "grad_norm": 8.557807922363281,
630
+ "learning_rate": 9.717097386695331e-05,
631
+ "loss": 0.3044,
632
  "step": 770
633
  },
634
  {
635
  "epoch": 2.43,
636
+ "grad_norm": 4.569642066955566,
637
+ "learning_rate": 9.708907012740649e-05,
638
+ "loss": 0.338,
639
  "step": 780
640
  },
641
  {
642
  "epoch": 2.46,
643
+ "grad_norm": 3.043851852416992,
644
+ "learning_rate": 9.700603316234952e-05,
645
+ "loss": 0.4123,
646
  "step": 790
647
  },
648
  {
649
  "epoch": 2.49,
650
+ "grad_norm": 2.985642194747925,
651
+ "learning_rate": 9.692186497011465e-05,
652
+ "loss": 0.2719,
653
  "step": 800
654
  },
655
  {
656
  "epoch": 2.49,
657
+ "eval_accuracy": 0.8463938973647711,
658
+ "eval_f1": 0.8437993661883203,
659
+ "eval_loss": 0.4453237056732178,
660
+ "eval_precision": 0.8463641738950213,
661
+ "eval_recall": 0.8463938973647711,
662
+ "eval_runtime": 36.5652,
663
+ "eval_samples_per_second": 78.873,
664
+ "eval_steps_per_second": 9.873,
665
  "step": 800
666
  },
667
  {
668
  "epoch": 2.52,
669
+ "grad_norm": 4.473091125488281,
670
+ "learning_rate": 9.683656757625777e-05,
671
+ "loss": 0.2869,
672
  "step": 810
673
  },
674
  {
675
  "epoch": 2.55,
676
+ "grad_norm": 7.418603420257568,
677
+ "learning_rate": 9.67501430335096e-05,
678
+ "loss": 0.202,
679
  "step": 820
680
  },
681
  {
682
  "epoch": 2.59,
683
+ "grad_norm": 3.375176191329956,
684
+ "learning_rate": 9.666259342172643e-05,
685
+ "loss": 0.1779,
686
  "step": 830
687
  },
688
  {
689
  "epoch": 2.62,
690
+ "grad_norm": 4.781165599822998,
691
+ "learning_rate": 9.65739208478399e-05,
692
+ "loss": 0.2148,
693
  "step": 840
694
  },
695
  {
696
  "epoch": 2.65,
697
+ "grad_norm": 0.5722386837005615,
698
+ "learning_rate": 9.648412744580644e-05,
699
+ "loss": 0.1715,
700
  "step": 850
701
  },
702
  {
703
  "epoch": 2.68,
704
+ "grad_norm": 5.580469608306885,
705
+ "learning_rate": 9.63932153765558e-05,
706
+ "loss": 0.2103,
707
  "step": 860
708
  },
709
  {
710
  "epoch": 2.71,
711
+ "grad_norm": 8.275124549865723,
712
+ "learning_rate": 9.630118682793917e-05,
713
+ "loss": 0.3448,
714
  "step": 870
715
  },
716
  {
717
  "epoch": 2.74,
718
+ "grad_norm": 5.884052753448486,
719
+ "learning_rate": 9.620804401467638e-05,
720
+ "loss": 0.2852,
721
  "step": 880
722
  },
723
  {
724
  "epoch": 2.77,
725
+ "grad_norm": 2.674102783203125,
726
+ "learning_rate": 9.611378917830271e-05,
727
+ "loss": 0.1887,
728
  "step": 890
729
  },
730
  {
731
  "epoch": 2.8,
732
+ "grad_norm": 2.95959210395813,
733
+ "learning_rate": 9.601842458711493e-05,
734
+ "loss": 0.1559,
735
  "step": 900
736
  },
737
  {
738
  "epoch": 2.8,
739
+ "eval_accuracy": 0.8325242718446602,
740
+ "eval_f1": 0.8284199470401122,
741
+ "eval_loss": 0.6127275824546814,
742
+ "eval_precision": 0.8566914070001943,
743
+ "eval_recall": 0.8325242718446602,
744
+ "eval_runtime": 37.1542,
745
+ "eval_samples_per_second": 77.623,
746
+ "eval_steps_per_second": 9.716,
747
  "step": 900
748
  },
749
  {
750
  "epoch": 2.83,
751
+ "grad_norm": 3.5971999168395996,
752
+ "learning_rate": 9.592195253611667e-05,
753
+ "loss": 0.3572,
754
  "step": 910
755
  },
756
  {
757
  "epoch": 2.87,
758
+ "grad_norm": 3.3870370388031006,
759
+ "learning_rate": 9.582437534696324e-05,
760
+ "loss": 0.3793,
761
  "step": 920
762
  },
763
  {
764
  "epoch": 2.9,
765
+ "grad_norm": 0.9074994325637817,
766
+ "learning_rate": 9.572569536790572e-05,
767
+ "loss": 0.3371,
768
  "step": 930
769
  },
770
  {
771
  "epoch": 2.93,
772
+ "grad_norm": 6.770932197570801,
773
+ "learning_rate": 9.562591497373448e-05,
774
+ "loss": 0.2833,
775
  "step": 940
776
  },
777
  {
778
  "epoch": 2.96,
779
+ "grad_norm": 6.183658123016357,
780
+ "learning_rate": 9.552503656572196e-05,
781
+ "loss": 0.2882,
782
  "step": 950
783
  },
784
  {
785
  "epoch": 2.99,
786
+ "grad_norm": 5.340375900268555,
787
+ "learning_rate": 9.542306257156502e-05,
788
+ "loss": 0.1809,
789
  "step": 960
790
  },
791
  {
792
  "epoch": 3.02,
793
+ "grad_norm": 4.281813621520996,
794
+ "learning_rate": 9.531999544532633e-05,
795
+ "loss": 0.1301,
796
  "step": 970
797
  },
798
  {
799
  "epoch": 3.05,
800
+ "grad_norm": 2.9234039783477783,
801
+ "learning_rate": 9.521583766737552e-05,
802
+ "loss": 0.088,
803
  "step": 980
804
  },
805
  {
806
  "epoch": 3.08,
807
+ "grad_norm": 0.24170830845832825,
808
+ "learning_rate": 9.511059174432925e-05,
809
+ "loss": 0.0491,
810
  "step": 990
811
  },
812
  {
813
  "epoch": 3.12,
814
+ "grad_norm": 4.773263931274414,
815
+ "learning_rate": 9.500426020899115e-05,
816
+ "loss": 0.1328,
817
  "step": 1000
818
  },
819
  {
820
  "epoch": 3.12,
821
+ "eval_accuracy": 0.8509015256588072,
822
+ "eval_f1": 0.8451195646353651,
823
+ "eval_loss": 0.5303316712379456,
824
+ "eval_precision": 0.845642270599866,
825
+ "eval_recall": 0.8509015256588072,
826
+ "eval_runtime": 36.7597,
827
+ "eval_samples_per_second": 78.456,
828
+ "eval_steps_per_second": 9.821,
829
  "step": 1000
830
  },
831
  {
832
  "epoch": 3.15,
833
+ "grad_norm": 1.725915789604187,
834
+ "learning_rate": 9.489684562029066e-05,
835
+ "loss": 0.1083,
836
  "step": 1010
837
  },
838
  {
839
  "epoch": 3.18,
840
+ "grad_norm": 4.2252888679504395,
841
+ "learning_rate": 9.47883505632215e-05,
842
+ "loss": 0.1296,
843
  "step": 1020
844
  },
845
  {
846
  "epoch": 3.21,
847
+ "grad_norm": 4.19112491607666,
848
+ "learning_rate": 9.467877764877955e-05,
849
+ "loss": 0.0713,
850
  "step": 1030
851
  },
852
  {
853
  "epoch": 3.24,
854
+ "grad_norm": 0.8787875175476074,
855
+ "learning_rate": 9.45681295138999e-05,
856
+ "loss": 0.0602,
857
  "step": 1040
858
  },
859
  {
860
  "epoch": 3.27,
861
+ "grad_norm": 2.9338300228118896,
862
+ "learning_rate": 9.445640882139342e-05,
863
+ "loss": 0.1112,
864
  "step": 1050
865
  },
866
  {
867
  "epoch": 3.3,
868
+ "grad_norm": 0.03492557257413864,
869
+ "learning_rate": 9.434361825988276e-05,
870
+ "loss": 0.0632,
871
  "step": 1060
872
  },
873
  {
874
  "epoch": 3.33,
875
+ "grad_norm": 7.183565616607666,
876
+ "learning_rate": 9.422976054373753e-05,
877
+ "loss": 0.1271,
878
  "step": 1070
879
  },
880
  {
881
  "epoch": 3.36,
882
+ "grad_norm": 5.910800457000732,
883
+ "learning_rate": 9.411483841300905e-05,
884
+ "loss": 0.1384,
885
  "step": 1080
886
  },
887
  {
888
  "epoch": 3.4,
889
+ "grad_norm": 4.911332607269287,
890
+ "learning_rate": 9.399885463336437e-05,
891
+ "loss": 0.0607,
892
  "step": 1090
893
  },
894
  {
895
  "epoch": 3.43,
896
+ "grad_norm": 1.9047012329101562,
897
+ "learning_rate": 9.388181199601974e-05,
898
+ "loss": 0.1756,
899
  "step": 1100
900
  },
901
  {
902
  "epoch": 3.43,
903
+ "eval_accuracy": 0.8321775312066574,
904
+ "eval_f1": 0.8151306059680461,
905
+ "eval_loss": 0.7960126996040344,
906
+ "eval_precision": 0.8366372545968512,
907
+ "eval_recall": 0.8321775312066574,
908
+ "eval_runtime": 36.834,
909
+ "eval_samples_per_second": 78.297,
910
+ "eval_steps_per_second": 9.801,
911
  "step": 1100
912
  },
913
  {
914
  "epoch": 3.46,
915
+ "grad_norm": 0.7878803610801697,
916
+ "learning_rate": 9.376371331767345e-05,
917
+ "loss": 0.1006,
918
  "step": 1110
919
  },
920
  {
921
  "epoch": 3.49,
922
+ "grad_norm": 1.042022705078125,
923
+ "learning_rate": 9.364456144043798e-05,
924
+ "loss": 0.1516,
925
  "step": 1120
926
  },
927
  {
928
  "epoch": 3.52,
929
+ "grad_norm": 1.4984287023544312,
930
+ "learning_rate": 9.35243592317717e-05,
931
+ "loss": 0.0771,
932
  "step": 1130
933
  },
934
  {
935
  "epoch": 3.55,
936
+ "grad_norm": 7.682912349700928,
937
+ "learning_rate": 9.340310958440976e-05,
938
+ "loss": 0.0898,
939
  "step": 1140
940
  },
941
  {
942
  "epoch": 3.58,
943
+ "grad_norm": 4.866548538208008,
944
+ "learning_rate": 9.328081541629453e-05,
945
+ "loss": 0.1182,
946
  "step": 1150
947
  },
948
  {
949
  "epoch": 3.61,
950
+ "grad_norm": 2.1378111839294434,
951
+ "learning_rate": 9.315747967050541e-05,
952
+ "loss": 0.2255,
953
  "step": 1160
954
  },
955
  {
956
  "epoch": 3.64,
957
+ "grad_norm": 1.4697102308273315,
958
+ "learning_rate": 9.303310531518793e-05,
959
+ "loss": 0.1011,
960
  "step": 1170
961
  },
962
  {
963
  "epoch": 3.68,
964
+ "grad_norm": 5.105794429779053,
965
+ "learning_rate": 9.290769534348236e-05,
966
+ "loss": 0.1298,
967
  "step": 1180
968
  },
969
  {
970
  "epoch": 3.71,
971
+ "grad_norm": 5.116852760314941,
972
+ "learning_rate": 9.278125277345168e-05,
973
+ "loss": 0.1145,
974
  "step": 1190
975
  },
976
  {
977
  "epoch": 3.74,
978
+ "grad_norm": 1.9126471281051636,
979
+ "learning_rate": 9.265378064800895e-05,
980
+ "loss": 0.3582,
981
  "step": 1200
982
  },
983
  {
984
  "epoch": 3.74,
985
+ "eval_accuracy": 0.834257975034674,
986
+ "eval_f1": 0.824916890066515,
987
+ "eval_loss": 0.6675512790679932,
988
+ "eval_precision": 0.8284494824114729,
989
+ "eval_recall": 0.834257975034674,
990
+ "eval_runtime": 36.0917,
991
+ "eval_samples_per_second": 79.908,
992
+ "eval_steps_per_second": 10.002,
993
  "step": 1200
994
  },
995
  {
996
  "epoch": 3.77,
997
+ "grad_norm": 4.828185081481934,
998
+ "learning_rate": 9.252528203484403e-05,
999
+ "loss": 0.1843,
1000
  "step": 1210
1001
  },
1002
  {
1003
  "epoch": 3.8,
1004
+ "grad_norm": 2.7517149448394775,
1005
+ "learning_rate": 9.239576002634984e-05,
1006
+ "loss": 0.1066,
1007
  "step": 1220
1008
  },
1009
  {
1010
  "epoch": 3.83,
1011
+ "grad_norm": 3.601691246032715,
1012
+ "learning_rate": 9.226521773954791e-05,
1013
+ "loss": 0.1121,
1014
  "step": 1230
1015
  },
1016
  {
1017
  "epoch": 3.86,
1018
+ "grad_norm": 0.0293317511677742,
1019
+ "learning_rate": 9.21336583160133e-05,
1020
+ "loss": 0.1822,
1021
  "step": 1240
1022
  },
1023
  {
1024
  "epoch": 3.89,
1025
+ "grad_norm": 0.6248491406440735,
1026
+ "learning_rate": 9.200108492179906e-05,
1027
+ "loss": 0.1261,
1028
  "step": 1250
1029
  },
1030
  {
1031
  "epoch": 3.93,
1032
+ "grad_norm": 0.1484900414943695,
1033
+ "learning_rate": 9.186750074736009e-05,
1034
+ "loss": 0.1224,
1035
  "step": 1260
1036
  },
1037
  {
1038
  "epoch": 3.96,
1039
+ "grad_norm": 2.4208881855010986,
1040
+ "learning_rate": 9.17329090074762e-05,
1041
+ "loss": 0.1018,
1042
  "step": 1270
1043
  },
1044
  {
1045
  "epoch": 3.99,
1046
+ "grad_norm": 0.208229199051857,
1047
+ "learning_rate": 9.159731294117492e-05,
1048
+ "loss": 0.1453,
1049
  "step": 1280
1050
  },
1051
  {
1052
  "epoch": 4.02,
1053
+ "grad_norm": 0.03745197877287865,
1054
+ "learning_rate": 9.146071581165345e-05,
1055
+ "loss": 0.1056,
1056
  "step": 1290
1057
  },
1058
  {
1059
  "epoch": 4.05,
1060
+ "grad_norm": 1.308124303817749,
1061
+ "learning_rate": 9.132312090620011e-05,
1062
+ "loss": 0.025,
1063
  "step": 1300
1064
  },
1065
  {
1066
  "epoch": 4.05,
1067
+ "eval_accuracy": 0.8474341192787794,
1068
+ "eval_f1": 0.8476771584783079,
1069
+ "eval_loss": 0.5981015563011169,
1070
+ "eval_precision": 0.859867898706205,
1071
+ "eval_recall": 0.8474341192787794,
1072
+ "eval_runtime": 36.0165,
1073
+ "eval_samples_per_second": 80.074,
1074
+ "eval_steps_per_second": 10.023,
1075
  "step": 1300
1076
  },
1077
  {
1078
  "epoch": 4.08,
1079
+ "grad_norm": 0.0758899599313736,
1080
+ "learning_rate": 9.118453153611532e-05,
1081
+ "loss": 0.0298,
1082
  "step": 1310
1083
  },
1084
  {
1085
  "epoch": 4.11,
1086
+ "grad_norm": 0.28617605566978455,
1087
+ "learning_rate": 9.104495103663187e-05,
1088
+ "loss": 0.017,
1089
  "step": 1320
1090
  },
1091
  {
1092
  "epoch": 4.14,
1093
+ "grad_norm": 0.19124433398246765,
1094
+ "learning_rate": 9.090438276683457e-05,
1095
+ "loss": 0.017,
1096
  "step": 1330
1097
  },
1098
  {
1099
  "epoch": 4.17,
1100
+ "grad_norm": 0.31906023621559143,
1101
+ "learning_rate": 9.07628301095796e-05,
1102
+ "loss": 0.0196,
1103
  "step": 1340
1104
  },
1105
  {
1106
  "epoch": 4.21,
1107
+ "grad_norm": 7.870569229125977,
1108
+ "learning_rate": 9.062029647141289e-05,
1109
+ "loss": 0.0685,
1110
  "step": 1350
1111
  },
1112
  {
1113
  "epoch": 4.24,
1114
+ "grad_norm": 5.755252361297607,
1115
+ "learning_rate": 9.04767852824883e-05,
1116
+ "loss": 0.0266,
1117
  "step": 1360
1118
  },
1119
  {
1120
  "epoch": 4.27,
1121
+ "grad_norm": 0.05481214076280594,
1122
+ "learning_rate": 9.0332299996485e-05,
1123
+ "loss": 0.0594,
1124
  "step": 1370
1125
  },
1126
  {
1127
  "epoch": 4.3,
1128
+ "grad_norm": 5.236385345458984,
1129
+ "learning_rate": 9.018684409052436e-05,
1130
+ "loss": 0.0999,
1131
  "step": 1380
1132
  },
1133
  {
1134
  "epoch": 4.33,
1135
+ "grad_norm": 5.066316604614258,
1136
+ "learning_rate": 9.004042106508625e-05,
1137
+ "loss": 0.0612,
1138
  "step": 1390
1139
  },
1140
  {
1141
  "epoch": 4.36,
1142
+ "grad_norm": 0.9376081824302673,
1143
+ "learning_rate": 8.989303444392487e-05,
1144
+ "loss": 0.042,
1145
  "step": 1400
1146
  },
1147
  {
1148
  "epoch": 4.36,
1149
+ "eval_accuracy": 0.8162274618585298,
1150
+ "eval_f1": 0.8241381969601037,
1151
+ "eval_loss": 0.8095719814300537,
1152
+ "eval_precision": 0.8477203881282387,
1153
+ "eval_recall": 0.8162274618585298,
1154
+ "eval_runtime": 36.3685,
1155
+ "eval_samples_per_second": 79.299,
1156
+ "eval_steps_per_second": 9.926,
1157
  "step": 1400
1158
  },
1159
  {
1160
  "epoch": 4.39,
1161
+ "grad_norm": 0.013413701206445694,
1162
+ "learning_rate": 8.974468777398388e-05,
1163
+ "loss": 0.1021,
1164
  "step": 1410
1165
  },
1166
  {
1167
  "epoch": 4.42,
1168
+ "grad_norm": 7.129204750061035,
1169
+ "learning_rate": 8.959538462531108e-05,
1170
+ "loss": 0.1356,
1171
  "step": 1420
1172
  },
1173
  {
1174
  "epoch": 4.45,
1175
+ "grad_norm": 3.306025505065918,
1176
+ "learning_rate": 8.944512859097245e-05,
1177
+ "loss": 0.0191,
1178
  "step": 1430
1179
  },
1180
  {
1181
  "epoch": 4.49,
1182
+ "grad_norm": 0.022593187168240547,
1183
+ "learning_rate": 8.929392328696574e-05,
1184
+ "loss": 0.0448,
1185
  "step": 1440
1186
  },
1187
  {
1188
  "epoch": 4.52,
1189
+ "grad_norm": 4.569972991943359,
1190
+ "learning_rate": 8.914177235213341e-05,
1191
+ "loss": 0.073,
1192
  "step": 1450
1193
  },
1194
  {
1195
  "epoch": 4.55,
1196
+ "grad_norm": 4.406450271606445,
1197
+ "learning_rate": 8.898867944807507e-05,
1198
+ "loss": 0.0672,
1199
  "step": 1460
1200
  },
1201
  {
1202
  "epoch": 4.58,
1203
+ "grad_norm": 7.7769904136657715,
1204
+ "learning_rate": 8.883464825905934e-05,
1205
+ "loss": 0.0947,
1206
  "step": 1470
1207
  },
1208
  {
1209
  "epoch": 4.61,
1210
+ "grad_norm": 9.705739974975586,
1211
+ "learning_rate": 8.867968249193526e-05,
1212
+ "loss": 0.0344,
1213
  "step": 1480
1214
  },
1215
  {
1216
  "epoch": 4.64,
1217
+ "grad_norm": 1.0004101991653442,
1218
+ "learning_rate": 8.852378587604297e-05,
1219
+ "loss": 0.0877,
1220
  "step": 1490
1221
  },
1222
  {
1223
  "epoch": 4.67,
1224
+ "grad_norm": 0.8759760856628418,
1225
+ "learning_rate": 8.836696216312405e-05,
1226
+ "loss": 0.05,
1227
  "step": 1500
1228
  },
1229
  {
1230
  "epoch": 4.67,
1231
+ "eval_accuracy": 0.841886269070735,
1232
+ "eval_f1": 0.8341416187793224,
1233
+ "eval_loss": 0.7948272228240967,
1234
+ "eval_precision": 0.8474305891416275,
1235
+ "eval_recall": 0.841886269070735,
1236
+ "eval_runtime": 37.2997,
1237
+ "eval_samples_per_second": 77.32,
1238
+ "eval_steps_per_second": 9.678,
1239
  "step": 1500
1240
  },
1241
  {
1242
  "epoch": 4.7,
1243
+ "grad_norm": 0.008059758692979813,
1244
+ "learning_rate": 8.82092151272312e-05,
1245
+ "loss": 0.0939,
1246
  "step": 1510
1247
  },
1248
  {
1249
  "epoch": 4.74,
1250
+ "grad_norm": 0.09355029463768005,
1251
+ "learning_rate": 8.80505485646374e-05,
1252
+ "loss": 0.03,
1253
  "step": 1520
1254
  },
1255
  {
1256
  "epoch": 4.77,
1257
+ "grad_norm": 0.4395085871219635,
1258
+ "learning_rate": 8.78909662937446e-05,
1259
+ "loss": 0.1189,
1260
  "step": 1530
1261
  },
1262
  {
1263
  "epoch": 4.8,
1264
+ "grad_norm": 0.6208884119987488,
1265
+ "learning_rate": 8.773047215499176e-05,
1266
+ "loss": 0.06,
1267
  "step": 1540
1268
  },
1269
  {
1270
  "epoch": 4.83,
1271
+ "grad_norm": 1.5753854513168335,
1272
+ "learning_rate": 8.756907001076249e-05,
1273
+ "loss": 0.0517,
1274
  "step": 1550
1275
  },
1276
  {
1277
  "epoch": 4.86,
1278
+ "grad_norm": 5.975317001342773,
1279
+ "learning_rate": 8.740676374529206e-05,
1280
+ "loss": 0.1042,
1281
  "step": 1560
1282
  },
1283
  {
1284
  "epoch": 4.89,
1285
+ "grad_norm": 3.8262839317321777,
1286
+ "learning_rate": 8.724355726457395e-05,
1287
+ "loss": 0.0716,
1288
  "step": 1570
1289
  },
1290
  {
1291
  "epoch": 4.92,
1292
+ "grad_norm": 2.5249273777008057,
1293
+ "learning_rate": 8.707945449626583e-05,
1294
+ "loss": 0.0254,
1295
  "step": 1580
1296
  },
1297
  {
1298
  "epoch": 4.95,
1299
+ "grad_norm": 10.16901969909668,
1300
+ "learning_rate": 8.691445938959504e-05,
1301
+ "loss": 0.1318,
1302
  "step": 1590
1303
  },
1304
  {
1305
  "epoch": 4.98,
1306
+ "grad_norm": 1.012184977531433,
1307
+ "learning_rate": 8.674857591526355e-05,
1308
+ "loss": 0.028,
1309
  "step": 1600
1310
  },
1311
  {
1312
  "epoch": 4.98,
1313
+ "eval_accuracy": 0.8457004160887656,
1314
+ "eval_f1": 0.8476462818721602,
1315
+ "eval_loss": 0.6741925477981567,
1316
+ "eval_precision": 0.8558392490201036,
1317
+ "eval_recall": 0.8457004160887656,
1318
+ "eval_runtime": 37.79,
1319
+ "eval_samples_per_second": 76.316,
1320
+ "eval_steps_per_second": 9.553,
1321
  "step": 1600
1322
  },
1323
  {
1324
  "epoch": 5.02,
1325
+ "grad_norm": 1.2411854267120361,
1326
+ "learning_rate": 8.658180806535243e-05,
1327
+ "loss": 0.0079,
1328
  "step": 1610
1329
  },
1330
  {
1331
  "epoch": 5.05,
1332
+ "grad_norm": 8.111499786376953,
1333
+ "learning_rate": 8.641415985322571e-05,
1334
+ "loss": 0.0455,
1335
  "step": 1620
1336
  },
1337
  {
1338
  "epoch": 5.08,
1339
+ "grad_norm": 0.05484266206622124,
1340
+ "learning_rate": 8.624563531343393e-05,
1341
+ "loss": 0.0255,
1342
  "step": 1630
1343
  },
1344
  {
1345
  "epoch": 5.11,
1346
+ "grad_norm": 0.6857353448867798,
1347
+ "learning_rate": 8.607623850161686e-05,
1348
+ "loss": 0.0115,
1349
  "step": 1640
1350
  },
1351
  {
1352
  "epoch": 5.14,
1353
+ "grad_norm": 0.38402843475341797,
1354
+ "learning_rate": 8.590597349440604e-05,
1355
+ "loss": 0.0171,
1356
  "step": 1650
1357
  },
1358
  {
1359
  "epoch": 5.17,
1360
+ "grad_norm": 0.07074743509292603,
1361
+ "learning_rate": 8.573484438932666e-05,
1362
+ "loss": 0.0067,
1363
  "step": 1660
1364
  },
1365
  {
1366
  "epoch": 5.2,
1367
+ "grad_norm": 0.1260824352502823,
1368
+ "learning_rate": 8.556285530469887e-05,
1369
+ "loss": 0.009,
1370
  "step": 1670
1371
  },
1372
  {
1373
  "epoch": 5.23,
1374
+ "grad_norm": 0.23438212275505066,
1375
+ "learning_rate": 8.539001037953876e-05,
1376
+ "loss": 0.0232,
1377
  "step": 1680
1378
  },
1379
  {
1380
  "epoch": 5.26,
1381
+ "grad_norm": 0.19910460710525513,
1382
+ "learning_rate": 8.521631377345869e-05,
1383
+ "loss": 0.0022,
1384
  "step": 1690
1385
  },
1386
  {
1387
  "epoch": 5.3,
1388
+ "grad_norm": 0.016467662528157234,
1389
+ "learning_rate": 8.50417696665672e-05,
1390
+ "loss": 0.0048,
1391
  "step": 1700
1392
  },
1393
  {
1394
  "epoch": 5.3,
1395
+ "eval_accuracy": 0.8484743411927878,
1396
+ "eval_f1": 0.8499629260872099,
1397
+ "eval_loss": 0.7832539081573486,
1398
+ "eval_precision": 0.8576584679191768,
1399
+ "eval_recall": 0.8484743411927878,
1400
+ "eval_runtime": 37.0076,
1401
+ "eval_samples_per_second": 77.93,
1402
+ "eval_steps_per_second": 9.755,
1403
  "step": 1700
1404
  },
1405
  {
1406
  "epoch": 5.33,
1407
+ "grad_norm": 0.006973025389015675,
1408
+ "learning_rate": 8.486638225936848e-05,
1409
+ "loss": 0.073,
1410
  "step": 1710
1411
  },
1412
  {
1413
  "epoch": 5.36,
1414
+ "grad_norm": 0.29455187916755676,
1415
+ "learning_rate": 8.469015577266115e-05,
1416
+ "loss": 0.002,
1417
  "step": 1720
1418
  },
1419
  {
1420
  "epoch": 5.39,
1421
+ "grad_norm": 0.06881581246852875,
1422
+ "learning_rate": 8.451309444743682e-05,
1423
+ "loss": 0.0479,
1424
  "step": 1730
1425
  },
1426
  {
1427
  "epoch": 5.42,
1428
+ "grad_norm": 0.03852876275777817,
1429
+ "learning_rate": 8.433520254477793e-05,
1430
+ "loss": 0.0821,
1431
  "step": 1740
1432
  },
1433
  {
1434
  "epoch": 5.45,
1435
+ "grad_norm": 0.0018428952898830175,
1436
+ "learning_rate": 8.415648434575529e-05,
1437
+ "loss": 0.0375,
1438
  "step": 1750
1439
  },
1440
  {
1441
  "epoch": 5.48,
1442
+ "grad_norm": 0.003056368324905634,
1443
+ "learning_rate": 8.397694415132495e-05,
1444
+ "loss": 0.0884,
1445
  "step": 1760
1446
  },
1447
  {
1448
  "epoch": 5.51,
1449
+ "grad_norm": 1.8021758794784546,
1450
+ "learning_rate": 8.379658628222478e-05,
1451
+ "loss": 0.0091,
1452
  "step": 1770
1453
  },
1454
  {
1455
  "epoch": 5.55,
1456
+ "grad_norm": 0.1937793642282486,
1457
+ "learning_rate": 8.361541507887045e-05,
1458
+ "loss": 0.0101,
1459
  "step": 1780
1460
  },
1461
  {
1462
  "epoch": 5.58,
1463
+ "grad_norm": 2.138684034347534,
1464
+ "learning_rate": 8.343343490125102e-05,
1465
+ "loss": 0.1135,
1466
  "step": 1790
1467
  },
1468
  {
1469
  "epoch": 5.61,
1470
+ "grad_norm": 0.781872034072876,
1471
+ "learning_rate": 8.325065012882392e-05,
1472
+ "loss": 0.0324,
1473
  "step": 1800
1474
  },
1475
  {
1476
  "epoch": 5.61,
1477
+ "eval_accuracy": 0.8533287101248266,
1478
+ "eval_f1": 0.8511397162148655,
1479
+ "eval_loss": 0.7405093312263489,
1480
+ "eval_precision": 0.8523403828700276,
1481
+ "eval_recall": 0.8533287101248266,
1482
+ "eval_runtime": 37.0368,
1483
+ "eval_samples_per_second": 77.869,
1484
+ "eval_steps_per_second": 9.747,
1485
  "step": 1800
1486
  },
1487
  {
1488
  "epoch": 5.61,
1489
  "step": 1800,
1490
  "total_flos": 2.2287694956200755e+18,
1491
+ "train_loss": 0.2811500767639114,
1492
+ "train_runtime": 1301.746,
1493
+ "train_samples_per_second": 393.932,
1494
+ "train_steps_per_second": 24.659
1495
  }
1496
  ],
1497
  "logging_steps": 10,