Xiyin02 commited on
Commit
a071d0a
1 Parent(s): 93707de

Upload 8 files

Browse files
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 96.20253164556962,
3
- "eval_accuracy": 0.8746031746031746,
4
- "eval_loss": 0.5267017483711243,
5
- "eval_runtime": 10.7929,
6
- "eval_samples_per_second": 116.743,
7
- "eval_steps_per_second": 0.927,
8
- "total_flos": 7.515490775048022e+19,
9
- "train_loss": 0.33647052476280614,
10
- "train_runtime": 20573.1873,
11
- "train_samples_per_second": 48.996,
12
- "train_steps_per_second": 0.092
13
  }
 
1
  {
2
+ "epoch": 29.620253164556964,
3
+ "eval_accuracy": 0.8714285714285714,
4
+ "eval_loss": 0.4378375709056854,
5
+ "eval_runtime": 10.2444,
6
+ "eval_samples_per_second": 122.994,
7
+ "eval_steps_per_second": 1.952,
8
+ "total_flos": 2.3141184141358596e+19,
9
+ "train_loss": 0.5155148339067769,
10
+ "train_runtime": 6443.0087,
11
+ "train_samples_per_second": 46.935,
12
+ "train_steps_per_second": 0.182
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 96.20253164556962,
3
- "eval_accuracy": 0.8746031746031746,
4
- "eval_loss": 0.5267017483711243,
5
- "eval_runtime": 10.7929,
6
- "eval_samples_per_second": 116.743,
7
- "eval_steps_per_second": 0.927
8
  }
 
1
  {
2
+ "epoch": 29.620253164556964,
3
+ "eval_accuracy": 0.8714285714285714,
4
+ "eval_loss": 0.4378375709056854,
5
+ "eval_runtime": 10.2444,
6
+ "eval_samples_per_second": 122.994,
7
+ "eval_steps_per_second": 1.952
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff64402a55469a315d1e1c5a2136d2f36fa2972e9e72b454371736d9368e64da
3
  size 343263964
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d008a2f8eaa5b32764dec5152e4060648d52f76f2db6d97f8b401961210f8ee9
3
  size 343263964
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 96.20253164556962,
3
- "total_flos": 7.515490775048022e+19,
4
- "train_loss": 0.33647052476280614,
5
- "train_runtime": 20573.1873,
6
- "train_samples_per_second": 48.996,
7
- "train_steps_per_second": 0.092
8
  }
 
1
  {
2
+ "epoch": 29.620253164556964,
3
+ "total_flos": 2.3141184141358596e+19,
4
+ "train_loss": 0.5155148339067769,
5
+ "train_runtime": 6443.0087,
6
+ "train_samples_per_second": 46.935,
7
+ "train_steps_per_second": 0.182
8
  }
trainer_state.json CHANGED
@@ -1,1579 +1,507 @@
1
  {
2
- "best_metric": 0.8746031746031746,
3
- "best_model_checkpoint": "CP2_HAR_vit-base-patch16-224/checkpoint-1382",
4
- "epoch": 96.20253164556962,
5
  "eval_steps": 500,
6
- "global_step": 1900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.9620253164556962,
13
- "grad_norm": 2.8217339515686035,
14
- "learning_rate": 5e-06,
15
- "loss": 2.8429,
16
- "step": 19
17
- },
18
- {
19
- "epoch": 0.9620253164556962,
20
- "eval_accuracy": 0.16904761904761906,
21
- "eval_loss": 2.6474363803863525,
22
- "eval_runtime": 10.7675,
23
- "eval_samples_per_second": 117.018,
24
- "eval_steps_per_second": 0.929,
25
- "step": 19
26
- },
27
- {
28
- "epoch": 1.9746835443037973,
29
- "grad_norm": 2.534130334854126,
30
- "learning_rate": 1.0263157894736843e-05,
31
- "loss": 2.5152,
32
  "step": 39
33
  },
34
  {
35
- "epoch": 1.9746835443037973,
36
- "eval_accuracy": 0.31587301587301586,
37
- "eval_loss": 2.342618227005005,
38
- "eval_runtime": 10.7708,
39
- "eval_samples_per_second": 116.983,
40
- "eval_steps_per_second": 0.928,
41
  "step": 39
42
  },
43
  {
44
- "epoch": 2.9873417721518987,
45
- "grad_norm": 2.3139591217041016,
46
- "learning_rate": 1.5526315789473686e-05,
47
- "loss": 2.1548,
48
- "step": 59
49
- },
50
- {
51
- "epoch": 2.9873417721518987,
52
- "eval_accuracy": 0.5626984126984127,
53
- "eval_loss": 1.8545457124710083,
54
- "eval_runtime": 10.8474,
55
- "eval_samples_per_second": 116.157,
56
- "eval_steps_per_second": 0.922,
57
- "step": 59
58
- },
59
- {
60
- "epoch": 4.0,
61
- "grad_norm": 1.9620369672775269,
62
- "learning_rate": 2.078947368421053e-05,
63
- "loss": 1.6569,
64
  "step": 79
65
  },
66
  {
67
- "epoch": 4.0,
68
- "eval_accuracy": 0.7261904761904762,
69
- "eval_loss": 1.2644377946853638,
70
- "eval_runtime": 10.7977,
71
- "eval_samples_per_second": 116.692,
72
- "eval_steps_per_second": 0.926,
73
  "step": 79
74
  },
75
  {
76
- "epoch": 4.962025316455696,
77
- "grad_norm": 1.4392253160476685,
78
- "learning_rate": 2.578947368421053e-05,
79
- "loss": 1.2393,
80
- "step": 98
81
- },
82
- {
83
- "epoch": 4.962025316455696,
84
- "eval_accuracy": 0.7714285714285715,
85
- "eval_loss": 0.8716472387313843,
86
- "eval_runtime": 10.7696,
87
- "eval_samples_per_second": 116.996,
88
- "eval_steps_per_second": 0.929,
89
- "step": 98
90
- },
91
- {
92
- "epoch": 5.974683544303797,
93
- "grad_norm": 1.3771088123321533,
94
- "learning_rate": 3.105263157894737e-05,
95
- "loss": 0.8982,
96
  "step": 118
97
  },
98
  {
99
- "epoch": 5.974683544303797,
100
- "eval_accuracy": 0.8150793650793651,
101
- "eval_loss": 0.6652108430862427,
102
- "eval_runtime": 10.7233,
103
- "eval_samples_per_second": 117.501,
104
- "eval_steps_per_second": 0.933,
105
  "step": 118
106
  },
107
  {
108
- "epoch": 6.987341772151899,
109
- "grad_norm": 1.6404207944869995,
110
- "learning_rate": 3.6315789473684214e-05,
111
- "loss": 0.7694,
112
- "step": 138
113
- },
114
- {
115
- "epoch": 6.987341772151899,
116
- "eval_accuracy": 0.830952380952381,
117
- "eval_loss": 0.596939206123352,
118
- "eval_runtime": 10.7687,
119
- "eval_samples_per_second": 117.006,
120
- "eval_steps_per_second": 0.929,
121
- "step": 138
122
- },
123
- {
124
- "epoch": 8.0,
125
- "grad_norm": 1.4968894720077515,
126
- "learning_rate": 4.157894736842106e-05,
127
- "loss": 0.6819,
128
  "step": 158
129
  },
130
  {
131
- "epoch": 8.0,
132
- "eval_accuracy": 0.8396825396825397,
133
- "eval_loss": 0.5484516620635986,
134
- "eval_runtime": 10.8135,
135
- "eval_samples_per_second": 116.521,
136
- "eval_steps_per_second": 0.925,
137
  "step": 158
138
  },
139
  {
140
- "epoch": 8.962025316455696,
141
- "grad_norm": 1.414362907409668,
142
- "learning_rate": 4.657894736842106e-05,
143
- "loss": 0.6628,
144
- "step": 177
145
- },
146
- {
147
- "epoch": 8.962025316455696,
148
- "eval_accuracy": 0.8476190476190476,
149
- "eval_loss": 0.5053515434265137,
150
- "eval_runtime": 10.7521,
151
- "eval_samples_per_second": 117.186,
152
- "eval_steps_per_second": 0.93,
153
- "step": 177
154
- },
155
- {
156
- "epoch": 9.974683544303797,
157
- "grad_norm": 1.5052249431610107,
158
- "learning_rate": 4.97953216374269e-05,
159
- "loss": 0.5759,
160
  "step": 197
161
  },
162
  {
163
- "epoch": 9.974683544303797,
164
- "eval_accuracy": 0.8476190476190476,
165
- "eval_loss": 0.5064724683761597,
166
- "eval_runtime": 10.8275,
167
- "eval_samples_per_second": 116.37,
168
- "eval_steps_per_second": 0.924,
169
  "step": 197
170
  },
171
  {
172
- "epoch": 10.987341772151899,
173
- "grad_norm": 1.4250831604003906,
174
- "learning_rate": 4.921052631578947e-05,
175
- "loss": 0.5385,
176
- "step": 217
177
- },
178
- {
179
- "epoch": 10.987341772151899,
180
- "eval_accuracy": 0.8420634920634921,
181
- "eval_loss": 0.4821438789367676,
182
- "eval_runtime": 10.8099,
183
- "eval_samples_per_second": 116.56,
184
- "eval_steps_per_second": 0.925,
185
- "step": 217
186
- },
187
- {
188
- "epoch": 12.0,
189
- "grad_norm": 1.3822650909423828,
190
- "learning_rate": 4.8625730994152046e-05,
191
- "loss": 0.5022,
192
  "step": 237
193
  },
194
  {
195
- "epoch": 12.0,
196
- "eval_accuracy": 0.8507936507936508,
197
- "eval_loss": 0.47235107421875,
198
- "eval_runtime": 10.7944,
199
- "eval_samples_per_second": 116.727,
200
- "eval_steps_per_second": 0.926,
201
  "step": 237
202
  },
203
  {
204
- "epoch": 12.962025316455696,
205
- "grad_norm": 1.543864369392395,
206
- "learning_rate": 4.807017543859649e-05,
207
- "loss": 0.4841,
208
- "step": 256
209
- },
210
- {
211
- "epoch": 12.962025316455696,
212
- "eval_accuracy": 0.8587301587301587,
213
- "eval_loss": 0.48088887333869934,
214
- "eval_runtime": 10.7741,
215
- "eval_samples_per_second": 116.947,
216
- "eval_steps_per_second": 0.928,
217
- "step": 256
218
- },
219
- {
220
- "epoch": 13.974683544303797,
221
- "grad_norm": 1.4722260236740112,
222
- "learning_rate": 4.7485380116959065e-05,
223
- "loss": 0.4543,
224
  "step": 276
225
  },
226
  {
227
- "epoch": 13.974683544303797,
228
- "eval_accuracy": 0.861904761904762,
229
- "eval_loss": 0.4476229250431061,
230
- "eval_runtime": 10.73,
231
- "eval_samples_per_second": 117.428,
232
- "eval_steps_per_second": 0.932,
233
  "step": 276
234
  },
235
  {
236
- "epoch": 14.987341772151899,
237
- "grad_norm": 1.5065507888793945,
238
- "learning_rate": 4.690058479532164e-05,
239
- "loss": 0.4356,
240
- "step": 296
241
- },
242
- {
243
- "epoch": 14.987341772151899,
244
- "eval_accuracy": 0.8579365079365079,
245
- "eval_loss": 0.47357481718063354,
246
- "eval_runtime": 10.7482,
247
- "eval_samples_per_second": 117.229,
248
- "eval_steps_per_second": 0.93,
249
- "step": 296
250
- },
251
- {
252
- "epoch": 16.0,
253
- "grad_norm": 1.6331088542938232,
254
- "learning_rate": 4.6315789473684214e-05,
255
- "loss": 0.4021,
256
  "step": 316
257
  },
258
  {
259
- "epoch": 16.0,
260
- "eval_accuracy": 0.8587301587301587,
261
- "eval_loss": 0.46398431062698364,
262
- "eval_runtime": 10.7745,
263
- "eval_samples_per_second": 116.943,
264
- "eval_steps_per_second": 0.928,
265
  "step": 316
266
  },
267
  {
268
- "epoch": 16.962025316455698,
269
- "grad_norm": 1.7429494857788086,
270
- "learning_rate": 4.576023391812866e-05,
271
- "loss": 0.4073,
272
- "step": 335
273
- },
274
- {
275
- "epoch": 16.962025316455698,
276
- "eval_accuracy": 0.8579365079365079,
277
- "eval_loss": 0.4629625976085663,
278
- "eval_runtime": 10.7423,
279
- "eval_samples_per_second": 117.293,
280
- "eval_steps_per_second": 0.931,
281
- "step": 335
282
- },
283
- {
284
- "epoch": 17.974683544303797,
285
- "grad_norm": 1.3264607191085815,
286
- "learning_rate": 4.517543859649123e-05,
287
- "loss": 0.3782,
288
  "step": 355
289
  },
290
  {
291
- "epoch": 17.974683544303797,
292
- "eval_accuracy": 0.8595238095238096,
293
- "eval_loss": 0.4655977785587311,
294
- "eval_runtime": 10.8299,
295
- "eval_samples_per_second": 116.344,
296
- "eval_steps_per_second": 0.923,
297
  "step": 355
298
  },
299
  {
300
- "epoch": 18.9873417721519,
301
- "grad_norm": 1.481920599937439,
302
- "learning_rate": 4.4590643274853806e-05,
303
- "loss": 0.3617,
304
- "step": 375
305
- },
306
- {
307
- "epoch": 18.9873417721519,
308
- "eval_accuracy": 0.861904761904762,
309
- "eval_loss": 0.4484713077545166,
310
- "eval_runtime": 10.749,
311
- "eval_samples_per_second": 117.221,
312
- "eval_steps_per_second": 0.93,
313
- "step": 375
314
- },
315
- {
316
- "epoch": 20.0,
317
- "grad_norm": 1.3743647336959839,
318
- "learning_rate": 4.400584795321638e-05,
319
- "loss": 0.3448,
320
  "step": 395
321
  },
322
  {
323
- "epoch": 20.0,
324
- "eval_accuracy": 0.861904761904762,
325
- "eval_loss": 0.4735279381275177,
326
- "eval_runtime": 10.7186,
327
- "eval_samples_per_second": 117.553,
328
- "eval_steps_per_second": 0.933,
329
  "step": 395
330
  },
331
  {
332
- "epoch": 20.962025316455698,
333
- "grad_norm": 1.3664813041687012,
334
- "learning_rate": 4.345029239766082e-05,
335
- "loss": 0.3549,
336
- "step": 414
337
- },
338
- {
339
- "epoch": 20.962025316455698,
340
- "eval_accuracy": 0.8571428571428571,
341
- "eval_loss": 0.4780659079551697,
342
- "eval_runtime": 10.7555,
343
- "eval_samples_per_second": 117.149,
344
- "eval_steps_per_second": 0.93,
345
- "step": 414
346
- },
347
- {
348
- "epoch": 21.974683544303797,
349
- "grad_norm": 1.4020764827728271,
350
- "learning_rate": 4.286549707602339e-05,
351
- "loss": 0.3195,
352
  "step": 434
353
  },
354
  {
355
- "epoch": 21.974683544303797,
356
- "eval_accuracy": 0.861904761904762,
357
- "eval_loss": 0.4818320572376251,
358
- "eval_runtime": 10.6974,
359
- "eval_samples_per_second": 117.785,
360
- "eval_steps_per_second": 0.935,
361
  "step": 434
362
  },
363
  {
364
- "epoch": 22.9873417721519,
365
- "grad_norm": 1.2878130674362183,
366
- "learning_rate": 4.228070175438597e-05,
367
- "loss": 0.3219,
368
- "step": 454
369
- },
370
- {
371
- "epoch": 22.9873417721519,
372
- "eval_accuracy": 0.8650793650793651,
373
- "eval_loss": 0.47401970624923706,
374
- "eval_runtime": 10.7479,
375
- "eval_samples_per_second": 117.232,
376
- "eval_steps_per_second": 0.93,
377
- "step": 454
378
- },
379
- {
380
- "epoch": 24.0,
381
- "grad_norm": 1.5816872119903564,
382
- "learning_rate": 4.169590643274854e-05,
383
- "loss": 0.2966,
384
- "step": 474
385
- },
386
- {
387
- "epoch": 24.0,
388
- "eval_accuracy": 0.8642857142857143,
389
- "eval_loss": 0.4857538044452667,
390
- "eval_runtime": 10.884,
391
- "eval_samples_per_second": 115.766,
392
- "eval_steps_per_second": 0.919,
393
  "step": 474
394
  },
395
  {
396
- "epoch": 24.962025316455698,
397
- "grad_norm": 1.4161866903305054,
398
- "learning_rate": 4.1140350877192985e-05,
399
- "loss": 0.322,
400
- "step": 493
401
- },
402
- {
403
- "epoch": 24.962025316455698,
404
  "eval_accuracy": 0.8579365079365079,
405
- "eval_loss": 0.4993390738964081,
406
- "eval_runtime": 10.7563,
407
- "eval_samples_per_second": 117.141,
408
- "eval_steps_per_second": 0.93,
409
- "step": 493
410
  },
411
  {
412
- "epoch": 25.974683544303797,
413
- "grad_norm": 8.147224426269531,
414
- "learning_rate": 4.055555555555556e-05,
415
- "loss": 0.2806,
416
  "step": 513
417
  },
418
  {
419
- "epoch": 25.974683544303797,
420
- "eval_accuracy": 0.8650793650793651,
421
- "eval_loss": 0.4862901568412781,
422
- "eval_runtime": 10.7246,
423
- "eval_samples_per_second": 117.487,
424
- "eval_steps_per_second": 0.932,
425
  "step": 513
426
  },
427
  {
428
- "epoch": 26.9873417721519,
429
- "grad_norm": 1.3954640626907349,
430
- "learning_rate": 3.9970760233918134e-05,
431
- "loss": 0.2696,
432
- "step": 533
433
- },
434
- {
435
- "epoch": 26.9873417721519,
436
- "eval_accuracy": 0.8595238095238096,
437
- "eval_loss": 0.5064178705215454,
438
- "eval_runtime": 10.7667,
439
- "eval_samples_per_second": 117.027,
440
- "eval_steps_per_second": 0.929,
441
- "step": 533
442
- },
443
- {
444
- "epoch": 28.0,
445
- "grad_norm": 1.5532612800598145,
446
- "learning_rate": 3.93859649122807e-05,
447
- "loss": 0.2709,
448
  "step": 553
449
  },
450
  {
451
- "epoch": 28.0,
452
- "eval_accuracy": 0.8674603174603175,
453
- "eval_loss": 0.4656953811645508,
454
- "eval_runtime": 10.8334,
455
- "eval_samples_per_second": 116.307,
456
- "eval_steps_per_second": 0.923,
457
  "step": 553
458
  },
459
  {
460
- "epoch": 28.962025316455698,
461
- "grad_norm": 1.5748584270477295,
462
- "learning_rate": 3.883040935672515e-05,
463
- "loss": 0.2702,
464
- "step": 572
465
- },
466
- {
467
- "epoch": 28.962025316455698,
468
- "eval_accuracy": 0.8571428571428571,
469
- "eval_loss": 0.4933919310569763,
470
- "eval_runtime": 10.7751,
471
- "eval_samples_per_second": 116.936,
472
- "eval_steps_per_second": 0.928,
473
- "step": 572
474
- },
475
- {
476
- "epoch": 29.974683544303797,
477
- "grad_norm": 1.3018436431884766,
478
- "learning_rate": 3.824561403508773e-05,
479
- "loss": 0.2628,
480
  "step": 592
481
  },
482
  {
483
- "epoch": 29.974683544303797,
484
- "eval_accuracy": 0.8555555555555555,
485
- "eval_loss": 0.4940374493598938,
486
- "eval_runtime": 10.7573,
487
- "eval_samples_per_second": 117.13,
488
- "eval_steps_per_second": 0.93,
489
  "step": 592
490
  },
491
  {
492
- "epoch": 30.9873417721519,
493
- "grad_norm": 1.811011791229248,
494
- "learning_rate": 3.7660818713450294e-05,
495
- "loss": 0.2543,
496
- "step": 612
497
- },
498
- {
499
- "epoch": 30.9873417721519,
500
- "eval_accuracy": 0.8642857142857143,
501
- "eval_loss": 0.48308396339416504,
502
- "eval_runtime": 10.8262,
503
- "eval_samples_per_second": 116.384,
504
- "eval_steps_per_second": 0.924,
505
- "step": 612
506
- },
507
- {
508
- "epoch": 32.0,
509
- "grad_norm": 1.4332562685012817,
510
- "learning_rate": 3.707602339181287e-05,
511
- "loss": 0.2427,
512
  "step": 632
513
  },
514
  {
515
- "epoch": 32.0,
516
- "eval_accuracy": 0.861904761904762,
517
- "eval_loss": 0.4981466233730316,
518
- "eval_runtime": 10.8291,
519
- "eval_samples_per_second": 116.353,
520
- "eval_steps_per_second": 0.923,
521
  "step": 632
522
  },
523
  {
524
- "epoch": 32.962025316455694,
525
- "grad_norm": 1.5480865240097046,
526
- "learning_rate": 3.652046783625731e-05,
527
- "loss": 0.2659,
528
- "step": 651
529
- },
530
- {
531
- "epoch": 32.962025316455694,
532
- "eval_accuracy": 0.8642857142857143,
533
- "eval_loss": 0.5094291567802429,
534
- "eval_runtime": 10.7442,
535
- "eval_samples_per_second": 117.272,
536
- "eval_steps_per_second": 0.931,
537
- "step": 651
538
- },
539
- {
540
- "epoch": 33.9746835443038,
541
- "grad_norm": 1.3587052822113037,
542
- "learning_rate": 3.593567251461988e-05,
543
- "loss": 0.2398,
544
  "step": 671
545
  },
546
  {
547
- "epoch": 33.9746835443038,
548
- "eval_accuracy": 0.8658730158730159,
549
- "eval_loss": 0.5013704895973206,
550
- "eval_runtime": 10.759,
551
- "eval_samples_per_second": 117.112,
552
- "eval_steps_per_second": 0.929,
553
  "step": 671
554
  },
555
  {
556
- "epoch": 34.9873417721519,
557
- "grad_norm": 1.3286776542663574,
558
- "learning_rate": 3.5350877192982455e-05,
559
- "loss": 0.227,
560
- "step": 691
561
- },
562
- {
563
- "epoch": 34.9873417721519,
564
- "eval_accuracy": 0.8634920634920635,
565
- "eval_loss": 0.5037477612495422,
566
- "eval_runtime": 10.7589,
567
- "eval_samples_per_second": 117.112,
568
- "eval_steps_per_second": 0.929,
569
- "step": 691
570
- },
571
- {
572
- "epoch": 36.0,
573
- "grad_norm": 1.6547776460647583,
574
- "learning_rate": 3.476608187134503e-05,
575
- "loss": 0.2308,
576
  "step": 711
577
  },
578
  {
579
- "epoch": 36.0,
580
- "eval_accuracy": 0.8658730158730159,
581
- "eval_loss": 0.5068167448043823,
582
- "eval_runtime": 10.7754,
583
- "eval_samples_per_second": 116.933,
584
- "eval_steps_per_second": 0.928,
585
  "step": 711
586
  },
587
  {
588
- "epoch": 36.962025316455694,
589
- "grad_norm": 1.3685321807861328,
590
- "learning_rate": 3.421052631578947e-05,
591
- "loss": 0.2326,
592
- "step": 730
593
- },
594
- {
595
- "epoch": 36.962025316455694,
596
- "eval_accuracy": 0.8650793650793651,
597
- "eval_loss": 0.4980192184448242,
598
- "eval_runtime": 10.761,
599
- "eval_samples_per_second": 117.09,
600
- "eval_steps_per_second": 0.929,
601
- "step": 730
602
- },
603
- {
604
- "epoch": 37.9746835443038,
605
- "grad_norm": 1.2418972253799438,
606
- "learning_rate": 3.362573099415205e-05,
607
- "loss": 0.2242,
608
  "step": 750
609
  },
610
  {
611
- "epoch": 37.9746835443038,
612
- "eval_accuracy": 0.8587301587301587,
613
- "eval_loss": 0.4938106834888458,
614
- "eval_runtime": 11.0548,
615
- "eval_samples_per_second": 113.978,
616
- "eval_steps_per_second": 0.905,
617
  "step": 750
618
  },
619
  {
620
- "epoch": 38.9873417721519,
621
- "grad_norm": 1.3450112342834473,
622
- "learning_rate": 3.304093567251462e-05,
623
- "loss": 0.2152,
624
- "step": 770
625
- },
626
- {
627
- "epoch": 38.9873417721519,
628
- "eval_accuracy": 0.8626984126984127,
629
- "eval_loss": 0.49911221861839294,
630
- "eval_runtime": 10.8459,
631
- "eval_samples_per_second": 116.173,
632
- "eval_steps_per_second": 0.922,
633
- "step": 770
634
- },
635
- {
636
- "epoch": 40.0,
637
- "grad_norm": 1.3505226373672485,
638
- "learning_rate": 3.24561403508772e-05,
639
- "loss": 0.2205,
640
  "step": 790
641
  },
642
  {
643
- "epoch": 40.0,
644
- "eval_accuracy": 0.8571428571428571,
645
- "eval_loss": 0.5294431447982788,
646
- "eval_runtime": 11.024,
647
- "eval_samples_per_second": 114.296,
648
- "eval_steps_per_second": 0.907,
649
  "step": 790
650
  },
651
  {
652
- "epoch": 40.962025316455694,
653
- "grad_norm": 1.1549227237701416,
654
- "learning_rate": 3.1900584795321634e-05,
655
- "loss": 0.2299,
656
- "step": 809
657
- },
658
- {
659
- "epoch": 40.962025316455694,
660
- "eval_accuracy": 0.8650793650793651,
661
- "eval_loss": 0.5079935789108276,
662
- "eval_runtime": 10.8051,
663
- "eval_samples_per_second": 116.612,
664
- "eval_steps_per_second": 0.925,
665
- "step": 809
666
- },
667
- {
668
- "epoch": 41.9746835443038,
669
- "grad_norm": 1.252756118774414,
670
- "learning_rate": 3.131578947368421e-05,
671
- "loss": 0.1978,
672
  "step": 829
673
  },
674
  {
675
- "epoch": 41.9746835443038,
676
- "eval_accuracy": 0.861904761904762,
677
- "eval_loss": 0.5043396949768066,
678
- "eval_runtime": 11.0945,
679
- "eval_samples_per_second": 113.57,
680
- "eval_steps_per_second": 0.901,
681
  "step": 829
682
  },
683
  {
684
- "epoch": 42.9873417721519,
685
- "grad_norm": 1.20892333984375,
686
- "learning_rate": 3.073099415204678e-05,
687
- "loss": 0.2081,
688
- "step": 849
689
- },
690
- {
691
- "epoch": 42.9873417721519,
692
- "eval_accuracy": 0.8634920634920635,
693
- "eval_loss": 0.5008840560913086,
694
- "eval_runtime": 10.8249,
695
- "eval_samples_per_second": 116.398,
696
- "eval_steps_per_second": 0.924,
697
- "step": 849
698
- },
699
- {
700
- "epoch": 44.0,
701
- "grad_norm": 0.9471483826637268,
702
- "learning_rate": 3.0146198830409357e-05,
703
- "loss": 0.1893,
704
  "step": 869
705
  },
706
  {
707
- "epoch": 44.0,
708
- "eval_accuracy": 0.8571428571428571,
709
- "eval_loss": 0.5212369561195374,
710
- "eval_runtime": 10.7573,
711
- "eval_samples_per_second": 117.13,
712
- "eval_steps_per_second": 0.93,
713
  "step": 869
714
  },
715
  {
716
- "epoch": 44.962025316455694,
717
- "grad_norm": 1.4700783491134644,
718
- "learning_rate": 2.95906432748538e-05,
719
- "loss": 0.1988,
720
- "step": 888
721
- },
722
- {
723
- "epoch": 44.962025316455694,
724
- "eval_accuracy": 0.8626984126984127,
725
- "eval_loss": 0.4991550147533417,
726
- "eval_runtime": 10.8133,
727
- "eval_samples_per_second": 116.523,
728
- "eval_steps_per_second": 0.925,
729
- "step": 888
730
- },
731
- {
732
- "epoch": 45.9746835443038,
733
- "grad_norm": 1.0916502475738525,
734
- "learning_rate": 2.9005847953216375e-05,
735
- "loss": 0.1911,
736
  "step": 908
737
  },
738
  {
739
- "epoch": 45.9746835443038,
740
- "eval_accuracy": 0.8674603174603175,
741
- "eval_loss": 0.5237799882888794,
742
- "eval_runtime": 10.7538,
743
- "eval_samples_per_second": 117.168,
744
- "eval_steps_per_second": 0.93,
745
  "step": 908
746
  },
747
  {
748
- "epoch": 46.9873417721519,
749
- "grad_norm": 1.2590000629425049,
750
- "learning_rate": 2.842105263157895e-05,
751
- "loss": 0.1877,
752
- "step": 928
753
- },
754
- {
755
- "epoch": 46.9873417721519,
756
- "eval_accuracy": 0.8674603174603175,
757
- "eval_loss": 0.5184463262557983,
758
- "eval_runtime": 10.8005,
759
- "eval_samples_per_second": 116.662,
760
- "eval_steps_per_second": 0.926,
761
- "step": 928
762
- },
763
- {
764
- "epoch": 48.0,
765
- "grad_norm": 1.417845606803894,
766
- "learning_rate": 2.783625730994152e-05,
767
- "loss": 0.1957,
768
  "step": 948
769
  },
770
  {
771
- "epoch": 48.0,
772
- "eval_accuracy": 0.8571428571428571,
773
- "eval_loss": 0.5155279040336609,
774
- "eval_runtime": 10.7518,
775
- "eval_samples_per_second": 117.19,
776
- "eval_steps_per_second": 0.93,
777
  "step": 948
778
  },
779
  {
780
- "epoch": 48.962025316455694,
781
- "grad_norm": 1.304579496383667,
782
- "learning_rate": 2.7280701754385968e-05,
783
- "loss": 0.185,
784
- "step": 967
785
- },
786
- {
787
- "epoch": 48.962025316455694,
788
- "eval_accuracy": 0.8674603174603175,
789
- "eval_loss": 0.5028470158576965,
790
- "eval_runtime": 10.7385,
791
- "eval_samples_per_second": 117.335,
792
- "eval_steps_per_second": 0.931,
793
- "step": 967
794
- },
795
- {
796
- "epoch": 49.9746835443038,
797
- "grad_norm": 1.1883121728897095,
798
- "learning_rate": 2.669590643274854e-05,
799
- "loss": 0.1821,
800
  "step": 987
801
  },
802
  {
803
- "epoch": 49.9746835443038,
804
- "eval_accuracy": 0.8626984126984127,
805
- "eval_loss": 0.5118417739868164,
806
- "eval_runtime": 10.7974,
807
- "eval_samples_per_second": 116.694,
808
- "eval_steps_per_second": 0.926,
809
  "step": 987
810
  },
811
  {
812
- "epoch": 50.9873417721519,
813
- "grad_norm": 0.9844208359718323,
814
- "learning_rate": 2.6111111111111114e-05,
815
- "loss": 0.1843,
816
- "step": 1007
817
- },
818
- {
819
- "epoch": 50.9873417721519,
820
- "eval_accuracy": 0.8650793650793651,
821
- "eval_loss": 0.5006617903709412,
822
- "eval_runtime": 10.7816,
823
- "eval_samples_per_second": 116.866,
824
- "eval_steps_per_second": 0.928,
825
- "step": 1007
826
- },
827
- {
828
- "epoch": 52.0,
829
- "grad_norm": 1.392893671989441,
830
- "learning_rate": 2.5526315789473688e-05,
831
- "loss": 0.1711,
832
  "step": 1027
833
  },
834
  {
835
- "epoch": 52.0,
836
- "eval_accuracy": 0.8571428571428571,
837
- "eval_loss": 0.511702299118042,
838
- "eval_runtime": 10.8792,
839
- "eval_samples_per_second": 115.817,
840
- "eval_steps_per_second": 0.919,
841
  "step": 1027
842
  },
843
  {
844
- "epoch": 52.962025316455694,
845
- "grad_norm": 1.6035434007644653,
846
- "learning_rate": 2.4970760233918132e-05,
847
- "loss": 0.1903,
848
- "step": 1046
849
- },
850
- {
851
- "epoch": 52.962025316455694,
852
- "eval_accuracy": 0.8587301587301587,
853
- "eval_loss": 0.507360577583313,
854
- "eval_runtime": 10.9028,
855
- "eval_samples_per_second": 115.567,
856
- "eval_steps_per_second": 0.917,
857
- "step": 1046
858
- },
859
- {
860
- "epoch": 53.9746835443038,
861
- "grad_norm": 1.0503844022750854,
862
- "learning_rate": 2.4385964912280703e-05,
863
- "loss": 0.1713,
864
  "step": 1066
865
  },
866
  {
867
- "epoch": 53.9746835443038,
868
- "eval_accuracy": 0.8658730158730159,
869
- "eval_loss": 0.5167327523231506,
870
- "eval_runtime": 10.8055,
871
- "eval_samples_per_second": 116.608,
872
- "eval_steps_per_second": 0.925,
873
  "step": 1066
874
  },
875
  {
876
- "epoch": 54.9873417721519,
877
- "grad_norm": 1.0421777963638306,
878
- "learning_rate": 2.3801169590643278e-05,
879
- "loss": 0.1677,
880
- "step": 1086
881
- },
882
- {
883
- "epoch": 54.9873417721519,
884
- "eval_accuracy": 0.8666666666666667,
885
- "eval_loss": 0.5178954601287842,
886
- "eval_runtime": 10.802,
887
- "eval_samples_per_second": 116.645,
888
- "eval_steps_per_second": 0.926,
889
- "step": 1086
890
- },
891
- {
892
- "epoch": 56.0,
893
- "grad_norm": 1.283031940460205,
894
- "learning_rate": 2.321637426900585e-05,
895
- "loss": 0.16,
896
  "step": 1106
897
  },
898
  {
899
- "epoch": 56.0,
900
- "eval_accuracy": 0.8650793650793651,
901
- "eval_loss": 0.5145161747932434,
902
- "eval_runtime": 10.7346,
903
- "eval_samples_per_second": 117.377,
904
- "eval_steps_per_second": 0.932,
905
  "step": 1106
906
  },
907
  {
908
- "epoch": 56.962025316455694,
909
- "grad_norm": 2.3518636226654053,
910
- "learning_rate": 2.2660818713450292e-05,
911
- "loss": 0.1818,
912
- "step": 1125
913
- },
914
- {
915
- "epoch": 56.962025316455694,
916
- "eval_accuracy": 0.8650793650793651,
917
- "eval_loss": 0.5295758247375488,
918
- "eval_runtime": 10.7992,
919
- "eval_samples_per_second": 116.676,
920
- "eval_steps_per_second": 0.926,
921
- "step": 1125
922
- },
923
- {
924
- "epoch": 57.9746835443038,
925
- "grad_norm": 1.4039283990859985,
926
- "learning_rate": 2.2076023391812867e-05,
927
- "loss": 0.1725,
928
- "step": 1145
929
- },
930
- {
931
- "epoch": 57.9746835443038,
932
- "eval_accuracy": 0.8642857142857143,
933
- "eval_loss": 0.531140923500061,
934
- "eval_runtime": 10.7624,
935
- "eval_samples_per_second": 117.075,
936
- "eval_steps_per_second": 0.929,
937
  "step": 1145
938
  },
939
  {
940
- "epoch": 58.9873417721519,
941
- "grad_norm": 1.1663118600845337,
942
- "learning_rate": 2.149122807017544e-05,
943
- "loss": 0.1642,
944
- "step": 1165
945
- },
946
- {
947
- "epoch": 58.9873417721519,
948
- "eval_accuracy": 0.8626984126984127,
949
- "eval_loss": 0.5316585302352905,
950
- "eval_runtime": 10.7932,
951
- "eval_samples_per_second": 116.74,
952
- "eval_steps_per_second": 0.927,
953
- "step": 1165
954
- },
955
- {
956
- "epoch": 60.0,
957
- "grad_norm": 1.2489556074142456,
958
- "learning_rate": 2.0906432748538013e-05,
959
- "loss": 0.1626,
960
- "step": 1185
961
- },
962
- {
963
- "epoch": 60.0,
964
- "eval_accuracy": 0.861904761904762,
965
- "eval_loss": 0.5282865166664124,
966
- "eval_runtime": 10.7244,
967
- "eval_samples_per_second": 117.49,
968
- "eval_steps_per_second": 0.932,
969
- "step": 1185
970
- },
971
- {
972
- "epoch": 60.962025316455694,
973
- "grad_norm": 1.4613455533981323,
974
- "learning_rate": 2.0350877192982456e-05,
975
- "loss": 0.1621,
976
- "step": 1204
977
- },
978
- {
979
- "epoch": 60.962025316455694,
980
- "eval_accuracy": 0.8603174603174604,
981
- "eval_loss": 0.5266717076301575,
982
- "eval_runtime": 10.8467,
983
- "eval_samples_per_second": 116.164,
984
- "eval_steps_per_second": 0.922,
985
- "step": 1204
986
- },
987
- {
988
- "epoch": 61.9746835443038,
989
- "grad_norm": 1.4865529537200928,
990
- "learning_rate": 1.976608187134503e-05,
991
- "loss": 0.1503,
992
- "step": 1224
993
- },
994
- {
995
- "epoch": 61.9746835443038,
996
- "eval_accuracy": 0.861904761904762,
997
- "eval_loss": 0.5389307141304016,
998
- "eval_runtime": 10.7859,
999
- "eval_samples_per_second": 116.819,
1000
- "eval_steps_per_second": 0.927,
1001
- "step": 1224
1002
- },
1003
- {
1004
- "epoch": 62.9873417721519,
1005
- "grad_norm": 1.0126421451568604,
1006
- "learning_rate": 1.9181286549707602e-05,
1007
- "loss": 0.162,
1008
- "step": 1244
1009
- },
1010
- {
1011
- "epoch": 62.9873417721519,
1012
- "eval_accuracy": 0.8626984126984127,
1013
- "eval_loss": 0.540341317653656,
1014
- "eval_runtime": 10.7503,
1015
- "eval_samples_per_second": 117.206,
1016
- "eval_steps_per_second": 0.93,
1017
- "step": 1244
1018
- },
1019
- {
1020
- "epoch": 64.0,
1021
- "grad_norm": 1.3681743144989014,
1022
- "learning_rate": 1.8596491228070176e-05,
1023
- "loss": 0.154,
1024
- "step": 1264
1025
- },
1026
- {
1027
- "epoch": 64.0,
1028
- "eval_accuracy": 0.8650793650793651,
1029
- "eval_loss": 0.5240360498428345,
1030
- "eval_runtime": 10.6902,
1031
- "eval_samples_per_second": 117.865,
1032
- "eval_steps_per_second": 0.935,
1033
- "step": 1264
1034
- },
1035
- {
1036
- "epoch": 64.9620253164557,
1037
- "grad_norm": 1.1314650774002075,
1038
- "learning_rate": 1.804093567251462e-05,
1039
- "loss": 0.1525,
1040
- "step": 1283
1041
- },
1042
- {
1043
- "epoch": 64.9620253164557,
1044
- "eval_accuracy": 0.8650793650793651,
1045
- "eval_loss": 0.5337327718734741,
1046
- "eval_runtime": 10.5876,
1047
- "eval_samples_per_second": 119.007,
1048
- "eval_steps_per_second": 0.945,
1049
- "step": 1283
1050
- },
1051
- {
1052
- "epoch": 65.9746835443038,
1053
- "grad_norm": 1.1210103034973145,
1054
- "learning_rate": 1.745614035087719e-05,
1055
- "loss": 0.1529,
1056
- "step": 1303
1057
- },
1058
- {
1059
- "epoch": 65.9746835443038,
1060
- "eval_accuracy": 0.8642857142857143,
1061
- "eval_loss": 0.5457757115364075,
1062
- "eval_runtime": 10.8103,
1063
- "eval_samples_per_second": 116.555,
1064
- "eval_steps_per_second": 0.925,
1065
- "step": 1303
1066
- },
1067
- {
1068
- "epoch": 66.9873417721519,
1069
- "grad_norm": 1.2637122869491577,
1070
- "learning_rate": 1.6871345029239766e-05,
1071
- "loss": 0.1548,
1072
- "step": 1323
1073
- },
1074
- {
1075
- "epoch": 66.9873417721519,
1076
- "eval_accuracy": 0.8650793650793651,
1077
- "eval_loss": 0.5383771061897278,
1078
- "eval_runtime": 10.877,
1079
- "eval_samples_per_second": 115.84,
1080
- "eval_steps_per_second": 0.919,
1081
- "step": 1323
1082
- },
1083
- {
1084
- "epoch": 68.0,
1085
- "grad_norm": 1.1812046766281128,
1086
- "learning_rate": 1.628654970760234e-05,
1087
- "loss": 0.1556,
1088
- "step": 1343
1089
- },
1090
- {
1091
- "epoch": 68.0,
1092
- "eval_accuracy": 0.8626984126984127,
1093
- "eval_loss": 0.5395429134368896,
1094
- "eval_runtime": 10.829,
1095
- "eval_samples_per_second": 116.354,
1096
- "eval_steps_per_second": 0.923,
1097
- "step": 1343
1098
- },
1099
- {
1100
- "epoch": 68.9620253164557,
1101
- "grad_norm": 1.21077299118042,
1102
- "learning_rate": 1.5730994152046784e-05,
1103
- "loss": 0.1629,
1104
- "step": 1362
1105
- },
1106
- {
1107
- "epoch": 68.9620253164557,
1108
- "eval_accuracy": 0.8634920634920635,
1109
- "eval_loss": 0.5454122424125671,
1110
- "eval_runtime": 10.6544,
1111
- "eval_samples_per_second": 118.261,
1112
- "eval_steps_per_second": 0.939,
1113
- "step": 1362
1114
- },
1115
- {
1116
- "epoch": 69.9746835443038,
1117
- "grad_norm": 1.3317054510116577,
1118
- "learning_rate": 1.5146198830409358e-05,
1119
- "loss": 0.1387,
1120
- "step": 1382
1121
- },
1122
- {
1123
- "epoch": 69.9746835443038,
1124
- "eval_accuracy": 0.8746031746031746,
1125
- "eval_loss": 0.5267017483711243,
1126
- "eval_runtime": 10.8484,
1127
- "eval_samples_per_second": 116.147,
1128
- "eval_steps_per_second": 0.922,
1129
- "step": 1382
1130
- },
1131
- {
1132
- "epoch": 70.9873417721519,
1133
- "grad_norm": 1.2003090381622314,
1134
- "learning_rate": 1.4561403508771931e-05,
1135
- "loss": 0.1495,
1136
- "step": 1402
1137
- },
1138
- {
1139
- "epoch": 70.9873417721519,
1140
- "eval_accuracy": 0.8650793650793651,
1141
- "eval_loss": 0.5427414774894714,
1142
- "eval_runtime": 10.7136,
1143
- "eval_samples_per_second": 117.608,
1144
- "eval_steps_per_second": 0.933,
1145
- "step": 1402
1146
- },
1147
- {
1148
- "epoch": 72.0,
1149
- "grad_norm": 1.2609037160873413,
1150
- "learning_rate": 1.3976608187134504e-05,
1151
- "loss": 0.1465,
1152
- "step": 1422
1153
- },
1154
- {
1155
- "epoch": 72.0,
1156
- "eval_accuracy": 0.8690476190476191,
1157
- "eval_loss": 0.559054434299469,
1158
- "eval_runtime": 10.7661,
1159
- "eval_samples_per_second": 117.034,
1160
- "eval_steps_per_second": 0.929,
1161
- "step": 1422
1162
- },
1163
- {
1164
- "epoch": 72.9620253164557,
1165
- "grad_norm": 1.456437587738037,
1166
- "learning_rate": 1.3421052631578948e-05,
1167
- "loss": 0.1478,
1168
- "step": 1441
1169
- },
1170
- {
1171
- "epoch": 72.9620253164557,
1172
- "eval_accuracy": 0.8722222222222222,
1173
- "eval_loss": 0.532349705696106,
1174
- "eval_runtime": 10.7627,
1175
- "eval_samples_per_second": 117.071,
1176
- "eval_steps_per_second": 0.929,
1177
- "step": 1441
1178
- },
1179
- {
1180
- "epoch": 73.9746835443038,
1181
- "grad_norm": 1.404703140258789,
1182
- "learning_rate": 1.283625730994152e-05,
1183
- "loss": 0.1447,
1184
- "step": 1461
1185
- },
1186
- {
1187
- "epoch": 73.9746835443038,
1188
- "eval_accuracy": 0.8690476190476191,
1189
- "eval_loss": 0.5560940504074097,
1190
- "eval_runtime": 10.7443,
1191
- "eval_samples_per_second": 117.271,
1192
- "eval_steps_per_second": 0.931,
1193
- "step": 1461
1194
- },
1195
- {
1196
- "epoch": 74.9873417721519,
1197
- "grad_norm": 1.3342186212539673,
1198
- "learning_rate": 1.2251461988304095e-05,
1199
- "loss": 0.1435,
1200
- "step": 1481
1201
- },
1202
- {
1203
- "epoch": 74.9873417721519,
1204
- "eval_accuracy": 0.8658730158730159,
1205
- "eval_loss": 0.5346001982688904,
1206
- "eval_runtime": 10.8529,
1207
- "eval_samples_per_second": 116.098,
1208
- "eval_steps_per_second": 0.921,
1209
- "step": 1481
1210
- },
1211
- {
1212
- "epoch": 76.0,
1213
- "grad_norm": 1.2481029033660889,
1214
- "learning_rate": 1.1666666666666668e-05,
1215
- "loss": 0.1459,
1216
- "step": 1501
1217
- },
1218
- {
1219
- "epoch": 76.0,
1220
- "eval_accuracy": 0.8658730158730159,
1221
- "eval_loss": 0.5466868281364441,
1222
- "eval_runtime": 11.0795,
1223
- "eval_samples_per_second": 113.723,
1224
- "eval_steps_per_second": 0.903,
1225
- "step": 1501
1226
- },
1227
- {
1228
- "epoch": 76.9620253164557,
1229
- "grad_norm": 1.2153362035751343,
1230
- "learning_rate": 1.1111111111111112e-05,
1231
- "loss": 0.1474,
1232
- "step": 1520
1233
- },
1234
- {
1235
- "epoch": 76.9620253164557,
1236
- "eval_accuracy": 0.8690476190476191,
1237
- "eval_loss": 0.5463184118270874,
1238
- "eval_runtime": 10.7937,
1239
- "eval_samples_per_second": 116.735,
1240
- "eval_steps_per_second": 0.926,
1241
- "step": 1520
1242
- },
1243
- {
1244
- "epoch": 77.9746835443038,
1245
- "grad_norm": 1.2351834774017334,
1246
- "learning_rate": 1.0526315789473684e-05,
1247
- "loss": 0.1352,
1248
- "step": 1540
1249
- },
1250
- {
1251
- "epoch": 77.9746835443038,
1252
- "eval_accuracy": 0.8650793650793651,
1253
- "eval_loss": 0.5412562489509583,
1254
- "eval_runtime": 11.1033,
1255
- "eval_samples_per_second": 113.48,
1256
- "eval_steps_per_second": 0.901,
1257
- "step": 1540
1258
- },
1259
- {
1260
- "epoch": 78.9873417721519,
1261
- "grad_norm": 1.3961732387542725,
1262
- "learning_rate": 9.941520467836257e-06,
1263
- "loss": 0.1337,
1264
- "step": 1560
1265
- },
1266
- {
1267
- "epoch": 78.9873417721519,
1268
- "eval_accuracy": 0.8650793650793651,
1269
- "eval_loss": 0.5488775372505188,
1270
- "eval_runtime": 10.7671,
1271
- "eval_samples_per_second": 117.023,
1272
- "eval_steps_per_second": 0.929,
1273
- "step": 1560
1274
- },
1275
- {
1276
- "epoch": 80.0,
1277
- "grad_norm": 1.8050953149795532,
1278
- "learning_rate": 9.35672514619883e-06,
1279
- "loss": 0.1374,
1280
- "step": 1580
1281
- },
1282
- {
1283
- "epoch": 80.0,
1284
- "eval_accuracy": 0.8587301587301587,
1285
- "eval_loss": 0.5454345941543579,
1286
- "eval_runtime": 10.8299,
1287
- "eval_samples_per_second": 116.345,
1288
- "eval_steps_per_second": 0.923,
1289
- "step": 1580
1290
- },
1291
- {
1292
- "epoch": 80.9620253164557,
1293
- "grad_norm": 1.2362314462661743,
1294
- "learning_rate": 8.801169590643275e-06,
1295
- "loss": 0.1383,
1296
- "step": 1599
1297
- },
1298
- {
1299
- "epoch": 80.9620253164557,
1300
- "eval_accuracy": 0.8626984126984127,
1301
- "eval_loss": 0.5432500243186951,
1302
- "eval_runtime": 10.8767,
1303
- "eval_samples_per_second": 115.844,
1304
- "eval_steps_per_second": 0.919,
1305
- "step": 1599
1306
- },
1307
- {
1308
- "epoch": 81.9746835443038,
1309
- "grad_norm": 1.1372051239013672,
1310
- "learning_rate": 8.216374269005848e-06,
1311
- "loss": 0.1408,
1312
- "step": 1619
1313
- },
1314
- {
1315
- "epoch": 81.9746835443038,
1316
  "eval_accuracy": 0.8682539682539683,
1317
- "eval_loss": 0.5383033156394958,
1318
- "eval_runtime": 10.9264,
1319
- "eval_samples_per_second": 115.317,
1320
- "eval_steps_per_second": 0.915,
1321
- "step": 1619
1322
- },
1323
- {
1324
- "epoch": 82.9873417721519,
1325
- "grad_norm": 1.6927990913391113,
1326
- "learning_rate": 7.631578947368421e-06,
1327
- "loss": 0.134,
1328
- "step": 1639
1329
- },
1330
- {
1331
- "epoch": 82.9873417721519,
1332
- "eval_accuracy": 0.8642857142857143,
1333
- "eval_loss": 0.5522441267967224,
1334
- "eval_runtime": 10.7932,
1335
- "eval_samples_per_second": 116.74,
1336
- "eval_steps_per_second": 0.927,
1337
- "step": 1639
1338
- },
1339
- {
1340
- "epoch": 84.0,
1341
- "grad_norm": 1.1734745502471924,
1342
- "learning_rate": 7.046783625730995e-06,
1343
- "loss": 0.1353,
1344
- "step": 1659
1345
- },
1346
- {
1347
- "epoch": 84.0,
1348
- "eval_accuracy": 0.8579365079365079,
1349
- "eval_loss": 0.5485585331916809,
1350
- "eval_runtime": 10.8452,
1351
- "eval_samples_per_second": 116.181,
1352
- "eval_steps_per_second": 0.922,
1353
- "step": 1659
1354
- },
1355
- {
1356
- "epoch": 84.9620253164557,
1357
- "grad_norm": 1.3662621974945068,
1358
- "learning_rate": 6.4912280701754385e-06,
1359
- "loss": 0.1435,
1360
- "step": 1678
1361
- },
1362
- {
1363
- "epoch": 84.9620253164557,
1364
- "eval_accuracy": 0.8595238095238096,
1365
- "eval_loss": 0.5582545399665833,
1366
- "eval_runtime": 10.7527,
1367
- "eval_samples_per_second": 117.18,
1368
- "eval_steps_per_second": 0.93,
1369
- "step": 1678
1370
- },
1371
- {
1372
- "epoch": 85.9746835443038,
1373
- "grad_norm": 1.3297693729400635,
1374
- "learning_rate": 5.906432748538012e-06,
1375
- "loss": 0.1324,
1376
- "step": 1698
1377
- },
1378
- {
1379
- "epoch": 85.9746835443038,
1380
- "eval_accuracy": 0.861904761904762,
1381
- "eval_loss": 0.5551320910453796,
1382
- "eval_runtime": 10.8253,
1383
- "eval_samples_per_second": 116.394,
1384
- "eval_steps_per_second": 0.924,
1385
- "step": 1698
1386
- },
1387
- {
1388
- "epoch": 86.9873417721519,
1389
- "grad_norm": 1.2304210662841797,
1390
- "learning_rate": 5.321637426900585e-06,
1391
- "loss": 0.1306,
1392
- "step": 1718
1393
- },
1394
- {
1395
- "epoch": 86.9873417721519,
1396
- "eval_accuracy": 0.8611111111111112,
1397
- "eval_loss": 0.553473949432373,
1398
- "eval_runtime": 10.7756,
1399
- "eval_samples_per_second": 116.931,
1400
- "eval_steps_per_second": 0.928,
1401
- "step": 1718
1402
- },
1403
- {
1404
- "epoch": 88.0,
1405
- "grad_norm": 1.323527216911316,
1406
- "learning_rate": 4.736842105263159e-06,
1407
- "loss": 0.1348,
1408
- "step": 1738
1409
- },
1410
- {
1411
- "epoch": 88.0,
1412
- "eval_accuracy": 0.8666666666666667,
1413
- "eval_loss": 0.5498299598693848,
1414
- "eval_runtime": 10.7878,
1415
- "eval_samples_per_second": 116.799,
1416
- "eval_steps_per_second": 0.927,
1417
- "step": 1738
1418
- },
1419
- {
1420
- "epoch": 88.9620253164557,
1421
- "grad_norm": 1.0867611169815063,
1422
- "learning_rate": 4.181286549707602e-06,
1423
- "loss": 0.1334,
1424
- "step": 1757
1425
- },
1426
- {
1427
- "epoch": 88.9620253164557,
1428
- "eval_accuracy": 0.8658730158730159,
1429
- "eval_loss": 0.5582374930381775,
1430
- "eval_runtime": 10.7756,
1431
- "eval_samples_per_second": 116.931,
1432
- "eval_steps_per_second": 0.928,
1433
- "step": 1757
1434
- },
1435
- {
1436
- "epoch": 89.9746835443038,
1437
- "grad_norm": 1.0990999937057495,
1438
- "learning_rate": 3.5964912280701756e-06,
1439
- "loss": 0.1343,
1440
- "step": 1777
1441
- },
1442
- {
1443
- "epoch": 89.9746835443038,
1444
- "eval_accuracy": 0.8658730158730159,
1445
- "eval_loss": 0.5526331067085266,
1446
- "eval_runtime": 10.8124,
1447
- "eval_samples_per_second": 116.533,
1448
- "eval_steps_per_second": 0.925,
1449
- "step": 1777
1450
- },
1451
- {
1452
- "epoch": 90.9873417721519,
1453
- "grad_norm": 1.3471728563308716,
1454
- "learning_rate": 3.011695906432749e-06,
1455
- "loss": 0.1275,
1456
- "step": 1797
1457
- },
1458
- {
1459
- "epoch": 90.9873417721519,
1460
- "eval_accuracy": 0.8650793650793651,
1461
- "eval_loss": 0.5543471574783325,
1462
- "eval_runtime": 10.7534,
1463
- "eval_samples_per_second": 117.172,
1464
- "eval_steps_per_second": 0.93,
1465
- "step": 1797
1466
- },
1467
- {
1468
- "epoch": 92.0,
1469
- "grad_norm": 1.3125709295272827,
1470
- "learning_rate": 2.426900584795322e-06,
1471
- "loss": 0.1285,
1472
- "step": 1817
1473
- },
1474
- {
1475
- "epoch": 92.0,
1476
- "eval_accuracy": 0.8674603174603175,
1477
- "eval_loss": 0.551249086856842,
1478
- "eval_runtime": 10.7174,
1479
- "eval_samples_per_second": 117.566,
1480
- "eval_steps_per_second": 0.933,
1481
- "step": 1817
1482
- },
1483
- {
1484
- "epoch": 92.9620253164557,
1485
- "grad_norm": 1.069954752922058,
1486
- "learning_rate": 1.8713450292397662e-06,
1487
- "loss": 0.127,
1488
- "step": 1836
1489
- },
1490
- {
1491
- "epoch": 92.9620253164557,
1492
- "eval_accuracy": 0.8634920634920635,
1493
- "eval_loss": 0.5507932305335999,
1494
- "eval_runtime": 10.7352,
1495
- "eval_samples_per_second": 117.371,
1496
- "eval_steps_per_second": 0.932,
1497
- "step": 1836
1498
- },
1499
- {
1500
- "epoch": 93.9746835443038,
1501
- "grad_norm": 1.1107105016708374,
1502
- "learning_rate": 1.2865497076023394e-06,
1503
- "loss": 0.1258,
1504
- "step": 1856
1505
- },
1506
- {
1507
- "epoch": 93.9746835443038,
1508
- "eval_accuracy": 0.8642857142857143,
1509
- "eval_loss": 0.5506840944290161,
1510
- "eval_runtime": 10.5904,
1511
- "eval_samples_per_second": 118.975,
1512
- "eval_steps_per_second": 0.944,
1513
- "step": 1856
1514
- },
1515
- {
1516
- "epoch": 94.9873417721519,
1517
- "grad_norm": 1.1726576089859009,
1518
- "learning_rate": 7.017543859649123e-07,
1519
- "loss": 0.1119,
1520
- "step": 1876
1521
- },
1522
- {
1523
- "epoch": 94.9873417721519,
1524
- "eval_accuracy": 0.8666666666666667,
1525
- "eval_loss": 0.5506576299667358,
1526
- "eval_runtime": 10.8474,
1527
- "eval_samples_per_second": 116.157,
1528
- "eval_steps_per_second": 0.922,
1529
- "step": 1876
1530
- },
1531
- {
1532
- "epoch": 96.0,
1533
- "grad_norm": 1.4846915006637573,
1534
- "learning_rate": 1.1695906432748539e-07,
1535
- "loss": 0.1322,
1536
- "step": 1896
1537
- },
1538
- {
1539
- "epoch": 96.0,
1540
- "eval_accuracy": 0.8658730158730159,
1541
- "eval_loss": 0.5504564046859741,
1542
- "eval_runtime": 11.0992,
1543
- "eval_samples_per_second": 113.522,
1544
- "eval_steps_per_second": 0.901,
1545
- "step": 1896
1546
  },
1547
  {
1548
- "epoch": 96.20253164556962,
1549
- "grad_norm": 1.0216985940933228,
1550
  "learning_rate": 0.0,
1551
- "loss": 0.1315,
1552
- "step": 1900
1553
  },
1554
  {
1555
- "epoch": 96.20253164556962,
1556
- "eval_accuracy": 0.8658730158730159,
1557
- "eval_loss": 0.5504307150840759,
1558
- "eval_runtime": 10.802,
1559
- "eval_samples_per_second": 116.645,
1560
- "eval_steps_per_second": 0.926,
1561
- "step": 1900
1562
- },
1563
- {
1564
- "epoch": 96.20253164556962,
1565
- "step": 1900,
1566
- "total_flos": 7.515490775048022e+19,
1567
- "train_loss": 0.33647052476280614,
1568
- "train_runtime": 20573.1873,
1569
- "train_samples_per_second": 48.996,
1570
- "train_steps_per_second": 0.092
1571
  }
1572
  ],
1573
  "logging_steps": 500,
1574
- "max_steps": 1900,
1575
  "num_input_tokens_seen": 0,
1576
- "num_train_epochs": 100,
1577
  "save_steps": 500,
1578
  "stateful_callbacks": {
1579
  "TrainerControl": {
@@ -1587,8 +515,8 @@
1587
  "attributes": {}
1588
  }
1589
  },
1590
- "total_flos": 7.515490775048022e+19,
1591
- "train_batch_size": 128,
1592
  "trial_name": null,
1593
  "trial_params": null
1594
  }
 
1
  {
2
+ "best_metric": 0.8714285714285714,
3
+ "best_model_checkpoint": "CP2_HAR_vit-base-patch16-224/checkpoint-908",
4
+ "epoch": 29.620253164556964,
5
  "eval_steps": 500,
6
+ "global_step": 1170,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.9873417721518988,
13
+ "grad_norm": 2.9084372520446777,
14
+ "learning_rate": 1.6666666666666667e-05,
15
+ "loss": 2.7032,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  "step": 39
17
  },
18
  {
19
+ "epoch": 0.9873417721518988,
20
+ "eval_accuracy": 0.3388888888888889,
21
+ "eval_loss": 2.304168224334717,
22
+ "eval_runtime": 10.2204,
23
+ "eval_samples_per_second": 123.283,
24
+ "eval_steps_per_second": 1.957,
25
  "step": 39
26
  },
27
  {
28
+ "epoch": 2.0,
29
+ "grad_norm": 2.045494318008423,
30
+ "learning_rate": 3.376068376068376e-05,
31
+ "loss": 1.7639,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  "step": 79
33
  },
34
  {
35
+ "epoch": 2.0,
36
+ "eval_accuracy": 0.7515873015873016,
37
+ "eval_loss": 1.0595871210098267,
38
+ "eval_runtime": 10.2792,
39
+ "eval_samples_per_second": 122.577,
40
+ "eval_steps_per_second": 1.946,
41
  "step": 79
42
  },
43
  {
44
+ "epoch": 2.9873417721518987,
45
+ "grad_norm": 1.7503687143325806,
46
+ "learning_rate": 4.995251661918329e-05,
47
+ "loss": 0.974,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  "step": 118
49
  },
50
  {
51
+ "epoch": 2.9873417721518987,
52
+ "eval_accuracy": 0.8134920634920635,
53
+ "eval_loss": 0.6007124781608582,
54
+ "eval_runtime": 10.208,
55
+ "eval_samples_per_second": 123.432,
56
+ "eval_steps_per_second": 1.959,
57
  "step": 118
58
  },
59
  {
60
+ "epoch": 4.0,
61
+ "grad_norm": 2.026381492614746,
62
+ "learning_rate": 4.8053181386514724e-05,
63
+ "loss": 0.7207,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  "step": 158
65
  },
66
  {
67
+ "epoch": 4.0,
68
+ "eval_accuracy": 0.8412698412698413,
69
+ "eval_loss": 0.49882617592811584,
70
+ "eval_runtime": 10.3437,
71
+ "eval_samples_per_second": 121.813,
72
+ "eval_steps_per_second": 1.934,
73
  "step": 158
74
  },
75
  {
76
+ "epoch": 4.987341772151899,
77
+ "grad_norm": 1.8753575086593628,
78
+ "learning_rate": 4.620132953466287e-05,
79
+ "loss": 0.6285,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  "step": 197
81
  },
82
  {
83
+ "epoch": 4.987341772151899,
84
+ "eval_accuracy": 0.8507936507936508,
85
+ "eval_loss": 0.4587480425834656,
86
+ "eval_runtime": 10.2618,
87
+ "eval_samples_per_second": 122.785,
88
+ "eval_steps_per_second": 1.949,
89
  "step": 197
90
  },
91
  {
92
+ "epoch": 6.0,
93
+ "grad_norm": 2.0627048015594482,
94
+ "learning_rate": 4.4301994301994304e-05,
95
+ "loss": 0.562,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  "step": 237
97
  },
98
  {
99
+ "epoch": 6.0,
100
+ "eval_accuracy": 0.8531746031746031,
101
+ "eval_loss": 0.4662785232067108,
102
+ "eval_runtime": 10.2093,
103
+ "eval_samples_per_second": 123.417,
104
+ "eval_steps_per_second": 1.959,
105
  "step": 237
106
  },
107
  {
108
+ "epoch": 6.987341772151899,
109
+ "grad_norm": 1.78373122215271,
110
+ "learning_rate": 4.2450142450142457e-05,
111
+ "loss": 0.5258,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  "step": 276
113
  },
114
  {
115
+ "epoch": 6.987341772151899,
116
+ "eval_accuracy": 0.8468253968253968,
117
+ "eval_loss": 0.45184874534606934,
118
+ "eval_runtime": 10.2263,
119
+ "eval_samples_per_second": 123.211,
120
+ "eval_steps_per_second": 1.956,
121
  "step": 276
122
  },
123
  {
124
+ "epoch": 8.0,
125
+ "grad_norm": 2.4166259765625,
126
+ "learning_rate": 4.0550807217473884e-05,
127
+ "loss": 0.4843,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  "step": 316
129
  },
130
  {
131
+ "epoch": 8.0,
132
+ "eval_accuracy": 0.8603174603174604,
133
+ "eval_loss": 0.4466171860694885,
134
+ "eval_runtime": 10.1905,
135
+ "eval_samples_per_second": 123.645,
136
+ "eval_steps_per_second": 1.963,
137
  "step": 316
138
  },
139
  {
140
+ "epoch": 8.987341772151899,
141
+ "grad_norm": 2.37298321723938,
142
+ "learning_rate": 3.8698955365622036e-05,
143
+ "loss": 0.4491,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  "step": 355
145
  },
146
  {
147
+ "epoch": 8.987341772151899,
148
+ "eval_accuracy": 0.8523809523809524,
149
+ "eval_loss": 0.43793126940727234,
150
+ "eval_runtime": 10.2956,
151
+ "eval_samples_per_second": 122.382,
152
+ "eval_steps_per_second": 1.943,
153
  "step": 355
154
  },
155
  {
156
+ "epoch": 10.0,
157
+ "grad_norm": 2.5339529514312744,
158
+ "learning_rate": 3.679962013295346e-05,
159
+ "loss": 0.4288,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  "step": 395
161
  },
162
  {
163
+ "epoch": 10.0,
164
+ "eval_accuracy": 0.8674603174603175,
165
+ "eval_loss": 0.432355135679245,
166
+ "eval_runtime": 10.268,
167
+ "eval_samples_per_second": 122.712,
168
+ "eval_steps_per_second": 1.948,
169
  "step": 395
170
  },
171
  {
172
+ "epoch": 10.987341772151899,
173
+ "grad_norm": 1.7715898752212524,
174
+ "learning_rate": 3.4947768281101616e-05,
175
+ "loss": 0.4183,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  "step": 434
177
  },
178
  {
179
+ "epoch": 10.987341772151899,
180
+ "eval_accuracy": 0.8642857142857143,
181
+ "eval_loss": 0.44705930352211,
182
+ "eval_runtime": 10.2591,
183
+ "eval_samples_per_second": 122.817,
184
+ "eval_steps_per_second": 1.949,
185
  "step": 434
186
  },
187
  {
188
+ "epoch": 12.0,
189
+ "grad_norm": 2.3940932750701904,
190
+ "learning_rate": 3.304843304843305e-05,
191
+ "loss": 0.3882,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  "step": 474
193
  },
194
  {
195
+ "epoch": 12.0,
 
 
 
 
 
 
 
196
  "eval_accuracy": 0.8579365079365079,
197
+ "eval_loss": 0.43507805466651917,
198
+ "eval_runtime": 10.2613,
199
+ "eval_samples_per_second": 122.792,
200
+ "eval_steps_per_second": 1.949,
201
+ "step": 474
202
  },
203
  {
204
+ "epoch": 12.987341772151899,
205
+ "grad_norm": 2.104583501815796,
206
+ "learning_rate": 3.1196581196581195e-05,
207
+ "loss": 0.3777,
208
  "step": 513
209
  },
210
  {
211
+ "epoch": 12.987341772151899,
212
+ "eval_accuracy": 0.8611111111111112,
213
+ "eval_loss": 0.432034432888031,
214
+ "eval_runtime": 10.2065,
215
+ "eval_samples_per_second": 123.451,
216
+ "eval_steps_per_second": 1.96,
217
  "step": 513
218
  },
219
  {
220
+ "epoch": 14.0,
221
+ "grad_norm": 2.3956658840179443,
222
+ "learning_rate": 2.9297245963912633e-05,
223
+ "loss": 0.3497,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  "step": 553
225
  },
226
  {
227
+ "epoch": 14.0,
228
+ "eval_accuracy": 0.8642857142857143,
229
+ "eval_loss": 0.4432290196418762,
230
+ "eval_runtime": 10.2012,
231
+ "eval_samples_per_second": 123.515,
232
+ "eval_steps_per_second": 1.961,
233
  "step": 553
234
  },
235
  {
236
+ "epoch": 14.987341772151899,
237
+ "grad_norm": 2.0552070140838623,
238
+ "learning_rate": 2.744539411206078e-05,
239
+ "loss": 0.347,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  "step": 592
241
  },
242
  {
243
+ "epoch": 14.987341772151899,
244
+ "eval_accuracy": 0.8690476190476191,
245
+ "eval_loss": 0.4347086548805237,
246
+ "eval_runtime": 10.2609,
247
+ "eval_samples_per_second": 122.797,
248
+ "eval_steps_per_second": 1.949,
249
  "step": 592
250
  },
251
  {
252
+ "epoch": 16.0,
253
+ "grad_norm": 2.1157126426696777,
254
+ "learning_rate": 2.5546058879392216e-05,
255
+ "loss": 0.3331,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  "step": 632
257
  },
258
  {
259
+ "epoch": 16.0,
260
+ "eval_accuracy": 0.8603174603174604,
261
+ "eval_loss": 0.4517436623573303,
262
+ "eval_runtime": 10.3042,
263
+ "eval_samples_per_second": 122.28,
264
+ "eval_steps_per_second": 1.941,
265
  "step": 632
266
  },
267
  {
268
+ "epoch": 16.9873417721519,
269
+ "grad_norm": 1.8309712409973145,
270
+ "learning_rate": 2.3694207027540365e-05,
271
+ "loss": 0.3219,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  "step": 671
273
  },
274
  {
275
+ "epoch": 16.9873417721519,
276
+ "eval_accuracy": 0.8666666666666667,
277
+ "eval_loss": 0.44011229276657104,
278
+ "eval_runtime": 10.2371,
279
+ "eval_samples_per_second": 123.082,
280
+ "eval_steps_per_second": 1.954,
281
  "step": 671
282
  },
283
  {
284
+ "epoch": 18.0,
285
+ "grad_norm": 2.178051710128784,
286
+ "learning_rate": 2.1794871794871795e-05,
287
+ "loss": 0.3081,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  "step": 711
289
  },
290
  {
291
+ "epoch": 18.0,
292
+ "eval_accuracy": 0.8690476190476191,
293
+ "eval_loss": 0.4321274161338806,
294
+ "eval_runtime": 10.2691,
295
+ "eval_samples_per_second": 122.699,
296
+ "eval_steps_per_second": 1.948,
297
  "step": 711
298
  },
299
  {
300
+ "epoch": 18.9873417721519,
301
+ "grad_norm": 2.0867300033569336,
302
+ "learning_rate": 1.9943019943019945e-05,
303
+ "loss": 0.3194,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
  "step": 750
305
  },
306
  {
307
+ "epoch": 18.9873417721519,
308
+ "eval_accuracy": 0.8690476190476191,
309
+ "eval_loss": 0.4421131908893585,
310
+ "eval_runtime": 10.2636,
311
+ "eval_samples_per_second": 122.764,
312
+ "eval_steps_per_second": 1.949,
313
  "step": 750
314
  },
315
  {
316
+ "epoch": 20.0,
317
+ "grad_norm": 2.312155246734619,
318
+ "learning_rate": 1.804368471035138e-05,
319
+ "loss": 0.3102,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  "step": 790
321
  },
322
  {
323
+ "epoch": 20.0,
324
+ "eval_accuracy": 0.8682539682539683,
325
+ "eval_loss": 0.4470122754573822,
326
+ "eval_runtime": 10.428,
327
+ "eval_samples_per_second": 120.829,
328
+ "eval_steps_per_second": 1.918,
329
  "step": 790
330
  },
331
  {
332
+ "epoch": 20.9873417721519,
333
+ "grad_norm": 1.674055814743042,
334
+ "learning_rate": 1.6191832858499524e-05,
335
+ "loss": 0.2908,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
  "step": 829
337
  },
338
  {
339
+ "epoch": 20.9873417721519,
340
+ "eval_accuracy": 0.8666666666666667,
341
+ "eval_loss": 0.4368663430213928,
342
+ "eval_runtime": 10.304,
343
+ "eval_samples_per_second": 122.282,
344
+ "eval_steps_per_second": 1.941,
345
  "step": 829
346
  },
347
  {
348
+ "epoch": 22.0,
349
+ "grad_norm": 1.8067846298217773,
350
+ "learning_rate": 1.4292497625830961e-05,
351
+ "loss": 0.2794,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  "step": 869
353
  },
354
  {
355
+ "epoch": 22.0,
356
+ "eval_accuracy": 0.8642857142857143,
357
+ "eval_loss": 0.4426242411136627,
358
+ "eval_runtime": 10.2667,
359
+ "eval_samples_per_second": 122.726,
360
+ "eval_steps_per_second": 1.948,
361
  "step": 869
362
  },
363
  {
364
+ "epoch": 22.9873417721519,
365
+ "grad_norm": 2.093015193939209,
366
+ "learning_rate": 1.2440645773979107e-05,
367
+ "loss": 0.2684,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
  "step": 908
369
  },
370
  {
371
+ "epoch": 22.9873417721519,
372
+ "eval_accuracy": 0.8714285714285714,
373
+ "eval_loss": 0.4378375709056854,
374
+ "eval_runtime": 10.2839,
375
+ "eval_samples_per_second": 122.522,
376
+ "eval_steps_per_second": 1.945,
377
  "step": 908
378
  },
379
  {
380
+ "epoch": 24.0,
381
+ "grad_norm": 1.7382984161376953,
382
+ "learning_rate": 1.0541310541310543e-05,
383
+ "loss": 0.2635,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  "step": 948
385
  },
386
  {
387
+ "epoch": 24.0,
388
+ "eval_accuracy": 0.8698412698412699,
389
+ "eval_loss": 0.44393062591552734,
390
+ "eval_runtime": 10.2502,
391
+ "eval_samples_per_second": 122.924,
392
+ "eval_steps_per_second": 1.951,
393
  "step": 948
394
  },
395
  {
396
+ "epoch": 24.9873417721519,
397
+ "grad_norm": 1.4845259189605713,
398
+ "learning_rate": 8.68945868945869e-06,
399
+ "loss": 0.2754,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
  "step": 987
401
  },
402
  {
403
+ "epoch": 24.9873417721519,
404
+ "eval_accuracy": 0.8642857142857143,
405
+ "eval_loss": 0.45485520362854004,
406
+ "eval_runtime": 10.2015,
407
+ "eval_samples_per_second": 123.511,
408
+ "eval_steps_per_second": 1.96,
409
  "step": 987
410
  },
411
  {
412
+ "epoch": 26.0,
413
+ "grad_norm": 1.6323109865188599,
414
+ "learning_rate": 6.790123456790123e-06,
415
+ "loss": 0.2669,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
416
  "step": 1027
417
  },
418
  {
419
+ "epoch": 26.0,
420
+ "eval_accuracy": 0.8674603174603175,
421
+ "eval_loss": 0.44393137097358704,
422
+ "eval_runtime": 10.2535,
423
+ "eval_samples_per_second": 122.885,
424
+ "eval_steps_per_second": 1.951,
425
  "step": 1027
426
  },
427
  {
428
+ "epoch": 26.9873417721519,
429
+ "grad_norm": 1.589407205581665,
430
+ "learning_rate": 4.938271604938272e-06,
431
+ "loss": 0.2616,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
432
  "step": 1066
433
  },
434
  {
435
+ "epoch": 26.9873417721519,
436
+ "eval_accuracy": 0.8714285714285714,
437
+ "eval_loss": 0.4428676962852478,
438
+ "eval_runtime": 10.2429,
439
+ "eval_samples_per_second": 123.012,
440
+ "eval_steps_per_second": 1.953,
441
  "step": 1066
442
  },
443
  {
444
+ "epoch": 28.0,
445
+ "grad_norm": 1.8043303489685059,
446
+ "learning_rate": 3.038936372269706e-06,
447
+ "loss": 0.2501,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
448
  "step": 1106
449
  },
450
  {
451
+ "epoch": 28.0,
452
+ "eval_accuracy": 0.8698412698412699,
453
+ "eval_loss": 0.4408431649208069,
454
+ "eval_runtime": 10.1648,
455
+ "eval_samples_per_second": 123.957,
456
+ "eval_steps_per_second": 1.968,
457
  "step": 1106
458
  },
459
  {
460
+ "epoch": 28.9873417721519,
461
+ "grad_norm": 2.025970935821533,
462
+ "learning_rate": 1.1870845204178538e-06,
463
+ "loss": 0.2622,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464
  "step": 1145
465
  },
466
  {
467
+ "epoch": 28.9873417721519,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
468
  "eval_accuracy": 0.8682539682539683,
469
+ "eval_loss": 0.4434352219104767,
470
+ "eval_runtime": 10.25,
471
+ "eval_samples_per_second": 122.927,
472
+ "eval_steps_per_second": 1.951,
473
+ "step": 1145
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
  },
475
  {
476
+ "epoch": 29.620253164556964,
477
+ "grad_norm": 1.5968279838562012,
478
  "learning_rate": 0.0,
479
+ "loss": 0.2511,
480
+ "step": 1170
481
  },
482
  {
483
+ "epoch": 29.620253164556964,
484
+ "eval_accuracy": 0.8682539682539683,
485
+ "eval_loss": 0.44374439120292664,
486
+ "eval_runtime": 10.1581,
487
+ "eval_samples_per_second": 124.038,
488
+ "eval_steps_per_second": 1.969,
489
+ "step": 1170
490
+ },
491
+ {
492
+ "epoch": 29.620253164556964,
493
+ "step": 1170,
494
+ "total_flos": 2.3141184141358596e+19,
495
+ "train_loss": 0.5155148339067769,
496
+ "train_runtime": 6443.0087,
497
+ "train_samples_per_second": 46.935,
498
+ "train_steps_per_second": 0.182
499
  }
500
  ],
501
  "logging_steps": 500,
502
+ "max_steps": 1170,
503
  "num_input_tokens_seen": 0,
504
+ "num_train_epochs": 30,
505
  "save_steps": 500,
506
  "stateful_callbacks": {
507
  "TrainerControl": {
 
515
  "attributes": {}
516
  }
517
  },
518
+ "total_flos": 2.3141184141358596e+19,
519
+ "train_batch_size": 64,
520
  "trial_name": null,
521
  "trial_params": null
522
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a1c0f33a3024e6ec45ce8978209c580f91e2084ba0bf40c70af9b63aea9815a
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51ee0160cc69a3dbfcc17aff11717ae1cc6585f42ff06591fe7fea8318305b6f
3
  size 5112