minhah commited on
Commit
57e003d
·
verified ·
1 Parent(s): 466ea03

End of training

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. all_results.json +6 -6
  3. test_results.json +6 -6
  4. trainer_state.json +1895 -323
README.md CHANGED
@@ -17,7 +17,7 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [MCG-NJU/videomae-base](https://huggingface.co/MCG-NJU/videomae-base) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 1.6799
21
  - Accuracy: 0.3314
22
 
23
  ## Model description
 
17
 
18
  This model is a fine-tuned version of [MCG-NJU/videomae-base](https://huggingface.co/MCG-NJU/videomae-base) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 1.6795
21
  - Accuracy: 0.3314
22
 
23
  ## Model description
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 7.11,
3
- "eval_accuracy": 0.3414225941422594,
4
- "eval_loss": 1.6634992361068726,
5
- "eval_runtime": 294.1145,
6
- "eval_samples_per_second": 4.063,
7
- "eval_steps_per_second": 0.255
8
  }
 
1
  {
2
+ "epoch": 58.01,
3
+ "eval_accuracy": 0.3313807531380753,
4
+ "eval_loss": 1.679498314857483,
5
+ "eval_runtime": 279.1958,
6
+ "eval_samples_per_second": 4.28,
7
+ "eval_steps_per_second": 0.136
8
  }
test_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 7.11,
3
- "eval_accuracy": 0.3414225941422594,
4
- "eval_loss": 1.6634992361068726,
5
- "eval_runtime": 294.1145,
6
- "eval_samples_per_second": 4.063,
7
- "eval_steps_per_second": 0.255
8
  }
 
1
  {
2
+ "epoch": 58.01,
3
+ "eval_accuracy": 0.3313807531380753,
4
+ "eval_loss": 1.679498314857483,
5
+ "eval_runtime": 279.1958,
6
+ "eval_samples_per_second": 4.28,
7
+ "eval_steps_per_second": 0.136
8
  }
trainer_state.json CHANGED
@@ -1,519 +1,2091 @@
1
  {
2
- "best_metric": 0.34782608695652173,
3
- "best_model_checkpoint": "videomae-base-finetuned-elder/checkpoint-219",
4
- "epoch": 7.112847222222222,
5
  "eval_steps": 500,
6
- "global_step": 576,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.02,
13
- "grad_norm": 5.237472057342529,
14
- "learning_rate": 8.620689655172414e-06,
15
- "loss": 1.7724,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.03,
20
- "grad_norm": 6.054948806762695,
21
- "learning_rate": 1.7241379310344828e-05,
22
- "loss": 1.7224,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.05,
27
- "grad_norm": 4.134993076324463,
28
- "learning_rate": 2.5862068965517244e-05,
29
- "loss": 1.6904,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.07,
34
- "grad_norm": 4.677920341491699,
35
- "learning_rate": 3.4482758620689657e-05,
36
- "loss": 1.756,
 
 
 
 
 
 
 
 
 
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.09,
41
- "grad_norm": 4.563185214996338,
42
- "learning_rate": 4.3103448275862066e-05,
43
- "loss": 1.6722,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.1,
48
- "grad_norm": 5.584333419799805,
49
- "learning_rate": 4.980694980694981e-05,
50
- "loss": 1.7276,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.12,
55
- "grad_norm": 4.3401899337768555,
56
- "learning_rate": 4.884169884169885e-05,
57
- "loss": 1.7389,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.13,
62
- "eval_accuracy": 0.34221598877980364,
63
- "eval_loss": 1.6415042877197266,
64
- "eval_runtime": 290.9078,
65
- "eval_samples_per_second": 4.902,
66
- "eval_steps_per_second": 0.309,
67
- "step": 73
68
  },
69
  {
70
- "epoch": 1.01,
71
- "grad_norm": 3.965003252029419,
72
- "learning_rate": 4.787644787644788e-05,
73
- "loss": 1.7246,
74
  "step": 80
75
  },
76
  {
77
- "epoch": 1.03,
78
- "grad_norm": 5.857859134674072,
79
- "learning_rate": 4.6911196911196914e-05,
80
- "loss": 1.7429,
81
  "step": 90
82
  },
83
  {
84
- "epoch": 1.05,
85
- "grad_norm": 3.8463449478149414,
86
- "learning_rate": 4.594594594594595e-05,
87
- "loss": 1.7281,
88
  "step": 100
89
  },
90
  {
91
- "epoch": 1.06,
92
- "grad_norm": 4.073330879211426,
93
- "learning_rate": 4.498069498069498e-05,
94
- "loss": 1.6925,
95
  "step": 110
96
  },
97
  {
98
- "epoch": 1.08,
99
- "grad_norm": 4.23333740234375,
100
- "learning_rate": 4.401544401544402e-05,
101
- "loss": 1.6861,
 
 
 
 
 
 
 
 
 
102
  "step": 120
103
  },
104
  {
105
- "epoch": 1.1,
106
- "grad_norm": 3.201246500015259,
107
- "learning_rate": 4.305019305019305e-05,
108
- "loss": 1.6294,
109
  "step": 130
110
  },
111
  {
112
- "epoch": 1.12,
113
- "grad_norm": 3.7988574504852295,
114
- "learning_rate": 4.2084942084942086e-05,
115
- "loss": 1.6564,
116
  "step": 140
117
  },
118
  {
119
- "epoch": 1.13,
120
- "eval_accuracy": 0.32538569424964936,
121
- "eval_loss": 1.6350404024124146,
122
- "eval_runtime": 315.829,
123
- "eval_samples_per_second": 4.515,
124
- "eval_steps_per_second": 0.285,
125
- "step": 146
126
- },
127
- {
128
- "epoch": 2.01,
129
- "grad_norm": 4.17667818069458,
130
- "learning_rate": 4.111969111969112e-05,
131
- "loss": 1.681,
132
  "step": 150
133
  },
134
  {
135
- "epoch": 2.02,
136
- "grad_norm": 3.9209909439086914,
137
- "learning_rate": 4.015444015444015e-05,
138
- "loss": 1.7443,
139
  "step": 160
140
  },
141
  {
142
- "epoch": 2.04,
143
- "grad_norm": 2.9084713459014893,
144
- "learning_rate": 3.918918918918919e-05,
145
- "loss": 1.6921,
146
  "step": 170
147
  },
148
  {
149
- "epoch": 2.06,
150
- "grad_norm": 5.516010761260986,
151
- "learning_rate": 3.822393822393823e-05,
152
- "loss": 1.683,
153
  "step": 180
154
  },
155
  {
156
- "epoch": 2.08,
157
- "grad_norm": 2.67150616645813,
158
- "learning_rate": 3.725868725868726e-05,
159
- "loss": 1.6742,
 
 
 
 
 
 
 
 
 
160
  "step": 190
161
  },
162
  {
163
- "epoch": 2.09,
164
- "grad_norm": 4.397495746612549,
165
- "learning_rate": 3.6293436293436295e-05,
166
- "loss": 1.6729,
167
  "step": 200
168
  },
169
  {
170
- "epoch": 2.11,
171
- "grad_norm": 3.290335178375244,
172
- "learning_rate": 3.532818532818533e-05,
173
- "loss": 1.6835,
174
  "step": 210
175
  },
176
  {
177
- "epoch": 2.13,
178
- "eval_accuracy": 0.34782608695652173,
179
- "eval_loss": 1.6297296285629272,
180
- "eval_runtime": 349.3232,
181
- "eval_samples_per_second": 4.082,
182
- "eval_steps_per_second": 0.258,
183
- "step": 219
184
  },
185
  {
186
- "epoch": 3.0,
187
- "grad_norm": 4.048051834106445,
188
- "learning_rate": 3.436293436293436e-05,
189
- "loss": 1.6983,
190
- "step": 220
 
 
191
  },
192
  {
193
- "epoch": 3.02,
194
- "grad_norm": 2.2507669925689697,
195
- "learning_rate": 3.33976833976834e-05,
196
- "loss": 1.6625,
197
  "step": 230
198
  },
199
  {
200
- "epoch": 3.04,
201
- "grad_norm": 3.7626376152038574,
202
- "learning_rate": 3.2432432432432436e-05,
203
- "loss": 1.6627,
204
  "step": 240
205
  },
206
  {
207
- "epoch": 3.05,
208
- "grad_norm": 5.144118309020996,
209
- "learning_rate": 3.1467181467181466e-05,
210
- "loss": 1.5669,
211
  "step": 250
212
  },
213
  {
214
- "epoch": 3.07,
215
- "grad_norm": 4.4853692054748535,
216
- "learning_rate": 3.0501930501930504e-05,
217
- "loss": 1.6128,
 
 
 
 
 
 
 
 
 
218
  "step": 260
219
  },
220
  {
221
- "epoch": 3.09,
222
- "grad_norm": 2.6562840938568115,
223
- "learning_rate": 2.953667953667954e-05,
224
- "loss": 1.7437,
225
  "step": 270
226
  },
227
  {
228
- "epoch": 3.11,
229
- "grad_norm": 5.279839992523193,
230
- "learning_rate": 2.857142857142857e-05,
231
- "loss": 1.6866,
232
  "step": 280
233
  },
234
  {
235
- "epoch": 3.12,
236
- "grad_norm": 4.48944091796875,
237
- "learning_rate": 2.7606177606177608e-05,
238
- "loss": 1.6586,
239
  "step": 290
240
  },
241
  {
242
- "epoch": 3.13,
243
- "eval_accuracy": 0.28892005610098176,
244
- "eval_loss": 1.6578232049942017,
245
- "eval_runtime": 322.0754,
246
- "eval_samples_per_second": 4.428,
247
- "eval_steps_per_second": 0.279,
248
- "step": 292
249
- },
250
- {
251
- "epoch": 4.01,
252
- "grad_norm": 2.674558639526367,
253
- "learning_rate": 2.6640926640926645e-05,
254
- "loss": 1.6305,
255
  "step": 300
256
  },
257
  {
258
- "epoch": 4.03,
259
- "grad_norm": 3.7416186332702637,
260
- "learning_rate": 2.5675675675675675e-05,
261
- "loss": 1.6256,
262
  "step": 310
263
  },
264
  {
265
- "epoch": 4.05,
266
- "grad_norm": 3.584017038345337,
267
- "learning_rate": 2.4710424710424712e-05,
268
- "loss": 1.6289,
269
  "step": 320
270
  },
271
  {
272
- "epoch": 4.07,
273
- "grad_norm": 3.1863162517547607,
274
- "learning_rate": 2.3745173745173746e-05,
275
- "loss": 1.6078,
276
  "step": 330
277
  },
278
  {
279
- "epoch": 4.08,
280
- "grad_norm": 4.056468486785889,
281
- "learning_rate": 2.277992277992278e-05,
282
- "loss": 1.7026,
 
 
 
 
 
 
 
 
 
283
  "step": 340
284
  },
285
  {
286
- "epoch": 4.1,
287
- "grad_norm": 3.548006772994995,
288
- "learning_rate": 2.1814671814671817e-05,
289
- "loss": 1.6603,
290
  "step": 350
291
  },
292
  {
293
- "epoch": 4.12,
294
- "grad_norm": 6.205471515655518,
295
- "learning_rate": 2.084942084942085e-05,
296
- "loss": 1.5774,
297
  "step": 360
298
  },
299
  {
300
- "epoch": 4.13,
301
- "eval_accuracy": 0.32959326788218796,
302
- "eval_loss": 1.6130512952804565,
303
- "eval_runtime": 350.4881,
304
- "eval_samples_per_second": 4.069,
305
- "eval_steps_per_second": 0.257,
306
- "step": 365
307
  },
308
  {
309
- "epoch": 5.01,
310
- "grad_norm": 5.552679538726807,
311
- "learning_rate": 1.9884169884169884e-05,
312
- "loss": 1.566,
 
 
313
  "step": 370
314
  },
315
  {
316
- "epoch": 5.03,
317
- "grad_norm": 4.196056365966797,
318
- "learning_rate": 1.891891891891892e-05,
319
- "loss": 1.644,
320
  "step": 380
321
  },
322
  {
323
- "epoch": 5.04,
324
- "grad_norm": 3.769803047180176,
325
- "learning_rate": 1.7953667953667955e-05,
326
- "loss": 1.6073,
327
  "step": 390
328
  },
329
  {
330
- "epoch": 5.06,
331
- "grad_norm": 4.715469837188721,
332
- "learning_rate": 1.698841698841699e-05,
333
- "loss": 1.648,
334
  "step": 400
335
  },
336
  {
337
- "epoch": 5.08,
338
- "grad_norm": 4.178481101989746,
339
- "learning_rate": 1.6023166023166026e-05,
340
- "loss": 1.6536,
 
 
 
 
 
 
 
 
 
341
  "step": 410
342
  },
343
  {
344
- "epoch": 5.1,
345
- "grad_norm": 3.690173625946045,
346
- "learning_rate": 1.505791505791506e-05,
347
- "loss": 1.5387,
348
  "step": 420
349
  },
350
  {
351
- "epoch": 5.11,
352
- "grad_norm": 4.677956581115723,
353
- "learning_rate": 1.4092664092664093e-05,
354
- "loss": 1.5861,
355
  "step": 430
356
  },
357
  {
358
- "epoch": 5.13,
359
- "eval_accuracy": 0.3429172510518934,
360
- "eval_loss": 1.6085110902786255,
361
- "eval_runtime": 327.2587,
362
- "eval_samples_per_second": 4.357,
363
- "eval_steps_per_second": 0.275,
364
- "step": 438
365
  },
366
  {
367
- "epoch": 6.0,
368
- "grad_norm": 3.9732487201690674,
369
- "learning_rate": 1.3127413127413127e-05,
370
- "loss": 1.5394,
371
- "step": 440
 
 
372
  },
373
  {
374
- "epoch": 6.02,
375
- "grad_norm": 4.788599967956543,
376
- "learning_rate": 1.2162162162162164e-05,
377
- "loss": 1.5403,
378
  "step": 450
379
  },
380
  {
381
- "epoch": 6.04,
382
- "grad_norm": 4.857376575469971,
383
- "learning_rate": 1.1196911196911197e-05,
384
- "loss": 1.4993,
385
  "step": 460
386
  },
387
  {
388
- "epoch": 6.06,
389
- "grad_norm": 4.482532501220703,
390
- "learning_rate": 1.0231660231660233e-05,
391
- "loss": 1.7165,
392
  "step": 470
393
  },
394
  {
395
- "epoch": 6.07,
396
- "grad_norm": 4.450554370880127,
397
- "learning_rate": 9.266409266409266e-06,
398
- "loss": 1.5992,
399
  "step": 480
400
  },
401
  {
402
- "epoch": 6.09,
403
- "grad_norm": 3.0036492347717285,
404
- "learning_rate": 8.301158301158302e-06,
405
- "loss": 1.5497,
 
 
 
 
 
 
 
 
 
406
  "step": 490
407
  },
408
  {
409
- "epoch": 6.11,
410
- "grad_norm": 4.3488240242004395,
411
- "learning_rate": 7.335907335907337e-06,
412
- "loss": 1.6037,
413
  "step": 500
414
  },
415
  {
416
- "epoch": 6.12,
417
- "grad_norm": 3.3629989624023438,
418
- "learning_rate": 6.370656370656371e-06,
419
- "loss": 1.6262,
420
  "step": 510
421
  },
422
  {
423
- "epoch": 6.13,
424
- "eval_accuracy": 0.3155680224403927,
425
- "eval_loss": 1.624500036239624,
426
- "eval_runtime": 326.5278,
427
- "eval_samples_per_second": 4.367,
428
- "eval_steps_per_second": 0.276,
429
- "step": 511
430
  },
431
  {
432
- "epoch": 7.02,
433
- "grad_norm": 3.1343469619750977,
434
- "learning_rate": 5.405405405405406e-06,
435
- "loss": 1.5258,
436
  "step": 520
437
  },
438
  {
439
- "epoch": 7.03,
440
- "grad_norm": 4.27249002456665,
441
- "learning_rate": 4.4401544401544405e-06,
442
- "loss": 1.5383,
443
  "step": 530
444
  },
445
  {
446
- "epoch": 7.05,
447
- "grad_norm": 3.4036643505096436,
448
- "learning_rate": 3.474903474903475e-06,
449
- "loss": 1.4919,
450
  "step": 540
451
  },
452
  {
453
- "epoch": 7.07,
454
- "grad_norm": 4.460765361785889,
455
- "learning_rate": 2.5096525096525096e-06,
456
- "loss": 1.5844,
457
  "step": 550
458
  },
459
  {
460
- "epoch": 7.09,
461
- "grad_norm": 3.683540105819702,
462
- "learning_rate": 1.5444015444015445e-06,
463
- "loss": 1.5198,
 
 
 
 
 
 
 
 
 
464
  "step": 560
465
  },
466
  {
467
- "epoch": 7.1,
468
- "grad_norm": 4.533329963684082,
469
- "learning_rate": 5.791505791505791e-07,
470
- "loss": 1.5688,
471
  "step": 570
472
  },
473
  {
474
- "epoch": 7.11,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
475
  "eval_accuracy": 0.3141654978962132,
476
- "eval_loss": 1.6346054077148438,
477
- "eval_runtime": 366.9807,
478
- "eval_samples_per_second": 3.886,
479
- "eval_steps_per_second": 0.245,
480
- "step": 576
481
- },
482
- {
483
- "epoch": 7.11,
484
- "step": 576,
485
- "total_flos": 1.1388190514622824e+19,
486
- "train_loss": 1.6425048808256786,
487
- "train_runtime": 5181.3338,
488
- "train_samples_per_second": 1.779,
489
- "train_steps_per_second": 0.111
490
- },
491
- {
492
- "epoch": 7.11,
493
- "eval_accuracy": 0.3414225941422594,
494
- "eval_loss": 1.663814902305603,
495
- "eval_runtime": 287.8607,
496
- "eval_samples_per_second": 4.151,
497
- "eval_steps_per_second": 0.261,
498
- "step": 576
499
- },
500
- {
501
- "epoch": 7.11,
502
- "eval_accuracy": 0.3414225941422594,
503
- "eval_loss": 1.6634992361068726,
504
- "eval_runtime": 294.1145,
505
- "eval_samples_per_second": 4.063,
506
- "eval_steps_per_second": 0.255,
507
- "step": 576
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
  }
509
  ],
510
  "logging_steps": 10,
511
- "max_steps": 576,
512
  "num_input_tokens_seen": 0,
513
  "num_train_epochs": 9223372036854775807,
514
  "save_steps": 500,
515
- "total_flos": 1.1388190514622824e+19,
516
- "train_batch_size": 16,
517
  "trial_name": null,
518
  "trial_params": null
519
  }
 
1
  {
2
+ "best_metric": 0.3485273492286115,
3
+ "best_model_checkpoint": "videomae-base-finetuned-elder/checkpoint-259",
4
+ "epoch": 58.00648148148148,
5
  "eval_steps": 500,
6
+ "global_step": 2160,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0,
13
+ "grad_norm": 5.482585906982422,
14
+ "learning_rate": 2.3148148148148148e-06,
15
+ "loss": 1.8348,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.01,
20
+ "grad_norm": 3.854177713394165,
21
+ "learning_rate": 4.6296296296296296e-06,
22
+ "loss": 1.8009,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.01,
27
+ "grad_norm": 2.802464485168457,
28
+ "learning_rate": 6.944444444444445e-06,
29
+ "loss": 1.7518,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.02,
34
+ "eval_accuracy": 0.34011220196353437,
35
+ "eval_loss": 1.6497516632080078,
36
+ "eval_runtime": 288.7052,
37
+ "eval_samples_per_second": 4.939,
38
+ "eval_steps_per_second": 0.156,
39
+ "step": 37
40
+ },
41
+ {
42
+ "epoch": 1.0,
43
+ "grad_norm": 3.4240188598632812,
44
+ "learning_rate": 9.259259259259259e-06,
45
+ "loss": 1.7321,
46
  "step": 40
47
  },
48
  {
49
+ "epoch": 1.01,
50
+ "grad_norm": 3.0115363597869873,
51
+ "learning_rate": 1.1574074074074075e-05,
52
+ "loss": 1.712,
53
  "step": 50
54
  },
55
  {
56
+ "epoch": 1.01,
57
+ "grad_norm": 2.6394731998443604,
58
+ "learning_rate": 1.388888888888889e-05,
59
+ "loss": 1.6934,
60
  "step": 60
61
  },
62
  {
63
+ "epoch": 1.02,
64
+ "grad_norm": 3.2504265308380127,
65
+ "learning_rate": 1.6203703703703704e-05,
66
+ "loss": 1.6435,
67
  "step": 70
68
  },
69
  {
70
+ "epoch": 1.02,
71
+ "eval_accuracy": 0.3436185133239832,
72
+ "eval_loss": 1.63015878200531,
73
+ "eval_runtime": 305.1135,
74
+ "eval_samples_per_second": 4.674,
75
+ "eval_steps_per_second": 0.147,
76
+ "step": 74
77
  },
78
  {
79
+ "epoch": 2.0,
80
+ "grad_norm": 3.015937328338623,
81
+ "learning_rate": 1.8518518518518518e-05,
82
+ "loss": 1.7107,
83
  "step": 80
84
  },
85
  {
86
+ "epoch": 2.01,
87
+ "grad_norm": 3.0576603412628174,
88
+ "learning_rate": 2.0833333333333336e-05,
89
+ "loss": 1.6863,
90
  "step": 90
91
  },
92
  {
93
+ "epoch": 2.01,
94
+ "grad_norm": 2.1496477127075195,
95
+ "learning_rate": 2.314814814814815e-05,
96
+ "loss": 1.6815,
97
  "step": 100
98
  },
99
  {
100
+ "epoch": 2.02,
101
+ "grad_norm": 2.7620458602905273,
102
+ "learning_rate": 2.5462962962962965e-05,
103
+ "loss": 1.6879,
104
  "step": 110
105
  },
106
  {
107
+ "epoch": 2.02,
108
+ "eval_accuracy": 0.34011220196353437,
109
+ "eval_loss": 1.6197831630706787,
110
+ "eval_runtime": 312.1037,
111
+ "eval_samples_per_second": 4.569,
112
+ "eval_steps_per_second": 0.144,
113
+ "step": 111
114
+ },
115
+ {
116
+ "epoch": 3.0,
117
+ "grad_norm": 2.320647954940796,
118
+ "learning_rate": 2.777777777777778e-05,
119
+ "loss": 1.7174,
120
  "step": 120
121
  },
122
  {
123
+ "epoch": 3.01,
124
+ "grad_norm": 3.75618577003479,
125
+ "learning_rate": 3.0092592592592593e-05,
126
+ "loss": 1.6653,
127
  "step": 130
128
  },
129
  {
130
+ "epoch": 3.01,
131
+ "grad_norm": 3.8325207233428955,
132
+ "learning_rate": 3.240740740740741e-05,
133
+ "loss": 1.7059,
134
  "step": 140
135
  },
136
  {
137
+ "epoch": 3.02,
138
+ "eval_accuracy": 0.197054698457223,
139
+ "eval_loss": 1.7028281688690186,
140
+ "eval_runtime": 315.8972,
141
+ "eval_samples_per_second": 4.514,
142
+ "eval_steps_per_second": 0.142,
143
+ "step": 148
144
+ },
145
+ {
146
+ "epoch": 4.0,
147
+ "grad_norm": 4.027806758880615,
148
+ "learning_rate": 3.472222222222222e-05,
149
+ "loss": 1.7085,
150
  "step": 150
151
  },
152
  {
153
+ "epoch": 4.01,
154
+ "grad_norm": 2.945446252822876,
155
+ "learning_rate": 3.7037037037037037e-05,
156
+ "loss": 1.6431,
157
  "step": 160
158
  },
159
  {
160
+ "epoch": 4.01,
161
+ "grad_norm": 2.925701856613159,
162
+ "learning_rate": 3.935185185185186e-05,
163
+ "loss": 1.696,
164
  "step": 170
165
  },
166
  {
167
+ "epoch": 4.01,
168
+ "grad_norm": 3.969252586364746,
169
+ "learning_rate": 4.166666666666667e-05,
170
+ "loss": 1.6555,
171
  "step": 180
172
  },
173
  {
174
+ "epoch": 4.02,
175
+ "eval_accuracy": 0.2903225806451613,
176
+ "eval_loss": 1.6066545248031616,
177
+ "eval_runtime": 318.4782,
178
+ "eval_samples_per_second": 4.478,
179
+ "eval_steps_per_second": 0.141,
180
+ "step": 185
181
+ },
182
+ {
183
+ "epoch": 5.0,
184
+ "grad_norm": 2.911034345626831,
185
+ "learning_rate": 4.3981481481481486e-05,
186
+ "loss": 1.6794,
187
  "step": 190
188
  },
189
  {
190
+ "epoch": 5.01,
191
+ "grad_norm": 2.8345723152160645,
192
+ "learning_rate": 4.62962962962963e-05,
193
+ "loss": 1.671,
194
  "step": 200
195
  },
196
  {
197
+ "epoch": 5.01,
198
+ "grad_norm": 2.6440374851226807,
199
+ "learning_rate": 4.8611111111111115e-05,
200
+ "loss": 1.6703,
201
  "step": 210
202
  },
203
  {
204
+ "epoch": 5.02,
205
+ "grad_norm": 2.750511407852173,
206
+ "learning_rate": 4.9897119341563785e-05,
207
+ "loss": 1.616,
208
+ "step": 220
 
 
209
  },
210
  {
211
+ "epoch": 5.02,
212
+ "eval_accuracy": 0.31626928471248245,
213
+ "eval_loss": 1.6073260307312012,
214
+ "eval_runtime": 318.3032,
215
+ "eval_samples_per_second": 4.48,
216
+ "eval_steps_per_second": 0.141,
217
+ "step": 222
218
  },
219
  {
220
+ "epoch": 6.0,
221
+ "grad_norm": 4.725213527679443,
222
+ "learning_rate": 4.963991769547325e-05,
223
+ "loss": 1.5793,
224
  "step": 230
225
  },
226
  {
227
+ "epoch": 6.01,
228
+ "grad_norm": 3.6718668937683105,
229
+ "learning_rate": 4.938271604938271e-05,
230
+ "loss": 1.6732,
231
  "step": 240
232
  },
233
  {
234
+ "epoch": 6.01,
235
+ "grad_norm": 2.6274123191833496,
236
+ "learning_rate": 4.912551440329218e-05,
237
+ "loss": 1.6706,
238
  "step": 250
239
  },
240
  {
241
+ "epoch": 6.02,
242
+ "eval_accuracy": 0.3485273492286115,
243
+ "eval_loss": 1.5842552185058594,
244
+ "eval_runtime": 318.7867,
245
+ "eval_samples_per_second": 4.473,
246
+ "eval_steps_per_second": 0.141,
247
+ "step": 259
248
+ },
249
+ {
250
+ "epoch": 7.0,
251
+ "grad_norm": 4.207123279571533,
252
+ "learning_rate": 4.886831275720165e-05,
253
+ "loss": 1.6857,
254
  "step": 260
255
  },
256
  {
257
+ "epoch": 7.01,
258
+ "grad_norm": 2.897780656814575,
259
+ "learning_rate": 4.8611111111111115e-05,
260
+ "loss": 1.5988,
261
  "step": 270
262
  },
263
  {
264
+ "epoch": 7.01,
265
+ "grad_norm": 2.6492152214050293,
266
+ "learning_rate": 4.835390946502058e-05,
267
+ "loss": 1.5941,
268
  "step": 280
269
  },
270
  {
271
+ "epoch": 7.01,
272
+ "grad_norm": 3.756108283996582,
273
+ "learning_rate": 4.809670781893004e-05,
274
+ "loss": 1.6317,
275
  "step": 290
276
  },
277
  {
278
+ "epoch": 7.02,
279
+ "eval_accuracy": 0.317671809256662,
280
+ "eval_loss": 1.6478837728500366,
281
+ "eval_runtime": 321.4859,
282
+ "eval_samples_per_second": 4.436,
283
+ "eval_steps_per_second": 0.14,
284
+ "step": 296
285
+ },
286
+ {
287
+ "epoch": 8.0,
288
+ "grad_norm": 3.5807301998138428,
289
+ "learning_rate": 4.783950617283951e-05,
290
+ "loss": 1.569,
291
  "step": 300
292
  },
293
  {
294
+ "epoch": 8.01,
295
+ "grad_norm": 3.0637378692626953,
296
+ "learning_rate": 4.758230452674897e-05,
297
+ "loss": 1.603,
298
  "step": 310
299
  },
300
  {
301
+ "epoch": 8.01,
302
+ "grad_norm": 2.659273386001587,
303
+ "learning_rate": 4.732510288065844e-05,
304
+ "loss": 1.658,
305
  "step": 320
306
  },
307
  {
308
+ "epoch": 8.02,
309
+ "grad_norm": 3.2369003295898438,
310
+ "learning_rate": 4.70679012345679e-05,
311
+ "loss": 1.5798,
312
  "step": 330
313
  },
314
  {
315
+ "epoch": 8.02,
316
+ "eval_accuracy": 0.19845722300140253,
317
+ "eval_loss": 1.7481985092163086,
318
+ "eval_runtime": 327.2626,
319
+ "eval_samples_per_second": 4.357,
320
+ "eval_steps_per_second": 0.138,
321
+ "step": 333
322
+ },
323
+ {
324
+ "epoch": 9.0,
325
+ "grad_norm": 2.4828362464904785,
326
+ "learning_rate": 4.6810699588477366e-05,
327
+ "loss": 1.6029,
328
  "step": 340
329
  },
330
  {
331
+ "epoch": 9.01,
332
+ "grad_norm": 4.688723087310791,
333
+ "learning_rate": 4.6553497942386833e-05,
334
+ "loss": 1.5591,
335
  "step": 350
336
  },
337
  {
338
+ "epoch": 9.01,
339
+ "grad_norm": 3.989243268966675,
340
+ "learning_rate": 4.62962962962963e-05,
341
+ "loss": 1.5823,
342
  "step": 360
343
  },
344
  {
345
+ "epoch": 9.02,
346
+ "grad_norm": 7.375450134277344,
347
+ "learning_rate": 4.603909465020577e-05,
348
+ "loss": 1.5923,
349
+ "step": 370
 
 
350
  },
351
  {
352
+ "epoch": 9.02,
353
+ "eval_accuracy": 0.270687237026648,
354
+ "eval_loss": 1.652872085571289,
355
+ "eval_runtime": 320.903,
356
+ "eval_samples_per_second": 4.444,
357
+ "eval_steps_per_second": 0.14,
358
  "step": 370
359
  },
360
  {
361
+ "epoch": 10.0,
362
+ "grad_norm": 2.5953173637390137,
363
+ "learning_rate": 4.578189300411523e-05,
364
+ "loss": 1.5677,
365
  "step": 380
366
  },
367
  {
368
+ "epoch": 10.01,
369
+ "grad_norm": 3.554826498031616,
370
+ "learning_rate": 4.5524691358024696e-05,
371
+ "loss": 1.5476,
372
  "step": 390
373
  },
374
  {
375
+ "epoch": 10.01,
376
+ "grad_norm": 3.9055745601654053,
377
+ "learning_rate": 4.5267489711934157e-05,
378
+ "loss": 1.6002,
379
  "step": 400
380
  },
381
  {
382
+ "epoch": 10.02,
383
+ "eval_accuracy": 0.3246844319775596,
384
+ "eval_loss": 1.6174861192703247,
385
+ "eval_runtime": 325.4885,
386
+ "eval_samples_per_second": 4.381,
387
+ "eval_steps_per_second": 0.138,
388
+ "step": 407
389
+ },
390
+ {
391
+ "epoch": 11.0,
392
+ "grad_norm": 1.8588017225265503,
393
+ "learning_rate": 4.5010288065843624e-05,
394
+ "loss": 1.5817,
395
  "step": 410
396
  },
397
  {
398
+ "epoch": 11.01,
399
+ "grad_norm": 5.376626491546631,
400
+ "learning_rate": 4.4753086419753084e-05,
401
+ "loss": 1.5898,
402
  "step": 420
403
  },
404
  {
405
+ "epoch": 11.01,
406
+ "grad_norm": 3.582792043685913,
407
+ "learning_rate": 4.449588477366255e-05,
408
+ "loss": 1.5793,
409
  "step": 430
410
  },
411
  {
412
+ "epoch": 11.02,
413
+ "grad_norm": 4.624961853027344,
414
+ "learning_rate": 4.423868312757202e-05,
415
+ "loss": 1.4946,
416
+ "step": 440
 
 
417
  },
418
  {
419
+ "epoch": 11.02,
420
+ "eval_accuracy": 0.29453015427769985,
421
+ "eval_loss": 1.641377568244934,
422
+ "eval_runtime": 313.5978,
423
+ "eval_samples_per_second": 4.547,
424
+ "eval_steps_per_second": 0.143,
425
+ "step": 444
426
  },
427
  {
428
+ "epoch": 12.0,
429
+ "grad_norm": 3.0706281661987305,
430
+ "learning_rate": 4.3981481481481486e-05,
431
+ "loss": 1.4587,
432
  "step": 450
433
  },
434
  {
435
+ "epoch": 12.01,
436
+ "grad_norm": 3.571446180343628,
437
+ "learning_rate": 4.372427983539095e-05,
438
+ "loss": 1.531,
439
  "step": 460
440
  },
441
  {
442
+ "epoch": 12.01,
443
+ "grad_norm": 5.381348133087158,
444
+ "learning_rate": 4.3467078189300414e-05,
445
+ "loss": 1.5326,
446
  "step": 470
447
  },
448
  {
449
+ "epoch": 12.02,
450
+ "grad_norm": 2.7086222171783447,
451
+ "learning_rate": 4.3209876543209875e-05,
452
+ "loss": 1.5688,
453
  "step": 480
454
  },
455
  {
456
+ "epoch": 12.02,
457
+ "eval_accuracy": 0.3338008415147265,
458
+ "eval_loss": 1.6061851978302002,
459
+ "eval_runtime": 318.985,
460
+ "eval_samples_per_second": 4.47,
461
+ "eval_steps_per_second": 0.141,
462
+ "step": 481
463
+ },
464
+ {
465
+ "epoch": 13.0,
466
+ "grad_norm": 2.4169111251831055,
467
+ "learning_rate": 4.295267489711934e-05,
468
+ "loss": 1.4803,
469
  "step": 490
470
  },
471
  {
472
+ "epoch": 13.01,
473
+ "grad_norm": 2.4832234382629395,
474
+ "learning_rate": 4.269547325102881e-05,
475
+ "loss": 1.534,
476
  "step": 500
477
  },
478
  {
479
+ "epoch": 13.01,
480
+ "grad_norm": 2.9390156269073486,
481
+ "learning_rate": 4.243827160493827e-05,
482
+ "loss": 1.5322,
483
  "step": 510
484
  },
485
  {
486
+ "epoch": 13.02,
487
+ "eval_accuracy": 0.2805049088359046,
488
+ "eval_loss": 1.6427088975906372,
489
+ "eval_runtime": 325.4763,
490
+ "eval_samples_per_second": 4.381,
491
+ "eval_steps_per_second": 0.138,
492
+ "step": 518
493
  },
494
  {
495
+ "epoch": 14.0,
496
+ "grad_norm": 3.6847591400146484,
497
+ "learning_rate": 4.2181069958847744e-05,
498
+ "loss": 1.5182,
499
  "step": 520
500
  },
501
  {
502
+ "epoch": 14.01,
503
+ "grad_norm": 2.374372959136963,
504
+ "learning_rate": 4.1923868312757205e-05,
505
+ "loss": 1.4685,
506
  "step": 530
507
  },
508
  {
509
+ "epoch": 14.01,
510
+ "grad_norm": 5.837779521942139,
511
+ "learning_rate": 4.166666666666667e-05,
512
+ "loss": 1.6207,
513
  "step": 540
514
  },
515
  {
516
+ "epoch": 14.01,
517
+ "grad_norm": 3.177703380584717,
518
+ "learning_rate": 4.140946502057613e-05,
519
+ "loss": 1.5078,
520
  "step": 550
521
  },
522
  {
523
+ "epoch": 14.02,
524
+ "eval_accuracy": 0.3134642356241234,
525
+ "eval_loss": 1.7241849899291992,
526
+ "eval_runtime": 321.2611,
527
+ "eval_samples_per_second": 4.439,
528
+ "eval_steps_per_second": 0.14,
529
+ "step": 555
530
+ },
531
+ {
532
+ "epoch": 15.0,
533
+ "grad_norm": 4.3974199295043945,
534
+ "learning_rate": 4.11522633744856e-05,
535
+ "loss": 1.4642,
536
  "step": 560
537
  },
538
  {
539
+ "epoch": 15.01,
540
+ "grad_norm": 2.068755626678467,
541
+ "learning_rate": 4.089506172839506e-05,
542
+ "loss": 1.4817,
543
  "step": 570
544
  },
545
  {
546
+ "epoch": 15.01,
547
+ "grad_norm": 3.324115514755249,
548
+ "learning_rate": 4.063786008230453e-05,
549
+ "loss": 1.5062,
550
+ "step": 580
551
+ },
552
+ {
553
+ "epoch": 15.02,
554
+ "grad_norm": 4.541093349456787,
555
+ "learning_rate": 4.038065843621399e-05,
556
+ "loss": 1.5014,
557
+ "step": 590
558
+ },
559
+ {
560
+ "epoch": 15.02,
561
+ "eval_accuracy": 0.32187938288920054,
562
+ "eval_loss": 1.6587475538253784,
563
+ "eval_runtime": 326.5203,
564
+ "eval_samples_per_second": 4.367,
565
+ "eval_steps_per_second": 0.138,
566
+ "step": 592
567
+ },
568
+ {
569
+ "epoch": 16.0,
570
+ "grad_norm": 2.232940435409546,
571
+ "learning_rate": 4.012345679012346e-05,
572
+ "loss": 1.5007,
573
+ "step": 600
574
+ },
575
+ {
576
+ "epoch": 16.01,
577
+ "grad_norm": 3.5462706089019775,
578
+ "learning_rate": 3.986625514403292e-05,
579
+ "loss": 1.4395,
580
+ "step": 610
581
+ },
582
+ {
583
+ "epoch": 16.01,
584
+ "grad_norm": 3.092838764190674,
585
+ "learning_rate": 3.960905349794239e-05,
586
+ "loss": 1.4861,
587
+ "step": 620
588
+ },
589
+ {
590
+ "epoch": 16.02,
591
+ "eval_accuracy": 0.23492286115007013,
592
+ "eval_loss": 1.8075391054153442,
593
+ "eval_runtime": 310.363,
594
+ "eval_samples_per_second": 4.595,
595
+ "eval_steps_per_second": 0.145,
596
+ "step": 629
597
+ },
598
+ {
599
+ "epoch": 17.0,
600
+ "grad_norm": 2.710343837738037,
601
+ "learning_rate": 3.935185185185186e-05,
602
+ "loss": 1.4438,
603
+ "step": 630
604
+ },
605
+ {
606
+ "epoch": 17.01,
607
+ "grad_norm": 3.683576822280884,
608
+ "learning_rate": 3.909465020576132e-05,
609
+ "loss": 1.4611,
610
+ "step": 640
611
+ },
612
+ {
613
+ "epoch": 17.01,
614
+ "grad_norm": 4.069783687591553,
615
+ "learning_rate": 3.8837448559670786e-05,
616
+ "loss": 1.4678,
617
+ "step": 650
618
+ },
619
+ {
620
+ "epoch": 17.01,
621
+ "grad_norm": 3.3984758853912354,
622
+ "learning_rate": 3.8580246913580246e-05,
623
+ "loss": 1.4983,
624
+ "step": 660
625
+ },
626
+ {
627
+ "epoch": 17.02,
628
+ "eval_accuracy": 0.3071528751753156,
629
+ "eval_loss": 1.6724629402160645,
630
+ "eval_runtime": 325.7044,
631
+ "eval_samples_per_second": 4.378,
632
+ "eval_steps_per_second": 0.138,
633
+ "step": 666
634
+ },
635
+ {
636
+ "epoch": 18.0,
637
+ "grad_norm": 3.628767967224121,
638
+ "learning_rate": 3.8323045267489713e-05,
639
+ "loss": 1.4937,
640
+ "step": 670
641
+ },
642
+ {
643
+ "epoch": 18.01,
644
+ "grad_norm": 4.624797821044922,
645
+ "learning_rate": 3.806584362139918e-05,
646
+ "loss": 1.4637,
647
+ "step": 680
648
+ },
649
+ {
650
+ "epoch": 18.01,
651
+ "grad_norm": 4.030791759490967,
652
+ "learning_rate": 3.780864197530865e-05,
653
+ "loss": 1.4073,
654
+ "step": 690
655
+ },
656
+ {
657
+ "epoch": 18.02,
658
+ "grad_norm": 3.900832176208496,
659
+ "learning_rate": 3.755144032921811e-05,
660
+ "loss": 1.4716,
661
+ "step": 700
662
+ },
663
+ {
664
+ "epoch": 18.02,
665
+ "eval_accuracy": 0.2657784011220196,
666
+ "eval_loss": 1.7466487884521484,
667
+ "eval_runtime": 316.8763,
668
+ "eval_samples_per_second": 4.5,
669
+ "eval_steps_per_second": 0.142,
670
+ "step": 703
671
+ },
672
+ {
673
+ "epoch": 19.0,
674
+ "grad_norm": 3.4346656799316406,
675
+ "learning_rate": 3.7294238683127576e-05,
676
+ "loss": 1.4094,
677
+ "step": 710
678
+ },
679
+ {
680
+ "epoch": 19.01,
681
+ "grad_norm": 2.6606976985931396,
682
+ "learning_rate": 3.7037037037037037e-05,
683
+ "loss": 1.4,
684
+ "step": 720
685
+ },
686
+ {
687
+ "epoch": 19.01,
688
+ "grad_norm": 3.55667781829834,
689
+ "learning_rate": 3.6779835390946504e-05,
690
+ "loss": 1.4422,
691
+ "step": 730
692
+ },
693
+ {
694
+ "epoch": 19.02,
695
+ "grad_norm": 9.335260391235352,
696
+ "learning_rate": 3.6522633744855964e-05,
697
+ "loss": 1.5072,
698
+ "step": 740
699
+ },
700
+ {
701
+ "epoch": 19.02,
702
+ "eval_accuracy": 0.2482468443197756,
703
+ "eval_loss": 1.7423261404037476,
704
+ "eval_runtime": 323.8192,
705
+ "eval_samples_per_second": 4.404,
706
+ "eval_steps_per_second": 0.139,
707
+ "step": 740
708
+ },
709
+ {
710
+ "epoch": 20.0,
711
+ "grad_norm": 4.2967681884765625,
712
+ "learning_rate": 3.626543209876543e-05,
713
+ "loss": 1.4085,
714
+ "step": 750
715
+ },
716
+ {
717
+ "epoch": 20.01,
718
+ "grad_norm": 3.630871295928955,
719
+ "learning_rate": 3.60082304526749e-05,
720
+ "loss": 1.3651,
721
+ "step": 760
722
+ },
723
+ {
724
+ "epoch": 20.01,
725
+ "grad_norm": 3.8776559829711914,
726
+ "learning_rate": 3.5751028806584366e-05,
727
+ "loss": 1.4874,
728
+ "step": 770
729
+ },
730
+ {
731
+ "epoch": 20.02,
732
+ "eval_accuracy": 0.2447405329593268,
733
+ "eval_loss": 1.7873163223266602,
734
+ "eval_runtime": 318.591,
735
+ "eval_samples_per_second": 4.476,
736
+ "eval_steps_per_second": 0.141,
737
+ "step": 777
738
+ },
739
+ {
740
+ "epoch": 21.0,
741
+ "grad_norm": 3.94429087638855,
742
+ "learning_rate": 3.5493827160493834e-05,
743
+ "loss": 1.4371,
744
+ "step": 780
745
+ },
746
+ {
747
+ "epoch": 21.01,
748
+ "grad_norm": 3.531804084777832,
749
+ "learning_rate": 3.5236625514403294e-05,
750
+ "loss": 1.419,
751
+ "step": 790
752
+ },
753
+ {
754
+ "epoch": 21.01,
755
+ "grad_norm": 3.9950878620147705,
756
+ "learning_rate": 3.497942386831276e-05,
757
+ "loss": 1.4106,
758
+ "step": 800
759
+ },
760
+ {
761
+ "epoch": 21.02,
762
+ "grad_norm": 5.080261707305908,
763
+ "learning_rate": 3.472222222222222e-05,
764
+ "loss": 1.4236,
765
+ "step": 810
766
+ },
767
+ {
768
+ "epoch": 21.02,
769
+ "eval_accuracy": 0.2496493688639551,
770
+ "eval_loss": 1.828230857849121,
771
+ "eval_runtime": 324.6474,
772
+ "eval_samples_per_second": 4.392,
773
+ "eval_steps_per_second": 0.139,
774
+ "step": 814
775
+ },
776
+ {
777
+ "epoch": 22.0,
778
+ "grad_norm": 3.356649160385132,
779
+ "learning_rate": 3.446502057613169e-05,
780
+ "loss": 1.3901,
781
+ "step": 820
782
+ },
783
+ {
784
+ "epoch": 22.01,
785
+ "grad_norm": 4.261005401611328,
786
+ "learning_rate": 3.420781893004115e-05,
787
+ "loss": 1.3512,
788
+ "step": 830
789
+ },
790
+ {
791
+ "epoch": 22.01,
792
+ "grad_norm": 4.515724182128906,
793
+ "learning_rate": 3.395061728395062e-05,
794
+ "loss": 1.4257,
795
+ "step": 840
796
+ },
797
+ {
798
+ "epoch": 22.02,
799
+ "grad_norm": 3.238006353378296,
800
+ "learning_rate": 3.3693415637860085e-05,
801
+ "loss": 1.4134,
802
+ "step": 850
803
+ },
804
+ {
805
+ "epoch": 22.02,
806
+ "eval_accuracy": 0.226507713884993,
807
+ "eval_loss": 1.840139389038086,
808
+ "eval_runtime": 337.3817,
809
+ "eval_samples_per_second": 4.227,
810
+ "eval_steps_per_second": 0.133,
811
+ "step": 851
812
+ },
813
+ {
814
+ "epoch": 23.0,
815
+ "grad_norm": 4.510561943054199,
816
+ "learning_rate": 3.343621399176955e-05,
817
+ "loss": 1.3086,
818
+ "step": 860
819
+ },
820
+ {
821
+ "epoch": 23.01,
822
+ "grad_norm": 3.6139793395996094,
823
+ "learning_rate": 3.317901234567901e-05,
824
+ "loss": 1.3137,
825
+ "step": 870
826
+ },
827
+ {
828
+ "epoch": 23.01,
829
+ "grad_norm": 4.887772560119629,
830
+ "learning_rate": 3.292181069958848e-05,
831
+ "loss": 1.3889,
832
+ "step": 880
833
+ },
834
+ {
835
+ "epoch": 23.02,
836
+ "eval_accuracy": 0.2713884992987377,
837
+ "eval_loss": 1.769424557685852,
838
+ "eval_runtime": 322.9783,
839
+ "eval_samples_per_second": 4.415,
840
+ "eval_steps_per_second": 0.139,
841
+ "step": 888
842
+ },
843
+ {
844
+ "epoch": 24.0,
845
+ "grad_norm": 5.044980049133301,
846
+ "learning_rate": 3.266460905349795e-05,
847
+ "loss": 1.5123,
848
+ "step": 890
849
+ },
850
+ {
851
+ "epoch": 24.01,
852
+ "grad_norm": 3.513139247894287,
853
+ "learning_rate": 3.240740740740741e-05,
854
+ "loss": 1.3466,
855
+ "step": 900
856
+ },
857
+ {
858
+ "epoch": 24.01,
859
+ "grad_norm": 4.22142219543457,
860
+ "learning_rate": 3.2150205761316875e-05,
861
+ "loss": 1.3941,
862
+ "step": 910
863
+ },
864
+ {
865
+ "epoch": 24.01,
866
+ "grad_norm": 4.938148021697998,
867
+ "learning_rate": 3.1893004115226336e-05,
868
+ "loss": 1.436,
869
+ "step": 920
870
+ },
871
+ {
872
+ "epoch": 24.02,
873
+ "eval_accuracy": 0.3022440392706872,
874
+ "eval_loss": 1.7301750183105469,
875
+ "eval_runtime": 333.7787,
876
+ "eval_samples_per_second": 4.272,
877
+ "eval_steps_per_second": 0.135,
878
+ "step": 925
879
+ },
880
+ {
881
+ "epoch": 25.0,
882
+ "grad_norm": 4.633960247039795,
883
+ "learning_rate": 3.16358024691358e-05,
884
+ "loss": 1.3746,
885
+ "step": 930
886
+ },
887
+ {
888
+ "epoch": 25.01,
889
+ "grad_norm": 4.339792251586914,
890
+ "learning_rate": 3.137860082304527e-05,
891
+ "loss": 1.3586,
892
+ "step": 940
893
+ },
894
+ {
895
+ "epoch": 25.01,
896
+ "grad_norm": 4.505563735961914,
897
+ "learning_rate": 3.112139917695474e-05,
898
+ "loss": 1.3702,
899
+ "step": 950
900
+ },
901
+ {
902
+ "epoch": 25.02,
903
+ "grad_norm": 3.4057440757751465,
904
+ "learning_rate": 3.08641975308642e-05,
905
+ "loss": 1.3266,
906
+ "step": 960
907
+ },
908
+ {
909
+ "epoch": 25.02,
910
  "eval_accuracy": 0.3141654978962132,
911
+ "eval_loss": 1.7449009418487549,
912
+ "eval_runtime": 322.6179,
913
+ "eval_samples_per_second": 4.42,
914
+ "eval_steps_per_second": 0.139,
915
+ "step": 962
916
+ },
917
+ {
918
+ "epoch": 26.0,
919
+ "grad_norm": 3.738858699798584,
920
+ "learning_rate": 3.0606995884773666e-05,
921
+ "loss": 1.3037,
922
+ "step": 970
923
+ },
924
+ {
925
+ "epoch": 26.01,
926
+ "grad_norm": 4.053394794464111,
927
+ "learning_rate": 3.0349794238683126e-05,
928
+ "loss": 1.2894,
929
+ "step": 980
930
+ },
931
+ {
932
+ "epoch": 26.01,
933
+ "grad_norm": 3.324256658554077,
934
+ "learning_rate": 3.0092592592592593e-05,
935
+ "loss": 1.3165,
936
+ "step": 990
937
+ },
938
+ {
939
+ "epoch": 26.02,
940
+ "eval_accuracy": 0.2938288920056101,
941
+ "eval_loss": 1.7723056077957153,
942
+ "eval_runtime": 328.3049,
943
+ "eval_samples_per_second": 4.344,
944
+ "eval_steps_per_second": 0.137,
945
+ "step": 999
946
+ },
947
+ {
948
+ "epoch": 27.0,
949
+ "grad_norm": 3.431753158569336,
950
+ "learning_rate": 2.9835390946502057e-05,
951
+ "loss": 1.3818,
952
+ "step": 1000
953
+ },
954
+ {
955
+ "epoch": 27.01,
956
+ "grad_norm": 4.195125579833984,
957
+ "learning_rate": 2.9578189300411525e-05,
958
+ "loss": 1.292,
959
+ "step": 1010
960
+ },
961
+ {
962
+ "epoch": 27.01,
963
+ "grad_norm": 4.109055995941162,
964
+ "learning_rate": 2.9320987654320992e-05,
965
+ "loss": 1.3738,
966
+ "step": 1020
967
+ },
968
+ {
969
+ "epoch": 27.01,
970
+ "grad_norm": 4.447628498077393,
971
+ "learning_rate": 2.9063786008230453e-05,
972
+ "loss": 1.3522,
973
+ "step": 1030
974
+ },
975
+ {
976
+ "epoch": 27.02,
977
+ "eval_accuracy": 0.31977559607293127,
978
+ "eval_loss": 1.7750145196914673,
979
+ "eval_runtime": 330.3668,
980
+ "eval_samples_per_second": 4.316,
981
+ "eval_steps_per_second": 0.136,
982
+ "step": 1036
983
+ },
984
+ {
985
+ "epoch": 28.0,
986
+ "grad_norm": 4.414978981018066,
987
+ "learning_rate": 2.880658436213992e-05,
988
+ "loss": 1.2548,
989
+ "step": 1040
990
+ },
991
+ {
992
+ "epoch": 28.01,
993
+ "grad_norm": 4.450014114379883,
994
+ "learning_rate": 2.8549382716049384e-05,
995
+ "loss": 1.2919,
996
+ "step": 1050
997
+ },
998
+ {
999
+ "epoch": 28.01,
1000
+ "grad_norm": 3.938246250152588,
1001
+ "learning_rate": 2.829218106995885e-05,
1002
+ "loss": 1.4023,
1003
+ "step": 1060
1004
+ },
1005
+ {
1006
+ "epoch": 28.02,
1007
+ "grad_norm": 4.146456718444824,
1008
+ "learning_rate": 2.8034979423868312e-05,
1009
+ "loss": 1.2635,
1010
+ "step": 1070
1011
+ },
1012
+ {
1013
+ "epoch": 28.02,
1014
+ "eval_accuracy": 0.3015427769985975,
1015
+ "eval_loss": 1.8097264766693115,
1016
+ "eval_runtime": 334.963,
1017
+ "eval_samples_per_second": 4.257,
1018
+ "eval_steps_per_second": 0.134,
1019
+ "step": 1073
1020
+ },
1021
+ {
1022
+ "epoch": 29.0,
1023
+ "grad_norm": 4.926144599914551,
1024
+ "learning_rate": 2.777777777777778e-05,
1025
+ "loss": 1.2375,
1026
+ "step": 1080
1027
+ },
1028
+ {
1029
+ "epoch": 29.01,
1030
+ "grad_norm": 5.324474334716797,
1031
+ "learning_rate": 2.7520576131687243e-05,
1032
+ "loss": 1.2292,
1033
+ "step": 1090
1034
+ },
1035
+ {
1036
+ "epoch": 29.01,
1037
+ "grad_norm": 5.955705642700195,
1038
+ "learning_rate": 2.726337448559671e-05,
1039
+ "loss": 1.3175,
1040
+ "step": 1100
1041
+ },
1042
+ {
1043
+ "epoch": 29.02,
1044
+ "grad_norm": 15.393019676208496,
1045
+ "learning_rate": 2.700617283950617e-05,
1046
+ "loss": 1.5006,
1047
+ "step": 1110
1048
+ },
1049
+ {
1050
+ "epoch": 29.02,
1051
+ "eval_accuracy": 0.2903225806451613,
1052
+ "eval_loss": 1.764939785003662,
1053
+ "eval_runtime": 334.1156,
1054
+ "eval_samples_per_second": 4.268,
1055
+ "eval_steps_per_second": 0.135,
1056
+ "step": 1110
1057
+ },
1058
+ {
1059
+ "epoch": 30.0,
1060
+ "grad_norm": 5.538832187652588,
1061
+ "learning_rate": 2.6748971193415638e-05,
1062
+ "loss": 1.2272,
1063
+ "step": 1120
1064
+ },
1065
+ {
1066
+ "epoch": 30.01,
1067
+ "grad_norm": 4.8999834060668945,
1068
+ "learning_rate": 2.6491769547325102e-05,
1069
+ "loss": 1.3556,
1070
+ "step": 1130
1071
+ },
1072
+ {
1073
+ "epoch": 30.01,
1074
+ "grad_norm": 4.719162464141846,
1075
+ "learning_rate": 2.623456790123457e-05,
1076
+ "loss": 1.2839,
1077
+ "step": 1140
1078
+ },
1079
+ {
1080
+ "epoch": 30.02,
1081
+ "eval_accuracy": 0.2720897615708275,
1082
+ "eval_loss": 1.7946317195892334,
1083
+ "eval_runtime": 329.2552,
1084
+ "eval_samples_per_second": 4.331,
1085
+ "eval_steps_per_second": 0.137,
1086
+ "step": 1147
1087
+ },
1088
+ {
1089
+ "epoch": 31.0,
1090
+ "grad_norm": 4.000698566436768,
1091
+ "learning_rate": 2.5977366255144037e-05,
1092
+ "loss": 1.317,
1093
+ "step": 1150
1094
+ },
1095
+ {
1096
+ "epoch": 31.01,
1097
+ "grad_norm": 4.718947887420654,
1098
+ "learning_rate": 2.5720164609053497e-05,
1099
+ "loss": 1.2813,
1100
+ "step": 1160
1101
+ },
1102
+ {
1103
+ "epoch": 31.01,
1104
+ "grad_norm": 5.832375526428223,
1105
+ "learning_rate": 2.5462962962962965e-05,
1106
+ "loss": 1.3293,
1107
+ "step": 1170
1108
+ },
1109
+ {
1110
+ "epoch": 31.02,
1111
+ "grad_norm": 4.554795742034912,
1112
+ "learning_rate": 2.520576131687243e-05,
1113
+ "loss": 1.2542,
1114
+ "step": 1180
1115
+ },
1116
+ {
1117
+ "epoch": 31.02,
1118
+ "eval_accuracy": 0.3064516129032258,
1119
+ "eval_loss": 1.8281904458999634,
1120
+ "eval_runtime": 338.8231,
1121
+ "eval_samples_per_second": 4.209,
1122
+ "eval_steps_per_second": 0.133,
1123
+ "step": 1184
1124
+ },
1125
+ {
1126
+ "epoch": 32.0,
1127
+ "grad_norm": 4.611420631408691,
1128
+ "learning_rate": 2.4948559670781893e-05,
1129
+ "loss": 1.2816,
1130
+ "step": 1190
1131
+ },
1132
+ {
1133
+ "epoch": 32.01,
1134
+ "grad_norm": 4.797652721405029,
1135
+ "learning_rate": 2.4691358024691357e-05,
1136
+ "loss": 1.2418,
1137
+ "step": 1200
1138
+ },
1139
+ {
1140
+ "epoch": 32.01,
1141
+ "grad_norm": 4.842775821685791,
1142
+ "learning_rate": 2.4434156378600824e-05,
1143
+ "loss": 1.2993,
1144
+ "step": 1210
1145
+ },
1146
+ {
1147
+ "epoch": 32.02,
1148
+ "grad_norm": 3.560959577560425,
1149
+ "learning_rate": 2.417695473251029e-05,
1150
+ "loss": 1.2637,
1151
+ "step": 1220
1152
+ },
1153
+ {
1154
+ "epoch": 32.02,
1155
+ "eval_accuracy": 0.29453015427769985,
1156
+ "eval_loss": 1.9262746572494507,
1157
+ "eval_runtime": 329.2386,
1158
+ "eval_samples_per_second": 4.331,
1159
+ "eval_steps_per_second": 0.137,
1160
+ "step": 1221
1161
+ },
1162
+ {
1163
+ "epoch": 33.0,
1164
+ "grad_norm": 4.209071159362793,
1165
+ "learning_rate": 2.3919753086419755e-05,
1166
+ "loss": 1.283,
1167
+ "step": 1230
1168
+ },
1169
+ {
1170
+ "epoch": 33.01,
1171
+ "grad_norm": 4.801421642303467,
1172
+ "learning_rate": 2.366255144032922e-05,
1173
+ "loss": 1.2626,
1174
+ "step": 1240
1175
+ },
1176
+ {
1177
+ "epoch": 33.01,
1178
+ "grad_norm": 3.3417141437530518,
1179
+ "learning_rate": 2.3405349794238683e-05,
1180
+ "loss": 1.2725,
1181
+ "step": 1250
1182
+ },
1183
+ {
1184
+ "epoch": 33.02,
1185
+ "eval_accuracy": 0.2812061711079944,
1186
+ "eval_loss": 1.8877640962600708,
1187
+ "eval_runtime": 315.8954,
1188
+ "eval_samples_per_second": 4.514,
1189
+ "eval_steps_per_second": 0.142,
1190
+ "step": 1258
1191
+ },
1192
+ {
1193
+ "epoch": 34.0,
1194
+ "grad_norm": 4.585484504699707,
1195
+ "learning_rate": 2.314814814814815e-05,
1196
+ "loss": 1.2775,
1197
+ "step": 1260
1198
+ },
1199
+ {
1200
+ "epoch": 34.01,
1201
+ "grad_norm": 4.262657165527344,
1202
+ "learning_rate": 2.2890946502057614e-05,
1203
+ "loss": 1.178,
1204
+ "step": 1270
1205
+ },
1206
+ {
1207
+ "epoch": 34.01,
1208
+ "grad_norm": 4.219261169433594,
1209
+ "learning_rate": 2.2633744855967078e-05,
1210
+ "loss": 1.1956,
1211
+ "step": 1280
1212
+ },
1213
+ {
1214
+ "epoch": 34.01,
1215
+ "grad_norm": 5.266630172729492,
1216
+ "learning_rate": 2.2376543209876542e-05,
1217
+ "loss": 1.3261,
1218
+ "step": 1290
1219
+ },
1220
+ {
1221
+ "epoch": 34.02,
1222
+ "eval_accuracy": 0.32398316970546986,
1223
+ "eval_loss": 1.8429114818572998,
1224
+ "eval_runtime": 315.1637,
1225
+ "eval_samples_per_second": 4.525,
1226
+ "eval_steps_per_second": 0.143,
1227
+ "step": 1295
1228
+ },
1229
+ {
1230
+ "epoch": 35.0,
1231
+ "grad_norm": 4.098344326019287,
1232
+ "learning_rate": 2.211934156378601e-05,
1233
+ "loss": 1.2926,
1234
+ "step": 1300
1235
+ },
1236
+ {
1237
+ "epoch": 35.01,
1238
+ "grad_norm": 5.764149188995361,
1239
+ "learning_rate": 2.1862139917695473e-05,
1240
+ "loss": 1.2233,
1241
+ "step": 1310
1242
+ },
1243
+ {
1244
+ "epoch": 35.01,
1245
+ "grad_norm": 5.489772319793701,
1246
+ "learning_rate": 2.1604938271604937e-05,
1247
+ "loss": 1.2497,
1248
+ "step": 1320
1249
+ },
1250
+ {
1251
+ "epoch": 35.02,
1252
+ "grad_norm": 4.76821756362915,
1253
+ "learning_rate": 2.1347736625514405e-05,
1254
+ "loss": 1.2834,
1255
+ "step": 1330
1256
+ },
1257
+ {
1258
+ "epoch": 35.02,
1259
+ "eval_accuracy": 0.2903225806451613,
1260
+ "eval_loss": 1.9099663496017456,
1261
+ "eval_runtime": 329.1538,
1262
+ "eval_samples_per_second": 4.332,
1263
+ "eval_steps_per_second": 0.137,
1264
+ "step": 1332
1265
+ },
1266
+ {
1267
+ "epoch": 36.0,
1268
+ "grad_norm": 3.996829032897949,
1269
+ "learning_rate": 2.1090534979423872e-05,
1270
+ "loss": 1.1749,
1271
+ "step": 1340
1272
+ },
1273
+ {
1274
+ "epoch": 36.01,
1275
+ "grad_norm": 4.916750431060791,
1276
+ "learning_rate": 2.0833333333333336e-05,
1277
+ "loss": 1.2347,
1278
+ "step": 1350
1279
+ },
1280
+ {
1281
+ "epoch": 36.01,
1282
+ "grad_norm": 6.722217559814453,
1283
+ "learning_rate": 2.05761316872428e-05,
1284
+ "loss": 1.2953,
1285
+ "step": 1360
1286
+ },
1287
+ {
1288
+ "epoch": 36.02,
1289
+ "eval_accuracy": 0.3078541374474053,
1290
+ "eval_loss": 1.9537488222122192,
1291
+ "eval_runtime": 321.6832,
1292
+ "eval_samples_per_second": 4.433,
1293
+ "eval_steps_per_second": 0.14,
1294
+ "step": 1369
1295
+ },
1296
+ {
1297
+ "epoch": 37.0,
1298
+ "grad_norm": 5.202242374420166,
1299
+ "learning_rate": 2.0318930041152264e-05,
1300
+ "loss": 1.2877,
1301
+ "step": 1370
1302
+ },
1303
+ {
1304
+ "epoch": 37.01,
1305
+ "grad_norm": 4.0624589920043945,
1306
+ "learning_rate": 2.006172839506173e-05,
1307
+ "loss": 1.1455,
1308
+ "step": 1380
1309
+ },
1310
+ {
1311
+ "epoch": 37.01,
1312
+ "grad_norm": 4.560797691345215,
1313
+ "learning_rate": 1.9804526748971195e-05,
1314
+ "loss": 1.3001,
1315
+ "step": 1390
1316
+ },
1317
+ {
1318
+ "epoch": 37.01,
1319
+ "grad_norm": 4.483878135681152,
1320
+ "learning_rate": 1.954732510288066e-05,
1321
+ "loss": 1.2118,
1322
+ "step": 1400
1323
+ },
1324
+ {
1325
+ "epoch": 37.02,
1326
+ "eval_accuracy": 0.26367461430575034,
1327
+ "eval_loss": 1.989565134048462,
1328
+ "eval_runtime": 319.6592,
1329
+ "eval_samples_per_second": 4.461,
1330
+ "eval_steps_per_second": 0.141,
1331
+ "step": 1406
1332
+ },
1333
+ {
1334
+ "epoch": 38.0,
1335
+ "grad_norm": 4.500958442687988,
1336
+ "learning_rate": 1.9290123456790123e-05,
1337
+ "loss": 1.1902,
1338
+ "step": 1410
1339
+ },
1340
+ {
1341
+ "epoch": 38.01,
1342
+ "grad_norm": 4.762270927429199,
1343
+ "learning_rate": 1.903292181069959e-05,
1344
+ "loss": 1.2485,
1345
+ "step": 1420
1346
+ },
1347
+ {
1348
+ "epoch": 38.01,
1349
+ "grad_norm": 4.195743560791016,
1350
+ "learning_rate": 1.8775720164609054e-05,
1351
+ "loss": 1.1943,
1352
+ "step": 1430
1353
+ },
1354
+ {
1355
+ "epoch": 38.02,
1356
+ "grad_norm": 5.3452582359313965,
1357
+ "learning_rate": 1.8518518518518518e-05,
1358
+ "loss": 1.1953,
1359
+ "step": 1440
1360
+ },
1361
+ {
1362
+ "epoch": 38.02,
1363
+ "eval_accuracy": 0.25455820476858343,
1364
+ "eval_loss": 2.028036117553711,
1365
+ "eval_runtime": 320.9292,
1366
+ "eval_samples_per_second": 4.443,
1367
+ "eval_steps_per_second": 0.14,
1368
+ "step": 1443
1369
+ },
1370
+ {
1371
+ "epoch": 39.0,
1372
+ "grad_norm": 4.503300189971924,
1373
+ "learning_rate": 1.8261316872427982e-05,
1374
+ "loss": 1.197,
1375
+ "step": 1450
1376
+ },
1377
+ {
1378
+ "epoch": 39.01,
1379
+ "grad_norm": 4.667981147766113,
1380
+ "learning_rate": 1.800411522633745e-05,
1381
+ "loss": 1.1577,
1382
+ "step": 1460
1383
+ },
1384
+ {
1385
+ "epoch": 39.01,
1386
+ "grad_norm": 5.093000888824463,
1387
+ "learning_rate": 1.7746913580246917e-05,
1388
+ "loss": 1.258,
1389
+ "step": 1470
1390
+ },
1391
+ {
1392
+ "epoch": 39.02,
1393
+ "grad_norm": 11.824995994567871,
1394
+ "learning_rate": 1.748971193415638e-05,
1395
+ "loss": 1.1522,
1396
+ "step": 1480
1397
+ },
1398
+ {
1399
+ "epoch": 39.02,
1400
+ "eval_accuracy": 0.28892005610098176,
1401
+ "eval_loss": 2.011441469192505,
1402
+ "eval_runtime": 335.1859,
1403
+ "eval_samples_per_second": 4.254,
1404
+ "eval_steps_per_second": 0.134,
1405
+ "step": 1480
1406
+ },
1407
+ {
1408
+ "epoch": 40.0,
1409
+ "grad_norm": 4.728548526763916,
1410
+ "learning_rate": 1.7232510288065845e-05,
1411
+ "loss": 1.1526,
1412
+ "step": 1490
1413
+ },
1414
+ {
1415
+ "epoch": 40.01,
1416
+ "grad_norm": 4.5145087242126465,
1417
+ "learning_rate": 1.697530864197531e-05,
1418
+ "loss": 1.1564,
1419
+ "step": 1500
1420
+ },
1421
+ {
1422
+ "epoch": 40.01,
1423
+ "grad_norm": 5.764113426208496,
1424
+ "learning_rate": 1.6718106995884776e-05,
1425
+ "loss": 1.2288,
1426
+ "step": 1510
1427
+ },
1428
+ {
1429
+ "epoch": 40.02,
1430
+ "eval_accuracy": 0.30504908835904626,
1431
+ "eval_loss": 2.0061888694763184,
1432
+ "eval_runtime": 326.9579,
1433
+ "eval_samples_per_second": 4.361,
1434
+ "eval_steps_per_second": 0.138,
1435
+ "step": 1517
1436
+ },
1437
+ {
1438
+ "epoch": 41.0,
1439
+ "grad_norm": 5.014638423919678,
1440
+ "learning_rate": 1.646090534979424e-05,
1441
+ "loss": 1.0934,
1442
+ "step": 1520
1443
+ },
1444
+ {
1445
+ "epoch": 41.01,
1446
+ "grad_norm": 5.846142768859863,
1447
+ "learning_rate": 1.6203703703703704e-05,
1448
+ "loss": 1.1862,
1449
+ "step": 1530
1450
+ },
1451
+ {
1452
+ "epoch": 41.01,
1453
+ "grad_norm": 4.802533149719238,
1454
+ "learning_rate": 1.5946502057613168e-05,
1455
+ "loss": 1.1589,
1456
+ "step": 1540
1457
+ },
1458
+ {
1459
+ "epoch": 41.02,
1460
+ "grad_norm": 5.541863918304443,
1461
+ "learning_rate": 1.5689300411522635e-05,
1462
+ "loss": 1.2318,
1463
+ "step": 1550
1464
+ },
1465
+ {
1466
+ "epoch": 41.02,
1467
+ "eval_accuracy": 0.24894810659186536,
1468
+ "eval_loss": 2.063317060470581,
1469
+ "eval_runtime": 329.5785,
1470
+ "eval_samples_per_second": 4.327,
1471
+ "eval_steps_per_second": 0.137,
1472
+ "step": 1554
1473
+ },
1474
+ {
1475
+ "epoch": 42.0,
1476
+ "grad_norm": 5.3696136474609375,
1477
+ "learning_rate": 1.54320987654321e-05,
1478
+ "loss": 1.1666,
1479
+ "step": 1560
1480
+ },
1481
+ {
1482
+ "epoch": 42.01,
1483
+ "grad_norm": 4.8603291511535645,
1484
+ "learning_rate": 1.5174897119341563e-05,
1485
+ "loss": 1.1357,
1486
+ "step": 1570
1487
+ },
1488
+ {
1489
+ "epoch": 42.01,
1490
+ "grad_norm": 4.631726264953613,
1491
+ "learning_rate": 1.4917695473251029e-05,
1492
+ "loss": 1.2216,
1493
+ "step": 1580
1494
+ },
1495
+ {
1496
+ "epoch": 42.02,
1497
+ "grad_norm": 4.8486328125,
1498
+ "learning_rate": 1.4660493827160496e-05,
1499
+ "loss": 1.1571,
1500
+ "step": 1590
1501
+ },
1502
+ {
1503
+ "epoch": 42.02,
1504
+ "eval_accuracy": 0.2826086956521739,
1505
+ "eval_loss": 2.047227382659912,
1506
+ "eval_runtime": 342.7107,
1507
+ "eval_samples_per_second": 4.161,
1508
+ "eval_steps_per_second": 0.131,
1509
+ "step": 1591
1510
+ },
1511
+ {
1512
+ "epoch": 43.0,
1513
+ "grad_norm": 6.484289169311523,
1514
+ "learning_rate": 1.440329218106996e-05,
1515
+ "loss": 1.2965,
1516
+ "step": 1600
1517
+ },
1518
+ {
1519
+ "epoch": 43.01,
1520
+ "grad_norm": 4.621644496917725,
1521
+ "learning_rate": 1.4146090534979426e-05,
1522
+ "loss": 1.1669,
1523
+ "step": 1610
1524
+ },
1525
+ {
1526
+ "epoch": 43.01,
1527
+ "grad_norm": 5.708728313446045,
1528
+ "learning_rate": 1.388888888888889e-05,
1529
+ "loss": 1.155,
1530
+ "step": 1620
1531
+ },
1532
+ {
1533
+ "epoch": 43.02,
1534
+ "eval_accuracy": 0.2755960729312763,
1535
+ "eval_loss": 2.0339248180389404,
1536
+ "eval_runtime": 311.8249,
1537
+ "eval_samples_per_second": 4.573,
1538
+ "eval_steps_per_second": 0.144,
1539
+ "step": 1628
1540
+ },
1541
+ {
1542
+ "epoch": 44.0,
1543
+ "grad_norm": 5.512135028839111,
1544
+ "learning_rate": 1.3631687242798355e-05,
1545
+ "loss": 1.1624,
1546
+ "step": 1630
1547
+ },
1548
+ {
1549
+ "epoch": 44.01,
1550
+ "grad_norm": 4.800120830535889,
1551
+ "learning_rate": 1.3374485596707819e-05,
1552
+ "loss": 1.1393,
1553
+ "step": 1640
1554
+ },
1555
+ {
1556
+ "epoch": 44.01,
1557
+ "grad_norm": 5.686243534088135,
1558
+ "learning_rate": 1.3117283950617285e-05,
1559
+ "loss": 1.1579,
1560
+ "step": 1650
1561
+ },
1562
+ {
1563
+ "epoch": 44.01,
1564
+ "grad_norm": 6.732748985290527,
1565
+ "learning_rate": 1.2860082304526749e-05,
1566
+ "loss": 1.1448,
1567
+ "step": 1660
1568
+ },
1569
+ {
1570
+ "epoch": 44.02,
1571
+ "eval_accuracy": 0.28190743338008417,
1572
+ "eval_loss": 2.0284523963928223,
1573
+ "eval_runtime": 306.358,
1574
+ "eval_samples_per_second": 4.655,
1575
+ "eval_steps_per_second": 0.147,
1576
+ "step": 1665
1577
+ },
1578
+ {
1579
+ "epoch": 45.0,
1580
+ "grad_norm": 3.9400506019592285,
1581
+ "learning_rate": 1.2602880658436214e-05,
1582
+ "loss": 1.1748,
1583
+ "step": 1670
1584
+ },
1585
+ {
1586
+ "epoch": 45.01,
1587
+ "grad_norm": 5.26138973236084,
1588
+ "learning_rate": 1.2345679012345678e-05,
1589
+ "loss": 1.1204,
1590
+ "step": 1680
1591
+ },
1592
+ {
1593
+ "epoch": 45.01,
1594
+ "grad_norm": 5.398032188415527,
1595
+ "learning_rate": 1.2088477366255146e-05,
1596
+ "loss": 1.1114,
1597
+ "step": 1690
1598
+ },
1599
+ {
1600
+ "epoch": 45.02,
1601
+ "grad_norm": 5.264804840087891,
1602
+ "learning_rate": 1.183127572016461e-05,
1603
+ "loss": 1.2088,
1604
+ "step": 1700
1605
+ },
1606
+ {
1607
+ "epoch": 45.02,
1608
+ "eval_accuracy": 0.2917251051893408,
1609
+ "eval_loss": 2.0568900108337402,
1610
+ "eval_runtime": 323.5679,
1611
+ "eval_samples_per_second": 4.407,
1612
+ "eval_steps_per_second": 0.139,
1613
+ "step": 1702
1614
+ },
1615
+ {
1616
+ "epoch": 46.0,
1617
+ "grad_norm": 4.0183210372924805,
1618
+ "learning_rate": 1.1574074074074075e-05,
1619
+ "loss": 1.139,
1620
+ "step": 1710
1621
+ },
1622
+ {
1623
+ "epoch": 46.01,
1624
+ "grad_norm": 5.541755676269531,
1625
+ "learning_rate": 1.1316872427983539e-05,
1626
+ "loss": 1.113,
1627
+ "step": 1720
1628
+ },
1629
+ {
1630
+ "epoch": 46.01,
1631
+ "grad_norm": 4.693772315979004,
1632
+ "learning_rate": 1.1059670781893005e-05,
1633
+ "loss": 1.1469,
1634
+ "step": 1730
1635
+ },
1636
+ {
1637
+ "epoch": 46.02,
1638
+ "eval_accuracy": 0.28892005610098176,
1639
+ "eval_loss": 2.1201205253601074,
1640
+ "eval_runtime": 315.1636,
1641
+ "eval_samples_per_second": 4.525,
1642
+ "eval_steps_per_second": 0.143,
1643
+ "step": 1739
1644
+ },
1645
+ {
1646
+ "epoch": 47.0,
1647
+ "grad_norm": 4.991947174072266,
1648
+ "learning_rate": 1.0802469135802469e-05,
1649
+ "loss": 1.1472,
1650
+ "step": 1740
1651
+ },
1652
+ {
1653
+ "epoch": 47.01,
1654
+ "grad_norm": 5.496129035949707,
1655
+ "learning_rate": 1.0545267489711936e-05,
1656
+ "loss": 1.1196,
1657
+ "step": 1750
1658
+ },
1659
+ {
1660
+ "epoch": 47.01,
1661
+ "grad_norm": 3.884612798690796,
1662
+ "learning_rate": 1.02880658436214e-05,
1663
+ "loss": 1.1656,
1664
+ "step": 1760
1665
+ },
1666
+ {
1667
+ "epoch": 47.01,
1668
+ "grad_norm": 5.605456352233887,
1669
+ "learning_rate": 1.0030864197530866e-05,
1670
+ "loss": 1.1332,
1671
+ "step": 1770
1672
+ },
1673
+ {
1674
+ "epoch": 47.02,
1675
+ "eval_accuracy": 0.2980364656381487,
1676
+ "eval_loss": 2.0940372943878174,
1677
+ "eval_runtime": 315.9032,
1678
+ "eval_samples_per_second": 4.514,
1679
+ "eval_steps_per_second": 0.142,
1680
+ "step": 1776
1681
+ },
1682
+ {
1683
+ "epoch": 48.0,
1684
+ "grad_norm": 6.020764350891113,
1685
+ "learning_rate": 9.77366255144033e-06,
1686
+ "loss": 1.1526,
1687
+ "step": 1780
1688
+ },
1689
+ {
1690
+ "epoch": 48.01,
1691
+ "grad_norm": 5.015161991119385,
1692
+ "learning_rate": 9.516460905349795e-06,
1693
+ "loss": 1.0704,
1694
+ "step": 1790
1695
+ },
1696
+ {
1697
+ "epoch": 48.01,
1698
+ "grad_norm": 6.186102390289307,
1699
+ "learning_rate": 9.259259259259259e-06,
1700
+ "loss": 1.1129,
1701
+ "step": 1800
1702
+ },
1703
+ {
1704
+ "epoch": 48.02,
1705
+ "grad_norm": 5.92779016494751,
1706
+ "learning_rate": 9.002057613168725e-06,
1707
+ "loss": 1.1608,
1708
+ "step": 1810
1709
+ },
1710
+ {
1711
+ "epoch": 48.02,
1712
+ "eval_accuracy": 0.27419354838709675,
1713
+ "eval_loss": 2.1327767372131348,
1714
+ "eval_runtime": 314.1204,
1715
+ "eval_samples_per_second": 4.54,
1716
+ "eval_steps_per_second": 0.143,
1717
+ "step": 1813
1718
+ },
1719
+ {
1720
+ "epoch": 49.0,
1721
+ "grad_norm": 4.245399475097656,
1722
+ "learning_rate": 8.74485596707819e-06,
1723
+ "loss": 1.0758,
1724
+ "step": 1820
1725
+ },
1726
+ {
1727
+ "epoch": 49.01,
1728
+ "grad_norm": 4.762213230133057,
1729
+ "learning_rate": 8.487654320987654e-06,
1730
+ "loss": 1.1185,
1731
+ "step": 1830
1732
+ },
1733
+ {
1734
+ "epoch": 49.01,
1735
+ "grad_norm": 5.845604419708252,
1736
+ "learning_rate": 8.23045267489712e-06,
1737
+ "loss": 1.1212,
1738
+ "step": 1840
1739
+ },
1740
+ {
1741
+ "epoch": 49.02,
1742
+ "grad_norm": 9.226399421691895,
1743
+ "learning_rate": 7.973251028806584e-06,
1744
+ "loss": 1.0913,
1745
+ "step": 1850
1746
+ },
1747
+ {
1748
+ "epoch": 49.02,
1749
+ "eval_accuracy": 0.28190743338008417,
1750
+ "eval_loss": 2.1332595348358154,
1751
+ "eval_runtime": 339.9555,
1752
+ "eval_samples_per_second": 4.195,
1753
+ "eval_steps_per_second": 0.132,
1754
+ "step": 1850
1755
+ },
1756
+ {
1757
+ "epoch": 50.0,
1758
+ "grad_norm": 5.531272888183594,
1759
+ "learning_rate": 7.71604938271605e-06,
1760
+ "loss": 1.0946,
1761
+ "step": 1860
1762
+ },
1763
+ {
1764
+ "epoch": 50.01,
1765
+ "grad_norm": 4.868674278259277,
1766
+ "learning_rate": 7.458847736625514e-06,
1767
+ "loss": 1.0852,
1768
+ "step": 1870
1769
+ },
1770
+ {
1771
+ "epoch": 50.01,
1772
+ "grad_norm": 6.927389621734619,
1773
+ "learning_rate": 7.20164609053498e-06,
1774
+ "loss": 1.1204,
1775
+ "step": 1880
1776
+ },
1777
+ {
1778
+ "epoch": 50.02,
1779
+ "eval_accuracy": 0.28751753155680226,
1780
+ "eval_loss": 2.201735258102417,
1781
+ "eval_runtime": 323.7365,
1782
+ "eval_samples_per_second": 4.405,
1783
+ "eval_steps_per_second": 0.139,
1784
+ "step": 1887
1785
+ },
1786
+ {
1787
+ "epoch": 51.0,
1788
+ "grad_norm": 4.928629398345947,
1789
+ "learning_rate": 6.944444444444445e-06,
1790
+ "loss": 1.0441,
1791
+ "step": 1890
1792
+ },
1793
+ {
1794
+ "epoch": 51.01,
1795
+ "grad_norm": 6.279874801635742,
1796
+ "learning_rate": 6.6872427983539096e-06,
1797
+ "loss": 1.128,
1798
+ "step": 1900
1799
+ },
1800
+ {
1801
+ "epoch": 51.01,
1802
+ "grad_norm": 6.877956390380859,
1803
+ "learning_rate": 6.430041152263374e-06,
1804
+ "loss": 1.0809,
1805
+ "step": 1910
1806
+ },
1807
+ {
1808
+ "epoch": 51.02,
1809
+ "grad_norm": 5.27929162979126,
1810
+ "learning_rate": 6.172839506172839e-06,
1811
+ "loss": 1.1052,
1812
+ "step": 1920
1813
+ },
1814
+ {
1815
+ "epoch": 51.02,
1816
+ "eval_accuracy": 0.29102384291725103,
1817
+ "eval_loss": 2.198258876800537,
1818
+ "eval_runtime": 316.9611,
1819
+ "eval_samples_per_second": 4.499,
1820
+ "eval_steps_per_second": 0.142,
1821
+ "step": 1924
1822
+ },
1823
+ {
1824
+ "epoch": 52.0,
1825
+ "grad_norm": 6.32177734375,
1826
+ "learning_rate": 5.915637860082305e-06,
1827
+ "loss": 1.1071,
1828
+ "step": 1930
1829
+ },
1830
+ {
1831
+ "epoch": 52.01,
1832
+ "grad_norm": 6.743009090423584,
1833
+ "learning_rate": 5.6584362139917696e-06,
1834
+ "loss": 1.0288,
1835
+ "step": 1940
1836
+ },
1837
+ {
1838
+ "epoch": 52.01,
1839
+ "grad_norm": 5.617419719696045,
1840
+ "learning_rate": 5.401234567901234e-06,
1841
+ "loss": 1.0651,
1842
+ "step": 1950
1843
+ },
1844
+ {
1845
+ "epoch": 52.02,
1846
+ "grad_norm": 5.337407112121582,
1847
+ "learning_rate": 5.1440329218107e-06,
1848
+ "loss": 1.0817,
1849
+ "step": 1960
1850
+ },
1851
+ {
1852
+ "epoch": 52.02,
1853
+ "eval_accuracy": 0.2966339410939691,
1854
+ "eval_loss": 2.1909656524658203,
1855
+ "eval_runtime": 312.5047,
1856
+ "eval_samples_per_second": 4.563,
1857
+ "eval_steps_per_second": 0.144,
1858
+ "step": 1961
1859
+ },
1860
+ {
1861
+ "epoch": 53.0,
1862
+ "grad_norm": 5.591818809509277,
1863
+ "learning_rate": 4.886831275720165e-06,
1864
+ "loss": 1.1214,
1865
+ "step": 1970
1866
+ },
1867
+ {
1868
+ "epoch": 53.01,
1869
+ "grad_norm": 4.366575241088867,
1870
+ "learning_rate": 4.6296296296296296e-06,
1871
+ "loss": 1.0782,
1872
+ "step": 1980
1873
+ },
1874
+ {
1875
+ "epoch": 53.01,
1876
+ "grad_norm": 5.795986175537109,
1877
+ "learning_rate": 4.372427983539095e-06,
1878
+ "loss": 1.0696,
1879
+ "step": 1990
1880
+ },
1881
+ {
1882
+ "epoch": 53.02,
1883
+ "eval_accuracy": 0.2805049088359046,
1884
+ "eval_loss": 2.199398994445801,
1885
+ "eval_runtime": 321.7456,
1886
+ "eval_samples_per_second": 4.432,
1887
+ "eval_steps_per_second": 0.14,
1888
+ "step": 1998
1889
+ },
1890
+ {
1891
+ "epoch": 54.0,
1892
+ "grad_norm": 4.923856258392334,
1893
+ "learning_rate": 4.11522633744856e-06,
1894
+ "loss": 1.1027,
1895
+ "step": 2000
1896
+ },
1897
+ {
1898
+ "epoch": 54.01,
1899
+ "grad_norm": 4.788481712341309,
1900
+ "learning_rate": 3.858024691358025e-06,
1901
+ "loss": 1.115,
1902
+ "step": 2010
1903
+ },
1904
+ {
1905
+ "epoch": 54.01,
1906
+ "grad_norm": 4.911264419555664,
1907
+ "learning_rate": 3.60082304526749e-06,
1908
+ "loss": 1.0481,
1909
+ "step": 2020
1910
+ },
1911
+ {
1912
+ "epoch": 54.01,
1913
+ "grad_norm": 6.288609981536865,
1914
+ "learning_rate": 3.3436213991769548e-06,
1915
+ "loss": 1.0465,
1916
+ "step": 2030
1917
+ },
1918
+ {
1919
+ "epoch": 54.02,
1920
+ "eval_accuracy": 0.28541374474053294,
1921
+ "eval_loss": 2.2044293880462646,
1922
+ "eval_runtime": 314.8977,
1923
+ "eval_samples_per_second": 4.528,
1924
+ "eval_steps_per_second": 0.143,
1925
+ "step": 2035
1926
+ },
1927
+ {
1928
+ "epoch": 55.0,
1929
+ "grad_norm": 6.370595455169678,
1930
+ "learning_rate": 3.0864197530864196e-06,
1931
+ "loss": 1.0229,
1932
+ "step": 2040
1933
+ },
1934
+ {
1935
+ "epoch": 55.01,
1936
+ "grad_norm": 5.17075252532959,
1937
+ "learning_rate": 2.8292181069958848e-06,
1938
+ "loss": 1.077,
1939
+ "step": 2050
1940
+ },
1941
+ {
1942
+ "epoch": 55.01,
1943
+ "grad_norm": 6.249576568603516,
1944
+ "learning_rate": 2.57201646090535e-06,
1945
+ "loss": 1.0598,
1946
+ "step": 2060
1947
+ },
1948
+ {
1949
+ "epoch": 55.02,
1950
+ "grad_norm": 6.3195390701293945,
1951
+ "learning_rate": 2.3148148148148148e-06,
1952
+ "loss": 1.0786,
1953
+ "step": 2070
1954
+ },
1955
+ {
1956
+ "epoch": 55.02,
1957
+ "eval_accuracy": 0.2903225806451613,
1958
+ "eval_loss": 2.182746648788452,
1959
+ "eval_runtime": 320.0132,
1960
+ "eval_samples_per_second": 4.456,
1961
+ "eval_steps_per_second": 0.141,
1962
+ "step": 2072
1963
+ },
1964
+ {
1965
+ "epoch": 56.0,
1966
+ "grad_norm": 4.941202640533447,
1967
+ "learning_rate": 2.05761316872428e-06,
1968
+ "loss": 1.0504,
1969
+ "step": 2080
1970
+ },
1971
+ {
1972
+ "epoch": 56.01,
1973
+ "grad_norm": 6.709362030029297,
1974
+ "learning_rate": 1.800411522633745e-06,
1975
+ "loss": 1.0698,
1976
+ "step": 2090
1977
+ },
1978
+ {
1979
+ "epoch": 56.01,
1980
+ "grad_norm": 4.710355758666992,
1981
+ "learning_rate": 1.5432098765432098e-06,
1982
+ "loss": 1.0293,
1983
+ "step": 2100
1984
+ },
1985
+ {
1986
+ "epoch": 56.02,
1987
+ "eval_accuracy": 0.29312762973352036,
1988
+ "eval_loss": 2.1847376823425293,
1989
+ "eval_runtime": 311.2651,
1990
+ "eval_samples_per_second": 4.581,
1991
+ "eval_steps_per_second": 0.145,
1992
+ "step": 2109
1993
+ },
1994
+ {
1995
+ "epoch": 57.0,
1996
+ "grad_norm": 5.022037506103516,
1997
+ "learning_rate": 1.286008230452675e-06,
1998
+ "loss": 1.0951,
1999
+ "step": 2110
2000
+ },
2001
+ {
2002
+ "epoch": 57.01,
2003
+ "grad_norm": 4.987078666687012,
2004
+ "learning_rate": 1.02880658436214e-06,
2005
+ "loss": 1.0677,
2006
+ "step": 2120
2007
+ },
2008
+ {
2009
+ "epoch": 57.01,
2010
+ "grad_norm": 6.031286716461182,
2011
+ "learning_rate": 7.716049382716049e-07,
2012
+ "loss": 1.0438,
2013
+ "step": 2130
2014
+ },
2015
+ {
2016
+ "epoch": 57.01,
2017
+ "grad_norm": 6.382678031921387,
2018
+ "learning_rate": 5.1440329218107e-07,
2019
+ "loss": 1.107,
2020
+ "step": 2140
2021
+ },
2022
+ {
2023
+ "epoch": 57.02,
2024
+ "eval_accuracy": 0.28751753155680226,
2025
+ "eval_loss": 2.1876089572906494,
2026
+ "eval_runtime": 327.4894,
2027
+ "eval_samples_per_second": 4.354,
2028
+ "eval_steps_per_second": 0.137,
2029
+ "step": 2146
2030
+ },
2031
+ {
2032
+ "epoch": 58.0,
2033
+ "grad_norm": 5.037903308868408,
2034
+ "learning_rate": 2.57201646090535e-07,
2035
+ "loss": 1.0106,
2036
+ "step": 2150
2037
+ },
2038
+ {
2039
+ "epoch": 58.01,
2040
+ "grad_norm": 5.149020671844482,
2041
+ "learning_rate": 0.0,
2042
+ "loss": 1.0571,
2043
+ "step": 2160
2044
+ },
2045
+ {
2046
+ "epoch": 58.01,
2047
+ "eval_accuracy": 0.288218793828892,
2048
+ "eval_loss": 2.188915967941284,
2049
+ "eval_runtime": 325.0665,
2050
+ "eval_samples_per_second": 4.387,
2051
+ "eval_steps_per_second": 0.138,
2052
+ "step": 2160
2053
+ },
2054
+ {
2055
+ "epoch": 58.01,
2056
+ "step": 2160,
2057
+ "total_flos": 8.417965007384071e+19,
2058
+ "train_loss": 1.3411160809022409,
2059
+ "train_runtime": 36316.6045,
2060
+ "train_samples_per_second": 1.903,
2061
+ "train_steps_per_second": 0.059
2062
+ },
2063
+ {
2064
+ "epoch": 58.01,
2065
+ "eval_accuracy": 0.3313807531380753,
2066
+ "eval_loss": 1.6799192428588867,
2067
+ "eval_runtime": 271.2848,
2068
+ "eval_samples_per_second": 4.405,
2069
+ "eval_steps_per_second": 0.14,
2070
+ "step": 2160
2071
+ },
2072
+ {
2073
+ "epoch": 58.01,
2074
+ "eval_accuracy": 0.3313807531380753,
2075
+ "eval_loss": 1.679498314857483,
2076
+ "eval_runtime": 279.1958,
2077
+ "eval_samples_per_second": 4.28,
2078
+ "eval_steps_per_second": 0.136,
2079
+ "step": 2160
2080
  }
2081
  ],
2082
  "logging_steps": 10,
2083
+ "max_steps": 2160,
2084
  "num_input_tokens_seen": 0,
2085
  "num_train_epochs": 9223372036854775807,
2086
  "save_steps": 500,
2087
+ "total_flos": 8.417965007384071e+19,
2088
+ "train_batch_size": 32,
2089
  "trial_name": null,
2090
  "trial_params": null
2091
  }