archit11 commited on
Commit
0030622
1 Parent(s): 4ddcb94

End of training

Browse files
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.25,
3
- "eval_accuracy": 0.11733870967741936,
4
- "eval_loss": 2.6881909370422363,
5
- "eval_runtime": 956.0688,
6
- "eval_samples_per_second": 5.188,
7
- "eval_steps_per_second": 1.297
8
  }
 
1
  {
2
+ "epoch": 7.12,
3
+ "eval_accuracy": 0.225,
4
+ "eval_loss": 2.501384973526001,
5
+ "eval_runtime": 945.2716,
6
+ "eval_samples_per_second": 5.247,
7
+ "eval_steps_per_second": 0.656
8
  }
runs/Mar17_19-35-48_500f3adbbe43/events.out.tfevents.1710729400.500f3adbbe43.26.1 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f1b342a8b2c6d00ac8c36a307ac47d42b946637655330563b51a955a801c871
3
- size 411
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a8ae09f173638e3a9b5cf378ee3b2bb4856529d7bcaabbb980f10e6a01cbe73
3
+ size 734
test_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.25,
3
- "eval_accuracy": 0.11733870967741936,
4
- "eval_loss": 2.6881909370422363,
5
- "eval_runtime": 956.0688,
6
- "eval_samples_per_second": 5.188,
7
- "eval_steps_per_second": 1.297
8
  }
 
1
  {
2
+ "epoch": 7.12,
3
+ "eval_accuracy": 0.225,
4
+ "eval_loss": 2.501384973526001,
5
+ "eval_runtime": 945.2716,
6
+ "eval_samples_per_second": 5.247,
7
+ "eval_steps_per_second": 0.656
8
  }
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "best_metric": 0.18403036166096146,
3
- "best_model_checkpoint": "videomae-base-finetuned-ucfcrime-full2/checkpoint-700",
4
- "epoch": 3.25,
5
  "eval_steps": 500,
6
  "global_step": 700,
7
  "is_hyper_param_search": false,
@@ -10,1055 +10,601 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
- "grad_norm": 14.482718467712402,
14
- "learning_rate": 3.5714285714285714e-06,
15
- "loss": 2.6878,
16
- "step": 5
17
- },
18
- {
19
- "epoch": 0.01,
20
- "grad_norm": 21.39861297607422,
21
  "learning_rate": 7.142857142857143e-06,
22
- "loss": 2.6074,
23
  "step": 10
24
  },
25
- {
26
- "epoch": 0.02,
27
- "grad_norm": 17.860671997070312,
28
- "learning_rate": 1.0714285714285714e-05,
29
- "loss": 2.7214,
30
- "step": 15
31
- },
32
  {
33
  "epoch": 0.03,
34
- "grad_norm": 16.79393768310547,
35
  "learning_rate": 1.4285714285714285e-05,
36
- "loss": 2.5776,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.04,
41
- "grad_norm": 14.525908470153809,
42
- "learning_rate": 1.785714285714286e-05,
43
- "loss": 2.689,
44
- "step": 25
45
- },
46
- {
47
- "epoch": 0.04,
48
- "grad_norm": 12.049698829650879,
49
  "learning_rate": 2.1428571428571428e-05,
50
- "loss": 2.6499,
51
  "step": 30
52
  },
53
- {
54
- "epoch": 0.05,
55
- "grad_norm": 13.192450523376465,
56
- "learning_rate": 2.5e-05,
57
- "loss": 2.5365,
58
- "step": 35
59
- },
60
  {
61
  "epoch": 0.06,
62
- "grad_norm": 10.041342735290527,
63
  "learning_rate": 2.857142857142857e-05,
64
- "loss": 2.4945,
65
  "step": 40
66
  },
67
- {
68
- "epoch": 0.06,
69
- "grad_norm": 10.616167068481445,
70
- "learning_rate": 3.2142857142857144e-05,
71
- "loss": 2.7132,
72
- "step": 45
73
- },
74
  {
75
  "epoch": 0.07,
76
- "grad_norm": 11.666309356689453,
77
  "learning_rate": 3.571428571428572e-05,
78
- "loss": 2.5329,
79
  "step": 50
80
  },
81
- {
82
- "epoch": 0.08,
83
- "grad_norm": 8.298579216003418,
84
- "learning_rate": 3.928571428571429e-05,
85
- "loss": 2.4331,
86
- "step": 55
87
- },
88
  {
89
  "epoch": 0.09,
90
- "grad_norm": 9.002656936645508,
91
  "learning_rate": 4.2857142857142856e-05,
92
- "loss": 2.3991,
93
  "step": 60
94
  },
95
- {
96
- "epoch": 0.09,
97
- "grad_norm": 10.443252563476562,
98
- "learning_rate": 4.642857142857143e-05,
99
- "loss": 2.7184,
100
- "step": 65
101
- },
102
  {
103
  "epoch": 0.1,
104
- "grad_norm": 8.290853500366211,
105
  "learning_rate": 5e-05,
106
- "loss": 2.4219,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.11,
111
- "grad_norm": 8.40210247039795,
112
- "learning_rate": 4.960317460317461e-05,
113
- "loss": 2.3317,
114
- "step": 75
115
- },
116
- {
117
- "epoch": 0.11,
118
- "grad_norm": 9.528456687927246,
119
  "learning_rate": 4.9206349206349204e-05,
120
- "loss": 2.7167,
121
  "step": 80
122
  },
123
  {
124
- "epoch": 0.12,
125
- "grad_norm": 10.601034164428711,
126
- "learning_rate": 4.880952380952381e-05,
127
- "loss": 2.5201,
128
- "step": 85
 
 
129
  },
130
  {
131
- "epoch": 0.13,
132
- "grad_norm": 10.306778907775879,
133
  "learning_rate": 4.841269841269841e-05,
134
- "loss": 2.4928,
135
  "step": 90
136
  },
137
  {
138
- "epoch": 0.14,
139
- "grad_norm": 10.532262802124023,
140
- "learning_rate": 4.801587301587302e-05,
141
- "loss": 2.4065,
142
- "step": 95
143
- },
144
- {
145
- "epoch": 0.14,
146
- "grad_norm": 10.122940063476562,
147
  "learning_rate": 4.761904761904762e-05,
148
- "loss": 2.7106,
149
  "step": 100
150
  },
151
  {
152
- "epoch": 0.15,
153
- "grad_norm": 9.78131103515625,
154
- "learning_rate": 4.722222222222222e-05,
155
- "loss": 2.5572,
156
- "step": 105
157
- },
158
- {
159
- "epoch": 0.16,
160
- "grad_norm": 10.649138450622559,
161
  "learning_rate": 4.682539682539683e-05,
162
- "loss": 2.5781,
163
  "step": 110
164
  },
165
  {
166
- "epoch": 0.16,
167
- "grad_norm": 7.493427753448486,
168
- "learning_rate": 4.642857142857143e-05,
169
- "loss": 2.6114,
170
- "step": 115
171
- },
172
- {
173
- "epoch": 0.17,
174
- "grad_norm": 7.953385829925537,
175
  "learning_rate": 4.603174603174603e-05,
176
- "loss": 2.6687,
177
  "step": 120
178
  },
179
  {
180
- "epoch": 0.18,
181
- "grad_norm": 8.582961082458496,
182
- "learning_rate": 4.563492063492064e-05,
183
- "loss": 2.6787,
184
- "step": 125
185
- },
186
- {
187
- "epoch": 0.19,
188
- "grad_norm": 8.697818756103516,
189
  "learning_rate": 4.523809523809524e-05,
190
- "loss": 2.5829,
191
  "step": 130
192
  },
193
  {
194
- "epoch": 0.19,
195
- "grad_norm": 9.811901092529297,
196
- "learning_rate": 4.4841269841269846e-05,
197
- "loss": 2.6221,
198
- "step": 135
199
- },
200
- {
201
- "epoch": 0.2,
202
- "grad_norm": 9.103715896606445,
203
  "learning_rate": 4.4444444444444447e-05,
204
- "loss": 2.6806,
205
  "step": 140
206
  },
207
  {
208
- "epoch": 0.21,
209
- "grad_norm": 8.211301803588867,
210
- "learning_rate": 4.404761904761905e-05,
211
- "loss": 2.6136,
212
- "step": 145
213
- },
214
- {
215
- "epoch": 0.21,
216
- "grad_norm": 10.060530662536621,
217
  "learning_rate": 4.3650793650793655e-05,
218
- "loss": 2.6461,
219
  "step": 150
220
  },
221
  {
222
- "epoch": 0.22,
223
- "grad_norm": 10.009857177734375,
224
- "learning_rate": 4.3253968253968256e-05,
225
- "loss": 2.571,
226
- "step": 155
227
- },
228
- {
229
- "epoch": 0.23,
230
- "grad_norm": 11.69568157196045,
231
  "learning_rate": 4.2857142857142856e-05,
232
- "loss": 2.758,
233
  "step": 160
234
  },
235
  {
236
- "epoch": 0.24,
237
- "grad_norm": 8.031876564025879,
238
- "learning_rate": 4.2460317460317464e-05,
239
- "loss": 2.4975,
240
- "step": 165
241
- },
242
- {
243
- "epoch": 0.24,
244
- "grad_norm": 8.202585220336914,
245
  "learning_rate": 4.2063492063492065e-05,
246
- "loss": 2.719,
247
  "step": 170
248
  },
249
  {
250
- "epoch": 0.25,
251
- "grad_norm": 8.405922889709473,
252
- "learning_rate": 4.166666666666667e-05,
253
- "loss": 2.6564,
254
- "step": 175
255
- },
256
- {
257
- "epoch": 0.25,
258
- "eval_accuracy": 0.17190058044351839,
259
- "eval_loss": 2.5464329719543457,
260
- "eval_runtime": 2790.7409,
261
- "eval_samples_per_second": 4.815,
262
- "eval_steps_per_second": 1.204,
263
- "step": 175
264
  },
265
  {
266
- "epoch": 1.01,
267
- "grad_norm": 8.090776443481445,
268
  "learning_rate": 4.126984126984127e-05,
269
- "loss": 2.5153,
270
  "step": 180
271
  },
272
  {
273
- "epoch": 1.01,
274
- "grad_norm": 7.668066024780273,
275
- "learning_rate": 4.0873015873015874e-05,
276
- "loss": 2.6691,
277
- "step": 185
278
- },
279
- {
280
- "epoch": 1.02,
281
- "grad_norm": 7.451665878295898,
282
  "learning_rate": 4.047619047619048e-05,
283
- "loss": 2.455,
284
  "step": 190
285
  },
286
  {
287
- "epoch": 1.03,
288
- "grad_norm": 6.811094284057617,
289
- "learning_rate": 4.007936507936508e-05,
290
- "loss": 2.7581,
291
- "step": 195
292
- },
293
- {
294
- "epoch": 1.04,
295
- "grad_norm": 8.437348365783691,
296
  "learning_rate": 3.968253968253968e-05,
297
- "loss": 2.6151,
298
  "step": 200
299
  },
300
  {
301
- "epoch": 1.04,
302
- "grad_norm": 9.83616828918457,
303
- "learning_rate": 3.928571428571429e-05,
304
- "loss": 2.7035,
305
- "step": 205
306
- },
307
- {
308
- "epoch": 1.05,
309
- "grad_norm": 7.769816875457764,
310
  "learning_rate": 3.888888888888889e-05,
311
- "loss": 2.6254,
312
  "step": 210
313
  },
314
  {
315
- "epoch": 1.06,
316
- "grad_norm": 11.614434242248535,
317
- "learning_rate": 3.84920634920635e-05,
318
- "loss": 2.5088,
319
- "step": 215
320
- },
321
- {
322
- "epoch": 1.06,
323
- "grad_norm": 7.872964859008789,
324
  "learning_rate": 3.809523809523809e-05,
325
- "loss": 2.4313,
326
  "step": 220
327
  },
328
  {
329
- "epoch": 1.07,
330
- "grad_norm": 9.000481605529785,
331
- "learning_rate": 3.76984126984127e-05,
332
- "loss": 2.6319,
333
- "step": 225
334
- },
335
- {
336
- "epoch": 1.08,
337
- "grad_norm": 7.4448394775390625,
338
  "learning_rate": 3.730158730158731e-05,
339
- "loss": 2.6064,
340
  "step": 230
341
  },
342
  {
343
- "epoch": 1.09,
344
- "grad_norm": 9.439050674438477,
345
- "learning_rate": 3.690476190476191e-05,
346
- "loss": 2.4276,
347
- "step": 235
348
- },
349
- {
350
- "epoch": 1.09,
351
- "grad_norm": 7.45771598815918,
352
  "learning_rate": 3.650793650793651e-05,
353
- "loss": 2.6828,
354
  "step": 240
355
  },
356
  {
357
- "epoch": 1.1,
358
- "grad_norm": 9.655268669128418,
359
- "learning_rate": 3.611111111111111e-05,
360
- "loss": 2.4295,
361
- "step": 245
362
- },
363
- {
364
- "epoch": 1.11,
365
- "grad_norm": 7.304623603820801,
366
  "learning_rate": 3.571428571428572e-05,
367
- "loss": 2.3941,
368
  "step": 250
369
  },
370
  {
371
- "epoch": 1.11,
372
- "grad_norm": 8.936468124389648,
373
- "learning_rate": 3.5317460317460324e-05,
374
- "loss": 2.5164,
375
- "step": 255
376
- },
377
- {
378
- "epoch": 1.12,
379
- "grad_norm": 6.037725448608398,
380
  "learning_rate": 3.492063492063492e-05,
381
- "loss": 2.3629,
382
  "step": 260
383
  },
384
  {
385
- "epoch": 1.13,
386
- "grad_norm": 9.806131362915039,
387
- "learning_rate": 3.4523809523809526e-05,
388
- "loss": 2.1973,
389
- "step": 265
 
 
390
  },
391
  {
392
- "epoch": 1.14,
393
- "grad_norm": 7.448783874511719,
394
  "learning_rate": 3.412698412698413e-05,
395
- "loss": 2.6785,
396
  "step": 270
397
  },
398
  {
399
- "epoch": 1.14,
400
- "grad_norm": 7.902461528778076,
401
- "learning_rate": 3.3730158730158734e-05,
402
- "loss": 2.5689,
403
- "step": 275
404
- },
405
- {
406
- "epoch": 1.15,
407
- "grad_norm": 8.133126258850098,
408
  "learning_rate": 3.3333333333333335e-05,
409
- "loss": 2.705,
410
  "step": 280
411
  },
412
  {
413
- "epoch": 1.16,
414
- "grad_norm": 7.387323379516602,
415
- "learning_rate": 3.2936507936507936e-05,
416
- "loss": 2.4184,
417
- "step": 285
418
- },
419
- {
420
- "epoch": 1.16,
421
- "grad_norm": 7.738976001739502,
422
  "learning_rate": 3.253968253968254e-05,
423
- "loss": 2.5947,
424
  "step": 290
425
  },
426
  {
427
- "epoch": 1.17,
428
- "grad_norm": 7.019045352935791,
429
- "learning_rate": 3.2142857142857144e-05,
430
- "loss": 2.3066,
431
- "step": 295
432
- },
433
- {
434
- "epoch": 1.18,
435
- "grad_norm": 7.5370588302612305,
436
  "learning_rate": 3.1746031746031745e-05,
437
- "loss": 2.4445,
438
  "step": 300
439
  },
440
  {
441
- "epoch": 1.19,
442
- "grad_norm": 7.962660312652588,
443
- "learning_rate": 3.134920634920635e-05,
444
- "loss": 2.4807,
445
- "step": 305
446
- },
447
- {
448
- "epoch": 1.19,
449
- "grad_norm": 8.314027786254883,
450
  "learning_rate": 3.095238095238095e-05,
451
- "loss": 2.5981,
452
  "step": 310
453
  },
454
  {
455
- "epoch": 1.2,
456
- "grad_norm": 10.364777565002441,
457
- "learning_rate": 3.055555555555556e-05,
458
- "loss": 2.5207,
459
- "step": 315
460
- },
461
- {
462
- "epoch": 1.21,
463
- "grad_norm": 8.472153663635254,
464
  "learning_rate": 3.0158730158730158e-05,
465
- "loss": 2.5291,
466
  "step": 320
467
  },
468
  {
469
- "epoch": 1.21,
470
- "grad_norm": 8.788971900939941,
471
- "learning_rate": 2.9761904761904762e-05,
472
- "loss": 2.5238,
473
- "step": 325
474
- },
475
- {
476
- "epoch": 1.22,
477
- "grad_norm": 8.0416898727417,
478
  "learning_rate": 2.9365079365079366e-05,
479
- "loss": 2.5041,
480
  "step": 330
481
  },
482
  {
483
- "epoch": 1.23,
484
- "grad_norm": 8.097698211669922,
485
- "learning_rate": 2.8968253968253974e-05,
486
- "loss": 2.3977,
487
- "step": 335
488
- },
489
- {
490
- "epoch": 1.24,
491
- "grad_norm": 8.505749702453613,
492
  "learning_rate": 2.857142857142857e-05,
493
- "loss": 2.2704,
494
  "step": 340
495
  },
496
  {
497
- "epoch": 1.24,
498
- "grad_norm": 7.548125267028809,
499
- "learning_rate": 2.8174603174603175e-05,
500
- "loss": 1.993,
501
- "step": 345
502
- },
503
- {
504
- "epoch": 1.25,
505
- "grad_norm": 8.76427936553955,
506
  "learning_rate": 2.777777777777778e-05,
507
- "loss": 2.3285,
508
  "step": 350
509
  },
510
  {
511
- "epoch": 1.25,
512
- "eval_accuracy": 0.1255395148087513,
513
- "eval_loss": 2.866504192352295,
514
- "eval_runtime": 2724.8913,
515
- "eval_samples_per_second": 4.932,
516
- "eval_steps_per_second": 1.233,
517
- "step": 350
518
- },
519
- {
520
- "epoch": 2.01,
521
- "grad_norm": 9.777434349060059,
522
- "learning_rate": 2.7380952380952383e-05,
523
- "loss": 2.3064,
524
- "step": 355
525
  },
526
  {
527
- "epoch": 2.01,
528
- "grad_norm": 9.634062767028809,
529
  "learning_rate": 2.6984126984126984e-05,
530
- "loss": 2.6674,
531
  "step": 360
532
  },
533
  {
534
- "epoch": 2.02,
535
- "grad_norm": 7.418952941894531,
536
- "learning_rate": 2.6587301587301588e-05,
537
- "loss": 2.3623,
538
- "step": 365
539
- },
540
- {
541
- "epoch": 2.03,
542
- "grad_norm": 9.471118927001953,
543
  "learning_rate": 2.6190476190476192e-05,
544
- "loss": 2.5705,
545
  "step": 370
546
  },
547
  {
548
- "epoch": 2.04,
549
- "grad_norm": 8.835360527038574,
550
- "learning_rate": 2.5793650793650796e-05,
551
- "loss": 2.6313,
552
- "step": 375
553
- },
554
- {
555
- "epoch": 2.04,
556
- "grad_norm": 9.770784378051758,
557
  "learning_rate": 2.5396825396825397e-05,
558
- "loss": 2.3423,
559
  "step": 380
560
  },
561
  {
562
- "epoch": 2.05,
563
- "grad_norm": 11.036508560180664,
564
- "learning_rate": 2.5e-05,
565
- "loss": 2.4206,
566
- "step": 385
567
- },
568
- {
569
- "epoch": 2.06,
570
- "grad_norm": 10.88526725769043,
571
  "learning_rate": 2.4603174603174602e-05,
572
- "loss": 2.3682,
573
  "step": 390
574
  },
575
  {
576
- "epoch": 2.06,
577
- "grad_norm": 9.554337501525879,
578
- "learning_rate": 2.4206349206349206e-05,
579
- "loss": 2.3234,
580
- "step": 395
581
- },
582
- {
583
- "epoch": 2.07,
584
- "grad_norm": 10.911561012268066,
585
  "learning_rate": 2.380952380952381e-05,
586
- "loss": 2.7633,
587
  "step": 400
588
  },
589
  {
590
- "epoch": 2.08,
591
- "grad_norm": 8.518780708312988,
592
- "learning_rate": 2.3412698412698414e-05,
593
- "loss": 2.209,
594
- "step": 405
595
- },
596
- {
597
- "epoch": 2.09,
598
- "grad_norm": 10.131900787353516,
599
  "learning_rate": 2.3015873015873015e-05,
600
- "loss": 2.5754,
601
  "step": 410
602
  },
603
  {
604
- "epoch": 2.09,
605
- "grad_norm": 7.803353309631348,
606
- "learning_rate": 2.261904761904762e-05,
607
- "loss": 2.2815,
608
- "step": 415
609
- },
610
- {
611
- "epoch": 2.1,
612
- "grad_norm": 8.873805046081543,
613
  "learning_rate": 2.2222222222222223e-05,
614
- "loss": 2.4105,
615
  "step": 420
616
  },
617
  {
618
- "epoch": 2.11,
619
- "grad_norm": 10.056378364562988,
620
- "learning_rate": 2.1825396825396827e-05,
621
- "loss": 2.4577,
622
- "step": 425
623
- },
624
- {
625
- "epoch": 2.11,
626
- "grad_norm": 9.494528770446777,
627
  "learning_rate": 2.1428571428571428e-05,
628
- "loss": 2.5227,
629
  "step": 430
630
  },
631
  {
632
- "epoch": 2.12,
633
- "grad_norm": 7.30636739730835,
634
- "learning_rate": 2.1031746031746032e-05,
635
- "loss": 2.1842,
636
- "step": 435
637
- },
638
- {
639
- "epoch": 2.13,
640
- "grad_norm": 11.144043922424316,
641
  "learning_rate": 2.0634920634920636e-05,
642
- "loss": 2.3519,
643
  "step": 440
644
  },
645
  {
646
- "epoch": 2.14,
647
- "grad_norm": 7.896585941314697,
648
- "learning_rate": 2.023809523809524e-05,
649
- "loss": 2.6106,
650
- "step": 445
 
 
651
  },
652
  {
653
- "epoch": 2.14,
654
- "grad_norm": 9.881820678710938,
655
  "learning_rate": 1.984126984126984e-05,
656
- "loss": 2.5722,
657
  "step": 450
658
  },
659
  {
660
- "epoch": 2.15,
661
- "grad_norm": 7.789708137512207,
662
- "learning_rate": 1.9444444444444445e-05,
663
- "loss": 2.385,
664
- "step": 455
665
- },
666
- {
667
- "epoch": 2.16,
668
- "grad_norm": 15.662388801574707,
669
  "learning_rate": 1.9047619047619046e-05,
670
- "loss": 2.4097,
671
  "step": 460
672
  },
673
  {
674
- "epoch": 2.16,
675
- "grad_norm": 11.497434616088867,
676
- "learning_rate": 1.8650793650793654e-05,
677
- "loss": 2.3551,
678
- "step": 465
679
- },
680
- {
681
- "epoch": 2.17,
682
- "grad_norm": 8.680765151977539,
683
  "learning_rate": 1.8253968253968254e-05,
684
- "loss": 2.4937,
685
  "step": 470
686
  },
687
  {
688
- "epoch": 2.18,
689
- "grad_norm": 9.611832618713379,
690
- "learning_rate": 1.785714285714286e-05,
691
- "loss": 2.2117,
692
- "step": 475
693
- },
694
- {
695
- "epoch": 2.19,
696
- "grad_norm": 8.637816429138184,
697
  "learning_rate": 1.746031746031746e-05,
698
- "loss": 2.2063,
699
  "step": 480
700
  },
701
  {
702
- "epoch": 2.19,
703
- "grad_norm": 8.593392372131348,
704
- "learning_rate": 1.7063492063492063e-05,
705
- "loss": 2.5038,
706
- "step": 485
707
- },
708
- {
709
- "epoch": 2.2,
710
- "grad_norm": 9.811333656311035,
711
  "learning_rate": 1.6666666666666667e-05,
712
- "loss": 2.1935,
713
  "step": 490
714
  },
715
  {
716
- "epoch": 2.21,
717
- "grad_norm": 9.25893497467041,
718
- "learning_rate": 1.626984126984127e-05,
719
- "loss": 2.7302,
720
- "step": 495
721
- },
722
- {
723
- "epoch": 2.21,
724
- "grad_norm": 8.803714752197266,
725
  "learning_rate": 1.5873015873015872e-05,
726
- "loss": 2.3372,
727
  "step": 500
728
  },
729
  {
730
- "epoch": 2.22,
731
- "grad_norm": 10.265325546264648,
732
- "learning_rate": 1.5476190476190476e-05,
733
- "loss": 2.3112,
734
- "step": 505
735
- },
736
- {
737
- "epoch": 2.23,
738
- "grad_norm": 8.800261497497559,
739
  "learning_rate": 1.5079365079365079e-05,
740
- "loss": 2.1571,
741
  "step": 510
742
  },
743
  {
744
- "epoch": 2.24,
745
- "grad_norm": 9.066210746765137,
746
- "learning_rate": 1.4682539682539683e-05,
747
- "loss": 2.4612,
748
- "step": 515
749
- },
750
- {
751
- "epoch": 2.24,
752
- "grad_norm": 11.743760108947754,
753
  "learning_rate": 1.4285714285714285e-05,
754
- "loss": 2.3783,
755
  "step": 520
756
  },
757
  {
758
- "epoch": 2.25,
759
- "grad_norm": 7.336353302001953,
760
- "learning_rate": 1.388888888888889e-05,
761
- "loss": 2.3545,
762
- "step": 525
763
- },
764
- {
765
- "epoch": 2.25,
766
- "eval_accuracy": 0.13528798928411967,
767
- "eval_loss": 2.775129556655884,
768
- "eval_runtime": 2705.5287,
769
- "eval_samples_per_second": 4.967,
770
- "eval_steps_per_second": 1.242,
771
- "step": 525
772
  },
773
  {
774
- "epoch": 3.01,
775
- "grad_norm": 11.444567680358887,
776
  "learning_rate": 1.3492063492063492e-05,
777
- "loss": 2.7387,
778
  "step": 530
779
  },
780
  {
781
- "epoch": 3.01,
782
- "grad_norm": 9.247318267822266,
783
- "learning_rate": 1.3095238095238096e-05,
784
- "loss": 2.7047,
785
- "step": 535
786
- },
787
- {
788
- "epoch": 3.02,
789
- "grad_norm": 9.126996040344238,
790
  "learning_rate": 1.2698412698412699e-05,
791
- "loss": 2.3381,
792
  "step": 540
793
  },
794
  {
795
- "epoch": 3.03,
796
- "grad_norm": 8.4282865524292,
797
- "learning_rate": 1.2301587301587301e-05,
798
- "loss": 2.292,
799
- "step": 545
800
- },
801
- {
802
- "epoch": 3.04,
803
- "grad_norm": 9.275003433227539,
804
  "learning_rate": 1.1904761904761905e-05,
805
- "loss": 2.3291,
806
  "step": 550
807
  },
808
  {
809
- "epoch": 3.04,
810
- "grad_norm": 9.664990425109863,
811
- "learning_rate": 1.1507936507936508e-05,
812
- "loss": 2.3276,
813
- "step": 555
814
- },
815
- {
816
- "epoch": 3.05,
817
- "grad_norm": 9.327167510986328,
818
  "learning_rate": 1.1111111111111112e-05,
819
- "loss": 2.1339,
820
  "step": 560
821
  },
822
  {
823
- "epoch": 3.06,
824
- "grad_norm": 8.076509475708008,
825
- "learning_rate": 1.0714285714285714e-05,
826
- "loss": 2.1437,
827
- "step": 565
828
- },
829
- {
830
- "epoch": 3.06,
831
- "grad_norm": 10.702942848205566,
832
  "learning_rate": 1.0317460317460318e-05,
833
- "loss": 2.4547,
834
  "step": 570
835
  },
836
  {
837
- "epoch": 3.07,
838
- "grad_norm": 13.38920783996582,
839
- "learning_rate": 9.92063492063492e-06,
840
- "loss": 2.5248,
841
- "step": 575
842
- },
843
- {
844
- "epoch": 3.08,
845
- "grad_norm": 7.934788227081299,
846
  "learning_rate": 9.523809523809523e-06,
847
- "loss": 2.4829,
848
  "step": 580
849
  },
850
  {
851
- "epoch": 3.09,
852
- "grad_norm": 8.314118385314941,
853
- "learning_rate": 9.126984126984127e-06,
854
- "loss": 2.1392,
855
- "step": 585
856
- },
857
- {
858
- "epoch": 3.09,
859
- "grad_norm": 11.382800102233887,
860
  "learning_rate": 8.73015873015873e-06,
861
- "loss": 2.2964,
862
  "step": 590
863
  },
864
  {
865
- "epoch": 3.1,
866
- "grad_norm": 7.882699012756348,
867
- "learning_rate": 8.333333333333334e-06,
868
- "loss": 2.1905,
869
- "step": 595
870
- },
871
- {
872
- "epoch": 3.11,
873
- "grad_norm": 9.530142784118652,
874
  "learning_rate": 7.936507936507936e-06,
875
- "loss": 2.3815,
876
  "step": 600
877
  },
878
  {
879
- "epoch": 3.11,
880
- "grad_norm": 8.318964958190918,
881
- "learning_rate": 7.5396825396825394e-06,
882
- "loss": 2.4007,
883
- "step": 605
884
- },
885
- {
886
- "epoch": 3.12,
887
- "grad_norm": 10.5969820022583,
888
  "learning_rate": 7.142857142857143e-06,
889
- "loss": 2.3558,
890
  "step": 610
891
  },
892
  {
893
- "epoch": 3.13,
894
- "grad_norm": 9.391063690185547,
895
- "learning_rate": 6.746031746031746e-06,
896
- "loss": 2.3825,
897
- "step": 615
 
 
898
  },
899
  {
900
- "epoch": 3.14,
901
- "grad_norm": 9.316353797912598,
902
  "learning_rate": 6.349206349206349e-06,
903
- "loss": 2.4972,
904
  "step": 620
905
  },
906
  {
907
- "epoch": 3.14,
908
- "grad_norm": 9.888083457946777,
909
- "learning_rate": 5.9523809523809525e-06,
910
- "loss": 2.2321,
911
- "step": 625
912
- },
913
- {
914
- "epoch": 3.15,
915
- "grad_norm": 8.392168045043945,
916
  "learning_rate": 5.555555555555556e-06,
917
- "loss": 2.196,
918
  "step": 630
919
  },
920
  {
921
- "epoch": 3.16,
922
- "grad_norm": 9.588519096374512,
923
- "learning_rate": 5.158730158730159e-06,
924
- "loss": 2.2595,
925
- "step": 635
926
- },
927
- {
928
- "epoch": 3.16,
929
- "grad_norm": 8.25489616394043,
930
  "learning_rate": 4.7619047619047615e-06,
931
- "loss": 2.0537,
932
  "step": 640
933
  },
934
  {
935
- "epoch": 3.17,
936
- "grad_norm": 10.085835456848145,
937
- "learning_rate": 4.365079365079365e-06,
938
- "loss": 2.4893,
939
- "step": 645
940
- },
941
- {
942
- "epoch": 3.18,
943
- "grad_norm": 10.919437408447266,
944
  "learning_rate": 3.968253968253968e-06,
945
- "loss": 2.4032,
946
  "step": 650
947
  },
948
  {
949
- "epoch": 3.19,
950
- "grad_norm": 8.23149299621582,
951
- "learning_rate": 3.5714285714285714e-06,
952
- "loss": 2.1834,
953
- "step": 655
954
- },
955
- {
956
- "epoch": 3.19,
957
- "grad_norm": 11.596048355102539,
958
  "learning_rate": 3.1746031746031746e-06,
959
- "loss": 2.3302,
960
  "step": 660
961
  },
962
  {
963
- "epoch": 3.2,
964
- "grad_norm": 11.665302276611328,
965
- "learning_rate": 2.777777777777778e-06,
966
- "loss": 2.3287,
967
- "step": 665
968
- },
969
- {
970
- "epoch": 3.21,
971
- "grad_norm": 9.233076095581055,
972
  "learning_rate": 2.3809523809523808e-06,
973
- "loss": 2.1008,
974
  "step": 670
975
  },
976
  {
977
- "epoch": 3.21,
978
- "grad_norm": 11.429000854492188,
979
- "learning_rate": 1.984126984126984e-06,
980
- "loss": 2.2577,
981
- "step": 675
982
- },
983
- {
984
- "epoch": 3.22,
985
- "grad_norm": 8.783037185668945,
986
  "learning_rate": 1.5873015873015873e-06,
987
- "loss": 2.0016,
988
  "step": 680
989
  },
990
  {
991
- "epoch": 3.23,
992
- "grad_norm": 8.462553977966309,
993
- "learning_rate": 1.1904761904761904e-06,
994
- "loss": 2.3779,
995
- "step": 685
996
- },
997
- {
998
- "epoch": 3.24,
999
- "grad_norm": 9.461167335510254,
1000
  "learning_rate": 7.936507936507937e-07,
1001
- "loss": 1.9491,
1002
  "step": 690
1003
  },
1004
  {
1005
- "epoch": 3.24,
1006
- "grad_norm": 12.051897048950195,
1007
- "learning_rate": 3.9682539682539683e-07,
1008
- "loss": 2.0514,
1009
- "step": 695
1010
- },
1011
- {
1012
- "epoch": 3.25,
1013
- "grad_norm": 11.628158569335938,
1014
  "learning_rate": 0.0,
1015
- "loss": 1.9726,
1016
  "step": 700
1017
  },
1018
  {
1019
- "epoch": 3.25,
1020
- "eval_accuracy": 0.18403036166096146,
1021
- "eval_loss": 2.5616397857666016,
1022
- "eval_runtime": 2737.1764,
1023
- "eval_samples_per_second": 4.909,
1024
- "eval_steps_per_second": 1.228,
1025
  "step": 700
1026
  },
1027
  {
1028
- "epoch": 3.25,
1029
  "step": 700,
1030
- "total_flos": 3.4893542331777024e+18,
1031
- "train_loss": 2.449405036653791,
1032
- "train_runtime": 12421.7881,
1033
- "train_samples_per_second": 0.225,
1034
- "train_steps_per_second": 0.056
1035
- },
1036
- {
1037
- "epoch": 3.25,
1038
- "eval_accuracy": 0.11733870967741936,
1039
- "eval_loss": 2.6881906986236572,
1040
- "eval_runtime": 975.7268,
1041
- "eval_samples_per_second": 5.083,
1042
- "eval_steps_per_second": 1.271,
1043
  "step": 700
1044
  },
1045
  {
1046
- "epoch": 3.25,
1047
- "eval_accuracy": 0.11733870967741936,
1048
- "eval_loss": 2.6881909370422363,
1049
- "eval_runtime": 956.0688,
1050
- "eval_samples_per_second": 5.188,
1051
- "eval_steps_per_second": 1.297,
1052
  "step": 700
1053
  }
1054
  ],
1055
- "logging_steps": 5,
1056
  "max_steps": 700,
1057
  "num_input_tokens_seen": 0,
1058
  "num_train_epochs": 9223372036854775807,
1059
  "save_steps": 500,
1060
- "total_flos": 3.4893542331777024e+18,
1061
- "train_batch_size": 4,
1062
  "trial_name": null,
1063
  "trial_params": null
1064
  }
 
1
  {
2
+ "best_metric": 0.23411221908022026,
3
+ "best_model_checkpoint": "videomae-base-finetuned-ucfcrime-full2/checkpoint-616",
4
+ "epoch": 7.12,
5
  "eval_steps": 500,
6
  "global_step": 700,
7
  "is_hyper_param_search": false,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
+ "grad_norm": 9.479973793029785,
 
 
 
 
 
 
 
14
  "learning_rate": 7.142857142857143e-06,
15
+ "loss": 2.6626,
16
  "step": 10
17
  },
 
 
 
 
 
 
 
18
  {
19
  "epoch": 0.03,
20
+ "grad_norm": 9.114124298095703,
21
  "learning_rate": 1.4285714285714285e-05,
22
+ "loss": 2.6526,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.04,
27
+ "grad_norm": 8.465902328491211,
 
 
 
 
 
 
 
28
  "learning_rate": 2.1428571428571428e-05,
29
+ "loss": 2.6297,
30
  "step": 30
31
  },
 
 
 
 
 
 
 
32
  {
33
  "epoch": 0.06,
34
+ "grad_norm": 8.733708381652832,
35
  "learning_rate": 2.857142857142857e-05,
36
+ "loss": 2.5219,
37
  "step": 40
38
  },
 
 
 
 
 
 
 
39
  {
40
  "epoch": 0.07,
41
+ "grad_norm": 7.121254920959473,
42
  "learning_rate": 3.571428571428572e-05,
43
+ "loss": 2.5271,
44
  "step": 50
45
  },
 
 
 
 
 
 
 
46
  {
47
  "epoch": 0.09,
48
+ "grad_norm": 7.228466510772705,
49
  "learning_rate": 4.2857142857142856e-05,
50
+ "loss": 2.5257,
51
  "step": 60
52
  },
 
 
 
 
 
 
 
53
  {
54
  "epoch": 0.1,
55
+ "grad_norm": 7.514686584472656,
56
  "learning_rate": 5e-05,
57
+ "loss": 2.5942,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.11,
62
+ "grad_norm": 8.028372764587402,
 
 
 
 
 
 
 
63
  "learning_rate": 4.9206349206349204e-05,
64
+ "loss": 2.5836,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.13,
69
+ "eval_accuracy": 0.20799226075308827,
70
+ "eval_loss": 2.494396924972534,
71
+ "eval_runtime": 2684.2234,
72
+ "eval_samples_per_second": 5.006,
73
+ "eval_steps_per_second": 0.626,
74
+ "step": 88
75
  },
76
  {
77
+ "epoch": 1.0,
78
+ "grad_norm": 9.068702697753906,
79
  "learning_rate": 4.841269841269841e-05,
80
+ "loss": 2.5532,
81
  "step": 90
82
  },
83
  {
84
+ "epoch": 1.02,
85
+ "grad_norm": 9.19058609008789,
 
 
 
 
 
 
 
86
  "learning_rate": 4.761904761904762e-05,
87
+ "loss": 2.4752,
88
  "step": 100
89
  },
90
  {
91
+ "epoch": 1.03,
92
+ "grad_norm": 6.300441741943359,
 
 
 
 
 
 
 
93
  "learning_rate": 4.682539682539683e-05,
94
+ "loss": 2.512,
95
  "step": 110
96
  },
97
  {
98
+ "epoch": 1.05,
99
+ "grad_norm": 6.24762487411499,
 
 
 
 
 
 
 
100
  "learning_rate": 4.603174603174603e-05,
101
+ "loss": 2.6188,
102
  "step": 120
103
  },
104
  {
105
+ "epoch": 1.06,
106
+ "grad_norm": 6.59229850769043,
 
 
 
 
 
 
 
107
  "learning_rate": 4.523809523809524e-05,
108
+ "loss": 2.3887,
109
  "step": 130
110
  },
111
  {
112
+ "epoch": 1.07,
113
+ "grad_norm": 7.241441249847412,
 
 
 
 
 
 
 
114
  "learning_rate": 4.4444444444444447e-05,
115
+ "loss": 2.4597,
116
  "step": 140
117
  },
118
  {
119
+ "epoch": 1.09,
120
+ "grad_norm": 6.4515275955200195,
 
 
 
 
 
 
 
121
  "learning_rate": 4.3650793650793655e-05,
122
+ "loss": 2.4792,
123
  "step": 150
124
  },
125
  {
126
+ "epoch": 1.1,
127
+ "grad_norm": 6.551385879516602,
 
 
 
 
 
 
 
128
  "learning_rate": 4.2857142857142856e-05,
129
+ "loss": 2.4636,
130
  "step": 160
131
  },
132
  {
133
+ "epoch": 1.12,
134
+ "grad_norm": 5.237429618835449,
 
 
 
 
 
 
 
135
  "learning_rate": 4.2063492063492065e-05,
136
+ "loss": 2.3212,
137
  "step": 170
138
  },
139
  {
140
+ "epoch": 1.13,
141
+ "eval_accuracy": 0.17725852061318648,
142
+ "eval_loss": 2.5854506492614746,
143
+ "eval_runtime": 2671.3135,
144
+ "eval_samples_per_second": 5.03,
145
+ "eval_steps_per_second": 0.629,
146
+ "step": 176
 
 
 
 
 
 
 
147
  },
148
  {
149
+ "epoch": 2.01,
150
+ "grad_norm": 8.875786781311035,
151
  "learning_rate": 4.126984126984127e-05,
152
+ "loss": 2.3138,
153
  "step": 180
154
  },
155
  {
156
+ "epoch": 2.02,
157
+ "grad_norm": 7.928117275238037,
 
 
 
 
 
 
 
158
  "learning_rate": 4.047619047619048e-05,
159
+ "loss": 2.535,
160
  "step": 190
161
  },
162
  {
163
+ "epoch": 2.03,
164
+ "grad_norm": 7.844710350036621,
 
 
 
 
 
 
 
165
  "learning_rate": 3.968253968253968e-05,
166
+ "loss": 2.4373,
167
  "step": 200
168
  },
169
  {
170
+ "epoch": 2.05,
171
+ "grad_norm": 7.789624214172363,
 
 
 
 
 
 
 
172
  "learning_rate": 3.888888888888889e-05,
173
+ "loss": 2.3984,
174
  "step": 210
175
  },
176
  {
177
+ "epoch": 2.06,
178
+ "grad_norm": 7.342363357543945,
 
 
 
 
 
 
 
179
  "learning_rate": 3.809523809523809e-05,
180
+ "loss": 2.1948,
181
  "step": 220
182
  },
183
  {
184
+ "epoch": 2.08,
185
+ "grad_norm": 6.116743564605713,
 
 
 
 
 
 
 
186
  "learning_rate": 3.730158730158731e-05,
187
+ "loss": 2.3234,
188
  "step": 230
189
  },
190
  {
191
+ "epoch": 2.09,
192
+ "grad_norm": 8.138971328735352,
 
 
 
 
 
 
 
193
  "learning_rate": 3.650793650793651e-05,
194
+ "loss": 2.3012,
195
  "step": 240
196
  },
197
  {
198
+ "epoch": 2.11,
199
+ "grad_norm": 7.563113689422607,
 
 
 
 
 
 
 
200
  "learning_rate": 3.571428571428572e-05,
201
+ "loss": 2.4211,
202
  "step": 250
203
  },
204
  {
205
+ "epoch": 2.12,
206
+ "grad_norm": 8.180752754211426,
 
 
 
 
 
 
 
207
  "learning_rate": 3.492063492063492e-05,
208
+ "loss": 2.2333,
209
  "step": 260
210
  },
211
  {
212
+ "epoch": 2.13,
213
+ "eval_accuracy": 0.10455424914421789,
214
+ "eval_loss": 2.627028226852417,
215
+ "eval_runtime": 2669.9747,
216
+ "eval_samples_per_second": 5.033,
217
+ "eval_steps_per_second": 0.629,
218
+ "step": 264
219
  },
220
  {
221
+ "epoch": 3.01,
222
+ "grad_norm": 8.733146667480469,
223
  "learning_rate": 3.412698412698413e-05,
224
+ "loss": 2.3931,
225
  "step": 270
226
  },
227
  {
228
+ "epoch": 3.02,
229
+ "grad_norm": 8.307793617248535,
 
 
 
 
 
 
 
230
  "learning_rate": 3.3333333333333335e-05,
231
+ "loss": 2.1948,
232
  "step": 280
233
  },
234
  {
235
+ "epoch": 3.04,
236
+ "grad_norm": 9.008221626281738,
 
 
 
 
 
 
 
237
  "learning_rate": 3.253968253968254e-05,
238
+ "loss": 2.3545,
239
  "step": 290
240
  },
241
  {
242
+ "epoch": 3.05,
243
+ "grad_norm": 8.743054389953613,
 
 
 
 
 
 
 
244
  "learning_rate": 3.1746031746031745e-05,
245
+ "loss": 2.1182,
246
  "step": 300
247
  },
248
  {
249
+ "epoch": 3.07,
250
+ "grad_norm": 7.782846450805664,
 
 
 
 
 
 
 
251
  "learning_rate": 3.095238095238095e-05,
252
+ "loss": 2.2446,
253
  "step": 310
254
  },
255
  {
256
+ "epoch": 3.08,
257
+ "grad_norm": 9.437503814697266,
 
 
 
 
 
 
 
258
  "learning_rate": 3.0158730158730158e-05,
259
+ "loss": 2.3095,
260
  "step": 320
261
  },
262
  {
263
+ "epoch": 3.09,
264
+ "grad_norm": 8.776910781860352,
 
 
 
 
 
 
 
265
  "learning_rate": 2.9365079365079366e-05,
266
+ "loss": 2.2165,
267
  "step": 330
268
  },
269
  {
270
+ "epoch": 3.11,
271
+ "grad_norm": 7.262299060821533,
 
 
 
 
 
 
 
272
  "learning_rate": 2.857142857142857e-05,
273
+ "loss": 1.9958,
274
  "step": 340
275
  },
276
  {
277
+ "epoch": 3.12,
278
+ "grad_norm": 7.0002288818359375,
 
 
 
 
 
 
 
279
  "learning_rate": 2.777777777777778e-05,
280
+ "loss": 1.985,
281
  "step": 350
282
  },
283
  {
284
+ "epoch": 3.13,
285
+ "eval_accuracy": 0.2108944783449918,
286
+ "eval_loss": 2.4058477878570557,
287
+ "eval_runtime": 2676.3678,
288
+ "eval_samples_per_second": 5.021,
289
+ "eval_steps_per_second": 0.628,
290
+ "step": 352
 
 
 
 
 
 
 
291
  },
292
  {
293
+ "epoch": 4.01,
294
+ "grad_norm": 13.075250625610352,
295
  "learning_rate": 2.6984126984126984e-05,
296
+ "loss": 1.9445,
297
  "step": 360
298
  },
299
  {
300
+ "epoch": 4.03,
301
+ "grad_norm": 11.625497817993164,
 
 
 
 
 
 
 
302
  "learning_rate": 2.6190476190476192e-05,
303
+ "loss": 1.9884,
304
  "step": 370
305
  },
306
  {
307
+ "epoch": 4.04,
308
+ "grad_norm": 9.822185516357422,
 
 
 
 
 
 
 
309
  "learning_rate": 2.5396825396825397e-05,
310
+ "loss": 2.1677,
311
  "step": 380
312
  },
313
  {
314
+ "epoch": 4.05,
315
+ "grad_norm": 9.551963806152344,
 
 
 
 
 
 
 
316
  "learning_rate": 2.4603174603174602e-05,
317
+ "loss": 2.1942,
318
  "step": 390
319
  },
320
  {
321
+ "epoch": 4.07,
322
+ "grad_norm": 10.00942325592041,
 
 
 
 
 
 
 
323
  "learning_rate": 2.380952380952381e-05,
324
+ "loss": 2.1495,
325
  "step": 400
326
  },
327
  {
328
+ "epoch": 4.08,
329
+ "grad_norm": 7.805147171020508,
 
 
 
 
 
 
 
330
  "learning_rate": 2.3015873015873015e-05,
331
+ "loss": 2.1318,
332
  "step": 410
333
  },
334
  {
335
+ "epoch": 4.1,
336
+ "grad_norm": 10.390979766845703,
 
 
 
 
 
 
 
337
  "learning_rate": 2.2222222222222223e-05,
338
+ "loss": 2.0618,
339
  "step": 420
340
  },
341
  {
342
+ "epoch": 4.11,
343
+ "grad_norm": 8.040055274963379,
 
 
 
 
 
 
 
344
  "learning_rate": 2.1428571428571428e-05,
345
+ "loss": 2.1917,
346
  "step": 430
347
  },
348
  {
349
+ "epoch": 4.13,
350
+ "grad_norm": 19.272586822509766,
 
 
 
 
 
 
 
351
  "learning_rate": 2.0634920634920636e-05,
352
+ "loss": 2.194,
353
  "step": 440
354
  },
355
  {
356
+ "epoch": 4.13,
357
+ "eval_accuracy": 0.22354517041226374,
358
+ "eval_loss": 2.3653502464294434,
359
+ "eval_runtime": 2678.1886,
360
+ "eval_samples_per_second": 5.018,
361
+ "eval_steps_per_second": 0.627,
362
+ "step": 440
363
  },
364
  {
365
+ "epoch": 5.01,
366
+ "grad_norm": 8.98901081085205,
367
  "learning_rate": 1.984126984126984e-05,
368
+ "loss": 1.9553,
369
  "step": 450
370
  },
371
  {
372
+ "epoch": 5.03,
373
+ "grad_norm": 9.513757705688477,
 
 
 
 
 
 
 
374
  "learning_rate": 1.9047619047619046e-05,
375
+ "loss": 1.9827,
376
  "step": 460
377
  },
378
  {
379
+ "epoch": 5.04,
380
+ "grad_norm": 8.948848724365234,
 
 
 
 
 
 
 
381
  "learning_rate": 1.8253968253968254e-05,
382
+ "loss": 1.8577,
383
  "step": 470
384
  },
385
  {
386
+ "epoch": 5.06,
387
+ "grad_norm": 10.43582534790039,
 
 
 
 
 
 
 
388
  "learning_rate": 1.746031746031746e-05,
389
+ "loss": 1.8543,
390
  "step": 480
391
  },
392
  {
393
+ "epoch": 5.07,
394
+ "grad_norm": 10.061915397644043,
 
 
 
 
 
 
 
395
  "learning_rate": 1.6666666666666667e-05,
396
+ "loss": 2.0255,
397
  "step": 490
398
  },
399
  {
400
+ "epoch": 5.09,
401
+ "grad_norm": 9.82913875579834,
 
 
 
 
 
 
 
402
  "learning_rate": 1.5873015873015872e-05,
403
+ "loss": 2.0622,
404
  "step": 500
405
  },
406
  {
407
+ "epoch": 5.1,
408
+ "grad_norm": 7.94363260269165,
 
 
 
 
 
 
 
409
  "learning_rate": 1.5079365079365079e-05,
410
+ "loss": 1.9135,
411
  "step": 510
412
  },
413
  {
414
+ "epoch": 5.11,
415
+ "grad_norm": 8.4102144241333,
 
 
 
 
 
 
 
416
  "learning_rate": 1.4285714285714285e-05,
417
+ "loss": 1.9796,
418
  "step": 520
419
  },
420
  {
421
+ "epoch": 5.13,
422
+ "eval_accuracy": 0.22354517041226374,
423
+ "eval_loss": 2.2609341144561768,
424
+ "eval_runtime": 2669.641,
425
+ "eval_samples_per_second": 5.034,
426
+ "eval_steps_per_second": 0.629,
427
+ "step": 528
 
 
 
 
 
 
 
428
  },
429
  {
430
+ "epoch": 6.0,
431
+ "grad_norm": 9.409048080444336,
432
  "learning_rate": 1.3492063492063492e-05,
433
+ "loss": 1.7939,
434
  "step": 530
435
  },
436
  {
437
+ "epoch": 6.02,
438
+ "grad_norm": 19.078195571899414,
 
 
 
 
 
 
 
439
  "learning_rate": 1.2698412698412699e-05,
440
+ "loss": 1.7682,
441
  "step": 540
442
  },
443
  {
444
+ "epoch": 6.03,
445
+ "grad_norm": 8.306180000305176,
 
 
 
 
 
 
 
446
  "learning_rate": 1.1904761904761905e-05,
447
+ "loss": 1.8795,
448
  "step": 550
449
  },
450
  {
451
+ "epoch": 6.05,
452
+ "grad_norm": 6.871161460876465,
 
 
 
 
 
 
 
453
  "learning_rate": 1.1111111111111112e-05,
454
+ "loss": 1.8303,
455
  "step": 560
456
  },
457
  {
458
+ "epoch": 6.06,
459
+ "grad_norm": 11.705153465270996,
 
 
 
 
 
 
 
460
  "learning_rate": 1.0317460317460318e-05,
461
+ "loss": 1.7997,
462
  "step": 570
463
  },
464
  {
465
+ "epoch": 6.07,
466
+ "grad_norm": 13.744942665100098,
 
 
 
 
 
 
 
467
  "learning_rate": 9.523809523809523e-06,
468
+ "loss": 1.7532,
469
  "step": 580
470
  },
471
  {
472
+ "epoch": 6.09,
473
+ "grad_norm": 11.410266876220703,
 
 
 
 
 
 
 
474
  "learning_rate": 8.73015873015873e-06,
475
+ "loss": 1.7416,
476
  "step": 590
477
  },
478
  {
479
+ "epoch": 6.1,
480
+ "grad_norm": 15.524273872375488,
 
 
 
 
 
 
 
481
  "learning_rate": 7.936507936507936e-06,
482
+ "loss": 1.9096,
483
  "step": 600
484
  },
485
  {
486
+ "epoch": 6.12,
487
+ "grad_norm": 9.335684776306152,
 
 
 
 
 
 
 
488
  "learning_rate": 7.142857142857143e-06,
489
+ "loss": 1.8786,
490
  "step": 610
491
  },
492
  {
493
+ "epoch": 6.13,
494
+ "eval_accuracy": 0.23411221908022026,
495
+ "eval_loss": 2.2724666595458984,
496
+ "eval_runtime": 2674.1892,
497
+ "eval_samples_per_second": 5.025,
498
+ "eval_steps_per_second": 0.628,
499
+ "step": 616
500
  },
501
  {
502
+ "epoch": 7.01,
503
+ "grad_norm": 12.772497177124023,
504
  "learning_rate": 6.349206349206349e-06,
505
+ "loss": 1.6914,
506
  "step": 620
507
  },
508
  {
509
+ "epoch": 7.02,
510
+ "grad_norm": 11.079833984375,
 
 
 
 
 
 
 
511
  "learning_rate": 5.555555555555556e-06,
512
+ "loss": 1.6596,
513
  "step": 630
514
  },
515
  {
516
+ "epoch": 7.03,
517
+ "grad_norm": 11.924140930175781,
 
 
 
 
 
 
 
518
  "learning_rate": 4.7619047619047615e-06,
519
+ "loss": 1.7996,
520
  "step": 640
521
  },
522
  {
523
+ "epoch": 7.05,
524
+ "grad_norm": 12.28601360321045,
 
 
 
 
 
 
 
525
  "learning_rate": 3.968253968253968e-06,
526
+ "loss": 1.5564,
527
  "step": 650
528
  },
529
  {
530
+ "epoch": 7.06,
531
+ "grad_norm": 12.644978523254395,
 
 
 
 
 
 
 
532
  "learning_rate": 3.1746031746031746e-06,
533
+ "loss": 1.8793,
534
  "step": 660
535
  },
536
  {
537
+ "epoch": 7.08,
538
+ "grad_norm": 11.849274635314941,
 
 
 
 
 
 
 
539
  "learning_rate": 2.3809523809523808e-06,
540
+ "loss": 1.6106,
541
  "step": 670
542
  },
543
  {
544
+ "epoch": 7.09,
545
+ "grad_norm": 12.6458101272583,
 
 
 
 
 
 
 
546
  "learning_rate": 1.5873015873015873e-06,
547
+ "loss": 1.9571,
548
  "step": 680
549
  },
550
  {
551
+ "epoch": 7.11,
552
+ "grad_norm": 12.309945106506348,
 
 
 
 
 
 
 
553
  "learning_rate": 7.936507936507937e-07,
554
+ "loss": 1.6146,
555
  "step": 690
556
  },
557
  {
558
+ "epoch": 7.12,
559
+ "grad_norm": 11.306781768798828,
 
 
 
 
 
 
 
560
  "learning_rate": 0.0,
561
+ "loss": 1.71,
562
  "step": 700
563
  },
564
  {
565
+ "epoch": 7.12,
566
+ "eval_accuracy": 0.22257776454829586,
567
+ "eval_loss": 2.2227871417999268,
568
+ "eval_runtime": 2677.8894,
569
+ "eval_samples_per_second": 5.018,
570
+ "eval_steps_per_second": 0.627,
571
  "step": 700
572
  },
573
  {
574
+ "epoch": 7.12,
575
  "step": 700,
576
+ "total_flos": 6.943814924023628e+18,
577
+ "train_loss": 2.1489518587929863,
578
+ "train_runtime": 24272.1999,
579
+ "train_samples_per_second": 0.231,
580
+ "train_steps_per_second": 0.029
581
+ },
582
+ {
583
+ "epoch": 7.12,
584
+ "eval_accuracy": 0.225,
585
+ "eval_loss": 2.50138521194458,
586
+ "eval_runtime": 955.2551,
587
+ "eval_samples_per_second": 5.192,
588
+ "eval_steps_per_second": 0.649,
589
  "step": 700
590
  },
591
  {
592
+ "epoch": 7.12,
593
+ "eval_accuracy": 0.225,
594
+ "eval_loss": 2.501384973526001,
595
+ "eval_runtime": 945.2716,
596
+ "eval_samples_per_second": 5.247,
597
+ "eval_steps_per_second": 0.656,
598
  "step": 700
599
  }
600
  ],
601
+ "logging_steps": 10,
602
  "max_steps": 700,
603
  "num_input_tokens_seen": 0,
604
  "num_train_epochs": 9223372036854775807,
605
  "save_steps": 500,
606
+ "total_flos": 6.943814924023628e+18,
607
+ "train_batch_size": 8,
608
  "trial_name": null,
609
  "trial_params": null
610
  }