Raihan004 commited on
Commit
bcf18c7
1 Parent(s): 9c34ea1

🍻 cheers

Browse files
README.md CHANGED
@@ -2,6 +2,7 @@
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
 
5
  - generated_from_trainer
6
  datasets:
7
  - imagefolder
@@ -14,7 +15,7 @@ model-index:
14
  name: Image Classification
15
  type: image-classification
16
  dataset:
17
- name: imagefolder
18
  type: imagefolder
19
  config: default
20
  split: train
@@ -30,7 +31,7 @@ should probably proofread and complete it, then remove this comment. -->
30
 
31
  # Action_all_10_class
32
 
33
- This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
34
  It achieves the following results on the evaluation set:
35
  - Loss: 0.4146
36
  - Accuracy: 0.8785
 
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
5
+ - image-classification
6
  - generated_from_trainer
7
  datasets:
8
  - imagefolder
 
15
  name: Image Classification
16
  type: image-classification
17
  dataset:
18
+ name: Action_small_dataset
19
  type: imagefolder
20
  config: default
21
  split: train
 
31
 
32
  # Action_all_10_class
33
 
34
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the Action_small_dataset dataset.
35
  It achieves the following results on the evaluation set:
36
  - Loss: 0.4146
37
  - Accuracy: 0.8785
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_accuracy": 0.8660968660968661,
4
- "eval_loss": 0.42591235041618347,
5
- "eval_runtime": 11.9995,
6
- "eval_samples_per_second": 58.502,
7
- "eval_steps_per_second": 7.334,
8
- "total_flos": 1.539101261655982e+18,
9
- "train_loss": 0.6624564435108599,
10
- "train_runtime": 820.1847,
11
- "train_samples_per_second": 24.214,
12
- "train_steps_per_second": 1.518
13
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_accuracy": 0.8785276073619632,
4
+ "eval_loss": 0.4145749807357788,
5
+ "eval_runtime": 11.5662,
6
+ "eval_samples_per_second": 70.464,
7
+ "eval_steps_per_second": 8.819,
8
+ "total_flos": 1.789030847196795e+18,
9
+ "train_loss": 0.6435174308433664,
10
+ "train_runtime": 776.1745,
11
+ "train_samples_per_second": 29.742,
12
+ "train_steps_per_second": 1.862
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_accuracy": 0.8660968660968661,
4
- "eval_loss": 0.42591235041618347,
5
- "eval_runtime": 11.9995,
6
- "eval_samples_per_second": 58.502,
7
- "eval_steps_per_second": 7.334
8
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_accuracy": 0.8785276073619632,
4
+ "eval_loss": 0.4145749807357788,
5
+ "eval_runtime": 11.5662,
6
+ "eval_samples_per_second": 70.464,
7
+ "eval_steps_per_second": 8.819
8
  }
runs/Apr30_14-52-00_e47bc2c7ac19/events.out.tfevents.1714489523.e47bc2c7ac19.34.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b915e629320c5d37a2eb5c4eb05ad751b260745a4910295acff1688f81bc0ad7
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 5.0,
3
- "total_flos": 1.539101261655982e+18,
4
- "train_loss": 0.6624564435108599,
5
- "train_runtime": 820.1847,
6
- "train_samples_per_second": 24.214,
7
- "train_steps_per_second": 1.518
8
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "total_flos": 1.789030847196795e+18,
4
+ "train_loss": 0.6435174308433664,
5
+ "train_runtime": 776.1745,
6
+ "train_samples_per_second": 29.742,
7
+ "train_steps_per_second": 1.862
8
  }
trainer_state.json CHANGED
@@ -1,880 +1,1164 @@
1
  {
2
- "best_metric": 0.42591235041618347,
3
- "best_model_checkpoint": "Action_all_10_class/checkpoint-1100",
4
  "epoch": 5.0,
5
  "eval_steps": 100,
6
- "global_step": 1245,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.04,
13
- "learning_rate": 0.00019839357429718877,
14
- "loss": 2.2375,
 
15
  "step": 10
16
  },
17
  {
18
- "epoch": 0.08,
19
- "learning_rate": 0.00019678714859437752,
20
- "loss": 2.0451,
 
21
  "step": 20
22
  },
23
  {
24
- "epoch": 0.12,
25
- "learning_rate": 0.00019518072289156628,
26
- "loss": 1.8663,
 
27
  "step": 30
28
  },
29
  {
30
- "epoch": 0.16,
31
- "learning_rate": 0.00019357429718875504,
32
- "loss": 1.5435,
 
33
  "step": 40
34
  },
35
  {
36
- "epoch": 0.2,
37
- "learning_rate": 0.00019196787148594377,
38
- "loss": 1.4349,
 
39
  "step": 50
40
  },
41
  {
42
- "epoch": 0.24,
43
- "learning_rate": 0.00019036144578313252,
44
- "loss": 1.4104,
 
45
  "step": 60
46
  },
47
  {
48
- "epoch": 0.28,
49
- "learning_rate": 0.00018875502008032128,
50
- "loss": 1.2762,
 
51
  "step": 70
52
  },
53
  {
54
- "epoch": 0.32,
55
- "learning_rate": 0.00018714859437751004,
56
- "loss": 1.2306,
 
57
  "step": 80
58
  },
59
  {
60
- "epoch": 0.36,
61
- "learning_rate": 0.0001855421686746988,
62
- "loss": 1.2362,
 
63
  "step": 90
64
  },
65
  {
66
- "epoch": 0.4,
67
- "learning_rate": 0.0001840963855421687,
68
- "loss": 1.0772,
 
69
  "step": 100
70
  },
71
  {
72
- "epoch": 0.4,
73
- "eval_accuracy": 0.6737891737891738,
74
- "eval_loss": 1.0524766445159912,
75
- "eval_runtime": 12.1219,
76
- "eval_samples_per_second": 57.912,
77
- "eval_steps_per_second": 7.26,
78
  "step": 100
79
  },
80
  {
81
- "epoch": 0.44,
82
- "learning_rate": 0.00018248995983935744,
83
- "loss": 1.2642,
 
84
  "step": 110
85
  },
86
  {
87
- "epoch": 0.48,
88
- "learning_rate": 0.0001808835341365462,
89
- "loss": 1.0873,
 
90
  "step": 120
91
  },
92
  {
93
- "epoch": 0.52,
94
- "learning_rate": 0.00017927710843373496,
95
- "loss": 1.1482,
 
96
  "step": 130
97
  },
98
  {
99
- "epoch": 0.56,
100
- "learning_rate": 0.00017767068273092371,
101
- "loss": 0.9356,
 
102
  "step": 140
103
  },
104
  {
105
- "epoch": 0.6,
106
- "learning_rate": 0.00017606425702811247,
107
- "loss": 1.0534,
 
108
  "step": 150
109
  },
110
  {
111
- "epoch": 0.64,
112
- "learning_rate": 0.00017445783132530123,
113
- "loss": 1.1054,
 
114
  "step": 160
115
  },
116
  {
117
- "epoch": 0.68,
118
- "learning_rate": 0.00017285140562248996,
119
- "loss": 1.0417,
 
120
  "step": 170
121
  },
122
  {
123
- "epoch": 0.72,
124
- "learning_rate": 0.00017124497991967871,
125
- "loss": 0.9529,
 
126
  "step": 180
127
  },
128
  {
129
- "epoch": 0.76,
130
- "learning_rate": 0.00016963855421686747,
131
- "loss": 1.0454,
 
132
  "step": 190
133
  },
134
  {
135
- "epoch": 0.8,
136
- "learning_rate": 0.00016803212851405623,
137
- "loss": 0.9054,
 
138
  "step": 200
139
  },
140
  {
141
- "epoch": 0.8,
142
- "eval_accuracy": 0.8005698005698005,
143
- "eval_loss": 0.734963595867157,
144
- "eval_runtime": 11.844,
145
- "eval_samples_per_second": 59.27,
146
- "eval_steps_per_second": 7.43,
147
  "step": 200
148
  },
149
  {
150
- "epoch": 0.84,
151
- "learning_rate": 0.00016642570281124499,
152
- "loss": 1.0347,
 
153
  "step": 210
154
  },
155
  {
156
- "epoch": 0.88,
157
- "learning_rate": 0.00016481927710843374,
158
- "loss": 1.0496,
 
159
  "step": 220
160
  },
161
  {
162
- "epoch": 0.92,
163
- "learning_rate": 0.0001632128514056225,
164
- "loss": 0.8833,
 
165
  "step": 230
166
  },
167
  {
168
- "epoch": 0.96,
169
- "learning_rate": 0.00016160642570281126,
170
- "loss": 0.9624,
 
171
  "step": 240
172
  },
173
  {
174
- "epoch": 1.0,
175
- "learning_rate": 0.00016,
176
- "loss": 0.8419,
 
177
  "step": 250
178
  },
179
  {
180
- "epoch": 1.04,
181
- "learning_rate": 0.00015839357429718874,
182
- "loss": 0.7875,
 
183
  "step": 260
184
  },
185
  {
186
- "epoch": 1.08,
187
- "learning_rate": 0.0001567871485943775,
188
- "loss": 0.7934,
 
189
  "step": 270
190
  },
191
  {
192
- "epoch": 1.12,
193
- "learning_rate": 0.00015518072289156626,
194
- "loss": 0.897,
 
195
  "step": 280
196
  },
197
  {
198
- "epoch": 1.16,
199
- "learning_rate": 0.00015357429718875501,
200
- "loss": 0.7199,
 
201
  "step": 290
202
  },
203
  {
204
- "epoch": 1.2,
205
- "learning_rate": 0.00015196787148594377,
206
- "loss": 0.9093,
 
207
  "step": 300
208
  },
209
  {
210
- "epoch": 1.2,
211
- "eval_accuracy": 0.8162393162393162,
212
- "eval_loss": 0.6519011855125427,
213
- "eval_runtime": 12.7334,
214
- "eval_samples_per_second": 55.13,
215
- "eval_steps_per_second": 6.911,
216
  "step": 300
217
  },
218
  {
219
- "epoch": 1.24,
220
- "learning_rate": 0.00015036144578313253,
221
- "loss": 0.8521,
 
222
  "step": 310
223
  },
224
  {
225
- "epoch": 1.29,
226
- "learning_rate": 0.00014875502008032128,
227
- "loss": 0.9056,
 
228
  "step": 320
229
  },
230
  {
231
- "epoch": 1.33,
232
- "learning_rate": 0.00014714859437751004,
233
- "loss": 0.8005,
 
234
  "step": 330
235
  },
236
  {
237
- "epoch": 1.37,
238
- "learning_rate": 0.0001455421686746988,
239
- "loss": 0.8129,
 
240
  "step": 340
241
  },
242
  {
243
- "epoch": 1.41,
244
- "learning_rate": 0.00014393574297188756,
245
- "loss": 0.6784,
 
246
  "step": 350
247
  },
248
  {
249
- "epoch": 1.45,
250
- "learning_rate": 0.0001423293172690763,
251
- "loss": 0.7941,
 
252
  "step": 360
253
  },
254
  {
255
- "epoch": 1.49,
256
- "learning_rate": 0.00014072289156626507,
257
- "loss": 0.7838,
 
258
  "step": 370
259
  },
260
  {
261
- "epoch": 1.53,
262
- "learning_rate": 0.00013911646586345383,
263
- "loss": 0.7424,
 
264
  "step": 380
265
  },
266
  {
267
- "epoch": 1.57,
268
- "learning_rate": 0.00013751004016064258,
269
- "loss": 0.7194,
 
270
  "step": 390
271
  },
272
  {
273
- "epoch": 1.61,
274
- "learning_rate": 0.00013590361445783134,
275
- "loss": 0.6264,
 
276
  "step": 400
277
  },
278
  {
279
- "epoch": 1.61,
280
- "eval_accuracy": 0.8105413105413105,
281
- "eval_loss": 0.6052153706550598,
282
- "eval_runtime": 10.8279,
283
- "eval_samples_per_second": 64.833,
284
- "eval_steps_per_second": 8.127,
285
  "step": 400
286
  },
287
  {
288
- "epoch": 1.65,
289
- "learning_rate": 0.0001342971887550201,
290
- "loss": 0.7401,
 
291
  "step": 410
292
  },
293
  {
294
- "epoch": 1.69,
295
- "learning_rate": 0.00013269076305220885,
296
- "loss": 0.5516,
 
297
  "step": 420
298
  },
299
  {
300
- "epoch": 1.73,
301
- "learning_rate": 0.0001310843373493976,
302
- "loss": 0.6522,
 
303
  "step": 430
304
  },
305
  {
306
- "epoch": 1.77,
307
- "learning_rate": 0.00012947791164658637,
308
- "loss": 0.7528,
 
309
  "step": 440
310
  },
311
  {
312
- "epoch": 1.81,
313
- "learning_rate": 0.00012787148594377512,
314
- "loss": 0.5984,
 
315
  "step": 450
316
  },
317
  {
318
- "epoch": 1.85,
319
- "learning_rate": 0.00012626506024096385,
320
- "loss": 0.6913,
 
321
  "step": 460
322
  },
323
  {
324
- "epoch": 1.89,
325
- "learning_rate": 0.0001246586345381526,
326
- "loss": 0.7507,
 
327
  "step": 470
328
  },
329
  {
330
- "epoch": 1.93,
331
- "learning_rate": 0.00012305220883534137,
332
- "loss": 0.8109,
 
333
  "step": 480
334
  },
335
  {
336
- "epoch": 1.97,
337
- "learning_rate": 0.00012144578313253012,
338
- "loss": 0.7775,
 
339
  "step": 490
340
  },
341
  {
342
- "epoch": 2.01,
343
- "learning_rate": 0.00011983935742971888,
344
- "loss": 0.9279,
 
345
  "step": 500
346
  },
347
  {
348
- "epoch": 2.01,
349
- "eval_accuracy": 0.7934472934472935,
350
- "eval_loss": 0.6441792249679565,
351
- "eval_runtime": 12.779,
352
- "eval_samples_per_second": 54.934,
353
- "eval_steps_per_second": 6.886,
354
  "step": 500
355
  },
356
  {
357
- "epoch": 2.05,
358
- "learning_rate": 0.00011839357429718876,
359
- "loss": 0.7877,
 
360
  "step": 510
361
  },
362
  {
363
- "epoch": 2.09,
364
- "learning_rate": 0.00011678714859437752,
365
- "loss": 0.519,
 
366
  "step": 520
367
  },
368
  {
369
- "epoch": 2.13,
370
- "learning_rate": 0.00011518072289156627,
371
- "loss": 0.5842,
 
372
  "step": 530
373
  },
374
  {
375
- "epoch": 2.17,
376
- "learning_rate": 0.00011357429718875503,
377
- "loss": 0.4631,
 
378
  "step": 540
379
  },
380
  {
381
- "epoch": 2.21,
382
- "learning_rate": 0.00011196787148594379,
383
- "loss": 0.6516,
 
384
  "step": 550
385
  },
386
  {
387
- "epoch": 2.25,
388
- "learning_rate": 0.00011036144578313254,
389
- "loss": 0.782,
 
390
  "step": 560
391
  },
392
  {
393
- "epoch": 2.29,
394
- "learning_rate": 0.00010875502008032127,
395
- "loss": 0.4786,
 
396
  "step": 570
397
  },
398
  {
399
- "epoch": 2.33,
400
- "learning_rate": 0.00010714859437751003,
401
- "loss": 0.5342,
 
402
  "step": 580
403
  },
404
  {
405
- "epoch": 2.37,
406
- "learning_rate": 0.00010554216867469879,
407
- "loss": 0.5114,
 
408
  "step": 590
409
  },
410
  {
411
- "epoch": 2.41,
412
- "learning_rate": 0.00010393574297188754,
413
- "loss": 0.5023,
 
414
  "step": 600
415
  },
416
  {
417
- "epoch": 2.41,
418
- "eval_accuracy": 0.8262108262108262,
419
- "eval_loss": 0.5365344882011414,
420
- "eval_runtime": 13.1708,
421
- "eval_samples_per_second": 53.3,
422
- "eval_steps_per_second": 6.681,
423
  "step": 600
424
  },
425
  {
426
- "epoch": 2.45,
427
- "learning_rate": 0.0001023293172690763,
428
- "loss": 0.6584,
 
429
  "step": 610
430
  },
431
  {
432
- "epoch": 2.49,
433
- "learning_rate": 0.00010072289156626506,
434
- "loss": 0.5492,
 
435
  "step": 620
436
  },
437
  {
438
- "epoch": 2.53,
439
- "learning_rate": 9.911646586345382e-05,
440
- "loss": 0.5666,
 
441
  "step": 630
442
  },
443
  {
444
- "epoch": 2.57,
445
- "learning_rate": 9.751004016064259e-05,
446
- "loss": 0.5791,
 
447
  "step": 640
448
  },
449
  {
450
- "epoch": 2.61,
451
- "learning_rate": 9.590361445783133e-05,
452
- "loss": 0.5201,
 
453
  "step": 650
454
  },
455
  {
456
- "epoch": 2.65,
457
- "learning_rate": 9.429718875502009e-05,
458
- "loss": 0.5602,
 
459
  "step": 660
460
  },
461
  {
462
- "epoch": 2.69,
463
- "learning_rate": 9.269076305220884e-05,
464
- "loss": 0.5228,
 
465
  "step": 670
466
  },
467
  {
468
- "epoch": 2.73,
469
- "learning_rate": 9.10843373493976e-05,
470
- "loss": 0.5114,
 
471
  "step": 680
472
  },
473
  {
474
- "epoch": 2.77,
475
- "learning_rate": 8.947791164658636e-05,
476
- "loss": 0.4281,
 
477
  "step": 690
478
  },
479
  {
480
- "epoch": 2.81,
481
- "learning_rate": 8.78714859437751e-05,
482
- "loss": 0.4644,
 
483
  "step": 700
484
  },
485
  {
486
- "epoch": 2.81,
487
- "eval_accuracy": 0.8304843304843305,
488
- "eval_loss": 0.5504655241966248,
489
- "eval_runtime": 12.0796,
490
- "eval_samples_per_second": 58.114,
491
- "eval_steps_per_second": 7.285,
492
  "step": 700
493
  },
494
  {
495
- "epoch": 2.85,
496
- "learning_rate": 8.626506024096386e-05,
497
- "loss": 0.5208,
 
498
  "step": 710
499
  },
500
  {
501
- "epoch": 2.89,
502
- "learning_rate": 8.465863453815261e-05,
503
- "loss": 0.4574,
 
504
  "step": 720
505
  },
506
  {
507
- "epoch": 2.93,
508
- "learning_rate": 8.305220883534137e-05,
509
- "loss": 0.4136,
 
510
  "step": 730
511
  },
512
  {
513
- "epoch": 2.97,
514
- "learning_rate": 8.144578313253013e-05,
515
- "loss": 0.418,
 
516
  "step": 740
517
  },
518
  {
519
- "epoch": 3.01,
520
- "learning_rate": 7.983935742971887e-05,
521
- "loss": 0.472,
 
522
  "step": 750
523
  },
524
  {
525
- "epoch": 3.05,
526
- "learning_rate": 7.823293172690763e-05,
527
- "loss": 0.3784,
 
528
  "step": 760
529
  },
530
  {
531
- "epoch": 3.09,
532
- "learning_rate": 7.662650602409639e-05,
533
- "loss": 0.5764,
 
534
  "step": 770
535
  },
536
  {
537
- "epoch": 3.13,
538
- "learning_rate": 7.502008032128514e-05,
539
- "loss": 0.6532,
 
540
  "step": 780
541
  },
542
  {
543
- "epoch": 3.17,
544
- "learning_rate": 7.34136546184739e-05,
545
- "loss": 0.3921,
 
546
  "step": 790
547
  },
548
  {
549
- "epoch": 3.21,
550
- "learning_rate": 7.180722891566266e-05,
551
- "loss": 0.4217,
 
552
  "step": 800
553
  },
554
  {
555
- "epoch": 3.21,
556
- "eval_accuracy": 0.8433048433048433,
557
- "eval_loss": 0.502394437789917,
558
- "eval_runtime": 10.653,
559
- "eval_samples_per_second": 65.897,
560
- "eval_steps_per_second": 8.261,
561
  "step": 800
562
  },
563
  {
564
- "epoch": 3.25,
565
- "learning_rate": 7.020080321285141e-05,
566
- "loss": 0.4454,
 
567
  "step": 810
568
  },
569
  {
570
- "epoch": 3.29,
571
- "learning_rate": 6.859437751004017e-05,
572
- "loss": 0.2962,
 
573
  "step": 820
574
  },
575
  {
576
- "epoch": 3.33,
577
- "learning_rate": 6.698795180722893e-05,
578
- "loss": 0.4175,
 
579
  "step": 830
580
  },
581
  {
582
- "epoch": 3.37,
583
- "learning_rate": 6.538152610441768e-05,
584
- "loss": 0.3381,
 
585
  "step": 840
586
  },
587
  {
588
- "epoch": 3.41,
589
- "learning_rate": 6.377510040160643e-05,
590
- "loss": 0.4441,
 
591
  "step": 850
592
  },
593
  {
594
- "epoch": 3.45,
595
- "learning_rate": 6.216867469879518e-05,
596
- "loss": 0.3907,
 
597
  "step": 860
598
  },
599
  {
600
- "epoch": 3.49,
601
- "learning_rate": 6.056224899598394e-05,
602
- "loss": 0.4765,
 
603
  "step": 870
604
  },
605
  {
606
- "epoch": 3.53,
607
- "learning_rate": 5.89558232931727e-05,
608
- "loss": 0.4384,
 
609
  "step": 880
610
  },
611
  {
612
- "epoch": 3.57,
613
- "learning_rate": 5.7349397590361454e-05,
614
- "loss": 0.5241,
 
615
  "step": 890
616
  },
617
  {
618
- "epoch": 3.61,
619
- "learning_rate": 5.57429718875502e-05,
620
- "loss": 0.4447,
 
621
  "step": 900
622
  },
623
  {
624
- "epoch": 3.61,
625
- "eval_accuracy": 0.8532763532763533,
626
- "eval_loss": 0.4699917733669281,
627
- "eval_runtime": 11.6414,
628
- "eval_samples_per_second": 60.302,
629
- "eval_steps_per_second": 7.559,
630
  "step": 900
631
  },
632
  {
633
- "epoch": 3.65,
634
- "learning_rate": 5.4136546184738955e-05,
635
- "loss": 0.3466,
 
636
  "step": 910
637
  },
638
  {
639
- "epoch": 3.69,
640
- "learning_rate": 5.253012048192771e-05,
641
- "loss": 0.4321,
 
642
  "step": 920
643
  },
644
  {
645
- "epoch": 3.73,
646
- "learning_rate": 5.092369477911647e-05,
647
- "loss": 0.4604,
 
648
  "step": 930
649
  },
650
  {
651
- "epoch": 3.78,
652
- "learning_rate": 4.9317269076305225e-05,
653
- "loss": 0.4072,
 
654
  "step": 940
655
  },
656
  {
657
- "epoch": 3.82,
658
- "learning_rate": 4.771084337349398e-05,
659
- "loss": 0.4058,
 
660
  "step": 950
661
  },
662
  {
663
- "epoch": 3.86,
664
- "learning_rate": 4.610441767068273e-05,
665
- "loss": 0.319,
 
666
  "step": 960
667
  },
668
  {
669
- "epoch": 3.9,
670
- "learning_rate": 4.449799196787149e-05,
671
- "loss": 0.514,
 
672
  "step": 970
673
  },
674
  {
675
- "epoch": 3.94,
676
- "learning_rate": 4.2891566265060246e-05,
677
- "loss": 0.427,
 
678
  "step": 980
679
  },
680
  {
681
- "epoch": 3.98,
682
- "learning_rate": 4.1285140562248996e-05,
683
- "loss": 0.5547,
 
684
  "step": 990
685
  },
686
  {
687
- "epoch": 4.02,
688
- "learning_rate": 3.967871485943775e-05,
689
- "loss": 0.4073,
 
690
  "step": 1000
691
  },
692
  {
693
- "epoch": 4.02,
694
- "eval_accuracy": 0.8632478632478633,
695
- "eval_loss": 0.460509717464447,
696
- "eval_runtime": 11.5739,
697
- "eval_samples_per_second": 60.654,
698
- "eval_steps_per_second": 7.603,
699
  "step": 1000
700
  },
701
  {
702
- "epoch": 4.06,
703
- "learning_rate": 3.8072289156626503e-05,
704
- "loss": 0.2889,
 
705
  "step": 1010
706
  },
707
  {
708
- "epoch": 4.1,
709
- "learning_rate": 3.646586345381526e-05,
710
- "loss": 0.2756,
 
711
  "step": 1020
712
  },
713
  {
714
- "epoch": 4.14,
715
- "learning_rate": 3.485943775100402e-05,
716
- "loss": 0.3363,
 
717
  "step": 1030
718
  },
719
  {
720
- "epoch": 4.18,
721
- "learning_rate": 3.3253012048192774e-05,
722
- "loss": 0.2898,
 
723
  "step": 1040
724
  },
725
  {
726
- "epoch": 4.22,
727
- "learning_rate": 3.164658634538153e-05,
728
- "loss": 0.3705,
 
729
  "step": 1050
730
  },
731
  {
732
- "epoch": 4.26,
733
- "learning_rate": 3.004016064257028e-05,
734
- "loss": 0.2515,
 
735
  "step": 1060
736
  },
737
  {
738
- "epoch": 4.3,
739
- "learning_rate": 2.8433734939759038e-05,
740
- "loss": 0.3794,
 
741
  "step": 1070
742
  },
743
  {
744
- "epoch": 4.34,
745
- "learning_rate": 2.6827309236947795e-05,
746
- "loss": 0.3288,
 
747
  "step": 1080
748
  },
749
  {
750
- "epoch": 4.38,
751
- "learning_rate": 2.522088353413655e-05,
752
- "loss": 0.3614,
 
753
  "step": 1090
754
  },
755
  {
756
- "epoch": 4.42,
757
- "learning_rate": 2.3614457831325302e-05,
758
- "loss": 0.2679,
 
759
  "step": 1100
760
  },
761
  {
762
- "epoch": 4.42,
763
- "eval_accuracy": 0.8660968660968661,
764
- "eval_loss": 0.42591235041618347,
765
- "eval_runtime": 12.9146,
766
- "eval_samples_per_second": 54.357,
767
- "eval_steps_per_second": 6.814,
768
  "step": 1100
769
  },
770
  {
771
- "epoch": 4.46,
772
- "learning_rate": 2.200803212851406e-05,
773
- "loss": 0.2891,
 
774
  "step": 1110
775
  },
776
  {
777
- "epoch": 4.5,
778
- "learning_rate": 2.0401606425702812e-05,
779
- "loss": 0.4053,
 
780
  "step": 1120
781
  },
782
  {
783
- "epoch": 4.54,
784
- "learning_rate": 1.8795180722891566e-05,
785
- "loss": 0.3411,
 
786
  "step": 1130
787
  },
788
  {
789
- "epoch": 4.58,
790
- "learning_rate": 1.7188755020080323e-05,
791
- "loss": 0.3382,
 
792
  "step": 1140
793
  },
794
  {
795
- "epoch": 4.62,
796
- "learning_rate": 1.5582329317269076e-05,
797
- "loss": 0.4011,
 
798
  "step": 1150
799
  },
800
  {
801
- "epoch": 4.66,
802
- "learning_rate": 1.3975903614457833e-05,
803
- "loss": 0.3232,
 
804
  "step": 1160
805
  },
806
  {
807
- "epoch": 4.7,
808
- "learning_rate": 1.2369477911646587e-05,
809
- "loss": 0.4139,
 
810
  "step": 1170
811
  },
812
  {
813
- "epoch": 4.74,
814
- "learning_rate": 1.0763052208835342e-05,
815
- "loss": 0.4019,
 
816
  "step": 1180
817
  },
818
  {
819
- "epoch": 4.78,
820
- "learning_rate": 9.156626506024097e-06,
821
- "loss": 0.2942,
 
822
  "step": 1190
823
  },
824
  {
825
- "epoch": 4.82,
826
- "learning_rate": 7.550200803212852e-06,
827
- "loss": 0.262,
 
828
  "step": 1200
829
  },
830
  {
831
- "epoch": 4.82,
832
- "eval_accuracy": 0.8603988603988604,
833
- "eval_loss": 0.4417168200016022,
834
- "eval_runtime": 12.439,
835
- "eval_samples_per_second": 56.435,
836
- "eval_steps_per_second": 7.075,
837
  "step": 1200
838
  },
839
  {
840
- "epoch": 4.86,
841
- "learning_rate": 5.943775100401607e-06,
842
- "loss": 0.2907,
 
843
  "step": 1210
844
  },
845
  {
846
- "epoch": 4.9,
847
- "learning_rate": 4.337349397590362e-06,
848
- "loss": 0.3646,
 
849
  "step": 1220
850
  },
851
  {
852
- "epoch": 4.94,
853
- "learning_rate": 2.7309236947791167e-06,
854
- "loss": 0.4708,
 
855
  "step": 1230
856
  },
857
  {
858
- "epoch": 4.98,
859
- "learning_rate": 1.1244979919678715e-06,
860
- "loss": 0.4184,
 
861
  "step": 1240
862
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
863
  {
864
  "epoch": 5.0,
865
- "step": 1245,
866
- "total_flos": 1.539101261655982e+18,
867
- "train_loss": 0.6624564435108599,
868
- "train_runtime": 820.1847,
869
- "train_samples_per_second": 24.214,
870
- "train_steps_per_second": 1.518
871
  }
872
  ],
873
  "logging_steps": 10,
874
- "max_steps": 1245,
 
875
  "num_train_epochs": 5,
876
  "save_steps": 100,
877
- "total_flos": 1.539101261655982e+18,
 
878
  "trial_name": null,
879
  "trial_params": null
880
  }
 
1
  {
2
+ "best_metric": 0.4145749807357788,
3
+ "best_model_checkpoint": "Action_all_10_class/checkpoint-1400",
4
  "epoch": 5.0,
5
  "eval_steps": 100,
6
+ "global_step": 1445,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.03,
13
+ "grad_norm": 2.3712973594665527,
14
+ "learning_rate": 0.0001986159169550173,
15
+ "loss": 2.271,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.07,
20
+ "grad_norm": 1.8230082988739014,
21
+ "learning_rate": 0.0001972318339100346,
22
+ "loss": 2.0335,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.1,
27
+ "grad_norm": 2.621448040008545,
28
+ "learning_rate": 0.0001958477508650519,
29
+ "loss": 1.8885,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.14,
34
+ "grad_norm": 3.194303274154663,
35
+ "learning_rate": 0.0001944636678200692,
36
+ "loss": 1.6835,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.17,
41
+ "grad_norm": 2.9692325592041016,
42
+ "learning_rate": 0.0001930795847750865,
43
+ "loss": 1.5658,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.21,
48
+ "grad_norm": 3.6100592613220215,
49
+ "learning_rate": 0.00019169550173010383,
50
+ "loss": 1.4881,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.24,
55
+ "grad_norm": 2.926745653152466,
56
+ "learning_rate": 0.00019031141868512113,
57
+ "loss": 1.2581,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.28,
62
+ "grad_norm": 3.4733943939208984,
63
+ "learning_rate": 0.00018892733564013843,
64
+ "loss": 1.1786,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.31,
69
+ "grad_norm": 3.609900951385498,
70
+ "learning_rate": 0.00018754325259515573,
71
+ "loss": 1.1612,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.35,
76
+ "grad_norm": 3.1664867401123047,
77
+ "learning_rate": 0.00018615916955017303,
78
+ "loss": 1.1239,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.35,
83
+ "eval_accuracy": 0.7116564417177914,
84
+ "eval_loss": 0.9933902025222778,
85
+ "eval_runtime": 13.7579,
86
+ "eval_samples_per_second": 59.239,
87
+ "eval_steps_per_second": 7.414,
88
  "step": 100
89
  },
90
  {
91
+ "epoch": 0.38,
92
+ "grad_norm": 3.3514106273651123,
93
+ "learning_rate": 0.00018477508650519033,
94
+ "loss": 1.1763,
95
  "step": 110
96
  },
97
  {
98
+ "epoch": 0.42,
99
+ "grad_norm": 2.530064821243286,
100
+ "learning_rate": 0.0001833910034602076,
101
+ "loss": 1.1493,
102
  "step": 120
103
  },
104
  {
105
+ "epoch": 0.45,
106
+ "grad_norm": 4.615677833557129,
107
+ "learning_rate": 0.0001820069204152249,
108
+ "loss": 1.1661,
109
  "step": 130
110
  },
111
  {
112
+ "epoch": 0.48,
113
+ "grad_norm": 5.2710065841674805,
114
+ "learning_rate": 0.0001807612456747405,
115
+ "loss": 1.1174,
116
  "step": 140
117
  },
118
  {
119
+ "epoch": 0.52,
120
+ "grad_norm": 2.854151487350464,
121
+ "learning_rate": 0.0001793771626297578,
122
+ "loss": 1.0679,
123
  "step": 150
124
  },
125
  {
126
+ "epoch": 0.55,
127
+ "grad_norm": 4.68364143371582,
128
+ "learning_rate": 0.0001779930795847751,
129
+ "loss": 1.1566,
130
  "step": 160
131
  },
132
  {
133
+ "epoch": 0.59,
134
+ "grad_norm": 5.799492359161377,
135
+ "learning_rate": 0.0001766089965397924,
136
+ "loss": 0.9811,
137
  "step": 170
138
  },
139
  {
140
+ "epoch": 0.62,
141
+ "grad_norm": 7.8334760665893555,
142
+ "learning_rate": 0.0001752249134948097,
143
+ "loss": 1.0781,
144
  "step": 180
145
  },
146
  {
147
+ "epoch": 0.66,
148
+ "grad_norm": 4.817080020904541,
149
+ "learning_rate": 0.000173840830449827,
150
+ "loss": 1.0302,
151
  "step": 190
152
  },
153
  {
154
+ "epoch": 0.69,
155
+ "grad_norm": 4.084784507751465,
156
+ "learning_rate": 0.0001724567474048443,
157
+ "loss": 0.8878,
158
  "step": 200
159
  },
160
  {
161
+ "epoch": 0.69,
162
+ "eval_accuracy": 0.7705521472392638,
163
+ "eval_loss": 0.7667088508605957,
164
+ "eval_runtime": 10.9837,
165
+ "eval_samples_per_second": 74.201,
166
+ "eval_steps_per_second": 9.286,
167
  "step": 200
168
  },
169
  {
170
+ "epoch": 0.73,
171
+ "grad_norm": 3.8510003089904785,
172
+ "learning_rate": 0.0001710726643598616,
173
+ "loss": 1.0539,
174
  "step": 210
175
  },
176
  {
177
+ "epoch": 0.76,
178
+ "grad_norm": 4.4905619621276855,
179
+ "learning_rate": 0.00016968858131487892,
180
+ "loss": 0.9855,
181
  "step": 220
182
  },
183
  {
184
+ "epoch": 0.8,
185
+ "grad_norm": 3.903388738632202,
186
+ "learning_rate": 0.00016830449826989622,
187
+ "loss": 1.0442,
188
  "step": 230
189
  },
190
  {
191
+ "epoch": 0.83,
192
+ "grad_norm": 4.052041530609131,
193
+ "learning_rate": 0.00016692041522491352,
194
+ "loss": 1.004,
195
  "step": 240
196
  },
197
  {
198
+ "epoch": 0.87,
199
+ "grad_norm": 4.503437042236328,
200
+ "learning_rate": 0.00016553633217993081,
201
+ "loss": 0.8825,
202
  "step": 250
203
  },
204
  {
205
+ "epoch": 0.9,
206
+ "grad_norm": 4.2403459548950195,
207
+ "learning_rate": 0.00016415224913494811,
208
+ "loss": 0.9616,
209
  "step": 260
210
  },
211
  {
212
+ "epoch": 0.93,
213
+ "grad_norm": 6.690958499908447,
214
+ "learning_rate": 0.00016276816608996541,
215
+ "loss": 0.9127,
216
  "step": 270
217
  },
218
  {
219
+ "epoch": 0.97,
220
+ "grad_norm": 6.591899394989014,
221
+ "learning_rate": 0.0001613840830449827,
222
+ "loss": 1.0545,
223
  "step": 280
224
  },
225
  {
226
+ "epoch": 1.0,
227
+ "grad_norm": 2.814940929412842,
228
+ "learning_rate": 0.0001601384083044983,
229
+ "loss": 0.9937,
230
  "step": 290
231
  },
232
  {
233
+ "epoch": 1.04,
234
+ "grad_norm": 5.890912055969238,
235
+ "learning_rate": 0.00015875432525951557,
236
+ "loss": 0.9037,
237
  "step": 300
238
  },
239
  {
240
+ "epoch": 1.04,
241
+ "eval_accuracy": 0.8098159509202454,
242
+ "eval_loss": 0.6369422674179077,
243
+ "eval_runtime": 11.1962,
244
+ "eval_samples_per_second": 72.793,
245
+ "eval_steps_per_second": 9.11,
246
  "step": 300
247
  },
248
  {
249
+ "epoch": 1.07,
250
+ "grad_norm": 3.311278820037842,
251
+ "learning_rate": 0.00015737024221453287,
252
+ "loss": 0.825,
253
  "step": 310
254
  },
255
  {
256
+ "epoch": 1.11,
257
+ "grad_norm": 4.0047454833984375,
258
+ "learning_rate": 0.00015598615916955017,
259
+ "loss": 0.7347,
260
  "step": 320
261
  },
262
  {
263
+ "epoch": 1.14,
264
+ "grad_norm": 4.145818710327148,
265
+ "learning_rate": 0.00015460207612456747,
266
+ "loss": 0.8906,
267
  "step": 330
268
  },
269
  {
270
+ "epoch": 1.18,
271
+ "grad_norm": 5.543643951416016,
272
+ "learning_rate": 0.00015321799307958477,
273
+ "loss": 0.7669,
274
  "step": 340
275
  },
276
  {
277
+ "epoch": 1.21,
278
+ "grad_norm": 5.490930080413818,
279
+ "learning_rate": 0.00015183391003460207,
280
+ "loss": 0.774,
281
  "step": 350
282
  },
283
  {
284
+ "epoch": 1.25,
285
+ "grad_norm": 4.895139217376709,
286
+ "learning_rate": 0.00015044982698961937,
287
+ "loss": 0.6658,
288
  "step": 360
289
  },
290
  {
291
+ "epoch": 1.28,
292
+ "grad_norm": 4.527533054351807,
293
+ "learning_rate": 0.00014906574394463667,
294
+ "loss": 0.8603,
295
  "step": 370
296
  },
297
  {
298
+ "epoch": 1.31,
299
+ "grad_norm": 3.3609440326690674,
300
+ "learning_rate": 0.00014768166089965397,
301
+ "loss": 0.7099,
302
  "step": 380
303
  },
304
  {
305
+ "epoch": 1.35,
306
+ "grad_norm": 10.356605529785156,
307
+ "learning_rate": 0.0001462975778546713,
308
+ "loss": 0.7262,
309
  "step": 390
310
  },
311
  {
312
+ "epoch": 1.38,
313
+ "grad_norm": 3.6397433280944824,
314
+ "learning_rate": 0.0001449134948096886,
315
+ "loss": 0.7307,
316
  "step": 400
317
  },
318
  {
319
+ "epoch": 1.38,
320
+ "eval_accuracy": 0.8319018404907975,
321
+ "eval_loss": 0.577174186706543,
322
+ "eval_runtime": 11.1343,
323
+ "eval_samples_per_second": 73.197,
324
+ "eval_steps_per_second": 9.161,
325
  "step": 400
326
  },
327
  {
328
+ "epoch": 1.42,
329
+ "grad_norm": 5.693297386169434,
330
+ "learning_rate": 0.0001435294117647059,
331
+ "loss": 0.8061,
332
  "step": 410
333
  },
334
  {
335
+ "epoch": 1.45,
336
+ "grad_norm": 3.4219260215759277,
337
+ "learning_rate": 0.0001421453287197232,
338
+ "loss": 0.6227,
339
  "step": 420
340
  },
341
  {
342
+ "epoch": 1.49,
343
+ "grad_norm": 9.225582122802734,
344
+ "learning_rate": 0.0001407612456747405,
345
+ "loss": 0.6429,
346
  "step": 430
347
  },
348
  {
349
+ "epoch": 1.52,
350
+ "grad_norm": 5.8882293701171875,
351
+ "learning_rate": 0.0001393771626297578,
352
+ "loss": 0.686,
353
  "step": 440
354
  },
355
  {
356
+ "epoch": 1.56,
357
+ "grad_norm": 3.3229711055755615,
358
+ "learning_rate": 0.0001379930795847751,
359
+ "loss": 0.7574,
360
  "step": 450
361
  },
362
  {
363
+ "epoch": 1.59,
364
+ "grad_norm": 5.165830135345459,
365
+ "learning_rate": 0.0001366089965397924,
366
+ "loss": 0.6195,
367
  "step": 460
368
  },
369
  {
370
+ "epoch": 1.63,
371
+ "grad_norm": 6.676501274108887,
372
+ "learning_rate": 0.0001352249134948097,
373
+ "loss": 0.6789,
374
  "step": 470
375
  },
376
  {
377
+ "epoch": 1.66,
378
+ "grad_norm": 6.1190643310546875,
379
+ "learning_rate": 0.000133840830449827,
380
+ "loss": 0.7149,
381
  "step": 480
382
  },
383
  {
384
+ "epoch": 1.7,
385
+ "grad_norm": 2.723904848098755,
386
+ "learning_rate": 0.0001324567474048443,
387
+ "loss": 0.7014,
388
  "step": 490
389
  },
390
  {
391
+ "epoch": 1.73,
392
+ "grad_norm": 4.995339870452881,
393
+ "learning_rate": 0.00013107266435986162,
394
+ "loss": 0.6624,
395
  "step": 500
396
  },
397
  {
398
+ "epoch": 1.73,
399
+ "eval_accuracy": 0.7717791411042945,
400
+ "eval_loss": 0.6924724578857422,
401
+ "eval_runtime": 11.3854,
402
+ "eval_samples_per_second": 71.583,
403
+ "eval_steps_per_second": 8.959,
404
  "step": 500
405
  },
406
  {
407
+ "epoch": 1.76,
408
+ "grad_norm": 5.440706729888916,
409
+ "learning_rate": 0.00012968858131487892,
410
+ "loss": 0.8681,
411
  "step": 510
412
  },
413
  {
414
+ "epoch": 1.8,
415
+ "grad_norm": 4.539951801300049,
416
+ "learning_rate": 0.0001283044982698962,
417
+ "loss": 0.5997,
418
  "step": 520
419
  },
420
  {
421
+ "epoch": 1.83,
422
+ "grad_norm": 4.5558013916015625,
423
+ "learning_rate": 0.0001269204152249135,
424
+ "loss": 0.6839,
425
  "step": 530
426
  },
427
  {
428
+ "epoch": 1.87,
429
+ "grad_norm": 4.361762046813965,
430
+ "learning_rate": 0.0001255363321799308,
431
+ "loss": 0.6016,
432
  "step": 540
433
  },
434
  {
435
+ "epoch": 1.9,
436
+ "grad_norm": 4.278672695159912,
437
+ "learning_rate": 0.0001241522491349481,
438
+ "loss": 0.6111,
439
  "step": 550
440
  },
441
  {
442
+ "epoch": 1.94,
443
+ "grad_norm": 5.990556240081787,
444
+ "learning_rate": 0.0001227681660899654,
445
+ "loss": 0.9729,
446
  "step": 560
447
  },
448
  {
449
+ "epoch": 1.97,
450
+ "grad_norm": 4.511960983276367,
451
+ "learning_rate": 0.0001213840830449827,
452
+ "loss": 0.6829,
453
  "step": 570
454
  },
455
  {
456
+ "epoch": 2.01,
457
+ "grad_norm": 3.139665365219116,
458
+ "learning_rate": 0.00012,
459
+ "loss": 0.6516,
460
  "step": 580
461
  },
462
  {
463
+ "epoch": 2.04,
464
+ "grad_norm": 2.4773037433624268,
465
+ "learning_rate": 0.0001186159169550173,
466
+ "loss": 0.492,
467
  "step": 590
468
  },
469
  {
470
+ "epoch": 2.08,
471
+ "grad_norm": 1.613572597503662,
472
+ "learning_rate": 0.0001172318339100346,
473
+ "loss": 0.5781,
474
  "step": 600
475
  },
476
  {
477
+ "epoch": 2.08,
478
+ "eval_accuracy": 0.8404907975460123,
479
+ "eval_loss": 0.5438538789749146,
480
+ "eval_runtime": 11.1336,
481
+ "eval_samples_per_second": 73.202,
482
+ "eval_steps_per_second": 9.161,
483
  "step": 600
484
  },
485
  {
486
+ "epoch": 2.11,
487
+ "grad_norm": 2.5122666358947754,
488
+ "learning_rate": 0.0001158477508650519,
489
+ "loss": 0.55,
490
  "step": 610
491
  },
492
  {
493
+ "epoch": 2.15,
494
+ "grad_norm": 5.037026882171631,
495
+ "learning_rate": 0.00011446366782006921,
496
+ "loss": 0.5404,
497
  "step": 620
498
  },
499
  {
500
+ "epoch": 2.18,
501
+ "grad_norm": 6.795055866241455,
502
+ "learning_rate": 0.00011307958477508651,
503
+ "loss": 0.6455,
504
  "step": 630
505
  },
506
  {
507
+ "epoch": 2.21,
508
+ "grad_norm": 6.474560260772705,
509
+ "learning_rate": 0.00011169550173010381,
510
+ "loss": 0.5184,
511
  "step": 640
512
  },
513
  {
514
+ "epoch": 2.25,
515
+ "grad_norm": 8.110054969787598,
516
+ "learning_rate": 0.00011031141868512111,
517
+ "loss": 0.4706,
518
  "step": 650
519
  },
520
  {
521
+ "epoch": 2.28,
522
+ "grad_norm": 2.1118876934051514,
523
+ "learning_rate": 0.00010892733564013841,
524
+ "loss": 0.5291,
525
  "step": 660
526
  },
527
  {
528
+ "epoch": 2.32,
529
+ "grad_norm": 5.386865615844727,
530
+ "learning_rate": 0.00010754325259515571,
531
+ "loss": 0.605,
532
  "step": 670
533
  },
534
  {
535
+ "epoch": 2.35,
536
+ "grad_norm": 4.501631736755371,
537
+ "learning_rate": 0.00010615916955017303,
538
+ "loss": 0.4696,
539
  "step": 680
540
  },
541
  {
542
+ "epoch": 2.39,
543
+ "grad_norm": 4.06800651550293,
544
+ "learning_rate": 0.00010477508650519033,
545
+ "loss": 0.5214,
546
  "step": 690
547
  },
548
  {
549
+ "epoch": 2.42,
550
+ "grad_norm": 3.3574037551879883,
551
+ "learning_rate": 0.00010339100346020762,
552
+ "loss": 0.5537,
553
  "step": 700
554
  },
555
  {
556
+ "epoch": 2.42,
557
+ "eval_accuracy": 0.8331288343558282,
558
+ "eval_loss": 0.5256926417350769,
559
+ "eval_runtime": 11.3468,
560
+ "eval_samples_per_second": 71.826,
561
+ "eval_steps_per_second": 8.989,
562
  "step": 700
563
  },
564
  {
565
+ "epoch": 2.46,
566
+ "grad_norm": 6.940188884735107,
567
+ "learning_rate": 0.00010200692041522492,
568
+ "loss": 0.5371,
569
  "step": 710
570
  },
571
  {
572
+ "epoch": 2.49,
573
+ "grad_norm": 5.517166614532471,
574
+ "learning_rate": 0.00010076124567474047,
575
+ "loss": 0.5419,
576
  "step": 720
577
  },
578
  {
579
+ "epoch": 2.53,
580
+ "grad_norm": 3.2714200019836426,
581
+ "learning_rate": 9.937716262975779e-05,
582
+ "loss": 0.6657,
583
  "step": 730
584
  },
585
  {
586
+ "epoch": 2.56,
587
+ "grad_norm": 2.734272003173828,
588
+ "learning_rate": 9.79930795847751e-05,
589
+ "loss": 0.5712,
590
  "step": 740
591
  },
592
  {
593
+ "epoch": 2.6,
594
+ "grad_norm": 7.275644302368164,
595
+ "learning_rate": 9.66089965397924e-05,
596
+ "loss": 0.6513,
597
  "step": 750
598
  },
599
  {
600
+ "epoch": 2.63,
601
+ "grad_norm": 4.910625457763672,
602
+ "learning_rate": 9.52249134948097e-05,
603
+ "loss": 0.5641,
604
  "step": 760
605
  },
606
  {
607
+ "epoch": 2.66,
608
+ "grad_norm": 2.4771077632904053,
609
+ "learning_rate": 9.384083044982698e-05,
610
+ "loss": 0.4163,
611
  "step": 770
612
  },
613
  {
614
+ "epoch": 2.7,
615
+ "grad_norm": 4.3278303146362305,
616
+ "learning_rate": 9.24567474048443e-05,
617
+ "loss": 0.5759,
618
  "step": 780
619
  },
620
  {
621
+ "epoch": 2.73,
622
+ "grad_norm": 8.122814178466797,
623
+ "learning_rate": 9.10726643598616e-05,
624
+ "loss": 0.6389,
625
  "step": 790
626
  },
627
  {
628
+ "epoch": 2.77,
629
+ "grad_norm": 6.369782447814941,
630
+ "learning_rate": 8.96885813148789e-05,
631
+ "loss": 0.4112,
632
  "step": 800
633
  },
634
  {
635
+ "epoch": 2.77,
636
+ "eval_accuracy": 0.8564417177914111,
637
+ "eval_loss": 0.4499729573726654,
638
+ "eval_runtime": 11.5138,
639
+ "eval_samples_per_second": 70.785,
640
+ "eval_steps_per_second": 8.859,
641
  "step": 800
642
  },
643
  {
644
+ "epoch": 2.8,
645
+ "grad_norm": 7.130208969116211,
646
+ "learning_rate": 8.83044982698962e-05,
647
+ "loss": 0.4534,
648
  "step": 810
649
  },
650
  {
651
+ "epoch": 2.84,
652
+ "grad_norm": 5.094985008239746,
653
+ "learning_rate": 8.69204152249135e-05,
654
+ "loss": 0.4881,
655
  "step": 820
656
  },
657
  {
658
+ "epoch": 2.87,
659
+ "grad_norm": 2.527794361114502,
660
+ "learning_rate": 8.55363321799308e-05,
661
+ "loss": 0.2775,
662
  "step": 830
663
  },
664
  {
665
+ "epoch": 2.91,
666
+ "grad_norm": 9.366188049316406,
667
+ "learning_rate": 8.415224913494811e-05,
668
+ "loss": 0.6528,
669
  "step": 840
670
  },
671
  {
672
+ "epoch": 2.94,
673
+ "grad_norm": 3.863041400909424,
674
+ "learning_rate": 8.276816608996541e-05,
675
+ "loss": 0.5632,
676
  "step": 850
677
  },
678
  {
679
+ "epoch": 2.98,
680
+ "grad_norm": 3.3894691467285156,
681
+ "learning_rate": 8.138408304498271e-05,
682
+ "loss": 0.5885,
683
  "step": 860
684
  },
685
  {
686
+ "epoch": 3.01,
687
+ "grad_norm": 5.836617469787598,
688
+ "learning_rate": 8e-05,
689
+ "loss": 0.5864,
690
  "step": 870
691
  },
692
  {
693
+ "epoch": 3.04,
694
+ "grad_norm": 4.380435943603516,
695
+ "learning_rate": 7.86159169550173e-05,
696
+ "loss": 0.4523,
697
  "step": 880
698
  },
699
  {
700
+ "epoch": 3.08,
701
+ "grad_norm": 0.4667840898036957,
702
+ "learning_rate": 7.72318339100346e-05,
703
+ "loss": 0.2999,
704
  "step": 890
705
  },
706
  {
707
+ "epoch": 3.11,
708
+ "grad_norm": 3.003122091293335,
709
+ "learning_rate": 7.58477508650519e-05,
710
+ "loss": 0.3263,
711
  "step": 900
712
  },
713
  {
714
+ "epoch": 3.11,
715
+ "eval_accuracy": 0.841717791411043,
716
+ "eval_loss": 0.4910809397697449,
717
+ "eval_runtime": 11.4121,
718
+ "eval_samples_per_second": 71.416,
719
+ "eval_steps_per_second": 8.938,
720
  "step": 900
721
  },
722
  {
723
+ "epoch": 3.15,
724
+ "grad_norm": 4.519166469573975,
725
+ "learning_rate": 7.44636678200692e-05,
726
+ "loss": 0.417,
727
  "step": 910
728
  },
729
  {
730
+ "epoch": 3.18,
731
+ "grad_norm": 6.071402549743652,
732
+ "learning_rate": 7.30795847750865e-05,
733
+ "loss": 0.351,
734
  "step": 920
735
  },
736
  {
737
+ "epoch": 3.22,
738
+ "grad_norm": 6.0896687507629395,
739
+ "learning_rate": 7.16955017301038e-05,
740
+ "loss": 0.4687,
741
  "step": 930
742
  },
743
  {
744
+ "epoch": 3.25,
745
+ "grad_norm": 4.001879692077637,
746
+ "learning_rate": 7.031141868512112e-05,
747
+ "loss": 0.5083,
748
  "step": 940
749
  },
750
  {
751
+ "epoch": 3.29,
752
+ "grad_norm": 5.064093112945557,
753
+ "learning_rate": 6.892733564013842e-05,
754
+ "loss": 0.3407,
755
  "step": 950
756
  },
757
  {
758
+ "epoch": 3.32,
759
+ "grad_norm": 1.3715455532073975,
760
+ "learning_rate": 6.754325259515572e-05,
761
+ "loss": 0.4073,
762
  "step": 960
763
  },
764
  {
765
+ "epoch": 3.36,
766
+ "grad_norm": 6.380220413208008,
767
+ "learning_rate": 6.615916955017302e-05,
768
+ "loss": 0.4613,
769
  "step": 970
770
  },
771
  {
772
+ "epoch": 3.39,
773
+ "grad_norm": 2.1010074615478516,
774
+ "learning_rate": 6.477508650519032e-05,
775
+ "loss": 0.3706,
776
  "step": 980
777
  },
778
  {
779
+ "epoch": 3.43,
780
+ "grad_norm": 2.2640676498413086,
781
+ "learning_rate": 6.339100346020761e-05,
782
+ "loss": 0.3235,
783
  "step": 990
784
  },
785
  {
786
+ "epoch": 3.46,
787
+ "grad_norm": 2.6152210235595703,
788
+ "learning_rate": 6.200692041522491e-05,
789
+ "loss": 0.4592,
790
  "step": 1000
791
  },
792
  {
793
+ "epoch": 3.46,
794
+ "eval_accuracy": 0.8711656441717791,
795
+ "eval_loss": 0.4550653100013733,
796
+ "eval_runtime": 11.3924,
797
+ "eval_samples_per_second": 71.539,
798
+ "eval_steps_per_second": 8.953,
799
  "step": 1000
800
  },
801
  {
802
+ "epoch": 3.49,
803
+ "grad_norm": 4.868244647979736,
804
+ "learning_rate": 6.0622837370242214e-05,
805
+ "loss": 0.3891,
806
  "step": 1010
807
  },
808
  {
809
+ "epoch": 3.53,
810
+ "grad_norm": 4.125463962554932,
811
+ "learning_rate": 5.9238754325259514e-05,
812
+ "loss": 0.3462,
813
  "step": 1020
814
  },
815
  {
816
+ "epoch": 3.56,
817
+ "grad_norm": 5.510716915130615,
818
+ "learning_rate": 5.785467128027682e-05,
819
+ "loss": 0.3938,
820
  "step": 1030
821
  },
822
  {
823
+ "epoch": 3.6,
824
+ "grad_norm": 4.867416858673096,
825
+ "learning_rate": 5.647058823529412e-05,
826
+ "loss": 0.3851,
827
  "step": 1040
828
  },
829
  {
830
+ "epoch": 3.63,
831
+ "grad_norm": 5.599556922912598,
832
+ "learning_rate": 5.508650519031142e-05,
833
+ "loss": 0.3939,
834
  "step": 1050
835
  },
836
  {
837
+ "epoch": 3.67,
838
+ "grad_norm": 6.144674777984619,
839
+ "learning_rate": 5.3702422145328725e-05,
840
+ "loss": 0.5787,
841
  "step": 1060
842
  },
843
  {
844
+ "epoch": 3.7,
845
+ "grad_norm": 6.78063440322876,
846
+ "learning_rate": 5.2318339100346025e-05,
847
+ "loss": 0.4548,
848
  "step": 1070
849
  },
850
  {
851
+ "epoch": 3.74,
852
+ "grad_norm": 2.213515281677246,
853
+ "learning_rate": 5.0934256055363325e-05,
854
+ "loss": 0.418,
855
  "step": 1080
856
  },
857
  {
858
+ "epoch": 3.77,
859
+ "grad_norm": 3.3876266479492188,
860
+ "learning_rate": 4.9550173010380624e-05,
861
+ "loss": 0.432,
862
  "step": 1090
863
  },
864
  {
865
+ "epoch": 3.81,
866
+ "grad_norm": 6.500645160675049,
867
+ "learning_rate": 4.8166089965397924e-05,
868
+ "loss": 0.3204,
869
  "step": 1100
870
  },
871
  {
872
+ "epoch": 3.81,
873
+ "eval_accuracy": 0.8723926380368098,
874
+ "eval_loss": 0.4324829876422882,
875
+ "eval_runtime": 11.2965,
876
+ "eval_samples_per_second": 72.146,
877
+ "eval_steps_per_second": 9.029,
878
  "step": 1100
879
  },
880
  {
881
+ "epoch": 3.84,
882
+ "grad_norm": 2.518057107925415,
883
+ "learning_rate": 4.678200692041523e-05,
884
+ "loss": 0.3505,
885
  "step": 1110
886
  },
887
  {
888
+ "epoch": 3.88,
889
+ "grad_norm": 4.820311546325684,
890
+ "learning_rate": 4.539792387543253e-05,
891
+ "loss": 0.4641,
892
  "step": 1120
893
  },
894
  {
895
+ "epoch": 3.91,
896
+ "grad_norm": 1.1859557628631592,
897
+ "learning_rate": 4.401384083044983e-05,
898
+ "loss": 0.5202,
899
  "step": 1130
900
  },
901
  {
902
+ "epoch": 3.94,
903
+ "grad_norm": 2.3869946002960205,
904
+ "learning_rate": 4.262975778546713e-05,
905
+ "loss": 0.5007,
906
  "step": 1140
907
  },
908
  {
909
+ "epoch": 3.98,
910
+ "grad_norm": 6.64663553237915,
911
+ "learning_rate": 4.124567474048443e-05,
912
+ "loss": 0.3478,
913
  "step": 1150
914
  },
915
  {
916
+ "epoch": 4.01,
917
+ "grad_norm": 4.931898593902588,
918
+ "learning_rate": 3.9861591695501735e-05,
919
+ "loss": 0.3808,
920
  "step": 1160
921
  },
922
  {
923
+ "epoch": 4.05,
924
+ "grad_norm": 1.7521384954452515,
925
+ "learning_rate": 3.8477508650519034e-05,
926
+ "loss": 0.3247,
927
  "step": 1170
928
  },
929
  {
930
+ "epoch": 4.08,
931
+ "grad_norm": 1.5513068437576294,
932
+ "learning_rate": 3.7093425605536334e-05,
933
+ "loss": 0.393,
934
  "step": 1180
935
  },
936
  {
937
+ "epoch": 4.12,
938
+ "grad_norm": 5.2524094581604,
939
+ "learning_rate": 3.570934256055363e-05,
940
+ "loss": 0.3697,
941
  "step": 1190
942
  },
943
  {
944
+ "epoch": 4.15,
945
+ "grad_norm": 5.346503734588623,
946
+ "learning_rate": 3.432525951557093e-05,
947
+ "loss": 0.3268,
948
  "step": 1200
949
  },
950
  {
951
+ "epoch": 4.15,
952
+ "eval_accuracy": 0.8539877300613496,
953
+ "eval_loss": 0.4529338777065277,
954
+ "eval_runtime": 11.243,
955
+ "eval_samples_per_second": 72.49,
956
+ "eval_steps_per_second": 9.072,
957
  "step": 1200
958
  },
959
  {
960
+ "epoch": 4.19,
961
+ "grad_norm": 2.0786306858062744,
962
+ "learning_rate": 3.294117647058824e-05,
963
+ "loss": 0.2967,
964
  "step": 1210
965
  },
966
  {
967
+ "epoch": 4.22,
968
+ "grad_norm": 3.8141846656799316,
969
+ "learning_rate": 3.155709342560554e-05,
970
+ "loss": 0.2483,
971
  "step": 1220
972
  },
973
  {
974
+ "epoch": 4.26,
975
+ "grad_norm": 1.8864765167236328,
976
+ "learning_rate": 3.0173010380622842e-05,
977
+ "loss": 0.2443,
978
  "step": 1230
979
  },
980
  {
981
+ "epoch": 4.29,
982
+ "grad_norm": 0.9406078457832336,
983
+ "learning_rate": 2.878892733564014e-05,
984
+ "loss": 0.2637,
985
  "step": 1240
986
  },
987
+ {
988
+ "epoch": 4.33,
989
+ "grad_norm": 3.418455123901367,
990
+ "learning_rate": 2.7404844290657437e-05,
991
+ "loss": 0.2866,
992
+ "step": 1250
993
+ },
994
+ {
995
+ "epoch": 4.36,
996
+ "grad_norm": 1.8662397861480713,
997
+ "learning_rate": 2.602076124567474e-05,
998
+ "loss": 0.3381,
999
+ "step": 1260
1000
+ },
1001
+ {
1002
+ "epoch": 4.39,
1003
+ "grad_norm": 3.9735002517700195,
1004
+ "learning_rate": 2.4636678200692043e-05,
1005
+ "loss": 0.3328,
1006
+ "step": 1270
1007
+ },
1008
+ {
1009
+ "epoch": 4.43,
1010
+ "grad_norm": 2.229581117630005,
1011
+ "learning_rate": 2.3252595155709346e-05,
1012
+ "loss": 0.4579,
1013
+ "step": 1280
1014
+ },
1015
+ {
1016
+ "epoch": 4.46,
1017
+ "grad_norm": 3.8186144828796387,
1018
+ "learning_rate": 2.1868512110726642e-05,
1019
+ "loss": 0.2986,
1020
+ "step": 1290
1021
+ },
1022
+ {
1023
+ "epoch": 4.5,
1024
+ "grad_norm": 2.310910701751709,
1025
+ "learning_rate": 2.0484429065743945e-05,
1026
+ "loss": 0.4267,
1027
+ "step": 1300
1028
+ },
1029
+ {
1030
+ "epoch": 4.5,
1031
+ "eval_accuracy": 0.8723926380368098,
1032
+ "eval_loss": 0.4355594515800476,
1033
+ "eval_runtime": 11.2323,
1034
+ "eval_samples_per_second": 72.559,
1035
+ "eval_steps_per_second": 9.081,
1036
+ "step": 1300
1037
+ },
1038
+ {
1039
+ "epoch": 4.53,
1040
+ "grad_norm": 5.4986042976379395,
1041
+ "learning_rate": 1.910034602076125e-05,
1042
+ "loss": 0.3289,
1043
+ "step": 1310
1044
+ },
1045
+ {
1046
+ "epoch": 4.57,
1047
+ "grad_norm": 9.17880916595459,
1048
+ "learning_rate": 1.7716262975778548e-05,
1049
+ "loss": 0.4384,
1050
+ "step": 1320
1051
+ },
1052
+ {
1053
+ "epoch": 4.6,
1054
+ "grad_norm": 0.5825958251953125,
1055
+ "learning_rate": 1.6332179930795848e-05,
1056
+ "loss": 0.1629,
1057
+ "step": 1330
1058
+ },
1059
+ {
1060
+ "epoch": 4.64,
1061
+ "grad_norm": 3.7861948013305664,
1062
+ "learning_rate": 1.4948096885813149e-05,
1063
+ "loss": 0.2607,
1064
+ "step": 1340
1065
+ },
1066
+ {
1067
+ "epoch": 4.67,
1068
+ "grad_norm": 0.08592710644006729,
1069
+ "learning_rate": 1.356401384083045e-05,
1070
+ "loss": 0.23,
1071
+ "step": 1350
1072
+ },
1073
+ {
1074
+ "epoch": 4.71,
1075
+ "grad_norm": 3.897566556930542,
1076
+ "learning_rate": 1.2179930795847751e-05,
1077
+ "loss": 0.2927,
1078
+ "step": 1360
1079
+ },
1080
+ {
1081
+ "epoch": 4.74,
1082
+ "grad_norm": 3.4900457859039307,
1083
+ "learning_rate": 1.0795847750865053e-05,
1084
+ "loss": 0.3149,
1085
+ "step": 1370
1086
+ },
1087
+ {
1088
+ "epoch": 4.78,
1089
+ "grad_norm": 0.48032164573669434,
1090
+ "learning_rate": 9.411764705882354e-06,
1091
+ "loss": 0.3346,
1092
+ "step": 1380
1093
+ },
1094
+ {
1095
+ "epoch": 4.81,
1096
+ "grad_norm": 3.148484945297241,
1097
+ "learning_rate": 8.027681660899653e-06,
1098
+ "loss": 0.3006,
1099
+ "step": 1390
1100
+ },
1101
+ {
1102
+ "epoch": 4.84,
1103
+ "grad_norm": 8.376299858093262,
1104
+ "learning_rate": 6.6435986159169555e-06,
1105
+ "loss": 0.2886,
1106
+ "step": 1400
1107
+ },
1108
+ {
1109
+ "epoch": 4.84,
1110
+ "eval_accuracy": 0.8785276073619632,
1111
+ "eval_loss": 0.4145749807357788,
1112
+ "eval_runtime": 11.483,
1113
+ "eval_samples_per_second": 70.974,
1114
+ "eval_steps_per_second": 8.883,
1115
+ "step": 1400
1116
+ },
1117
+ {
1118
+ "epoch": 4.88,
1119
+ "grad_norm": 10.592811584472656,
1120
+ "learning_rate": 5.259515570934257e-06,
1121
+ "loss": 0.2866,
1122
+ "step": 1410
1123
+ },
1124
+ {
1125
+ "epoch": 4.91,
1126
+ "grad_norm": 1.9005374908447266,
1127
+ "learning_rate": 3.875432525951557e-06,
1128
+ "loss": 0.367,
1129
+ "step": 1420
1130
+ },
1131
+ {
1132
+ "epoch": 4.95,
1133
+ "grad_norm": 6.674309730529785,
1134
+ "learning_rate": 2.4913494809688584e-06,
1135
+ "loss": 0.2715,
1136
+ "step": 1430
1137
+ },
1138
+ {
1139
+ "epoch": 4.98,
1140
+ "grad_norm": 7.622613430023193,
1141
+ "learning_rate": 1.1072664359861592e-06,
1142
+ "loss": 0.3306,
1143
+ "step": 1440
1144
+ },
1145
  {
1146
  "epoch": 5.0,
1147
+ "step": 1445,
1148
+ "total_flos": 1.789030847196795e+18,
1149
+ "train_loss": 0.6435174308433664,
1150
+ "train_runtime": 776.1745,
1151
+ "train_samples_per_second": 29.742,
1152
+ "train_steps_per_second": 1.862
1153
  }
1154
  ],
1155
  "logging_steps": 10,
1156
+ "max_steps": 1445,
1157
+ "num_input_tokens_seen": 0,
1158
  "num_train_epochs": 5,
1159
  "save_steps": 100,
1160
+ "total_flos": 1.789030847196795e+18,
1161
+ "train_batch_size": 16,
1162
  "trial_name": null,
1163
  "trial_params": null
1164
  }