Raihan004 committed on
Commit
6dbba84
1 Parent(s): 057d317

Model save

Browse files
README.md CHANGED
@@ -22,7 +22,7 @@ model-index:
22
  metrics:
23
  - name: Accuracy
24
  type: accuracy
25
- value: 0.8066666666666666
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -32,8 +32,8 @@ should probably proofread and complete it, then remove this comment. -->
32
 
33
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
34
  It achieves the following results on the evaluation set:
35
- - Loss: 0.6526
36
- - Accuracy: 0.8067
37
 
38
  ## Model description
39
 
@@ -65,12 +65,12 @@ The following hyperparameters were used during training:
65
 
66
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
67
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
68
- | 1.1323 | 0.32 | 100 | 1.0434 | 0.7543 |
69
- | 0.7842 | 0.64 | 200 | 0.7772 | 0.7810 |
70
- | 0.6174 | 0.96 | 300 | 0.7121 | 0.7848 |
71
- | 0.5362 | 1.27 | 400 | 0.7486 | 0.7724 |
72
- | 0.4918 | 1.59 | 500 | 0.6675 | 0.8029 |
73
- | 0.4346 | 1.91 | 600 | 0.6526 | 0.8067 |
74
 
75
 
76
  ### Framework versions
 
22
  metrics:
23
  - name: Accuracy
24
  type: accuracy
25
+ value: 0.7742857142857142
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
32
 
33
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
34
  It achieves the following results on the evaluation set:
35
+ - Loss: 0.7692
36
+ - Accuracy: 0.7743
37
 
38
  ## Model description
39
 
 
65
 
66
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
67
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
68
+ | 0.4296 | 0.32 | 100 | 0.8159 | 0.7448 |
69
+ | 0.4083 | 0.64 | 200 | 0.6974 | 0.7886 |
70
+ | 0.3384 | 0.96 | 300 | 0.8743 | 0.7371 |
71
+ | 0.2812 | 1.27 | 400 | 0.7739 | 0.7676 |
72
+ | 0.251 | 1.59 | 500 | 0.7166 | 0.7857 |
73
+ | 0.2425 | 1.91 | 600 | 0.7692 | 0.7743 |
74
 
75
 
76
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,8 @@
1
  {
2
- "epoch": 10.0,
3
- "eval_accuracy": 0.799047619047619,
4
- "eval_loss": 0.6551458239555359,
5
- "eval_runtime": 15.0447,
6
- "eval_samples_per_second": 69.792,
7
- "eval_steps_per_second": 8.774,
8
- "total_flos": 7.776878731479245e+18,
9
- "train_loss": 0.2955047018209081,
10
- "train_runtime": 3022.5495,
11
- "train_samples_per_second": 33.2,
12
- "train_steps_per_second": 1.039
13
  }
 
1
  {
2
+ "epoch": 2.0,
3
+ "total_flos": 1.555375746295849e+18,
4
+ "train_loss": 0.7493580146959633,
5
+ "train_runtime": 668.3994,
6
+ "train_samples_per_second": 30.027,
7
+ "train_steps_per_second": 0.94
 
 
 
 
 
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:172eacc6266be44649695bb9e478304686feaaefc77ca43c1c34448163685899
3
  size 343248584
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09ece9292430947c4d04a44fb463dfd30531d4ca2b5b3af2515c644cdbc73d2c
3
  size 343248584
runs/Apr16_07-14-49_7b4e89625b83/events.out.tfevents.1713253320.7b4e89625b83.34.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:500feeedd7710f55b31c68ed8551959f5d0e9f62b0447129d0d06369f4766567
3
+ size 14498
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 10.0,
3
- "total_flos": 7.776878731479245e+18,
4
- "train_loss": 0.2955047018209081,
5
- "train_runtime": 3022.5495,
6
- "train_samples_per_second": 33.2,
7
- "train_steps_per_second": 1.039
8
  }
 
1
  {
2
+ "epoch": 2.0,
3
+ "total_flos": 1.555375746295849e+18,
4
+ "train_loss": 0.7493580146959633,
5
+ "train_runtime": 668.3994,
6
+ "train_samples_per_second": 30.027,
7
+ "train_steps_per_second": 0.94
8
  }
trainer_state.json CHANGED
@@ -1,1407 +1,300 @@
1
  {
2
- "best_metric": 0.6551458239555359,
3
  "best_model_checkpoint": "Action_model/checkpoint-600",
4
- "epoch": 10.0,
5
  "eval_steps": 100,
6
- "global_step": 3140,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.06,
13
- "grad_norm": 1.6790106296539307,
14
- "learning_rate": 9.936305732484077e-05,
15
- "loss": 2.1638,
16
  "step": 20
17
  },
18
  {
19
  "epoch": 0.13,
20
- "grad_norm": 1.8781601190567017,
21
- "learning_rate": 9.872611464968153e-05,
22
- "loss": 1.8702,
23
  "step": 40
24
  },
25
  {
26
  "epoch": 0.19,
27
- "grad_norm": 2.154752016067505,
28
- "learning_rate": 9.80891719745223e-05,
29
- "loss": 1.496,
30
  "step": 60
31
  },
32
  {
33
  "epoch": 0.25,
34
- "grad_norm": 2.715329885482788,
35
- "learning_rate": 9.745222929936307e-05,
36
- "loss": 1.2633,
37
  "step": 80
38
  },
39
  {
40
  "epoch": 0.32,
41
- "grad_norm": 2.8564321994781494,
42
- "learning_rate": 9.681528662420382e-05,
43
- "loss": 1.1382,
44
  "step": 100
45
  },
46
  {
47
  "epoch": 0.32,
48
- "eval_accuracy": 0.7676190476190476,
49
- "eval_loss": 1.0001901388168335,
50
- "eval_runtime": 19.3002,
51
- "eval_samples_per_second": 54.404,
52
- "eval_steps_per_second": 6.839,
53
  "step": 100
54
  },
55
  {
56
  "epoch": 0.38,
57
- "grad_norm": 2.2369894981384277,
58
- "learning_rate": 9.617834394904459e-05,
59
- "loss": 1.0084,
60
  "step": 120
61
  },
62
  {
63
  "epoch": 0.45,
64
- "grad_norm": 3.4310965538024902,
65
- "learning_rate": 9.554140127388536e-05,
66
- "loss": 0.9195,
67
  "step": 140
68
  },
69
  {
70
  "epoch": 0.51,
71
- "grad_norm": 3.0611016750335693,
72
- "learning_rate": 9.490445859872612e-05,
73
- "loss": 0.9319,
74
  "step": 160
75
  },
76
  {
77
  "epoch": 0.57,
78
- "grad_norm": 4.078617095947266,
79
- "learning_rate": 9.426751592356689e-05,
80
- "loss": 0.7825,
81
  "step": 180
82
  },
83
  {
84
  "epoch": 0.64,
85
- "grad_norm": 2.7804274559020996,
86
- "learning_rate": 9.363057324840766e-05,
87
- "loss": 0.782,
88
  "step": 200
89
  },
90
  {
91
  "epoch": 0.64,
92
- "eval_accuracy": 0.7676190476190476,
93
- "eval_loss": 0.7673064470291138,
94
- "eval_runtime": 15.1028,
95
- "eval_samples_per_second": 69.523,
96
- "eval_steps_per_second": 8.74,
97
  "step": 200
98
  },
99
  {
100
  "epoch": 0.7,
101
- "grad_norm": 1.4780933856964111,
102
- "learning_rate": 9.299363057324841e-05,
103
- "loss": 0.6899,
104
  "step": 220
105
  },
106
  {
107
  "epoch": 0.76,
108
- "grad_norm": 2.280031681060791,
109
- "learning_rate": 9.238853503184714e-05,
110
- "loss": 0.6333,
111
  "step": 240
112
  },
113
  {
114
  "epoch": 0.83,
115
- "grad_norm": 2.9191763401031494,
116
- "learning_rate": 9.17515923566879e-05,
117
- "loss": 0.7143,
118
  "step": 260
119
  },
120
  {
121
  "epoch": 0.89,
122
- "grad_norm": 2.002995729446411,
123
- "learning_rate": 9.111464968152866e-05,
124
- "loss": 0.6578,
125
  "step": 280
126
  },
127
  {
128
  "epoch": 0.96,
129
- "grad_norm": 3.061410427093506,
130
- "learning_rate": 9.047770700636943e-05,
131
- "loss": 0.6289,
132
  "step": 300
133
  },
134
  {
135
  "epoch": 0.96,
136
- "eval_accuracy": 0.7866666666666666,
137
- "eval_loss": 0.70728999376297,
138
- "eval_runtime": 15.1443,
139
- "eval_samples_per_second": 69.333,
140
- "eval_steps_per_second": 8.716,
141
  "step": 300
142
  },
143
  {
144
  "epoch": 1.02,
145
- "grad_norm": 3.693359375,
146
- "learning_rate": 8.984076433121019e-05,
147
- "loss": 0.5495,
148
  "step": 320
149
  },
150
  {
151
  "epoch": 1.08,
152
- "grad_norm": 3.992403507232666,
153
- "learning_rate": 8.920382165605096e-05,
154
- "loss": 0.5563,
155
  "step": 340
156
  },
157
  {
158
  "epoch": 1.15,
159
- "grad_norm": 2.6919829845428467,
160
- "learning_rate": 8.856687898089173e-05,
161
- "loss": 0.5681,
162
  "step": 360
163
  },
164
  {
165
  "epoch": 1.21,
166
- "grad_norm": 3.0533978939056396,
167
- "learning_rate": 8.796178343949045e-05,
168
- "loss": 0.5286,
169
  "step": 380
170
  },
171
  {
172
  "epoch": 1.27,
173
- "grad_norm": 2.9173035621643066,
174
- "learning_rate": 8.732484076433122e-05,
175
- "loss": 0.5028,
176
  "step": 400
177
  },
178
  {
179
  "epoch": 1.27,
180
- "eval_accuracy": 0.7685714285714286,
181
- "eval_loss": 0.7260778546333313,
182
- "eval_runtime": 17.5337,
183
- "eval_samples_per_second": 59.885,
184
- "eval_steps_per_second": 7.528,
185
  "step": 400
186
  },
187
  {
188
  "epoch": 1.34,
189
- "grad_norm": 1.7928365468978882,
190
- "learning_rate": 8.668789808917198e-05,
191
- "loss": 0.4379,
192
  "step": 420
193
  },
194
  {
195
  "epoch": 1.4,
196
- "grad_norm": 4.260186195373535,
197
- "learning_rate": 8.605095541401275e-05,
198
- "loss": 0.48,
199
  "step": 440
200
  },
201
  {
202
  "epoch": 1.46,
203
- "grad_norm": 2.8024277687072754,
204
- "learning_rate": 8.541401273885352e-05,
205
- "loss": 0.3689,
206
  "step": 460
207
  },
208
  {
209
  "epoch": 1.53,
210
- "grad_norm": 4.045362949371338,
211
- "learning_rate": 8.477707006369427e-05,
212
- "loss": 0.3922,
213
  "step": 480
214
  },
215
  {
216
  "epoch": 1.59,
217
- "grad_norm": 1.6472926139831543,
218
- "learning_rate": 8.414012738853504e-05,
219
- "loss": 0.4746,
220
  "step": 500
221
  },
222
  {
223
  "epoch": 1.59,
224
- "eval_accuracy": 0.7619047619047619,
225
- "eval_loss": 0.7463707327842712,
226
- "eval_runtime": 15.5218,
227
- "eval_samples_per_second": 67.647,
228
- "eval_steps_per_second": 8.504,
229
  "step": 500
230
  },
231
  {
232
  "epoch": 1.66,
233
- "grad_norm": 4.001276969909668,
234
- "learning_rate": 8.350318471337581e-05,
235
- "loss": 0.4123,
236
  "step": 520
237
  },
238
  {
239
  "epoch": 1.72,
240
- "grad_norm": 4.151864528656006,
241
- "learning_rate": 8.286624203821657e-05,
242
- "loss": 0.4626,
243
  "step": 540
244
  },
245
  {
246
  "epoch": 1.78,
247
- "grad_norm": 3.43729567527771,
248
- "learning_rate": 8.222929936305733e-05,
249
- "loss": 0.4279,
250
  "step": 560
251
  },
252
  {
253
  "epoch": 1.85,
254
- "grad_norm": 3.8993654251098633,
255
- "learning_rate": 8.159235668789809e-05,
256
- "loss": 0.4288,
257
  "step": 580
258
  },
259
  {
260
  "epoch": 1.91,
261
- "grad_norm": 3.0417702198028564,
262
- "learning_rate": 8.095541401273886e-05,
263
- "loss": 0.4298,
264
  "step": 600
265
  },
266
  {
267
  "epoch": 1.91,
268
- "eval_accuracy": 0.799047619047619,
269
- "eval_loss": 0.6551458239555359,
270
- "eval_runtime": 15.0487,
271
- "eval_samples_per_second": 69.773,
272
- "eval_steps_per_second": 8.771,
273
  "step": 600
274
  },
275
  {
276
  "epoch": 1.97,
277
- "grad_norm": 4.088045597076416,
278
- "learning_rate": 8.031847133757963e-05,
279
- "loss": 0.404,
280
  "step": 620
281
  },
282
  {
283
- "epoch": 2.04,
284
- "grad_norm": 2.1314449310302734,
285
- "learning_rate": 7.968152866242038e-05,
286
- "loss": 0.4096,
287
- "step": 640
288
- },
289
- {
290
- "epoch": 2.1,
291
- "grad_norm": 5.116842746734619,
292
- "learning_rate": 7.904458598726115e-05,
293
- "loss": 0.3332,
294
- "step": 660
295
- },
296
- {
297
- "epoch": 2.17,
298
- "grad_norm": 4.045914649963379,
299
- "learning_rate": 7.840764331210192e-05,
300
- "loss": 0.2961,
301
- "step": 680
302
- },
303
- {
304
- "epoch": 2.23,
305
- "grad_norm": 1.9349473714828491,
306
- "learning_rate": 7.777070063694268e-05,
307
- "loss": 0.3488,
308
- "step": 700
309
- },
310
- {
311
- "epoch": 2.23,
312
- "eval_accuracy": 0.7733333333333333,
313
- "eval_loss": 0.7358552813529968,
314
- "eval_runtime": 15.4659,
315
- "eval_samples_per_second": 67.891,
316
- "eval_steps_per_second": 8.535,
317
- "step": 700
318
- },
319
- {
320
- "epoch": 2.29,
321
- "grad_norm": 4.3049845695495605,
322
- "learning_rate": 7.713375796178345e-05,
323
- "loss": 0.3078,
324
- "step": 720
325
- },
326
- {
327
- "epoch": 2.36,
328
- "grad_norm": 2.4363415241241455,
329
- "learning_rate": 7.649681528662422e-05,
330
- "loss": 0.4005,
331
- "step": 740
332
- },
333
- {
334
- "epoch": 2.42,
335
- "grad_norm": 3.518944501876831,
336
- "learning_rate": 7.585987261146497e-05,
337
- "loss": 0.3428,
338
- "step": 760
339
- },
340
- {
341
- "epoch": 2.48,
342
- "grad_norm": 3.8007328510284424,
343
- "learning_rate": 7.522292993630574e-05,
344
- "loss": 0.3471,
345
- "step": 780
346
- },
347
- {
348
- "epoch": 2.55,
349
- "grad_norm": 1.3215352296829224,
350
- "learning_rate": 7.45859872611465e-05,
351
- "loss": 0.266,
352
- "step": 800
353
- },
354
- {
355
- "epoch": 2.55,
356
- "eval_accuracy": 0.7514285714285714,
357
- "eval_loss": 0.829559862613678,
358
- "eval_runtime": 15.0559,
359
- "eval_samples_per_second": 69.74,
360
- "eval_steps_per_second": 8.767,
361
- "step": 800
362
- },
363
- {
364
- "epoch": 2.61,
365
- "grad_norm": 1.9990004301071167,
366
- "learning_rate": 7.394904458598727e-05,
367
- "loss": 0.2918,
368
- "step": 820
369
- },
370
- {
371
- "epoch": 2.68,
372
- "grad_norm": 5.201882362365723,
373
- "learning_rate": 7.331210191082802e-05,
374
- "loss": 0.3175,
375
- "step": 840
376
- },
377
- {
378
- "epoch": 2.74,
379
- "grad_norm": 4.187939167022705,
380
- "learning_rate": 7.267515923566879e-05,
381
- "loss": 0.304,
382
- "step": 860
383
- },
384
- {
385
- "epoch": 2.8,
386
- "grad_norm": 2.6038854122161865,
387
- "learning_rate": 7.203821656050955e-05,
388
- "loss": 0.3119,
389
- "step": 880
390
- },
391
- {
392
- "epoch": 2.87,
393
- "grad_norm": 3.0886316299438477,
394
- "learning_rate": 7.140127388535032e-05,
395
- "loss": 0.3651,
396
- "step": 900
397
- },
398
- {
399
- "epoch": 2.87,
400
- "eval_accuracy": 0.7304761904761905,
401
- "eval_loss": 0.8660680651664734,
402
- "eval_runtime": 15.7752,
403
- "eval_samples_per_second": 66.56,
404
- "eval_steps_per_second": 8.368,
405
- "step": 900
406
- },
407
- {
408
- "epoch": 2.93,
409
- "grad_norm": 3.3455846309661865,
410
- "learning_rate": 7.076433121019108e-05,
411
- "loss": 0.2739,
412
- "step": 920
413
- },
414
- {
415
- "epoch": 2.99,
416
- "grad_norm": 3.0882346630096436,
417
- "learning_rate": 7.012738853503184e-05,
418
- "loss": 0.2327,
419
- "step": 940
420
- },
421
- {
422
- "epoch": 3.06,
423
- "grad_norm": 4.43536901473999,
424
- "learning_rate": 6.949044585987261e-05,
425
- "loss": 0.2699,
426
- "step": 960
427
- },
428
- {
429
- "epoch": 3.12,
430
- "grad_norm": 3.8387985229492188,
431
- "learning_rate": 6.885350318471338e-05,
432
- "loss": 0.295,
433
- "step": 980
434
- },
435
- {
436
- "epoch": 3.18,
437
- "grad_norm": 2.8090808391571045,
438
- "learning_rate": 6.821656050955413e-05,
439
- "loss": 0.2796,
440
- "step": 1000
441
- },
442
- {
443
- "epoch": 3.18,
444
- "eval_accuracy": 0.7866666666666666,
445
- "eval_loss": 0.7188078165054321,
446
- "eval_runtime": 15.2769,
447
- "eval_samples_per_second": 68.731,
448
- "eval_steps_per_second": 8.641,
449
- "step": 1000
450
- },
451
- {
452
- "epoch": 3.25,
453
- "grad_norm": 2.789705991744995,
454
- "learning_rate": 6.75796178343949e-05,
455
- "loss": 0.234,
456
- "step": 1020
457
- },
458
- {
459
- "epoch": 3.31,
460
- "grad_norm": 4.02871561050415,
461
- "learning_rate": 6.694267515923567e-05,
462
- "loss": 0.2282,
463
- "step": 1040
464
- },
465
- {
466
- "epoch": 3.38,
467
- "grad_norm": 2.786869764328003,
468
- "learning_rate": 6.630573248407643e-05,
469
- "loss": 0.3052,
470
- "step": 1060
471
- },
472
- {
473
- "epoch": 3.44,
474
- "grad_norm": 3.5847015380859375,
475
- "learning_rate": 6.56687898089172e-05,
476
- "loss": 0.2343,
477
- "step": 1080
478
- },
479
- {
480
- "epoch": 3.5,
481
- "grad_norm": 2.2771642208099365,
482
- "learning_rate": 6.503184713375797e-05,
483
- "loss": 0.2703,
484
- "step": 1100
485
- },
486
- {
487
- "epoch": 3.5,
488
- "eval_accuracy": 0.7476190476190476,
489
- "eval_loss": 0.8421508073806763,
490
- "eval_runtime": 15.1919,
491
- "eval_samples_per_second": 69.116,
492
- "eval_steps_per_second": 8.689,
493
- "step": 1100
494
- },
495
- {
496
- "epoch": 3.57,
497
- "grad_norm": 3.3567535877227783,
498
- "learning_rate": 6.439490445859872e-05,
499
- "loss": 0.2429,
500
- "step": 1120
501
- },
502
- {
503
- "epoch": 3.63,
504
- "grad_norm": 4.021266937255859,
505
- "learning_rate": 6.375796178343949e-05,
506
- "loss": 0.2875,
507
- "step": 1140
508
- },
509
- {
510
- "epoch": 3.69,
511
- "grad_norm": 1.5646867752075195,
512
- "learning_rate": 6.312101910828026e-05,
513
- "loss": 0.2355,
514
- "step": 1160
515
- },
516
- {
517
- "epoch": 3.76,
518
- "grad_norm": 0.8209928274154663,
519
- "learning_rate": 6.248407643312102e-05,
520
- "loss": 0.2367,
521
- "step": 1180
522
- },
523
- {
524
- "epoch": 3.82,
525
- "grad_norm": 5.591761589050293,
526
- "learning_rate": 6.184713375796178e-05,
527
- "loss": 0.2608,
528
- "step": 1200
529
- },
530
- {
531
- "epoch": 3.82,
532
- "eval_accuracy": 0.7723809523809524,
533
- "eval_loss": 0.8207409381866455,
534
- "eval_runtime": 15.3778,
535
- "eval_samples_per_second": 68.28,
536
- "eval_steps_per_second": 8.584,
537
- "step": 1200
538
- },
539
- {
540
- "epoch": 3.89,
541
- "grad_norm": 2.2955307960510254,
542
- "learning_rate": 6.121019108280255e-05,
543
- "loss": 0.2174,
544
- "step": 1220
545
- },
546
- {
547
- "epoch": 3.95,
548
- "grad_norm": 4.40664005279541,
549
- "learning_rate": 6.057324840764331e-05,
550
- "loss": 0.2168,
551
- "step": 1240
552
- },
553
- {
554
- "epoch": 4.01,
555
- "grad_norm": 4.821913719177246,
556
- "learning_rate": 5.993630573248408e-05,
557
- "loss": 0.251,
558
- "step": 1260
559
- },
560
- {
561
- "epoch": 4.08,
562
- "grad_norm": 6.526182174682617,
563
- "learning_rate": 5.929936305732484e-05,
564
- "loss": 0.2424,
565
- "step": 1280
566
- },
567
- {
568
- "epoch": 4.14,
569
- "grad_norm": 1.996484398841858,
570
- "learning_rate": 5.86624203821656e-05,
571
- "loss": 0.251,
572
- "step": 1300
573
- },
574
- {
575
- "epoch": 4.14,
576
- "eval_accuracy": 0.7266666666666667,
577
- "eval_loss": 1.0251611471176147,
578
- "eval_runtime": 14.903,
579
- "eval_samples_per_second": 70.455,
580
- "eval_steps_per_second": 8.857,
581
- "step": 1300
582
- },
583
- {
584
- "epoch": 4.2,
585
- "grad_norm": 2.93237566947937,
586
- "learning_rate": 5.802547770700637e-05,
587
- "loss": 0.1727,
588
- "step": 1320
589
- },
590
- {
591
- "epoch": 4.27,
592
- "grad_norm": 3.0160069465637207,
593
- "learning_rate": 5.7388535031847135e-05,
594
- "loss": 0.2049,
595
- "step": 1340
596
- },
597
- {
598
- "epoch": 4.33,
599
- "grad_norm": 2.878361701965332,
600
- "learning_rate": 5.67515923566879e-05,
601
- "loss": 0.2221,
602
- "step": 1360
603
- },
604
- {
605
- "epoch": 4.39,
606
- "grad_norm": 3.4867329597473145,
607
- "learning_rate": 5.6114649681528666e-05,
608
- "loss": 0.2459,
609
- "step": 1380
610
- },
611
- {
612
- "epoch": 4.46,
613
- "grad_norm": 4.972071170806885,
614
- "learning_rate": 5.547770700636943e-05,
615
- "loss": 0.2085,
616
- "step": 1400
617
- },
618
- {
619
- "epoch": 4.46,
620
- "eval_accuracy": 0.7171428571428572,
621
- "eval_loss": 1.0474802255630493,
622
- "eval_runtime": 15.353,
623
- "eval_samples_per_second": 68.391,
624
- "eval_steps_per_second": 8.598,
625
- "step": 1400
626
- },
627
- {
628
- "epoch": 4.52,
629
- "grad_norm": 3.7226266860961914,
630
- "learning_rate": 5.484076433121019e-05,
631
- "loss": 0.22,
632
- "step": 1420
633
- },
634
- {
635
- "epoch": 4.59,
636
- "grad_norm": 3.0898613929748535,
637
- "learning_rate": 5.420382165605096e-05,
638
- "loss": 0.206,
639
- "step": 1440
640
- },
641
- {
642
- "epoch": 4.65,
643
- "grad_norm": 5.401129722595215,
644
- "learning_rate": 5.356687898089172e-05,
645
- "loss": 0.2215,
646
- "step": 1460
647
- },
648
- {
649
- "epoch": 4.71,
650
- "grad_norm": 3.430591344833374,
651
- "learning_rate": 5.2929936305732485e-05,
652
- "loss": 0.1883,
653
- "step": 1480
654
- },
655
- {
656
- "epoch": 4.78,
657
- "grad_norm": 0.33961915969848633,
658
- "learning_rate": 5.229299363057325e-05,
659
- "loss": 0.1715,
660
- "step": 1500
661
- },
662
- {
663
- "epoch": 4.78,
664
- "eval_accuracy": 0.7495238095238095,
665
- "eval_loss": 0.8852301836013794,
666
- "eval_runtime": 15.0006,
667
- "eval_samples_per_second": 69.997,
668
- "eval_steps_per_second": 8.8,
669
- "step": 1500
670
- },
671
- {
672
- "epoch": 4.84,
673
- "grad_norm": 3.065584182739258,
674
- "learning_rate": 5.1656050955414016e-05,
675
- "loss": 0.1687,
676
- "step": 1520
677
- },
678
- {
679
- "epoch": 4.9,
680
- "grad_norm": 4.120232582092285,
681
- "learning_rate": 5.101910828025478e-05,
682
- "loss": 0.214,
683
- "step": 1540
684
- },
685
- {
686
- "epoch": 4.97,
687
- "grad_norm": 0.5130987763404846,
688
- "learning_rate": 5.038216560509554e-05,
689
- "loss": 0.1631,
690
- "step": 1560
691
- },
692
- {
693
- "epoch": 5.03,
694
- "grad_norm": 4.085451126098633,
695
- "learning_rate": 4.974522292993631e-05,
696
- "loss": 0.1907,
697
- "step": 1580
698
- },
699
- {
700
- "epoch": 5.1,
701
- "grad_norm": 3.028500556945801,
702
- "learning_rate": 4.910828025477707e-05,
703
- "loss": 0.2051,
704
- "step": 1600
705
- },
706
- {
707
- "epoch": 5.1,
708
- "eval_accuracy": 0.7790476190476191,
709
- "eval_loss": 0.8164414763450623,
710
- "eval_runtime": 15.3889,
711
- "eval_samples_per_second": 68.231,
712
- "eval_steps_per_second": 8.578,
713
- "step": 1600
714
- },
715
- {
716
- "epoch": 5.16,
717
- "grad_norm": 2.5563900470733643,
718
- "learning_rate": 4.8471337579617835e-05,
719
- "loss": 0.187,
720
- "step": 1620
721
- },
722
- {
723
- "epoch": 5.22,
724
- "grad_norm": 3.853022813796997,
725
- "learning_rate": 4.7834394904458604e-05,
726
- "loss": 0.2186,
727
- "step": 1640
728
- },
729
- {
730
- "epoch": 5.29,
731
- "grad_norm": 6.835115909576416,
732
- "learning_rate": 4.7197452229299366e-05,
733
- "loss": 0.1717,
734
- "step": 1660
735
- },
736
- {
737
- "epoch": 5.35,
738
- "grad_norm": 3.7477526664733887,
739
- "learning_rate": 4.656050955414013e-05,
740
- "loss": 0.2352,
741
- "step": 1680
742
- },
743
- {
744
- "epoch": 5.41,
745
- "grad_norm": 3.5091373920440674,
746
- "learning_rate": 4.59235668789809e-05,
747
- "loss": 0.1481,
748
- "step": 1700
749
- },
750
- {
751
- "epoch": 5.41,
752
- "eval_accuracy": 0.7628571428571429,
753
- "eval_loss": 0.8825291991233826,
754
- "eval_runtime": 15.2694,
755
- "eval_samples_per_second": 68.765,
756
- "eval_steps_per_second": 8.645,
757
- "step": 1700
758
- },
759
- {
760
- "epoch": 5.48,
761
- "grad_norm": 4.236053943634033,
762
- "learning_rate": 4.528662420382166e-05,
763
- "loss": 0.1482,
764
- "step": 1720
765
- },
766
- {
767
- "epoch": 5.54,
768
- "grad_norm": 3.335090160369873,
769
- "learning_rate": 4.464968152866242e-05,
770
- "loss": 0.1399,
771
- "step": 1740
772
- },
773
- {
774
- "epoch": 5.61,
775
- "grad_norm": 2.238372802734375,
776
- "learning_rate": 4.4012738853503185e-05,
777
- "loss": 0.1664,
778
- "step": 1760
779
- },
780
- {
781
- "epoch": 5.67,
782
- "grad_norm": 1.2184184789657593,
783
- "learning_rate": 4.3375796178343954e-05,
784
- "loss": 0.179,
785
- "step": 1780
786
- },
787
- {
788
- "epoch": 5.73,
789
- "grad_norm": 7.519371509552002,
790
- "learning_rate": 4.2738853503184716e-05,
791
- "loss": 0.177,
792
- "step": 1800
793
- },
794
- {
795
- "epoch": 5.73,
796
- "eval_accuracy": 0.7866666666666666,
797
- "eval_loss": 0.8622841238975525,
798
- "eval_runtime": 16.3495,
799
- "eval_samples_per_second": 64.222,
800
- "eval_steps_per_second": 8.074,
801
- "step": 1800
802
- },
803
- {
804
- "epoch": 5.8,
805
- "grad_norm": 3.128350019454956,
806
- "learning_rate": 4.210191082802548e-05,
807
- "loss": 0.1943,
808
- "step": 1820
809
- },
810
- {
811
- "epoch": 5.86,
812
- "grad_norm": 3.5789175033569336,
813
- "learning_rate": 4.146496815286625e-05,
814
- "loss": 0.1431,
815
- "step": 1840
816
- },
817
- {
818
- "epoch": 5.92,
819
- "grad_norm": 5.671403408050537,
820
- "learning_rate": 4.082802547770701e-05,
821
- "loss": 0.1761,
822
- "step": 1860
823
- },
824
- {
825
- "epoch": 5.99,
826
- "grad_norm": 2.8949170112609863,
827
- "learning_rate": 4.019108280254777e-05,
828
- "loss": 0.1639,
829
- "step": 1880
830
- },
831
- {
832
- "epoch": 6.05,
833
- "grad_norm": 6.811347961425781,
834
- "learning_rate": 3.955414012738854e-05,
835
- "loss": 0.1607,
836
- "step": 1900
837
- },
838
- {
839
- "epoch": 6.05,
840
- "eval_accuracy": 0.7609523809523809,
841
- "eval_loss": 0.948749303817749,
842
- "eval_runtime": 15.6299,
843
- "eval_samples_per_second": 67.179,
844
- "eval_steps_per_second": 8.445,
845
- "step": 1900
846
- },
847
- {
848
- "epoch": 6.11,
849
- "grad_norm": 3.2780776023864746,
850
- "learning_rate": 3.8917197452229304e-05,
851
- "loss": 0.2392,
852
- "step": 1920
853
- },
854
- {
855
- "epoch": 6.18,
856
- "grad_norm": 1.425671935081482,
857
- "learning_rate": 3.8280254777070066e-05,
858
- "loss": 0.1988,
859
- "step": 1940
860
- },
861
- {
862
- "epoch": 6.24,
863
- "grad_norm": 4.055123329162598,
864
- "learning_rate": 3.7643312101910836e-05,
865
- "loss": 0.1563,
866
- "step": 1960
867
- },
868
- {
869
- "epoch": 6.31,
870
- "grad_norm": 5.6207356452941895,
871
- "learning_rate": 3.700636942675159e-05,
872
- "loss": 0.1364,
873
- "step": 1980
874
- },
875
- {
876
- "epoch": 6.37,
877
- "grad_norm": 0.6465654373168945,
878
- "learning_rate": 3.6369426751592353e-05,
879
- "loss": 0.1273,
880
- "step": 2000
881
- },
882
- {
883
- "epoch": 6.37,
884
- "eval_accuracy": 0.7733333333333333,
885
- "eval_loss": 0.8984624743461609,
886
- "eval_runtime": 15.237,
887
- "eval_samples_per_second": 68.911,
888
- "eval_steps_per_second": 8.663,
889
- "step": 2000
890
- },
891
- {
892
- "epoch": 6.43,
893
- "grad_norm": 3.3592026233673096,
894
- "learning_rate": 3.573248407643312e-05,
895
- "loss": 0.1903,
896
- "step": 2020
897
- },
898
- {
899
- "epoch": 6.5,
900
- "grad_norm": 2.187608480453491,
901
- "learning_rate": 3.5095541401273885e-05,
902
- "loss": 0.1684,
903
- "step": 2040
904
- },
905
- {
906
- "epoch": 6.56,
907
- "grad_norm": 2.657270908355713,
908
- "learning_rate": 3.445859872611465e-05,
909
- "loss": 0.1619,
910
- "step": 2060
911
- },
912
- {
913
- "epoch": 6.62,
914
- "grad_norm": 2.2679970264434814,
915
- "learning_rate": 3.3821656050955416e-05,
916
- "loss": 0.1556,
917
- "step": 2080
918
- },
919
- {
920
- "epoch": 6.69,
921
- "grad_norm": 1.9460710287094116,
922
- "learning_rate": 3.318471337579618e-05,
923
- "loss": 0.1609,
924
- "step": 2100
925
- },
926
- {
927
- "epoch": 6.69,
928
- "eval_accuracy": 0.7504761904761905,
929
- "eval_loss": 0.9624072313308716,
930
- "eval_runtime": 15.076,
931
- "eval_samples_per_second": 69.647,
932
- "eval_steps_per_second": 8.756,
933
- "step": 2100
934
- },
935
- {
936
- "epoch": 6.75,
937
- "grad_norm": 2.056673526763916,
938
- "learning_rate": 3.254777070063694e-05,
939
- "loss": 0.1779,
940
- "step": 2120
941
- },
942
- {
943
- "epoch": 6.82,
944
- "grad_norm": 2.4007034301757812,
945
- "learning_rate": 3.191082802547771e-05,
946
- "loss": 0.1359,
947
- "step": 2140
948
- },
949
- {
950
- "epoch": 6.88,
951
- "grad_norm": 6.746215343475342,
952
- "learning_rate": 3.127388535031847e-05,
953
- "loss": 0.1653,
954
- "step": 2160
955
- },
956
- {
957
- "epoch": 6.94,
958
- "grad_norm": 3.8807878494262695,
959
- "learning_rate": 3.0636942675159235e-05,
960
- "loss": 0.1434,
961
- "step": 2180
962
- },
963
- {
964
- "epoch": 7.01,
965
- "grad_norm": 4.5821990966796875,
966
- "learning_rate": 3e-05,
967
- "loss": 0.1583,
968
- "step": 2200
969
- },
970
- {
971
- "epoch": 7.01,
972
- "eval_accuracy": 0.7780952380952381,
973
- "eval_loss": 0.9015449285507202,
974
- "eval_runtime": 15.3184,
975
- "eval_samples_per_second": 68.545,
976
- "eval_steps_per_second": 8.617,
977
- "step": 2200
978
- },
979
- {
980
- "epoch": 7.07,
981
- "grad_norm": 3.755427837371826,
982
- "learning_rate": 2.9363057324840763e-05,
983
- "loss": 0.1251,
984
- "step": 2220
985
- },
986
- {
987
- "epoch": 7.13,
988
- "grad_norm": 2.709980010986328,
989
- "learning_rate": 2.872611464968153e-05,
990
- "loss": 0.1233,
991
- "step": 2240
992
- },
993
- {
994
- "epoch": 7.2,
995
- "grad_norm": 2.7730648517608643,
996
- "learning_rate": 2.8089171974522295e-05,
997
- "loss": 0.1019,
998
- "step": 2260
999
- },
1000
- {
1001
- "epoch": 7.26,
1002
- "grad_norm": 4.75289249420166,
1003
- "learning_rate": 2.7452229299363057e-05,
1004
- "loss": 0.1862,
1005
- "step": 2280
1006
- },
1007
- {
1008
- "epoch": 7.32,
1009
- "grad_norm": 2.336014747619629,
1010
- "learning_rate": 2.6815286624203823e-05,
1011
- "loss": 0.1178,
1012
- "step": 2300
1013
- },
1014
- {
1015
- "epoch": 7.32,
1016
- "eval_accuracy": 0.7761904761904762,
1017
- "eval_loss": 0.9142788648605347,
1018
- "eval_runtime": 15.5959,
1019
- "eval_samples_per_second": 67.325,
1020
- "eval_steps_per_second": 8.464,
1021
- "step": 2300
1022
- },
1023
- {
1024
- "epoch": 7.39,
1025
- "grad_norm": 5.943305015563965,
1026
- "learning_rate": 2.617834394904459e-05,
1027
- "loss": 0.146,
1028
- "step": 2320
1029
- },
1030
- {
1031
- "epoch": 7.45,
1032
- "grad_norm": 1.1991711854934692,
1033
- "learning_rate": 2.554140127388535e-05,
1034
- "loss": 0.1185,
1035
- "step": 2340
1036
- },
1037
- {
1038
- "epoch": 7.52,
1039
- "grad_norm": 1.5695414543151855,
1040
- "learning_rate": 2.4904458598726117e-05,
1041
- "loss": 0.1437,
1042
- "step": 2360
1043
- },
1044
- {
1045
- "epoch": 7.58,
1046
- "grad_norm": 0.3648098111152649,
1047
- "learning_rate": 2.426751592356688e-05,
1048
- "loss": 0.1028,
1049
- "step": 2380
1050
- },
1051
- {
1052
- "epoch": 7.64,
1053
- "grad_norm": 2.0215370655059814,
1054
- "learning_rate": 2.3630573248407645e-05,
1055
- "loss": 0.1175,
1056
- "step": 2400
1057
- },
1058
- {
1059
- "epoch": 7.64,
1060
- "eval_accuracy": 0.7590476190476191,
1061
- "eval_loss": 0.9670929908752441,
1062
- "eval_runtime": 18.4333,
1063
- "eval_samples_per_second": 56.962,
1064
- "eval_steps_per_second": 7.161,
1065
- "step": 2400
1066
- },
1067
- {
1068
- "epoch": 7.71,
1069
- "grad_norm": 5.889522075653076,
1070
- "learning_rate": 2.299363057324841e-05,
1071
- "loss": 0.152,
1072
- "step": 2420
1073
- },
1074
- {
1075
- "epoch": 7.77,
1076
- "grad_norm": 3.53730845451355,
1077
- "learning_rate": 2.2356687898089173e-05,
1078
- "loss": 0.165,
1079
- "step": 2440
1080
- },
1081
- {
1082
- "epoch": 7.83,
1083
- "grad_norm": 2.178981065750122,
1084
- "learning_rate": 2.171974522292994e-05,
1085
- "loss": 0.1128,
1086
- "step": 2460
1087
- },
1088
- {
1089
- "epoch": 7.9,
1090
- "grad_norm": 0.19914887845516205,
1091
- "learning_rate": 2.1114649681528666e-05,
1092
- "loss": 0.1307,
1093
- "step": 2480
1094
- },
1095
- {
1096
- "epoch": 7.96,
1097
- "grad_norm": 2.177159070968628,
1098
- "learning_rate": 2.0477707006369428e-05,
1099
- "loss": 0.1257,
1100
- "step": 2500
1101
- },
1102
- {
1103
- "epoch": 7.96,
1104
- "eval_accuracy": 0.7838095238095238,
1105
- "eval_loss": 0.8925411701202393,
1106
- "eval_runtime": 17.5418,
1107
- "eval_samples_per_second": 59.857,
1108
- "eval_steps_per_second": 7.525,
1109
- "step": 2500
1110
- },
1111
- {
1112
- "epoch": 8.03,
1113
- "grad_norm": 2.6388609409332275,
1114
- "learning_rate": 1.9840764331210194e-05,
1115
- "loss": 0.1199,
1116
- "step": 2520
1117
- },
1118
- {
1119
- "epoch": 8.09,
1120
- "grad_norm": 4.0329155921936035,
1121
- "learning_rate": 1.9203821656050956e-05,
1122
- "loss": 0.1158,
1123
- "step": 2540
1124
- },
1125
- {
1126
- "epoch": 8.15,
1127
- "grad_norm": 2.214768409729004,
1128
- "learning_rate": 1.856687898089172e-05,
1129
- "loss": 0.148,
1130
- "step": 2560
1131
- },
1132
- {
1133
- "epoch": 8.22,
1134
- "grad_norm": 3.5198891162872314,
1135
- "learning_rate": 1.7929936305732484e-05,
1136
- "loss": 0.1107,
1137
- "step": 2580
1138
- },
1139
- {
1140
- "epoch": 8.28,
1141
- "grad_norm": 1.0330649614334106,
1142
- "learning_rate": 1.7292993630573247e-05,
1143
- "loss": 0.0939,
1144
- "step": 2600
1145
- },
1146
- {
1147
- "epoch": 8.28,
1148
- "eval_accuracy": 0.7704761904761904,
1149
- "eval_loss": 0.9257068634033203,
1150
- "eval_runtime": 15.0117,
1151
- "eval_samples_per_second": 69.945,
1152
- "eval_steps_per_second": 8.793,
1153
- "step": 2600
1154
- },
1155
- {
1156
- "epoch": 8.34,
1157
- "grad_norm": 1.8858942985534668,
1158
- "learning_rate": 1.6656050955414012e-05,
1159
- "loss": 0.1063,
1160
- "step": 2620
1161
- },
1162
- {
1163
- "epoch": 8.41,
1164
- "grad_norm": 2.4143009185791016,
1165
- "learning_rate": 1.6019108280254778e-05,
1166
- "loss": 0.1624,
1167
- "step": 2640
1168
- },
1169
- {
1170
- "epoch": 8.47,
1171
- "grad_norm": 0.2723791003227234,
1172
- "learning_rate": 1.538216560509554e-05,
1173
- "loss": 0.1109,
1174
- "step": 2660
1175
- },
1176
- {
1177
- "epoch": 8.54,
1178
- "grad_norm": 3.380007266998291,
1179
- "learning_rate": 1.4745222929936306e-05,
1180
- "loss": 0.1375,
1181
- "step": 2680
1182
- },
1183
- {
1184
- "epoch": 8.6,
1185
- "grad_norm": 1.9235315322875977,
1186
- "learning_rate": 1.410828025477707e-05,
1187
- "loss": 0.1238,
1188
- "step": 2700
1189
- },
1190
- {
1191
- "epoch": 8.6,
1192
- "eval_accuracy": 0.7647619047619048,
1193
- "eval_loss": 0.9797086715698242,
1194
- "eval_runtime": 15.6554,
1195
- "eval_samples_per_second": 67.07,
1196
- "eval_steps_per_second": 8.432,
1197
- "step": 2700
1198
- },
1199
- {
1200
- "epoch": 8.66,
1201
- "grad_norm": 2.037036180496216,
1202
- "learning_rate": 1.3471337579617834e-05,
1203
- "loss": 0.1381,
1204
- "step": 2720
1205
- },
1206
- {
1207
- "epoch": 8.73,
1208
- "grad_norm": 1.407027006149292,
1209
- "learning_rate": 1.2834394904458598e-05,
1210
- "loss": 0.0994,
1211
- "step": 2740
1212
- },
1213
- {
1214
- "epoch": 8.79,
1215
- "grad_norm": 1.2764071226119995,
1216
- "learning_rate": 1.2197452229299364e-05,
1217
- "loss": 0.1033,
1218
- "step": 2760
1219
- },
1220
- {
1221
- "epoch": 8.85,
1222
- "grad_norm": 2.9256701469421387,
1223
- "learning_rate": 1.1560509554140128e-05,
1224
- "loss": 0.0982,
1225
- "step": 2780
1226
- },
1227
- {
1228
- "epoch": 8.92,
1229
- "grad_norm": 5.594937324523926,
1230
- "learning_rate": 1.0923566878980892e-05,
1231
- "loss": 0.1219,
1232
- "step": 2800
1233
- },
1234
- {
1235
- "epoch": 8.92,
1236
- "eval_accuracy": 0.7723809523809524,
1237
- "eval_loss": 0.939895510673523,
1238
- "eval_runtime": 14.8747,
1239
- "eval_samples_per_second": 70.589,
1240
- "eval_steps_per_second": 8.874,
1241
- "step": 2800
1242
- },
1243
- {
1244
- "epoch": 8.98,
1245
- "grad_norm": 3.3030786514282227,
1246
- "learning_rate": 1.0286624203821656e-05,
1247
- "loss": 0.0996,
1248
- "step": 2820
1249
- },
1250
- {
1251
- "epoch": 9.04,
1252
- "grad_norm": 1.5488649606704712,
1253
- "learning_rate": 9.649681528662422e-06,
1254
- "loss": 0.1477,
1255
- "step": 2840
1256
- },
1257
- {
1258
- "epoch": 9.11,
1259
- "grad_norm": 4.593501091003418,
1260
- "learning_rate": 9.012738853503185e-06,
1261
- "loss": 0.1142,
1262
- "step": 2860
1263
- },
1264
- {
1265
- "epoch": 9.17,
1266
- "grad_norm": 6.411059379577637,
1267
- "learning_rate": 8.375796178343949e-06,
1268
- "loss": 0.1249,
1269
- "step": 2880
1270
- },
1271
- {
1272
- "epoch": 9.24,
1273
- "grad_norm": 2.7087924480438232,
1274
- "learning_rate": 7.738853503184713e-06,
1275
- "loss": 0.0985,
1276
- "step": 2900
1277
- },
1278
- {
1279
- "epoch": 9.24,
1280
- "eval_accuracy": 0.7647619047619048,
1281
- "eval_loss": 0.9940046072006226,
1282
- "eval_runtime": 15.2749,
1283
- "eval_samples_per_second": 68.74,
1284
- "eval_steps_per_second": 8.642,
1285
- "step": 2900
1286
- },
1287
- {
1288
- "epoch": 9.3,
1289
- "grad_norm": 5.092051029205322,
1290
- "learning_rate": 7.1019108280254775e-06,
1291
- "loss": 0.1043,
1292
- "step": 2920
1293
- },
1294
- {
1295
- "epoch": 9.36,
1296
- "grad_norm": 10.890628814697266,
1297
- "learning_rate": 6.464968152866242e-06,
1298
- "loss": 0.1174,
1299
- "step": 2940
1300
- },
1301
- {
1302
- "epoch": 9.43,
1303
- "grad_norm": 3.2413973808288574,
1304
- "learning_rate": 5.8280254777070065e-06,
1305
- "loss": 0.105,
1306
- "step": 2960
1307
- },
1308
- {
1309
- "epoch": 9.49,
1310
- "grad_norm": 3.838075876235962,
1311
- "learning_rate": 5.191082802547771e-06,
1312
- "loss": 0.0857,
1313
- "step": 2980
1314
- },
1315
- {
1316
- "epoch": 9.55,
1317
- "grad_norm": 2.082455635070801,
1318
- "learning_rate": 4.5541401273885346e-06,
1319
- "loss": 0.1069,
1320
- "step": 3000
1321
- },
1322
- {
1323
- "epoch": 9.55,
1324
- "eval_accuracy": 0.7742857142857142,
1325
- "eval_loss": 0.9392004013061523,
1326
- "eval_runtime": 15.3798,
1327
- "eval_samples_per_second": 68.271,
1328
- "eval_steps_per_second": 8.583,
1329
- "step": 3000
1330
- },
1331
- {
1332
- "epoch": 9.62,
1333
- "grad_norm": 0.4810134768486023,
1334
- "learning_rate": 3.9171974522292995e-06,
1335
- "loss": 0.0748,
1336
- "step": 3020
1337
- },
1338
- {
1339
- "epoch": 9.68,
1340
- "grad_norm": 1.6180094480514526,
1341
- "learning_rate": 3.280254777070064e-06,
1342
- "loss": 0.1049,
1343
- "step": 3040
1344
- },
1345
- {
1346
- "epoch": 9.75,
1347
- "grad_norm": 0.6423608660697937,
1348
- "learning_rate": 2.6433121019108284e-06,
1349
- "loss": 0.126,
1350
- "step": 3060
1351
- },
1352
- {
1353
- "epoch": 9.81,
1354
- "grad_norm": 1.139814853668213,
1355
- "learning_rate": 2.0063694267515925e-06,
1356
- "loss": 0.0813,
1357
- "step": 3080
1358
- },
1359
- {
1360
- "epoch": 9.87,
1361
- "grad_norm": 0.03859005495905876,
1362
- "learning_rate": 1.3694267515923567e-06,
1363
- "loss": 0.0589,
1364
- "step": 3100
1365
- },
1366
- {
1367
- "epoch": 9.87,
1368
- "eval_accuracy": 0.78,
1369
- "eval_loss": 0.9408173561096191,
1370
- "eval_runtime": 15.1635,
1371
- "eval_samples_per_second": 69.245,
1372
- "eval_steps_per_second": 8.705,
1373
- "step": 3100
1374
- },
1375
- {
1376
- "epoch": 9.94,
1377
- "grad_norm": 2.202677011489868,
1378
- "learning_rate": 7.324840764331211e-07,
1379
- "loss": 0.0856,
1380
- "step": 3120
1381
- },
1382
- {
1383
- "epoch": 10.0,
1384
- "grad_norm": 3.140180826187134,
1385
- "learning_rate": 9.554140127388536e-08,
1386
- "loss": 0.0997,
1387
- "step": 3140
1388
- },
1389
- {
1390
- "epoch": 10.0,
1391
- "step": 3140,
1392
- "total_flos": 7.776878731479245e+18,
1393
- "train_loss": 0.2955047018209081,
1394
- "train_runtime": 3022.5495,
1395
- "train_samples_per_second": 33.2,
1396
- "train_steps_per_second": 1.039
1397
  }
1398
  ],
1399
  "logging_steps": 20,
1400
- "max_steps": 3140,
1401
  "num_input_tokens_seen": 0,
1402
- "num_train_epochs": 10,
1403
  "save_steps": 100,
1404
- "total_flos": 7.776878731479245e+18,
1405
  "train_batch_size": 32,
1406
  "trial_name": null,
1407
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.6526193618774414,
3
  "best_model_checkpoint": "Action_model/checkpoint-600",
4
+ "epoch": 2.0,
5
  "eval_steps": 100,
6
+ "global_step": 628,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.06,
13
+ "grad_norm": 1.701284408569336,
14
+ "learning_rate": 9.681528662420382e-05,
15
+ "loss": 2.1749,
16
  "step": 20
17
  },
18
  {
19
  "epoch": 0.13,
20
+ "grad_norm": 1.666106939315796,
21
+ "learning_rate": 9.363057324840766e-05,
22
+ "loss": 1.8733,
23
  "step": 40
24
  },
25
  {
26
  "epoch": 0.19,
27
+ "grad_norm": 2.066211700439453,
28
+ "learning_rate": 9.044585987261147e-05,
29
+ "loss": 1.5213,
30
  "step": 60
31
  },
32
  {
33
  "epoch": 0.25,
34
+ "grad_norm": 2.146984815597534,
35
+ "learning_rate": 8.726114649681529e-05,
36
+ "loss": 1.2918,
37
  "step": 80
38
  },
39
  {
40
  "epoch": 0.32,
41
+ "grad_norm": 3.542285442352295,
42
+ "learning_rate": 8.407643312101911e-05,
43
+ "loss": 1.1323,
44
  "step": 100
45
  },
46
  {
47
  "epoch": 0.32,
48
+ "eval_accuracy": 0.7542857142857143,
49
+ "eval_loss": 1.0433851480484009,
50
+ "eval_runtime": 23.4991,
51
+ "eval_samples_per_second": 44.683,
52
+ "eval_steps_per_second": 5.617,
53
  "step": 100
54
  },
55
  {
56
  "epoch": 0.38,
57
+ "grad_norm": 3.3140709400177,
58
+ "learning_rate": 8.089171974522294e-05,
59
+ "loss": 1.0549,
60
  "step": 120
61
  },
62
  {
63
  "epoch": 0.45,
64
+ "grad_norm": 2.951521158218384,
65
+ "learning_rate": 7.770700636942676e-05,
66
+ "loss": 0.9484,
67
  "step": 140
68
  },
69
  {
70
  "epoch": 0.51,
71
+ "grad_norm": 2.615647554397583,
72
+ "learning_rate": 7.452229299363057e-05,
73
+ "loss": 0.9485,
74
  "step": 160
75
  },
76
  {
77
  "epoch": 0.57,
78
+ "grad_norm": 3.7320423126220703,
79
+ "learning_rate": 7.13375796178344e-05,
80
+ "loss": 0.8395,
81
  "step": 180
82
  },
83
  {
84
  "epoch": 0.64,
85
+ "grad_norm": 2.483612060546875,
86
+ "learning_rate": 6.815286624203822e-05,
87
+ "loss": 0.7842,
88
  "step": 200
89
  },
90
  {
91
  "epoch": 0.64,
92
+ "eval_accuracy": 0.780952380952381,
93
+ "eval_loss": 0.7771612405776978,
94
+ "eval_runtime": 14.6905,
95
+ "eval_samples_per_second": 71.475,
96
+ "eval_steps_per_second": 8.985,
97
  "step": 200
98
  },
99
  {
100
  "epoch": 0.7,
101
+ "grad_norm": 2.181293487548828,
102
+ "learning_rate": 6.512738853503185e-05,
103
+ "loss": 0.7154,
104
  "step": 220
105
  },
106
  {
107
  "epoch": 0.76,
108
+ "grad_norm": 3.2365713119506836,
109
+ "learning_rate": 6.194267515923567e-05,
110
+ "loss": 0.6236,
111
  "step": 240
112
  },
113
  {
114
  "epoch": 0.83,
115
+ "grad_norm": 3.164695978164673,
116
+ "learning_rate": 5.87579617834395e-05,
117
+ "loss": 0.706,
118
  "step": 260
119
  },
120
  {
121
  "epoch": 0.89,
122
+ "grad_norm": 3.6914055347442627,
123
+ "learning_rate": 5.5573248407643317e-05,
124
+ "loss": 0.6549,
125
  "step": 280
126
  },
127
  {
128
  "epoch": 0.96,
129
+ "grad_norm": 3.365011215209961,
130
+ "learning_rate": 5.238853503184714e-05,
131
+ "loss": 0.6174,
132
  "step": 300
133
  },
134
  {
135
  "epoch": 0.96,
136
+ "eval_accuracy": 0.7847619047619048,
137
+ "eval_loss": 0.7121406197547913,
138
+ "eval_runtime": 14.6509,
139
+ "eval_samples_per_second": 71.668,
140
+ "eval_steps_per_second": 9.01,
141
  "step": 300
142
  },
143
  {
144
  "epoch": 1.02,
145
+ "grad_norm": 3.2638425827026367,
146
+ "learning_rate": 4.9363057324840765e-05,
147
+ "loss": 0.5646,
148
  "step": 320
149
  },
150
  {
151
  "epoch": 1.08,
152
+ "grad_norm": 4.049094200134277,
153
+ "learning_rate": 4.617834394904459e-05,
154
+ "loss": 0.5271,
155
  "step": 340
156
  },
157
  {
158
  "epoch": 1.15,
159
+ "grad_norm": 2.9733641147613525,
160
+ "learning_rate": 4.299363057324841e-05,
161
+ "loss": 0.5735,
162
  "step": 360
163
  },
164
  {
165
  "epoch": 1.21,
166
+ "grad_norm": 3.511613130569458,
167
+ "learning_rate": 3.9808917197452234e-05,
168
+ "loss": 0.5137,
169
  "step": 380
170
  },
171
  {
172
  "epoch": 1.27,
173
+ "grad_norm": 2.1663146018981934,
174
+ "learning_rate": 3.662420382165605e-05,
175
+ "loss": 0.5362,
176
  "step": 400
177
  },
178
  {
179
  "epoch": 1.27,
180
+ "eval_accuracy": 0.7723809523809524,
181
+ "eval_loss": 0.7485681772232056,
182
+ "eval_runtime": 14.6889,
183
+ "eval_samples_per_second": 71.483,
184
+ "eval_steps_per_second": 8.986,
185
  "step": 400
186
  },
187
  {
188
  "epoch": 1.34,
189
+ "grad_norm": 3.5004124641418457,
190
+ "learning_rate": 3.343949044585987e-05,
191
+ "loss": 0.468,
192
  "step": 420
193
  },
194
  {
195
  "epoch": 1.4,
196
+ "grad_norm": 2.454324722290039,
197
+ "learning_rate": 3.0254777070063693e-05,
198
+ "loss": 0.474,
199
  "step": 440
200
  },
201
  {
202
  "epoch": 1.46,
203
+ "grad_norm": 3.5527968406677246,
204
+ "learning_rate": 2.707006369426752e-05,
205
+ "loss": 0.3886,
206
  "step": 460
207
  },
208
  {
209
  "epoch": 1.53,
210
+ "grad_norm": 4.704432487487793,
211
+ "learning_rate": 2.388535031847134e-05,
212
+ "loss": 0.3968,
213
  "step": 480
214
  },
215
  {
216
  "epoch": 1.59,
217
+ "grad_norm": 3.4028120040893555,
218
+ "learning_rate": 2.0700636942675162e-05,
219
+ "loss": 0.4918,
220
  "step": 500
221
  },
222
  {
223
  "epoch": 1.59,
224
+ "eval_accuracy": 0.8028571428571428,
225
+ "eval_loss": 0.6674752235412598,
226
+ "eval_runtime": 14.482,
227
+ "eval_samples_per_second": 72.504,
228
+ "eval_steps_per_second": 9.115,
229
  "step": 500
230
  },
231
  {
232
  "epoch": 1.66,
233
+ "grad_norm": 5.370004177093506,
234
+ "learning_rate": 1.751592356687898e-05,
235
+ "loss": 0.4256,
236
  "step": 520
237
  },
238
  {
239
  "epoch": 1.72,
240
+ "grad_norm": 3.2904512882232666,
241
+ "learning_rate": 1.4490445859872612e-05,
242
+ "loss": 0.4463,
243
  "step": 540
244
  },
245
  {
246
  "epoch": 1.78,
247
+ "grad_norm": 4.211310863494873,
248
+ "learning_rate": 1.1305732484076434e-05,
249
+ "loss": 0.4229,
250
  "step": 560
251
  },
252
  {
253
  "epoch": 1.85,
254
+ "grad_norm": 3.5312132835388184,
255
+ "learning_rate": 8.121019108280255e-06,
256
+ "loss": 0.4013,
257
  "step": 580
258
  },
259
  {
260
  "epoch": 1.91,
261
+ "grad_norm": 4.966080665588379,
262
+ "learning_rate": 4.936305732484077e-06,
263
+ "loss": 0.4346,
264
  "step": 600
265
  },
266
  {
267
  "epoch": 1.91,
268
+ "eval_accuracy": 0.8066666666666666,
269
+ "eval_loss": 0.6526193618774414,
270
+ "eval_runtime": 14.3611,
271
+ "eval_samples_per_second": 73.114,
272
+ "eval_steps_per_second": 9.192,
273
  "step": 600
274
  },
275
  {
276
  "epoch": 1.97,
277
+ "grad_norm": 2.851719856262207,
278
+ "learning_rate": 1.7515923566878982e-06,
279
+ "loss": 0.4156,
280
  "step": 620
281
  },
282
  {
283
+ "epoch": 2.0,
284
+ "step": 628,
285
+ "total_flos": 1.555375746295849e+18,
286
+ "train_loss": 0.7493580146959633,
287
+ "train_runtime": 668.3994,
288
+ "train_samples_per_second": 30.027,
289
+ "train_steps_per_second": 0.94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  }
291
  ],
292
  "logging_steps": 20,
293
+ "max_steps": 628,
294
  "num_input_tokens_seen": 0,
295
+ "num_train_epochs": 2,
296
  "save_steps": 100,
297
+ "total_flos": 1.555375746295849e+18,
298
  "train_batch_size": 32,
299
  "trial_name": null,
300
  "trial_params": null