Raihan004 commited on
Commit
008f4bc
1 Parent(s): 1ee945f

🍻 cheers

Browse files
README.md CHANGED
@@ -2,6 +2,7 @@
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
 
5
  - generated_from_trainer
6
  datasets:
7
  - imagefolder
@@ -14,7 +15,7 @@ model-index:
14
  name: Image Classification
15
  type: image-classification
16
  dataset:
17
- name: imagefolder
18
  type: imagefolder
19
  config: default
20
  split: train
@@ -30,7 +31,7 @@ should probably proofread and complete it, then remove this comment. -->
30
 
31
  # Action_model
32
 
33
- This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
34
  It achieves the following results on the evaluation set:
35
  - Loss: 1.1399
36
  - Accuracy: 0.7891
 
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
5
+ - image-classification
6
  - generated_from_trainer
7
  datasets:
8
  - imagefolder
 
15
  name: Image Classification
16
  type: image-classification
17
  dataset:
18
+ name: action_class
19
  type: imagefolder
20
  config: default
21
  split: train
 
31
 
32
  # Action_model
33
 
34
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the action_class dataset.
35
  It achieves the following results on the evaluation set:
36
  - Loss: 1.1399
37
  - Accuracy: 0.7891
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 10.0,
3
- "eval_accuracy": 0.8330404217926186,
4
- "eval_loss": 0.6129801869392395,
5
- "eval_runtime": 8.4281,
6
- "eval_samples_per_second": 67.512,
7
- "eval_steps_per_second": 8.543,
8
- "total_flos": 3.3230947683690086e+18,
9
- "train_loss": 0.23535207314277762,
10
- "train_runtime": 1371.8304,
11
- "train_samples_per_second": 31.258,
12
- "train_steps_per_second": 1.954
13
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "eval_accuracy": 0.789103690685413,
4
+ "eval_loss": 1.1399264335632324,
5
+ "eval_runtime": 7.7057,
6
+ "eval_samples_per_second": 73.841,
7
+ "eval_steps_per_second": 9.344,
8
+ "total_flos": 3.3230947683690086e+17,
9
+ "train_loss": 1.549544946471257,
10
+ "train_runtime": 145.4848,
11
+ "train_samples_per_second": 29.474,
12
+ "train_steps_per_second": 0.921
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 10.0,
3
- "eval_accuracy": 0.8330404217926186,
4
- "eval_loss": 0.6129801869392395,
5
- "eval_runtime": 8.4281,
6
- "eval_samples_per_second": 67.512,
7
- "eval_steps_per_second": 8.543
8
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "eval_accuracy": 0.789103690685413,
4
+ "eval_loss": 1.1399264335632324,
5
+ "eval_runtime": 7.7057,
6
+ "eval_samples_per_second": 73.841,
7
+ "eval_steps_per_second": 9.344
8
  }
runs/May07_17-20-53_6cf576645707/events.out.tfevents.1715102622.6cf576645707.35.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a325ab9253138bc7fd40964f52865aa317e7c1c4073d45e5eeb89dd355257a0
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 10.0,
3
- "total_flos": 3.3230947683690086e+18,
4
- "train_loss": 0.23535207314277762,
5
- "train_runtime": 1371.8304,
6
- "train_samples_per_second": 31.258,
7
- "train_steps_per_second": 1.954
8
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "total_flos": 3.3230947683690086e+17,
4
+ "train_loss": 1.549544946471257,
5
+ "train_runtime": 145.4848,
6
+ "train_samples_per_second": 29.474,
7
+ "train_steps_per_second": 0.921
8
  }
trainer_state.json CHANGED
@@ -1,2140 +1,81 @@
1
  {
2
- "best_metric": 0.6129801869392395,
3
- "best_model_checkpoint": "Action_model/checkpoint-300",
4
- "epoch": 10.0,
5
  "eval_steps": 100,
6
- "global_step": 2680,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
- {
12
- "epoch": 0.04,
13
- "grad_norm": 2.570383071899414,
14
- "learning_rate": 9.96268656716418e-05,
15
- "loss": 0.1841,
16
- "step": 10
17
- },
18
- {
19
- "epoch": 0.07,
20
- "grad_norm": 6.266295433044434,
21
- "learning_rate": 9.925373134328359e-05,
22
- "loss": 0.2301,
23
- "step": 20
24
- },
25
- {
26
- "epoch": 0.11,
27
- "grad_norm": 8.001986503601074,
28
- "learning_rate": 9.888059701492539e-05,
29
- "loss": 0.2533,
30
- "step": 30
31
- },
32
  {
33
  "epoch": 0.15,
34
- "grad_norm": 5.319194316864014,
35
- "learning_rate": 9.850746268656717e-05,
36
- "loss": 0.2436,
37
- "step": 40
38
- },
39
- {
40
- "epoch": 0.19,
41
- "grad_norm": 0.9653372764587402,
42
- "learning_rate": 9.813432835820896e-05,
43
- "loss": 0.3712,
44
- "step": 50
45
- },
46
- {
47
- "epoch": 0.22,
48
- "grad_norm": 7.348043441772461,
49
- "learning_rate": 9.776119402985075e-05,
50
- "loss": 0.3645,
51
- "step": 60
52
- },
53
- {
54
- "epoch": 0.26,
55
- "grad_norm": 2.1969542503356934,
56
- "learning_rate": 9.738805970149254e-05,
57
- "loss": 0.4609,
58
- "step": 70
59
  },
60
  {
61
  "epoch": 0.3,
62
- "grad_norm": 6.397550106048584,
63
- "learning_rate": 9.701492537313434e-05,
64
- "loss": 0.4755,
65
- "step": 80
66
- },
67
- {
68
- "epoch": 0.34,
69
- "grad_norm": 6.923007488250732,
70
- "learning_rate": 9.664179104477612e-05,
71
- "loss": 0.3901,
72
- "step": 90
73
- },
74
- {
75
- "epoch": 0.37,
76
- "grad_norm": 4.786198616027832,
77
- "learning_rate": 9.626865671641792e-05,
78
- "loss": 0.255,
79
- "step": 100
80
- },
81
- {
82
- "epoch": 0.37,
83
- "eval_accuracy": 0.7926186291739895,
84
- "eval_loss": 0.7616190314292908,
85
- "eval_runtime": 8.7209,
86
- "eval_samples_per_second": 65.245,
87
- "eval_steps_per_second": 8.256,
88
- "step": 100
89
- },
90
- {
91
- "epoch": 0.41,
92
- "grad_norm": 8.368223190307617,
93
- "learning_rate": 9.58955223880597e-05,
94
- "loss": 0.3784,
95
- "step": 110
96
  },
97
  {
98
  "epoch": 0.45,
99
- "grad_norm": 4.078306198120117,
100
- "learning_rate": 9.552238805970149e-05,
101
- "loss": 0.4148,
102
- "step": 120
103
- },
104
- {
105
- "epoch": 0.49,
106
- "grad_norm": 7.815361022949219,
107
- "learning_rate": 9.514925373134329e-05,
108
- "loss": 0.3621,
109
- "step": 130
110
- },
111
- {
112
- "epoch": 0.52,
113
- "grad_norm": 11.498431205749512,
114
- "learning_rate": 9.477611940298507e-05,
115
- "loss": 0.3974,
116
- "step": 140
117
- },
118
- {
119
- "epoch": 0.56,
120
- "grad_norm": 7.946558952331543,
121
- "learning_rate": 9.440298507462687e-05,
122
- "loss": 0.3856,
123
- "step": 150
124
  },
125
  {
126
  "epoch": 0.6,
127
- "grad_norm": 0.3486919403076172,
128
- "learning_rate": 9.402985074626867e-05,
129
- "loss": 0.2435,
130
- "step": 160
131
- },
132
- {
133
- "epoch": 0.63,
134
- "grad_norm": 4.267444133758545,
135
- "learning_rate": 9.365671641791045e-05,
136
- "loss": 0.3736,
137
- "step": 170
138
- },
139
- {
140
- "epoch": 0.67,
141
- "grad_norm": 3.022345542907715,
142
- "learning_rate": 9.328358208955224e-05,
143
- "loss": 0.439,
144
- "step": 180
145
- },
146
- {
147
- "epoch": 0.71,
148
- "grad_norm": 5.57196044921875,
149
- "learning_rate": 9.291044776119402e-05,
150
- "loss": 0.2996,
151
- "step": 190
152
  },
153
  {
154
  "epoch": 0.75,
155
- "grad_norm": 2.636216640472412,
156
- "learning_rate": 9.253731343283582e-05,
157
- "loss": 0.2048,
158
- "step": 200
159
  },
160
  {
161
  "epoch": 0.75,
162
- "eval_accuracy": 0.8084358523725835,
163
- "eval_loss": 0.724670946598053,
164
- "eval_runtime": 8.4461,
165
- "eval_samples_per_second": 67.368,
166
- "eval_steps_per_second": 8.525,
167
- "step": 200
168
- },
169
- {
170
- "epoch": 0.78,
171
- "grad_norm": 1.615098237991333,
172
- "learning_rate": 9.216417910447762e-05,
173
- "loss": 0.3594,
174
- "step": 210
175
- },
176
- {
177
- "epoch": 0.82,
178
- "grad_norm": 9.315821647644043,
179
- "learning_rate": 9.17910447761194e-05,
180
- "loss": 0.3046,
181
- "step": 220
182
- },
183
- {
184
- "epoch": 0.86,
185
- "grad_norm": 3.669430732727051,
186
- "learning_rate": 9.14179104477612e-05,
187
- "loss": 0.4158,
188
- "step": 230
189
- },
190
- {
191
- "epoch": 0.9,
192
- "grad_norm": 7.0882978439331055,
193
- "learning_rate": 9.104477611940299e-05,
194
- "loss": 0.3477,
195
- "step": 240
196
- },
197
- {
198
- "epoch": 0.93,
199
- "grad_norm": 1.1667325496673584,
200
- "learning_rate": 9.067164179104479e-05,
201
- "loss": 0.316,
202
- "step": 250
203
- },
204
- {
205
- "epoch": 0.97,
206
- "grad_norm": 1.482625961303711,
207
- "learning_rate": 9.029850746268657e-05,
208
- "loss": 0.3922,
209
- "step": 260
210
- },
211
- {
212
- "epoch": 1.01,
213
- "grad_norm": 0.20793116092681885,
214
- "learning_rate": 8.992537313432836e-05,
215
- "loss": 0.3751,
216
- "step": 270
217
- },
218
- {
219
- "epoch": 1.04,
220
- "grad_norm": 6.772298812866211,
221
- "learning_rate": 8.955223880597016e-05,
222
- "loss": 0.3269,
223
- "step": 280
224
- },
225
- {
226
- "epoch": 1.08,
227
- "grad_norm": 5.833349227905273,
228
- "learning_rate": 8.917910447761194e-05,
229
- "loss": 0.3026,
230
- "step": 290
231
- },
232
- {
233
- "epoch": 1.12,
234
- "grad_norm": 6.349458694458008,
235
- "learning_rate": 8.880597014925374e-05,
236
- "loss": 0.3763,
237
- "step": 300
238
- },
239
- {
240
- "epoch": 1.12,
241
- "eval_accuracy": 0.8330404217926186,
242
- "eval_loss": 0.6129801869392395,
243
- "eval_runtime": 8.4095,
244
- "eval_samples_per_second": 67.661,
245
- "eval_steps_per_second": 8.562,
246
- "step": 300
247
- },
248
- {
249
- "epoch": 1.16,
250
- "grad_norm": 4.767229080200195,
251
- "learning_rate": 8.843283582089554e-05,
252
- "loss": 0.3808,
253
- "step": 310
254
- },
255
- {
256
- "epoch": 1.19,
257
- "grad_norm": 12.675297737121582,
258
- "learning_rate": 8.805970149253732e-05,
259
- "loss": 0.3766,
260
- "step": 320
261
- },
262
- {
263
- "epoch": 1.23,
264
- "grad_norm": 3.8118245601654053,
265
- "learning_rate": 8.76865671641791e-05,
266
- "loss": 0.2642,
267
- "step": 330
268
- },
269
- {
270
- "epoch": 1.27,
271
- "grad_norm": 8.736045837402344,
272
- "learning_rate": 8.731343283582089e-05,
273
- "loss": 0.3041,
274
- "step": 340
275
- },
276
- {
277
- "epoch": 1.31,
278
- "grad_norm": 6.683359146118164,
279
- "learning_rate": 8.694029850746269e-05,
280
- "loss": 0.1352,
281
- "step": 350
282
- },
283
- {
284
- "epoch": 1.34,
285
- "grad_norm": 4.780521392822266,
286
- "learning_rate": 8.656716417910447e-05,
287
- "loss": 0.4005,
288
- "step": 360
289
- },
290
- {
291
- "epoch": 1.38,
292
- "grad_norm": 9.654714584350586,
293
- "learning_rate": 8.619402985074627e-05,
294
- "loss": 0.3646,
295
- "step": 370
296
- },
297
- {
298
- "epoch": 1.42,
299
- "grad_norm": 4.174666881561279,
300
- "learning_rate": 8.582089552238807e-05,
301
- "loss": 0.2353,
302
- "step": 380
303
- },
304
- {
305
- "epoch": 1.46,
306
- "grad_norm": 7.596667289733887,
307
- "learning_rate": 8.548507462686568e-05,
308
- "loss": 0.3991,
309
- "step": 390
310
- },
311
- {
312
- "epoch": 1.49,
313
- "grad_norm": 5.592709064483643,
314
- "learning_rate": 8.511194029850747e-05,
315
- "loss": 0.307,
316
- "step": 400
317
- },
318
- {
319
- "epoch": 1.49,
320
  "eval_accuracy": 0.789103690685413,
321
- "eval_loss": 0.8137023448944092,
322
- "eval_runtime": 8.3292,
323
- "eval_samples_per_second": 68.314,
324
- "eval_steps_per_second": 8.644,
325
- "step": 400
326
- },
327
- {
328
- "epoch": 1.53,
329
- "grad_norm": 2.232590675354004,
330
- "learning_rate": 8.473880597014926e-05,
331
- "loss": 0.4669,
332
- "step": 410
333
- },
334
- {
335
- "epoch": 1.57,
336
- "grad_norm": 4.276609897613525,
337
- "learning_rate": 8.436567164179105e-05,
338
- "loss": 0.3831,
339
- "step": 420
340
- },
341
- {
342
- "epoch": 1.6,
343
- "grad_norm": 7.262507915496826,
344
- "learning_rate": 8.399253731343283e-05,
345
- "loss": 0.3472,
346
- "step": 430
347
- },
348
- {
349
- "epoch": 1.64,
350
- "grad_norm": 7.258556365966797,
351
- "learning_rate": 8.361940298507463e-05,
352
- "loss": 0.2396,
353
- "step": 440
354
- },
355
- {
356
- "epoch": 1.68,
357
- "grad_norm": 4.945961952209473,
358
- "learning_rate": 8.324626865671642e-05,
359
- "loss": 0.2433,
360
- "step": 450
361
- },
362
- {
363
- "epoch": 1.72,
364
- "grad_norm": 5.138702392578125,
365
- "learning_rate": 8.287313432835821e-05,
366
- "loss": 0.2947,
367
- "step": 460
368
- },
369
- {
370
- "epoch": 1.75,
371
- "grad_norm": 1.1640909910202026,
372
- "learning_rate": 8.25e-05,
373
- "loss": 0.4791,
374
- "step": 470
375
- },
376
- {
377
- "epoch": 1.79,
378
- "grad_norm": 4.626485824584961,
379
- "learning_rate": 8.21268656716418e-05,
380
- "loss": 0.286,
381
- "step": 480
382
- },
383
- {
384
- "epoch": 1.83,
385
- "grad_norm": 5.178492069244385,
386
- "learning_rate": 8.17537313432836e-05,
387
- "loss": 0.3202,
388
- "step": 490
389
- },
390
- {
391
- "epoch": 1.87,
392
- "grad_norm": 7.854339122772217,
393
- "learning_rate": 8.138059701492538e-05,
394
- "loss": 0.3542,
395
- "step": 500
396
- },
397
- {
398
- "epoch": 1.87,
399
- "eval_accuracy": 0.8014059753954306,
400
- "eval_loss": 0.6611581444740295,
401
- "eval_runtime": 8.5853,
402
- "eval_samples_per_second": 66.276,
403
- "eval_steps_per_second": 8.386,
404
- "step": 500
405
- },
406
- {
407
- "epoch": 1.9,
408
- "grad_norm": 1.429740071296692,
409
- "learning_rate": 8.100746268656717e-05,
410
- "loss": 0.3039,
411
- "step": 510
412
- },
413
- {
414
- "epoch": 1.94,
415
- "grad_norm": 2.9776551723480225,
416
- "learning_rate": 8.063432835820895e-05,
417
- "loss": 0.3825,
418
- "step": 520
419
- },
420
- {
421
- "epoch": 1.98,
422
- "grad_norm": 10.557899475097656,
423
- "learning_rate": 8.026119402985075e-05,
424
- "loss": 0.5109,
425
- "step": 530
426
- },
427
- {
428
- "epoch": 2.01,
429
- "grad_norm": 1.448002815246582,
430
- "learning_rate": 7.988805970149255e-05,
431
- "loss": 0.3421,
432
- "step": 540
433
- },
434
- {
435
- "epoch": 2.05,
436
- "grad_norm": 4.500860691070557,
437
- "learning_rate": 7.951492537313433e-05,
438
- "loss": 0.3008,
439
- "step": 550
440
- },
441
- {
442
- "epoch": 2.09,
443
- "grad_norm": 8.077374458312988,
444
- "learning_rate": 7.914179104477613e-05,
445
- "loss": 0.27,
446
- "step": 560
447
- },
448
- {
449
- "epoch": 2.13,
450
- "grad_norm": 0.16809479892253876,
451
- "learning_rate": 7.876865671641792e-05,
452
- "loss": 0.2184,
453
- "step": 570
454
- },
455
- {
456
- "epoch": 2.16,
457
- "grad_norm": 4.892763137817383,
458
- "learning_rate": 7.83955223880597e-05,
459
- "loss": 0.1479,
460
- "step": 580
461
- },
462
- {
463
- "epoch": 2.2,
464
- "grad_norm": 8.35221004486084,
465
- "learning_rate": 7.80223880597015e-05,
466
- "loss": 0.3498,
467
- "step": 590
468
- },
469
- {
470
- "epoch": 2.24,
471
- "grad_norm": 12.043429374694824,
472
- "learning_rate": 7.764925373134328e-05,
473
- "loss": 0.3518,
474
- "step": 600
475
- },
476
- {
477
- "epoch": 2.24,
478
- "eval_accuracy": 0.8189806678383128,
479
- "eval_loss": 0.6964564919471741,
480
- "eval_runtime": 8.3878,
481
- "eval_samples_per_second": 67.837,
482
- "eval_steps_per_second": 8.584,
483
- "step": 600
484
- },
485
- {
486
- "epoch": 2.28,
487
- "grad_norm": 3.7737715244293213,
488
- "learning_rate": 7.727611940298508e-05,
489
- "loss": 0.3532,
490
- "step": 610
491
- },
492
- {
493
- "epoch": 2.31,
494
- "grad_norm": 4.282881736755371,
495
- "learning_rate": 7.690298507462687e-05,
496
- "loss": 0.2214,
497
- "step": 620
498
- },
499
- {
500
- "epoch": 2.35,
501
- "grad_norm": 6.733531475067139,
502
- "learning_rate": 7.652985074626866e-05,
503
- "loss": 0.2709,
504
- "step": 630
505
- },
506
- {
507
- "epoch": 2.39,
508
- "grad_norm": 2.567267417907715,
509
- "learning_rate": 7.615671641791045e-05,
510
- "loss": 0.3725,
511
- "step": 640
512
- },
513
- {
514
- "epoch": 2.43,
515
- "grad_norm": 3.120966911315918,
516
- "learning_rate": 7.578358208955223e-05,
517
- "loss": 0.3036,
518
- "step": 650
519
- },
520
- {
521
- "epoch": 2.46,
522
- "grad_norm": 6.505622386932373,
523
- "learning_rate": 7.541044776119403e-05,
524
- "loss": 0.2426,
525
- "step": 660
526
- },
527
- {
528
- "epoch": 2.5,
529
- "grad_norm": 4.887637615203857,
530
- "learning_rate": 7.503731343283582e-05,
531
- "loss": 0.281,
532
- "step": 670
533
- },
534
- {
535
- "epoch": 2.54,
536
- "grad_norm": 9.790969848632812,
537
- "learning_rate": 7.466417910447762e-05,
538
- "loss": 0.4504,
539
- "step": 680
540
- },
541
- {
542
- "epoch": 2.57,
543
- "grad_norm": 4.354789733886719,
544
- "learning_rate": 7.429104477611941e-05,
545
- "loss": 0.4094,
546
- "step": 690
547
- },
548
- {
549
- "epoch": 2.61,
550
- "grad_norm": 5.015912055969238,
551
- "learning_rate": 7.39179104477612e-05,
552
- "loss": 0.3706,
553
- "step": 700
554
- },
555
- {
556
- "epoch": 2.61,
557
- "eval_accuracy": 0.804920913884007,
558
- "eval_loss": 0.7254143357276917,
559
- "eval_runtime": 8.3242,
560
- "eval_samples_per_second": 68.355,
561
- "eval_steps_per_second": 8.649,
562
- "step": 700
563
- },
564
- {
565
- "epoch": 2.65,
566
- "grad_norm": 5.382541656494141,
567
- "learning_rate": 7.3544776119403e-05,
568
- "loss": 0.1722,
569
- "step": 710
570
- },
571
- {
572
- "epoch": 2.69,
573
- "grad_norm": 5.573971748352051,
574
- "learning_rate": 7.317164179104478e-05,
575
- "loss": 0.327,
576
- "step": 720
577
- },
578
- {
579
- "epoch": 2.72,
580
- "grad_norm": 3.5606117248535156,
581
- "learning_rate": 7.279850746268657e-05,
582
- "loss": 0.2702,
583
- "step": 730
584
- },
585
- {
586
- "epoch": 2.76,
587
- "grad_norm": 1.7398028373718262,
588
- "learning_rate": 7.242537313432837e-05,
589
- "loss": 0.238,
590
- "step": 740
591
- },
592
- {
593
- "epoch": 2.8,
594
- "grad_norm": 2.7511751651763916,
595
- "learning_rate": 7.205223880597015e-05,
596
- "loss": 0.1848,
597
- "step": 750
598
- },
599
- {
600
- "epoch": 2.84,
601
- "grad_norm": 3.381510019302368,
602
- "learning_rate": 7.167910447761195e-05,
603
- "loss": 0.2261,
604
- "step": 760
605
- },
606
- {
607
- "epoch": 2.87,
608
- "grad_norm": 4.65634298324585,
609
- "learning_rate": 7.130597014925373e-05,
610
- "loss": 0.237,
611
- "step": 770
612
- },
613
- {
614
- "epoch": 2.91,
615
- "grad_norm": 10.35020923614502,
616
- "learning_rate": 7.093283582089553e-05,
617
- "loss": 0.3012,
618
- "step": 780
619
- },
620
- {
621
- "epoch": 2.95,
622
- "grad_norm": 8.878485679626465,
623
- "learning_rate": 7.055970149253732e-05,
624
- "loss": 0.4094,
625
- "step": 790
626
- },
627
- {
628
- "epoch": 2.99,
629
- "grad_norm": 2.9728074073791504,
630
- "learning_rate": 7.01865671641791e-05,
631
- "loss": 0.4084,
632
- "step": 800
633
- },
634
- {
635
- "epoch": 2.99,
636
- "eval_accuracy": 0.8101933216168717,
637
- "eval_loss": 0.6746156811714172,
638
- "eval_runtime": 8.2718,
639
- "eval_samples_per_second": 68.788,
640
- "eval_steps_per_second": 8.704,
641
- "step": 800
642
- },
643
- {
644
- "epoch": 3.02,
645
- "grad_norm": 4.835368633270264,
646
- "learning_rate": 6.98134328358209e-05,
647
- "loss": 0.3152,
648
- "step": 810
649
- },
650
- {
651
- "epoch": 3.06,
652
- "grad_norm": 2.9197049140930176,
653
- "learning_rate": 6.944029850746268e-05,
654
- "loss": 0.3433,
655
- "step": 820
656
- },
657
- {
658
- "epoch": 3.1,
659
- "grad_norm": 5.646128177642822,
660
- "learning_rate": 6.906716417910448e-05,
661
- "loss": 0.2604,
662
- "step": 830
663
- },
664
- {
665
- "epoch": 3.13,
666
- "grad_norm": 3.860607862472534,
667
- "learning_rate": 6.869402985074627e-05,
668
- "loss": 0.2831,
669
- "step": 840
670
- },
671
- {
672
- "epoch": 3.17,
673
- "grad_norm": 0.1358175426721573,
674
- "learning_rate": 6.832089552238807e-05,
675
- "loss": 0.242,
676
- "step": 850
677
- },
678
- {
679
- "epoch": 3.21,
680
- "grad_norm": 1.1011104583740234,
681
- "learning_rate": 6.794776119402985e-05,
682
- "loss": 0.2621,
683
- "step": 860
684
- },
685
- {
686
- "epoch": 3.25,
687
- "grad_norm": 7.837879180908203,
688
- "learning_rate": 6.757462686567164e-05,
689
- "loss": 0.249,
690
- "step": 870
691
- },
692
- {
693
- "epoch": 3.28,
694
- "grad_norm": 6.8647613525390625,
695
- "learning_rate": 6.720149253731343e-05,
696
- "loss": 0.3398,
697
- "step": 880
698
- },
699
- {
700
- "epoch": 3.32,
701
- "grad_norm": 2.8186678886413574,
702
- "learning_rate": 6.682835820895522e-05,
703
- "loss": 0.3092,
704
- "step": 890
705
- },
706
- {
707
- "epoch": 3.36,
708
- "grad_norm": 4.623282432556152,
709
- "learning_rate": 6.645522388059702e-05,
710
- "loss": 0.2533,
711
- "step": 900
712
- },
713
- {
714
- "epoch": 3.36,
715
- "eval_accuracy": 0.8189806678383128,
716
- "eval_loss": 0.6866591572761536,
717
- "eval_runtime": 8.3143,
718
- "eval_samples_per_second": 68.436,
719
- "eval_steps_per_second": 8.66,
720
- "step": 900
721
- },
722
- {
723
- "epoch": 3.4,
724
- "grad_norm": 4.85120964050293,
725
- "learning_rate": 6.608208955223882e-05,
726
- "loss": 0.2279,
727
- "step": 910
728
- },
729
- {
730
- "epoch": 3.43,
731
- "grad_norm": 0.7263774275779724,
732
- "learning_rate": 6.57089552238806e-05,
733
- "loss": 0.1725,
734
- "step": 920
735
- },
736
- {
737
- "epoch": 3.47,
738
- "grad_norm": 6.813180923461914,
739
- "learning_rate": 6.53358208955224e-05,
740
- "loss": 0.3304,
741
- "step": 930
742
- },
743
- {
744
- "epoch": 3.51,
745
- "grad_norm": 8.58501148223877,
746
- "learning_rate": 6.496268656716418e-05,
747
- "loss": 0.1864,
748
- "step": 940
749
- },
750
- {
751
- "epoch": 3.54,
752
- "grad_norm": 2.814436435699463,
753
- "learning_rate": 6.458955223880597e-05,
754
- "loss": 0.1496,
755
- "step": 950
756
- },
757
- {
758
- "epoch": 3.58,
759
- "grad_norm": 8.36603832244873,
760
- "learning_rate": 6.421641791044777e-05,
761
- "loss": 0.208,
762
- "step": 960
763
- },
764
- {
765
- "epoch": 3.62,
766
- "grad_norm": 3.5715956687927246,
767
- "learning_rate": 6.384328358208955e-05,
768
- "loss": 0.2429,
769
- "step": 970
770
- },
771
- {
772
- "epoch": 3.66,
773
- "grad_norm": 4.983556270599365,
774
- "learning_rate": 6.347014925373135e-05,
775
- "loss": 0.4053,
776
- "step": 980
777
- },
778
- {
779
- "epoch": 3.69,
780
- "grad_norm": 4.936723232269287,
781
- "learning_rate": 6.309701492537313e-05,
782
- "loss": 0.1545,
783
- "step": 990
784
- },
785
- {
786
- "epoch": 3.73,
787
- "grad_norm": 6.59185791015625,
788
- "learning_rate": 6.272388059701493e-05,
789
- "loss": 0.3147,
790
- "step": 1000
791
- },
792
- {
793
- "epoch": 3.73,
794
- "eval_accuracy": 0.8189806678383128,
795
- "eval_loss": 0.7077136635780334,
796
- "eval_runtime": 8.3117,
797
- "eval_samples_per_second": 68.457,
798
- "eval_steps_per_second": 8.662,
799
- "step": 1000
800
- },
801
- {
802
- "epoch": 3.77,
803
- "grad_norm": 9.348366737365723,
804
- "learning_rate": 6.235074626865672e-05,
805
- "loss": 0.3634,
806
- "step": 1010
807
- },
808
- {
809
- "epoch": 3.81,
810
- "grad_norm": 9.918521881103516,
811
- "learning_rate": 6.19776119402985e-05,
812
- "loss": 0.3151,
813
- "step": 1020
814
- },
815
- {
816
- "epoch": 3.84,
817
- "grad_norm": 5.687044143676758,
818
- "learning_rate": 6.16044776119403e-05,
819
- "loss": 0.3088,
820
- "step": 1030
821
- },
822
- {
823
- "epoch": 3.88,
824
- "grad_norm": 3.8347887992858887,
825
- "learning_rate": 6.123134328358209e-05,
826
- "loss": 0.2128,
827
- "step": 1040
828
- },
829
- {
830
- "epoch": 3.92,
831
- "grad_norm": 5.380050182342529,
832
- "learning_rate": 6.0858208955223884e-05,
833
- "loss": 0.255,
834
- "step": 1050
835
- },
836
- {
837
- "epoch": 3.96,
838
- "grad_norm": 8.848828315734863,
839
- "learning_rate": 6.0485074626865676e-05,
840
- "loss": 0.2794,
841
- "step": 1060
842
- },
843
- {
844
- "epoch": 3.99,
845
- "grad_norm": 3.9666404724121094,
846
- "learning_rate": 6.011194029850746e-05,
847
- "loss": 0.1954,
848
- "step": 1070
849
- },
850
- {
851
- "epoch": 4.03,
852
- "grad_norm": 0.3369455635547638,
853
- "learning_rate": 5.973880597014926e-05,
854
- "loss": 0.2298,
855
- "step": 1080
856
- },
857
- {
858
- "epoch": 4.07,
859
- "grad_norm": 16.327823638916016,
860
- "learning_rate": 5.9365671641791044e-05,
861
- "loss": 0.2504,
862
- "step": 1090
863
- },
864
- {
865
- "epoch": 4.1,
866
- "grad_norm": 7.070168495178223,
867
- "learning_rate": 5.8992537313432835e-05,
868
- "loss": 0.3182,
869
- "step": 1100
870
- },
871
- {
872
- "epoch": 4.1,
873
- "eval_accuracy": 0.8189806678383128,
874
- "eval_loss": 0.6661401987075806,
875
- "eval_runtime": 8.2263,
876
- "eval_samples_per_second": 69.169,
877
- "eval_steps_per_second": 8.752,
878
- "step": 1100
879
- },
880
- {
881
- "epoch": 4.14,
882
- "grad_norm": 2.853975534439087,
883
- "learning_rate": 5.8619402985074634e-05,
884
- "loss": 0.201,
885
- "step": 1110
886
- },
887
- {
888
- "epoch": 4.18,
889
- "grad_norm": 0.958690881729126,
890
- "learning_rate": 5.824626865671642e-05,
891
- "loss": 0.1833,
892
- "step": 1120
893
- },
894
- {
895
- "epoch": 4.22,
896
- "grad_norm": 3.4794461727142334,
897
- "learning_rate": 5.787313432835822e-05,
898
- "loss": 0.2796,
899
- "step": 1130
900
- },
901
- {
902
- "epoch": 4.25,
903
- "grad_norm": 4.793296813964844,
904
- "learning_rate": 5.7499999999999995e-05,
905
- "loss": 0.2281,
906
- "step": 1140
907
- },
908
- {
909
- "epoch": 4.29,
910
- "grad_norm": 6.200154781341553,
911
- "learning_rate": 5.712686567164179e-05,
912
- "loss": 0.2814,
913
- "step": 1150
914
- },
915
- {
916
- "epoch": 4.33,
917
- "grad_norm": 5.616389274597168,
918
- "learning_rate": 5.675373134328359e-05,
919
- "loss": 0.1656,
920
- "step": 1160
921
  },
922
  {
923
- "epoch": 4.37,
924
- "grad_norm": 9.382554054260254,
925
- "learning_rate": 5.6380597014925376e-05,
926
- "loss": 0.19,
927
- "step": 1170
928
- },
929
- {
930
- "epoch": 4.4,
931
- "grad_norm": 3.526240587234497,
932
- "learning_rate": 5.600746268656717e-05,
933
- "loss": 0.2063,
934
- "step": 1180
935
- },
936
- {
937
- "epoch": 4.44,
938
- "grad_norm": 3.494896650314331,
939
- "learning_rate": 5.563432835820895e-05,
940
- "loss": 0.1681,
941
- "step": 1190
942
- },
943
- {
944
- "epoch": 4.48,
945
- "grad_norm": 5.764057636260986,
946
- "learning_rate": 5.526119402985075e-05,
947
- "loss": 0.2248,
948
- "step": 1200
949
- },
950
- {
951
- "epoch": 4.48,
952
- "eval_accuracy": 0.8418277680140598,
953
- "eval_loss": 0.6632041335105896,
954
- "eval_runtime": 8.1661,
955
- "eval_samples_per_second": 69.679,
956
- "eval_steps_per_second": 8.817,
957
- "step": 1200
958
- },
959
- {
960
- "epoch": 4.51,
961
- "grad_norm": 4.680635452270508,
962
- "learning_rate": 5.488805970149254e-05,
963
- "loss": 0.2179,
964
- "step": 1210
965
- },
966
- {
967
- "epoch": 4.55,
968
- "grad_norm": 10.24306869506836,
969
- "learning_rate": 5.451492537313433e-05,
970
- "loss": 0.2187,
971
- "step": 1220
972
- },
973
- {
974
- "epoch": 4.59,
975
- "grad_norm": 3.054690361022949,
976
- "learning_rate": 5.4141791044776126e-05,
977
- "loss": 0.1729,
978
- "step": 1230
979
- },
980
- {
981
- "epoch": 4.63,
982
- "grad_norm": 4.907272815704346,
983
- "learning_rate": 5.376865671641791e-05,
984
- "loss": 0.2762,
985
- "step": 1240
986
- },
987
- {
988
- "epoch": 4.66,
989
- "grad_norm": 4.774748802185059,
990
- "learning_rate": 5.33955223880597e-05,
991
- "loss": 0.1965,
992
- "step": 1250
993
- },
994
- {
995
- "epoch": 4.7,
996
- "grad_norm": 5.757875919342041,
997
- "learning_rate": 5.30223880597015e-05,
998
- "loss": 0.1564,
999
- "step": 1260
1000
- },
1001
- {
1002
- "epoch": 4.74,
1003
- "grad_norm": 0.3608088791370392,
1004
- "learning_rate": 5.2649253731343286e-05,
1005
- "loss": 0.0946,
1006
- "step": 1270
1007
- },
1008
- {
1009
- "epoch": 4.78,
1010
- "grad_norm": 3.6289939880371094,
1011
- "learning_rate": 5.227611940298508e-05,
1012
- "loss": 0.3364,
1013
- "step": 1280
1014
- },
1015
- {
1016
- "epoch": 4.81,
1017
- "grad_norm": 5.132009029388428,
1018
- "learning_rate": 5.190298507462686e-05,
1019
- "loss": 0.231,
1020
- "step": 1290
1021
- },
1022
- {
1023
- "epoch": 4.85,
1024
- "grad_norm": 1.0347099304199219,
1025
- "learning_rate": 5.152985074626866e-05,
1026
- "loss": 0.1617,
1027
- "step": 1300
1028
- },
1029
- {
1030
- "epoch": 4.85,
1031
- "eval_accuracy": 0.8172231985940246,
1032
- "eval_loss": 0.7277125716209412,
1033
- "eval_runtime": 8.4693,
1034
- "eval_samples_per_second": 67.184,
1035
- "eval_steps_per_second": 8.501,
1036
- "step": 1300
1037
- },
1038
- {
1039
- "epoch": 4.89,
1040
- "grad_norm": 2.5996298789978027,
1041
- "learning_rate": 5.115671641791045e-05,
1042
- "loss": 0.385,
1043
- "step": 1310
1044
- },
1045
- {
1046
- "epoch": 4.93,
1047
- "grad_norm": 3.724181890487671,
1048
- "learning_rate": 5.078358208955224e-05,
1049
- "loss": 0.1786,
1050
- "step": 1320
1051
- },
1052
- {
1053
- "epoch": 4.96,
1054
- "grad_norm": 2.150557518005371,
1055
- "learning_rate": 5.0410447761194035e-05,
1056
- "loss": 0.2122,
1057
- "step": 1330
1058
- },
1059
- {
1060
- "epoch": 5.0,
1061
- "grad_norm": 3.8813323974609375,
1062
- "learning_rate": 5.003731343283582e-05,
1063
- "loss": 0.2425,
1064
- "step": 1340
1065
- },
1066
- {
1067
- "epoch": 5.04,
1068
- "grad_norm": 0.896369457244873,
1069
- "learning_rate": 4.966417910447762e-05,
1070
- "loss": 0.2208,
1071
- "step": 1350
1072
- },
1073
- {
1074
- "epoch": 5.07,
1075
- "grad_norm": 9.002110481262207,
1076
- "learning_rate": 4.92910447761194e-05,
1077
- "loss": 0.1432,
1078
- "step": 1360
1079
- },
1080
- {
1081
- "epoch": 5.11,
1082
- "grad_norm": 9.619662284851074,
1083
- "learning_rate": 4.8917910447761195e-05,
1084
- "loss": 0.1347,
1085
- "step": 1370
1086
- },
1087
- {
1088
- "epoch": 5.15,
1089
- "grad_norm": 3.5148773193359375,
1090
- "learning_rate": 4.8544776119402986e-05,
1091
- "loss": 0.2837,
1092
- "step": 1380
1093
- },
1094
- {
1095
- "epoch": 5.19,
1096
- "grad_norm": 7.631669044494629,
1097
- "learning_rate": 4.817164179104478e-05,
1098
- "loss": 0.1887,
1099
- "step": 1390
1100
- },
1101
- {
1102
- "epoch": 5.22,
1103
- "grad_norm": 11.738872528076172,
1104
- "learning_rate": 4.779850746268657e-05,
1105
- "loss": 0.2578,
1106
- "step": 1400
1107
- },
1108
- {
1109
- "epoch": 5.22,
1110
- "eval_accuracy": 0.8189806678383128,
1111
- "eval_loss": 0.7114442586898804,
1112
- "eval_runtime": 8.2672,
1113
- "eval_samples_per_second": 68.826,
1114
- "eval_steps_per_second": 8.709,
1115
- "step": 1400
1116
- },
1117
- {
1118
- "epoch": 5.26,
1119
- "grad_norm": 6.67802095413208,
1120
- "learning_rate": 4.742537313432836e-05,
1121
- "loss": 0.2527,
1122
- "step": 1410
1123
- },
1124
- {
1125
- "epoch": 5.3,
1126
- "grad_norm": 4.491325378417969,
1127
- "learning_rate": 4.705223880597015e-05,
1128
- "loss": 0.2386,
1129
- "step": 1420
1130
- },
1131
- {
1132
- "epoch": 5.34,
1133
- "grad_norm": 1.1810379028320312,
1134
- "learning_rate": 4.667910447761194e-05,
1135
- "loss": 0.1693,
1136
- "step": 1430
1137
- },
1138
- {
1139
- "epoch": 5.37,
1140
- "grad_norm": 6.075868129730225,
1141
- "learning_rate": 4.6305970149253736e-05,
1142
- "loss": 0.167,
1143
- "step": 1440
1144
- },
1145
- {
1146
- "epoch": 5.41,
1147
- "grad_norm": 2.315635919570923,
1148
- "learning_rate": 4.593283582089553e-05,
1149
- "loss": 0.2243,
1150
- "step": 1450
1151
- },
1152
- {
1153
- "epoch": 5.45,
1154
- "grad_norm": 10.839255332946777,
1155
- "learning_rate": 4.555970149253732e-05,
1156
- "loss": 0.2414,
1157
- "step": 1460
1158
- },
1159
- {
1160
- "epoch": 5.49,
1161
- "grad_norm": 4.562304496765137,
1162
- "learning_rate": 4.5186567164179104e-05,
1163
- "loss": 0.264,
1164
- "step": 1470
1165
- },
1166
- {
1167
- "epoch": 5.52,
1168
- "grad_norm": 1.8821789026260376,
1169
- "learning_rate": 4.4813432835820895e-05,
1170
- "loss": 0.1407,
1171
- "step": 1480
1172
- },
1173
- {
1174
- "epoch": 5.56,
1175
- "grad_norm": 8.406396865844727,
1176
- "learning_rate": 4.4440298507462694e-05,
1177
- "loss": 0.1454,
1178
- "step": 1490
1179
- },
1180
- {
1181
- "epoch": 5.6,
1182
- "grad_norm": 0.2816010117530823,
1183
- "learning_rate": 4.406716417910448e-05,
1184
- "loss": 0.1864,
1185
- "step": 1500
1186
- },
1187
- {
1188
- "epoch": 5.6,
1189
- "eval_accuracy": 0.8172231985940246,
1190
- "eval_loss": 0.755394458770752,
1191
- "eval_runtime": 8.2598,
1192
- "eval_samples_per_second": 68.888,
1193
- "eval_steps_per_second": 8.717,
1194
- "step": 1500
1195
- },
1196
- {
1197
- "epoch": 5.63,
1198
- "grad_norm": 6.619854927062988,
1199
- "learning_rate": 4.369402985074627e-05,
1200
- "loss": 0.2806,
1201
- "step": 1510
1202
- },
1203
- {
1204
- "epoch": 5.67,
1205
- "grad_norm": 2.056018829345703,
1206
- "learning_rate": 4.332089552238806e-05,
1207
- "loss": 0.2583,
1208
- "step": 1520
1209
- },
1210
- {
1211
- "epoch": 5.71,
1212
- "grad_norm": 0.966521680355072,
1213
- "learning_rate": 4.294776119402985e-05,
1214
- "loss": 0.0997,
1215
- "step": 1530
1216
- },
1217
- {
1218
- "epoch": 5.75,
1219
- "grad_norm": 2.8261241912841797,
1220
- "learning_rate": 4.2574626865671645e-05,
1221
- "loss": 0.1604,
1222
- "step": 1540
1223
- },
1224
- {
1225
- "epoch": 5.78,
1226
- "grad_norm": 3.089912176132202,
1227
- "learning_rate": 4.2201492537313436e-05,
1228
- "loss": 0.2775,
1229
- "step": 1550
1230
- },
1231
- {
1232
- "epoch": 5.82,
1233
- "grad_norm": 7.935690879821777,
1234
- "learning_rate": 4.182835820895523e-05,
1235
- "loss": 0.2522,
1236
- "step": 1560
1237
- },
1238
- {
1239
- "epoch": 5.86,
1240
- "grad_norm": 0.7999266982078552,
1241
- "learning_rate": 4.145522388059702e-05,
1242
- "loss": 0.0752,
1243
- "step": 1570
1244
- },
1245
- {
1246
- "epoch": 5.9,
1247
- "grad_norm": 6.0712480545043945,
1248
- "learning_rate": 4.1082089552238804e-05,
1249
- "loss": 0.1933,
1250
- "step": 1580
1251
- },
1252
- {
1253
- "epoch": 5.93,
1254
- "grad_norm": 10.768308639526367,
1255
- "learning_rate": 4.07089552238806e-05,
1256
- "loss": 0.1664,
1257
- "step": 1590
1258
- },
1259
- {
1260
- "epoch": 5.97,
1261
- "grad_norm": 9.641716003417969,
1262
- "learning_rate": 4.0335820895522394e-05,
1263
- "loss": 0.3134,
1264
- "step": 1600
1265
- },
1266
- {
1267
- "epoch": 5.97,
1268
- "eval_accuracy": 0.8154657293497364,
1269
- "eval_loss": 0.7593356966972351,
1270
- "eval_runtime": 8.4455,
1271
- "eval_samples_per_second": 67.373,
1272
- "eval_steps_per_second": 8.525,
1273
- "step": 1600
1274
- },
1275
- {
1276
- "epoch": 6.01,
1277
- "grad_norm": 6.7538838386535645,
1278
- "learning_rate": 3.996268656716418e-05,
1279
- "loss": 0.1747,
1280
- "step": 1610
1281
- },
1282
- {
1283
- "epoch": 6.04,
1284
- "grad_norm": 6.237377166748047,
1285
- "learning_rate": 3.958955223880597e-05,
1286
- "loss": 0.2406,
1287
- "step": 1620
1288
- },
1289
- {
1290
- "epoch": 6.08,
1291
- "grad_norm": 7.950930118560791,
1292
- "learning_rate": 3.921641791044776e-05,
1293
- "loss": 0.1884,
1294
- "step": 1630
1295
- },
1296
- {
1297
- "epoch": 6.12,
1298
- "grad_norm": 4.41484260559082,
1299
- "learning_rate": 3.8843283582089554e-05,
1300
- "loss": 0.1445,
1301
- "step": 1640
1302
- },
1303
- {
1304
- "epoch": 6.16,
1305
- "grad_norm": 6.339887619018555,
1306
- "learning_rate": 3.8470149253731345e-05,
1307
- "loss": 0.2906,
1308
- "step": 1650
1309
- },
1310
- {
1311
- "epoch": 6.19,
1312
- "grad_norm": 7.597599983215332,
1313
- "learning_rate": 3.809701492537314e-05,
1314
- "loss": 0.1576,
1315
- "step": 1660
1316
- },
1317
- {
1318
- "epoch": 6.23,
1319
- "grad_norm": 2.379629373550415,
1320
- "learning_rate": 3.772388059701493e-05,
1321
- "loss": 0.2016,
1322
- "step": 1670
1323
- },
1324
- {
1325
- "epoch": 6.27,
1326
- "grad_norm": 2.7694478034973145,
1327
- "learning_rate": 3.735074626865671e-05,
1328
- "loss": 0.1188,
1329
- "step": 1680
1330
- },
1331
- {
1332
- "epoch": 6.31,
1333
- "grad_norm": 2.1837210655212402,
1334
- "learning_rate": 3.6977611940298505e-05,
1335
- "loss": 0.1908,
1336
- "step": 1690
1337
- },
1338
- {
1339
- "epoch": 6.34,
1340
- "grad_norm": 4.4665350914001465,
1341
- "learning_rate": 3.66044776119403e-05,
1342
- "loss": 0.24,
1343
- "step": 1700
1344
- },
1345
- {
1346
- "epoch": 6.34,
1347
- "eval_accuracy": 0.8260105448154658,
1348
- "eval_loss": 0.7510848641395569,
1349
- "eval_runtime": 8.2044,
1350
- "eval_samples_per_second": 69.353,
1351
- "eval_steps_per_second": 8.776,
1352
- "step": 1700
1353
- },
1354
- {
1355
- "epoch": 6.38,
1356
- "grad_norm": 4.6551995277404785,
1357
- "learning_rate": 3.6231343283582095e-05,
1358
- "loss": 0.1631,
1359
- "step": 1710
1360
- },
1361
- {
1362
- "epoch": 6.42,
1363
- "grad_norm": 1.098407506942749,
1364
- "learning_rate": 3.585820895522388e-05,
1365
- "loss": 0.0912,
1366
- "step": 1720
1367
- },
1368
- {
1369
- "epoch": 6.46,
1370
- "grad_norm": 0.37138649821281433,
1371
- "learning_rate": 3.548507462686567e-05,
1372
- "loss": 0.2621,
1373
- "step": 1730
1374
- },
1375
- {
1376
- "epoch": 6.49,
1377
- "grad_norm": 7.4571757316589355,
1378
- "learning_rate": 3.511194029850746e-05,
1379
- "loss": 0.268,
1380
- "step": 1740
1381
- },
1382
- {
1383
- "epoch": 6.53,
1384
- "grad_norm": 0.5180323123931885,
1385
- "learning_rate": 3.4738805970149254e-05,
1386
- "loss": 0.2135,
1387
- "step": 1750
1388
- },
1389
- {
1390
- "epoch": 6.57,
1391
- "grad_norm": 1.0866820812225342,
1392
- "learning_rate": 3.4365671641791046e-05,
1393
- "loss": 0.1489,
1394
- "step": 1760
1395
- },
1396
- {
1397
- "epoch": 6.6,
1398
- "grad_norm": 8.90451717376709,
1399
- "learning_rate": 3.399253731343284e-05,
1400
- "loss": 0.288,
1401
- "step": 1770
1402
- },
1403
- {
1404
- "epoch": 6.64,
1405
- "grad_norm": 1.1608803272247314,
1406
- "learning_rate": 3.361940298507463e-05,
1407
- "loss": 0.18,
1408
- "step": 1780
1409
- },
1410
- {
1411
- "epoch": 6.68,
1412
- "grad_norm": 2.9207170009613037,
1413
- "learning_rate": 3.3246268656716414e-05,
1414
- "loss": 0.2414,
1415
- "step": 1790
1416
- },
1417
- {
1418
- "epoch": 6.72,
1419
- "grad_norm": 0.2674783170223236,
1420
- "learning_rate": 3.287313432835821e-05,
1421
- "loss": 0.2359,
1422
- "step": 1800
1423
- },
1424
- {
1425
- "epoch": 6.72,
1426
- "eval_accuracy": 0.8137082601054482,
1427
- "eval_loss": 0.7501537203788757,
1428
- "eval_runtime": 8.1528,
1429
- "eval_samples_per_second": 69.792,
1430
- "eval_steps_per_second": 8.831,
1431
- "step": 1800
1432
- },
1433
- {
1434
- "epoch": 6.75,
1435
- "grad_norm": 8.241676330566406,
1436
- "learning_rate": 3.2500000000000004e-05,
1437
- "loss": 0.1975,
1438
- "step": 1810
1439
- },
1440
- {
1441
- "epoch": 6.79,
1442
- "grad_norm": 2.0347325801849365,
1443
- "learning_rate": 3.2126865671641796e-05,
1444
- "loss": 0.218,
1445
- "step": 1820
1446
- },
1447
- {
1448
- "epoch": 6.83,
1449
- "grad_norm": 1.0338706970214844,
1450
- "learning_rate": 3.175373134328358e-05,
1451
- "loss": 0.1437,
1452
- "step": 1830
1453
- },
1454
- {
1455
- "epoch": 6.87,
1456
- "grad_norm": 0.34902578592300415,
1457
- "learning_rate": 3.138059701492537e-05,
1458
- "loss": 0.1883,
1459
- "step": 1840
1460
- },
1461
- {
1462
- "epoch": 6.9,
1463
- "grad_norm": 6.642534255981445,
1464
- "learning_rate": 3.100746268656717e-05,
1465
- "loss": 0.2513,
1466
- "step": 1850
1467
- },
1468
- {
1469
- "epoch": 6.94,
1470
- "grad_norm": 4.432920455932617,
1471
- "learning_rate": 3.0634328358208955e-05,
1472
- "loss": 0.1058,
1473
- "step": 1860
1474
- },
1475
- {
1476
- "epoch": 6.98,
1477
- "grad_norm": 4.381640434265137,
1478
- "learning_rate": 3.0261194029850747e-05,
1479
- "loss": 0.2114,
1480
- "step": 1870
1481
- },
1482
- {
1483
- "epoch": 7.01,
1484
- "grad_norm": 7.730411529541016,
1485
- "learning_rate": 2.9888059701492538e-05,
1486
- "loss": 0.2542,
1487
- "step": 1880
1488
- },
1489
- {
1490
- "epoch": 7.05,
1491
- "grad_norm": 7.122923851013184,
1492
- "learning_rate": 2.9514925373134326e-05,
1493
- "loss": 0.2594,
1494
- "step": 1890
1495
- },
1496
- {
1497
- "epoch": 7.09,
1498
- "grad_norm": 1.411278486251831,
1499
- "learning_rate": 2.9141791044776125e-05,
1500
- "loss": 0.2322,
1501
- "step": 1900
1502
- },
1503
- {
1504
- "epoch": 7.09,
1505
- "eval_accuracy": 0.8347978910369068,
1506
- "eval_loss": 0.6952534317970276,
1507
- "eval_runtime": 8.3769,
1508
- "eval_samples_per_second": 67.925,
1509
- "eval_steps_per_second": 8.595,
1510
- "step": 1900
1511
- },
1512
- {
1513
- "epoch": 7.13,
1514
- "grad_norm": 2.219285011291504,
1515
- "learning_rate": 2.8768656716417913e-05,
1516
- "loss": 0.1344,
1517
- "step": 1910
1518
- },
1519
- {
1520
- "epoch": 7.16,
1521
- "grad_norm": 6.302455902099609,
1522
- "learning_rate": 2.8395522388059705e-05,
1523
- "loss": 0.2098,
1524
- "step": 1920
1525
- },
1526
- {
1527
- "epoch": 7.2,
1528
- "grad_norm": 1.2837783098220825,
1529
- "learning_rate": 2.8022388059701493e-05,
1530
- "loss": 0.0906,
1531
- "step": 1930
1532
- },
1533
- {
1534
- "epoch": 7.24,
1535
- "grad_norm": 6.604355335235596,
1536
- "learning_rate": 2.7649253731343284e-05,
1537
- "loss": 0.2352,
1538
- "step": 1940
1539
- },
1540
- {
1541
- "epoch": 7.28,
1542
- "grad_norm": 9.916419982910156,
1543
- "learning_rate": 2.727611940298508e-05,
1544
- "loss": 0.1422,
1545
- "step": 1950
1546
- },
1547
- {
1548
- "epoch": 7.31,
1549
- "grad_norm": 2.7665014266967773,
1550
- "learning_rate": 2.6902985074626868e-05,
1551
- "loss": 0.1722,
1552
- "step": 1960
1553
- },
1554
- {
1555
- "epoch": 7.35,
1556
- "grad_norm": 0.24231348931789398,
1557
- "learning_rate": 2.652985074626866e-05,
1558
- "loss": 0.2935,
1559
- "step": 1970
1560
- },
1561
- {
1562
- "epoch": 7.39,
1563
- "grad_norm": 0.8025885224342346,
1564
- "learning_rate": 2.6156716417910447e-05,
1565
- "loss": 0.157,
1566
- "step": 1980
1567
- },
1568
- {
1569
- "epoch": 7.43,
1570
- "grad_norm": 1.6752264499664307,
1571
- "learning_rate": 2.578358208955224e-05,
1572
- "loss": 0.1256,
1573
- "step": 1990
1574
- },
1575
- {
1576
- "epoch": 7.46,
1577
- "grad_norm": 2.404883861541748,
1578
- "learning_rate": 2.5410447761194027e-05,
1579
- "loss": 0.1514,
1580
- "step": 2000
1581
- },
1582
- {
1583
- "epoch": 7.46,
1584
- "eval_accuracy": 0.8260105448154658,
1585
- "eval_loss": 0.7120960354804993,
1586
- "eval_runtime": 8.1425,
1587
- "eval_samples_per_second": 69.88,
1588
- "eval_steps_per_second": 8.842,
1589
- "step": 2000
1590
- },
1591
- {
1592
- "epoch": 7.5,
1593
- "grad_norm": 5.409728050231934,
1594
- "learning_rate": 2.5037313432835825e-05,
1595
- "loss": 0.222,
1596
- "step": 2010
1597
- },
1598
- {
1599
- "epoch": 7.54,
1600
- "grad_norm": 3.949014663696289,
1601
- "learning_rate": 2.4664179104477614e-05,
1602
- "loss": 0.245,
1603
- "step": 2020
1604
- },
1605
- {
1606
- "epoch": 7.57,
1607
- "grad_norm": 8.40086555480957,
1608
- "learning_rate": 2.4291044776119405e-05,
1609
- "loss": 0.1408,
1610
- "step": 2030
1611
- },
1612
- {
1613
- "epoch": 7.61,
1614
- "grad_norm": 7.694955348968506,
1615
- "learning_rate": 2.3917910447761197e-05,
1616
- "loss": 0.2072,
1617
- "step": 2040
1618
- },
1619
- {
1620
- "epoch": 7.65,
1621
- "grad_norm": 1.9109055995941162,
1622
- "learning_rate": 2.3544776119402985e-05,
1623
- "loss": 0.145,
1624
- "step": 2050
1625
- },
1626
- {
1627
- "epoch": 7.69,
1628
- "grad_norm": 12.803776741027832,
1629
- "learning_rate": 2.3171641791044777e-05,
1630
- "loss": 0.1274,
1631
- "step": 2060
1632
- },
1633
- {
1634
- "epoch": 7.72,
1635
- "grad_norm": 3.3325235843658447,
1636
- "learning_rate": 2.2798507462686568e-05,
1637
- "loss": 0.1564,
1638
- "step": 2070
1639
- },
1640
- {
1641
- "epoch": 7.76,
1642
- "grad_norm": 1.105327844619751,
1643
- "learning_rate": 2.242537313432836e-05,
1644
- "loss": 0.2008,
1645
- "step": 2080
1646
- },
1647
- {
1648
- "epoch": 7.8,
1649
- "grad_norm": 1.7592620849609375,
1650
- "learning_rate": 2.2052238805970148e-05,
1651
- "loss": 0.203,
1652
- "step": 2090
1653
- },
1654
- {
1655
- "epoch": 7.84,
1656
- "grad_norm": 0.13264060020446777,
1657
- "learning_rate": 2.1679104477611943e-05,
1658
- "loss": 0.2089,
1659
- "step": 2100
1660
- },
1661
- {
1662
- "epoch": 7.84,
1663
- "eval_accuracy": 0.827768014059754,
1664
- "eval_loss": 0.693087637424469,
1665
- "eval_runtime": 8.2375,
1666
- "eval_samples_per_second": 69.074,
1667
- "eval_steps_per_second": 8.741,
1668
- "step": 2100
1669
- },
1670
- {
1671
- "epoch": 7.87,
1672
- "grad_norm": 5.904381275177002,
1673
- "learning_rate": 2.130597014925373e-05,
1674
- "loss": 0.1754,
1675
- "step": 2110
1676
- },
1677
- {
1678
- "epoch": 7.91,
1679
- "grad_norm": 1.7469266653060913,
1680
- "learning_rate": 2.0932835820895526e-05,
1681
- "loss": 0.1322,
1682
- "step": 2120
1683
- },
1684
- {
1685
- "epoch": 7.95,
1686
- "grad_norm": 4.313326835632324,
1687
- "learning_rate": 2.0559701492537314e-05,
1688
- "loss": 0.1418,
1689
- "step": 2130
1690
- },
1691
- {
1692
- "epoch": 7.99,
1693
- "grad_norm": 0.14211903512477875,
1694
- "learning_rate": 2.0186567164179106e-05,
1695
- "loss": 0.1534,
1696
- "step": 2140
1697
- },
1698
- {
1699
- "epoch": 8.02,
1700
- "grad_norm": 5.527184009552002,
1701
- "learning_rate": 1.9813432835820897e-05,
1702
- "loss": 0.2122,
1703
- "step": 2150
1704
- },
1705
- {
1706
- "epoch": 8.06,
1707
- "grad_norm": 0.2312430739402771,
1708
- "learning_rate": 1.9440298507462686e-05,
1709
- "loss": 0.1617,
1710
- "step": 2160
1711
- },
1712
- {
1713
- "epoch": 8.1,
1714
- "grad_norm": 0.23949085175991058,
1715
- "learning_rate": 1.906716417910448e-05,
1716
- "loss": 0.1286,
1717
- "step": 2170
1718
- },
1719
- {
1720
- "epoch": 8.13,
1721
- "grad_norm": 0.1903185099363327,
1722
- "learning_rate": 1.869402985074627e-05,
1723
- "loss": 0.0846,
1724
- "step": 2180
1725
- },
1726
- {
1727
- "epoch": 8.17,
1728
- "grad_norm": 0.08518023788928986,
1729
- "learning_rate": 1.832089552238806e-05,
1730
- "loss": 0.0801,
1731
- "step": 2190
1732
- },
1733
- {
1734
- "epoch": 8.21,
1735
- "grad_norm": 4.424215793609619,
1736
- "learning_rate": 1.7947761194029852e-05,
1737
- "loss": 0.2245,
1738
- "step": 2200
1739
- },
1740
- {
1741
- "epoch": 8.21,
1742
- "eval_accuracy": 0.8330404217926186,
1743
- "eval_loss": 0.7087014317512512,
1744
- "eval_runtime": 8.1117,
1745
- "eval_samples_per_second": 70.145,
1746
- "eval_steps_per_second": 8.876,
1747
- "step": 2200
1748
- },
1749
- {
1750
- "epoch": 8.25,
1751
- "grad_norm": 7.247931480407715,
1752
- "learning_rate": 1.7574626865671644e-05,
1753
- "loss": 0.0722,
1754
- "step": 2210
1755
- },
1756
- {
1757
- "epoch": 8.28,
1758
- "grad_norm": 4.80264949798584,
1759
- "learning_rate": 1.7201492537313435e-05,
1760
- "loss": 0.0844,
1761
- "step": 2220
1762
- },
1763
- {
1764
- "epoch": 8.32,
1765
- "grad_norm": 8.001790046691895,
1766
- "learning_rate": 1.6828358208955223e-05,
1767
- "loss": 0.1077,
1768
- "step": 2230
1769
- },
1770
- {
1771
- "epoch": 8.36,
1772
- "grad_norm": 5.419641017913818,
1773
- "learning_rate": 1.6455223880597015e-05,
1774
- "loss": 0.1627,
1775
- "step": 2240
1776
- },
1777
- {
1778
- "epoch": 8.4,
1779
- "grad_norm": 0.031686268746852875,
1780
- "learning_rate": 1.6082089552238806e-05,
1781
- "loss": 0.0984,
1782
- "step": 2250
1783
- },
1784
- {
1785
- "epoch": 8.43,
1786
- "grad_norm": 6.095193862915039,
1787
- "learning_rate": 1.5708955223880598e-05,
1788
- "loss": 0.1756,
1789
- "step": 2260
1790
- },
1791
- {
1792
- "epoch": 8.47,
1793
- "grad_norm": 5.179446220397949,
1794
- "learning_rate": 1.5335820895522386e-05,
1795
- "loss": 0.1708,
1796
- "step": 2270
1797
- },
1798
- {
1799
- "epoch": 8.51,
1800
- "grad_norm": 4.06497049331665,
1801
- "learning_rate": 1.496268656716418e-05,
1802
- "loss": 0.1493,
1803
- "step": 2280
1804
- },
1805
- {
1806
- "epoch": 8.54,
1807
- "grad_norm": 1.4721342325210571,
1808
- "learning_rate": 1.458955223880597e-05,
1809
- "loss": 0.2587,
1810
- "step": 2290
1811
- },
1812
- {
1813
- "epoch": 8.58,
1814
- "grad_norm": 4.418783664703369,
1815
- "learning_rate": 1.4216417910447763e-05,
1816
- "loss": 0.1328,
1817
- "step": 2300
1818
- },
1819
- {
1820
- "epoch": 8.58,
1821
- "eval_accuracy": 0.8312829525483304,
1822
- "eval_loss": 0.700339674949646,
1823
- "eval_runtime": 8.481,
1824
- "eval_samples_per_second": 67.091,
1825
- "eval_steps_per_second": 8.49,
1826
- "step": 2300
1827
- },
1828
- {
1829
- "epoch": 8.62,
1830
- "grad_norm": 1.5734038352966309,
1831
- "learning_rate": 1.3843283582089553e-05,
1832
- "loss": 0.165,
1833
- "step": 2310
1834
- },
1835
- {
1836
- "epoch": 8.66,
1837
- "grad_norm": 2.624784231185913,
1838
- "learning_rate": 1.3470149253731342e-05,
1839
- "loss": 0.0837,
1840
- "step": 2320
1841
- },
1842
- {
1843
- "epoch": 8.69,
1844
- "grad_norm": 2.7039573192596436,
1845
- "learning_rate": 1.3097014925373136e-05,
1846
- "loss": 0.2098,
1847
- "step": 2330
1848
- },
1849
- {
1850
- "epoch": 8.73,
1851
- "grad_norm": 6.542816638946533,
1852
- "learning_rate": 1.2723880597014926e-05,
1853
- "loss": 0.129,
1854
- "step": 2340
1855
- },
1856
- {
1857
- "epoch": 8.77,
1858
- "grad_norm": 2.9511120319366455,
1859
- "learning_rate": 1.2350746268656717e-05,
1860
- "loss": 0.1762,
1861
- "step": 2350
1862
- },
1863
- {
1864
- "epoch": 8.81,
1865
- "grad_norm": 3.435502529144287,
1866
- "learning_rate": 1.1977611940298509e-05,
1867
- "loss": 0.1345,
1868
- "step": 2360
1869
- },
1870
- {
1871
- "epoch": 8.84,
1872
- "grad_norm": 2.1689364910125732,
1873
- "learning_rate": 1.1604477611940299e-05,
1874
- "loss": 0.1011,
1875
- "step": 2370
1876
- },
1877
- {
1878
- "epoch": 8.88,
1879
- "grad_norm": 2.3366479873657227,
1880
- "learning_rate": 1.123134328358209e-05,
1881
- "loss": 0.1733,
1882
- "step": 2380
1883
- },
1884
- {
1885
- "epoch": 8.92,
1886
- "grad_norm": 5.928171634674072,
1887
- "learning_rate": 1.085820895522388e-05,
1888
- "loss": 0.1089,
1889
- "step": 2390
1890
- },
1891
- {
1892
- "epoch": 8.96,
1893
- "grad_norm": 0.08636012673377991,
1894
- "learning_rate": 1.0485074626865672e-05,
1895
- "loss": 0.1304,
1896
- "step": 2400
1897
- },
1898
- {
1899
- "epoch": 8.96,
1900
- "eval_accuracy": 0.8224956063268892,
1901
- "eval_loss": 0.7306046485900879,
1902
- "eval_runtime": 8.4262,
1903
- "eval_samples_per_second": 67.528,
1904
- "eval_steps_per_second": 8.545,
1905
- "step": 2400
1906
- },
1907
- {
1908
- "epoch": 8.99,
1909
- "grad_norm": 0.14256200194358826,
1910
- "learning_rate": 1.0111940298507463e-05,
1911
- "loss": 0.1506,
1912
- "step": 2410
1913
- },
1914
- {
1915
- "epoch": 9.03,
1916
- "grad_norm": 0.4166848659515381,
1917
- "learning_rate": 9.738805970149255e-06,
1918
- "loss": 0.2058,
1919
- "step": 2420
1920
- },
1921
- {
1922
- "epoch": 9.07,
1923
- "grad_norm": 0.3997032344341278,
1924
- "learning_rate": 9.365671641791045e-06,
1925
- "loss": 0.0482,
1926
- "step": 2430
1927
- },
1928
- {
1929
- "epoch": 9.1,
1930
- "grad_norm": 9.076058387756348,
1931
- "learning_rate": 8.992537313432836e-06,
1932
- "loss": 0.2201,
1933
- "step": 2440
1934
- },
1935
- {
1936
- "epoch": 9.14,
1937
- "grad_norm": 4.368849277496338,
1938
- "learning_rate": 8.619402985074628e-06,
1939
- "loss": 0.1288,
1940
- "step": 2450
1941
- },
1942
- {
1943
- "epoch": 9.18,
1944
- "grad_norm": 4.311466693878174,
1945
- "learning_rate": 8.24626865671642e-06,
1946
- "loss": 0.3058,
1947
- "step": 2460
1948
- },
1949
- {
1950
- "epoch": 9.22,
1951
- "grad_norm": 0.2911408543586731,
1952
- "learning_rate": 7.87313432835821e-06,
1953
- "loss": 0.1303,
1954
- "step": 2470
1955
- },
1956
- {
1957
- "epoch": 9.25,
1958
- "grad_norm": 5.493233680725098,
1959
- "learning_rate": 7.5e-06,
1960
- "loss": 0.0915,
1961
- "step": 2480
1962
- },
1963
- {
1964
- "epoch": 9.29,
1965
- "grad_norm": 0.09431172162294388,
1966
- "learning_rate": 7.126865671641792e-06,
1967
- "loss": 0.0954,
1968
- "step": 2490
1969
- },
1970
- {
1971
- "epoch": 9.33,
1972
- "grad_norm": 1.8603869676589966,
1973
- "learning_rate": 6.7537313432835825e-06,
1974
- "loss": 0.1514,
1975
- "step": 2500
1976
- },
1977
- {
1978
- "epoch": 9.33,
1979
- "eval_accuracy": 0.8260105448154658,
1980
- "eval_loss": 0.7162156701087952,
1981
- "eval_runtime": 8.3201,
1982
- "eval_samples_per_second": 68.389,
1983
- "eval_steps_per_second": 8.654,
1984
- "step": 2500
1985
- },
1986
- {
1987
- "epoch": 9.37,
1988
- "grad_norm": 4.870584964752197,
1989
- "learning_rate": 6.380597014925374e-06,
1990
- "loss": 0.1354,
1991
- "step": 2510
1992
- },
1993
- {
1994
- "epoch": 9.4,
1995
- "grad_norm": 2.316840410232544,
1996
- "learning_rate": 6.007462686567165e-06,
1997
- "loss": 0.1348,
1998
- "step": 2520
1999
- },
2000
- {
2001
- "epoch": 9.44,
2002
- "grad_norm": 1.9005101919174194,
2003
- "learning_rate": 5.6343283582089556e-06,
2004
- "loss": 0.1755,
2005
- "step": 2530
2006
- },
2007
- {
2008
- "epoch": 9.48,
2009
- "grad_norm": 0.1674620360136032,
2010
- "learning_rate": 5.261194029850746e-06,
2011
- "loss": 0.0878,
2012
- "step": 2540
2013
- },
2014
- {
2015
- "epoch": 9.51,
2016
- "grad_norm": 5.729959011077881,
2017
- "learning_rate": 4.888059701492537e-06,
2018
- "loss": 0.1637,
2019
- "step": 2550
2020
- },
2021
- {
2022
- "epoch": 9.55,
2023
- "grad_norm": 0.02724504843354225,
2024
- "learning_rate": 4.514925373134329e-06,
2025
- "loss": 0.1603,
2026
- "step": 2560
2027
- },
2028
- {
2029
- "epoch": 9.59,
2030
- "grad_norm": 2.728663921356201,
2031
- "learning_rate": 4.141791044776119e-06,
2032
- "loss": 0.1152,
2033
- "step": 2570
2034
- },
2035
- {
2036
- "epoch": 9.63,
2037
- "grad_norm": 8.920695304870605,
2038
- "learning_rate": 3.7686567164179105e-06,
2039
- "loss": 0.1964,
2040
- "step": 2580
2041
- },
2042
- {
2043
- "epoch": 9.66,
2044
- "grad_norm": 2.3974239826202393,
2045
- "learning_rate": 3.3955223880597013e-06,
2046
- "loss": 0.0842,
2047
- "step": 2590
2048
- },
2049
- {
2050
- "epoch": 9.7,
2051
- "grad_norm": 1.6431355476379395,
2052
- "learning_rate": 3.022388059701493e-06,
2053
- "loss": 0.2571,
2054
- "step": 2600
2055
- },
2056
- {
2057
- "epoch": 9.7,
2058
- "eval_accuracy": 0.8347978910369068,
2059
- "eval_loss": 0.7012546062469482,
2060
- "eval_runtime": 8.3265,
2061
- "eval_samples_per_second": 68.336,
2062
- "eval_steps_per_second": 8.647,
2063
- "step": 2600
2064
- },
2065
- {
2066
- "epoch": 9.74,
2067
- "grad_norm": 0.10621854662895203,
2068
- "learning_rate": 2.6492537313432836e-06,
2069
- "loss": 0.2632,
2070
- "step": 2610
2071
- },
2072
- {
2073
- "epoch": 9.78,
2074
- "grad_norm": 4.150152206420898,
2075
- "learning_rate": 2.2761194029850747e-06,
2076
- "loss": 0.2804,
2077
- "step": 2620
2078
- },
2079
- {
2080
- "epoch": 9.81,
2081
- "grad_norm": 4.01139497756958,
2082
- "learning_rate": 1.9029850746268657e-06,
2083
- "loss": 0.1696,
2084
- "step": 2630
2085
- },
2086
- {
2087
- "epoch": 9.85,
2088
- "grad_norm": 4.7402262687683105,
2089
- "learning_rate": 1.5298507462686568e-06,
2090
- "loss": 0.1891,
2091
- "step": 2640
2092
- },
2093
- {
2094
- "epoch": 9.89,
2095
- "grad_norm": 4.460111141204834,
2096
- "learning_rate": 1.1567164179104478e-06,
2097
- "loss": 0.1178,
2098
- "step": 2650
2099
- },
2100
- {
2101
- "epoch": 9.93,
2102
- "grad_norm": 5.822507858276367,
2103
- "learning_rate": 7.835820895522387e-07,
2104
- "loss": 0.089,
2105
- "step": 2660
2106
- },
2107
- {
2108
- "epoch": 9.96,
2109
- "grad_norm": 2.4408085346221924,
2110
- "learning_rate": 4.1044776119402984e-07,
2111
- "loss": 0.158,
2112
- "step": 2670
2113
- },
2114
- {
2115
- "epoch": 10.0,
2116
- "grad_norm": 10.792135238647461,
2117
- "learning_rate": 3.7313432835820895e-08,
2118
- "loss": 0.2038,
2119
- "step": 2680
2120
  },
2121
  {
2122
- "epoch": 10.0,
2123
- "step": 2680,
2124
- "total_flos": 3.3230947683690086e+18,
2125
- "train_loss": 0.23535207314277762,
2126
- "train_runtime": 1371.8304,
2127
- "train_samples_per_second": 31.258,
2128
- "train_steps_per_second": 1.954
2129
  }
2130
  ],
2131
- "logging_steps": 10,
2132
- "max_steps": 2680,
2133
  "num_input_tokens_seen": 0,
2134
- "num_train_epochs": 10,
2135
  "save_steps": 100,
2136
- "total_flos": 3.3230947683690086e+18,
2137
- "train_batch_size": 16,
2138
  "trial_name": null,
2139
  "trial_params": null
2140
  }
 
1
  {
2
+ "best_metric": 1.1399264335632324,
3
+ "best_model_checkpoint": "Action_model/checkpoint-100",
4
+ "epoch": 1.0,
5
  "eval_steps": 100,
6
+ "global_step": 134,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  {
12
  "epoch": 0.15,
13
+ "grad_norm": 1.509538173675537,
14
+ "learning_rate": 8.507462686567164e-05,
15
+ "loss": 2.1872,
16
+ "step": 20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  },
18
  {
19
  "epoch": 0.3,
20
+ "grad_norm": 1.6843363046646118,
21
+ "learning_rate": 7.014925373134329e-05,
22
+ "loss": 1.872,
23
+ "step": 40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  },
25
  {
26
  "epoch": 0.45,
27
+ "grad_norm": 1.891447901725769,
28
+ "learning_rate": 5.5223880597014934e-05,
29
+ "loss": 1.5872,
30
+ "step": 60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  },
32
  {
33
  "epoch": 0.6,
34
+ "grad_norm": 1.9932177066802979,
35
+ "learning_rate": 4.029850746268657e-05,
36
+ "loss": 1.3864,
37
+ "step": 80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  },
39
  {
40
  "epoch": 0.75,
41
+ "grad_norm": 1.7128252983093262,
42
+ "learning_rate": 2.537313432835821e-05,
43
+ "loss": 1.2948,
44
+ "step": 100
45
  },
46
  {
47
  "epoch": 0.75,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  "eval_accuracy": 0.789103690685413,
49
+ "eval_loss": 1.1399264335632324,
50
+ "eval_runtime": 11.9209,
51
+ "eval_samples_per_second": 47.731,
52
+ "eval_steps_per_second": 6.04,
53
+ "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  },
55
  {
56
+ "epoch": 0.9,
57
+ "grad_norm": 2.182009696960449,
58
+ "learning_rate": 1.0447761194029851e-05,
59
+ "loss": 1.2271,
60
+ "step": 120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  },
62
  {
63
+ "epoch": 1.0,
64
+ "step": 134,
65
+ "total_flos": 3.3230947683690086e+17,
66
+ "train_loss": 1.549544946471257,
67
+ "train_runtime": 145.4848,
68
+ "train_samples_per_second": 29.474,
69
+ "train_steps_per_second": 0.921
70
  }
71
  ],
72
+ "logging_steps": 20,
73
+ "max_steps": 134,
74
  "num_input_tokens_seen": 0,
75
+ "num_train_epochs": 1,
76
  "save_steps": 100,
77
+ "total_flos": 3.3230947683690086e+17,
78
+ "train_batch_size": 32,
79
  "trial_name": null,
80
  "trial_params": null
81
  }