Raihan004 commited on
Commit
bae8716
1 Parent(s): 48aae5b

🍻 cheers

Browse files
README.md CHANGED
@@ -2,6 +2,7 @@
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
 
5
  - generated_from_trainer
6
  datasets:
7
  - imagefolder
@@ -14,7 +15,7 @@ model-index:
14
  name: Image Classification
15
  type: image-classification
16
  dataset:
17
- name: imagefolder
18
  type: imagefolder
19
  config: default
20
  split: train
@@ -22,7 +23,7 @@ model-index:
22
  metrics:
23
  - name: Accuracy
24
  type: accuracy
25
- value: 0.7676190476190476
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -30,10 +31,10 @@ should probably proofread and complete it, then remove this comment. -->
30
 
31
  # Action_model
32
 
33
- This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
34
  It achieves the following results on the evaluation set:
35
- - Loss: 0.7365
36
- - Accuracy: 0.7676
37
 
38
  ## Model description
39
 
 
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
5
+ - image-classification
6
  - generated_from_trainer
7
  datasets:
8
  - imagefolder
 
15
  name: Image Classification
16
  type: image-classification
17
  dataset:
18
+ name: action_class
19
  type: imagefolder
20
  config: default
21
  split: train
 
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
+ value: 0.7847619047619048
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
31
 
32
  # Action_model
33
 
34
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the action_class dataset.
35
  It achieves the following results on the evaluation set:
36
+ - Loss: 0.7120
37
+ - Accuracy: 0.7848
38
 
39
  ## Model description
40
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 10.0,
3
- "eval_accuracy": 0.7742857142857142,
4
- "eval_loss": 0.7498265504837036,
5
- "eval_runtime": 17.7837,
6
- "eval_samples_per_second": 59.043,
7
- "eval_steps_per_second": 7.423,
8
- "total_flos": 7.776878731479245e+18,
9
- "train_loss": 0.29035600812002355,
10
- "train_runtime": 3700.3159,
11
- "train_samples_per_second": 27.119,
12
- "train_steps_per_second": 1.697
13
  }
 
1
  {
2
+ "epoch": 2.0,
3
+ "eval_accuracy": 0.7847619047619048,
4
+ "eval_loss": 0.7120087742805481,
5
+ "eval_runtime": 16.7402,
6
+ "eval_samples_per_second": 62.723,
7
+ "eval_steps_per_second": 7.885,
8
+ "total_flos": 1.555375746295849e+18,
9
+ "train_loss": 0.6562361546382782,
10
+ "train_runtime": 775.5335,
11
+ "train_samples_per_second": 25.879,
12
+ "train_steps_per_second": 1.62
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 10.0,
3
- "eval_accuracy": 0.7742857142857142,
4
- "eval_loss": 0.7498265504837036,
5
- "eval_runtime": 17.7837,
6
- "eval_samples_per_second": 59.043,
7
- "eval_steps_per_second": 7.423
8
  }
 
1
  {
2
+ "epoch": 2.0,
3
+ "eval_accuracy": 0.7847619047619048,
4
+ "eval_loss": 0.7120087742805481,
5
+ "eval_runtime": 16.7402,
6
+ "eval_samples_per_second": 62.723,
7
+ "eval_steps_per_second": 7.885
8
  }
runs/Apr17_12-23-55_b85bd6c644ee/events.out.tfevents.1713358954.b85bd6c644ee.34.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc0882b65ec5025e5ca0e1b65a573d0bde37c0afb7cea8b4a6eb965ce5c5ba96
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 10.0,
3
- "total_flos": 7.776878731479245e+18,
4
- "train_loss": 0.29035600812002355,
5
- "train_runtime": 3700.3159,
6
- "train_samples_per_second": 27.119,
7
- "train_steps_per_second": 1.697
8
  }
 
1
  {
2
+ "epoch": 2.0,
3
+ "total_flos": 1.555375746295849e+18,
4
+ "train_loss": 0.6562361546382782,
5
+ "train_runtime": 775.5335,
6
+ "train_samples_per_second": 25.879,
7
+ "train_steps_per_second": 1.62
8
  }
trainer_state.json CHANGED
@@ -1,2785 +1,571 @@
1
  {
2
- "best_metric": 0.7498265504837036,
3
- "best_model_checkpoint": "Action_model/checkpoint-900",
4
- "epoch": 10.0,
5
  "eval_steps": 100,
6
- "global_step": 6280,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.03,
13
- "grad_norm": 1.9456478357315063,
14
- "learning_rate": 9.968152866242038e-05,
15
- "loss": 2.2288,
16
  "step": 20
17
  },
18
  {
19
  "epoch": 0.06,
20
- "grad_norm": 2.081900119781494,
21
- "learning_rate": 9.936305732484077e-05,
22
- "loss": 2.0145,
23
  "step": 40
24
  },
25
  {
26
  "epoch": 0.1,
27
- "grad_norm": 2.330000877380371,
28
- "learning_rate": 9.904458598726115e-05,
29
- "loss": 1.8019,
30
  "step": 60
31
  },
32
  {
33
  "epoch": 0.13,
34
- "grad_norm": 3.2511394023895264,
35
- "learning_rate": 9.874203821656052e-05,
36
- "loss": 1.5067,
37
  "step": 80
38
  },
39
  {
40
  "epoch": 0.16,
41
- "grad_norm": 2.664090394973755,
42
- "learning_rate": 9.842356687898089e-05,
43
- "loss": 1.3203,
44
  "step": 100
45
  },
46
  {
47
  "epoch": 0.16,
48
- "eval_accuracy": 0.7447619047619047,
49
- "eval_loss": 1.1809700727462769,
50
- "eval_runtime": 19.4439,
51
- "eval_samples_per_second": 54.001,
52
- "eval_steps_per_second": 6.789,
53
  "step": 100
54
  },
55
  {
56
  "epoch": 0.19,
57
- "grad_norm": 2.723193407058716,
58
- "learning_rate": 9.810509554140128e-05,
59
- "loss": 1.2276,
60
  "step": 120
61
  },
62
  {
63
  "epoch": 0.22,
64
- "grad_norm": 3.213502883911133,
65
- "learning_rate": 9.778662420382166e-05,
66
- "loss": 1.1515,
67
  "step": 140
68
  },
69
  {
70
  "epoch": 0.25,
71
- "grad_norm": 3.7674548625946045,
72
- "learning_rate": 9.746815286624204e-05,
73
- "loss": 1.0409,
74
  "step": 160
75
  },
76
  {
77
  "epoch": 0.29,
78
- "grad_norm": 2.553504228591919,
79
- "learning_rate": 9.714968152866243e-05,
80
- "loss": 1.016,
81
  "step": 180
82
  },
83
  {
84
  "epoch": 0.32,
85
- "grad_norm": 4.68205451965332,
86
- "learning_rate": 9.683121019108281e-05,
87
- "loss": 0.998,
88
  "step": 200
89
  },
90
  {
91
  "epoch": 0.32,
92
- "eval_accuracy": 0.7704761904761904,
93
- "eval_loss": 0.8924565315246582,
94
- "eval_runtime": 15.156,
95
- "eval_samples_per_second": 69.28,
96
- "eval_steps_per_second": 8.709,
97
  "step": 200
98
  },
99
  {
100
  "epoch": 0.35,
101
- "grad_norm": 3.2618489265441895,
102
- "learning_rate": 9.651273885350319e-05,
103
- "loss": 0.984,
104
  "step": 220
105
  },
106
  {
107
  "epoch": 0.38,
108
- "grad_norm": 2.7326881885528564,
109
- "learning_rate": 9.619426751592358e-05,
110
- "loss": 0.9017,
111
  "step": 240
112
  },
113
  {
114
  "epoch": 0.41,
115
- "grad_norm": 4.338141918182373,
116
- "learning_rate": 9.587579617834396e-05,
117
- "loss": 0.8816,
118
  "step": 260
119
  },
120
  {
121
  "epoch": 0.45,
122
- "grad_norm": 5.7808918952941895,
123
- "learning_rate": 9.555732484076433e-05,
124
- "loss": 0.7619,
125
  "step": 280
126
  },
127
  {
128
  "epoch": 0.48,
129
- "grad_norm": 5.106058120727539,
130
- "learning_rate": 9.523885350318473e-05,
131
- "loss": 0.765,
132
  "step": 300
133
  },
134
  {
135
  "epoch": 0.48,
136
- "eval_accuracy": 0.7638095238095238,
137
- "eval_loss": 0.8198639750480652,
138
- "eval_runtime": 15.6032,
139
- "eval_samples_per_second": 67.294,
140
- "eval_steps_per_second": 8.46,
141
  "step": 300
142
  },
143
  {
144
  "epoch": 0.51,
145
- "grad_norm": 2.242779493331909,
146
- "learning_rate": 9.49203821656051e-05,
147
- "loss": 0.8391,
148
  "step": 320
149
  },
150
  {
151
  "epoch": 0.54,
152
- "grad_norm": 5.80740213394165,
153
- "learning_rate": 9.460191082802548e-05,
154
- "loss": 0.6795,
155
  "step": 340
156
  },
157
  {
158
  "epoch": 0.57,
159
- "grad_norm": 5.4263153076171875,
160
- "learning_rate": 9.428343949044587e-05,
161
- "loss": 0.7903,
162
  "step": 360
163
  },
164
  {
165
  "epoch": 0.61,
166
- "grad_norm": 3.731250762939453,
167
- "learning_rate": 9.396496815286625e-05,
168
- "loss": 0.7099,
169
  "step": 380
170
  },
171
  {
172
  "epoch": 0.64,
173
- "grad_norm": 5.383817672729492,
174
- "learning_rate": 9.364649681528663e-05,
175
- "loss": 0.6521,
176
  "step": 400
177
  },
178
  {
179
  "epoch": 0.64,
180
- "eval_accuracy": 0.7371428571428571,
181
- "eval_loss": 0.8275942206382751,
182
- "eval_runtime": 17.2731,
183
- "eval_samples_per_second": 60.788,
184
- "eval_steps_per_second": 7.642,
185
  "step": 400
186
  },
187
  {
188
  "epoch": 0.67,
189
- "grad_norm": 4.755931377410889,
190
- "learning_rate": 9.332802547770702e-05,
191
- "loss": 0.6679,
192
  "step": 420
193
  },
194
  {
195
  "epoch": 0.7,
196
- "grad_norm": 3.101043462753296,
197
- "learning_rate": 9.30095541401274e-05,
198
- "loss": 0.5997,
199
  "step": 440
200
  },
201
  {
202
  "epoch": 0.73,
203
- "grad_norm": 3.172748327255249,
204
- "learning_rate": 9.269108280254777e-05,
205
- "loss": 0.738,
206
  "step": 460
207
  },
208
  {
209
  "epoch": 0.76,
210
- "grad_norm": 2.2239062786102295,
211
- "learning_rate": 9.237261146496817e-05,
212
- "loss": 0.5563,
213
  "step": 480
214
  },
215
  {
216
  "epoch": 0.8,
217
- "grad_norm": 4.797071933746338,
218
- "learning_rate": 9.205414012738854e-05,
219
- "loss": 0.7612,
220
  "step": 500
221
  },
222
  {
223
  "epoch": 0.8,
224
- "eval_accuracy": 0.7209523809523809,
225
- "eval_loss": 0.8631104230880737,
226
- "eval_runtime": 15.8902,
227
- "eval_samples_per_second": 66.078,
228
- "eval_steps_per_second": 8.307,
229
  "step": 500
230
  },
231
  {
232
  "epoch": 0.83,
233
- "grad_norm": 6.720874786376953,
234
- "learning_rate": 9.173566878980892e-05,
235
- "loss": 0.6968,
236
  "step": 520
237
  },
238
  {
239
  "epoch": 0.86,
240
- "grad_norm": 2.4448862075805664,
241
- "learning_rate": 9.141719745222931e-05,
242
- "loss": 0.5712,
243
  "step": 540
244
  },
245
  {
246
  "epoch": 0.89,
247
- "grad_norm": 4.25390100479126,
248
- "learning_rate": 9.109872611464969e-05,
249
- "loss": 0.6566,
250
  "step": 560
251
  },
252
  {
253
  "epoch": 0.92,
254
- "grad_norm": 4.468232154846191,
255
- "learning_rate": 9.078025477707007e-05,
256
- "loss": 0.6277,
257
  "step": 580
258
  },
259
  {
260
  "epoch": 0.96,
261
- "grad_norm": 5.054867267608643,
262
- "learning_rate": 9.046178343949046e-05,
263
- "loss": 0.5894,
264
  "step": 600
265
  },
266
  {
267
  "epoch": 0.96,
268
- "eval_accuracy": 0.7476190476190476,
269
- "eval_loss": 0.8175568580627441,
270
- "eval_runtime": 18.194,
271
- "eval_samples_per_second": 57.711,
272
- "eval_steps_per_second": 7.255,
273
  "step": 600
274
  },
275
  {
276
  "epoch": 0.99,
277
- "grad_norm": 3.530838966369629,
278
- "learning_rate": 9.014331210191084e-05,
279
- "loss": 0.5674,
280
  "step": 620
281
  },
282
  {
283
  "epoch": 1.02,
284
- "grad_norm": 2.4299445152282715,
285
- "learning_rate": 8.982484076433122e-05,
286
- "loss": 0.5017,
287
  "step": 640
288
  },
289
  {
290
  "epoch": 1.05,
291
- "grad_norm": 9.271321296691895,
292
- "learning_rate": 8.950636942675161e-05,
293
- "loss": 0.4916,
294
  "step": 660
295
  },
296
  {
297
  "epoch": 1.08,
298
- "grad_norm": 3.6438238620758057,
299
- "learning_rate": 8.918789808917198e-05,
300
- "loss": 0.5967,
301
  "step": 680
302
  },
303
  {
304
  "epoch": 1.11,
305
- "grad_norm": 11.338240623474121,
306
- "learning_rate": 8.886942675159236e-05,
307
- "loss": 0.5381,
308
  "step": 700
309
  },
310
  {
311
  "epoch": 1.11,
312
- "eval_accuracy": 0.7523809523809524,
313
- "eval_loss": 0.7964714765548706,
314
- "eval_runtime": 16.275,
315
- "eval_samples_per_second": 64.516,
316
- "eval_steps_per_second": 8.111,
317
  "step": 700
318
  },
319
  {
320
  "epoch": 1.15,
321
- "grad_norm": 7.793953895568848,
322
- "learning_rate": 8.855095541401275e-05,
323
- "loss": 0.4818,
324
  "step": 720
325
  },
326
  {
327
  "epoch": 1.18,
328
- "grad_norm": 3.596195697784424,
329
- "learning_rate": 8.823248407643313e-05,
330
- "loss": 0.5585,
331
  "step": 740
332
  },
333
  {
334
  "epoch": 1.21,
335
- "grad_norm": 2.5034844875335693,
336
- "learning_rate": 8.791401273885351e-05,
337
- "loss": 0.4471,
338
  "step": 760
339
  },
340
  {
341
  "epoch": 1.24,
342
- "grad_norm": 2.8802740573883057,
343
- "learning_rate": 8.75955414012739e-05,
344
- "loss": 0.4908,
345
  "step": 780
346
  },
347
  {
348
  "epoch": 1.27,
349
- "grad_norm": 3.318392753601074,
350
- "learning_rate": 8.727707006369426e-05,
351
- "loss": 0.4066,
352
  "step": 800
353
  },
354
  {
355
  "epoch": 1.27,
356
- "eval_accuracy": 0.7485714285714286,
357
- "eval_loss": 0.8102111220359802,
358
- "eval_runtime": 15.1922,
359
- "eval_samples_per_second": 69.115,
360
- "eval_steps_per_second": 8.689,
361
  "step": 800
362
  },
363
  {
364
  "epoch": 1.31,
365
- "grad_norm": 0.7513989806175232,
366
- "learning_rate": 8.695859872611464e-05,
367
- "loss": 0.3771,
368
  "step": 820
369
  },
370
  {
371
  "epoch": 1.34,
372
- "grad_norm": 2.7686688899993896,
373
- "learning_rate": 8.664012738853503e-05,
374
- "loss": 0.4788,
375
  "step": 840
376
  },
377
  {
378
  "epoch": 1.37,
379
- "grad_norm": 7.201110363006592,
380
- "learning_rate": 8.632165605095541e-05,
381
- "loss": 0.4752,
382
  "step": 860
383
  },
384
  {
385
  "epoch": 1.4,
386
- "grad_norm": 5.174149513244629,
387
- "learning_rate": 8.600318471337579e-05,
388
- "loss": 0.4771,
389
  "step": 880
390
  },
391
  {
392
  "epoch": 1.43,
393
- "grad_norm": 2.768077850341797,
394
- "learning_rate": 8.568471337579618e-05,
395
- "loss": 0.4825,
396
  "step": 900
397
  },
398
  {
399
  "epoch": 1.43,
400
- "eval_accuracy": 0.7742857142857142,
401
- "eval_loss": 0.7498265504837036,
402
- "eval_runtime": 15.2211,
403
- "eval_samples_per_second": 68.983,
404
- "eval_steps_per_second": 8.672,
405
  "step": 900
406
  },
407
  {
408
  "epoch": 1.46,
409
- "grad_norm": 5.662777423858643,
410
- "learning_rate": 8.536624203821656e-05,
411
- "loss": 0.3955,
412
  "step": 920
413
  },
414
  {
415
  "epoch": 1.5,
416
- "grad_norm": 2.15919828414917,
417
- "learning_rate": 8.504777070063694e-05,
418
- "loss": 0.3685,
419
  "step": 940
420
  },
421
  {
422
  "epoch": 1.53,
423
- "grad_norm": 7.603888988494873,
424
- "learning_rate": 8.472929936305733e-05,
425
- "loss": 0.3679,
426
  "step": 960
427
  },
428
  {
429
  "epoch": 1.56,
430
- "grad_norm": 3.6407835483551025,
431
- "learning_rate": 8.44108280254777e-05,
432
- "loss": 0.4757,
433
  "step": 980
434
  },
435
  {
436
  "epoch": 1.59,
437
- "grad_norm": 3.8537333011627197,
438
- "learning_rate": 8.409235668789808e-05,
439
- "loss": 0.4955,
440
  "step": 1000
441
  },
442
  {
443
  "epoch": 1.59,
444
- "eval_accuracy": 0.7019047619047619,
445
- "eval_loss": 0.9751647710800171,
446
- "eval_runtime": 15.4284,
447
- "eval_samples_per_second": 68.056,
448
- "eval_steps_per_second": 8.556,
449
  "step": 1000
450
  },
451
  {
452
  "epoch": 1.62,
453
- "grad_norm": 3.7005233764648438,
454
- "learning_rate": 8.377388535031847e-05,
455
- "loss": 0.4366,
456
  "step": 1020
457
  },
458
  {
459
  "epoch": 1.66,
460
- "grad_norm": 4.745129585266113,
461
- "learning_rate": 8.345541401273885e-05,
462
- "loss": 0.4014,
463
  "step": 1040
464
  },
465
  {
466
  "epoch": 1.69,
467
- "grad_norm": 2.9663562774658203,
468
- "learning_rate": 8.313694267515923e-05,
469
- "loss": 0.4432,
470
  "step": 1060
471
  },
472
  {
473
  "epoch": 1.72,
474
- "grad_norm": 3.435357093811035,
475
- "learning_rate": 8.281847133757962e-05,
476
- "loss": 0.475,
477
  "step": 1080
478
  },
479
  {
480
  "epoch": 1.75,
481
- "grad_norm": 8.792357444763184,
482
- "learning_rate": 8.25e-05,
483
- "loss": 0.3945,
484
  "step": 1100
485
  },
486
  {
487
  "epoch": 1.75,
488
- "eval_accuracy": 0.7380952380952381,
489
- "eval_loss": 0.8149927854537964,
490
- "eval_runtime": 15.9868,
491
- "eval_samples_per_second": 65.679,
492
- "eval_steps_per_second": 8.257,
493
  "step": 1100
494
  },
495
  {
496
  "epoch": 1.78,
497
- "grad_norm": 4.0536346435546875,
498
- "learning_rate": 8.218152866242038e-05,
499
- "loss": 0.4341,
500
  "step": 1120
501
  },
502
  {
503
  "epoch": 1.82,
504
- "grad_norm": 4.5750732421875,
505
- "learning_rate": 8.186305732484077e-05,
506
- "loss": 0.2879,
507
  "step": 1140
508
  },
509
  {
510
  "epoch": 1.85,
511
- "grad_norm": 9.274312019348145,
512
- "learning_rate": 8.154458598726115e-05,
513
- "loss": 0.4391,
514
  "step": 1160
515
  },
516
  {
517
  "epoch": 1.88,
518
- "grad_norm": 3.804741859436035,
519
- "learning_rate": 8.122611464968152e-05,
520
- "loss": 0.4918,
521
  "step": 1180
522
  },
523
  {
524
  "epoch": 1.91,
525
- "grad_norm": 8.260722160339355,
526
- "learning_rate": 8.090764331210192e-05,
527
- "loss": 0.4142,
528
  "step": 1200
529
  },
530
  {
531
  "epoch": 1.91,
532
- "eval_accuracy": 0.7609523809523809,
533
- "eval_loss": 0.7953311204910278,
534
- "eval_runtime": 15.4376,
535
- "eval_samples_per_second": 68.016,
536
- "eval_steps_per_second": 8.551,
537
  "step": 1200
538
  },
539
  {
540
  "epoch": 1.94,
541
- "grad_norm": 6.791586399078369,
542
- "learning_rate": 8.05891719745223e-05,
543
- "loss": 0.3653,
544
  "step": 1220
545
  },
546
  {
547
  "epoch": 1.97,
548
- "grad_norm": 5.216760158538818,
549
- "learning_rate": 8.027070063694267e-05,
550
- "loss": 0.3321,
551
  "step": 1240
552
  },
553
  {
554
- "epoch": 2.01,
555
- "grad_norm": 2.542558431625366,
556
- "learning_rate": 7.995222929936306e-05,
557
- "loss": 0.3406,
558
- "step": 1260
559
- },
560
- {
561
- "epoch": 2.04,
562
- "grad_norm": 4.629979133605957,
563
- "learning_rate": 7.963375796178344e-05,
564
- "loss": 0.3548,
565
- "step": 1280
566
- },
567
- {
568
- "epoch": 2.07,
569
- "grad_norm": 0.3911110460758209,
570
- "learning_rate": 7.931528662420382e-05,
571
- "loss": 0.3915,
572
- "step": 1300
573
- },
574
- {
575
- "epoch": 2.07,
576
- "eval_accuracy": 0.7638095238095238,
577
- "eval_loss": 0.814107358455658,
578
- "eval_runtime": 15.2728,
579
- "eval_samples_per_second": 68.749,
580
- "eval_steps_per_second": 8.643,
581
- "step": 1300
582
- },
583
- {
584
- "epoch": 2.1,
585
- "grad_norm": 12.208237648010254,
586
- "learning_rate": 7.899681528662421e-05,
587
- "loss": 0.3693,
588
- "step": 1320
589
- },
590
- {
591
- "epoch": 2.13,
592
- "grad_norm": 11.460808753967285,
593
- "learning_rate": 7.867834394904459e-05,
594
- "loss": 0.2728,
595
- "step": 1340
596
- },
597
- {
598
- "epoch": 2.17,
599
- "grad_norm": 4.868059158325195,
600
- "learning_rate": 7.835987261146497e-05,
601
- "loss": 0.3227,
602
- "step": 1360
603
- },
604
- {
605
- "epoch": 2.2,
606
- "grad_norm": 2.08876895904541,
607
- "learning_rate": 7.804140127388536e-05,
608
- "loss": 0.2696,
609
- "step": 1380
610
- },
611
- {
612
- "epoch": 2.23,
613
- "grad_norm": 7.153079032897949,
614
- "learning_rate": 7.772292993630573e-05,
615
- "loss": 0.3937,
616
- "step": 1400
617
- },
618
- {
619
- "epoch": 2.23,
620
- "eval_accuracy": 0.7704761904761904,
621
- "eval_loss": 0.7881984114646912,
622
- "eval_runtime": 18.2099,
623
- "eval_samples_per_second": 57.661,
624
- "eval_steps_per_second": 7.249,
625
- "step": 1400
626
- },
627
- {
628
- "epoch": 2.26,
629
- "grad_norm": 6.202333927154541,
630
- "learning_rate": 7.740445859872611e-05,
631
- "loss": 0.3254,
632
- "step": 1420
633
- },
634
- {
635
- "epoch": 2.29,
636
- "grad_norm": 5.620304107666016,
637
- "learning_rate": 7.70859872611465e-05,
638
- "loss": 0.3754,
639
- "step": 1440
640
- },
641
- {
642
- "epoch": 2.32,
643
- "grad_norm": 2.9809110164642334,
644
- "learning_rate": 7.676751592356688e-05,
645
- "loss": 0.4064,
646
- "step": 1460
647
- },
648
- {
649
- "epoch": 2.36,
650
- "grad_norm": 1.7137058973312378,
651
- "learning_rate": 7.644904458598726e-05,
652
- "loss": 0.3613,
653
- "step": 1480
654
- },
655
- {
656
- "epoch": 2.39,
657
- "grad_norm": 6.261457443237305,
658
- "learning_rate": 7.613057324840765e-05,
659
- "loss": 0.3144,
660
- "step": 1500
661
- },
662
- {
663
- "epoch": 2.39,
664
- "eval_accuracy": 0.7514285714285714,
665
- "eval_loss": 0.8657103776931763,
666
- "eval_runtime": 17.3337,
667
- "eval_samples_per_second": 60.576,
668
- "eval_steps_per_second": 7.615,
669
- "step": 1500
670
- },
671
- {
672
- "epoch": 2.42,
673
- "grad_norm": 3.399336814880371,
674
- "learning_rate": 7.581210191082803e-05,
675
- "loss": 0.3042,
676
- "step": 1520
677
- },
678
- {
679
- "epoch": 2.45,
680
- "grad_norm": 0.7967382073402405,
681
- "learning_rate": 7.54936305732484e-05,
682
- "loss": 0.291,
683
- "step": 1540
684
- },
685
- {
686
- "epoch": 2.48,
687
- "grad_norm": 6.790742874145508,
688
- "learning_rate": 7.51751592356688e-05,
689
- "loss": 0.3098,
690
- "step": 1560
691
- },
692
- {
693
- "epoch": 2.52,
694
- "grad_norm": 7.324584007263184,
695
- "learning_rate": 7.485668789808917e-05,
696
- "loss": 0.2617,
697
- "step": 1580
698
- },
699
- {
700
- "epoch": 2.55,
701
- "grad_norm": 6.444709300994873,
702
- "learning_rate": 7.453821656050955e-05,
703
- "loss": 0.3143,
704
- "step": 1600
705
- },
706
- {
707
- "epoch": 2.55,
708
- "eval_accuracy": 0.7085714285714285,
709
- "eval_loss": 1.056207299232483,
710
- "eval_runtime": 15.1055,
711
- "eval_samples_per_second": 69.511,
712
- "eval_steps_per_second": 8.739,
713
- "step": 1600
714
- },
715
- {
716
- "epoch": 2.58,
717
- "grad_norm": 7.054200172424316,
718
- "learning_rate": 7.421974522292993e-05,
719
- "loss": 0.3793,
720
- "step": 1620
721
- },
722
- {
723
- "epoch": 2.61,
724
- "grad_norm": 4.678903579711914,
725
- "learning_rate": 7.390127388535032e-05,
726
- "loss": 0.2708,
727
- "step": 1640
728
- },
729
- {
730
- "epoch": 2.64,
731
- "grad_norm": 10.15630054473877,
732
- "learning_rate": 7.35828025477707e-05,
733
- "loss": 0.3232,
734
- "step": 1660
735
- },
736
- {
737
- "epoch": 2.68,
738
- "grad_norm": 5.110073089599609,
739
- "learning_rate": 7.326433121019108e-05,
740
- "loss": 0.2127,
741
- "step": 1680
742
- },
743
- {
744
- "epoch": 2.71,
745
- "grad_norm": 9.176678657531738,
746
- "learning_rate": 7.294585987261147e-05,
747
- "loss": 0.3884,
748
- "step": 1700
749
- },
750
- {
751
- "epoch": 2.71,
752
- "eval_accuracy": 0.7161904761904762,
753
- "eval_loss": 1.0501899719238281,
754
- "eval_runtime": 14.9783,
755
- "eval_samples_per_second": 70.101,
756
- "eval_steps_per_second": 8.813,
757
- "step": 1700
758
- },
759
- {
760
- "epoch": 2.74,
761
- "grad_norm": 6.712092876434326,
762
- "learning_rate": 7.262738853503185e-05,
763
- "loss": 0.3172,
764
- "step": 1720
765
- },
766
- {
767
- "epoch": 2.77,
768
- "grad_norm": 4.619245529174805,
769
- "learning_rate": 7.230891719745222e-05,
770
- "loss": 0.3494,
771
- "step": 1740
772
- },
773
- {
774
- "epoch": 2.8,
775
- "grad_norm": 4.715748310089111,
776
- "learning_rate": 7.199044585987262e-05,
777
- "loss": 0.2987,
778
- "step": 1760
779
- },
780
- {
781
- "epoch": 2.83,
782
- "grad_norm": 0.6376525163650513,
783
- "learning_rate": 7.1671974522293e-05,
784
- "loss": 0.2985,
785
- "step": 1780
786
- },
787
- {
788
- "epoch": 2.87,
789
- "grad_norm": 3.6230549812316895,
790
- "learning_rate": 7.135350318471337e-05,
791
- "loss": 0.3472,
792
- "step": 1800
793
- },
794
- {
795
- "epoch": 2.87,
796
- "eval_accuracy": 0.7571428571428571,
797
- "eval_loss": 0.850594699382782,
798
- "eval_runtime": 14.9846,
799
- "eval_samples_per_second": 70.072,
800
- "eval_steps_per_second": 8.809,
801
- "step": 1800
802
- },
803
- {
804
- "epoch": 2.9,
805
- "grad_norm": 1.0222142934799194,
806
- "learning_rate": 7.103503184713376e-05,
807
- "loss": 0.2874,
808
- "step": 1820
809
- },
810
- {
811
- "epoch": 2.93,
812
- "grad_norm": 7.561610698699951,
813
- "learning_rate": 7.071656050955414e-05,
814
- "loss": 0.3647,
815
- "step": 1840
816
- },
817
- {
818
- "epoch": 2.96,
819
- "grad_norm": 3.159402370452881,
820
- "learning_rate": 7.039808917197452e-05,
821
- "loss": 0.2743,
822
- "step": 1860
823
- },
824
- {
825
- "epoch": 2.99,
826
- "grad_norm": 1.7933435440063477,
827
- "learning_rate": 7.007961783439491e-05,
828
- "loss": 0.2955,
829
- "step": 1880
830
- },
831
- {
832
- "epoch": 3.03,
833
- "grad_norm": 3.0481631755828857,
834
- "learning_rate": 6.976114649681529e-05,
835
- "loss": 0.2545,
836
- "step": 1900
837
- },
838
- {
839
- "epoch": 3.03,
840
- "eval_accuracy": 0.7209523809523809,
841
- "eval_loss": 1.0028949975967407,
842
- "eval_runtime": 14.9856,
843
- "eval_samples_per_second": 70.067,
844
- "eval_steps_per_second": 8.808,
845
- "step": 1900
846
- },
847
- {
848
- "epoch": 3.06,
849
- "grad_norm": 2.6126086711883545,
850
- "learning_rate": 6.944267515923567e-05,
851
- "loss": 0.233,
852
- "step": 1920
853
- },
854
- {
855
- "epoch": 3.09,
856
- "grad_norm": 4.197572708129883,
857
- "learning_rate": 6.912420382165606e-05,
858
- "loss": 0.2317,
859
- "step": 1940
860
- },
861
- {
862
- "epoch": 3.12,
863
- "grad_norm": 1.731285810470581,
864
- "learning_rate": 6.880573248407643e-05,
865
- "loss": 0.2237,
866
- "step": 1960
867
- },
868
- {
869
- "epoch": 3.15,
870
- "grad_norm": 2.4551756381988525,
871
- "learning_rate": 6.848726114649681e-05,
872
- "loss": 0.2146,
873
- "step": 1980
874
- },
875
- {
876
- "epoch": 3.18,
877
- "grad_norm": 4.228660583496094,
878
- "learning_rate": 6.81687898089172e-05,
879
- "loss": 0.2213,
880
- "step": 2000
881
- },
882
- {
883
- "epoch": 3.18,
884
- "eval_accuracy": 0.7933333333333333,
885
- "eval_loss": 0.8099285960197449,
886
- "eval_runtime": 15.0203,
887
- "eval_samples_per_second": 69.905,
888
- "eval_steps_per_second": 8.788,
889
- "step": 2000
890
- },
891
- {
892
- "epoch": 3.22,
893
- "grad_norm": 0.1519007682800293,
894
- "learning_rate": 6.785031847133758e-05,
895
- "loss": 0.2416,
896
- "step": 2020
897
- },
898
- {
899
- "epoch": 3.25,
900
- "grad_norm": 6.512945175170898,
901
- "learning_rate": 6.753184713375796e-05,
902
- "loss": 0.2955,
903
- "step": 2040
904
- },
905
- {
906
- "epoch": 3.28,
907
- "grad_norm": 3.3740453720092773,
908
- "learning_rate": 6.721337579617835e-05,
909
- "loss": 0.2502,
910
- "step": 2060
911
- },
912
- {
913
- "epoch": 3.31,
914
- "grad_norm": 13.082685470581055,
915
- "learning_rate": 6.689490445859873e-05,
916
- "loss": 0.3225,
917
- "step": 2080
918
- },
919
- {
920
- "epoch": 3.34,
921
- "grad_norm": 0.8484336137771606,
922
- "learning_rate": 6.65764331210191e-05,
923
- "loss": 0.3429,
924
- "step": 2100
925
- },
926
- {
927
- "epoch": 3.34,
928
- "eval_accuracy": 0.7466666666666667,
929
- "eval_loss": 0.9165627360343933,
930
- "eval_runtime": 15.0876,
931
- "eval_samples_per_second": 69.593,
932
- "eval_steps_per_second": 8.749,
933
- "step": 2100
934
- },
935
- {
936
- "epoch": 3.38,
937
- "grad_norm": 2.131553888320923,
938
- "learning_rate": 6.627388535031848e-05,
939
- "loss": 0.2263,
940
- "step": 2120
941
- },
942
- {
943
- "epoch": 3.41,
944
- "grad_norm": 0.06019827350974083,
945
- "learning_rate": 6.595541401273886e-05,
946
- "loss": 0.2397,
947
- "step": 2140
948
- },
949
- {
950
- "epoch": 3.44,
951
- "grad_norm": 6.373557090759277,
952
- "learning_rate": 6.563694267515924e-05,
953
- "loss": 0.2236,
954
- "step": 2160
955
- },
956
- {
957
- "epoch": 3.47,
958
- "grad_norm": 2.4861459732055664,
959
- "learning_rate": 6.531847133757962e-05,
960
- "loss": 0.2515,
961
- "step": 2180
962
- },
963
- {
964
- "epoch": 3.5,
965
- "grad_norm": 11.316793441772461,
966
- "learning_rate": 6.500000000000001e-05,
967
- "loss": 0.3478,
968
- "step": 2200
969
- },
970
- {
971
- "epoch": 3.5,
972
- "eval_accuracy": 0.7561904761904762,
973
- "eval_loss": 0.9201664328575134,
974
- "eval_runtime": 15.3295,
975
- "eval_samples_per_second": 68.495,
976
- "eval_steps_per_second": 8.611,
977
- "step": 2200
978
- },
979
- {
980
- "epoch": 3.54,
981
- "grad_norm": 6.753748893737793,
982
- "learning_rate": 6.468152866242039e-05,
983
- "loss": 0.2106,
984
- "step": 2220
985
- },
986
- {
987
- "epoch": 3.57,
988
- "grad_norm": 3.0223276615142822,
989
- "learning_rate": 6.436305732484076e-05,
990
- "loss": 0.1678,
991
- "step": 2240
992
- },
993
- {
994
- "epoch": 3.6,
995
- "grad_norm": 8.31506633758545,
996
- "learning_rate": 6.404458598726115e-05,
997
- "loss": 0.2301,
998
- "step": 2260
999
- },
1000
- {
1001
- "epoch": 3.63,
1002
- "grad_norm": 12.245420455932617,
1003
- "learning_rate": 6.372611464968153e-05,
1004
- "loss": 0.2038,
1005
- "step": 2280
1006
- },
1007
- {
1008
- "epoch": 3.66,
1009
- "grad_norm": 4.953712463378906,
1010
- "learning_rate": 6.340764331210191e-05,
1011
- "loss": 0.2247,
1012
- "step": 2300
1013
- },
1014
- {
1015
- "epoch": 3.66,
1016
- "eval_accuracy": 0.7638095238095238,
1017
- "eval_loss": 0.985853910446167,
1018
- "eval_runtime": 15.021,
1019
- "eval_samples_per_second": 69.902,
1020
- "eval_steps_per_second": 8.788,
1021
- "step": 2300
1022
- },
1023
- {
1024
- "epoch": 3.69,
1025
- "grad_norm": 7.013779640197754,
1026
- "learning_rate": 6.30891719745223e-05,
1027
- "loss": 0.2492,
1028
- "step": 2320
1029
- },
1030
- {
1031
- "epoch": 3.73,
1032
- "grad_norm": 7.607855319976807,
1033
- "learning_rate": 6.277070063694268e-05,
1034
- "loss": 0.1915,
1035
- "step": 2340
1036
- },
1037
- {
1038
- "epoch": 3.76,
1039
- "grad_norm": 7.456921100616455,
1040
- "learning_rate": 6.245222929936306e-05,
1041
- "loss": 0.2568,
1042
- "step": 2360
1043
- },
1044
- {
1045
- "epoch": 3.79,
1046
- "grad_norm": 5.894541263580322,
1047
- "learning_rate": 6.213375796178345e-05,
1048
- "loss": 0.2534,
1049
- "step": 2380
1050
- },
1051
- {
1052
- "epoch": 3.82,
1053
- "grad_norm": 2.887982130050659,
1054
- "learning_rate": 6.181528662420383e-05,
1055
- "loss": 0.2873,
1056
- "step": 2400
1057
- },
1058
- {
1059
- "epoch": 3.82,
1060
- "eval_accuracy": 0.7390476190476191,
1061
- "eval_loss": 1.0160545110702515,
1062
- "eval_runtime": 15.1086,
1063
- "eval_samples_per_second": 69.497,
1064
- "eval_steps_per_second": 8.737,
1065
- "step": 2400
1066
- },
1067
- {
1068
- "epoch": 3.85,
1069
- "grad_norm": 0.935616672039032,
1070
- "learning_rate": 6.14968152866242e-05,
1071
- "loss": 0.3611,
1072
- "step": 2420
1073
- },
1074
- {
1075
- "epoch": 3.89,
1076
- "grad_norm": 1.3130766153335571,
1077
- "learning_rate": 6.11783439490446e-05,
1078
- "loss": 0.2596,
1079
- "step": 2440
1080
- },
1081
- {
1082
- "epoch": 3.92,
1083
- "grad_norm": 2.0882582664489746,
1084
- "learning_rate": 6.085987261146497e-05,
1085
- "loss": 0.311,
1086
- "step": 2460
1087
- },
1088
- {
1089
- "epoch": 3.95,
1090
- "grad_norm": 5.632941246032715,
1091
- "learning_rate": 6.054140127388536e-05,
1092
- "loss": 0.2415,
1093
- "step": 2480
1094
- },
1095
- {
1096
- "epoch": 3.98,
1097
- "grad_norm": 0.2712355852127075,
1098
- "learning_rate": 6.0222929936305736e-05,
1099
- "loss": 0.2815,
1100
- "step": 2500
1101
- },
1102
- {
1103
- "epoch": 3.98,
1104
- "eval_accuracy": 0.7590476190476191,
1105
- "eval_loss": 0.9630508422851562,
1106
- "eval_runtime": 15.1527,
1107
- "eval_samples_per_second": 69.295,
1108
- "eval_steps_per_second": 8.711,
1109
- "step": 2500
1110
- },
1111
- {
1112
- "epoch": 4.01,
1113
- "grad_norm": 1.808802843093872,
1114
- "learning_rate": 5.990445859872612e-05,
1115
- "loss": 0.2004,
1116
- "step": 2520
1117
- },
1118
- {
1119
- "epoch": 4.04,
1120
- "grad_norm": 1.8293687105178833,
1121
- "learning_rate": 5.9585987261146505e-05,
1122
- "loss": 0.2575,
1123
- "step": 2540
1124
- },
1125
- {
1126
- "epoch": 4.08,
1127
- "grad_norm": 7.782797813415527,
1128
- "learning_rate": 5.926751592356688e-05,
1129
- "loss": 0.2977,
1130
- "step": 2560
1131
- },
1132
- {
1133
- "epoch": 4.11,
1134
- "grad_norm": 6.617612838745117,
1135
- "learning_rate": 5.894904458598727e-05,
1136
- "loss": 0.1844,
1137
- "step": 2580
1138
- },
1139
- {
1140
- "epoch": 4.14,
1141
- "grad_norm": 2.1140823364257812,
1142
- "learning_rate": 5.863057324840765e-05,
1143
- "loss": 0.1706,
1144
- "step": 2600
1145
- },
1146
- {
1147
- "epoch": 4.14,
1148
- "eval_accuracy": 0.741904761904762,
1149
- "eval_loss": 0.9995649456977844,
1150
- "eval_runtime": 14.9483,
1151
- "eval_samples_per_second": 70.242,
1152
- "eval_steps_per_second": 8.83,
1153
- "step": 2600
1154
- },
1155
- {
1156
- "epoch": 4.17,
1157
- "grad_norm": 3.152398109436035,
1158
- "learning_rate": 5.831210191082803e-05,
1159
- "loss": 0.2508,
1160
- "step": 2620
1161
- },
1162
- {
1163
- "epoch": 4.2,
1164
- "grad_norm": 2.778350830078125,
1165
- "learning_rate": 5.7993630573248414e-05,
1166
- "loss": 0.1961,
1167
- "step": 2640
1168
- },
1169
- {
1170
- "epoch": 4.24,
1171
- "grad_norm": 2.680100679397583,
1172
- "learning_rate": 5.76751592356688e-05,
1173
- "loss": 0.2261,
1174
- "step": 2660
1175
- },
1176
- {
1177
- "epoch": 4.27,
1178
- "grad_norm": 1.4785109758377075,
1179
- "learning_rate": 5.7356687898089176e-05,
1180
- "loss": 0.3083,
1181
- "step": 2680
1182
- },
1183
- {
1184
- "epoch": 4.3,
1185
- "grad_norm": 7.928396224975586,
1186
- "learning_rate": 5.703821656050956e-05,
1187
- "loss": 0.1709,
1188
- "step": 2700
1189
- },
1190
- {
1191
- "epoch": 4.3,
1192
- "eval_accuracy": 0.7028571428571428,
1193
- "eval_loss": 1.1969478130340576,
1194
- "eval_runtime": 15.008,
1195
- "eval_samples_per_second": 69.963,
1196
- "eval_steps_per_second": 8.795,
1197
- "step": 2700
1198
- },
1199
- {
1200
- "epoch": 4.33,
1201
- "grad_norm": 4.27495813369751,
1202
- "learning_rate": 5.673566878980892e-05,
1203
- "loss": 0.2189,
1204
- "step": 2720
1205
- },
1206
- {
1207
- "epoch": 4.36,
1208
- "grad_norm": 2.0654408931732178,
1209
- "learning_rate": 5.6417197452229296e-05,
1210
- "loss": 0.1724,
1211
- "step": 2740
1212
- },
1213
- {
1214
- "epoch": 4.39,
1215
- "grad_norm": 6.015655517578125,
1216
- "learning_rate": 5.609872611464968e-05,
1217
- "loss": 0.2304,
1218
- "step": 2760
1219
- },
1220
- {
1221
- "epoch": 4.43,
1222
- "grad_norm": 5.418392658233643,
1223
- "learning_rate": 5.5780254777070065e-05,
1224
- "loss": 0.2434,
1225
- "step": 2780
1226
- },
1227
- {
1228
- "epoch": 4.46,
1229
- "grad_norm": 5.330830097198486,
1230
- "learning_rate": 5.546178343949044e-05,
1231
- "loss": 0.2847,
1232
- "step": 2800
1233
- },
1234
- {
1235
- "epoch": 4.46,
1236
- "eval_accuracy": 0.7276190476190476,
1237
- "eval_loss": 1.0895568132400513,
1238
- "eval_runtime": 15.0519,
1239
- "eval_samples_per_second": 69.759,
1240
- "eval_steps_per_second": 8.77,
1241
- "step": 2800
1242
- },
1243
- {
1244
- "epoch": 4.49,
1245
- "grad_norm": 3.495591163635254,
1246
- "learning_rate": 5.514331210191083e-05,
1247
- "loss": 0.2109,
1248
- "step": 2820
1249
- },
1250
- {
1251
- "epoch": 4.52,
1252
- "grad_norm": 2.1377482414245605,
1253
- "learning_rate": 5.482484076433121e-05,
1254
- "loss": 0.2599,
1255
- "step": 2840
1256
- },
1257
- {
1258
- "epoch": 4.55,
1259
- "grad_norm": 3.6066718101501465,
1260
- "learning_rate": 5.450636942675159e-05,
1261
- "loss": 0.2052,
1262
- "step": 2860
1263
- },
1264
- {
1265
- "epoch": 4.59,
1266
- "grad_norm": 8.174120903015137,
1267
- "learning_rate": 5.4187898089171974e-05,
1268
- "loss": 0.2501,
1269
- "step": 2880
1270
- },
1271
- {
1272
- "epoch": 4.62,
1273
- "grad_norm": 7.208817481994629,
1274
- "learning_rate": 5.386942675159236e-05,
1275
- "loss": 0.286,
1276
- "step": 2900
1277
- },
1278
- {
1279
- "epoch": 4.62,
1280
- "eval_accuracy": 0.7628571428571429,
1281
- "eval_loss": 0.9894290566444397,
1282
- "eval_runtime": 15.1829,
1283
- "eval_samples_per_second": 69.157,
1284
- "eval_steps_per_second": 8.694,
1285
- "step": 2900
1286
- },
1287
- {
1288
- "epoch": 4.65,
1289
- "grad_norm": 0.47829246520996094,
1290
- "learning_rate": 5.3550955414012736e-05,
1291
- "loss": 0.202,
1292
- "step": 2920
1293
- },
1294
- {
1295
- "epoch": 4.68,
1296
- "grad_norm": 4.554569244384766,
1297
- "learning_rate": 5.323248407643312e-05,
1298
- "loss": 0.225,
1299
- "step": 2940
1300
- },
1301
- {
1302
- "epoch": 4.71,
1303
- "grad_norm": 7.922834396362305,
1304
- "learning_rate": 5.2914012738853506e-05,
1305
- "loss": 0.2861,
1306
- "step": 2960
1307
- },
1308
- {
1309
- "epoch": 4.75,
1310
- "grad_norm": 4.200577735900879,
1311
- "learning_rate": 5.2595541401273883e-05,
1312
- "loss": 0.173,
1313
- "step": 2980
1314
- },
1315
- {
1316
- "epoch": 4.78,
1317
- "grad_norm": 6.183908939361572,
1318
- "learning_rate": 5.227707006369427e-05,
1319
- "loss": 0.2066,
1320
- "step": 3000
1321
- },
1322
- {
1323
- "epoch": 4.78,
1324
- "eval_accuracy": 0.7485714285714286,
1325
- "eval_loss": 1.0703792572021484,
1326
- "eval_runtime": 15.0438,
1327
- "eval_samples_per_second": 69.796,
1328
- "eval_steps_per_second": 8.774,
1329
- "step": 3000
1330
- },
1331
- {
1332
- "epoch": 4.81,
1333
- "grad_norm": 8.824482917785645,
1334
- "learning_rate": 5.1958598726114646e-05,
1335
- "loss": 0.2808,
1336
- "step": 3020
1337
- },
1338
- {
1339
- "epoch": 4.84,
1340
- "grad_norm": 4.873842716217041,
1341
- "learning_rate": 5.164012738853503e-05,
1342
- "loss": 0.2838,
1343
- "step": 3040
1344
- },
1345
- {
1346
- "epoch": 4.87,
1347
- "grad_norm": 4.467446327209473,
1348
- "learning_rate": 5.1321656050955415e-05,
1349
- "loss": 0.1993,
1350
- "step": 3060
1351
- },
1352
- {
1353
- "epoch": 4.9,
1354
- "grad_norm": 2.0264174938201904,
1355
- "learning_rate": 5.100318471337579e-05,
1356
- "loss": 0.1696,
1357
- "step": 3080
1358
- },
1359
- {
1360
- "epoch": 4.94,
1361
- "grad_norm": 0.3309895396232605,
1362
- "learning_rate": 5.068471337579618e-05,
1363
- "loss": 0.1579,
1364
- "step": 3100
1365
- },
1366
- {
1367
- "epoch": 4.94,
1368
- "eval_accuracy": 0.780952380952381,
1369
- "eval_loss": 0.9727718234062195,
1370
- "eval_runtime": 15.0757,
1371
- "eval_samples_per_second": 69.648,
1372
- "eval_steps_per_second": 8.756,
1373
- "step": 3100
1374
- },
1375
- {
1376
- "epoch": 4.97,
1377
- "grad_norm": 13.229090690612793,
1378
- "learning_rate": 5.036624203821656e-05,
1379
- "loss": 0.2475,
1380
- "step": 3120
1381
- },
1382
- {
1383
- "epoch": 5.0,
1384
- "grad_norm": 0.18536876142024994,
1385
- "learning_rate": 5.004777070063694e-05,
1386
- "loss": 0.1731,
1387
- "step": 3140
1388
- },
1389
- {
1390
- "epoch": 5.03,
1391
- "grad_norm": 2.7908613681793213,
1392
- "learning_rate": 4.9729299363057324e-05,
1393
- "loss": 0.1825,
1394
- "step": 3160
1395
- },
1396
- {
1397
- "epoch": 5.06,
1398
- "grad_norm": 1.8446813821792603,
1399
- "learning_rate": 4.941082802547771e-05,
1400
- "loss": 0.225,
1401
- "step": 3180
1402
- },
1403
- {
1404
- "epoch": 5.1,
1405
- "grad_norm": 1.650841236114502,
1406
- "learning_rate": 4.9092356687898087e-05,
1407
- "loss": 0.1716,
1408
- "step": 3200
1409
- },
1410
- {
1411
- "epoch": 5.1,
1412
- "eval_accuracy": 0.7123809523809523,
1413
- "eval_loss": 1.1833752393722534,
1414
- "eval_runtime": 15.2994,
1415
- "eval_samples_per_second": 68.63,
1416
- "eval_steps_per_second": 8.628,
1417
- "step": 3200
1418
- },
1419
- {
1420
- "epoch": 5.13,
1421
- "grad_norm": 2.1248185634613037,
1422
- "learning_rate": 4.877388535031847e-05,
1423
- "loss": 0.1548,
1424
- "step": 3220
1425
- },
1426
- {
1427
- "epoch": 5.16,
1428
- "grad_norm": 1.8586862087249756,
1429
- "learning_rate": 4.8455414012738856e-05,
1430
- "loss": 0.1676,
1431
- "step": 3240
1432
- },
1433
- {
1434
- "epoch": 5.19,
1435
- "grad_norm": 0.8100853562355042,
1436
- "learning_rate": 4.8136942675159233e-05,
1437
- "loss": 0.2012,
1438
- "step": 3260
1439
- },
1440
- {
1441
- "epoch": 5.22,
1442
- "grad_norm": 4.240858554840088,
1443
- "learning_rate": 4.781847133757962e-05,
1444
- "loss": 0.2005,
1445
- "step": 3280
1446
- },
1447
- {
1448
- "epoch": 5.25,
1449
- "grad_norm": 10.562071800231934,
1450
- "learning_rate": 4.75e-05,
1451
- "loss": 0.1584,
1452
- "step": 3300
1453
- },
1454
- {
1455
- "epoch": 5.25,
1456
- "eval_accuracy": 0.7523809523809524,
1457
- "eval_loss": 1.0277926921844482,
1458
- "eval_runtime": 15.3062,
1459
- "eval_samples_per_second": 68.6,
1460
- "eval_steps_per_second": 8.624,
1461
- "step": 3300
1462
- },
1463
- {
1464
- "epoch": 5.29,
1465
- "grad_norm": 0.4224202632904053,
1466
- "learning_rate": 4.718152866242038e-05,
1467
- "loss": 0.199,
1468
- "step": 3320
1469
- },
1470
- {
1471
- "epoch": 5.32,
1472
- "grad_norm": 0.20747283101081848,
1473
- "learning_rate": 4.6863057324840765e-05,
1474
- "loss": 0.2439,
1475
- "step": 3340
1476
- },
1477
- {
1478
- "epoch": 5.35,
1479
- "grad_norm": 0.2911475896835327,
1480
- "learning_rate": 4.654458598726115e-05,
1481
- "loss": 0.2808,
1482
- "step": 3360
1483
- },
1484
- {
1485
- "epoch": 5.38,
1486
- "grad_norm": 4.381645202636719,
1487
- "learning_rate": 4.622611464968153e-05,
1488
- "loss": 0.1426,
1489
- "step": 3380
1490
- },
1491
- {
1492
- "epoch": 5.41,
1493
- "grad_norm": 2.27622389793396,
1494
- "learning_rate": 4.590764331210191e-05,
1495
- "loss": 0.1419,
1496
- "step": 3400
1497
- },
1498
- {
1499
- "epoch": 5.41,
1500
- "eval_accuracy": 0.7371428571428571,
1501
- "eval_loss": 1.1342014074325562,
1502
- "eval_runtime": 15.0273,
1503
- "eval_samples_per_second": 69.873,
1504
- "eval_steps_per_second": 8.784,
1505
- "step": 3400
1506
- },
1507
- {
1508
- "epoch": 5.45,
1509
- "grad_norm": 2.8847768306732178,
1510
- "learning_rate": 4.5589171974522296e-05,
1511
- "loss": 0.2371,
1512
- "step": 3420
1513
- },
1514
- {
1515
- "epoch": 5.48,
1516
- "grad_norm": 12.182328224182129,
1517
- "learning_rate": 4.5270700636942674e-05,
1518
- "loss": 0.2688,
1519
- "step": 3440
1520
- },
1521
- {
1522
- "epoch": 5.51,
1523
- "grad_norm": 5.950127124786377,
1524
- "learning_rate": 4.495222929936306e-05,
1525
- "loss": 0.1657,
1526
- "step": 3460
1527
- },
1528
- {
1529
- "epoch": 5.54,
1530
- "grad_norm": 0.8961606621742249,
1531
- "learning_rate": 4.4633757961783443e-05,
1532
- "loss": 0.1859,
1533
- "step": 3480
1534
- },
1535
- {
1536
- "epoch": 5.57,
1537
- "grad_norm": 4.591856956481934,
1538
- "learning_rate": 4.431528662420382e-05,
1539
- "loss": 0.2002,
1540
- "step": 3500
1541
- },
1542
- {
1543
- "epoch": 5.57,
1544
- "eval_accuracy": 0.7323809523809524,
1545
- "eval_loss": 1.1519098281860352,
1546
- "eval_runtime": 15.3172,
1547
- "eval_samples_per_second": 68.551,
1548
- "eval_steps_per_second": 8.618,
1549
- "step": 3500
1550
- },
1551
- {
1552
- "epoch": 5.61,
1553
- "grad_norm": 4.491227626800537,
1554
- "learning_rate": 4.3996815286624206e-05,
1555
- "loss": 0.1545,
1556
- "step": 3520
1557
- },
1558
- {
1559
- "epoch": 5.64,
1560
- "grad_norm": 10.363092422485352,
1561
- "learning_rate": 4.3678343949044584e-05,
1562
- "loss": 0.207,
1563
- "step": 3540
1564
- },
1565
- {
1566
- "epoch": 5.67,
1567
- "grad_norm": 5.781987190246582,
1568
- "learning_rate": 4.335987261146497e-05,
1569
- "loss": 0.1606,
1570
- "step": 3560
1571
- },
1572
- {
1573
- "epoch": 5.7,
1574
- "grad_norm": 13.844124794006348,
1575
- "learning_rate": 4.304140127388535e-05,
1576
- "loss": 0.139,
1577
- "step": 3580
1578
- },
1579
- {
1580
- "epoch": 5.73,
1581
- "grad_norm": 10.184959411621094,
1582
- "learning_rate": 4.272292993630573e-05,
1583
- "loss": 0.1987,
1584
- "step": 3600
1585
- },
1586
- {
1587
- "epoch": 5.73,
1588
- "eval_accuracy": 0.7561904761904762,
1589
- "eval_loss": 1.0741750001907349,
1590
- "eval_runtime": 15.1322,
1591
- "eval_samples_per_second": 69.388,
1592
- "eval_steps_per_second": 8.723,
1593
- "step": 3600
1594
- },
1595
- {
1596
- "epoch": 5.76,
1597
- "grad_norm": 1.3531529903411865,
1598
- "learning_rate": 4.2404458598726115e-05,
1599
- "loss": 0.1534,
1600
- "step": 3620
1601
- },
1602
- {
1603
- "epoch": 5.8,
1604
- "grad_norm": 5.064535617828369,
1605
- "learning_rate": 4.20859872611465e-05,
1606
- "loss": 0.2151,
1607
- "step": 3640
1608
- },
1609
- {
1610
- "epoch": 5.83,
1611
- "grad_norm": 3.2219882011413574,
1612
- "learning_rate": 4.176751592356688e-05,
1613
- "loss": 0.1485,
1614
- "step": 3660
1615
- },
1616
- {
1617
- "epoch": 5.86,
1618
- "grad_norm": 2.8255577087402344,
1619
- "learning_rate": 4.144904458598726e-05,
1620
- "loss": 0.1773,
1621
- "step": 3680
1622
- },
1623
- {
1624
- "epoch": 5.89,
1625
- "grad_norm": 8.52731704711914,
1626
- "learning_rate": 4.1130573248407647e-05,
1627
- "loss": 0.1207,
1628
- "step": 3700
1629
- },
1630
- {
1631
- "epoch": 5.89,
1632
- "eval_accuracy": 0.7380952380952381,
1633
- "eval_loss": 1.1578549146652222,
1634
- "eval_runtime": 14.9376,
1635
- "eval_samples_per_second": 70.293,
1636
- "eval_steps_per_second": 8.837,
1637
- "step": 3700
1638
- },
1639
- {
1640
- "epoch": 5.92,
1641
- "grad_norm": 4.12412166595459,
1642
- "learning_rate": 4.0812101910828024e-05,
1643
- "loss": 0.1709,
1644
- "step": 3720
1645
- },
1646
- {
1647
- "epoch": 5.96,
1648
- "grad_norm": 6.599554538726807,
1649
- "learning_rate": 4.049363057324841e-05,
1650
- "loss": 0.2047,
1651
- "step": 3740
1652
- },
1653
- {
1654
- "epoch": 5.99,
1655
- "grad_norm": 1.2085940837860107,
1656
- "learning_rate": 4.0175159235668793e-05,
1657
- "loss": 0.1329,
1658
- "step": 3760
1659
- },
1660
- {
1661
- "epoch": 6.02,
1662
- "grad_norm": 0.18271246552467346,
1663
- "learning_rate": 3.985668789808917e-05,
1664
- "loss": 0.3181,
1665
- "step": 3780
1666
- },
1667
- {
1668
- "epoch": 6.05,
1669
- "grad_norm": 3.431486129760742,
1670
- "learning_rate": 3.9538216560509556e-05,
1671
- "loss": 0.1403,
1672
- "step": 3800
1673
- },
1674
- {
1675
- "epoch": 6.05,
1676
- "eval_accuracy": 0.7304761904761905,
1677
- "eval_loss": 1.100738763809204,
1678
- "eval_runtime": 15.064,
1679
- "eval_samples_per_second": 69.703,
1680
- "eval_steps_per_second": 8.763,
1681
- "step": 3800
1682
- },
1683
- {
1684
- "epoch": 6.08,
1685
- "grad_norm": 2.7748119831085205,
1686
- "learning_rate": 3.921974522292994e-05,
1687
- "loss": 0.2,
1688
- "step": 3820
1689
- },
1690
- {
1691
- "epoch": 6.11,
1692
- "grad_norm": 7.583074569702148,
1693
- "learning_rate": 3.890127388535032e-05,
1694
- "loss": 0.1741,
1695
- "step": 3840
1696
- },
1697
- {
1698
- "epoch": 6.15,
1699
- "grad_norm": 0.11816076934337616,
1700
- "learning_rate": 3.85828025477707e-05,
1701
- "loss": 0.1037,
1702
- "step": 3860
1703
- },
1704
- {
1705
- "epoch": 6.18,
1706
- "grad_norm": 1.9137811660766602,
1707
- "learning_rate": 3.826433121019109e-05,
1708
- "loss": 0.1443,
1709
- "step": 3880
1710
- },
1711
- {
1712
- "epoch": 6.21,
1713
- "grad_norm": 0.09991751611232758,
1714
- "learning_rate": 3.7945859872611465e-05,
1715
- "loss": 0.1569,
1716
- "step": 3900
1717
- },
1718
- {
1719
- "epoch": 6.21,
1720
- "eval_accuracy": 0.7466666666666667,
1721
- "eval_loss": 1.112838864326477,
1722
- "eval_runtime": 15.0365,
1723
- "eval_samples_per_second": 69.83,
1724
- "eval_steps_per_second": 8.779,
1725
- "step": 3900
1726
- },
1727
- {
1728
- "epoch": 6.24,
1729
- "grad_norm": 1.1668697595596313,
1730
- "learning_rate": 3.762738853503185e-05,
1731
- "loss": 0.177,
1732
- "step": 3920
1733
- },
1734
- {
1735
- "epoch": 6.27,
1736
- "grad_norm": 1.2017813920974731,
1737
- "learning_rate": 3.7308917197452234e-05,
1738
- "loss": 0.2475,
1739
- "step": 3940
1740
- },
1741
- {
1742
- "epoch": 6.31,
1743
- "grad_norm": 1.67183256149292,
1744
- "learning_rate": 3.699044585987261e-05,
1745
- "loss": 0.1248,
1746
- "step": 3960
1747
- },
1748
- {
1749
- "epoch": 6.34,
1750
- "grad_norm": 3.272127389907837,
1751
- "learning_rate": 3.6671974522292997e-05,
1752
- "loss": 0.1516,
1753
- "step": 3980
1754
- },
1755
- {
1756
- "epoch": 6.37,
1757
- "grad_norm": 2.3881561756134033,
1758
- "learning_rate": 3.635350318471338e-05,
1759
- "loss": 0.1763,
1760
- "step": 4000
1761
- },
1762
- {
1763
- "epoch": 6.37,
1764
- "eval_accuracy": 0.7523809523809524,
1765
- "eval_loss": 1.0720144510269165,
1766
- "eval_runtime": 15.1243,
1767
- "eval_samples_per_second": 69.425,
1768
- "eval_steps_per_second": 8.728,
1769
- "step": 4000
1770
- },
1771
- {
1772
- "epoch": 6.4,
1773
- "grad_norm": 3.0372302532196045,
1774
- "learning_rate": 3.603503184713376e-05,
1775
- "loss": 0.1839,
1776
- "step": 4020
1777
- },
1778
- {
1779
- "epoch": 6.43,
1780
- "grad_norm": 5.162286281585693,
1781
- "learning_rate": 3.5716560509554144e-05,
1782
- "loss": 0.2029,
1783
- "step": 4040
1784
- },
1785
- {
1786
- "epoch": 6.46,
1787
- "grad_norm": 1.8002054691314697,
1788
- "learning_rate": 3.539808917197452e-05,
1789
- "loss": 0.237,
1790
- "step": 4060
1791
- },
1792
- {
1793
- "epoch": 6.5,
1794
- "grad_norm": 0.39735618233680725,
1795
- "learning_rate": 3.5079617834394906e-05,
1796
- "loss": 0.1846,
1797
- "step": 4080
1798
- },
1799
- {
1800
- "epoch": 6.53,
1801
- "grad_norm": 3.818883180618286,
1802
- "learning_rate": 3.476114649681529e-05,
1803
- "loss": 0.2426,
1804
- "step": 4100
1805
- },
1806
- {
1807
- "epoch": 6.53,
1808
- "eval_accuracy": 0.7247619047619047,
1809
- "eval_loss": 1.1483975648880005,
1810
- "eval_runtime": 14.9761,
1811
- "eval_samples_per_second": 70.112,
1812
- "eval_steps_per_second": 8.814,
1813
- "step": 4100
1814
- },
1815
- {
1816
- "epoch": 6.56,
1817
- "grad_norm": 0.8610227704048157,
1818
- "learning_rate": 3.444267515923567e-05,
1819
- "loss": 0.1394,
1820
- "step": 4120
1821
- },
1822
- {
1823
- "epoch": 6.59,
1824
- "grad_norm": 7.635997295379639,
1825
- "learning_rate": 3.412420382165605e-05,
1826
- "loss": 0.2134,
1827
- "step": 4140
1828
- },
1829
- {
1830
- "epoch": 6.62,
1831
- "grad_norm": 0.2937486171722412,
1832
- "learning_rate": 3.380573248407644e-05,
1833
- "loss": 0.1797,
1834
- "step": 4160
1835
- },
1836
- {
1837
- "epoch": 6.66,
1838
- "grad_norm": 2.422271966934204,
1839
- "learning_rate": 3.3487261146496815e-05,
1840
- "loss": 0.1803,
1841
- "step": 4180
1842
- },
1843
- {
1844
- "epoch": 6.69,
1845
- "grad_norm": 6.8127007484436035,
1846
- "learning_rate": 3.31687898089172e-05,
1847
- "loss": 0.1434,
1848
- "step": 4200
1849
- },
1850
- {
1851
- "epoch": 6.69,
1852
- "eval_accuracy": 0.7342857142857143,
1853
- "eval_loss": 1.1789644956588745,
1854
- "eval_runtime": 14.9138,
1855
- "eval_samples_per_second": 70.405,
1856
- "eval_steps_per_second": 8.851,
1857
- "step": 4200
1858
- },
1859
- {
1860
- "epoch": 6.72,
1861
- "grad_norm": 0.9255816340446472,
1862
- "learning_rate": 3.2850318471337584e-05,
1863
- "loss": 0.2083,
1864
- "step": 4220
1865
- },
1866
- {
1867
- "epoch": 6.75,
1868
- "grad_norm": 1.347322702407837,
1869
- "learning_rate": 3.253184713375796e-05,
1870
- "loss": 0.2296,
1871
- "step": 4240
1872
- },
1873
- {
1874
- "epoch": 6.78,
1875
- "grad_norm": 0.05040664225816727,
1876
- "learning_rate": 3.221337579617835e-05,
1877
- "loss": 0.1151,
1878
- "step": 4260
1879
- },
1880
- {
1881
- "epoch": 6.82,
1882
- "grad_norm": 0.04670969396829605,
1883
- "learning_rate": 3.189490445859873e-05,
1884
- "loss": 0.1435,
1885
- "step": 4280
1886
- },
1887
- {
1888
- "epoch": 6.85,
1889
- "grad_norm": 3.0971717834472656,
1890
- "learning_rate": 3.157643312101911e-05,
1891
- "loss": 0.2191,
1892
- "step": 4300
1893
- },
1894
- {
1895
- "epoch": 6.85,
1896
- "eval_accuracy": 0.7485714285714286,
1897
- "eval_loss": 1.1169472932815552,
1898
- "eval_runtime": 14.9321,
1899
- "eval_samples_per_second": 70.319,
1900
- "eval_steps_per_second": 8.84,
1901
- "step": 4300
1902
- },
1903
- {
1904
- "epoch": 6.88,
1905
- "grad_norm": 0.6114574670791626,
1906
- "learning_rate": 3.1257961783439494e-05,
1907
- "loss": 0.1114,
1908
- "step": 4320
1909
- },
1910
- {
1911
- "epoch": 6.91,
1912
- "grad_norm": 4.660861015319824,
1913
- "learning_rate": 3.093949044585988e-05,
1914
- "loss": 0.186,
1915
- "step": 4340
1916
- },
1917
- {
1918
- "epoch": 6.94,
1919
- "grad_norm": 1.0122532844543457,
1920
- "learning_rate": 3.0621019108280256e-05,
1921
- "loss": 0.1284,
1922
- "step": 4360
1923
- },
1924
- {
1925
- "epoch": 6.97,
1926
- "grad_norm": 4.9644880294799805,
1927
- "learning_rate": 3.030254777070064e-05,
1928
- "loss": 0.2162,
1929
- "step": 4380
1930
- },
1931
- {
1932
- "epoch": 7.01,
1933
- "grad_norm": 3.802830219268799,
1934
- "learning_rate": 2.998407643312102e-05,
1935
- "loss": 0.2062,
1936
- "step": 4400
1937
- },
1938
- {
1939
- "epoch": 7.01,
1940
- "eval_accuracy": 0.7609523809523809,
1941
- "eval_loss": 1.1300264596939087,
1942
- "eval_runtime": 14.9402,
1943
- "eval_samples_per_second": 70.28,
1944
- "eval_steps_per_second": 8.835,
1945
- "step": 4400
1946
- },
1947
- {
1948
- "epoch": 7.04,
1949
- "grad_norm": 0.5785458087921143,
1950
- "learning_rate": 2.9665605095541403e-05,
1951
- "loss": 0.082,
1952
- "step": 4420
1953
- },
1954
- {
1955
- "epoch": 7.07,
1956
- "grad_norm": 0.0573742650449276,
1957
- "learning_rate": 2.9347133757961787e-05,
1958
- "loss": 0.1013,
1959
- "step": 4440
1960
- },
1961
- {
1962
- "epoch": 7.1,
1963
- "grad_norm": 2.6736879348754883,
1964
- "learning_rate": 2.902866242038217e-05,
1965
- "loss": 0.1665,
1966
- "step": 4460
1967
- },
1968
- {
1969
- "epoch": 7.13,
1970
- "grad_norm": 4.041416645050049,
1971
- "learning_rate": 2.871019108280255e-05,
1972
- "loss": 0.1626,
1973
- "step": 4480
1974
- },
1975
- {
1976
- "epoch": 7.17,
1977
- "grad_norm": 1.28642737865448,
1978
- "learning_rate": 2.8391719745222934e-05,
1979
- "loss": 0.1495,
1980
- "step": 4500
1981
- },
1982
- {
1983
- "epoch": 7.17,
1984
- "eval_accuracy": 0.7495238095238095,
1985
- "eval_loss": 1.1477110385894775,
1986
- "eval_runtime": 14.8067,
1987
- "eval_samples_per_second": 70.914,
1988
- "eval_steps_per_second": 8.915,
1989
- "step": 4500
1990
- },
1991
- {
1992
- "epoch": 7.2,
1993
- "grad_norm": 2.69707989692688,
1994
- "learning_rate": 2.8073248407643316e-05,
1995
- "loss": 0.1423,
1996
- "step": 4520
1997
- },
1998
- {
1999
- "epoch": 7.23,
2000
- "grad_norm": 6.730169296264648,
2001
- "learning_rate": 2.7754777070063697e-05,
2002
- "loss": 0.1623,
2003
- "step": 4540
2004
- },
2005
- {
2006
- "epoch": 7.26,
2007
- "grad_norm": 7.955442428588867,
2008
- "learning_rate": 2.743630573248408e-05,
2009
- "loss": 0.2045,
2010
- "step": 4560
2011
- },
2012
- {
2013
- "epoch": 7.29,
2014
- "grad_norm": 3.3925881385803223,
2015
- "learning_rate": 2.7117834394904462e-05,
2016
- "loss": 0.0884,
2017
- "step": 4580
2018
- },
2019
- {
2020
- "epoch": 7.32,
2021
- "grad_norm": 12.87939167022705,
2022
- "learning_rate": 2.6799363057324844e-05,
2023
- "loss": 0.1261,
2024
- "step": 4600
2025
- },
2026
- {
2027
- "epoch": 7.32,
2028
- "eval_accuracy": 0.7657142857142857,
2029
- "eval_loss": 1.0891320705413818,
2030
- "eval_runtime": 15.1965,
2031
- "eval_samples_per_second": 69.095,
2032
- "eval_steps_per_second": 8.686,
2033
- "step": 4600
2034
- },
2035
- {
2036
- "epoch": 7.36,
2037
- "grad_norm": 1.0216339826583862,
2038
- "learning_rate": 2.6480891719745228e-05,
2039
- "loss": 0.1678,
2040
- "step": 4620
2041
- },
2042
- {
2043
- "epoch": 7.39,
2044
- "grad_norm": 3.8394949436187744,
2045
- "learning_rate": 2.616242038216561e-05,
2046
- "loss": 0.129,
2047
- "step": 4640
2048
- },
2049
- {
2050
- "epoch": 7.42,
2051
- "grad_norm": 2.563333511352539,
2052
- "learning_rate": 2.584394904458599e-05,
2053
- "loss": 0.1512,
2054
- "step": 4660
2055
- },
2056
- {
2057
- "epoch": 7.45,
2058
- "grad_norm": 4.966637134552002,
2059
- "learning_rate": 2.5525477707006372e-05,
2060
- "loss": 0.1261,
2061
- "step": 4680
2062
- },
2063
- {
2064
- "epoch": 7.48,
2065
- "grad_norm": 6.724792003631592,
2066
- "learning_rate": 2.5207006369426756e-05,
2067
- "loss": 0.12,
2068
- "step": 4700
2069
- },
2070
- {
2071
- "epoch": 7.48,
2072
- "eval_accuracy": 0.76,
2073
- "eval_loss": 1.1359137296676636,
2074
- "eval_runtime": 14.87,
2075
- "eval_samples_per_second": 70.612,
2076
- "eval_steps_per_second": 8.877,
2077
- "step": 4700
2078
- },
2079
- {
2080
- "epoch": 7.52,
2081
- "grad_norm": 0.4127102196216583,
2082
- "learning_rate": 2.4888535031847134e-05,
2083
- "loss": 0.1206,
2084
- "step": 4720
2085
- },
2086
- {
2087
- "epoch": 7.55,
2088
- "grad_norm": 0.21055227518081665,
2089
- "learning_rate": 2.457006369426752e-05,
2090
- "loss": 0.1455,
2091
- "step": 4740
2092
- },
2093
- {
2094
- "epoch": 7.58,
2095
- "grad_norm": 3.864630937576294,
2096
- "learning_rate": 2.426751592356688e-05,
2097
- "loss": 0.1809,
2098
- "step": 4760
2099
- },
2100
- {
2101
- "epoch": 7.61,
2102
- "grad_norm": 2.548668146133423,
2103
- "learning_rate": 2.3949044585987263e-05,
2104
- "loss": 0.182,
2105
- "step": 4780
2106
- },
2107
- {
2108
- "epoch": 7.64,
2109
- "grad_norm": 5.192627906799316,
2110
- "learning_rate": 2.3630573248407645e-05,
2111
- "loss": 0.1396,
2112
- "step": 4800
2113
- },
2114
- {
2115
- "epoch": 7.64,
2116
- "eval_accuracy": 0.7409523809523809,
2117
- "eval_loss": 1.1230192184448242,
2118
- "eval_runtime": 15.1532,
2119
- "eval_samples_per_second": 69.292,
2120
- "eval_steps_per_second": 8.711,
2121
- "step": 4800
2122
- },
2123
- {
2124
- "epoch": 7.68,
2125
- "grad_norm": 4.093466281890869,
2126
- "learning_rate": 2.3312101910828026e-05,
2127
- "loss": 0.1563,
2128
- "step": 4820
2129
- },
2130
- {
2131
- "epoch": 7.71,
2132
- "grad_norm": 0.3261258006095886,
2133
- "learning_rate": 2.299363057324841e-05,
2134
- "loss": 0.1461,
2135
- "step": 4840
2136
- },
2137
- {
2138
- "epoch": 7.74,
2139
- "grad_norm": 1.178528070449829,
2140
- "learning_rate": 2.267515923566879e-05,
2141
- "loss": 0.1578,
2142
- "step": 4860
2143
- },
2144
- {
2145
- "epoch": 7.77,
2146
- "grad_norm": 1.2750300168991089,
2147
- "learning_rate": 2.2356687898089173e-05,
2148
- "loss": 0.1937,
2149
- "step": 4880
2150
- },
2151
- {
2152
- "epoch": 7.8,
2153
- "grad_norm": 5.664385795593262,
2154
- "learning_rate": 2.2038216560509557e-05,
2155
- "loss": 0.0728,
2156
- "step": 4900
2157
- },
2158
- {
2159
- "epoch": 7.8,
2160
- "eval_accuracy": 0.7552380952380953,
2161
- "eval_loss": 1.120953917503357,
2162
- "eval_runtime": 15.3497,
2163
- "eval_samples_per_second": 68.405,
2164
- "eval_steps_per_second": 8.6,
2165
- "step": 4900
2166
- },
2167
- {
2168
- "epoch": 7.83,
2169
- "grad_norm": 0.04029368981719017,
2170
- "learning_rate": 2.171974522292994e-05,
2171
- "loss": 0.0901,
2172
- "step": 4920
2173
- },
2174
- {
2175
- "epoch": 7.87,
2176
- "grad_norm": 0.18388265371322632,
2177
- "learning_rate": 2.140127388535032e-05,
2178
- "loss": 0.0982,
2179
- "step": 4940
2180
- },
2181
- {
2182
- "epoch": 7.9,
2183
- "grad_norm": 0.03308388963341713,
2184
- "learning_rate": 2.10828025477707e-05,
2185
- "loss": 0.1396,
2186
- "step": 4960
2187
- },
2188
- {
2189
- "epoch": 7.93,
2190
- "grad_norm": 0.023461850360035896,
2191
- "learning_rate": 2.0764331210191085e-05,
2192
- "loss": 0.2196,
2193
- "step": 4980
2194
- },
2195
- {
2196
- "epoch": 7.96,
2197
- "grad_norm": 0.02594674378633499,
2198
- "learning_rate": 2.0445859872611467e-05,
2199
- "loss": 0.175,
2200
- "step": 5000
2201
- },
2202
- {
2203
- "epoch": 7.96,
2204
- "eval_accuracy": 0.7504761904761905,
2205
- "eval_loss": 1.1203620433807373,
2206
- "eval_runtime": 14.9947,
2207
- "eval_samples_per_second": 70.025,
2208
- "eval_steps_per_second": 8.803,
2209
- "step": 5000
2210
- },
2211
- {
2212
- "epoch": 7.99,
2213
- "grad_norm": 0.27780893445014954,
2214
- "learning_rate": 2.0127388535031848e-05,
2215
- "loss": 0.1099,
2216
- "step": 5020
2217
- },
2218
- {
2219
- "epoch": 8.03,
2220
- "grad_norm": 0.02279558964073658,
2221
- "learning_rate": 1.9808917197452232e-05,
2222
- "loss": 0.1265,
2223
- "step": 5040
2224
- },
2225
- {
2226
- "epoch": 8.06,
2227
- "grad_norm": 4.824516773223877,
2228
- "learning_rate": 1.9490445859872614e-05,
2229
- "loss": 0.0849,
2230
- "step": 5060
2231
- },
2232
- {
2233
- "epoch": 8.09,
2234
- "grad_norm": 5.1949334144592285,
2235
- "learning_rate": 1.9171974522292995e-05,
2236
- "loss": 0.1568,
2237
- "step": 5080
2238
- },
2239
- {
2240
- "epoch": 8.12,
2241
- "grad_norm": 1.2651612758636475,
2242
- "learning_rate": 1.885350318471338e-05,
2243
- "loss": 0.1214,
2244
- "step": 5100
2245
- },
2246
- {
2247
- "epoch": 8.12,
2248
- "eval_accuracy": 0.7542857142857143,
2249
- "eval_loss": 1.106448769569397,
2250
- "eval_runtime": 14.9134,
2251
- "eval_samples_per_second": 70.407,
2252
- "eval_steps_per_second": 8.851,
2253
- "step": 5100
2254
- },
2255
- {
2256
- "epoch": 8.15,
2257
- "grad_norm": 3.7672173976898193,
2258
- "learning_rate": 1.8535031847133757e-05,
2259
- "loss": 0.1851,
2260
- "step": 5120
2261
- },
2262
- {
2263
- "epoch": 8.18,
2264
- "grad_norm": 0.10589015483856201,
2265
- "learning_rate": 1.8216560509554138e-05,
2266
- "loss": 0.0865,
2267
- "step": 5140
2268
- },
2269
- {
2270
- "epoch": 8.22,
2271
- "grad_norm": 3.8695759773254395,
2272
- "learning_rate": 1.7898089171974523e-05,
2273
- "loss": 0.1378,
2274
- "step": 5160
2275
- },
2276
- {
2277
- "epoch": 8.25,
2278
- "grad_norm": 0.10915953665971756,
2279
- "learning_rate": 1.7579617834394904e-05,
2280
- "loss": 0.108,
2281
- "step": 5180
2282
- },
2283
- {
2284
- "epoch": 8.28,
2285
- "grad_norm": 6.51039981842041,
2286
- "learning_rate": 1.7261146496815285e-05,
2287
- "loss": 0.1218,
2288
- "step": 5200
2289
- },
2290
- {
2291
- "epoch": 8.28,
2292
- "eval_accuracy": 0.7771428571428571,
2293
- "eval_loss": 1.023201823234558,
2294
- "eval_runtime": 14.7617,
2295
- "eval_samples_per_second": 71.13,
2296
- "eval_steps_per_second": 8.942,
2297
- "step": 5200
2298
- },
2299
- {
2300
- "epoch": 8.31,
2301
- "grad_norm": 5.766481876373291,
2302
- "learning_rate": 1.694267515923567e-05,
2303
- "loss": 0.1055,
2304
- "step": 5220
2305
- },
2306
- {
2307
- "epoch": 8.34,
2308
- "grad_norm": 1.3557641506195068,
2309
- "learning_rate": 1.662420382165605e-05,
2310
- "loss": 0.1426,
2311
- "step": 5240
2312
- },
2313
- {
2314
- "epoch": 8.38,
2315
- "grad_norm": 2.0178778171539307,
2316
- "learning_rate": 1.6305732484076432e-05,
2317
- "loss": 0.1147,
2318
- "step": 5260
2319
- },
2320
- {
2321
- "epoch": 8.41,
2322
- "grad_norm": 0.3355507254600525,
2323
- "learning_rate": 1.5987261146496817e-05,
2324
- "loss": 0.0986,
2325
- "step": 5280
2326
- },
2327
- {
2328
- "epoch": 8.44,
2329
- "grad_norm": 8.486790657043457,
2330
- "learning_rate": 1.5668789808917198e-05,
2331
- "loss": 0.1556,
2332
- "step": 5300
2333
- },
2334
- {
2335
- "epoch": 8.44,
2336
- "eval_accuracy": 0.7771428571428571,
2337
- "eval_loss": 1.0489068031311035,
2338
- "eval_runtime": 14.8821,
2339
- "eval_samples_per_second": 70.554,
2340
- "eval_steps_per_second": 8.87,
2341
- "step": 5300
2342
- },
2343
- {
2344
- "epoch": 8.47,
2345
- "grad_norm": 0.0707675963640213,
2346
- "learning_rate": 1.536624203821656e-05,
2347
- "loss": 0.0841,
2348
- "step": 5320
2349
- },
2350
- {
2351
- "epoch": 8.5,
2352
- "grad_norm": 16.97945213317871,
2353
- "learning_rate": 1.5047770700636943e-05,
2354
- "loss": 0.1825,
2355
- "step": 5340
2356
- },
2357
- {
2358
- "epoch": 8.54,
2359
- "grad_norm": 3.0360000133514404,
2360
- "learning_rate": 1.4729299363057326e-05,
2361
- "loss": 0.1569,
2362
- "step": 5360
2363
- },
2364
- {
2365
- "epoch": 8.57,
2366
- "grad_norm": 0.4013698697090149,
2367
- "learning_rate": 1.4410828025477707e-05,
2368
- "loss": 0.115,
2369
- "step": 5380
2370
- },
2371
- {
2372
- "epoch": 8.6,
2373
- "grad_norm": 0.027194073423743248,
2374
- "learning_rate": 1.409235668789809e-05,
2375
- "loss": 0.1019,
2376
- "step": 5400
2377
- },
2378
- {
2379
- "epoch": 8.6,
2380
- "eval_accuracy": 0.7752380952380953,
2381
- "eval_loss": 1.091567873954773,
2382
- "eval_runtime": 15.0217,
2383
- "eval_samples_per_second": 69.899,
2384
- "eval_steps_per_second": 8.787,
2385
- "step": 5400
2386
- },
2387
- {
2388
- "epoch": 8.63,
2389
- "grad_norm": 5.168393611907959,
2390
- "learning_rate": 1.3773885350318472e-05,
2391
- "loss": 0.0843,
2392
- "step": 5420
2393
- },
2394
- {
2395
- "epoch": 8.66,
2396
- "grad_norm": 0.056524887681007385,
2397
- "learning_rate": 1.3455414012738854e-05,
2398
- "loss": 0.0594,
2399
- "step": 5440
2400
- },
2401
- {
2402
- "epoch": 8.69,
2403
- "grad_norm": 0.16345758736133575,
2404
- "learning_rate": 1.3136942675159237e-05,
2405
- "loss": 0.1234,
2406
- "step": 5460
2407
- },
2408
- {
2409
- "epoch": 8.73,
2410
- "grad_norm": 9.12987995147705,
2411
- "learning_rate": 1.2818471337579618e-05,
2412
- "loss": 0.0643,
2413
- "step": 5480
2414
- },
2415
- {
2416
- "epoch": 8.76,
2417
- "grad_norm": 9.363956451416016,
2418
- "learning_rate": 1.25e-05,
2419
- "loss": 0.1446,
2420
- "step": 5500
2421
- },
2422
- {
2423
- "epoch": 8.76,
2424
- "eval_accuracy": 0.7504761904761905,
2425
- "eval_loss": 1.1855816841125488,
2426
- "eval_runtime": 15.0734,
2427
- "eval_samples_per_second": 69.659,
2428
- "eval_steps_per_second": 8.757,
2429
- "step": 5500
2430
- },
2431
- {
2432
- "epoch": 8.79,
2433
- "grad_norm": 0.7227168083190918,
2434
- "learning_rate": 1.2181528662420383e-05,
2435
- "loss": 0.1234,
2436
- "step": 5520
2437
- },
2438
- {
2439
- "epoch": 8.82,
2440
- "grad_norm": 2.766003370285034,
2441
- "learning_rate": 1.1863057324840765e-05,
2442
- "loss": 0.0921,
2443
- "step": 5540
2444
- },
2445
- {
2446
- "epoch": 8.85,
2447
- "grad_norm": 0.011736826971173286,
2448
- "learning_rate": 1.1544585987261148e-05,
2449
- "loss": 0.0684,
2450
- "step": 5560
2451
- },
2452
- {
2453
- "epoch": 8.89,
2454
- "grad_norm": 0.02954920195043087,
2455
- "learning_rate": 1.1226114649681529e-05,
2456
- "loss": 0.109,
2457
- "step": 5580
2458
- },
2459
- {
2460
- "epoch": 8.92,
2461
- "grad_norm": 1.0454398393630981,
2462
- "learning_rate": 1.0907643312101912e-05,
2463
- "loss": 0.1348,
2464
- "step": 5600
2465
- },
2466
- {
2467
- "epoch": 8.92,
2468
- "eval_accuracy": 0.7638095238095238,
2469
- "eval_loss": 1.1379646062850952,
2470
- "eval_runtime": 14.986,
2471
- "eval_samples_per_second": 70.065,
2472
- "eval_steps_per_second": 8.808,
2473
- "step": 5600
2474
- },
2475
- {
2476
- "epoch": 8.95,
2477
- "grad_norm": 0.1124209389090538,
2478
- "learning_rate": 1.0589171974522294e-05,
2479
- "loss": 0.1118,
2480
- "step": 5620
2481
- },
2482
- {
2483
- "epoch": 8.98,
2484
- "grad_norm": 0.05434571951627731,
2485
- "learning_rate": 1.0270700636942676e-05,
2486
- "loss": 0.0981,
2487
- "step": 5640
2488
- },
2489
- {
2490
- "epoch": 9.01,
2491
- "grad_norm": 6.469759941101074,
2492
- "learning_rate": 9.952229299363059e-06,
2493
- "loss": 0.1224,
2494
- "step": 5660
2495
- },
2496
- {
2497
- "epoch": 9.04,
2498
- "grad_norm": 2.5620501041412354,
2499
- "learning_rate": 9.633757961783441e-06,
2500
- "loss": 0.0646,
2501
- "step": 5680
2502
- },
2503
- {
2504
- "epoch": 9.08,
2505
- "grad_norm": 4.989463806152344,
2506
- "learning_rate": 9.315286624203821e-06,
2507
- "loss": 0.1402,
2508
- "step": 5700
2509
- },
2510
- {
2511
- "epoch": 9.08,
2512
- "eval_accuracy": 0.7695238095238095,
2513
- "eval_loss": 1.1233410835266113,
2514
- "eval_runtime": 15.0736,
2515
- "eval_samples_per_second": 69.658,
2516
- "eval_steps_per_second": 8.757,
2517
- "step": 5700
2518
- },
2519
- {
2520
- "epoch": 9.11,
2521
- "grad_norm": 0.28552043437957764,
2522
- "learning_rate": 8.996815286624204e-06,
2523
- "loss": 0.0686,
2524
- "step": 5720
2525
- },
2526
- {
2527
- "epoch": 9.14,
2528
- "grad_norm": 0.09686419367790222,
2529
- "learning_rate": 8.678343949044587e-06,
2530
- "loss": 0.1217,
2531
- "step": 5740
2532
- },
2533
- {
2534
- "epoch": 9.17,
2535
- "grad_norm": 5.7156572341918945,
2536
- "learning_rate": 8.359872611464968e-06,
2537
- "loss": 0.1461,
2538
- "step": 5760
2539
- },
2540
- {
2541
- "epoch": 9.2,
2542
- "grad_norm": 10.184377670288086,
2543
- "learning_rate": 8.04140127388535e-06,
2544
- "loss": 0.0892,
2545
- "step": 5780
2546
- },
2547
- {
2548
- "epoch": 9.24,
2549
- "grad_norm": 0.3750424087047577,
2550
- "learning_rate": 7.722929936305732e-06,
2551
- "loss": 0.1075,
2552
- "step": 5800
2553
- },
2554
- {
2555
- "epoch": 9.24,
2556
- "eval_accuracy": 0.7628571428571429,
2557
- "eval_loss": 1.1471986770629883,
2558
- "eval_runtime": 15.0092,
2559
- "eval_samples_per_second": 69.957,
2560
- "eval_steps_per_second": 8.795,
2561
- "step": 5800
2562
- },
2563
- {
2564
- "epoch": 9.27,
2565
- "grad_norm": 12.310994148254395,
2566
- "learning_rate": 7.404458598726115e-06,
2567
- "loss": 0.1211,
2568
- "step": 5820
2569
- },
2570
- {
2571
- "epoch": 9.3,
2572
- "grad_norm": 0.08023884147405624,
2573
- "learning_rate": 7.085987261146497e-06,
2574
- "loss": 0.0503,
2575
- "step": 5840
2576
- },
2577
- {
2578
- "epoch": 9.33,
2579
- "grad_norm": 10.179062843322754,
2580
- "learning_rate": 6.76751592356688e-06,
2581
- "loss": 0.0965,
2582
- "step": 5860
2583
- },
2584
- {
2585
- "epoch": 9.36,
2586
- "grad_norm": 6.414700031280518,
2587
- "learning_rate": 6.449044585987262e-06,
2588
- "loss": 0.1485,
2589
- "step": 5880
2590
- },
2591
- {
2592
- "epoch": 9.39,
2593
- "grad_norm": 4.7796854972839355,
2594
- "learning_rate": 6.130573248407644e-06,
2595
- "loss": 0.0991,
2596
- "step": 5900
2597
- },
2598
- {
2599
- "epoch": 9.39,
2600
- "eval_accuracy": 0.7647619047619048,
2601
- "eval_loss": 1.1529643535614014,
2602
- "eval_runtime": 14.9351,
2603
- "eval_samples_per_second": 70.304,
2604
- "eval_steps_per_second": 8.838,
2605
- "step": 5900
2606
- },
2607
- {
2608
- "epoch": 9.43,
2609
- "grad_norm": 3.3885576725006104,
2610
- "learning_rate": 5.812101910828026e-06,
2611
- "loss": 0.0582,
2612
- "step": 5920
2613
- },
2614
- {
2615
- "epoch": 9.46,
2616
- "grad_norm": 0.3516397178173065,
2617
- "learning_rate": 5.493630573248408e-06,
2618
- "loss": 0.1151,
2619
- "step": 5940
2620
- },
2621
- {
2622
- "epoch": 9.49,
2623
- "grad_norm": 0.029496684670448303,
2624
- "learning_rate": 5.175159235668791e-06,
2625
- "loss": 0.0601,
2626
- "step": 5960
2627
- },
2628
- {
2629
- "epoch": 9.52,
2630
- "grad_norm": 0.00963157694786787,
2631
- "learning_rate": 4.856687898089173e-06,
2632
- "loss": 0.0885,
2633
- "step": 5980
2634
- },
2635
- {
2636
- "epoch": 9.55,
2637
- "grad_norm": 1.4221217632293701,
2638
- "learning_rate": 4.538216560509554e-06,
2639
- "loss": 0.081,
2640
- "step": 6000
2641
- },
2642
- {
2643
- "epoch": 9.55,
2644
- "eval_accuracy": 0.7628571428571429,
2645
- "eval_loss": 1.1586228609085083,
2646
- "eval_runtime": 14.9129,
2647
- "eval_samples_per_second": 70.409,
2648
- "eval_steps_per_second": 8.851,
2649
- "step": 6000
2650
- },
2651
- {
2652
- "epoch": 9.59,
2653
- "grad_norm": 0.09046982228755951,
2654
- "learning_rate": 4.219745222929937e-06,
2655
- "loss": 0.0744,
2656
- "step": 6020
2657
- },
2658
- {
2659
- "epoch": 9.62,
2660
- "grad_norm": 3.131237030029297,
2661
- "learning_rate": 3.901273885350319e-06,
2662
- "loss": 0.095,
2663
- "step": 6040
2664
- },
2665
- {
2666
- "epoch": 9.65,
2667
- "grad_norm": 0.04757837578654289,
2668
- "learning_rate": 3.5828025477707007e-06,
2669
- "loss": 0.0827,
2670
- "step": 6060
2671
- },
2672
- {
2673
- "epoch": 9.68,
2674
- "grad_norm": 0.19970394670963287,
2675
- "learning_rate": 3.2643312101910827e-06,
2676
- "loss": 0.1016,
2677
- "step": 6080
2678
- },
2679
- {
2680
- "epoch": 9.71,
2681
- "grad_norm": 4.088714599609375,
2682
- "learning_rate": 2.945859872611465e-06,
2683
- "loss": 0.0724,
2684
- "step": 6100
2685
- },
2686
- {
2687
- "epoch": 9.71,
2688
- "eval_accuracy": 0.7676190476190476,
2689
- "eval_loss": 1.1590627431869507,
2690
- "eval_runtime": 14.9656,
2691
- "eval_samples_per_second": 70.161,
2692
- "eval_steps_per_second": 8.82,
2693
- "step": 6100
2694
- },
2695
- {
2696
- "epoch": 9.75,
2697
- "grad_norm": 0.02539316564798355,
2698
- "learning_rate": 2.6273885350318472e-06,
2699
- "loss": 0.0873,
2700
- "step": 6120
2701
- },
2702
- {
2703
- "epoch": 9.78,
2704
- "grad_norm": 0.04443074390292168,
2705
- "learning_rate": 2.3089171974522293e-06,
2706
- "loss": 0.1642,
2707
- "step": 6140
2708
- },
2709
- {
2710
- "epoch": 9.81,
2711
- "grad_norm": 4.564943790435791,
2712
- "learning_rate": 1.9904458598726113e-06,
2713
- "loss": 0.0766,
2714
- "step": 6160
2715
- },
2716
- {
2717
- "epoch": 9.84,
2718
- "grad_norm": 3.6170880794525146,
2719
- "learning_rate": 1.6719745222929937e-06,
2720
- "loss": 0.129,
2721
- "step": 6180
2722
- },
2723
- {
2724
- "epoch": 9.87,
2725
- "grad_norm": 3.5646533966064453,
2726
- "learning_rate": 1.353503184713376e-06,
2727
- "loss": 0.0399,
2728
- "step": 6200
2729
- },
2730
- {
2731
- "epoch": 9.87,
2732
- "eval_accuracy": 0.7695238095238095,
2733
- "eval_loss": 1.1417580842971802,
2734
- "eval_runtime": 14.9815,
2735
- "eval_samples_per_second": 70.086,
2736
- "eval_steps_per_second": 8.811,
2737
- "step": 6200
2738
- },
2739
- {
2740
- "epoch": 9.9,
2741
- "grad_norm": 0.011990565806627274,
2742
- "learning_rate": 1.035031847133758e-06,
2743
- "loss": 0.079,
2744
- "step": 6220
2745
- },
2746
- {
2747
- "epoch": 9.94,
2748
- "grad_norm": 0.08819901943206787,
2749
- "learning_rate": 7.165605095541401e-07,
2750
- "loss": 0.1412,
2751
- "step": 6240
2752
- },
2753
- {
2754
- "epoch": 9.97,
2755
- "grad_norm": 1.3580721616744995,
2756
- "learning_rate": 3.980891719745223e-07,
2757
- "loss": 0.0805,
2758
- "step": 6260
2759
- },
2760
- {
2761
- "epoch": 10.0,
2762
- "grad_norm": 0.38720598816871643,
2763
- "learning_rate": 7.961783439490447e-08,
2764
- "loss": 0.0744,
2765
- "step": 6280
2766
- },
2767
- {
2768
- "epoch": 10.0,
2769
- "step": 6280,
2770
- "total_flos": 7.776878731479245e+18,
2771
- "train_loss": 0.29035600812002355,
2772
- "train_runtime": 3700.3159,
2773
- "train_samples_per_second": 27.119,
2774
- "train_steps_per_second": 1.697
2775
  }
2776
  ],
2777
  "logging_steps": 20,
2778
- "max_steps": 6280,
2779
  "num_input_tokens_seen": 0,
2780
- "num_train_epochs": 10,
2781
  "save_steps": 100,
2782
- "total_flos": 7.776878731479245e+18,
2783
  "train_batch_size": 16,
2784
  "trial_name": null,
2785
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7120087742805481,
3
+ "best_model_checkpoint": "Action_model/checkpoint-600",
4
+ "epoch": 2.0,
5
  "eval_steps": 100,
6
+ "global_step": 1256,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.03,
13
+ "grad_norm": 2.222489356994629,
14
+ "learning_rate": 9.840764331210192e-05,
15
+ "loss": 2.2257,
16
  "step": 20
17
  },
18
  {
19
  "epoch": 0.06,
20
+ "grad_norm": 2.293585777282715,
21
+ "learning_rate": 9.681528662420382e-05,
22
+ "loss": 2.0112,
23
  "step": 40
24
  },
25
  {
26
  "epoch": 0.1,
27
+ "grad_norm": 2.167264223098755,
28
+ "learning_rate": 9.522292993630574e-05,
29
+ "loss": 1.8158,
30
  "step": 60
31
  },
32
  {
33
  "epoch": 0.13,
34
+ "grad_norm": 3.259340286254883,
35
+ "learning_rate": 9.363057324840766e-05,
36
+ "loss": 1.5504,
37
  "step": 80
38
  },
39
  {
40
  "epoch": 0.16,
41
+ "grad_norm": 2.540658473968506,
42
+ "learning_rate": 9.203821656050956e-05,
43
+ "loss": 1.3489,
44
  "step": 100
45
  },
46
  {
47
  "epoch": 0.16,
48
+ "eval_accuracy": 0.7,
49
+ "eval_loss": 1.2611500024795532,
50
+ "eval_runtime": 22.4862,
51
+ "eval_samples_per_second": 46.695,
52
+ "eval_steps_per_second": 5.87,
53
  "step": 100
54
  },
55
  {
56
  "epoch": 0.19,
57
+ "grad_norm": 2.064728021621704,
58
+ "learning_rate": 9.044585987261147e-05,
59
+ "loss": 1.2181,
60
  "step": 120
61
  },
62
  {
63
  "epoch": 0.22,
64
+ "grad_norm": 4.9456000328063965,
65
+ "learning_rate": 8.885350318471338e-05,
66
+ "loss": 1.1517,
67
  "step": 140
68
  },
69
  {
70
  "epoch": 0.25,
71
+ "grad_norm": 3.7164435386657715,
72
+ "learning_rate": 8.73407643312102e-05,
73
+ "loss": 1.0429,
74
  "step": 160
75
  },
76
  {
77
  "epoch": 0.29,
78
+ "grad_norm": 3.3535468578338623,
79
+ "learning_rate": 8.57484076433121e-05,
80
+ "loss": 0.9935,
81
  "step": 180
82
  },
83
  {
84
  "epoch": 0.32,
85
+ "grad_norm": 5.574573040008545,
86
+ "learning_rate": 8.415605095541401e-05,
87
+ "loss": 1.0112,
88
  "step": 200
89
  },
90
  {
91
  "epoch": 0.32,
92
+ "eval_accuracy": 0.7590476190476191,
93
+ "eval_loss": 0.9050103425979614,
94
+ "eval_runtime": 14.9955,
95
+ "eval_samples_per_second": 70.021,
96
+ "eval_steps_per_second": 8.803,
97
  "step": 200
98
  },
99
  {
100
  "epoch": 0.35,
101
+ "grad_norm": 2.9566049575805664,
102
+ "learning_rate": 8.256369426751593e-05,
103
+ "loss": 0.9295,
104
  "step": 220
105
  },
106
  {
107
  "epoch": 0.38,
108
+ "grad_norm": 2.9411683082580566,
109
+ "learning_rate": 8.097133757961783e-05,
110
+ "loss": 0.895,
111
  "step": 240
112
  },
113
  {
114
  "epoch": 0.41,
115
+ "grad_norm": 4.049664497375488,
116
+ "learning_rate": 7.937898089171975e-05,
117
+ "loss": 0.8278,
118
  "step": 260
119
  },
120
  {
121
  "epoch": 0.45,
122
+ "grad_norm": 4.934290409088135,
123
+ "learning_rate": 7.778662420382165e-05,
124
+ "loss": 0.7667,
125
  "step": 280
126
  },
127
  {
128
  "epoch": 0.48,
129
+ "grad_norm": 5.120416164398193,
130
+ "learning_rate": 7.619426751592357e-05,
131
+ "loss": 0.7962,
132
  "step": 300
133
  },
134
  {
135
  "epoch": 0.48,
136
+ "eval_accuracy": 0.7504761904761905,
137
+ "eval_loss": 0.852245032787323,
138
+ "eval_runtime": 14.85,
139
+ "eval_samples_per_second": 70.707,
140
+ "eval_steps_per_second": 8.889,
141
  "step": 300
142
  },
143
  {
144
  "epoch": 0.51,
145
+ "grad_norm": 2.324152708053589,
146
+ "learning_rate": 7.460191082802548e-05,
147
+ "loss": 0.8446,
148
  "step": 320
149
  },
150
  {
151
  "epoch": 0.54,
152
+ "grad_norm": 6.521075248718262,
153
+ "learning_rate": 7.300955414012739e-05,
154
+ "loss": 0.6878,
155
  "step": 340
156
  },
157
  {
158
  "epoch": 0.57,
159
+ "grad_norm": 4.726436614990234,
160
+ "learning_rate": 7.14171974522293e-05,
161
+ "loss": 0.7465,
162
  "step": 360
163
  },
164
  {
165
  "epoch": 0.61,
166
+ "grad_norm": 3.800800085067749,
167
+ "learning_rate": 6.982484076433122e-05,
168
+ "loss": 0.706,
169
  "step": 380
170
  },
171
  {
172
  "epoch": 0.64,
173
+ "grad_norm": 5.507264614105225,
174
+ "learning_rate": 6.823248407643312e-05,
175
+ "loss": 0.6383,
176
  "step": 400
177
  },
178
  {
179
  "epoch": 0.64,
180
+ "eval_accuracy": 0.7219047619047619,
181
+ "eval_loss": 0.8676416277885437,
182
+ "eval_runtime": 14.6076,
183
+ "eval_samples_per_second": 71.881,
184
+ "eval_steps_per_second": 9.036,
185
  "step": 400
186
  },
187
  {
188
  "epoch": 0.67,
189
+ "grad_norm": 6.359522342681885,
190
+ "learning_rate": 6.664012738853504e-05,
191
+ "loss": 0.6658,
192
  "step": 420
193
  },
194
  {
195
  "epoch": 0.7,
196
+ "grad_norm": 3.9945261478424072,
197
+ "learning_rate": 6.504777070063695e-05,
198
+ "loss": 0.6106,
199
  "step": 440
200
  },
201
  {
202
  "epoch": 0.73,
203
+ "grad_norm": 2.555899143218994,
204
+ "learning_rate": 6.345541401273885e-05,
205
+ "loss": 0.7034,
206
  "step": 460
207
  },
208
  {
209
  "epoch": 0.76,
210
+ "grad_norm": 2.68978214263916,
211
+ "learning_rate": 6.186305732484077e-05,
212
+ "loss": 0.4986,
213
  "step": 480
214
  },
215
  {
216
  "epoch": 0.8,
217
+ "grad_norm": 4.49608039855957,
218
+ "learning_rate": 6.027070063694268e-05,
219
+ "loss": 0.6485,
220
  "step": 500
221
  },
222
  {
223
  "epoch": 0.8,
224
+ "eval_accuracy": 0.7323809523809524,
225
+ "eval_loss": 0.8052415251731873,
226
+ "eval_runtime": 14.5568,
227
+ "eval_samples_per_second": 72.131,
228
+ "eval_steps_per_second": 9.068,
229
  "step": 500
230
  },
231
  {
232
  "epoch": 0.83,
233
+ "grad_norm": 5.239855766296387,
234
+ "learning_rate": 5.867834394904459e-05,
235
+ "loss": 0.6176,
236
  "step": 520
237
  },
238
  {
239
  "epoch": 0.86,
240
+ "grad_norm": 2.8663668632507324,
241
+ "learning_rate": 5.70859872611465e-05,
242
+ "loss": 0.5519,
243
  "step": 540
244
  },
245
  {
246
  "epoch": 0.89,
247
+ "grad_norm": 2.615525245666504,
248
+ "learning_rate": 5.5493630573248414e-05,
249
+ "loss": 0.6374,
250
  "step": 560
251
  },
252
  {
253
  "epoch": 0.92,
254
+ "grad_norm": 3.312385082244873,
255
+ "learning_rate": 5.3901273885350324e-05,
256
+ "loss": 0.5816,
257
  "step": 580
258
  },
259
  {
260
  "epoch": 0.96,
261
+ "grad_norm": 4.399689197540283,
262
+ "learning_rate": 5.230891719745223e-05,
263
+ "loss": 0.5452,
264
  "step": 600
265
  },
266
  {
267
  "epoch": 0.96,
268
+ "eval_accuracy": 0.7847619047619048,
269
+ "eval_loss": 0.7120087742805481,
270
+ "eval_runtime": 14.577,
271
+ "eval_samples_per_second": 72.031,
272
+ "eval_steps_per_second": 9.055,
273
  "step": 600
274
  },
275
  {
276
  "epoch": 0.99,
277
+ "grad_norm": 3.4874184131622314,
278
+ "learning_rate": 5.071656050955414e-05,
279
+ "loss": 0.5328,
280
  "step": 620
281
  },
282
  {
283
  "epoch": 1.02,
284
+ "grad_norm": 5.2181396484375,
285
+ "learning_rate": 4.912420382165605e-05,
286
+ "loss": 0.5078,
287
  "step": 640
288
  },
289
  {
290
  "epoch": 1.05,
291
+ "grad_norm": 2.219102621078491,
292
+ "learning_rate": 4.753184713375796e-05,
293
+ "loss": 0.4969,
294
  "step": 660
295
  },
296
  {
297
  "epoch": 1.08,
298
+ "grad_norm": 4.785001754760742,
299
+ "learning_rate": 4.593949044585987e-05,
300
+ "loss": 0.5407,
301
  "step": 680
302
  },
303
  {
304
  "epoch": 1.11,
305
+ "grad_norm": 7.441385269165039,
306
+ "learning_rate": 4.4347133757961786e-05,
307
+ "loss": 0.4882,
308
  "step": 700
309
  },
310
  {
311
  "epoch": 1.11,
312
+ "eval_accuracy": 0.7714285714285715,
313
+ "eval_loss": 0.7478358745574951,
314
+ "eval_runtime": 14.7271,
315
+ "eval_samples_per_second": 71.297,
316
+ "eval_steps_per_second": 8.963,
317
  "step": 700
318
  },
319
  {
320
  "epoch": 1.15,
321
+ "grad_norm": 3.0530927181243896,
322
+ "learning_rate": 4.2754777070063695e-05,
323
+ "loss": 0.423,
324
  "step": 720
325
  },
326
  {
327
  "epoch": 1.18,
328
+ "grad_norm": 3.1082653999328613,
329
+ "learning_rate": 4.1162420382165605e-05,
330
+ "loss": 0.505,
331
  "step": 740
332
  },
333
  {
334
  "epoch": 1.21,
335
+ "grad_norm": 5.5019683837890625,
336
+ "learning_rate": 3.957006369426752e-05,
337
+ "loss": 0.4445,
338
  "step": 760
339
  },
340
  {
341
  "epoch": 1.24,
342
+ "grad_norm": 3.35685658454895,
343
+ "learning_rate": 3.797770700636943e-05,
344
+ "loss": 0.4795,
345
  "step": 780
346
  },
347
  {
348
  "epoch": 1.27,
349
+ "grad_norm": 0.8577423691749573,
350
+ "learning_rate": 3.638535031847134e-05,
351
+ "loss": 0.3409,
352
  "step": 800
353
  },
354
  {
355
  "epoch": 1.27,
356
+ "eval_accuracy": 0.7742857142857142,
357
+ "eval_loss": 0.7310556769371033,
358
+ "eval_runtime": 14.6273,
359
+ "eval_samples_per_second": 71.784,
360
+ "eval_steps_per_second": 9.024,
361
  "step": 800
362
  },
363
  {
364
  "epoch": 1.31,
365
+ "grad_norm": 2.747500419616699,
366
+ "learning_rate": 3.479299363057325e-05,
367
+ "loss": 0.3633,
368
  "step": 820
369
  },
370
  {
371
  "epoch": 1.34,
372
+ "grad_norm": 2.4795773029327393,
373
+ "learning_rate": 3.3200636942675165e-05,
374
+ "loss": 0.4641,
375
  "step": 840
376
  },
377
  {
378
  "epoch": 1.37,
379
+ "grad_norm": 5.826427936553955,
380
+ "learning_rate": 3.1608280254777074e-05,
381
+ "loss": 0.4289,
382
  "step": 860
383
  },
384
  {
385
  "epoch": 1.4,
386
+ "grad_norm": 4.507148742675781,
387
+ "learning_rate": 3.0015923566878983e-05,
388
+ "loss": 0.4525,
389
  "step": 880
390
  },
391
  {
392
  "epoch": 1.43,
393
+ "grad_norm": 2.810245990753174,
394
+ "learning_rate": 2.8423566878980896e-05,
395
+ "loss": 0.4105,
396
  "step": 900
397
  },
398
  {
399
  "epoch": 1.43,
400
+ "eval_accuracy": 0.780952380952381,
401
+ "eval_loss": 0.735313892364502,
402
+ "eval_runtime": 14.7897,
403
+ "eval_samples_per_second": 70.995,
404
+ "eval_steps_per_second": 8.925,
405
  "step": 900
406
  },
407
  {
408
  "epoch": 1.46,
409
+ "grad_norm": 3.5758163928985596,
410
+ "learning_rate": 2.6831210191082805e-05,
411
+ "loss": 0.3657,
412
  "step": 920
413
  },
414
  {
415
  "epoch": 1.5,
416
+ "grad_norm": 2.174391031265259,
417
+ "learning_rate": 2.5238853503184718e-05,
418
+ "loss": 0.3409,
419
  "step": 940
420
  },
421
  {
422
  "epoch": 1.53,
423
+ "grad_norm": 3.542391300201416,
424
+ "learning_rate": 2.372611464968153e-05,
425
+ "loss": 0.3414,
426
  "step": 960
427
  },
428
  {
429
  "epoch": 1.56,
430
+ "grad_norm": 4.226655006408691,
431
+ "learning_rate": 2.2133757961783442e-05,
432
+ "loss": 0.383,
433
  "step": 980
434
  },
435
  {
436
  "epoch": 1.59,
437
+ "grad_norm": 5.462564945220947,
438
+ "learning_rate": 2.054140127388535e-05,
439
+ "loss": 0.4011,
440
  "step": 1000
441
  },
442
  {
443
  "epoch": 1.59,
444
+ "eval_accuracy": 0.7457142857142857,
445
+ "eval_loss": 0.8153719305992126,
446
+ "eval_runtime": 14.4617,
447
+ "eval_samples_per_second": 72.605,
448
+ "eval_steps_per_second": 9.128,
449
  "step": 1000
450
  },
451
  {
452
  "epoch": 1.62,
453
+ "grad_norm": 6.1501569747924805,
454
+ "learning_rate": 1.8949044585987264e-05,
455
+ "loss": 0.3402,
456
  "step": 1020
457
  },
458
  {
459
  "epoch": 1.66,
460
+ "grad_norm": 2.9438650608062744,
461
+ "learning_rate": 1.7356687898089173e-05,
462
+ "loss": 0.2997,
463
  "step": 1040
464
  },
465
  {
466
  "epoch": 1.69,
467
+ "grad_norm": 1.3817728757858276,
468
+ "learning_rate": 1.5764331210191083e-05,
469
+ "loss": 0.3485,
470
  "step": 1060
471
  },
472
  {
473
  "epoch": 1.72,
474
+ "grad_norm": 0.5100256204605103,
475
+ "learning_rate": 1.4171974522292993e-05,
476
+ "loss": 0.3804,
477
  "step": 1080
478
  },
479
  {
480
  "epoch": 1.75,
481
+ "grad_norm": 7.605688095092773,
482
+ "learning_rate": 1.2579617834394904e-05,
483
+ "loss": 0.3493,
484
  "step": 1100
485
  },
486
  {
487
  "epoch": 1.75,
488
+ "eval_accuracy": 0.7752380952380953,
489
+ "eval_loss": 0.7397615313529968,
490
+ "eval_runtime": 14.8106,
491
+ "eval_samples_per_second": 70.895,
492
+ "eval_steps_per_second": 8.913,
493
  "step": 1100
494
  },
495
  {
496
  "epoch": 1.78,
497
+ "grad_norm": 10.322568893432617,
498
+ "learning_rate": 1.0987261146496815e-05,
499
+ "loss": 0.4022,
500
  "step": 1120
501
  },
502
  {
503
  "epoch": 1.82,
504
+ "grad_norm": 5.649250030517578,
505
+ "learning_rate": 9.394904458598726e-06,
506
+ "loss": 0.2426,
507
  "step": 1140
508
  },
509
  {
510
  "epoch": 1.85,
511
+ "grad_norm": 7.395249366760254,
512
+ "learning_rate": 7.802547770700637e-06,
513
+ "loss": 0.2628,
514
  "step": 1160
515
  },
516
  {
517
  "epoch": 1.88,
518
+ "grad_norm": 1.7934772968292236,
519
+ "learning_rate": 6.210191082802548e-06,
520
+ "loss": 0.3818,
521
  "step": 1180
522
  },
523
  {
524
  "epoch": 1.91,
525
+ "grad_norm": 6.324862480163574,
526
+ "learning_rate": 4.6178343949044585e-06,
527
+ "loss": 0.3389,
528
  "step": 1200
529
  },
530
  {
531
  "epoch": 1.91,
532
+ "eval_accuracy": 0.7676190476190476,
533
+ "eval_loss": 0.7365464568138123,
534
+ "eval_runtime": 14.7187,
535
+ "eval_samples_per_second": 71.338,
536
+ "eval_steps_per_second": 8.968,
537
  "step": 1200
538
  },
539
  {
540
  "epoch": 1.94,
541
+ "grad_norm": 4.285161018371582,
542
+ "learning_rate": 3.0254777070063695e-06,
543
+ "loss": 0.3351,
544
  "step": 1220
545
  },
546
  {
547
  "epoch": 1.97,
548
+ "grad_norm": 4.406313896179199,
549
+ "learning_rate": 1.4331210191082802e-06,
550
+ "loss": 0.2856,
551
  "step": 1240
552
  },
553
  {
554
+ "epoch": 2.0,
555
+ "step": 1256,
556
+ "total_flos": 1.555375746295849e+18,
557
+ "train_loss": 0.6562361546382782,
558
+ "train_runtime": 775.5335,
559
+ "train_samples_per_second": 25.879,
560
+ "train_steps_per_second": 1.62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
561
  }
562
  ],
563
  "logging_steps": 20,
564
+ "max_steps": 1256,
565
  "num_input_tokens_seen": 0,
566
+ "num_train_epochs": 2,
567
  "save_steps": 100,
568
+ "total_flos": 1.555375746295849e+18,
569
  "train_batch_size": 16,
570
  "trial_name": null,
571
  "trial_params": null