Raihan004 commited on
Commit
5b5e6cd
1 Parent(s): 2f30a44

🍻 cheers

Browse files
README.md CHANGED
@@ -2,6 +2,7 @@
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
 
5
  - generated_from_trainer
6
  datasets:
7
  - imagefolder
@@ -14,7 +15,7 @@ model-index:
14
  name: Image Classification
15
  type: image-classification
16
  dataset:
17
- name: imagefolder
18
  type: imagefolder
19
  config: default
20
  split: train
@@ -22,7 +23,7 @@ model-index:
22
  metrics:
23
  - name: Accuracy
24
  type: accuracy
25
- value: 0.8664323374340949
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -30,10 +31,10 @@ should probably proofread and complete it, then remove this comment. -->
30
 
31
  # Action_model
32
 
33
- This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
34
  It achieves the following results on the evaluation set:
35
- - Loss: 0.5153
36
- - Accuracy: 0.8664
37
 
38
  ## Model description
39
 
 
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
5
+ - image-classification
6
  - generated_from_trainer
7
  datasets:
8
  - imagefolder
 
15
  name: Image Classification
16
  type: image-classification
17
  dataset:
18
+ name: action_class
19
  type: imagefolder
20
  config: default
21
  split: train
 
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
+ value: 0.8576449912126538
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
31
 
32
  # Action_model
33
 
34
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the action_class dataset.
35
  It achieves the following results on the evaluation set:
36
+ - Loss: 0.4589
37
+ - Accuracy: 0.8576
38
 
39
  ## Model description
40
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_accuracy": 0.8629173989455184,
4
- "eval_loss": 0.47900134325027466,
5
- "eval_runtime": 7.7338,
6
- "eval_samples_per_second": 73.573,
7
- "eval_steps_per_second": 9.31,
8
  "total_flos": 3.3230947683690086e+18,
9
- "train_loss": 0.3864157530798841,
10
- "train_runtime": 1135.6162,
11
- "train_samples_per_second": 37.759,
12
- "train_steps_per_second": 1.18
13
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_accuracy": 0.8576449912126538,
4
+ "eval_loss": 0.4589254856109619,
5
+ "eval_runtime": 7.9329,
6
+ "eval_samples_per_second": 71.727,
7
+ "eval_steps_per_second": 9.076,
8
  "total_flos": 3.3230947683690086e+18,
9
+ "train_loss": 0.45543073504718384,
10
+ "train_runtime": 1353.2313,
11
+ "train_samples_per_second": 31.687,
12
+ "train_steps_per_second": 1.98
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_accuracy": 0.8629173989455184,
4
- "eval_loss": 0.47900134325027466,
5
- "eval_runtime": 7.7338,
6
- "eval_samples_per_second": 73.573,
7
- "eval_steps_per_second": 9.31
8
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_accuracy": 0.8576449912126538,
4
+ "eval_loss": 0.4589254856109619,
5
+ "eval_runtime": 7.9329,
6
+ "eval_samples_per_second": 71.727,
7
+ "eval_steps_per_second": 9.076
8
  }
runs/May09_07-16-20_361db62a36de/events.out.tfevents.1715259917.361db62a36de.34.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:649da8ed98001a4af8b5268f38716de03351fd30b732596dea56d6bc9be2f61e
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
  "total_flos": 3.3230947683690086e+18,
4
- "train_loss": 0.3864157530798841,
5
- "train_runtime": 1135.6162,
6
- "train_samples_per_second": 37.759,
7
- "train_steps_per_second": 1.18
8
  }
 
1
  {
2
  "epoch": 10.0,
3
  "total_flos": 3.3230947683690086e+18,
4
+ "train_loss": 0.45543073504718384,
5
+ "train_runtime": 1353.2313,
6
+ "train_samples_per_second": 31.687,
7
+ "train_steps_per_second": 1.98
8
  }
trainer_state.json CHANGED
@@ -1,616 +1,2140 @@
1
  {
2
- "best_metric": 0.47900134325027466,
3
- "best_model_checkpoint": "Action_model/checkpoint-800",
4
  "epoch": 10.0,
5
  "eval_steps": 100,
6
- "global_step": 1340,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.15,
13
- "grad_norm": 1.667541265487671,
14
- "learning_rate": 9.850746268656717e-05,
15
- "loss": 1.0751,
 
 
 
 
 
 
 
16
  "step": 20
17
  },
18
  {
19
- "epoch": 0.3,
20
- "grad_norm": 4.206006050109863,
21
- "learning_rate": 9.701492537313434e-05,
22
- "loss": 0.8775,
 
 
 
 
 
 
 
23
  "step": 40
24
  },
25
  {
26
- "epoch": 0.45,
27
- "grad_norm": 2.1778831481933594,
28
- "learning_rate": 9.552238805970149e-05,
29
- "loss": 0.7614,
 
 
 
 
 
 
 
30
  "step": 60
31
  },
32
  {
33
- "epoch": 0.6,
34
- "grad_norm": 2.380641222000122,
35
- "learning_rate": 9.402985074626867e-05,
36
- "loss": 0.701,
 
 
 
 
 
 
 
37
  "step": 80
38
  },
39
  {
40
- "epoch": 0.75,
41
- "grad_norm": 3.6440467834472656,
42
- "learning_rate": 9.253731343283582e-05,
43
- "loss": 0.7766,
 
 
 
 
 
 
 
44
  "step": 100
45
  },
46
  {
47
- "epoch": 0.75,
48
- "eval_accuracy": 0.8224956063268892,
49
- "eval_loss": 0.6780304312705994,
50
- "eval_runtime": 8.1524,
51
- "eval_samples_per_second": 69.796,
52
- "eval_steps_per_second": 8.832,
53
  "step": 100
54
  },
55
  {
56
- "epoch": 0.9,
57
- "grad_norm": 3.5913617610931396,
58
- "learning_rate": 9.104477611940299e-05,
59
- "loss": 0.8058,
 
 
 
 
 
 
 
60
  "step": 120
61
  },
62
  {
63
- "epoch": 1.04,
64
- "grad_norm": 3.761014938354492,
65
- "learning_rate": 8.955223880597016e-05,
66
- "loss": 0.7853,
 
 
 
 
 
 
 
67
  "step": 140
68
  },
69
  {
70
- "epoch": 1.19,
71
- "grad_norm": 2.7116057872772217,
72
- "learning_rate": 8.813432835820896e-05,
73
- "loss": 0.6584,
 
 
 
 
 
 
 
74
  "step": 160
75
  },
76
  {
77
- "epoch": 1.34,
78
- "grad_norm": 4.109683990478516,
79
- "learning_rate": 8.664179104477612e-05,
80
- "loss": 0.6056,
 
 
 
 
 
 
 
81
  "step": 180
82
  },
83
  {
84
- "epoch": 1.49,
85
- "grad_norm": 3.7420246601104736,
86
- "learning_rate": 8.514925373134329e-05,
87
- "loss": 0.61,
 
 
 
 
 
 
 
88
  "step": 200
89
  },
90
  {
91
- "epoch": 1.49,
92
- "eval_accuracy": 0.8242530755711776,
93
- "eval_loss": 0.6279409527778625,
94
- "eval_runtime": 7.7566,
95
- "eval_samples_per_second": 73.357,
96
- "eval_steps_per_second": 9.282,
97
  "step": 200
98
  },
99
  {
100
- "epoch": 1.64,
101
- "grad_norm": 4.6456379890441895,
102
- "learning_rate": 8.365671641791046e-05,
103
- "loss": 0.6016,
 
 
 
 
 
 
 
104
  "step": 220
105
  },
106
  {
107
- "epoch": 1.79,
108
- "grad_norm": 3.174193859100342,
109
- "learning_rate": 8.216417910447761e-05,
110
- "loss": 0.5734,
 
 
 
 
 
 
 
111
  "step": 240
112
  },
113
  {
114
- "epoch": 1.94,
115
- "grad_norm": 4.6657304763793945,
116
- "learning_rate": 8.067164179104479e-05,
117
- "loss": 0.5684,
 
 
 
 
 
 
 
118
  "step": 260
119
  },
120
  {
121
- "epoch": 2.09,
122
- "grad_norm": 4.963318347930908,
123
- "learning_rate": 7.917910447761194e-05,
124
- "loss": 0.5957,
 
 
 
 
 
 
 
125
  "step": 280
126
  },
127
  {
128
- "epoch": 2.24,
129
- "grad_norm": 5.692253589630127,
130
- "learning_rate": 7.776119402985074e-05,
131
- "loss": 0.4734,
 
 
 
 
 
 
 
132
  "step": 300
133
  },
134
  {
135
- "epoch": 2.24,
136
- "eval_accuracy": 0.827768014059754,
137
- "eval_loss": 0.5593364834785461,
138
- "eval_runtime": 7.7448,
139
- "eval_samples_per_second": 73.468,
140
- "eval_steps_per_second": 9.297,
141
  "step": 300
142
  },
143
  {
144
- "epoch": 2.39,
145
- "grad_norm": 3.6985843181610107,
146
- "learning_rate": 7.626865671641792e-05,
147
- "loss": 0.4646,
 
 
 
 
 
 
 
148
  "step": 320
149
  },
150
  {
151
- "epoch": 2.54,
152
- "grad_norm": 3.7862443923950195,
153
- "learning_rate": 7.477611940298508e-05,
154
- "loss": 0.4568,
 
 
 
 
 
 
 
155
  "step": 340
156
  },
157
  {
158
- "epoch": 2.69,
159
- "grad_norm": 4.695005893707275,
160
- "learning_rate": 7.328358208955224e-05,
161
- "loss": 0.4876,
 
 
 
 
 
 
 
162
  "step": 360
163
  },
164
  {
165
- "epoch": 2.84,
166
- "grad_norm": 3.9707813262939453,
167
- "learning_rate": 7.179104477611941e-05,
168
- "loss": 0.4723,
 
 
 
 
 
 
 
169
  "step": 380
170
  },
171
  {
172
- "epoch": 2.99,
173
- "grad_norm": 3.133072853088379,
174
- "learning_rate": 7.029850746268657e-05,
175
- "loss": 0.5275,
 
 
 
 
 
 
 
176
  "step": 400
177
  },
178
  {
179
- "epoch": 2.99,
180
- "eval_accuracy": 0.8418277680140598,
181
- "eval_loss": 0.5148488879203796,
182
- "eval_runtime": 7.8181,
183
- "eval_samples_per_second": 72.78,
184
- "eval_steps_per_second": 9.209,
185
  "step": 400
186
  },
187
  {
188
- "epoch": 3.13,
189
- "grad_norm": 3.1703858375549316,
190
- "learning_rate": 6.880597014925374e-05,
191
- "loss": 0.353,
 
 
 
 
 
 
 
192
  "step": 420
193
  },
194
  {
195
- "epoch": 3.28,
196
- "grad_norm": 3.551959276199341,
197
- "learning_rate": 6.73134328358209e-05,
198
- "loss": 0.3559,
 
 
 
 
 
 
 
199
  "step": 440
200
  },
201
  {
202
- "epoch": 3.43,
203
- "grad_norm": 3.8375322818756104,
204
- "learning_rate": 6.582089552238806e-05,
205
- "loss": 0.376,
 
 
 
 
 
 
 
206
  "step": 460
207
  },
208
  {
209
- "epoch": 3.58,
210
- "grad_norm": 4.613718032836914,
211
- "learning_rate": 6.432835820895523e-05,
212
- "loss": 0.4183,
 
 
 
 
 
 
 
213
  "step": 480
214
  },
215
  {
216
- "epoch": 3.73,
217
- "grad_norm": 9.122322082519531,
218
- "learning_rate": 6.283582089552239e-05,
219
- "loss": 0.3767,
 
 
 
 
 
 
 
220
  "step": 500
221
  },
222
  {
223
- "epoch": 3.73,
224
- "eval_accuracy": 0.843585237258348,
225
- "eval_loss": 0.5129419565200806,
226
- "eval_runtime": 7.7011,
227
- "eval_samples_per_second": 73.886,
228
- "eval_steps_per_second": 9.349,
229
  "step": 500
230
  },
231
  {
232
- "epoch": 3.88,
233
- "grad_norm": 1.9917536973953247,
234
- "learning_rate": 6.134328358208955e-05,
235
- "loss": 0.3943,
 
 
 
 
 
 
 
236
  "step": 520
237
  },
238
  {
239
- "epoch": 4.03,
240
- "grad_norm": 3.007828712463379,
241
- "learning_rate": 5.985074626865672e-05,
242
- "loss": 0.3885,
 
 
 
 
 
 
 
243
  "step": 540
244
  },
245
  {
246
- "epoch": 4.18,
247
- "grad_norm": 6.075244426727295,
248
- "learning_rate": 5.835820895522388e-05,
249
- "loss": 0.3312,
 
 
 
 
 
 
 
250
  "step": 560
251
  },
252
  {
253
- "epoch": 4.33,
254
- "grad_norm": 4.8264641761779785,
255
- "learning_rate": 5.686567164179105e-05,
256
- "loss": 0.3408,
 
 
 
 
 
 
 
257
  "step": 580
258
  },
259
  {
260
- "epoch": 4.48,
261
- "grad_norm": 2.8609578609466553,
262
- "learning_rate": 5.537313432835821e-05,
263
- "loss": 0.3207,
 
 
 
 
 
 
 
264
  "step": 600
265
  },
266
  {
267
- "epoch": 4.48,
268
- "eval_accuracy": 0.8558875219683656,
269
- "eval_loss": 0.4966126084327698,
270
- "eval_runtime": 7.6408,
271
- "eval_samples_per_second": 74.468,
272
- "eval_steps_per_second": 9.423,
273
  "step": 600
274
  },
275
  {
276
- "epoch": 4.63,
277
- "grad_norm": 5.234914302825928,
278
- "learning_rate": 5.388059701492537e-05,
279
- "loss": 0.3306,
 
 
 
 
 
 
 
280
  "step": 620
281
  },
282
  {
283
- "epoch": 4.78,
284
- "grad_norm": 4.566553592681885,
285
- "learning_rate": 5.238805970149254e-05,
286
- "loss": 0.3532,
 
 
 
 
 
 
 
287
  "step": 640
288
  },
289
  {
290
- "epoch": 4.93,
291
- "grad_norm": 4.077399253845215,
292
- "learning_rate": 5.08955223880597e-05,
293
- "loss": 0.3501,
 
 
 
 
 
 
 
294
  "step": 660
295
  },
296
  {
297
- "epoch": 5.07,
298
- "grad_norm": 3.5527923107147217,
299
- "learning_rate": 4.940298507462687e-05,
300
- "loss": 0.3147,
 
 
 
 
 
 
 
301
  "step": 680
302
  },
303
  {
304
- "epoch": 5.22,
305
- "grad_norm": 4.944146633148193,
306
- "learning_rate": 4.7910447761194035e-05,
307
- "loss": 0.3155,
 
 
 
 
 
 
 
308
  "step": 700
309
  },
310
  {
311
- "epoch": 5.22,
312
- "eval_accuracy": 0.8453427065026362,
313
- "eval_loss": 0.5251042246818542,
314
- "eval_runtime": 7.8327,
315
- "eval_samples_per_second": 72.644,
316
- "eval_steps_per_second": 9.192,
317
  "step": 700
318
  },
319
  {
320
- "epoch": 5.37,
321
- "grad_norm": 2.9990365505218506,
322
- "learning_rate": 4.6417910447761195e-05,
323
- "loss": 0.3121,
 
 
 
 
 
 
 
324
  "step": 720
325
  },
326
  {
327
- "epoch": 5.52,
328
- "grad_norm": 1.7021130323410034,
329
- "learning_rate": 4.492537313432836e-05,
330
- "loss": 0.3563,
 
 
 
 
 
 
 
331
  "step": 740
332
  },
333
  {
334
- "epoch": 5.67,
335
- "grad_norm": 4.41218376159668,
336
- "learning_rate": 4.343283582089552e-05,
337
- "loss": 0.3447,
 
 
 
 
 
 
 
338
  "step": 760
339
  },
340
  {
341
- "epoch": 5.82,
342
- "grad_norm": 2.955658197402954,
343
- "learning_rate": 4.194029850746269e-05,
344
- "loss": 0.2839,
 
 
 
 
 
 
 
345
  "step": 780
346
  },
347
  {
348
- "epoch": 5.97,
349
- "grad_norm": 3.0852389335632324,
350
- "learning_rate": 4.044776119402985e-05,
351
- "loss": 0.2565,
 
 
 
 
 
 
 
352
  "step": 800
353
  },
354
  {
355
- "epoch": 5.97,
356
- "eval_accuracy": 0.8629173989455184,
357
- "eval_loss": 0.47900134325027466,
358
- "eval_runtime": 7.78,
359
- "eval_samples_per_second": 73.136,
360
- "eval_steps_per_second": 9.255,
361
  "step": 800
362
  },
363
  {
364
- "epoch": 6.12,
365
- "grad_norm": 2.099174976348877,
366
- "learning_rate": 3.895522388059702e-05,
367
- "loss": 0.2818,
 
 
 
 
 
 
 
368
  "step": 820
369
  },
370
  {
371
- "epoch": 6.27,
372
- "grad_norm": 3.712127685546875,
373
- "learning_rate": 3.746268656716418e-05,
374
- "loss": 0.2444,
 
 
 
 
 
 
 
375
  "step": 840
376
  },
377
  {
378
- "epoch": 6.42,
379
- "grad_norm": 2.1818361282348633,
380
- "learning_rate": 3.5970149253731346e-05,
381
- "loss": 0.2418,
 
 
 
 
 
 
 
382
  "step": 860
383
  },
384
  {
385
- "epoch": 6.57,
386
- "grad_norm": 2.211638927459717,
387
- "learning_rate": 3.447761194029851e-05,
388
- "loss": 0.2684,
 
 
 
 
 
 
 
389
  "step": 880
390
  },
391
  {
392
- "epoch": 6.72,
393
- "grad_norm": 6.349803924560547,
394
- "learning_rate": 3.298507462686568e-05,
395
- "loss": 0.2791,
 
 
 
 
 
 
 
396
  "step": 900
397
  },
398
  {
399
- "epoch": 6.72,
400
- "eval_accuracy": 0.8523725834797891,
401
- "eval_loss": 0.5110830664634705,
402
- "eval_runtime": 7.7612,
403
- "eval_samples_per_second": 73.313,
404
- "eval_steps_per_second": 9.277,
405
  "step": 900
406
  },
407
  {
408
- "epoch": 6.87,
409
- "grad_norm": 4.6033759117126465,
410
- "learning_rate": 3.149253731343284e-05,
411
- "loss": 0.2444,
 
 
 
 
 
 
 
412
  "step": 920
413
  },
414
  {
415
- "epoch": 7.01,
416
- "grad_norm": 6.079771995544434,
417
- "learning_rate": 3e-05,
418
- "loss": 0.2812,
 
 
 
 
 
 
 
419
  "step": 940
420
  },
421
  {
422
- "epoch": 7.16,
423
- "grad_norm": 3.743011474609375,
424
- "learning_rate": 2.8507462686567167e-05,
425
- "loss": 0.183,
 
 
 
 
 
 
 
426
  "step": 960
427
  },
428
  {
429
- "epoch": 7.31,
430
- "grad_norm": 4.840090751647949,
431
- "learning_rate": 2.701492537313433e-05,
432
- "loss": 0.2689,
 
 
 
 
 
 
 
433
  "step": 980
434
  },
435
  {
436
- "epoch": 7.46,
437
- "grad_norm": 3.213412284851074,
438
- "learning_rate": 2.5522388059701496e-05,
439
- "loss": 0.1987,
 
 
 
 
 
 
 
440
  "step": 1000
441
  },
442
  {
443
- "epoch": 7.46,
444
- "eval_accuracy": 0.8453427065026362,
445
- "eval_loss": 0.5002422332763672,
446
- "eval_runtime": 7.6836,
447
- "eval_samples_per_second": 74.054,
448
- "eval_steps_per_second": 9.371,
449
  "step": 1000
450
  },
451
  {
452
- "epoch": 7.61,
453
- "grad_norm": 2.2559454441070557,
454
- "learning_rate": 2.402985074626866e-05,
455
- "loss": 0.2254,
 
 
 
 
 
 
 
456
  "step": 1020
457
  },
458
  {
459
- "epoch": 7.76,
460
- "grad_norm": 4.895073413848877,
461
- "learning_rate": 2.2537313432835822e-05,
462
- "loss": 0.283,
 
 
 
 
 
 
 
463
  "step": 1040
464
  },
465
  {
466
- "epoch": 7.91,
467
- "grad_norm": 3.8669703006744385,
468
- "learning_rate": 2.1044776119402985e-05,
469
- "loss": 0.281,
 
 
 
 
 
 
 
470
  "step": 1060
471
  },
472
  {
473
- "epoch": 8.06,
474
- "grad_norm": 2.0825252532958984,
475
- "learning_rate": 1.9552238805970148e-05,
476
- "loss": 0.1955,
 
 
 
 
 
 
 
477
  "step": 1080
478
  },
479
  {
480
- "epoch": 8.21,
481
- "grad_norm": 1.101592779159546,
482
- "learning_rate": 1.8059701492537314e-05,
483
- "loss": 0.2083,
 
 
 
 
 
 
 
484
  "step": 1100
485
  },
486
  {
487
- "epoch": 8.21,
488
- "eval_accuracy": 0.8629173989455184,
489
- "eval_loss": 0.5034471154212952,
490
- "eval_runtime": 7.7486,
491
- "eval_samples_per_second": 73.432,
492
- "eval_steps_per_second": 9.292,
493
  "step": 1100
494
  },
495
  {
496
- "epoch": 8.36,
497
- "grad_norm": 2.6236236095428467,
498
- "learning_rate": 1.6567164179104477e-05,
499
- "loss": 0.1409,
 
 
 
 
 
 
 
500
  "step": 1120
501
  },
502
  {
503
- "epoch": 8.51,
504
- "grad_norm": 1.433937668800354,
505
- "learning_rate": 1.5074626865671642e-05,
506
- "loss": 0.2434,
 
 
 
 
 
 
 
507
  "step": 1140
508
  },
509
  {
510
- "epoch": 8.66,
511
- "grad_norm": 2.8774006366729736,
512
- "learning_rate": 1.3582089552238805e-05,
513
- "loss": 0.2044,
 
 
 
 
 
 
 
514
  "step": 1160
515
  },
516
  {
517
- "epoch": 8.81,
518
- "grad_norm": 4.404654026031494,
519
- "learning_rate": 1.2089552238805971e-05,
520
- "loss": 0.2153,
 
 
 
 
 
 
 
521
  "step": 1180
522
  },
523
  {
524
- "epoch": 8.96,
525
- "grad_norm": 4.886945724487305,
526
- "learning_rate": 1.0597014925373134e-05,
527
- "loss": 0.2567,
 
 
 
 
 
 
 
528
  "step": 1200
529
  },
530
  {
531
- "epoch": 8.96,
532
  "eval_accuracy": 0.8576449912126538,
533
- "eval_loss": 0.4995073080062866,
534
- "eval_runtime": 7.7157,
535
- "eval_samples_per_second": 73.745,
536
- "eval_steps_per_second": 9.332,
537
  "step": 1200
538
  },
539
  {
540
- "epoch": 9.1,
541
- "grad_norm": 0.4216682016849518,
542
- "learning_rate": 9.104477611940299e-06,
543
- "loss": 0.203,
 
 
 
 
 
 
 
544
  "step": 1220
545
  },
546
  {
547
- "epoch": 9.25,
548
- "grad_norm": 4.639057159423828,
549
- "learning_rate": 7.611940298507463e-06,
550
- "loss": 0.1934,
 
 
 
 
 
 
 
551
  "step": 1240
552
  },
553
  {
554
- "epoch": 9.4,
555
- "grad_norm": 4.426870822906494,
556
- "learning_rate": 6.119402985074627e-06,
557
- "loss": 0.2067,
 
 
 
 
 
 
 
558
  "step": 1260
559
  },
560
  {
561
- "epoch": 9.55,
562
- "grad_norm": 2.948902130126953,
563
- "learning_rate": 4.626865671641791e-06,
564
- "loss": 0.2065,
 
 
 
 
 
 
 
565
  "step": 1280
566
  },
567
  {
568
- "epoch": 9.7,
569
- "grad_norm": 2.3631768226623535,
570
- "learning_rate": 3.134328358208955e-06,
571
- "loss": 0.2127,
 
 
 
 
 
 
 
572
  "step": 1300
573
  },
574
  {
575
- "epoch": 9.7,
576
- "eval_accuracy": 0.8488576449912126,
577
- "eval_loss": 0.5034462809562683,
578
- "eval_runtime": 7.6386,
579
- "eval_samples_per_second": 74.49,
580
- "eval_steps_per_second": 9.426,
581
  "step": 1300
582
  },
583
  {
584
- "epoch": 9.85,
585
- "grad_norm": 2.909392833709717,
586
- "learning_rate": 1.6417910447761194e-06,
587
- "loss": 0.1547,
 
 
 
 
 
 
 
588
  "step": 1320
589
  },
590
  {
591
- "epoch": 10.0,
592
- "grad_norm": 2.5194036960601807,
593
- "learning_rate": 1.4925373134328358e-07,
594
- "loss": 0.2161,
 
 
 
 
 
 
 
595
  "step": 1340
596
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
597
  {
598
  "epoch": 10.0,
599
- "step": 1340,
600
  "total_flos": 3.3230947683690086e+18,
601
- "train_loss": 0.3864157530798841,
602
- "train_runtime": 1135.6162,
603
- "train_samples_per_second": 37.759,
604
- "train_steps_per_second": 1.18
605
  }
606
  ],
607
- "logging_steps": 20,
608
- "max_steps": 1340,
609
  "num_input_tokens_seen": 0,
610
  "num_train_epochs": 10,
611
  "save_steps": 100,
612
  "total_flos": 3.3230947683690086e+18,
613
- "train_batch_size": 32,
614
  "trial_name": null,
615
  "trial_params": null
616
  }
 
1
  {
2
+ "best_metric": 0.4589254856109619,
3
+ "best_model_checkpoint": "Action_model/checkpoint-1500",
4
  "epoch": 10.0,
5
  "eval_steps": 100,
6
+ "global_step": 2680,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.04,
13
+ "grad_norm": 1.7369908094406128,
14
+ "learning_rate": 9.96268656716418e-05,
15
+ "loss": 2.2759,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.07,
20
+ "grad_norm": 1.753720998764038,
21
+ "learning_rate": 9.925373134328359e-05,
22
+ "loss": 2.1743,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.11,
27
+ "grad_norm": 1.8532754182815552,
28
+ "learning_rate": 9.888059701492539e-05,
29
+ "loss": 2.0233,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.15,
34
+ "grad_norm": 2.195688486099243,
35
+ "learning_rate": 9.850746268656717e-05,
36
+ "loss": 1.8293,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.19,
41
+ "grad_norm": 2.392077684402466,
42
+ "learning_rate": 9.813432835820896e-05,
43
+ "loss": 1.7307,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.22,
48
+ "grad_norm": 2.851775646209717,
49
+ "learning_rate": 9.776119402985075e-05,
50
+ "loss": 1.5716,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.26,
55
+ "grad_norm": 2.2557411193847656,
56
+ "learning_rate": 9.738805970149254e-05,
57
+ "loss": 1.4694,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.3,
62
+ "grad_norm": 2.4612302780151367,
63
+ "learning_rate": 9.701492537313434e-05,
64
+ "loss": 1.3609,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.34,
69
+ "grad_norm": 2.7514560222625732,
70
+ "learning_rate": 9.664179104477612e-05,
71
+ "loss": 1.2871,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.37,
76
+ "grad_norm": 3.6256659030914307,
77
+ "learning_rate": 9.626865671641792e-05,
78
+ "loss": 1.2754,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.37,
83
+ "eval_accuracy": 0.7328646748681898,
84
+ "eval_loss": 1.1163370609283447,
85
+ "eval_runtime": 12.5514,
86
+ "eval_samples_per_second": 45.333,
87
+ "eval_steps_per_second": 5.736,
88
  "step": 100
89
  },
90
  {
91
+ "epoch": 0.41,
92
+ "grad_norm": 2.642601728439331,
93
+ "learning_rate": 9.58955223880597e-05,
94
+ "loss": 1.2354,
95
+ "step": 110
96
+ },
97
+ {
98
+ "epoch": 0.45,
99
+ "grad_norm": 2.4862725734710693,
100
+ "learning_rate": 9.552238805970149e-05,
101
+ "loss": 1.169,
102
  "step": 120
103
  },
104
  {
105
+ "epoch": 0.49,
106
+ "grad_norm": 3.962764263153076,
107
+ "learning_rate": 9.514925373134329e-05,
108
+ "loss": 1.2546,
109
+ "step": 130
110
+ },
111
+ {
112
+ "epoch": 0.52,
113
+ "grad_norm": 2.9388816356658936,
114
+ "learning_rate": 9.477611940298507e-05,
115
+ "loss": 1.1702,
116
  "step": 140
117
  },
118
  {
119
+ "epoch": 0.56,
120
+ "grad_norm": 4.958592414855957,
121
+ "learning_rate": 9.440298507462687e-05,
122
+ "loss": 1.0865,
123
+ "step": 150
124
+ },
125
+ {
126
+ "epoch": 0.6,
127
+ "grad_norm": 3.4470815658569336,
128
+ "learning_rate": 9.402985074626867e-05,
129
+ "loss": 1.0097,
130
  "step": 160
131
  },
132
  {
133
+ "epoch": 0.63,
134
+ "grad_norm": 4.423004627227783,
135
+ "learning_rate": 9.365671641791045e-05,
136
+ "loss": 1.0749,
137
+ "step": 170
138
+ },
139
+ {
140
+ "epoch": 0.67,
141
+ "grad_norm": 2.808164358139038,
142
+ "learning_rate": 9.328358208955224e-05,
143
+ "loss": 0.9732,
144
  "step": 180
145
  },
146
  {
147
+ "epoch": 0.71,
148
+ "grad_norm": 6.00456428527832,
149
+ "learning_rate": 9.291044776119402e-05,
150
+ "loss": 1.0009,
151
+ "step": 190
152
+ },
153
+ {
154
+ "epoch": 0.75,
155
+ "grad_norm": 5.091552734375,
156
+ "learning_rate": 9.253731343283582e-05,
157
+ "loss": 0.9345,
158
  "step": 200
159
  },
160
  {
161
+ "epoch": 0.75,
162
+ "eval_accuracy": 0.7996485061511424,
163
+ "eval_loss": 0.8296495079994202,
164
+ "eval_runtime": 7.8912,
165
+ "eval_samples_per_second": 72.105,
166
+ "eval_steps_per_second": 9.124,
167
  "step": 200
168
  },
169
  {
170
+ "epoch": 0.78,
171
+ "grad_norm": 3.2533326148986816,
172
+ "learning_rate": 9.216417910447762e-05,
173
+ "loss": 0.793,
174
+ "step": 210
175
+ },
176
+ {
177
+ "epoch": 0.82,
178
+ "grad_norm": 6.073918342590332,
179
+ "learning_rate": 9.17910447761194e-05,
180
+ "loss": 0.9835,
181
  "step": 220
182
  },
183
  {
184
+ "epoch": 0.86,
185
+ "grad_norm": 3.6311192512512207,
186
+ "learning_rate": 9.14179104477612e-05,
187
+ "loss": 0.8801,
188
+ "step": 230
189
+ },
190
+ {
191
+ "epoch": 0.9,
192
+ "grad_norm": 4.446895599365234,
193
+ "learning_rate": 9.104477611940299e-05,
194
+ "loss": 1.0534,
195
  "step": 240
196
  },
197
  {
198
+ "epoch": 0.93,
199
+ "grad_norm": 4.668705463409424,
200
+ "learning_rate": 9.067164179104479e-05,
201
+ "loss": 0.9396,
202
+ "step": 250
203
+ },
204
+ {
205
+ "epoch": 0.97,
206
+ "grad_norm": 6.191302299499512,
207
+ "learning_rate": 9.029850746268657e-05,
208
+ "loss": 0.9275,
209
  "step": 260
210
  },
211
  {
212
+ "epoch": 1.01,
213
+ "grad_norm": 3.170959711074829,
214
+ "learning_rate": 8.992537313432836e-05,
215
+ "loss": 0.8595,
216
+ "step": 270
217
+ },
218
+ {
219
+ "epoch": 1.04,
220
+ "grad_norm": 3.690964460372925,
221
+ "learning_rate": 8.955223880597016e-05,
222
+ "loss": 0.733,
223
  "step": 280
224
  },
225
  {
226
+ "epoch": 1.08,
227
+ "grad_norm": 4.871851444244385,
228
+ "learning_rate": 8.917910447761194e-05,
229
+ "loss": 0.7623,
230
+ "step": 290
231
+ },
232
+ {
233
+ "epoch": 1.12,
234
+ "grad_norm": 3.3851799964904785,
235
+ "learning_rate": 8.880597014925374e-05,
236
+ "loss": 0.8816,
237
  "step": 300
238
  },
239
  {
240
+ "epoch": 1.12,
241
+ "eval_accuracy": 0.8101933216168717,
242
+ "eval_loss": 0.7156229615211487,
243
+ "eval_runtime": 7.8519,
244
+ "eval_samples_per_second": 72.467,
245
+ "eval_steps_per_second": 9.17,
246
  "step": 300
247
  },
248
  {
249
+ "epoch": 1.16,
250
+ "grad_norm": 3.334380865097046,
251
+ "learning_rate": 8.843283582089554e-05,
252
+ "loss": 0.8567,
253
+ "step": 310
254
+ },
255
+ {
256
+ "epoch": 1.19,
257
+ "grad_norm": 4.673859596252441,
258
+ "learning_rate": 8.805970149253732e-05,
259
+ "loss": 0.7926,
260
  "step": 320
261
  },
262
  {
263
+ "epoch": 1.23,
264
+ "grad_norm": 3.3042550086975098,
265
+ "learning_rate": 8.76865671641791e-05,
266
+ "loss": 0.6847,
267
+ "step": 330
268
+ },
269
+ {
270
+ "epoch": 1.27,
271
+ "grad_norm": 5.4356513023376465,
272
+ "learning_rate": 8.731343283582089e-05,
273
+ "loss": 0.7656,
274
  "step": 340
275
  },
276
  {
277
+ "epoch": 1.31,
278
+ "grad_norm": 7.050413131713867,
279
+ "learning_rate": 8.694029850746269e-05,
280
+ "loss": 0.6658,
281
+ "step": 350
282
+ },
283
+ {
284
+ "epoch": 1.34,
285
+ "grad_norm": 5.980592727661133,
286
+ "learning_rate": 8.656716417910447e-05,
287
+ "loss": 0.7948,
288
  "step": 360
289
  },
290
  {
291
+ "epoch": 1.38,
292
+ "grad_norm": 3.894716739654541,
293
+ "learning_rate": 8.619402985074627e-05,
294
+ "loss": 0.8381,
295
+ "step": 370
296
+ },
297
+ {
298
+ "epoch": 1.42,
299
+ "grad_norm": 7.189664363861084,
300
+ "learning_rate": 8.582089552238807e-05,
301
+ "loss": 0.6532,
302
  "step": 380
303
  },
304
  {
305
+ "epoch": 1.46,
306
+ "grad_norm": 4.317276477813721,
307
+ "learning_rate": 8.544776119402986e-05,
308
+ "loss": 0.7763,
309
+ "step": 390
310
+ },
311
+ {
312
+ "epoch": 1.49,
313
+ "grad_norm": 4.480589866638184,
314
+ "learning_rate": 8.511194029850747e-05,
315
+ "loss": 0.7425,
316
  "step": 400
317
  },
318
  {
319
+ "epoch": 1.49,
320
+ "eval_accuracy": 0.8066783831282952,
321
+ "eval_loss": 0.6529447436332703,
322
+ "eval_runtime": 7.793,
323
+ "eval_samples_per_second": 73.014,
324
+ "eval_steps_per_second": 9.239,
325
  "step": 400
326
  },
327
  {
328
+ "epoch": 1.53,
329
+ "grad_norm": 4.1799163818359375,
330
+ "learning_rate": 8.473880597014926e-05,
331
+ "loss": 0.6928,
332
+ "step": 410
333
+ },
334
+ {
335
+ "epoch": 1.57,
336
+ "grad_norm": 4.81996488571167,
337
+ "learning_rate": 8.436567164179105e-05,
338
+ "loss": 0.7769,
339
  "step": 420
340
  },
341
  {
342
+ "epoch": 1.6,
343
+ "grad_norm": 7.18645715713501,
344
+ "learning_rate": 8.399253731343283e-05,
345
+ "loss": 0.6848,
346
+ "step": 430
347
+ },
348
+ {
349
+ "epoch": 1.64,
350
+ "grad_norm": 3.888197660446167,
351
+ "learning_rate": 8.361940298507463e-05,
352
+ "loss": 0.5977,
353
  "step": 440
354
  },
355
  {
356
+ "epoch": 1.68,
357
+ "grad_norm": 7.374312877655029,
358
+ "learning_rate": 8.324626865671642e-05,
359
+ "loss": 0.6001,
360
+ "step": 450
361
+ },
362
+ {
363
+ "epoch": 1.72,
364
+ "grad_norm": 6.553064823150635,
365
+ "learning_rate": 8.287313432835821e-05,
366
+ "loss": 0.6683,
367
  "step": 460
368
  },
369
  {
370
+ "epoch": 1.75,
371
+ "grad_norm": 3.466761589050293,
372
+ "learning_rate": 8.25e-05,
373
+ "loss": 0.6484,
374
+ "step": 470
375
+ },
376
+ {
377
+ "epoch": 1.79,
378
+ "grad_norm": 3.534076690673828,
379
+ "learning_rate": 8.21268656716418e-05,
380
+ "loss": 0.6589,
381
  "step": 480
382
  },
383
  {
384
+ "epoch": 1.83,
385
+ "grad_norm": 3.581280469894409,
386
+ "learning_rate": 8.17537313432836e-05,
387
+ "loss": 0.6173,
388
+ "step": 490
389
+ },
390
+ {
391
+ "epoch": 1.87,
392
+ "grad_norm": 6.162041664123535,
393
+ "learning_rate": 8.138059701492538e-05,
394
+ "loss": 0.6883,
395
  "step": 500
396
  },
397
  {
398
+ "epoch": 1.87,
399
+ "eval_accuracy": 0.8242530755711776,
400
+ "eval_loss": 0.6078779697418213,
401
+ "eval_runtime": 7.6716,
402
+ "eval_samples_per_second": 74.169,
403
+ "eval_steps_per_second": 9.385,
404
  "step": 500
405
  },
406
  {
407
+ "epoch": 1.9,
408
+ "grad_norm": 5.477086067199707,
409
+ "learning_rate": 8.100746268656717e-05,
410
+ "loss": 0.5952,
411
+ "step": 510
412
+ },
413
+ {
414
+ "epoch": 1.94,
415
+ "grad_norm": 2.389667510986328,
416
+ "learning_rate": 8.063432835820895e-05,
417
+ "loss": 0.5193,
418
  "step": 520
419
  },
420
  {
421
+ "epoch": 1.98,
422
+ "grad_norm": 5.730781555175781,
423
+ "learning_rate": 8.026119402985075e-05,
424
+ "loss": 0.6818,
425
+ "step": 530
426
+ },
427
+ {
428
+ "epoch": 2.01,
429
+ "grad_norm": 6.305990219116211,
430
+ "learning_rate": 7.992537313432836e-05,
431
+ "loss": 0.5738,
432
  "step": 540
433
  },
434
  {
435
+ "epoch": 2.05,
436
+ "grad_norm": 3.507434368133545,
437
+ "learning_rate": 7.955223880597016e-05,
438
+ "loss": 0.5685,
439
+ "step": 550
440
+ },
441
+ {
442
+ "epoch": 2.09,
443
+ "grad_norm": 12.683993339538574,
444
+ "learning_rate": 7.917910447761194e-05,
445
+ "loss": 0.6684,
446
  "step": 560
447
  },
448
  {
449
+ "epoch": 2.13,
450
+ "grad_norm": 5.5166916847229,
451
+ "learning_rate": 7.880597014925374e-05,
452
+ "loss": 0.4787,
453
+ "step": 570
454
+ },
455
+ {
456
+ "epoch": 2.16,
457
+ "grad_norm": 6.427499294281006,
458
+ "learning_rate": 7.843283582089552e-05,
459
+ "loss": 0.5818,
460
  "step": 580
461
  },
462
  {
463
+ "epoch": 2.2,
464
+ "grad_norm": 5.062973976135254,
465
+ "learning_rate": 7.805970149253732e-05,
466
+ "loss": 0.4766,
467
+ "step": 590
468
+ },
469
+ {
470
+ "epoch": 2.24,
471
+ "grad_norm": 5.720675945281982,
472
+ "learning_rate": 7.768656716417911e-05,
473
+ "loss": 0.5454,
474
  "step": 600
475
  },
476
  {
477
+ "epoch": 2.24,
478
+ "eval_accuracy": 0.8347978910369068,
479
+ "eval_loss": 0.5604887008666992,
480
+ "eval_runtime": 7.7133,
481
+ "eval_samples_per_second": 73.769,
482
+ "eval_steps_per_second": 9.335,
483
  "step": 600
484
  },
485
  {
486
+ "epoch": 2.28,
487
+ "grad_norm": 7.875051021575928,
488
+ "learning_rate": 7.731343283582089e-05,
489
+ "loss": 0.5935,
490
+ "step": 610
491
+ },
492
+ {
493
+ "epoch": 2.31,
494
+ "grad_norm": 4.378401756286621,
495
+ "learning_rate": 7.694029850746269e-05,
496
+ "loss": 0.4639,
497
  "step": 620
498
  },
499
  {
500
+ "epoch": 2.35,
501
+ "grad_norm": 7.522930145263672,
502
+ "learning_rate": 7.656716417910448e-05,
503
+ "loss": 0.4867,
504
+ "step": 630
505
+ },
506
+ {
507
+ "epoch": 2.39,
508
+ "grad_norm": 6.3615288734436035,
509
+ "learning_rate": 7.619402985074627e-05,
510
+ "loss": 0.5302,
511
  "step": 640
512
  },
513
  {
514
+ "epoch": 2.43,
515
+ "grad_norm": 3.8204784393310547,
516
+ "learning_rate": 7.582089552238806e-05,
517
+ "loss": 0.3864,
518
+ "step": 650
519
+ },
520
+ {
521
+ "epoch": 2.46,
522
+ "grad_norm": 2.3520662784576416,
523
+ "learning_rate": 7.544776119402986e-05,
524
+ "loss": 0.6458,
525
  "step": 660
526
  },
527
  {
528
+ "epoch": 2.5,
529
+ "grad_norm": 3.9832942485809326,
530
+ "learning_rate": 7.507462686567166e-05,
531
+ "loss": 0.494,
532
+ "step": 670
533
+ },
534
+ {
535
+ "epoch": 2.54,
536
+ "grad_norm": 3.6783320903778076,
537
+ "learning_rate": 7.470149253731343e-05,
538
+ "loss": 0.6213,
539
  "step": 680
540
  },
541
  {
542
+ "epoch": 2.57,
543
+ "grad_norm": 4.528789520263672,
544
+ "learning_rate": 7.432835820895523e-05,
545
+ "loss": 0.615,
546
+ "step": 690
547
+ },
548
+ {
549
+ "epoch": 2.61,
550
+ "grad_norm": 5.556227207183838,
551
+ "learning_rate": 7.395522388059701e-05,
552
+ "loss": 0.5383,
553
  "step": 700
554
  },
555
  {
556
+ "epoch": 2.61,
557
+ "eval_accuracy": 0.8295254833040422,
558
+ "eval_loss": 0.5571200251579285,
559
+ "eval_runtime": 7.8934,
560
+ "eval_samples_per_second": 72.085,
561
+ "eval_steps_per_second": 9.122,
562
  "step": 700
563
  },
564
  {
565
+ "epoch": 2.65,
566
+ "grad_norm": 4.617480754852295,
567
+ "learning_rate": 7.358208955223881e-05,
568
+ "loss": 0.4987,
569
+ "step": 710
570
+ },
571
+ {
572
+ "epoch": 2.69,
573
+ "grad_norm": 4.6940412521362305,
574
+ "learning_rate": 7.32089552238806e-05,
575
+ "loss": 0.5466,
576
  "step": 720
577
  },
578
  {
579
+ "epoch": 2.72,
580
+ "grad_norm": 3.8839175701141357,
581
+ "learning_rate": 7.283582089552239e-05,
582
+ "loss": 0.5409,
583
+ "step": 730
584
+ },
585
+ {
586
+ "epoch": 2.76,
587
+ "grad_norm": 6.855696201324463,
588
+ "learning_rate": 7.246268656716419e-05,
589
+ "loss": 0.3972,
590
  "step": 740
591
  },
592
  {
593
+ "epoch": 2.8,
594
+ "grad_norm": 3.9779269695281982,
595
+ "learning_rate": 7.208955223880597e-05,
596
+ "loss": 0.4719,
597
+ "step": 750
598
+ },
599
+ {
600
+ "epoch": 2.84,
601
+ "grad_norm": 10.327420234680176,
602
+ "learning_rate": 7.171641791044776e-05,
603
+ "loss": 0.668,
604
  "step": 760
605
  },
606
  {
607
+ "epoch": 2.87,
608
+ "grad_norm": 5.06951904296875,
609
+ "learning_rate": 7.134328358208956e-05,
610
+ "loss": 0.5899,
611
+ "step": 770
612
+ },
613
+ {
614
+ "epoch": 2.91,
615
+ "grad_norm": 5.539373397827148,
616
+ "learning_rate": 7.097014925373134e-05,
617
+ "loss": 0.5813,
618
  "step": 780
619
  },
620
  {
621
+ "epoch": 2.95,
622
+ "grad_norm": 4.622121334075928,
623
+ "learning_rate": 7.059701492537314e-05,
624
+ "loss": 0.5294,
625
+ "step": 790
626
+ },
627
+ {
628
+ "epoch": 2.99,
629
+ "grad_norm": 2.6457552909851074,
630
+ "learning_rate": 7.022388059701493e-05,
631
+ "loss": 0.5442,
632
  "step": 800
633
  },
634
  {
635
+ "epoch": 2.99,
636
+ "eval_accuracy": 0.8189806678383128,
637
+ "eval_loss": 0.5864126682281494,
638
+ "eval_runtime": 7.8507,
639
+ "eval_samples_per_second": 72.478,
640
+ "eval_steps_per_second": 9.171,
641
  "step": 800
642
  },
643
  {
644
+ "epoch": 3.02,
645
+ "grad_norm": 3.373798370361328,
646
+ "learning_rate": 6.985074626865672e-05,
647
+ "loss": 0.4183,
648
+ "step": 810
649
+ },
650
+ {
651
+ "epoch": 3.06,
652
+ "grad_norm": 4.0179667472839355,
653
+ "learning_rate": 6.947761194029851e-05,
654
+ "loss": 0.3611,
655
  "step": 820
656
  },
657
  {
658
+ "epoch": 3.1,
659
+ "grad_norm": 7.72437858581543,
660
+ "learning_rate": 6.91044776119403e-05,
661
+ "loss": 0.4543,
662
+ "step": 830
663
+ },
664
+ {
665
+ "epoch": 3.13,
666
+ "grad_norm": 3.1097893714904785,
667
+ "learning_rate": 6.873134328358209e-05,
668
+ "loss": 0.5194,
669
  "step": 840
670
  },
671
  {
672
+ "epoch": 3.17,
673
+ "grad_norm": 6.581250190734863,
674
+ "learning_rate": 6.835820895522388e-05,
675
+ "loss": 0.3839,
676
+ "step": 850
677
+ },
678
+ {
679
+ "epoch": 3.21,
680
+ "grad_norm": 5.605171203613281,
681
+ "learning_rate": 6.798507462686568e-05,
682
+ "loss": 0.4499,
683
  "step": 860
684
  },
685
  {
686
+ "epoch": 3.25,
687
+ "grad_norm": 2.834651231765747,
688
+ "learning_rate": 6.761194029850747e-05,
689
+ "loss": 0.5067,
690
+ "step": 870
691
+ },
692
+ {
693
+ "epoch": 3.28,
694
+ "grad_norm": 4.615099906921387,
695
+ "learning_rate": 6.723880597014926e-05,
696
+ "loss": 0.4869,
697
  "step": 880
698
  },
699
  {
700
+ "epoch": 3.32,
701
+ "grad_norm": 6.115981101989746,
702
+ "learning_rate": 6.686567164179106e-05,
703
+ "loss": 0.4793,
704
+ "step": 890
705
+ },
706
+ {
707
+ "epoch": 3.36,
708
+ "grad_norm": 1.1021697521209717,
709
+ "learning_rate": 6.649253731343283e-05,
710
+ "loss": 0.3986,
711
  "step": 900
712
  },
713
  {
714
+ "epoch": 3.36,
715
+ "eval_accuracy": 0.8312829525483304,
716
+ "eval_loss": 0.5632173418998718,
717
+ "eval_runtime": 7.731,
718
+ "eval_samples_per_second": 73.6,
719
+ "eval_steps_per_second": 9.313,
720
  "step": 900
721
  },
722
  {
723
+ "epoch": 3.4,
724
+ "grad_norm": 7.019008159637451,
725
+ "learning_rate": 6.611940298507463e-05,
726
+ "loss": 0.383,
727
+ "step": 910
728
+ },
729
+ {
730
+ "epoch": 3.43,
731
+ "grad_norm": 2.586031913757324,
732
+ "learning_rate": 6.574626865671642e-05,
733
+ "loss": 0.2752,
734
  "step": 920
735
  },
736
  {
737
+ "epoch": 3.47,
738
+ "grad_norm": 2.5189669132232666,
739
+ "learning_rate": 6.537313432835821e-05,
740
+ "loss": 0.2944,
741
+ "step": 930
742
+ },
743
+ {
744
+ "epoch": 3.51,
745
+ "grad_norm": 10.028382301330566,
746
+ "learning_rate": 6.500000000000001e-05,
747
+ "loss": 0.4378,
748
  "step": 940
749
  },
750
  {
751
+ "epoch": 3.54,
752
+ "grad_norm": 1.8697803020477295,
753
+ "learning_rate": 6.462686567164179e-05,
754
+ "loss": 0.3956,
755
+ "step": 950
756
+ },
757
+ {
758
+ "epoch": 3.58,
759
+ "grad_norm": 5.872415065765381,
760
+ "learning_rate": 6.425373134328359e-05,
761
+ "loss": 0.338,
762
  "step": 960
763
  },
764
  {
765
+ "epoch": 3.62,
766
+ "grad_norm": 8.272451400756836,
767
+ "learning_rate": 6.388059701492538e-05,
768
+ "loss": 0.4264,
769
+ "step": 970
770
+ },
771
+ {
772
+ "epoch": 3.66,
773
+ "grad_norm": 9.422249794006348,
774
+ "learning_rate": 6.350746268656716e-05,
775
+ "loss": 0.4258,
776
  "step": 980
777
  },
778
  {
779
+ "epoch": 3.69,
780
+ "grad_norm": 8.768738746643066,
781
+ "learning_rate": 6.313432835820896e-05,
782
+ "loss": 0.3308,
783
+ "step": 990
784
+ },
785
+ {
786
+ "epoch": 3.73,
787
+ "grad_norm": 6.355968475341797,
788
+ "learning_rate": 6.276119402985074e-05,
789
+ "loss": 0.3438,
790
  "step": 1000
791
  },
792
  {
793
+ "epoch": 3.73,
794
+ "eval_accuracy": 0.836555360281195,
795
+ "eval_loss": 0.5606371760368347,
796
+ "eval_runtime": 7.818,
797
+ "eval_samples_per_second": 72.781,
798
+ "eval_steps_per_second": 9.21,
799
  "step": 1000
800
  },
801
  {
802
+ "epoch": 3.77,
803
+ "grad_norm": 3.973480463027954,
804
+ "learning_rate": 6.238805970149254e-05,
805
+ "loss": 0.5042,
806
+ "step": 1010
807
+ },
808
+ {
809
+ "epoch": 3.81,
810
+ "grad_norm": 5.739313125610352,
811
+ "learning_rate": 6.201492537313434e-05,
812
+ "loss": 0.4515,
813
  "step": 1020
814
  },
815
  {
816
+ "epoch": 3.84,
817
+ "grad_norm": 4.196649074554443,
818
+ "learning_rate": 6.164179104477613e-05,
819
+ "loss": 0.4404,
820
+ "step": 1030
821
+ },
822
+ {
823
+ "epoch": 3.88,
824
+ "grad_norm": 4.671971321105957,
825
+ "learning_rate": 6.126865671641791e-05,
826
+ "loss": 0.4746,
827
  "step": 1040
828
  },
829
  {
830
+ "epoch": 3.92,
831
+ "grad_norm": 6.87581205368042,
832
+ "learning_rate": 6.08955223880597e-05,
833
+ "loss": 0.4637,
834
+ "step": 1050
835
+ },
836
+ {
837
+ "epoch": 3.96,
838
+ "grad_norm": 7.224815368652344,
839
+ "learning_rate": 6.052238805970149e-05,
840
+ "loss": 0.4754,
841
  "step": 1060
842
  },
843
  {
844
+ "epoch": 3.99,
845
+ "grad_norm": 4.4340314865112305,
846
+ "learning_rate": 6.014925373134329e-05,
847
+ "loss": 0.4165,
848
+ "step": 1070
849
+ },
850
+ {
851
+ "epoch": 4.03,
852
+ "grad_norm": 1.151932716369629,
853
+ "learning_rate": 5.9776119402985076e-05,
854
+ "loss": 0.3498,
855
  "step": 1080
856
  },
857
  {
858
+ "epoch": 4.07,
859
+ "grad_norm": 6.31879997253418,
860
+ "learning_rate": 5.940298507462687e-05,
861
+ "loss": 0.3505,
862
+ "step": 1090
863
+ },
864
+ {
865
+ "epoch": 4.1,
866
+ "grad_norm": 4.674696445465088,
867
+ "learning_rate": 5.902985074626865e-05,
868
+ "loss": 0.4345,
869
  "step": 1100
870
  },
871
  {
872
+ "epoch": 4.1,
873
+ "eval_accuracy": 0.836555360281195,
874
+ "eval_loss": 0.5353797674179077,
875
+ "eval_runtime": 7.9559,
876
+ "eval_samples_per_second": 71.519,
877
+ "eval_steps_per_second": 9.05,
878
  "step": 1100
879
  },
880
  {
881
+ "epoch": 4.14,
882
+ "grad_norm": 6.790203094482422,
883
+ "learning_rate": 5.865671641791045e-05,
884
+ "loss": 0.3189,
885
+ "step": 1110
886
+ },
887
+ {
888
+ "epoch": 4.18,
889
+ "grad_norm": 5.554905414581299,
890
+ "learning_rate": 5.828358208955225e-05,
891
+ "loss": 0.3255,
892
  "step": 1120
893
  },
894
  {
895
+ "epoch": 4.22,
896
+ "grad_norm": 1.87189781665802,
897
+ "learning_rate": 5.7910447761194034e-05,
898
+ "loss": 0.2613,
899
+ "step": 1130
900
+ },
901
+ {
902
+ "epoch": 4.25,
903
+ "grad_norm": 3.4729249477386475,
904
+ "learning_rate": 5.7537313432835826e-05,
905
+ "loss": 0.4037,
906
  "step": 1140
907
  },
908
  {
909
+ "epoch": 4.29,
910
+ "grad_norm": 3.2373063564300537,
911
+ "learning_rate": 5.716417910447761e-05,
912
+ "loss": 0.384,
913
+ "step": 1150
914
+ },
915
+ {
916
+ "epoch": 4.33,
917
+ "grad_norm": 1.8042526245117188,
918
+ "learning_rate": 5.679104477611941e-05,
919
+ "loss": 0.4024,
920
  "step": 1160
921
  },
922
  {
923
+ "epoch": 4.37,
924
+ "grad_norm": 0.9592193365097046,
925
+ "learning_rate": 5.64179104477612e-05,
926
+ "loss": 0.3646,
927
+ "step": 1170
928
+ },
929
+ {
930
+ "epoch": 4.4,
931
+ "grad_norm": 4.0469584465026855,
932
+ "learning_rate": 5.6044776119402986e-05,
933
+ "loss": 0.3622,
934
  "step": 1180
935
  },
936
  {
937
+ "epoch": 4.44,
938
+ "grad_norm": 4.470405578613281,
939
+ "learning_rate": 5.5671641791044784e-05,
940
+ "loss": 0.2996,
941
+ "step": 1190
942
+ },
943
+ {
944
+ "epoch": 4.48,
945
+ "grad_norm": 6.086768627166748,
946
+ "learning_rate": 5.529850746268657e-05,
947
+ "loss": 0.4523,
948
  "step": 1200
949
  },
950
  {
951
+ "epoch": 4.48,
952
  "eval_accuracy": 0.8576449912126538,
953
+ "eval_loss": 0.49876561760902405,
954
+ "eval_runtime": 7.8527,
955
+ "eval_samples_per_second": 72.459,
956
+ "eval_steps_per_second": 9.169,
957
  "step": 1200
958
  },
959
  {
960
+ "epoch": 4.51,
961
+ "grad_norm": 3.478428363800049,
962
+ "learning_rate": 5.492537313432836e-05,
963
+ "loss": 0.4198,
964
+ "step": 1210
965
+ },
966
+ {
967
+ "epoch": 4.55,
968
+ "grad_norm": 4.539990425109863,
969
+ "learning_rate": 5.455223880597016e-05,
970
+ "loss": 0.3125,
971
  "step": 1220
972
  },
973
  {
974
+ "epoch": 4.59,
975
+ "grad_norm": 3.971435070037842,
976
+ "learning_rate": 5.4179104477611943e-05,
977
+ "loss": 0.2773,
978
+ "step": 1230
979
+ },
980
+ {
981
+ "epoch": 4.63,
982
+ "grad_norm": 7.168191909790039,
983
+ "learning_rate": 5.3805970149253735e-05,
984
+ "loss": 0.4852,
985
  "step": 1240
986
  },
987
  {
988
+ "epoch": 4.66,
989
+ "grad_norm": 2.896576166152954,
990
+ "learning_rate": 5.343283582089552e-05,
991
+ "loss": 0.3425,
992
+ "step": 1250
993
+ },
994
+ {
995
+ "epoch": 4.7,
996
+ "grad_norm": 1.4190607070922852,
997
+ "learning_rate": 5.305970149253732e-05,
998
+ "loss": 0.2219,
999
  "step": 1260
1000
  },
1001
  {
1002
+ "epoch": 4.74,
1003
+ "grad_norm": 5.066045761108398,
1004
+ "learning_rate": 5.268656716417911e-05,
1005
+ "loss": 0.3447,
1006
+ "step": 1270
1007
+ },
1008
+ {
1009
+ "epoch": 4.78,
1010
+ "grad_norm": 4.2649126052856445,
1011
+ "learning_rate": 5.2313432835820895e-05,
1012
+ "loss": 0.3931,
1013
  "step": 1280
1014
  },
1015
  {
1016
+ "epoch": 4.81,
1017
+ "grad_norm": 5.704684734344482,
1018
+ "learning_rate": 5.197761194029851e-05,
1019
+ "loss": 0.4274,
1020
+ "step": 1290
1021
+ },
1022
+ {
1023
+ "epoch": 4.85,
1024
+ "grad_norm": 6.395939350128174,
1025
+ "learning_rate": 5.16044776119403e-05,
1026
+ "loss": 0.3162,
1027
  "step": 1300
1028
  },
1029
  {
1030
+ "epoch": 4.85,
1031
+ "eval_accuracy": 0.8541300527240774,
1032
+ "eval_loss": 0.5099390745162964,
1033
+ "eval_runtime": 7.9919,
1034
+ "eval_samples_per_second": 71.197,
1035
+ "eval_steps_per_second": 9.009,
1036
  "step": 1300
1037
  },
1038
  {
1039
+ "epoch": 4.89,
1040
+ "grad_norm": 2.4717729091644287,
1041
+ "learning_rate": 5.123134328358209e-05,
1042
+ "loss": 0.3442,
1043
+ "step": 1310
1044
+ },
1045
+ {
1046
+ "epoch": 4.93,
1047
+ "grad_norm": 0.6504545211791992,
1048
+ "learning_rate": 5.0858208955223885e-05,
1049
+ "loss": 0.3313,
1050
  "step": 1320
1051
  },
1052
  {
1053
+ "epoch": 4.96,
1054
+ "grad_norm": 4.316141128540039,
1055
+ "learning_rate": 5.048507462686567e-05,
1056
+ "loss": 0.3787,
1057
+ "step": 1330
1058
+ },
1059
+ {
1060
+ "epoch": 5.0,
1061
+ "grad_norm": 4.9243998527526855,
1062
+ "learning_rate": 5.011194029850746e-05,
1063
+ "loss": 0.38,
1064
  "step": 1340
1065
  },
1066
+ {
1067
+ "epoch": 5.04,
1068
+ "grad_norm": 5.312038421630859,
1069
+ "learning_rate": 4.973880597014925e-05,
1070
+ "loss": 0.3268,
1071
+ "step": 1350
1072
+ },
1073
+ {
1074
+ "epoch": 5.07,
1075
+ "grad_norm": 3.5483176708221436,
1076
+ "learning_rate": 4.9365671641791045e-05,
1077
+ "loss": 0.3423,
1078
+ "step": 1360
1079
+ },
1080
+ {
1081
+ "epoch": 5.11,
1082
+ "grad_norm": 4.414547920227051,
1083
+ "learning_rate": 4.899253731343284e-05,
1084
+ "loss": 0.2421,
1085
+ "step": 1370
1086
+ },
1087
+ {
1088
+ "epoch": 5.15,
1089
+ "grad_norm": 5.7323689460754395,
1090
+ "learning_rate": 4.861940298507463e-05,
1091
+ "loss": 0.2795,
1092
+ "step": 1380
1093
+ },
1094
+ {
1095
+ "epoch": 5.19,
1096
+ "grad_norm": 4.2763471603393555,
1097
+ "learning_rate": 4.824626865671642e-05,
1098
+ "loss": 0.2402,
1099
+ "step": 1390
1100
+ },
1101
+ {
1102
+ "epoch": 5.22,
1103
+ "grad_norm": 9.259199142456055,
1104
+ "learning_rate": 4.787313432835821e-05,
1105
+ "loss": 0.3793,
1106
+ "step": 1400
1107
+ },
1108
+ {
1109
+ "epoch": 5.22,
1110
+ "eval_accuracy": 0.843585237258348,
1111
+ "eval_loss": 0.5190387964248657,
1112
+ "eval_runtime": 7.7562,
1113
+ "eval_samples_per_second": 73.361,
1114
+ "eval_steps_per_second": 9.283,
1115
+ "step": 1400
1116
+ },
1117
+ {
1118
+ "epoch": 5.26,
1119
+ "grad_norm": 4.773892402648926,
1120
+ "learning_rate": 4.75e-05,
1121
+ "loss": 0.3476,
1122
+ "step": 1410
1123
+ },
1124
+ {
1125
+ "epoch": 5.3,
1126
+ "grad_norm": 1.1271159648895264,
1127
+ "learning_rate": 4.7126865671641794e-05,
1128
+ "loss": 0.1949,
1129
+ "step": 1420
1130
+ },
1131
+ {
1132
+ "epoch": 5.34,
1133
+ "grad_norm": 2.823958158493042,
1134
+ "learning_rate": 4.6753731343283586e-05,
1135
+ "loss": 0.3009,
1136
+ "step": 1430
1137
+ },
1138
+ {
1139
+ "epoch": 5.37,
1140
+ "grad_norm": 0.35977163910865784,
1141
+ "learning_rate": 4.638059701492538e-05,
1142
+ "loss": 0.1821,
1143
+ "step": 1440
1144
+ },
1145
+ {
1146
+ "epoch": 5.41,
1147
+ "grad_norm": 3.380308151245117,
1148
+ "learning_rate": 4.600746268656716e-05,
1149
+ "loss": 0.323,
1150
+ "step": 1450
1151
+ },
1152
+ {
1153
+ "epoch": 5.45,
1154
+ "grad_norm": 5.946179389953613,
1155
+ "learning_rate": 4.5634328358208954e-05,
1156
+ "loss": 0.5344,
1157
+ "step": 1460
1158
+ },
1159
+ {
1160
+ "epoch": 5.49,
1161
+ "grad_norm": 8.254781723022461,
1162
+ "learning_rate": 4.526119402985075e-05,
1163
+ "loss": 0.2799,
1164
+ "step": 1470
1165
+ },
1166
+ {
1167
+ "epoch": 5.52,
1168
+ "grad_norm": 6.808130741119385,
1169
+ "learning_rate": 4.4888059701492544e-05,
1170
+ "loss": 0.3173,
1171
+ "step": 1480
1172
+ },
1173
+ {
1174
+ "epoch": 5.56,
1175
+ "grad_norm": 17.452037811279297,
1176
+ "learning_rate": 4.451492537313433e-05,
1177
+ "loss": 0.3251,
1178
+ "step": 1490
1179
+ },
1180
+ {
1181
+ "epoch": 5.6,
1182
+ "grad_norm": 2.3097095489501953,
1183
+ "learning_rate": 4.414179104477612e-05,
1184
+ "loss": 0.3228,
1185
+ "step": 1500
1186
+ },
1187
+ {
1188
+ "epoch": 5.6,
1189
+ "eval_accuracy": 0.8576449912126538,
1190
+ "eval_loss": 0.4589254856109619,
1191
+ "eval_runtime": 8.0547,
1192
+ "eval_samples_per_second": 70.642,
1193
+ "eval_steps_per_second": 8.939,
1194
+ "step": 1500
1195
+ },
1196
+ {
1197
+ "epoch": 5.63,
1198
+ "grad_norm": 3.337970018386841,
1199
+ "learning_rate": 4.376865671641791e-05,
1200
+ "loss": 0.2528,
1201
+ "step": 1510
1202
+ },
1203
+ {
1204
+ "epoch": 5.67,
1205
+ "grad_norm": 0.5921415090560913,
1206
+ "learning_rate": 4.33955223880597e-05,
1207
+ "loss": 0.2459,
1208
+ "step": 1520
1209
+ },
1210
+ {
1211
+ "epoch": 5.71,
1212
+ "grad_norm": 4.148998260498047,
1213
+ "learning_rate": 4.3022388059701495e-05,
1214
+ "loss": 0.2927,
1215
+ "step": 1530
1216
+ },
1217
+ {
1218
+ "epoch": 5.75,
1219
+ "grad_norm": 5.740537166595459,
1220
+ "learning_rate": 4.2649253731343286e-05,
1221
+ "loss": 0.423,
1222
+ "step": 1540
1223
+ },
1224
+ {
1225
+ "epoch": 5.78,
1226
+ "grad_norm": 5.316250324249268,
1227
+ "learning_rate": 4.227611940298508e-05,
1228
+ "loss": 0.3735,
1229
+ "step": 1550
1230
+ },
1231
+ {
1232
+ "epoch": 5.82,
1233
+ "grad_norm": 5.52378511428833,
1234
+ "learning_rate": 4.190298507462686e-05,
1235
+ "loss": 0.3613,
1236
+ "step": 1560
1237
+ },
1238
+ {
1239
+ "epoch": 5.86,
1240
+ "grad_norm": 2.1002511978149414,
1241
+ "learning_rate": 4.152985074626866e-05,
1242
+ "loss": 0.259,
1243
+ "step": 1570
1244
+ },
1245
+ {
1246
+ "epoch": 5.9,
1247
+ "grad_norm": 5.339119911193848,
1248
+ "learning_rate": 4.115671641791045e-05,
1249
+ "loss": 0.3355,
1250
+ "step": 1580
1251
+ },
1252
+ {
1253
+ "epoch": 5.93,
1254
+ "grad_norm": 3.0551536083221436,
1255
+ "learning_rate": 4.0783582089552244e-05,
1256
+ "loss": 0.4342,
1257
+ "step": 1590
1258
+ },
1259
+ {
1260
+ "epoch": 5.97,
1261
+ "grad_norm": 6.549235820770264,
1262
+ "learning_rate": 4.041044776119403e-05,
1263
+ "loss": 0.1795,
1264
+ "step": 1600
1265
+ },
1266
+ {
1267
+ "epoch": 5.97,
1268
+ "eval_accuracy": 0.8488576449912126,
1269
+ "eval_loss": 0.5095508694648743,
1270
+ "eval_runtime": 7.7872,
1271
+ "eval_samples_per_second": 73.068,
1272
+ "eval_steps_per_second": 9.246,
1273
+ "step": 1600
1274
+ },
1275
+ {
1276
+ "epoch": 6.01,
1277
+ "grad_norm": 11.5170316696167,
1278
+ "learning_rate": 4.003731343283582e-05,
1279
+ "loss": 0.3778,
1280
+ "step": 1610
1281
+ },
1282
+ {
1283
+ "epoch": 6.04,
1284
+ "grad_norm": 6.004143238067627,
1285
+ "learning_rate": 3.966417910447761e-05,
1286
+ "loss": 0.3624,
1287
+ "step": 1620
1288
+ },
1289
+ {
1290
+ "epoch": 6.08,
1291
+ "grad_norm": 4.328847885131836,
1292
+ "learning_rate": 3.9291044776119404e-05,
1293
+ "loss": 0.3478,
1294
+ "step": 1630
1295
+ },
1296
+ {
1297
+ "epoch": 6.12,
1298
+ "grad_norm": 3.5757558345794678,
1299
+ "learning_rate": 3.8917910447761195e-05,
1300
+ "loss": 0.2208,
1301
+ "step": 1640
1302
+ },
1303
+ {
1304
+ "epoch": 6.16,
1305
+ "grad_norm": 8.37783432006836,
1306
+ "learning_rate": 3.854477611940299e-05,
1307
+ "loss": 0.3614,
1308
+ "step": 1650
1309
+ },
1310
+ {
1311
+ "epoch": 6.19,
1312
+ "grad_norm": 2.4890713691711426,
1313
+ "learning_rate": 3.817164179104478e-05,
1314
+ "loss": 0.2514,
1315
+ "step": 1660
1316
+ },
1317
+ {
1318
+ "epoch": 6.23,
1319
+ "grad_norm": 8.873276710510254,
1320
+ "learning_rate": 3.7798507462686563e-05,
1321
+ "loss": 0.2233,
1322
+ "step": 1670
1323
+ },
1324
+ {
1325
+ "epoch": 6.27,
1326
+ "grad_norm": 0.29393309354782104,
1327
+ "learning_rate": 3.742537313432836e-05,
1328
+ "loss": 0.2474,
1329
+ "step": 1680
1330
+ },
1331
+ {
1332
+ "epoch": 6.31,
1333
+ "grad_norm": 3.810150384902954,
1334
+ "learning_rate": 3.7052238805970153e-05,
1335
+ "loss": 0.2481,
1336
+ "step": 1690
1337
+ },
1338
+ {
1339
+ "epoch": 6.34,
1340
+ "grad_norm": 1.989057183265686,
1341
+ "learning_rate": 3.6679104477611945e-05,
1342
+ "loss": 0.2626,
1343
+ "step": 1700
1344
+ },
1345
+ {
1346
+ "epoch": 6.34,
1347
+ "eval_accuracy": 0.8488576449912126,
1348
+ "eval_loss": 0.5402765274047852,
1349
+ "eval_runtime": 7.9293,
1350
+ "eval_samples_per_second": 71.759,
1351
+ "eval_steps_per_second": 9.08,
1352
+ "step": 1700
1353
+ },
1354
+ {
1355
+ "epoch": 6.38,
1356
+ "grad_norm": 8.488819122314453,
1357
+ "learning_rate": 3.630597014925373e-05,
1358
+ "loss": 0.2826,
1359
+ "step": 1710
1360
+ },
1361
+ {
1362
+ "epoch": 6.42,
1363
+ "grad_norm": 5.542993068695068,
1364
+ "learning_rate": 3.593283582089552e-05,
1365
+ "loss": 0.3552,
1366
+ "step": 1720
1367
+ },
1368
+ {
1369
+ "epoch": 6.46,
1370
+ "grad_norm": 6.646905422210693,
1371
+ "learning_rate": 3.555970149253732e-05,
1372
+ "loss": 0.4405,
1373
+ "step": 1730
1374
+ },
1375
+ {
1376
+ "epoch": 6.49,
1377
+ "grad_norm": 4.022976398468018,
1378
+ "learning_rate": 3.5186567164179105e-05,
1379
+ "loss": 0.2738,
1380
+ "step": 1740
1381
+ },
1382
+ {
1383
+ "epoch": 6.53,
1384
+ "grad_norm": 3.5472657680511475,
1385
+ "learning_rate": 3.4813432835820896e-05,
1386
+ "loss": 0.2807,
1387
+ "step": 1750
1388
+ },
1389
+ {
1390
+ "epoch": 6.57,
1391
+ "grad_norm": 12.070052146911621,
1392
+ "learning_rate": 3.444029850746269e-05,
1393
+ "loss": 0.3634,
1394
+ "step": 1760
1395
+ },
1396
+ {
1397
+ "epoch": 6.6,
1398
+ "grad_norm": 5.368374347686768,
1399
+ "learning_rate": 3.406716417910448e-05,
1400
+ "loss": 0.3252,
1401
+ "step": 1770
1402
+ },
1403
+ {
1404
+ "epoch": 6.64,
1405
+ "grad_norm": 5.566130638122559,
1406
+ "learning_rate": 3.369402985074627e-05,
1407
+ "loss": 0.3034,
1408
+ "step": 1780
1409
+ },
1410
+ {
1411
+ "epoch": 6.68,
1412
+ "grad_norm": 5.875336170196533,
1413
+ "learning_rate": 3.332089552238806e-05,
1414
+ "loss": 0.3406,
1415
+ "step": 1790
1416
+ },
1417
+ {
1418
+ "epoch": 6.72,
1419
+ "grad_norm": 2.4168920516967773,
1420
+ "learning_rate": 3.2947761194029854e-05,
1421
+ "loss": 0.3041,
1422
+ "step": 1800
1423
+ },
1424
+ {
1425
+ "epoch": 6.72,
1426
+ "eval_accuracy": 0.8488576449912126,
1427
+ "eval_loss": 0.4907586872577667,
1428
+ "eval_runtime": 7.8209,
1429
+ "eval_samples_per_second": 72.754,
1430
+ "eval_steps_per_second": 9.206,
1431
+ "step": 1800
1432
+ },
1433
+ {
1434
+ "epoch": 6.75,
1435
+ "grad_norm": 3.1040282249450684,
1436
+ "learning_rate": 3.2574626865671646e-05,
1437
+ "loss": 0.3167,
1438
+ "step": 1810
1439
+ },
1440
+ {
1441
+ "epoch": 6.79,
1442
+ "grad_norm": 1.8458846807479858,
1443
+ "learning_rate": 3.220149253731343e-05,
1444
+ "loss": 0.2061,
1445
+ "step": 1820
1446
+ },
1447
+ {
1448
+ "epoch": 6.83,
1449
+ "grad_norm": 0.4053177833557129,
1450
+ "learning_rate": 3.182835820895523e-05,
1451
+ "loss": 0.3113,
1452
+ "step": 1830
1453
+ },
1454
+ {
1455
+ "epoch": 6.87,
1456
+ "grad_norm": 0.23064230382442474,
1457
+ "learning_rate": 3.145522388059702e-05,
1458
+ "loss": 0.2368,
1459
+ "step": 1840
1460
+ },
1461
+ {
1462
+ "epoch": 6.9,
1463
+ "grad_norm": 1.006479263305664,
1464
+ "learning_rate": 3.1082089552238805e-05,
1465
+ "loss": 0.2265,
1466
+ "step": 1850
1467
+ },
1468
+ {
1469
+ "epoch": 6.94,
1470
+ "grad_norm": 4.072957992553711,
1471
+ "learning_rate": 3.07089552238806e-05,
1472
+ "loss": 0.2976,
1473
+ "step": 1860
1474
+ },
1475
+ {
1476
+ "epoch": 6.98,
1477
+ "grad_norm": 16.575963973999023,
1478
+ "learning_rate": 3.033582089552239e-05,
1479
+ "loss": 0.1504,
1480
+ "step": 1870
1481
+ },
1482
+ {
1483
+ "epoch": 7.01,
1484
+ "grad_norm": 2.9144656658172607,
1485
+ "learning_rate": 2.9962686567164183e-05,
1486
+ "loss": 0.2156,
1487
+ "step": 1880
1488
+ },
1489
+ {
1490
+ "epoch": 7.05,
1491
+ "grad_norm": 4.547207832336426,
1492
+ "learning_rate": 2.958955223880597e-05,
1493
+ "loss": 0.2693,
1494
+ "step": 1890
1495
+ },
1496
+ {
1497
+ "epoch": 7.09,
1498
+ "grad_norm": 0.5566532611846924,
1499
+ "learning_rate": 2.9216417910447763e-05,
1500
+ "loss": 0.1831,
1501
+ "step": 1900
1502
+ },
1503
+ {
1504
+ "epoch": 7.09,
1505
+ "eval_accuracy": 0.8383128295254832,
1506
+ "eval_loss": 0.5721341967582703,
1507
+ "eval_runtime": 7.7377,
1508
+ "eval_samples_per_second": 73.536,
1509
+ "eval_steps_per_second": 9.305,
1510
+ "step": 1900
1511
+ },
1512
+ {
1513
+ "epoch": 7.13,
1514
+ "grad_norm": 7.9241838455200195,
1515
+ "learning_rate": 2.8843283582089555e-05,
1516
+ "loss": 0.3037,
1517
+ "step": 1910
1518
+ },
1519
+ {
1520
+ "epoch": 7.16,
1521
+ "grad_norm": 4.847833156585693,
1522
+ "learning_rate": 2.8470149253731343e-05,
1523
+ "loss": 0.2744,
1524
+ "step": 1920
1525
+ },
1526
+ {
1527
+ "epoch": 7.2,
1528
+ "grad_norm": 4.368974208831787,
1529
+ "learning_rate": 2.8097014925373134e-05,
1530
+ "loss": 0.1603,
1531
+ "step": 1930
1532
+ },
1533
+ {
1534
+ "epoch": 7.24,
1535
+ "grad_norm": 5.848027229309082,
1536
+ "learning_rate": 2.772388059701493e-05,
1537
+ "loss": 0.3318,
1538
+ "step": 1940
1539
+ },
1540
+ {
1541
+ "epoch": 7.28,
1542
+ "grad_norm": 5.53363037109375,
1543
+ "learning_rate": 2.7350746268656718e-05,
1544
+ "loss": 0.2568,
1545
+ "step": 1950
1546
+ },
1547
+ {
1548
+ "epoch": 7.31,
1549
+ "grad_norm": 1.3791863918304443,
1550
+ "learning_rate": 2.697761194029851e-05,
1551
+ "loss": 0.2186,
1552
+ "step": 1960
1553
+ },
1554
+ {
1555
+ "epoch": 7.35,
1556
+ "grad_norm": 13.533841133117676,
1557
+ "learning_rate": 2.6604477611940297e-05,
1558
+ "loss": 0.2772,
1559
+ "step": 1970
1560
+ },
1561
+ {
1562
+ "epoch": 7.39,
1563
+ "grad_norm": 1.113595962524414,
1564
+ "learning_rate": 2.623134328358209e-05,
1565
+ "loss": 0.3396,
1566
+ "step": 1980
1567
+ },
1568
+ {
1569
+ "epoch": 7.43,
1570
+ "grad_norm": 3.193376064300537,
1571
+ "learning_rate": 2.5858208955223884e-05,
1572
+ "loss": 0.2171,
1573
+ "step": 1990
1574
+ },
1575
+ {
1576
+ "epoch": 7.46,
1577
+ "grad_norm": 2.8687243461608887,
1578
+ "learning_rate": 2.5485074626865672e-05,
1579
+ "loss": 0.2275,
1580
+ "step": 2000
1581
+ },
1582
+ {
1583
+ "epoch": 7.46,
1584
+ "eval_accuracy": 0.8312829525483304,
1585
+ "eval_loss": 0.5349107980728149,
1586
+ "eval_runtime": 8.0113,
1587
+ "eval_samples_per_second": 71.025,
1588
+ "eval_steps_per_second": 8.987,
1589
+ "step": 2000
1590
+ },
1591
+ {
1592
+ "epoch": 7.5,
1593
+ "grad_norm": 6.330258846282959,
1594
+ "learning_rate": 2.5111940298507464e-05,
1595
+ "loss": 0.2165,
1596
+ "step": 2010
1597
+ },
1598
+ {
1599
+ "epoch": 7.54,
1600
+ "grad_norm": 2.457519769668579,
1601
+ "learning_rate": 2.4738805970149252e-05,
1602
+ "loss": 0.3275,
1603
+ "step": 2020
1604
+ },
1605
+ {
1606
+ "epoch": 7.57,
1607
+ "grad_norm": 1.468772053718567,
1608
+ "learning_rate": 2.4365671641791047e-05,
1609
+ "loss": 0.186,
1610
+ "step": 2030
1611
+ },
1612
+ {
1613
+ "epoch": 7.61,
1614
+ "grad_norm": 4.308888912200928,
1615
+ "learning_rate": 2.3992537313432835e-05,
1616
+ "loss": 0.3182,
1617
+ "step": 2040
1618
+ },
1619
+ {
1620
+ "epoch": 7.65,
1621
+ "grad_norm": 1.8849867582321167,
1622
+ "learning_rate": 2.361940298507463e-05,
1623
+ "loss": 0.2631,
1624
+ "step": 2050
1625
+ },
1626
+ {
1627
+ "epoch": 7.69,
1628
+ "grad_norm": 2.6795170307159424,
1629
+ "learning_rate": 2.3246268656716418e-05,
1630
+ "loss": 0.1724,
1631
+ "step": 2060
1632
+ },
1633
+ {
1634
+ "epoch": 7.72,
1635
+ "grad_norm": 0.22702960669994354,
1636
+ "learning_rate": 2.287313432835821e-05,
1637
+ "loss": 0.2542,
1638
+ "step": 2070
1639
+ },
1640
+ {
1641
+ "epoch": 7.76,
1642
+ "grad_norm": 4.6633429527282715,
1643
+ "learning_rate": 2.25e-05,
1644
+ "loss": 0.259,
1645
+ "step": 2080
1646
+ },
1647
+ {
1648
+ "epoch": 7.8,
1649
+ "grad_norm": 6.543178558349609,
1650
+ "learning_rate": 2.2126865671641793e-05,
1651
+ "loss": 0.3752,
1652
+ "step": 2090
1653
+ },
1654
+ {
1655
+ "epoch": 7.84,
1656
+ "grad_norm": 7.109080791473389,
1657
+ "learning_rate": 2.1753731343283585e-05,
1658
+ "loss": 0.1762,
1659
+ "step": 2100
1660
+ },
1661
+ {
1662
+ "epoch": 7.84,
1663
+ "eval_accuracy": 0.8541300527240774,
1664
+ "eval_loss": 0.5203543901443481,
1665
+ "eval_runtime": 7.8922,
1666
+ "eval_samples_per_second": 72.096,
1667
+ "eval_steps_per_second": 9.123,
1668
+ "step": 2100
1669
+ },
1670
+ {
1671
+ "epoch": 7.87,
1672
+ "grad_norm": 3.3965115547180176,
1673
+ "learning_rate": 2.1380597014925373e-05,
1674
+ "loss": 0.1965,
1675
+ "step": 2110
1676
+ },
1677
+ {
1678
+ "epoch": 7.91,
1679
+ "grad_norm": 0.1386798918247223,
1680
+ "learning_rate": 2.1007462686567164e-05,
1681
+ "loss": 0.1448,
1682
+ "step": 2120
1683
+ },
1684
+ {
1685
+ "epoch": 7.95,
1686
+ "grad_norm": 8.268773078918457,
1687
+ "learning_rate": 2.0634328358208956e-05,
1688
+ "loss": 0.2203,
1689
+ "step": 2130
1690
+ },
1691
+ {
1692
+ "epoch": 7.99,
1693
+ "grad_norm": 2.712890625,
1694
+ "learning_rate": 2.0261194029850748e-05,
1695
+ "loss": 0.2104,
1696
+ "step": 2140
1697
+ },
1698
+ {
1699
+ "epoch": 8.02,
1700
+ "grad_norm": 2.0390050411224365,
1701
+ "learning_rate": 1.988805970149254e-05,
1702
+ "loss": 0.2063,
1703
+ "step": 2150
1704
+ },
1705
+ {
1706
+ "epoch": 8.06,
1707
+ "grad_norm": 4.355598449707031,
1708
+ "learning_rate": 1.951492537313433e-05,
1709
+ "loss": 0.1356,
1710
+ "step": 2160
1711
+ },
1712
+ {
1713
+ "epoch": 8.1,
1714
+ "grad_norm": 9.854630470275879,
1715
+ "learning_rate": 1.914179104477612e-05,
1716
+ "loss": 0.1686,
1717
+ "step": 2170
1718
+ },
1719
+ {
1720
+ "epoch": 8.13,
1721
+ "grad_norm": 4.178330421447754,
1722
+ "learning_rate": 1.8768656716417914e-05,
1723
+ "loss": 0.2578,
1724
+ "step": 2180
1725
+ },
1726
+ {
1727
+ "epoch": 8.17,
1728
+ "grad_norm": 5.019784450531006,
1729
+ "learning_rate": 1.8395522388059702e-05,
1730
+ "loss": 0.1923,
1731
+ "step": 2190
1732
+ },
1733
+ {
1734
+ "epoch": 8.21,
1735
+ "grad_norm": 3.8136210441589355,
1736
+ "learning_rate": 1.8022388059701494e-05,
1737
+ "loss": 0.2112,
1738
+ "step": 2200
1739
+ },
1740
+ {
1741
+ "epoch": 8.21,
1742
+ "eval_accuracy": 0.8629173989455184,
1743
+ "eval_loss": 0.5188840627670288,
1744
+ "eval_runtime": 8.1412,
1745
+ "eval_samples_per_second": 69.891,
1746
+ "eval_steps_per_second": 8.844,
1747
+ "step": 2200
1748
+ },
1749
+ {
1750
+ "epoch": 8.25,
1751
+ "grad_norm": 2.7035305500030518,
1752
+ "learning_rate": 1.7649253731343285e-05,
1753
+ "loss": 0.2501,
1754
+ "step": 2210
1755
+ },
1756
+ {
1757
+ "epoch": 8.28,
1758
+ "grad_norm": 6.736306190490723,
1759
+ "learning_rate": 1.7276119402985073e-05,
1760
+ "loss": 0.2213,
1761
+ "step": 2220
1762
+ },
1763
+ {
1764
+ "epoch": 8.32,
1765
+ "grad_norm": 3.0436556339263916,
1766
+ "learning_rate": 1.690298507462687e-05,
1767
+ "loss": 0.1285,
1768
+ "step": 2230
1769
+ },
1770
+ {
1771
+ "epoch": 8.36,
1772
+ "grad_norm": 4.729572772979736,
1773
+ "learning_rate": 1.6529850746268657e-05,
1774
+ "loss": 0.2984,
1775
+ "step": 2240
1776
+ },
1777
+ {
1778
+ "epoch": 8.4,
1779
+ "grad_norm": 3.6665098667144775,
1780
+ "learning_rate": 1.6156716417910448e-05,
1781
+ "loss": 0.1796,
1782
+ "step": 2250
1783
+ },
1784
+ {
1785
+ "epoch": 8.43,
1786
+ "grad_norm": 8.485068321228027,
1787
+ "learning_rate": 1.578358208955224e-05,
1788
+ "loss": 0.2137,
1789
+ "step": 2260
1790
+ },
1791
+ {
1792
+ "epoch": 8.47,
1793
+ "grad_norm": 4.643974304199219,
1794
+ "learning_rate": 1.541044776119403e-05,
1795
+ "loss": 0.3009,
1796
+ "step": 2270
1797
+ },
1798
+ {
1799
+ "epoch": 8.51,
1800
+ "grad_norm": 2.91859769821167,
1801
+ "learning_rate": 1.5037313432835823e-05,
1802
+ "loss": 0.1855,
1803
+ "step": 2280
1804
+ },
1805
+ {
1806
+ "epoch": 8.54,
1807
+ "grad_norm": 9.799684524536133,
1808
+ "learning_rate": 1.4664179104477613e-05,
1809
+ "loss": 0.2186,
1810
+ "step": 2290
1811
+ },
1812
+ {
1813
+ "epoch": 8.58,
1814
+ "grad_norm": 4.92659330368042,
1815
+ "learning_rate": 1.4291044776119403e-05,
1816
+ "loss": 0.1242,
1817
+ "step": 2300
1818
+ },
1819
+ {
1820
+ "epoch": 8.58,
1821
+ "eval_accuracy": 0.8471001757469244,
1822
+ "eval_loss": 0.5376706123352051,
1823
+ "eval_runtime": 7.8653,
1824
+ "eval_samples_per_second": 72.343,
1825
+ "eval_steps_per_second": 9.154,
1826
+ "step": 2300
1827
+ },
1828
+ {
1829
+ "epoch": 8.62,
1830
+ "grad_norm": 0.7728621363639832,
1831
+ "learning_rate": 1.3917910447761196e-05,
1832
+ "loss": 0.2769,
1833
+ "step": 2310
1834
+ },
1835
+ {
1836
+ "epoch": 8.66,
1837
+ "grad_norm": 3.757192373275757,
1838
+ "learning_rate": 1.3544776119402986e-05,
1839
+ "loss": 0.31,
1840
+ "step": 2320
1841
+ },
1842
+ {
1843
+ "epoch": 8.69,
1844
+ "grad_norm": 5.901330471038818,
1845
+ "learning_rate": 1.3171641791044777e-05,
1846
+ "loss": 0.2488,
1847
+ "step": 2330
1848
+ },
1849
+ {
1850
+ "epoch": 8.73,
1851
+ "grad_norm": 0.1360226422548294,
1852
+ "learning_rate": 1.2798507462686567e-05,
1853
+ "loss": 0.2359,
1854
+ "step": 2340
1855
+ },
1856
+ {
1857
+ "epoch": 8.77,
1858
+ "grad_norm": 5.801501750946045,
1859
+ "learning_rate": 1.2425373134328359e-05,
1860
+ "loss": 0.23,
1861
+ "step": 2350
1862
+ },
1863
+ {
1864
+ "epoch": 8.81,
1865
+ "grad_norm": 3.3060359954833984,
1866
+ "learning_rate": 1.2052238805970149e-05,
1867
+ "loss": 0.1114,
1868
+ "step": 2360
1869
+ },
1870
+ {
1871
+ "epoch": 8.84,
1872
+ "grad_norm": 2.0813100337982178,
1873
+ "learning_rate": 1.167910447761194e-05,
1874
+ "loss": 0.1569,
1875
+ "step": 2370
1876
+ },
1877
+ {
1878
+ "epoch": 8.88,
1879
+ "grad_norm": 0.42951256036758423,
1880
+ "learning_rate": 1.1305970149253732e-05,
1881
+ "loss": 0.2636,
1882
+ "step": 2380
1883
+ },
1884
+ {
1885
+ "epoch": 8.92,
1886
+ "grad_norm": 3.2714788913726807,
1887
+ "learning_rate": 1.0932835820895524e-05,
1888
+ "loss": 0.2197,
1889
+ "step": 2390
1890
+ },
1891
+ {
1892
+ "epoch": 8.96,
1893
+ "grad_norm": 4.24855375289917,
1894
+ "learning_rate": 1.0559701492537313e-05,
1895
+ "loss": 0.1207,
1896
+ "step": 2400
1897
+ },
1898
+ {
1899
+ "epoch": 8.96,
1900
+ "eval_accuracy": 0.8558875219683656,
1901
+ "eval_loss": 0.5324714779853821,
1902
+ "eval_runtime": 7.9022,
1903
+ "eval_samples_per_second": 72.006,
1904
+ "eval_steps_per_second": 9.111,
1905
+ "step": 2400
1906
+ },
1907
+ {
1908
+ "epoch": 8.99,
1909
+ "grad_norm": 3.989713430404663,
1910
+ "learning_rate": 1.0186567164179105e-05,
1911
+ "loss": 0.2336,
1912
+ "step": 2410
1913
+ },
1914
+ {
1915
+ "epoch": 9.03,
1916
+ "grad_norm": 5.590869903564453,
1917
+ "learning_rate": 9.813432835820897e-06,
1918
+ "loss": 0.2292,
1919
+ "step": 2420
1920
+ },
1921
+ {
1922
+ "epoch": 9.07,
1923
+ "grad_norm": 3.405966281890869,
1924
+ "learning_rate": 9.440298507462688e-06,
1925
+ "loss": 0.1654,
1926
+ "step": 2430
1927
+ },
1928
+ {
1929
+ "epoch": 9.1,
1930
+ "grad_norm": 3.733381986618042,
1931
+ "learning_rate": 9.067164179104478e-06,
1932
+ "loss": 0.2104,
1933
+ "step": 2440
1934
+ },
1935
+ {
1936
+ "epoch": 9.14,
1937
+ "grad_norm": 0.1994183361530304,
1938
+ "learning_rate": 8.694029850746268e-06,
1939
+ "loss": 0.0789,
1940
+ "step": 2450
1941
+ },
1942
+ {
1943
+ "epoch": 9.18,
1944
+ "grad_norm": 7.948019504547119,
1945
+ "learning_rate": 8.32089552238806e-06,
1946
+ "loss": 0.3335,
1947
+ "step": 2460
1948
+ },
1949
+ {
1950
+ "epoch": 9.22,
1951
+ "grad_norm": 3.020522117614746,
1952
+ "learning_rate": 7.947761194029851e-06,
1953
+ "loss": 0.1838,
1954
+ "step": 2470
1955
+ },
1956
+ {
1957
+ "epoch": 9.25,
1958
+ "grad_norm": 2.4797592163085938,
1959
+ "learning_rate": 7.574626865671643e-06,
1960
+ "loss": 0.1573,
1961
+ "step": 2480
1962
+ },
1963
+ {
1964
+ "epoch": 9.29,
1965
+ "grad_norm": 0.7854322195053101,
1966
+ "learning_rate": 7.201492537313433e-06,
1967
+ "loss": 0.1868,
1968
+ "step": 2490
1969
+ },
1970
+ {
1971
+ "epoch": 9.33,
1972
+ "grad_norm": 8.424530982971191,
1973
+ "learning_rate": 6.828358208955224e-06,
1974
+ "loss": 0.1806,
1975
+ "step": 2500
1976
+ },
1977
+ {
1978
+ "epoch": 9.33,
1979
+ "eval_accuracy": 0.8646748681898067,
1980
+ "eval_loss": 0.5149648785591125,
1981
+ "eval_runtime": 7.8422,
1982
+ "eval_samples_per_second": 72.556,
1983
+ "eval_steps_per_second": 9.181,
1984
+ "step": 2500
1985
+ },
1986
+ {
1987
+ "epoch": 9.37,
1988
+ "grad_norm": 2.9176523685455322,
1989
+ "learning_rate": 6.455223880597015e-06,
1990
+ "loss": 0.1977,
1991
+ "step": 2510
1992
+ },
1993
+ {
1994
+ "epoch": 9.4,
1995
+ "grad_norm": 4.15384578704834,
1996
+ "learning_rate": 6.082089552238806e-06,
1997
+ "loss": 0.2007,
1998
+ "step": 2520
1999
+ },
2000
+ {
2001
+ "epoch": 9.44,
2002
+ "grad_norm": 2.4758641719818115,
2003
+ "learning_rate": 5.708955223880597e-06,
2004
+ "loss": 0.2,
2005
+ "step": 2530
2006
+ },
2007
+ {
2008
+ "epoch": 9.48,
2009
+ "grad_norm": 4.053123950958252,
2010
+ "learning_rate": 5.335820895522389e-06,
2011
+ "loss": 0.2514,
2012
+ "step": 2540
2013
+ },
2014
+ {
2015
+ "epoch": 9.51,
2016
+ "grad_norm": 2.3916337490081787,
2017
+ "learning_rate": 4.9626865671641796e-06,
2018
+ "loss": 0.2104,
2019
+ "step": 2550
2020
+ },
2021
+ {
2022
+ "epoch": 9.55,
2023
+ "grad_norm": 4.113661766052246,
2024
+ "learning_rate": 4.58955223880597e-06,
2025
+ "loss": 0.1998,
2026
+ "step": 2560
2027
+ },
2028
+ {
2029
+ "epoch": 9.59,
2030
+ "grad_norm": 3.558722972869873,
2031
+ "learning_rate": 4.216417910447761e-06,
2032
+ "loss": 0.144,
2033
+ "step": 2570
2034
+ },
2035
+ {
2036
+ "epoch": 9.63,
2037
+ "grad_norm": 2.689765691757202,
2038
+ "learning_rate": 3.843283582089553e-06,
2039
+ "loss": 0.1691,
2040
+ "step": 2580
2041
+ },
2042
+ {
2043
+ "epoch": 9.66,
2044
+ "grad_norm": 4.95484733581543,
2045
+ "learning_rate": 3.4701492537313434e-06,
2046
+ "loss": 0.1875,
2047
+ "step": 2590
2048
+ },
2049
+ {
2050
+ "epoch": 9.7,
2051
+ "grad_norm": 6.025635242462158,
2052
+ "learning_rate": 3.0970149253731345e-06,
2053
+ "loss": 0.1793,
2054
+ "step": 2600
2055
+ },
2056
+ {
2057
+ "epoch": 9.7,
2058
+ "eval_accuracy": 0.8664323374340949,
2059
+ "eval_loss": 0.5153330564498901,
2060
+ "eval_runtime": 7.9144,
2061
+ "eval_samples_per_second": 71.894,
2062
+ "eval_steps_per_second": 9.097,
2063
+ "step": 2600
2064
+ },
2065
+ {
2066
+ "epoch": 9.74,
2067
+ "grad_norm": 0.3092793822288513,
2068
+ "learning_rate": 2.7238805970149257e-06,
2069
+ "loss": 0.1385,
2070
+ "step": 2610
2071
+ },
2072
+ {
2073
+ "epoch": 9.78,
2074
+ "grad_norm": 1.1317028999328613,
2075
+ "learning_rate": 2.3507462686567164e-06,
2076
+ "loss": 0.1628,
2077
+ "step": 2620
2078
+ },
2079
+ {
2080
+ "epoch": 9.81,
2081
+ "grad_norm": 7.642726898193359,
2082
+ "learning_rate": 1.9776119402985076e-06,
2083
+ "loss": 0.2142,
2084
+ "step": 2630
2085
+ },
2086
+ {
2087
+ "epoch": 9.85,
2088
+ "grad_norm": 4.3891191482543945,
2089
+ "learning_rate": 1.6044776119402985e-06,
2090
+ "loss": 0.2115,
2091
+ "step": 2640
2092
+ },
2093
+ {
2094
+ "epoch": 9.89,
2095
+ "grad_norm": 5.876834869384766,
2096
+ "learning_rate": 1.2313432835820897e-06,
2097
+ "loss": 0.2859,
2098
+ "step": 2650
2099
+ },
2100
+ {
2101
+ "epoch": 9.93,
2102
+ "grad_norm": 1.6104581356048584,
2103
+ "learning_rate": 8.582089552238806e-07,
2104
+ "loss": 0.2752,
2105
+ "step": 2660
2106
+ },
2107
+ {
2108
+ "epoch": 9.96,
2109
+ "grad_norm": 5.835386276245117,
2110
+ "learning_rate": 4.850746268656717e-07,
2111
+ "loss": 0.2057,
2112
+ "step": 2670
2113
+ },
2114
+ {
2115
+ "epoch": 10.0,
2116
+ "grad_norm": 7.006475925445557,
2117
+ "learning_rate": 1.119402985074627e-07,
2118
+ "loss": 0.2098,
2119
+ "step": 2680
2120
+ },
2121
  {
2122
  "epoch": 10.0,
2123
+ "step": 2680,
2124
  "total_flos": 3.3230947683690086e+18,
2125
+ "train_loss": 0.45543073504718384,
2126
+ "train_runtime": 1353.2313,
2127
+ "train_samples_per_second": 31.687,
2128
+ "train_steps_per_second": 1.98
2129
  }
2130
  ],
2131
+ "logging_steps": 10,
2132
+ "max_steps": 2680,
2133
  "num_input_tokens_seen": 0,
2134
  "num_train_epochs": 10,
2135
  "save_steps": 100,
2136
  "total_flos": 3.3230947683690086e+18,
2137
+ "train_batch_size": 16,
2138
  "trial_name": null,
2139
  "trial_params": null
2140
  }