davanstrien HF staff commited on
Commit
7b2ef42
1 Parent(s): 885ea28

End of training

Browse files
README.md CHANGED
@@ -3,6 +3,8 @@ library_name: transformers
3
  license: apache-2.0
4
  base_model: timm/mobilenetv3_large_100.miil_in21k
5
  tags:
 
 
6
  - generated_from_trainer
7
  metrics:
8
  - accuracy
@@ -16,10 +18,10 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # test-timm
18
 
19
- This model is a fine-tuned version of [timm/mobilenetv3_large_100.miil_in21k](https://huggingface.co/timm/mobilenetv3_large_100.miil_in21k) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.5000
22
- - Accuracy: 0.7756
23
 
24
  ## Model description
25
 
 
3
  license: apache-2.0
4
  base_model: timm/mobilenetv3_large_100.miil_in21k
5
  tags:
6
+ - image-classification
7
+ - vision
8
  - generated_from_trainer
9
  metrics:
10
  - accuracy
 
18
 
19
  # test-timm
20
 
21
+ This model is a fine-tuned version of [timm/mobilenetv3_large_100.miil_in21k](https://huggingface.co/timm/mobilenetv3_large_100.miil_in21k) on the davanstrien/zenodo-presentations-open-labels dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 0.4904
24
+ - Accuracy: 0.7874
25
 
26
  ## Model description
27
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 5.0,
3
- "eval_accuracy": 0.6811023622047244,
4
- "eval_loss": 0.5247489213943481,
5
- "eval_runtime": 5.1879,
6
- "eval_samples_per_second": 48.96,
7
- "eval_steps_per_second": 6.168,
8
- "total_flos": 7.24330215447552e+16,
9
- "train_loss": 0.5726550849278768,
10
- "train_runtime": 239.6067,
11
- "train_samples_per_second": 29.945,
12
- "train_steps_per_second": 3.756
13
  }
 
1
  {
2
+ "epoch": 50.0,
3
+ "eval_accuracy": 0.7874015748031497,
4
+ "eval_loss": 0.4903908967971802,
5
+ "eval_runtime": 3.9654,
6
+ "eval_samples_per_second": 64.055,
7
+ "eval_steps_per_second": 1.009,
8
+ "total_flos": 2.72467378584576e+17,
9
+ "train_loss": 0.4791690407628598,
10
+ "train_runtime": 1616.3187,
11
+ "train_samples_per_second": 44.391,
12
+ "train_steps_per_second": 0.711
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 5.0,
3
- "eval_accuracy": 0.6811023622047244,
4
- "eval_loss": 0.5247489213943481,
5
- "eval_runtime": 5.1879,
6
- "eval_samples_per_second": 48.96,
7
- "eval_steps_per_second": 6.168
8
  }
 
1
  {
2
+ "epoch": 50.0,
3
+ "eval_accuracy": 0.7874015748031497,
4
+ "eval_loss": 0.4903908967971802,
5
+ "eval_runtime": 3.9654,
6
+ "eval_samples_per_second": 64.055,
7
+ "eval_steps_per_second": 1.009
8
  }
runs/Oct11_09-57-14_ed9849b3ed7e/events.out.tfevents.1728642265.ed9849b3ed7e.18938.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a78099521846bf932e10ca606f522731882787ea3ebe82eb67d5e94cfa500323
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 5.0,
3
- "total_flos": 7.24330215447552e+16,
4
- "train_loss": 0.5726550849278768,
5
- "train_runtime": 239.6067,
6
- "train_samples_per_second": 29.945,
7
- "train_steps_per_second": 3.756
8
  }
 
1
  {
2
+ "epoch": 50.0,
3
+ "total_flos": 2.72467378584576e+17,
4
+ "train_loss": 0.4791690407628598,
5
+ "train_runtime": 1616.3187,
6
+ "train_samples_per_second": 44.391,
7
+ "train_steps_per_second": 0.711
8
  }
trainer_state.json CHANGED
@@ -1,702 +1,1282 @@
1
  {
2
- "best_metric": 0.5247489213943481,
3
- "best_model_checkpoint": "./beans_outputs/checkpoint-720",
4
- "epoch": 5.0,
5
  "eval_steps": 500,
6
- "global_step": 900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.05555555555555555,
13
- "grad_norm": 3.8575210571289062,
14
- "learning_rate": 1.977777777777778e-05,
15
- "loss": 0.6949,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.1111111111111111,
20
- "grad_norm": 3.2087113857269287,
21
- "learning_rate": 1.9555555555555557e-05,
22
- "loss": 0.6848,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.16666666666666666,
27
- "grad_norm": 2.9537899494171143,
28
- "learning_rate": 1.9333333333333333e-05,
29
- "loss": 0.6871,
 
 
 
 
 
 
 
 
 
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.2222222222222222,
34
- "grad_norm": 2.586965560913086,
35
- "learning_rate": 1.9111111111111113e-05,
36
- "loss": 0.6731,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.2777777777777778,
41
- "grad_norm": 3.3346517086029053,
42
- "learning_rate": 1.888888888888889e-05,
43
- "loss": 0.6728,
 
 
 
 
 
 
 
 
 
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.3333333333333333,
48
- "grad_norm": 2.4391684532165527,
49
- "learning_rate": 1.866666666666667e-05,
50
- "loss": 0.6617,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.3888888888888889,
55
- "grad_norm": 1.6900883913040161,
56
- "learning_rate": 1.8444444444444448e-05,
57
- "loss": 0.6671,
 
 
 
 
 
 
 
 
 
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.4444444444444444,
62
- "grad_norm": 2.3360586166381836,
63
- "learning_rate": 1.8222222222222224e-05,
64
- "loss": 0.6341,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.5,
69
- "grad_norm": 2.310093641281128,
70
- "learning_rate": 1.8e-05,
71
- "loss": 0.6446,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.5555555555555556,
76
- "grad_norm": 2.071892499923706,
77
- "learning_rate": 1.7777777777777777e-05,
78
- "loss": 0.6478,
 
 
 
 
 
 
 
 
 
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.6111111111111112,
83
- "grad_norm": 1.9108268022537231,
84
- "learning_rate": 1.7555555555555556e-05,
85
- "loss": 0.6333,
86
  "step": 110
87
  },
88
  {
89
- "epoch": 0.6666666666666666,
90
- "grad_norm": 1.8000233173370361,
91
- "learning_rate": 1.7333333333333336e-05,
92
- "loss": 0.6368,
 
 
 
 
 
 
 
 
 
93
  "step": 120
94
  },
95
  {
96
- "epoch": 0.7222222222222222,
97
- "grad_norm": 2.010941982269287,
98
- "learning_rate": 1.7111111111111112e-05,
99
- "loss": 0.6417,
100
  "step": 130
101
  },
102
  {
103
- "epoch": 0.7777777777777778,
104
- "grad_norm": 2.502824068069458,
105
- "learning_rate": 1.688888888888889e-05,
106
- "loss": 0.6175,
 
 
 
 
 
 
 
 
 
107
  "step": 140
108
  },
109
  {
110
- "epoch": 0.8333333333333334,
111
- "grad_norm": 2.920900821685791,
112
- "learning_rate": 1.6666666666666667e-05,
113
- "loss": 0.5894,
114
  "step": 150
115
  },
116
  {
117
- "epoch": 0.8888888888888888,
118
- "grad_norm": 2.418879985809326,
119
- "learning_rate": 1.6444444444444444e-05,
120
- "loss": 0.6012,
121
  "step": 160
122
  },
123
  {
124
- "epoch": 0.9444444444444444,
125
- "grad_norm": 2.625758171081543,
126
- "learning_rate": 1.6222222222222223e-05,
127
- "loss": 0.5761,
 
 
 
 
 
 
 
 
 
128
  "step": 170
129
  },
130
  {
131
- "epoch": 1.0,
132
- "grad_norm": 3.7166590690612793,
133
- "learning_rate": 1.6000000000000003e-05,
134
- "loss": 0.6138,
135
  "step": 180
136
  },
137
  {
138
- "epoch": 1.0,
139
- "eval_accuracy": 0.6811023622047244,
140
- "eval_loss": 0.6001904010772705,
141
- "eval_runtime": 5.9005,
142
- "eval_samples_per_second": 43.047,
143
- "eval_steps_per_second": 5.423,
144
- "step": 180
145
  },
146
  {
147
- "epoch": 1.0555555555555556,
148
- "grad_norm": 3.4292004108428955,
149
- "learning_rate": 1.577777777777778e-05,
150
- "loss": 0.6237,
151
  "step": 190
152
  },
153
  {
154
- "epoch": 1.1111111111111112,
155
- "grad_norm": 2.9556570053100586,
156
- "learning_rate": 1.555555555555556e-05,
157
- "loss": 0.6242,
158
  "step": 200
159
  },
160
  {
161
- "epoch": 1.1666666666666667,
162
- "grad_norm": 2.3123323917388916,
163
- "learning_rate": 1.5333333333333334e-05,
164
- "loss": 0.6179,
 
 
 
 
 
 
 
 
 
165
  "step": 210
166
  },
167
  {
168
- "epoch": 1.2222222222222223,
169
- "grad_norm": 1.728090763092041,
170
- "learning_rate": 1.5111111111111112e-05,
171
- "loss": 0.6208,
172
  "step": 220
173
  },
174
  {
175
- "epoch": 1.2777777777777777,
176
- "grad_norm": 2.110353946685791,
177
- "learning_rate": 1.488888888888889e-05,
178
- "loss": 0.5744,
179
  "step": 230
180
  },
181
  {
182
- "epoch": 1.3333333333333333,
183
- "grad_norm": 2.0783276557922363,
184
- "learning_rate": 1.4666666666666666e-05,
185
- "loss": 0.5388,
 
 
 
 
 
 
 
 
 
186
  "step": 240
187
  },
188
  {
189
- "epoch": 1.3888888888888888,
190
- "grad_norm": 2.3175930976867676,
191
- "learning_rate": 1.4444444444444446e-05,
192
- "loss": 0.5846,
193
  "step": 250
194
  },
195
  {
196
- "epoch": 1.4444444444444444,
197
- "grad_norm": 3.0923080444335938,
198
- "learning_rate": 1.4222222222222224e-05,
199
- "loss": 0.5831,
 
 
 
 
 
 
 
 
 
200
  "step": 260
201
  },
202
  {
203
- "epoch": 1.5,
204
- "grad_norm": 1.994885802268982,
205
- "learning_rate": 1.4e-05,
206
- "loss": 0.5929,
207
  "step": 270
208
  },
209
  {
210
- "epoch": 1.5555555555555556,
211
- "grad_norm": 3.28375244140625,
212
- "learning_rate": 1.377777777777778e-05,
213
- "loss": 0.5965,
 
 
 
 
 
 
 
 
 
214
  "step": 280
215
  },
216
  {
217
- "epoch": 1.6111111111111112,
218
- "grad_norm": 2.8813462257385254,
219
- "learning_rate": 1.3555555555555557e-05,
220
- "loss": 0.5358,
221
  "step": 290
222
  },
223
  {
224
- "epoch": 1.6666666666666665,
225
- "grad_norm": 3.112164258956909,
226
- "learning_rate": 1.3333333333333333e-05,
227
- "loss": 0.5877,
 
 
 
 
 
 
 
 
 
228
  "step": 300
229
  },
230
  {
231
- "epoch": 1.7222222222222223,
232
- "grad_norm": 1.542144775390625,
233
- "learning_rate": 1.3111111111111113e-05,
234
- "loss": 0.5485,
235
  "step": 310
236
  },
237
  {
238
- "epoch": 1.7777777777777777,
239
- "grad_norm": 2.144716262817383,
240
- "learning_rate": 1.288888888888889e-05,
241
- "loss": 0.6028,
242
  "step": 320
243
  },
244
  {
245
- "epoch": 1.8333333333333335,
246
- "grad_norm": 2.632028102874756,
247
- "learning_rate": 1.2666666666666667e-05,
248
- "loss": 0.5295,
 
 
 
 
 
 
 
 
 
249
  "step": 330
250
  },
251
  {
252
- "epoch": 1.8888888888888888,
253
- "grad_norm": 2.2505481243133545,
254
- "learning_rate": 1.2444444444444446e-05,
255
- "loss": 0.6173,
256
  "step": 340
257
  },
258
  {
259
- "epoch": 1.9444444444444444,
260
- "grad_norm": 2.7676846981048584,
261
- "learning_rate": 1.2222222222222224e-05,
262
- "loss": 0.5421,
 
 
 
 
 
 
 
 
 
263
  "step": 350
264
  },
265
  {
266
- "epoch": 2.0,
267
- "grad_norm": 2.5781171321868896,
268
- "learning_rate": 1.2e-05,
269
- "loss": 0.5028,
270
  "step": 360
271
  },
272
  {
273
- "epoch": 2.0,
274
- "eval_accuracy": 0.6811023622047244,
275
- "eval_loss": 0.5528703927993774,
276
- "eval_runtime": 4.4862,
277
- "eval_samples_per_second": 56.618,
278
- "eval_steps_per_second": 7.133,
279
- "step": 360
280
  },
281
  {
282
- "epoch": 2.0555555555555554,
283
- "grad_norm": 2.5306832790374756,
284
- "learning_rate": 1.177777777777778e-05,
285
- "loss": 0.5849,
286
  "step": 370
287
  },
288
  {
289
- "epoch": 2.111111111111111,
290
- "grad_norm": 2.1644484996795654,
291
- "learning_rate": 1.1555555555555556e-05,
292
- "loss": 0.5268,
293
  "step": 380
294
  },
295
  {
296
- "epoch": 2.1666666666666665,
297
- "grad_norm": 1.9074407815933228,
298
- "learning_rate": 1.1333333333333334e-05,
299
- "loss": 0.55,
300
  "step": 390
301
  },
302
  {
303
- "epoch": 2.2222222222222223,
304
- "grad_norm": 3.7348294258117676,
305
- "learning_rate": 1.1111111111111113e-05,
306
- "loss": 0.6125,
 
 
 
 
 
 
 
 
 
307
  "step": 400
308
  },
309
  {
310
- "epoch": 2.2777777777777777,
311
- "grad_norm": 2.921757221221924,
312
- "learning_rate": 1.088888888888889e-05,
313
- "loss": 0.642,
314
  "step": 410
315
  },
316
  {
317
- "epoch": 2.3333333333333335,
318
- "grad_norm": 2.2528910636901855,
319
- "learning_rate": 1.0666666666666667e-05,
320
- "loss": 0.5009,
 
 
 
 
 
 
 
 
 
321
  "step": 420
322
  },
323
  {
324
- "epoch": 2.388888888888889,
325
- "grad_norm": 1.6619905233383179,
326
- "learning_rate": 1.0444444444444445e-05,
327
- "loss": 0.5158,
328
  "step": 430
329
  },
330
  {
331
- "epoch": 2.4444444444444446,
332
- "grad_norm": 3.4696500301361084,
333
- "learning_rate": 1.0222222222222223e-05,
334
- "loss": 0.6036,
 
 
 
 
 
 
 
 
 
335
  "step": 440
336
  },
337
  {
338
- "epoch": 2.5,
339
- "grad_norm": 2.936615467071533,
340
- "learning_rate": 1e-05,
341
- "loss": 0.5424,
342
  "step": 450
343
  },
344
  {
345
- "epoch": 2.5555555555555554,
346
- "grad_norm": 4.1920952796936035,
347
- "learning_rate": 9.777777777777779e-06,
348
- "loss": 0.5532,
 
 
 
 
 
 
 
 
 
349
  "step": 460
350
  },
351
  {
352
- "epoch": 2.611111111111111,
353
- "grad_norm": 2.4296085834503174,
354
- "learning_rate": 9.555555555555556e-06,
355
- "loss": 0.5436,
356
  "step": 470
357
  },
358
  {
359
- "epoch": 2.6666666666666665,
360
- "grad_norm": 2.5725982189178467,
361
- "learning_rate": 9.333333333333334e-06,
362
- "loss": 0.5015,
363
  "step": 480
364
  },
365
  {
366
- "epoch": 2.7222222222222223,
367
- "grad_norm": 4.97006368637085,
368
- "learning_rate": 9.111111111111112e-06,
369
- "loss": 0.5775,
 
 
 
 
 
 
 
 
 
370
  "step": 490
371
  },
372
  {
373
- "epoch": 2.7777777777777777,
374
- "grad_norm": 3.6014504432678223,
375
- "learning_rate": 8.888888888888888e-06,
376
- "loss": 0.5689,
377
  "step": 500
378
  },
379
  {
380
- "epoch": 2.8333333333333335,
381
- "grad_norm": 1.8251533508300781,
382
- "learning_rate": 8.666666666666668e-06,
383
- "loss": 0.5137,
 
 
 
 
 
 
 
 
 
384
  "step": 510
385
  },
386
  {
387
- "epoch": 2.888888888888889,
388
- "grad_norm": 3.503689765930176,
389
- "learning_rate": 8.444444444444446e-06,
390
- "loss": 0.5827,
391
  "step": 520
392
  },
393
  {
394
- "epoch": 2.9444444444444446,
395
- "grad_norm": 2.4786183834075928,
396
- "learning_rate": 8.222222222222222e-06,
397
- "loss": 0.5202,
398
- "step": 530
 
 
399
  },
400
  {
401
- "epoch": 3.0,
402
- "grad_norm": 2.0592217445373535,
403
- "learning_rate": 8.000000000000001e-06,
404
- "loss": 0.5103,
405
- "step": 540
406
  },
407
  {
408
- "epoch": 3.0,
409
- "eval_accuracy": 0.6811023622047244,
410
- "eval_loss": 0.5325487852096558,
411
- "eval_runtime": 5.1901,
412
- "eval_samples_per_second": 48.94,
413
- "eval_steps_per_second": 6.166,
414
  "step": 540
415
  },
416
  {
417
- "epoch": 3.0555555555555554,
418
- "grad_norm": 1.9031552076339722,
419
- "learning_rate": 7.77777777777778e-06,
420
- "loss": 0.5273,
421
  "step": 550
422
  },
423
  {
424
- "epoch": 3.111111111111111,
425
- "grad_norm": 2.656759262084961,
426
- "learning_rate": 7.555555555555556e-06,
427
- "loss": 0.5824,
 
 
 
 
 
 
 
 
 
428
  "step": 560
429
  },
430
  {
431
- "epoch": 3.1666666666666665,
432
- "grad_norm": 1.8883424997329712,
433
- "learning_rate": 7.333333333333333e-06,
434
- "loss": 0.5653,
435
  "step": 570
436
  },
437
  {
438
- "epoch": 3.2222222222222223,
439
- "grad_norm": 2.118739128112793,
440
- "learning_rate": 7.111111111111112e-06,
441
- "loss": 0.5057,
 
 
 
 
 
 
 
 
 
442
  "step": 580
443
  },
444
  {
445
- "epoch": 3.2777777777777777,
446
- "grad_norm": 1.90389883518219,
447
- "learning_rate": 6.88888888888889e-06,
448
- "loss": 0.6118,
449
  "step": 590
450
  },
451
  {
452
- "epoch": 3.3333333333333335,
453
- "grad_norm": 1.8850902318954468,
454
- "learning_rate": 6.666666666666667e-06,
455
- "loss": 0.5682,
 
 
 
 
 
 
 
 
 
456
  "step": 600
457
  },
458
  {
459
- "epoch": 3.388888888888889,
460
- "grad_norm": 5.424034118652344,
461
- "learning_rate": 6.444444444444445e-06,
462
- "loss": 0.5501,
463
  "step": 610
464
  },
465
  {
466
- "epoch": 3.4444444444444446,
467
- "grad_norm": 6.140316963195801,
468
- "learning_rate": 6.222222222222223e-06,
469
- "loss": 0.6259,
470
  "step": 620
471
  },
472
  {
473
- "epoch": 3.5,
474
- "grad_norm": 2.766357183456421,
475
- "learning_rate": 6e-06,
476
- "loss": 0.5077,
 
 
 
 
 
 
 
 
 
477
  "step": 630
478
  },
479
  {
480
- "epoch": 3.5555555555555554,
481
- "grad_norm": 2.0831124782562256,
482
- "learning_rate": 5.777777777777778e-06,
483
- "loss": 0.5609,
484
  "step": 640
485
  },
486
  {
487
- "epoch": 3.611111111111111,
488
- "grad_norm": 1.9845359325408936,
489
- "learning_rate": 5.555555555555557e-06,
490
- "loss": 0.4875,
 
 
 
 
 
 
 
 
 
491
  "step": 650
492
  },
493
  {
494
- "epoch": 3.6666666666666665,
495
- "grad_norm": 1.5764096975326538,
496
- "learning_rate": 5.333333333333334e-06,
497
- "loss": 0.4979,
498
  "step": 660
499
  },
500
  {
501
- "epoch": 3.7222222222222223,
502
- "grad_norm": 3.984339952468872,
503
- "learning_rate": 5.1111111111111115e-06,
504
- "loss": 0.5522,
 
 
 
 
 
 
 
 
 
505
  "step": 670
506
  },
507
  {
508
- "epoch": 3.7777777777777777,
509
- "grad_norm": 2.4655332565307617,
510
- "learning_rate": 4.888888888888889e-06,
511
- "loss": 0.5476,
512
  "step": 680
513
  },
514
  {
515
- "epoch": 3.8333333333333335,
516
- "grad_norm": 2.9143753051757812,
517
- "learning_rate": 4.666666666666667e-06,
518
- "loss": 0.5901,
 
 
 
 
 
 
 
 
 
519
  "step": 690
520
  },
521
  {
522
- "epoch": 3.888888888888889,
523
- "grad_norm": 2.023881435394287,
524
- "learning_rate": 4.444444444444444e-06,
525
- "loss": 0.5842,
526
  "step": 700
527
  },
528
  {
529
- "epoch": 3.9444444444444446,
530
- "grad_norm": 2.730069398880005,
531
- "learning_rate": 4.222222222222223e-06,
532
- "loss": 0.5443,
533
  "step": 710
534
  },
535
  {
536
- "epoch": 4.0,
537
- "grad_norm": 2.496239185333252,
538
- "learning_rate": 4.000000000000001e-06,
539
- "loss": 0.4892,
540
- "step": 720
 
 
541
  },
542
  {
543
- "epoch": 4.0,
544
- "eval_accuracy": 0.6811023622047244,
545
- "eval_loss": 0.5247489213943481,
546
- "eval_runtime": 4.9059,
547
- "eval_samples_per_second": 51.774,
548
- "eval_steps_per_second": 6.523,
549
  "step": 720
550
  },
551
  {
552
- "epoch": 4.055555555555555,
553
- "grad_norm": 1.9344916343688965,
554
- "learning_rate": 3.777777777777778e-06,
555
- "loss": 0.5017,
556
  "step": 730
557
  },
558
  {
559
- "epoch": 4.111111111111111,
560
- "grad_norm": 3.2695019245147705,
561
- "learning_rate": 3.555555555555556e-06,
562
- "loss": 0.5974,
 
 
 
 
 
 
 
 
 
563
  "step": 740
564
  },
565
  {
566
- "epoch": 4.166666666666667,
567
- "grad_norm": 4.992809772491455,
568
- "learning_rate": 3.3333333333333333e-06,
569
- "loss": 0.5307,
570
  "step": 750
571
  },
572
  {
573
- "epoch": 4.222222222222222,
574
- "grad_norm": 3.5359721183776855,
575
- "learning_rate": 3.1111111111111116e-06,
576
- "loss": 0.5372,
 
 
 
 
 
 
 
 
 
577
  "step": 760
578
  },
579
  {
580
- "epoch": 4.277777777777778,
581
- "grad_norm": 2.123009204864502,
582
- "learning_rate": 2.888888888888889e-06,
583
- "loss": 0.49,
584
  "step": 770
585
  },
586
  {
587
- "epoch": 4.333333333333333,
588
- "grad_norm": 2.8134477138519287,
589
- "learning_rate": 2.666666666666667e-06,
590
- "loss": 0.4749,
591
  "step": 780
592
  },
593
  {
594
- "epoch": 4.388888888888889,
595
- "grad_norm": 2.195077896118164,
596
- "learning_rate": 2.4444444444444447e-06,
597
- "loss": 0.5876,
 
 
 
 
 
 
 
 
 
598
  "step": 790
599
  },
600
  {
601
- "epoch": 4.444444444444445,
602
- "grad_norm": 4.078568935394287,
603
- "learning_rate": 2.222222222222222e-06,
604
- "loss": 0.5609,
605
  "step": 800
606
  },
607
  {
608
- "epoch": 4.5,
609
- "grad_norm": 2.6274867057800293,
610
- "learning_rate": 2.0000000000000003e-06,
611
- "loss": 0.5392,
 
 
 
 
 
 
 
 
 
612
  "step": 810
613
  },
614
  {
615
- "epoch": 4.555555555555555,
616
- "grad_norm": 1.7526293992996216,
617
- "learning_rate": 1.777777777777778e-06,
618
- "loss": 0.4965,
619
  "step": 820
620
  },
621
  {
622
- "epoch": 4.611111111111111,
623
- "grad_norm": 2.3001506328582764,
624
- "learning_rate": 1.5555555555555558e-06,
625
- "loss": 0.5345,
 
 
 
 
 
 
 
 
 
626
  "step": 830
627
  },
628
  {
629
- "epoch": 4.666666666666667,
630
- "grad_norm": 2.5624406337738037,
631
- "learning_rate": 1.3333333333333334e-06,
632
- "loss": 0.5765,
633
  "step": 840
634
  },
635
  {
636
- "epoch": 4.722222222222222,
637
- "grad_norm": 2.199018716812134,
638
- "learning_rate": 1.111111111111111e-06,
639
- "loss": 0.5547,
640
  "step": 850
641
  },
642
  {
643
- "epoch": 4.777777777777778,
644
- "grad_norm": 3.756605625152588,
645
- "learning_rate": 8.88888888888889e-07,
646
- "loss": 0.5021,
 
 
 
 
 
 
 
 
 
647
  "step": 860
648
  },
649
  {
650
- "epoch": 4.833333333333333,
651
- "grad_norm": 1.6385400295257568,
652
- "learning_rate": 6.666666666666667e-07,
653
- "loss": 0.5858,
654
  "step": 870
655
  },
656
  {
657
- "epoch": 4.888888888888889,
658
- "grad_norm": 5.481604099273682,
659
- "learning_rate": 4.444444444444445e-07,
660
- "loss": 0.5062,
 
 
 
 
 
 
 
 
 
661
  "step": 880
662
  },
663
  {
664
- "epoch": 4.944444444444445,
665
- "grad_norm": 2.554666042327881,
666
- "learning_rate": 2.2222222222222224e-07,
667
- "loss": 0.5355,
668
  "step": 890
669
  },
670
  {
671
- "epoch": 5.0,
672
- "grad_norm": 5.525686264038086,
673
- "learning_rate": 0.0,
674
- "loss": 0.5779,
675
- "step": 900
 
 
676
  },
677
  {
678
- "epoch": 5.0,
679
- "eval_accuracy": 0.6811023622047244,
680
- "eval_loss": 0.530208170413971,
681
- "eval_runtime": 5.531,
682
- "eval_samples_per_second": 45.923,
683
- "eval_steps_per_second": 5.786,
684
  "step": 900
685
  },
686
  {
687
- "epoch": 5.0,
688
- "step": 900,
689
- "total_flos": 7.24330215447552e+16,
690
- "train_loss": 0.5726550849278768,
691
- "train_runtime": 239.6067,
692
- "train_samples_per_second": 29.945,
693
- "train_steps_per_second": 3.756
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
694
  }
695
  ],
696
  "logging_steps": 10,
697
- "max_steps": 900,
698
  "num_input_tokens_seen": 0,
699
- "num_train_epochs": 5,
700
  "save_steps": 500,
701
  "stateful_callbacks": {
702
  "TrainerControl": {
@@ -710,8 +1290,8 @@
710
  "attributes": {}
711
  }
712
  },
713
- "total_flos": 7.24330215447552e+16,
714
- "train_batch_size": 8,
715
  "trial_name": null,
716
  "trial_params": null
717
  }
 
1
  {
2
+ "best_metric": 0.4903908967971802,
3
+ "best_model_checkpoint": "./beans_outputs/checkpoint-621",
4
+ "epoch": 50.0,
5
  "eval_steps": 500,
6
+ "global_step": 1150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.43478260869565216,
13
+ "grad_norm": 3.492309093475342,
14
+ "learning_rate": 1.9826086956521742e-05,
15
+ "loss": 0.7305,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.8695652173913043,
20
+ "grad_norm": 4.003854751586914,
21
+ "learning_rate": 1.965217391304348e-05,
22
+ "loss": 0.6794,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 1.0,
27
+ "eval_accuracy": 0.6062992125984252,
28
+ "eval_loss": 0.6559741497039795,
29
+ "eval_runtime": 4.6669,
30
+ "eval_samples_per_second": 54.425,
31
+ "eval_steps_per_second": 0.857,
32
+ "step": 23
33
+ },
34
+ {
35
+ "epoch": 1.3043478260869565,
36
+ "grad_norm": 4.675185680389404,
37
+ "learning_rate": 1.947826086956522e-05,
38
+ "loss": 0.6399,
39
  "step": 30
40
  },
41
  {
42
+ "epoch": 1.7391304347826086,
43
+ "grad_norm": 4.350035667419434,
44
+ "learning_rate": 1.9304347826086957e-05,
45
+ "loss": 0.6215,
46
  "step": 40
47
  },
48
  {
49
+ "epoch": 2.0,
50
+ "eval_accuracy": 0.7362204724409449,
51
+ "eval_loss": 0.5833372473716736,
52
+ "eval_runtime": 3.7107,
53
+ "eval_samples_per_second": 68.451,
54
+ "eval_steps_per_second": 1.078,
55
+ "step": 46
56
+ },
57
+ {
58
+ "epoch": 2.1739130434782608,
59
+ "grad_norm": 3.180147886276245,
60
+ "learning_rate": 1.9130434782608697e-05,
61
+ "loss": 0.5964,
62
  "step": 50
63
  },
64
  {
65
+ "epoch": 2.608695652173913,
66
+ "grad_norm": 3.246190309524536,
67
+ "learning_rate": 1.8956521739130434e-05,
68
+ "loss": 0.5784,
69
  "step": 60
70
  },
71
  {
72
+ "epoch": 3.0,
73
+ "eval_accuracy": 0.7598425196850394,
74
+ "eval_loss": 0.5489528179168701,
75
+ "eval_runtime": 4.3517,
76
+ "eval_samples_per_second": 58.367,
77
+ "eval_steps_per_second": 0.919,
78
+ "step": 69
79
+ },
80
+ {
81
+ "epoch": 3.0434782608695654,
82
+ "grad_norm": 2.3400914669036865,
83
+ "learning_rate": 1.8782608695652175e-05,
84
+ "loss": 0.5412,
85
  "step": 70
86
  },
87
  {
88
+ "epoch": 3.4782608695652173,
89
+ "grad_norm": 3.5264837741851807,
90
+ "learning_rate": 1.8608695652173912e-05,
91
+ "loss": 0.5659,
92
  "step": 80
93
  },
94
  {
95
+ "epoch": 3.9130434782608696,
96
+ "grad_norm": 4.993140697479248,
97
+ "learning_rate": 1.8434782608695653e-05,
98
+ "loss": 0.5347,
99
  "step": 90
100
  },
101
  {
102
+ "epoch": 4.0,
103
+ "eval_accuracy": 0.7637795275590551,
104
+ "eval_loss": 0.5305963754653931,
105
+ "eval_runtime": 3.9321,
106
+ "eval_samples_per_second": 64.596,
107
+ "eval_steps_per_second": 1.017,
108
+ "step": 92
109
+ },
110
+ {
111
+ "epoch": 4.3478260869565215,
112
+ "grad_norm": 2.20806622505188,
113
+ "learning_rate": 1.8260869565217393e-05,
114
+ "loss": 0.5086,
115
  "step": 100
116
  },
117
  {
118
+ "epoch": 4.782608695652174,
119
+ "grad_norm": 4.256261825561523,
120
+ "learning_rate": 1.808695652173913e-05,
121
+ "loss": 0.5307,
122
  "step": 110
123
  },
124
  {
125
+ "epoch": 5.0,
126
+ "eval_accuracy": 0.7637795275590551,
127
+ "eval_loss": 0.5235078930854797,
128
+ "eval_runtime": 3.7141,
129
+ "eval_samples_per_second": 68.389,
130
+ "eval_steps_per_second": 1.077,
131
+ "step": 115
132
+ },
133
+ {
134
+ "epoch": 5.217391304347826,
135
+ "grad_norm": 2.6543545722961426,
136
+ "learning_rate": 1.791304347826087e-05,
137
+ "loss": 0.5085,
138
  "step": 120
139
  },
140
  {
141
+ "epoch": 5.6521739130434785,
142
+ "grad_norm": 4.274487495422363,
143
+ "learning_rate": 1.773913043478261e-05,
144
+ "loss": 0.5391,
145
  "step": 130
146
  },
147
  {
148
+ "epoch": 6.0,
149
+ "eval_accuracy": 0.7677165354330708,
150
+ "eval_loss": 0.5090441703796387,
151
+ "eval_runtime": 4.3438,
152
+ "eval_samples_per_second": 58.475,
153
+ "eval_steps_per_second": 0.921,
154
+ "step": 138
155
+ },
156
+ {
157
+ "epoch": 6.086956521739131,
158
+ "grad_norm": 3.147414445877075,
159
+ "learning_rate": 1.756521739130435e-05,
160
+ "loss": 0.4977,
161
  "step": 140
162
  },
163
  {
164
+ "epoch": 6.521739130434782,
165
+ "grad_norm": 4.254673004150391,
166
+ "learning_rate": 1.739130434782609e-05,
167
+ "loss": 0.5297,
168
  "step": 150
169
  },
170
  {
171
+ "epoch": 6.956521739130435,
172
+ "grad_norm": 2.083784818649292,
173
+ "learning_rate": 1.721739130434783e-05,
174
+ "loss": 0.48,
175
  "step": 160
176
  },
177
  {
178
+ "epoch": 7.0,
179
+ "eval_accuracy": 0.7716535433070866,
180
+ "eval_loss": 0.5108471512794495,
181
+ "eval_runtime": 4.1877,
182
+ "eval_samples_per_second": 60.653,
183
+ "eval_steps_per_second": 0.955,
184
+ "step": 161
185
+ },
186
+ {
187
+ "epoch": 7.391304347826087,
188
+ "grad_norm": 4.193545818328857,
189
+ "learning_rate": 1.7043478260869566e-05,
190
+ "loss": 0.4826,
191
  "step": 170
192
  },
193
  {
194
+ "epoch": 7.826086956521739,
195
+ "grad_norm": 2.05076003074646,
196
+ "learning_rate": 1.6869565217391307e-05,
197
+ "loss": 0.473,
198
  "step": 180
199
  },
200
  {
201
+ "epoch": 8.0,
202
+ "eval_accuracy": 0.7755905511811023,
203
+ "eval_loss": 0.5028324127197266,
204
+ "eval_runtime": 3.7021,
205
+ "eval_samples_per_second": 68.61,
206
+ "eval_steps_per_second": 1.08,
207
+ "step": 184
208
  },
209
  {
210
+ "epoch": 8.26086956521739,
211
+ "grad_norm": 3.007233142852783,
212
+ "learning_rate": 1.6695652173913044e-05,
213
+ "loss": 0.5255,
214
  "step": 190
215
  },
216
  {
217
+ "epoch": 8.695652173913043,
218
+ "grad_norm": 2.196945905685425,
219
+ "learning_rate": 1.6521739130434785e-05,
220
+ "loss": 0.5014,
221
  "step": 200
222
  },
223
  {
224
+ "epoch": 9.0,
225
+ "eval_accuracy": 0.7716535433070866,
226
+ "eval_loss": 0.5054498314857483,
227
+ "eval_runtime": 4.225,
228
+ "eval_samples_per_second": 60.119,
229
+ "eval_steps_per_second": 0.947,
230
+ "step": 207
231
+ },
232
+ {
233
+ "epoch": 9.130434782608695,
234
+ "grad_norm": 2.184353828430176,
235
+ "learning_rate": 1.6347826086956525e-05,
236
+ "loss": 0.5044,
237
  "step": 210
238
  },
239
  {
240
+ "epoch": 9.565217391304348,
241
+ "grad_norm": 4.106619358062744,
242
+ "learning_rate": 1.6173913043478262e-05,
243
+ "loss": 0.4822,
244
  "step": 220
245
  },
246
  {
247
+ "epoch": 10.0,
248
+ "grad_norm": 4.000082969665527,
249
+ "learning_rate": 1.6000000000000003e-05,
250
+ "loss": 0.496,
251
  "step": 230
252
  },
253
  {
254
+ "epoch": 10.0,
255
+ "eval_accuracy": 0.7716535433070866,
256
+ "eval_loss": 0.5039955973625183,
257
+ "eval_runtime": 5.3498,
258
+ "eval_samples_per_second": 47.478,
259
+ "eval_steps_per_second": 0.748,
260
+ "step": 230
261
+ },
262
+ {
263
+ "epoch": 10.434782608695652,
264
+ "grad_norm": 5.726933002471924,
265
+ "learning_rate": 1.582608695652174e-05,
266
+ "loss": 0.5101,
267
  "step": 240
268
  },
269
  {
270
+ "epoch": 10.869565217391305,
271
+ "grad_norm": 4.100568771362305,
272
+ "learning_rate": 1.565217391304348e-05,
273
+ "loss": 0.4688,
274
  "step": 250
275
  },
276
  {
277
+ "epoch": 11.0,
278
+ "eval_accuracy": 0.7677165354330708,
279
+ "eval_loss": 0.4972316324710846,
280
+ "eval_runtime": 3.7607,
281
+ "eval_samples_per_second": 67.54,
282
+ "eval_steps_per_second": 1.064,
283
+ "step": 253
284
+ },
285
+ {
286
+ "epoch": 11.304347826086957,
287
+ "grad_norm": 2.6119587421417236,
288
+ "learning_rate": 1.5478260869565217e-05,
289
+ "loss": 0.485,
290
  "step": 260
291
  },
292
  {
293
+ "epoch": 11.73913043478261,
294
+ "grad_norm": 3.003861427307129,
295
+ "learning_rate": 1.5304347826086958e-05,
296
+ "loss": 0.4943,
297
  "step": 270
298
  },
299
  {
300
+ "epoch": 12.0,
301
+ "eval_accuracy": 0.7637795275590551,
302
+ "eval_loss": 0.49771231412887573,
303
+ "eval_runtime": 4.9203,
304
+ "eval_samples_per_second": 51.622,
305
+ "eval_steps_per_second": 0.813,
306
+ "step": 276
307
+ },
308
+ {
309
+ "epoch": 12.173913043478262,
310
+ "grad_norm": 2.9490270614624023,
311
+ "learning_rate": 1.5130434782608697e-05,
312
+ "loss": 0.4505,
313
  "step": 280
314
  },
315
  {
316
+ "epoch": 12.608695652173914,
317
+ "grad_norm": 2.8131847381591797,
318
+ "learning_rate": 1.4956521739130436e-05,
319
+ "loss": 0.5012,
320
  "step": 290
321
  },
322
  {
323
+ "epoch": 13.0,
324
+ "eval_accuracy": 0.7716535433070866,
325
+ "eval_loss": 0.5057242512702942,
326
+ "eval_runtime": 3.7024,
327
+ "eval_samples_per_second": 68.605,
328
+ "eval_steps_per_second": 1.08,
329
+ "step": 299
330
+ },
331
+ {
332
+ "epoch": 13.043478260869565,
333
+ "grad_norm": 17.65978240966797,
334
+ "learning_rate": 1.4782608695652174e-05,
335
+ "loss": 0.4768,
336
  "step": 300
337
  },
338
  {
339
+ "epoch": 13.478260869565217,
340
+ "grad_norm": 2.085587978363037,
341
+ "learning_rate": 1.4608695652173915e-05,
342
+ "loss": 0.4729,
343
  "step": 310
344
  },
345
  {
346
+ "epoch": 13.91304347826087,
347
+ "grad_norm": 4.59744119644165,
348
+ "learning_rate": 1.4434782608695654e-05,
349
+ "loss": 0.4639,
350
  "step": 320
351
  },
352
  {
353
+ "epoch": 14.0,
354
+ "eval_accuracy": 0.7716535433070866,
355
+ "eval_loss": 0.5010089874267578,
356
+ "eval_runtime": 3.7018,
357
+ "eval_samples_per_second": 68.616,
358
+ "eval_steps_per_second": 1.081,
359
+ "step": 322
360
+ },
361
+ {
362
+ "epoch": 14.347826086956522,
363
+ "grad_norm": 2.4057395458221436,
364
+ "learning_rate": 1.4260869565217392e-05,
365
+ "loss": 0.4751,
366
  "step": 330
367
  },
368
  {
369
+ "epoch": 14.782608695652174,
370
+ "grad_norm": 3.549567222595215,
371
+ "learning_rate": 1.4086956521739133e-05,
372
+ "loss": 0.4709,
373
  "step": 340
374
  },
375
  {
376
+ "epoch": 15.0,
377
+ "eval_accuracy": 0.7795275590551181,
378
+ "eval_loss": 0.4948899447917938,
379
+ "eval_runtime": 4.9714,
380
+ "eval_samples_per_second": 51.092,
381
+ "eval_steps_per_second": 0.805,
382
+ "step": 345
383
+ },
384
+ {
385
+ "epoch": 15.217391304347826,
386
+ "grad_norm": 6.705427646636963,
387
+ "learning_rate": 1.391304347826087e-05,
388
+ "loss": 0.4379,
389
  "step": 350
390
  },
391
  {
392
+ "epoch": 15.652173913043478,
393
+ "grad_norm": 2.444533348083496,
394
+ "learning_rate": 1.373913043478261e-05,
395
+ "loss": 0.4888,
396
  "step": 360
397
  },
398
  {
399
+ "epoch": 16.0,
400
+ "eval_accuracy": 0.7834645669291339,
401
+ "eval_loss": 0.49550917744636536,
402
+ "eval_runtime": 3.6768,
403
+ "eval_samples_per_second": 69.081,
404
+ "eval_steps_per_second": 1.088,
405
+ "step": 368
406
  },
407
  {
408
+ "epoch": 16.08695652173913,
409
+ "grad_norm": 5.470461845397949,
410
+ "learning_rate": 1.3565217391304348e-05,
411
+ "loss": 0.4952,
412
  "step": 370
413
  },
414
  {
415
+ "epoch": 16.52173913043478,
416
+ "grad_norm": 2.0678608417510986,
417
+ "learning_rate": 1.3391304347826088e-05,
418
+ "loss": 0.4784,
419
  "step": 380
420
  },
421
  {
422
+ "epoch": 16.956521739130434,
423
+ "grad_norm": 6.63480806350708,
424
+ "learning_rate": 1.3217391304347827e-05,
425
+ "loss": 0.4594,
426
  "step": 390
427
  },
428
  {
429
+ "epoch": 17.0,
430
+ "eval_accuracy": 0.7716535433070866,
431
+ "eval_loss": 0.49856194853782654,
432
+ "eval_runtime": 3.7219,
433
+ "eval_samples_per_second": 68.245,
434
+ "eval_steps_per_second": 1.075,
435
+ "step": 391
436
+ },
437
+ {
438
+ "epoch": 17.391304347826086,
439
+ "grad_norm": 4.448991298675537,
440
+ "learning_rate": 1.3043478260869566e-05,
441
+ "loss": 0.4607,
442
  "step": 400
443
  },
444
  {
445
+ "epoch": 17.82608695652174,
446
+ "grad_norm": 2.716780424118042,
447
+ "learning_rate": 1.2869565217391305e-05,
448
+ "loss": 0.4745,
449
  "step": 410
450
  },
451
  {
452
+ "epoch": 18.0,
453
+ "eval_accuracy": 0.7677165354330708,
454
+ "eval_loss": 0.501070499420166,
455
+ "eval_runtime": 4.5054,
456
+ "eval_samples_per_second": 56.377,
457
+ "eval_steps_per_second": 0.888,
458
+ "step": 414
459
+ },
460
+ {
461
+ "epoch": 18.26086956521739,
462
+ "grad_norm": 2.406355857849121,
463
+ "learning_rate": 1.2695652173913045e-05,
464
+ "loss": 0.4639,
465
  "step": 420
466
  },
467
  {
468
+ "epoch": 18.695652173913043,
469
+ "grad_norm": 5.627669811248779,
470
+ "learning_rate": 1.2521739130434784e-05,
471
+ "loss": 0.4667,
472
  "step": 430
473
  },
474
  {
475
+ "epoch": 19.0,
476
+ "eval_accuracy": 0.7755905511811023,
477
+ "eval_loss": 0.4928434491157532,
478
+ "eval_runtime": 4.0475,
479
+ "eval_samples_per_second": 62.756,
480
+ "eval_steps_per_second": 0.988,
481
+ "step": 437
482
+ },
483
+ {
484
+ "epoch": 19.130434782608695,
485
+ "grad_norm": 4.074652671813965,
486
+ "learning_rate": 1.2347826086956523e-05,
487
+ "loss": 0.4671,
488
  "step": 440
489
  },
490
  {
491
+ "epoch": 19.565217391304348,
492
+ "grad_norm": 5.88148832321167,
493
+ "learning_rate": 1.2173913043478263e-05,
494
+ "loss": 0.4442,
495
  "step": 450
496
  },
497
  {
498
+ "epoch": 20.0,
499
+ "grad_norm": 3.00347900390625,
500
+ "learning_rate": 1.2e-05,
501
+ "loss": 0.4551,
502
+ "step": 460
503
+ },
504
+ {
505
+ "epoch": 20.0,
506
+ "eval_accuracy": 0.7795275590551181,
507
+ "eval_loss": 0.5055357217788696,
508
+ "eval_runtime": 3.6885,
509
+ "eval_samples_per_second": 68.862,
510
+ "eval_steps_per_second": 1.084,
511
  "step": 460
512
  },
513
  {
514
+ "epoch": 20.434782608695652,
515
+ "grad_norm": 10.164237976074219,
516
+ "learning_rate": 1.182608695652174e-05,
517
+ "loss": 0.4657,
518
  "step": 470
519
  },
520
  {
521
+ "epoch": 20.869565217391305,
522
+ "grad_norm": 2.1962711811065674,
523
+ "learning_rate": 1.1652173913043478e-05,
524
+ "loss": 0.4657,
525
  "step": 480
526
  },
527
  {
528
+ "epoch": 21.0,
529
+ "eval_accuracy": 0.7755905511811023,
530
+ "eval_loss": 0.4928124248981476,
531
+ "eval_runtime": 4.4478,
532
+ "eval_samples_per_second": 57.107,
533
+ "eval_steps_per_second": 0.899,
534
+ "step": 483
535
+ },
536
+ {
537
+ "epoch": 21.304347826086957,
538
+ "grad_norm": 5.0302228927612305,
539
+ "learning_rate": 1.1478260869565218e-05,
540
+ "loss": 0.4564,
541
  "step": 490
542
  },
543
  {
544
+ "epoch": 21.73913043478261,
545
+ "grad_norm": 3.5275819301605225,
546
+ "learning_rate": 1.1304347826086957e-05,
547
+ "loss": 0.4818,
548
  "step": 500
549
  },
550
  {
551
+ "epoch": 22.0,
552
+ "eval_accuracy": 0.7755905511811023,
553
+ "eval_loss": 0.5001721978187561,
554
+ "eval_runtime": 4.0355,
555
+ "eval_samples_per_second": 62.942,
556
+ "eval_steps_per_second": 0.991,
557
+ "step": 506
558
+ },
559
+ {
560
+ "epoch": 22.17391304347826,
561
+ "grad_norm": 6.920666694641113,
562
+ "learning_rate": 1.1130434782608696e-05,
563
+ "loss": 0.4608,
564
  "step": 510
565
  },
566
  {
567
+ "epoch": 22.608695652173914,
568
+ "grad_norm": 2.2840707302093506,
569
+ "learning_rate": 1.0956521739130435e-05,
570
+ "loss": 0.4633,
571
  "step": 520
572
  },
573
  {
574
+ "epoch": 23.0,
575
+ "eval_accuracy": 0.7834645669291339,
576
+ "eval_loss": 0.49459317326545715,
577
+ "eval_runtime": 3.7179,
578
+ "eval_samples_per_second": 68.319,
579
+ "eval_steps_per_second": 1.076,
580
+ "step": 529
581
  },
582
  {
583
+ "epoch": 23.043478260869566,
584
+ "grad_norm": 6.509201526641846,
585
+ "learning_rate": 1.0782608695652175e-05,
586
+ "loss": 0.4694,
587
+ "step": 530
588
  },
589
  {
590
+ "epoch": 23.47826086956522,
591
+ "grad_norm": 2.403275489807129,
592
+ "learning_rate": 1.0608695652173914e-05,
593
+ "loss": 0.4874,
 
 
594
  "step": 540
595
  },
596
  {
597
+ "epoch": 23.91304347826087,
598
+ "grad_norm": 2.1320598125457764,
599
+ "learning_rate": 1.0434782608695653e-05,
600
+ "loss": 0.4779,
601
  "step": 550
602
  },
603
  {
604
+ "epoch": 24.0,
605
+ "eval_accuracy": 0.7795275590551181,
606
+ "eval_loss": 0.49417200684547424,
607
+ "eval_runtime": 4.3215,
608
+ "eval_samples_per_second": 58.776,
609
+ "eval_steps_per_second": 0.926,
610
+ "step": 552
611
+ },
612
+ {
613
+ "epoch": 24.347826086956523,
614
+ "grad_norm": 3.7421488761901855,
615
+ "learning_rate": 1.0260869565217393e-05,
616
+ "loss": 0.4579,
617
  "step": 560
618
  },
619
  {
620
+ "epoch": 24.782608695652176,
621
+ "grad_norm": 3.07060170173645,
622
+ "learning_rate": 1.008695652173913e-05,
623
+ "loss": 0.4718,
624
  "step": 570
625
  },
626
  {
627
+ "epoch": 25.0,
628
+ "eval_accuracy": 0.7834645669291339,
629
+ "eval_loss": 0.49625155329704285,
630
+ "eval_runtime": 5.5612,
631
+ "eval_samples_per_second": 45.674,
632
+ "eval_steps_per_second": 0.719,
633
+ "step": 575
634
+ },
635
+ {
636
+ "epoch": 25.217391304347824,
637
+ "grad_norm": 4.446998596191406,
638
+ "learning_rate": 9.913043478260871e-06,
639
+ "loss": 0.443,
640
  "step": 580
641
  },
642
  {
643
+ "epoch": 25.652173913043477,
644
+ "grad_norm": 2.4786624908447266,
645
+ "learning_rate": 9.73913043478261e-06,
646
+ "loss": 0.4511,
647
  "step": 590
648
  },
649
  {
650
+ "epoch": 26.0,
651
+ "eval_accuracy": 0.7716535433070866,
652
+ "eval_loss": 0.5011107325553894,
653
+ "eval_runtime": 3.7637,
654
+ "eval_samples_per_second": 67.487,
655
+ "eval_steps_per_second": 1.063,
656
+ "step": 598
657
+ },
658
+ {
659
+ "epoch": 26.08695652173913,
660
+ "grad_norm": 5.552999019622803,
661
+ "learning_rate": 9.565217391304349e-06,
662
+ "loss": 0.4631,
663
  "step": 600
664
  },
665
  {
666
+ "epoch": 26.52173913043478,
667
+ "grad_norm": 5.050811290740967,
668
+ "learning_rate": 9.391304347826087e-06,
669
+ "loss": 0.4564,
670
  "step": 610
671
  },
672
  {
673
+ "epoch": 26.956521739130434,
674
+ "grad_norm": 2.0711512565612793,
675
+ "learning_rate": 9.217391304347826e-06,
676
+ "loss": 0.4798,
677
  "step": 620
678
  },
679
  {
680
+ "epoch": 27.0,
681
+ "eval_accuracy": 0.7874015748031497,
682
+ "eval_loss": 0.4903908967971802,
683
+ "eval_runtime": 4.9056,
684
+ "eval_samples_per_second": 51.777,
685
+ "eval_steps_per_second": 0.815,
686
+ "step": 621
687
+ },
688
+ {
689
+ "epoch": 27.391304347826086,
690
+ "grad_norm": 4.117509365081787,
691
+ "learning_rate": 9.043478260869565e-06,
692
+ "loss": 0.4411,
693
  "step": 630
694
  },
695
  {
696
+ "epoch": 27.82608695652174,
697
+ "grad_norm": 4.448685646057129,
698
+ "learning_rate": 8.869565217391306e-06,
699
+ "loss": 0.4868,
700
  "step": 640
701
  },
702
  {
703
+ "epoch": 28.0,
704
+ "eval_accuracy": 0.7834645669291339,
705
+ "eval_loss": 0.4982087016105652,
706
+ "eval_runtime": 3.7322,
707
+ "eval_samples_per_second": 68.057,
708
+ "eval_steps_per_second": 1.072,
709
+ "step": 644
710
+ },
711
+ {
712
+ "epoch": 28.26086956521739,
713
+ "grad_norm": 3.0993807315826416,
714
+ "learning_rate": 8.695652173913044e-06,
715
+ "loss": 0.4414,
716
  "step": 650
717
  },
718
  {
719
+ "epoch": 28.695652173913043,
720
+ "grad_norm": 4.982347011566162,
721
+ "learning_rate": 8.521739130434783e-06,
722
+ "loss": 0.4653,
723
  "step": 660
724
  },
725
  {
726
+ "epoch": 29.0,
727
+ "eval_accuracy": 0.7874015748031497,
728
+ "eval_loss": 0.498798668384552,
729
+ "eval_runtime": 3.7347,
730
+ "eval_samples_per_second": 68.012,
731
+ "eval_steps_per_second": 1.071,
732
+ "step": 667
733
+ },
734
+ {
735
+ "epoch": 29.130434782608695,
736
+ "grad_norm": 3.081833600997925,
737
+ "learning_rate": 8.347826086956522e-06,
738
+ "loss": 0.4503,
739
  "step": 670
740
  },
741
  {
742
+ "epoch": 29.565217391304348,
743
+ "grad_norm": 4.352429389953613,
744
+ "learning_rate": 8.173913043478263e-06,
745
+ "loss": 0.4674,
746
  "step": 680
747
  },
748
  {
749
+ "epoch": 30.0,
750
+ "grad_norm": 5.281393051147461,
751
+ "learning_rate": 8.000000000000001e-06,
752
+ "loss": 0.4613,
753
+ "step": 690
754
+ },
755
+ {
756
+ "epoch": 30.0,
757
+ "eval_accuracy": 0.7795275590551181,
758
+ "eval_loss": 0.49851593375205994,
759
+ "eval_runtime": 4.8766,
760
+ "eval_samples_per_second": 52.085,
761
+ "eval_steps_per_second": 0.82,
762
  "step": 690
763
  },
764
  {
765
+ "epoch": 30.434782608695652,
766
+ "grad_norm": 2.2079997062683105,
767
+ "learning_rate": 7.82608695652174e-06,
768
+ "loss": 0.4574,
769
  "step": 700
770
  },
771
  {
772
+ "epoch": 30.869565217391305,
773
+ "grad_norm": 4.6935858726501465,
774
+ "learning_rate": 7.652173913043479e-06,
775
+ "loss": 0.4675,
776
  "step": 710
777
  },
778
  {
779
+ "epoch": 31.0,
780
+ "eval_accuracy": 0.7716535433070866,
781
+ "eval_loss": 0.5060083270072937,
782
+ "eval_runtime": 3.7305,
783
+ "eval_samples_per_second": 68.087,
784
+ "eval_steps_per_second": 1.072,
785
+ "step": 713
786
  },
787
  {
788
+ "epoch": 31.304347826086957,
789
+ "grad_norm": 4.8790602684021,
790
+ "learning_rate": 7.478260869565218e-06,
791
+ "loss": 0.4802,
 
 
792
  "step": 720
793
  },
794
  {
795
+ "epoch": 31.73913043478261,
796
+ "grad_norm": 5.6365485191345215,
797
+ "learning_rate": 7.304347826086957e-06,
798
+ "loss": 0.4587,
799
  "step": 730
800
  },
801
  {
802
+ "epoch": 32.0,
803
+ "eval_accuracy": 0.7716535433070866,
804
+ "eval_loss": 0.5059147477149963,
805
+ "eval_runtime": 3.7699,
806
+ "eval_samples_per_second": 67.376,
807
+ "eval_steps_per_second": 1.061,
808
+ "step": 736
809
+ },
810
+ {
811
+ "epoch": 32.17391304347826,
812
+ "grad_norm": 5.480165004730225,
813
+ "learning_rate": 7.130434782608696e-06,
814
+ "loss": 0.4541,
815
  "step": 740
816
  },
817
  {
818
+ "epoch": 32.608695652173914,
819
+ "grad_norm": 2.053098440170288,
820
+ "learning_rate": 6.956521739130435e-06,
821
+ "loss": 0.464,
822
  "step": 750
823
  },
824
  {
825
+ "epoch": 33.0,
826
+ "eval_accuracy": 0.7795275590551181,
827
+ "eval_loss": 0.5041583180427551,
828
+ "eval_runtime": 4.9981,
829
+ "eval_samples_per_second": 50.82,
830
+ "eval_steps_per_second": 0.8,
831
+ "step": 759
832
+ },
833
+ {
834
+ "epoch": 33.04347826086956,
835
+ "grad_norm": 3.6429481506347656,
836
+ "learning_rate": 6.782608695652174e-06,
837
+ "loss": 0.454,
838
  "step": 760
839
  },
840
  {
841
+ "epoch": 33.47826086956522,
842
+ "grad_norm": 2.436143636703491,
843
+ "learning_rate": 6.6086956521739135e-06,
844
+ "loss": 0.4612,
845
  "step": 770
846
  },
847
  {
848
+ "epoch": 33.91304347826087,
849
+ "grad_norm": 2.5793776512145996,
850
+ "learning_rate": 6.434782608695652e-06,
851
+ "loss": 0.4374,
852
  "step": 780
853
  },
854
  {
855
+ "epoch": 34.0,
856
+ "eval_accuracy": 0.7677165354330708,
857
+ "eval_loss": 0.5063456296920776,
858
+ "eval_runtime": 3.7117,
859
+ "eval_samples_per_second": 68.432,
860
+ "eval_steps_per_second": 1.078,
861
+ "step": 782
862
+ },
863
+ {
864
+ "epoch": 34.34782608695652,
865
+ "grad_norm": 3.71374773979187,
866
+ "learning_rate": 6.260869565217392e-06,
867
+ "loss": 0.4667,
868
  "step": 790
869
  },
870
  {
871
+ "epoch": 34.78260869565217,
872
+ "grad_norm": 4.282368183135986,
873
+ "learning_rate": 6.086956521739132e-06,
874
+ "loss": 0.4864,
875
  "step": 800
876
  },
877
  {
878
+ "epoch": 35.0,
879
+ "eval_accuracy": 0.7677165354330708,
880
+ "eval_loss": 0.5039507150650024,
881
+ "eval_runtime": 3.6837,
882
+ "eval_samples_per_second": 68.952,
883
+ "eval_steps_per_second": 1.086,
884
+ "step": 805
885
+ },
886
+ {
887
+ "epoch": 35.21739130434783,
888
+ "grad_norm": 2.896638870239258,
889
+ "learning_rate": 5.91304347826087e-06,
890
+ "loss": 0.4922,
891
  "step": 810
892
  },
893
  {
894
+ "epoch": 35.65217391304348,
895
+ "grad_norm": 2.2342097759246826,
896
+ "learning_rate": 5.739130434782609e-06,
897
+ "loss": 0.4354,
898
  "step": 820
899
  },
900
  {
901
+ "epoch": 36.0,
902
+ "eval_accuracy": 0.7716535433070866,
903
+ "eval_loss": 0.5108994841575623,
904
+ "eval_runtime": 4.9899,
905
+ "eval_samples_per_second": 50.902,
906
+ "eval_steps_per_second": 0.802,
907
+ "step": 828
908
+ },
909
+ {
910
+ "epoch": 36.08695652173913,
911
+ "grad_norm": 8.385408401489258,
912
+ "learning_rate": 5.565217391304348e-06,
913
+ "loss": 0.4585,
914
  "step": 830
915
  },
916
  {
917
+ "epoch": 36.52173913043478,
918
+ "grad_norm": 2.839411497116089,
919
+ "learning_rate": 5.391304347826088e-06,
920
+ "loss": 0.4497,
921
  "step": 840
922
  },
923
  {
924
+ "epoch": 36.95652173913044,
925
+ "grad_norm": 2.479076623916626,
926
+ "learning_rate": 5.2173913043478265e-06,
927
+ "loss": 0.4655,
928
  "step": 850
929
  },
930
  {
931
+ "epoch": 37.0,
932
+ "eval_accuracy": 0.7716535433070866,
933
+ "eval_loss": 0.510716438293457,
934
+ "eval_runtime": 3.6997,
935
+ "eval_samples_per_second": 68.653,
936
+ "eval_steps_per_second": 1.081,
937
+ "step": 851
938
+ },
939
+ {
940
+ "epoch": 37.391304347826086,
941
+ "grad_norm": 2.271686553955078,
942
+ "learning_rate": 5.043478260869565e-06,
943
+ "loss": 0.4462,
944
  "step": 860
945
  },
946
  {
947
+ "epoch": 37.82608695652174,
948
+ "grad_norm": 3.4210402965545654,
949
+ "learning_rate": 4.869565217391305e-06,
950
+ "loss": 0.4691,
951
  "step": 870
952
  },
953
  {
954
+ "epoch": 38.0,
955
+ "eval_accuracy": 0.7677165354330708,
956
+ "eval_loss": 0.5093376636505127,
957
+ "eval_runtime": 3.7287,
958
+ "eval_samples_per_second": 68.119,
959
+ "eval_steps_per_second": 1.073,
960
+ "step": 874
961
+ },
962
+ {
963
+ "epoch": 38.26086956521739,
964
+ "grad_norm": 5.694761276245117,
965
+ "learning_rate": 4.695652173913044e-06,
966
+ "loss": 0.4592,
967
  "step": 880
968
  },
969
  {
970
+ "epoch": 38.69565217391305,
971
+ "grad_norm": 2.2949883937835693,
972
+ "learning_rate": 4.5217391304347826e-06,
973
+ "loss": 0.4826,
974
  "step": 890
975
  },
976
  {
977
+ "epoch": 39.0,
978
+ "eval_accuracy": 0.7716535433070866,
979
+ "eval_loss": 0.5044277906417847,
980
+ "eval_runtime": 4.9781,
981
+ "eval_samples_per_second": 51.024,
982
+ "eval_steps_per_second": 0.804,
983
+ "step": 897
984
  },
985
  {
986
+ "epoch": 39.130434782608695,
987
+ "grad_norm": 3.4144210815429688,
988
+ "learning_rate": 4.347826086956522e-06,
989
+ "loss": 0.4407,
 
 
990
  "step": 900
991
  },
992
  {
993
+ "epoch": 39.56521739130435,
994
+ "grad_norm": 2.22868013381958,
995
+ "learning_rate": 4.173913043478261e-06,
996
+ "loss": 0.4482,
997
+ "step": 910
998
+ },
999
+ {
1000
+ "epoch": 40.0,
1001
+ "grad_norm": 3.2193689346313477,
1002
+ "learning_rate": 4.000000000000001e-06,
1003
+ "loss": 0.4577,
1004
+ "step": 920
1005
+ },
1006
+ {
1007
+ "epoch": 40.0,
1008
+ "eval_accuracy": 0.7795275590551181,
1009
+ "eval_loss": 0.4999626874923706,
1010
+ "eval_runtime": 3.6952,
1011
+ "eval_samples_per_second": 68.738,
1012
+ "eval_steps_per_second": 1.082,
1013
+ "step": 920
1014
+ },
1015
+ {
1016
+ "epoch": 40.43478260869565,
1017
+ "grad_norm": 4.500718593597412,
1018
+ "learning_rate": 3.8260869565217395e-06,
1019
+ "loss": 0.4585,
1020
+ "step": 930
1021
+ },
1022
+ {
1023
+ "epoch": 40.869565217391305,
1024
+ "grad_norm": 1.9281222820281982,
1025
+ "learning_rate": 3.6521739130434787e-06,
1026
+ "loss": 0.4636,
1027
+ "step": 940
1028
+ },
1029
+ {
1030
+ "epoch": 41.0,
1031
+ "eval_accuracy": 0.7716535433070866,
1032
+ "eval_loss": 0.4962589144706726,
1033
+ "eval_runtime": 3.6977,
1034
+ "eval_samples_per_second": 68.69,
1035
+ "eval_steps_per_second": 1.082,
1036
+ "step": 943
1037
+ },
1038
+ {
1039
+ "epoch": 41.30434782608695,
1040
+ "grad_norm": 2.193452835083008,
1041
+ "learning_rate": 3.4782608695652175e-06,
1042
+ "loss": 0.4306,
1043
+ "step": 950
1044
+ },
1045
+ {
1046
+ "epoch": 41.73913043478261,
1047
+ "grad_norm": 2.2370336055755615,
1048
+ "learning_rate": 3.3043478260869567e-06,
1049
+ "loss": 0.4361,
1050
+ "step": 960
1051
+ },
1052
+ {
1053
+ "epoch": 42.0,
1054
+ "eval_accuracy": 0.7716535433070866,
1055
+ "eval_loss": 0.4958040118217468,
1056
+ "eval_runtime": 4.9548,
1057
+ "eval_samples_per_second": 51.264,
1058
+ "eval_steps_per_second": 0.807,
1059
+ "step": 966
1060
+ },
1061
+ {
1062
+ "epoch": 42.17391304347826,
1063
+ "grad_norm": 3.6354355812072754,
1064
+ "learning_rate": 3.130434782608696e-06,
1065
+ "loss": 0.4514,
1066
+ "step": 970
1067
+ },
1068
+ {
1069
+ "epoch": 42.608695652173914,
1070
+ "grad_norm": 1.8955118656158447,
1071
+ "learning_rate": 2.956521739130435e-06,
1072
+ "loss": 0.4534,
1073
+ "step": 980
1074
+ },
1075
+ {
1076
+ "epoch": 43.0,
1077
+ "eval_accuracy": 0.7795275590551181,
1078
+ "eval_loss": 0.5007808208465576,
1079
+ "eval_runtime": 3.7121,
1080
+ "eval_samples_per_second": 68.424,
1081
+ "eval_steps_per_second": 1.078,
1082
+ "step": 989
1083
+ },
1084
+ {
1085
+ "epoch": 43.04347826086956,
1086
+ "grad_norm": 2.2034902572631836,
1087
+ "learning_rate": 2.782608695652174e-06,
1088
+ "loss": 0.4176,
1089
+ "step": 990
1090
+ },
1091
+ {
1092
+ "epoch": 43.47826086956522,
1093
+ "grad_norm": 4.387076377868652,
1094
+ "learning_rate": 2.6086956521739132e-06,
1095
+ "loss": 0.4748,
1096
+ "step": 1000
1097
+ },
1098
+ {
1099
+ "epoch": 43.91304347826087,
1100
+ "grad_norm": 5.444644927978516,
1101
+ "learning_rate": 2.4347826086956525e-06,
1102
+ "loss": 0.4559,
1103
+ "step": 1010
1104
+ },
1105
+ {
1106
+ "epoch": 44.0,
1107
+ "eval_accuracy": 0.7795275590551181,
1108
+ "eval_loss": 0.5025174021720886,
1109
+ "eval_runtime": 3.7093,
1110
+ "eval_samples_per_second": 68.476,
1111
+ "eval_steps_per_second": 1.078,
1112
+ "step": 1012
1113
+ },
1114
+ {
1115
+ "epoch": 44.34782608695652,
1116
+ "grad_norm": 2.2067017555236816,
1117
+ "learning_rate": 2.2608695652173913e-06,
1118
+ "loss": 0.4882,
1119
+ "step": 1020
1120
+ },
1121
+ {
1122
+ "epoch": 44.78260869565217,
1123
+ "grad_norm": 3.562736988067627,
1124
+ "learning_rate": 2.0869565217391305e-06,
1125
+ "loss": 0.4189,
1126
+ "step": 1030
1127
+ },
1128
+ {
1129
+ "epoch": 45.0,
1130
+ "eval_accuracy": 0.7755905511811023,
1131
+ "eval_loss": 0.5014046430587769,
1132
+ "eval_runtime": 4.9992,
1133
+ "eval_samples_per_second": 50.808,
1134
+ "eval_steps_per_second": 0.8,
1135
+ "step": 1035
1136
+ },
1137
+ {
1138
+ "epoch": 45.21739130434783,
1139
+ "grad_norm": 10.402663230895996,
1140
+ "learning_rate": 1.9130434782608697e-06,
1141
+ "loss": 0.4432,
1142
+ "step": 1040
1143
+ },
1144
+ {
1145
+ "epoch": 45.65217391304348,
1146
+ "grad_norm": 4.949878215789795,
1147
+ "learning_rate": 1.7391304347826088e-06,
1148
+ "loss": 0.4861,
1149
+ "step": 1050
1150
+ },
1151
+ {
1152
+ "epoch": 46.0,
1153
+ "eval_accuracy": 0.7677165354330708,
1154
+ "eval_loss": 0.5003762245178223,
1155
+ "eval_runtime": 3.7019,
1156
+ "eval_samples_per_second": 68.614,
1157
+ "eval_steps_per_second": 1.081,
1158
+ "step": 1058
1159
+ },
1160
+ {
1161
+ "epoch": 46.08695652173913,
1162
+ "grad_norm": 1.938593864440918,
1163
+ "learning_rate": 1.565217391304348e-06,
1164
+ "loss": 0.4326,
1165
+ "step": 1060
1166
+ },
1167
+ {
1168
+ "epoch": 46.52173913043478,
1169
+ "grad_norm": 3.236699342727661,
1170
+ "learning_rate": 1.391304347826087e-06,
1171
+ "loss": 0.4726,
1172
+ "step": 1070
1173
+ },
1174
+ {
1175
+ "epoch": 46.95652173913044,
1176
+ "grad_norm": 3.047184944152832,
1177
+ "learning_rate": 1.2173913043478262e-06,
1178
+ "loss": 0.4709,
1179
+ "step": 1080
1180
+ },
1181
+ {
1182
+ "epoch": 47.0,
1183
+ "eval_accuracy": 0.7795275590551181,
1184
+ "eval_loss": 0.5004997849464417,
1185
+ "eval_runtime": 3.7143,
1186
+ "eval_samples_per_second": 68.384,
1187
+ "eval_steps_per_second": 1.077,
1188
+ "step": 1081
1189
+ },
1190
+ {
1191
+ "epoch": 47.391304347826086,
1192
+ "grad_norm": 2.8639461994171143,
1193
+ "learning_rate": 1.0434782608695653e-06,
1194
+ "loss": 0.4649,
1195
+ "step": 1090
1196
+ },
1197
+ {
1198
+ "epoch": 47.82608695652174,
1199
+ "grad_norm": 3.7704715728759766,
1200
+ "learning_rate": 8.695652173913044e-07,
1201
+ "loss": 0.4726,
1202
+ "step": 1100
1203
+ },
1204
+ {
1205
+ "epoch": 48.0,
1206
+ "eval_accuracy": 0.7716535433070866,
1207
+ "eval_loss": 0.5007592439651489,
1208
+ "eval_runtime": 4.8498,
1209
+ "eval_samples_per_second": 52.373,
1210
+ "eval_steps_per_second": 0.825,
1211
+ "step": 1104
1212
+ },
1213
+ {
1214
+ "epoch": 48.26086956521739,
1215
+ "grad_norm": 4.941337585449219,
1216
+ "learning_rate": 6.956521739130435e-07,
1217
+ "loss": 0.4314,
1218
+ "step": 1110
1219
+ },
1220
+ {
1221
+ "epoch": 48.69565217391305,
1222
+ "grad_norm": 3.2265655994415283,
1223
+ "learning_rate": 5.217391304347826e-07,
1224
+ "loss": 0.4441,
1225
+ "step": 1120
1226
+ },
1227
+ {
1228
+ "epoch": 49.0,
1229
+ "eval_accuracy": 0.7755905511811023,
1230
+ "eval_loss": 0.4987953305244446,
1231
+ "eval_runtime": 3.6681,
1232
+ "eval_samples_per_second": 69.246,
1233
+ "eval_steps_per_second": 1.09,
1234
+ "step": 1127
1235
+ },
1236
+ {
1237
+ "epoch": 49.130434782608695,
1238
+ "grad_norm": 3.7678611278533936,
1239
+ "learning_rate": 3.4782608695652175e-07,
1240
+ "loss": 0.4571,
1241
+ "step": 1130
1242
+ },
1243
+ {
1244
+ "epoch": 49.56521739130435,
1245
+ "grad_norm": 3.657460927963257,
1246
+ "learning_rate": 1.7391304347826088e-07,
1247
+ "loss": 0.4558,
1248
+ "step": 1140
1249
+ },
1250
+ {
1251
+ "epoch": 50.0,
1252
+ "grad_norm": 3.096832513809204,
1253
+ "learning_rate": 0.0,
1254
+ "loss": 0.4579,
1255
+ "step": 1150
1256
+ },
1257
+ {
1258
+ "epoch": 50.0,
1259
+ "eval_accuracy": 0.7755905511811023,
1260
+ "eval_loss": 0.499985009431839,
1261
+ "eval_runtime": 3.8189,
1262
+ "eval_samples_per_second": 66.512,
1263
+ "eval_steps_per_second": 1.047,
1264
+ "step": 1150
1265
+ },
1266
+ {
1267
+ "epoch": 50.0,
1268
+ "step": 1150,
1269
+ "total_flos": 2.72467378584576e+17,
1270
+ "train_loss": 0.4791690407628598,
1271
+ "train_runtime": 1616.3187,
1272
+ "train_samples_per_second": 44.391,
1273
+ "train_steps_per_second": 0.711
1274
  }
1275
  ],
1276
  "logging_steps": 10,
1277
+ "max_steps": 1150,
1278
  "num_input_tokens_seen": 0,
1279
+ "num_train_epochs": 50,
1280
  "save_steps": 500,
1281
  "stateful_callbacks": {
1282
  "TrainerControl": {
 
1290
  "attributes": {}
1291
  }
1292
  },
1293
+ "total_flos": 2.72467378584576e+17,
1294
+ "train_batch_size": 64,
1295
  "trial_name": null,
1296
  "trial_params": null
1297
  }