amazingvince committed on
Commit
b43c5de
1 Parent(s): 579e08b

End of training

Browse files
Files changed (5) hide show
  1. README.md +14 -2
  2. all_results.json +10 -10
  3. eval_results.json +6 -6
  4. train_results.json +4 -4
  5. trainer_state.json +714 -714
README.md CHANGED
@@ -1,11 +1,23 @@
1
  ---
2
  tags:
3
  - generated_from_trainer
 
 
4
  metrics:
5
  - accuracy
6
  model-index:
7
  - name: bitllama-goodwiki
8
- results: []
 
 
 
 
 
 
 
 
 
 
9
  ---
10
 
11
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -13,7 +25,7 @@ should probably proofread and complete it, then remove this comment. -->
13
 
14
  # bitllama-goodwiki
15
 
16
- This model was trained from scratch on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
  - Loss: 3.0525
19
  - Accuracy: 0.4285
 
1
  ---
2
  tags:
3
  - generated_from_trainer
4
+ datasets:
5
+ - BEE-spoke-data/goodwiki-deduped-split
6
  metrics:
7
  - accuracy
8
  model-index:
9
  - name: bitllama-goodwiki
10
+ results:
11
+ - task:
12
+ name: Causal Language Modeling
13
+ type: text-generation
14
+ dataset:
15
+ name: BEE-spoke-data/goodwiki-deduped-split
16
+ type: BEE-spoke-data/goodwiki-deduped-split
17
+ metrics:
18
+ - name: Accuracy
19
+ type: accuracy
20
+ value: 0.4285134482793542
21
  ---
22
 
23
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
25
 
26
  # bitllama-goodwiki
27
 
28
+ This model was trained from scratch on the BEE-spoke-data/goodwiki-deduped-split dataset.
29
  It achieves the following results on the evaluation set:
30
  - Loss: 3.0525
31
  - Accuracy: 0.4285
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_accuracy": 0.40893571648684346,
4
- "eval_loss": 3.1802382469177246,
5
- "eval_runtime": 158.447,
6
  "eval_samples": 5065,
7
- "eval_samples_per_second": 31.967,
8
- "eval_steps_per_second": 4.001,
9
- "perplexity": 24.052483299499787,
10
- "train_loss": 3.8723633342356467,
11
- "train_runtime": 7855.38,
12
  "train_samples": 90649,
13
- "train_samples_per_second": 11.54,
14
- "train_steps_per_second": 0.361
15
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_accuracy": 0.4285134482793542,
4
+ "eval_loss": 3.0525197982788086,
5
+ "eval_runtime": 151.0776,
6
  "eval_samples": 5065,
7
+ "eval_samples_per_second": 33.526,
8
+ "eval_steps_per_second": 4.197,
9
+ "perplexity": 21.168617922121435,
10
+ "train_loss": 3.7639108537323183,
11
+ "train_runtime": 10013.9285,
12
  "train_samples": 90649,
13
+ "train_samples_per_second": 9.052,
14
+ "train_steps_per_second": 0.283
15
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_accuracy": 0.40893571648684346,
4
- "eval_loss": 3.1802382469177246,
5
- "eval_runtime": 158.447,
6
  "eval_samples": 5065,
7
- "eval_samples_per_second": 31.967,
8
- "eval_steps_per_second": 4.001,
9
- "perplexity": 24.052483299499787
10
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_accuracy": 0.4285134482793542,
4
+ "eval_loss": 3.0525197982788086,
5
+ "eval_runtime": 151.0776,
6
  "eval_samples": 5065,
7
+ "eval_samples_per_second": 33.526,
8
+ "eval_steps_per_second": 4.197,
9
+ "perplexity": 21.168617922121435
10
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 3.8723633342356467,
4
- "train_runtime": 7855.38,
5
  "train_samples": 90649,
6
- "train_samples_per_second": 11.54,
7
- "train_steps_per_second": 0.361
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 3.7639108537323183,
4
+ "train_runtime": 10013.9285,
5
  "train_samples": 90649,
6
+ "train_samples_per_second": 9.052,
7
+ "train_steps_per_second": 0.283
8
  }
trainer_state.json CHANGED
@@ -10,1968 +10,1968 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 0.0008,
14
- "loss": 10.5836,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.0,
19
- "learning_rate": 0.0008,
20
- "loss": 7.9958,
21
  "step": 10
22
  },
23
  {
24
  "epoch": 0.01,
25
- "learning_rate": 0.0008,
26
- "loss": 7.3759,
27
  "step": 20
28
  },
29
  {
30
  "epoch": 0.01,
31
- "learning_rate": 0.0008,
32
- "loss": 7.0127,
33
  "step": 30
34
  },
35
  {
36
  "epoch": 0.01,
37
- "learning_rate": 0.0008,
38
- "loss": 6.7344,
39
  "step": 40
40
  },
41
  {
42
  "epoch": 0.02,
43
- "learning_rate": 0.0008,
44
- "loss": 6.4652,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.02,
49
- "learning_rate": 0.0008,
50
- "loss": 6.2654,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.02,
55
- "learning_rate": 0.0008,
56
- "loss": 6.15,
57
  "step": 70
58
  },
59
  {
60
  "epoch": 0.03,
61
- "learning_rate": 0.0008,
62
- "loss": 6.0468,
63
  "step": 80
64
  },
65
  {
66
  "epoch": 0.03,
67
- "learning_rate": 0.0008,
68
- "loss": 6.0073,
69
  "step": 90
70
  },
71
  {
72
  "epoch": 0.04,
73
  "learning_rate": 0.0008,
74
- "loss": 5.8703,
75
  "step": 100
76
  },
77
  {
78
  "epoch": 0.04,
79
- "eval_accuracy": 0.16497375833750882,
80
- "eval_loss": 5.843543529510498,
81
- "eval_runtime": 134.6754,
82
- "eval_samples_per_second": 37.609,
83
- "eval_steps_per_second": 4.708,
84
  "step": 100
85
  },
86
  {
87
  "epoch": 0.04,
88
- "learning_rate": 0.0008,
89
- "loss": 5.8128,
90
  "step": 110
91
  },
92
  {
93
  "epoch": 0.04,
94
- "learning_rate": 0.0008,
95
- "loss": 5.7662,
96
  "step": 120
97
  },
98
  {
99
  "epoch": 0.05,
100
- "learning_rate": 0.0008,
101
- "loss": 5.7228,
102
  "step": 130
103
  },
104
  {
105
  "epoch": 0.05,
106
- "learning_rate": 0.0008,
107
- "loss": 5.6553,
108
  "step": 140
109
  },
110
  {
111
  "epoch": 0.05,
112
- "learning_rate": 0.0008,
113
- "loss": 5.559,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.06,
118
- "learning_rate": 0.0008,
119
- "loss": 5.4848,
120
  "step": 160
121
  },
122
  {
123
  "epoch": 0.06,
124
- "learning_rate": 0.0008,
125
- "loss": 5.3898,
126
  "step": 170
127
  },
128
  {
129
  "epoch": 0.06,
130
- "learning_rate": 0.0008,
131
- "loss": 5.3722,
132
  "step": 180
133
  },
134
  {
135
  "epoch": 0.07,
136
- "learning_rate": 0.0008,
137
- "loss": 5.3227,
138
  "step": 190
139
  },
140
  {
141
  "epoch": 0.07,
142
- "learning_rate": 0.0008,
143
- "loss": 5.2405,
144
  "step": 200
145
  },
146
  {
147
  "epoch": 0.07,
148
- "eval_accuracy": 0.21142017475794642,
149
- "eval_loss": 5.181127548217773,
150
- "eval_runtime": 136.0594,
151
- "eval_samples_per_second": 37.226,
152
- "eval_steps_per_second": 4.66,
153
  "step": 200
154
  },
155
  {
156
  "epoch": 0.07,
157
- "learning_rate": 0.0008,
158
- "loss": 5.1793,
159
  "step": 210
160
  },
161
  {
162
  "epoch": 0.08,
163
- "learning_rate": 0.0008,
164
- "loss": 5.1427,
165
  "step": 220
166
  },
167
  {
168
  "epoch": 0.08,
169
- "learning_rate": 0.0008,
170
- "loss": 5.1075,
171
  "step": 230
172
  },
173
  {
174
  "epoch": 0.08,
175
- "learning_rate": 0.0008,
176
- "loss": 5.0223,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.09,
181
- "learning_rate": 0.0008,
182
- "loss": 5.0391,
183
  "step": 250
184
  },
185
  {
186
  "epoch": 0.09,
187
- "learning_rate": 0.0008,
188
- "loss": 4.9901,
189
  "step": 260
190
  },
191
  {
192
  "epoch": 0.1,
193
- "learning_rate": 0.0008,
194
- "loss": 4.8777,
195
  "step": 270
196
  },
197
  {
198
  "epoch": 0.1,
199
- "learning_rate": 0.0008,
200
- "loss": 4.8738,
201
  "step": 280
202
  },
203
  {
204
  "epoch": 0.1,
205
- "learning_rate": 0.0008,
206
- "loss": 4.8455,
207
  "step": 290
208
  },
209
  {
210
  "epoch": 0.11,
211
- "learning_rate": 0.0008,
212
- "loss": 4.8345,
213
  "step": 300
214
  },
215
  {
216
  "epoch": 0.11,
217
- "eval_accuracy": 0.2410882272518809,
218
- "eval_loss": 4.779561996459961,
219
- "eval_runtime": 135.467,
220
- "eval_samples_per_second": 37.389,
221
- "eval_steps_per_second": 4.68,
222
  "step": 300
223
  },
224
  {
225
  "epoch": 0.11,
226
- "learning_rate": 0.0008,
227
- "loss": 4.7863,
228
  "step": 310
229
  },
230
  {
231
  "epoch": 0.11,
232
- "learning_rate": 0.0008,
233
- "loss": 4.772,
234
  "step": 320
235
  },
236
  {
237
  "epoch": 0.12,
238
- "learning_rate": 0.0008,
239
- "loss": 4.7222,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.12,
244
- "learning_rate": 0.0008,
245
- "loss": 4.6841,
246
  "step": 340
247
  },
248
  {
249
  "epoch": 0.12,
250
- "learning_rate": 0.0008,
251
- "loss": 4.6861,
252
  "step": 350
253
  },
254
  {
255
  "epoch": 0.13,
256
- "learning_rate": 0.0008,
257
- "loss": 4.6413,
258
  "step": 360
259
  },
260
  {
261
  "epoch": 0.13,
262
- "learning_rate": 0.0008,
263
- "loss": 4.6305,
264
  "step": 370
265
  },
266
  {
267
  "epoch": 0.13,
268
- "learning_rate": 0.0008,
269
- "loss": 4.5794,
270
  "step": 380
271
  },
272
  {
273
  "epoch": 0.14,
274
- "learning_rate": 0.0008,
275
- "loss": 4.5664,
276
  "step": 390
277
  },
278
  {
279
  "epoch": 0.14,
280
- "learning_rate": 0.0008,
281
- "loss": 4.5823,
282
  "step": 400
283
  },
284
  {
285
  "epoch": 0.14,
286
- "eval_accuracy": 0.2612630816483902,
287
- "eval_loss": 4.523487091064453,
288
- "eval_runtime": 136.8619,
289
- "eval_samples_per_second": 37.008,
290
- "eval_steps_per_second": 4.632,
291
  "step": 400
292
  },
293
  {
294
  "epoch": 0.14,
295
- "learning_rate": 0.0008,
296
- "loss": 4.5246,
297
  "step": 410
298
  },
299
  {
300
  "epoch": 0.15,
301
- "learning_rate": 0.0008,
302
- "loss": 4.4874,
303
  "step": 420
304
  },
305
  {
306
  "epoch": 0.15,
307
- "learning_rate": 0.0008,
308
- "loss": 4.5041,
309
  "step": 430
310
  },
311
  {
312
  "epoch": 0.16,
313
- "learning_rate": 0.0008,
314
- "loss": 4.4828,
315
  "step": 440
316
  },
317
  {
318
  "epoch": 0.16,
319
- "learning_rate": 0.0008,
320
- "loss": 4.5062,
321
  "step": 450
322
  },
323
  {
324
  "epoch": 0.16,
325
- "learning_rate": 0.0008,
326
- "loss": 4.4316,
327
  "step": 460
328
  },
329
  {
330
  "epoch": 0.17,
331
- "learning_rate": 0.0008,
332
- "loss": 4.3892,
333
  "step": 470
334
  },
335
  {
336
  "epoch": 0.17,
337
- "learning_rate": 0.0008,
338
- "loss": 4.4085,
339
  "step": 480
340
  },
341
  {
342
  "epoch": 0.17,
343
- "learning_rate": 0.0008,
344
- "loss": 4.3641,
345
  "step": 490
346
  },
347
  {
348
  "epoch": 0.18,
349
- "learning_rate": 0.0008,
350
- "loss": 4.3576,
351
  "step": 500
352
  },
353
  {
354
  "epoch": 0.18,
355
- "eval_accuracy": 0.27608254392940623,
356
- "eval_loss": 4.34656286239624,
357
- "eval_runtime": 135.1659,
358
- "eval_samples_per_second": 37.472,
359
- "eval_steps_per_second": 4.691,
360
  "step": 500
361
  },
362
  {
363
  "epoch": 0.18,
364
- "learning_rate": 0.0008,
365
- "loss": 4.3959,
366
  "step": 510
367
  },
368
  {
369
  "epoch": 0.18,
370
- "learning_rate": 0.0008,
371
- "loss": 4.3741,
372
  "step": 520
373
  },
374
  {
375
  "epoch": 0.19,
376
- "learning_rate": 0.0008,
377
- "loss": 4.3384,
378
  "step": 530
379
  },
380
  {
381
  "epoch": 0.19,
382
- "learning_rate": 0.0008,
383
- "loss": 4.3328,
384
  "step": 540
385
  },
386
  {
387
  "epoch": 0.19,
388
- "learning_rate": 0.0008,
389
- "loss": 4.2581,
390
  "step": 550
391
  },
392
  {
393
  "epoch": 0.2,
394
- "learning_rate": 0.0008,
395
- "loss": 4.2962,
396
  "step": 560
397
  },
398
  {
399
  "epoch": 0.2,
400
- "learning_rate": 0.0008,
401
- "loss": 4.2691,
402
  "step": 570
403
  },
404
  {
405
  "epoch": 0.2,
406
- "learning_rate": 0.0008,
407
- "loss": 4.2617,
408
  "step": 580
409
  },
410
  {
411
  "epoch": 0.21,
412
- "learning_rate": 0.0008,
413
- "loss": 4.2339,
414
  "step": 590
415
  },
416
  {
417
  "epoch": 0.21,
418
- "learning_rate": 0.0008,
419
- "loss": 4.2139,
420
  "step": 600
421
  },
422
  {
423
  "epoch": 0.21,
424
- "eval_accuracy": 0.28767700402823865,
425
- "eval_loss": 4.215435981750488,
426
- "eval_runtime": 135.5466,
427
- "eval_samples_per_second": 37.367,
428
- "eval_steps_per_second": 4.677,
429
  "step": 600
430
  },
431
  {
432
  "epoch": 0.22,
433
- "learning_rate": 0.0008,
434
- "loss": 4.1746,
435
  "step": 610
436
  },
437
  {
438
  "epoch": 0.22,
439
- "learning_rate": 0.0008,
440
- "loss": 4.1812,
441
  "step": 620
442
  },
443
  {
444
  "epoch": 0.22,
445
- "learning_rate": 0.0008,
446
- "loss": 4.2263,
447
  "step": 630
448
  },
449
  {
450
  "epoch": 0.23,
451
- "learning_rate": 0.0008,
452
- "loss": 4.1972,
453
  "step": 640
454
  },
455
  {
456
  "epoch": 0.23,
457
- "learning_rate": 0.0008,
458
- "loss": 4.121,
459
  "step": 650
460
  },
461
  {
462
  "epoch": 0.23,
463
- "learning_rate": 0.0008,
464
- "loss": 4.2055,
465
  "step": 660
466
  },
467
  {
468
  "epoch": 0.24,
469
- "learning_rate": 0.0008,
470
- "loss": 4.1756,
471
  "step": 670
472
  },
473
  {
474
  "epoch": 0.24,
475
- "learning_rate": 0.0008,
476
- "loss": 4.1643,
477
  "step": 680
478
  },
479
  {
480
  "epoch": 0.24,
481
- "learning_rate": 0.0008,
482
- "loss": 4.1494,
483
  "step": 690
484
  },
485
  {
486
  "epoch": 0.25,
487
- "learning_rate": 0.0008,
488
- "loss": 4.1117,
489
  "step": 700
490
  },
491
  {
492
  "epoch": 0.25,
493
- "eval_accuracy": 0.29982421968247663,
494
- "eval_loss": 4.101539134979248,
495
- "eval_runtime": 136.1806,
496
- "eval_samples_per_second": 37.193,
497
- "eval_steps_per_second": 4.656,
498
  "step": 700
499
  },
500
  {
501
  "epoch": 0.25,
502
- "learning_rate": 0.0008,
503
- "loss": 4.1379,
504
  "step": 710
505
  },
506
  {
507
  "epoch": 0.25,
508
- "learning_rate": 0.0008,
509
- "loss": 4.1173,
510
  "step": 720
511
  },
512
  {
513
  "epoch": 0.26,
514
- "learning_rate": 0.0008,
515
- "loss": 4.1307,
516
  "step": 730
517
  },
518
  {
519
  "epoch": 0.26,
520
- "learning_rate": 0.0008,
521
- "loss": 4.1291,
522
  "step": 740
523
  },
524
  {
525
  "epoch": 0.26,
526
- "learning_rate": 0.0008,
527
- "loss": 4.0763,
528
  "step": 750
529
  },
530
  {
531
  "epoch": 0.27,
532
- "learning_rate": 0.0008,
533
- "loss": 4.0861,
534
  "step": 760
535
  },
536
  {
537
  "epoch": 0.27,
538
- "learning_rate": 0.0008,
539
- "loss": 4.0251,
540
  "step": 770
541
  },
542
  {
543
  "epoch": 0.28,
544
- "learning_rate": 0.0008,
545
- "loss": 4.0384,
546
  "step": 780
547
  },
548
  {
549
  "epoch": 0.28,
550
- "learning_rate": 0.0008,
551
- "loss": 4.0343,
552
  "step": 790
553
  },
554
  {
555
  "epoch": 0.28,
556
- "learning_rate": 0.0008,
557
- "loss": 4.0243,
558
  "step": 800
559
  },
560
  {
561
  "epoch": 0.28,
562
- "eval_accuracy": 0.3069846755249659,
563
- "eval_loss": 4.026556491851807,
564
- "eval_runtime": 135.6288,
565
- "eval_samples_per_second": 37.345,
566
- "eval_steps_per_second": 4.675,
567
  "step": 800
568
  },
569
  {
570
  "epoch": 0.29,
571
- "learning_rate": 0.0008,
572
- "loss": 4.0532,
573
  "step": 810
574
  },
575
  {
576
  "epoch": 0.29,
577
- "learning_rate": 0.0008,
578
- "loss": 4.0039,
579
  "step": 820
580
  },
581
  {
582
  "epoch": 0.29,
583
- "learning_rate": 0.0008,
584
- "loss": 4.0164,
585
  "step": 830
586
  },
587
  {
588
  "epoch": 0.3,
589
- "learning_rate": 0.0008,
590
- "loss": 4.0274,
591
  "step": 840
592
  },
593
  {
594
  "epoch": 0.3,
595
- "learning_rate": 0.0008,
596
- "loss": 4.0533,
597
  "step": 850
598
  },
599
  {
600
  "epoch": 0.3,
601
- "learning_rate": 0.0008,
602
- "loss": 4.0149,
603
  "step": 860
604
  },
605
  {
606
  "epoch": 0.31,
607
- "learning_rate": 0.0008,
608
- "loss": 4.0083,
609
  "step": 870
610
  },
611
  {
612
  "epoch": 0.31,
613
- "learning_rate": 0.0008,
614
- "loss": 3.9943,
615
  "step": 880
616
  },
617
  {
618
  "epoch": 0.31,
619
- "learning_rate": 0.0008,
620
- "loss": 4.0045,
621
  "step": 890
622
  },
623
  {
624
  "epoch": 0.32,
625
- "learning_rate": 0.0008,
626
- "loss": 3.9702,
627
  "step": 900
628
  },
629
  {
630
  "epoch": 0.32,
631
- "eval_accuracy": 0.3145666183290887,
632
- "eval_loss": 3.9495697021484375,
633
- "eval_runtime": 136.0896,
634
- "eval_samples_per_second": 37.218,
635
- "eval_steps_per_second": 4.659,
636
  "step": 900
637
  },
638
  {
639
  "epoch": 0.32,
640
- "learning_rate": 0.0008,
641
- "loss": 3.9303,
642
  "step": 910
643
  },
644
  {
645
  "epoch": 0.32,
646
- "learning_rate": 0.0008,
647
- "loss": 3.91,
648
  "step": 920
649
  },
650
  {
651
  "epoch": 0.33,
652
- "learning_rate": 0.0008,
653
- "loss": 3.9182,
654
  "step": 930
655
  },
656
  {
657
  "epoch": 0.33,
658
- "learning_rate": 0.0008,
659
- "loss": 3.9482,
660
  "step": 940
661
  },
662
  {
663
  "epoch": 0.34,
664
- "learning_rate": 0.0008,
665
- "loss": 3.9333,
666
  "step": 950
667
  },
668
  {
669
  "epoch": 0.34,
670
- "learning_rate": 0.0008,
671
- "loss": 3.9235,
672
  "step": 960
673
  },
674
  {
675
  "epoch": 0.34,
676
- "learning_rate": 0.0008,
677
- "loss": 3.9282,
678
  "step": 970
679
  },
680
  {
681
  "epoch": 0.35,
682
- "learning_rate": 0.0008,
683
- "loss": 3.9222,
684
  "step": 980
685
  },
686
  {
687
  "epoch": 0.35,
688
- "learning_rate": 0.0008,
689
- "loss": 3.9284,
690
  "step": 990
691
  },
692
  {
693
  "epoch": 0.35,
694
- "learning_rate": 0.0008,
695
- "loss": 3.8875,
696
  "step": 1000
697
  },
698
  {
699
  "epoch": 0.35,
700
- "eval_accuracy": 0.32133616189343134,
701
- "eval_loss": 3.88588809967041,
702
- "eval_runtime": 137.0008,
703
- "eval_samples_per_second": 36.971,
704
- "eval_steps_per_second": 4.628,
705
  "step": 1000
706
  },
707
  {
708
  "epoch": 0.36,
709
- "learning_rate": 0.0008,
710
- "loss": 3.9029,
711
  "step": 1010
712
  },
713
  {
714
  "epoch": 0.36,
715
- "learning_rate": 0.0008,
716
- "loss": 3.9049,
717
  "step": 1020
718
  },
719
  {
720
  "epoch": 0.36,
721
- "learning_rate": 0.0008,
722
- "loss": 3.8323,
723
  "step": 1030
724
  },
725
  {
726
  "epoch": 0.37,
727
- "learning_rate": 0.0008,
728
- "loss": 3.8533,
729
  "step": 1040
730
  },
731
  {
732
  "epoch": 0.37,
733
- "learning_rate": 0.0008,
734
- "loss": 3.8698,
735
  "step": 1050
736
  },
737
  {
738
  "epoch": 0.37,
739
- "learning_rate": 0.0008,
740
- "loss": 3.8734,
741
  "step": 1060
742
  },
743
  {
744
  "epoch": 0.38,
745
- "learning_rate": 0.0008,
746
- "loss": 3.8485,
747
  "step": 1070
748
  },
749
  {
750
  "epoch": 0.38,
751
- "learning_rate": 0.0008,
752
- "loss": 3.8412,
753
  "step": 1080
754
  },
755
  {
756
  "epoch": 0.38,
757
- "learning_rate": 0.0008,
758
- "loss": 3.8368,
759
  "step": 1090
760
  },
761
  {
762
  "epoch": 0.39,
763
- "learning_rate": 0.0008,
764
- "loss": 3.8557,
765
  "step": 1100
766
  },
767
  {
768
  "epoch": 0.39,
769
- "eval_accuracy": 0.3289136679926949,
770
- "eval_loss": 3.8198728561401367,
771
- "eval_runtime": 136.5004,
772
- "eval_samples_per_second": 37.106,
773
- "eval_steps_per_second": 4.645,
774
  "step": 1100
775
  },
776
  {
777
  "epoch": 0.39,
778
- "learning_rate": 0.0008,
779
- "loss": 3.8311,
780
  "step": 1110
781
  },
782
  {
783
  "epoch": 0.4,
784
- "learning_rate": 0.0008,
785
- "loss": 3.7672,
786
  "step": 1120
787
  },
788
  {
789
  "epoch": 0.4,
790
- "learning_rate": 0.0008,
791
- "loss": 3.8207,
792
  "step": 1130
793
  },
794
  {
795
  "epoch": 0.4,
796
- "learning_rate": 0.0008,
797
- "loss": 3.821,
798
  "step": 1140
799
  },
800
  {
801
  "epoch": 0.41,
802
- "learning_rate": 0.0008,
803
- "loss": 3.8063,
804
  "step": 1150
805
  },
806
  {
807
  "epoch": 0.41,
808
- "learning_rate": 0.0008,
809
- "loss": 3.774,
810
  "step": 1160
811
  },
812
  {
813
  "epoch": 0.41,
814
- "learning_rate": 0.0008,
815
- "loss": 3.7655,
816
  "step": 1170
817
  },
818
  {
819
  "epoch": 0.42,
820
- "learning_rate": 0.0008,
821
- "loss": 3.7827,
822
  "step": 1180
823
  },
824
  {
825
  "epoch": 0.42,
826
- "learning_rate": 0.0008,
827
- "loss": 3.8189,
828
  "step": 1190
829
  },
830
  {
831
  "epoch": 0.42,
832
- "learning_rate": 0.0008,
833
- "loss": 3.7383,
834
  "step": 1200
835
  },
836
  {
837
  "epoch": 0.42,
838
- "eval_accuracy": 0.337325274605507,
839
- "eval_loss": 3.7486014366149902,
840
- "eval_runtime": 135.2402,
841
- "eval_samples_per_second": 37.452,
842
- "eval_steps_per_second": 4.688,
843
  "step": 1200
844
  },
845
  {
846
  "epoch": 0.43,
847
- "learning_rate": 0.0008,
848
- "loss": 3.7548,
849
  "step": 1210
850
  },
851
  {
852
  "epoch": 0.43,
853
- "learning_rate": 0.0008,
854
- "loss": 3.7807,
855
  "step": 1220
856
  },
857
  {
858
  "epoch": 0.43,
859
- "learning_rate": 0.0008,
860
- "loss": 3.7654,
861
  "step": 1230
862
  },
863
  {
864
  "epoch": 0.44,
865
- "learning_rate": 0.0008,
866
- "loss": 3.7265,
867
  "step": 1240
868
  },
869
  {
870
  "epoch": 0.44,
871
- "learning_rate": 0.0008,
872
- "loss": 3.6892,
873
  "step": 1250
874
  },
875
  {
876
  "epoch": 0.44,
877
- "learning_rate": 0.0008,
878
- "loss": 3.686,
879
  "step": 1260
880
  },
881
  {
882
  "epoch": 0.45,
883
- "learning_rate": 0.0008,
884
- "loss": 3.7261,
885
  "step": 1270
886
  },
887
  {
888
  "epoch": 0.45,
889
- "learning_rate": 0.0008,
890
- "loss": 3.6922,
891
  "step": 1280
892
  },
893
  {
894
  "epoch": 0.46,
895
- "learning_rate": 0.0008,
896
- "loss": 3.7293,
897
  "step": 1290
898
  },
899
  {
900
  "epoch": 0.46,
901
- "learning_rate": 0.0008,
902
- "loss": 3.6718,
903
  "step": 1300
904
  },
905
  {
906
  "epoch": 0.46,
907
- "eval_accuracy": 0.34615152022245255,
908
- "eval_loss": 3.6764721870422363,
909
- "eval_runtime": 136.7336,
910
- "eval_samples_per_second": 37.043,
911
- "eval_steps_per_second": 4.637,
912
  "step": 1300
913
  },
914
  {
915
  "epoch": 0.46,
916
- "learning_rate": 0.0008,
917
- "loss": 3.7167,
918
  "step": 1310
919
  },
920
  {
921
  "epoch": 0.47,
922
- "learning_rate": 0.0008,
923
- "loss": 3.694,
924
  "step": 1320
925
  },
926
  {
927
  "epoch": 0.47,
928
- "learning_rate": 0.0008,
929
- "loss": 3.6661,
930
  "step": 1330
931
  },
932
  {
933
  "epoch": 0.47,
934
- "learning_rate": 0.0008,
935
- "loss": 3.6659,
936
  "step": 1340
937
  },
938
  {
939
  "epoch": 0.48,
940
- "learning_rate": 0.0008,
941
- "loss": 3.6365,
942
  "step": 1350
943
  },
944
  {
945
  "epoch": 0.48,
946
- "learning_rate": 0.0008,
947
- "loss": 3.6914,
948
  "step": 1360
949
  },
950
  {
951
  "epoch": 0.48,
952
- "learning_rate": 0.0008,
953
- "loss": 3.5961,
954
  "step": 1370
955
  },
956
  {
957
  "epoch": 0.49,
958
- "learning_rate": 0.0008,
959
- "loss": 3.6102,
960
  "step": 1380
961
  },
962
  {
963
  "epoch": 0.49,
964
- "learning_rate": 0.0008,
965
- "loss": 3.6198,
966
  "step": 1390
967
  },
968
  {
969
  "epoch": 0.49,
970
- "learning_rate": 0.0008,
971
- "loss": 3.6072,
972
  "step": 1400
973
  },
974
  {
975
  "epoch": 0.49,
976
- "eval_accuracy": 0.3554265481809269,
977
- "eval_loss": 3.609574794769287,
978
- "eval_runtime": 136.1057,
979
- "eval_samples_per_second": 37.214,
980
- "eval_steps_per_second": 4.658,
981
  "step": 1400
982
  },
983
  {
984
  "epoch": 0.5,
985
- "learning_rate": 0.0008,
986
- "loss": 3.5831,
987
  "step": 1410
988
  },
989
  {
990
  "epoch": 0.5,
991
- "learning_rate": 0.0008,
992
- "loss": 3.5985,
993
  "step": 1420
994
  },
995
  {
996
  "epoch": 0.5,
997
- "learning_rate": 0.0008,
998
- "loss": 3.5513,
999
  "step": 1430
1000
  },
1001
  {
1002
  "epoch": 0.51,
1003
- "learning_rate": 0.0008,
1004
- "loss": 3.5913,
1005
  "step": 1440
1006
  },
1007
  {
1008
  "epoch": 0.51,
1009
- "learning_rate": 0.0008,
1010
- "loss": 3.5758,
1011
  "step": 1450
1012
  },
1013
  {
1014
  "epoch": 0.52,
1015
- "learning_rate": 0.0008,
1016
- "loss": 3.5511,
1017
  "step": 1460
1018
  },
1019
  {
1020
  "epoch": 0.52,
1021
- "learning_rate": 0.0008,
1022
- "loss": 3.5757,
1023
  "step": 1470
1024
  },
1025
  {
1026
  "epoch": 0.52,
1027
- "learning_rate": 0.0008,
1028
- "loss": 3.5504,
1029
  "step": 1480
1030
  },
1031
  {
1032
  "epoch": 0.53,
1033
- "learning_rate": 0.0008,
1034
- "loss": 3.6189,
1035
  "step": 1490
1036
  },
1037
  {
1038
  "epoch": 0.53,
1039
- "learning_rate": 0.0008,
1040
- "loss": 3.5629,
1041
  "step": 1500
1042
  },
1043
  {
1044
  "epoch": 0.53,
1045
- "eval_accuracy": 0.3621603087560782,
1046
- "eval_loss": 3.5538198947906494,
1047
- "eval_runtime": 135.3711,
1048
- "eval_samples_per_second": 37.416,
1049
- "eval_steps_per_second": 4.683,
1050
  "step": 1500
1051
  },
1052
  {
1053
  "epoch": 0.53,
1054
- "learning_rate": 0.0008,
1055
- "loss": 3.5262,
1056
  "step": 1510
1057
  },
1058
  {
1059
  "epoch": 0.54,
1060
- "learning_rate": 0.0008,
1061
- "loss": 3.5047,
1062
  "step": 1520
1063
  },
1064
  {
1065
  "epoch": 0.54,
1066
- "learning_rate": 0.0008,
1067
- "loss": 3.5069,
1068
  "step": 1530
1069
  },
1070
  {
1071
  "epoch": 0.54,
1072
- "learning_rate": 0.0008,
1073
- "loss": 3.5734,
1074
  "step": 1540
1075
  },
1076
  {
1077
  "epoch": 0.55,
1078
- "learning_rate": 0.0008,
1079
- "loss": 3.478,
1080
  "step": 1550
1081
  },
1082
  {
1083
  "epoch": 0.55,
1084
- "learning_rate": 0.0008,
1085
- "loss": 3.5185,
1086
  "step": 1560
1087
  },
1088
  {
1089
  "epoch": 0.55,
1090
- "learning_rate": 0.0008,
1091
- "loss": 3.5316,
1092
  "step": 1570
1093
  },
1094
  {
1095
  "epoch": 0.56,
1096
- "learning_rate": 0.0008,
1097
- "loss": 3.4971,
1098
  "step": 1580
1099
  },
1100
  {
1101
  "epoch": 0.56,
1102
- "learning_rate": 0.0008,
1103
- "loss": 3.51,
1104
  "step": 1590
1105
  },
1106
  {
1107
  "epoch": 0.56,
1108
- "learning_rate": 0.0008,
1109
- "loss": 3.4883,
1110
  "step": 1600
1111
  },
1112
  {
1113
  "epoch": 0.56,
1114
- "eval_accuracy": 0.3692090753762398,
1115
- "eval_loss": 3.501894950866699,
1116
- "eval_runtime": 136.0113,
1117
- "eval_samples_per_second": 37.24,
1118
- "eval_steps_per_second": 4.661,
1119
  "step": 1600
1120
  },
1121
  {
1122
  "epoch": 0.57,
1123
- "learning_rate": 0.0008,
1124
- "loss": 3.5305,
1125
  "step": 1610
1126
  },
1127
  {
1128
  "epoch": 0.57,
1129
- "learning_rate": 0.0008,
1130
- "loss": 3.531,
1131
  "step": 1620
1132
  },
1133
  {
1134
  "epoch": 0.58,
1135
- "learning_rate": 0.0008,
1136
- "loss": 3.4821,
1137
  "step": 1630
1138
  },
1139
  {
1140
  "epoch": 0.58,
1141
- "learning_rate": 0.0008,
1142
- "loss": 3.488,
1143
  "step": 1640
1144
  },
1145
  {
1146
  "epoch": 0.58,
1147
- "learning_rate": 0.0008,
1148
- "loss": 3.442,
1149
  "step": 1650
1150
  },
1151
  {
1152
  "epoch": 0.59,
1153
- "learning_rate": 0.0008,
1154
- "loss": 3.4698,
1155
  "step": 1660
1156
  },
1157
  {
1158
  "epoch": 0.59,
1159
- "learning_rate": 0.0008,
1160
- "loss": 3.4318,
1161
  "step": 1670
1162
  },
1163
  {
1164
  "epoch": 0.59,
1165
- "learning_rate": 0.0008,
1166
- "loss": 3.4443,
1167
  "step": 1680
1168
  },
1169
  {
1170
  "epoch": 0.6,
1171
- "learning_rate": 0.0008,
1172
- "loss": 3.4336,
1173
  "step": 1690
1174
  },
1175
  {
1176
  "epoch": 0.6,
1177
- "learning_rate": 0.0008,
1178
- "loss": 3.4743,
1179
  "step": 1700
1180
  },
1181
  {
1182
  "epoch": 0.6,
1183
- "eval_accuracy": 0.3742321968778136,
1184
- "eval_loss": 3.4613401889801025,
1185
- "eval_runtime": 135.8854,
1186
- "eval_samples_per_second": 37.274,
1187
- "eval_steps_per_second": 4.666,
1188
  "step": 1700
1189
  },
1190
  {
1191
  "epoch": 0.6,
1192
- "learning_rate": 0.0008,
1193
- "loss": 3.4524,
1194
  "step": 1710
1195
  },
1196
  {
1197
  "epoch": 0.61,
1198
- "learning_rate": 0.0008,
1199
- "loss": 3.4768,
1200
  "step": 1720
1201
  },
1202
  {
1203
  "epoch": 0.61,
1204
- "learning_rate": 0.0008,
1205
- "loss": 3.4931,
1206
  "step": 1730
1207
  },
1208
  {
1209
  "epoch": 0.61,
1210
- "learning_rate": 0.0008,
1211
- "loss": 3.4805,
1212
  "step": 1740
1213
  },
1214
  {
1215
  "epoch": 0.62,
1216
- "learning_rate": 0.0008,
1217
- "loss": 3.4251,
1218
  "step": 1750
1219
  },
1220
  {
1221
  "epoch": 0.62,
1222
- "learning_rate": 0.0008,
1223
- "loss": 3.4601,
1224
  "step": 1760
1225
  },
1226
  {
1227
  "epoch": 0.62,
1228
- "learning_rate": 0.0008,
1229
- "loss": 3.44,
1230
  "step": 1770
1231
  },
1232
  {
1233
  "epoch": 0.63,
1234
- "learning_rate": 0.0008,
1235
- "loss": 3.3998,
1236
  "step": 1780
1237
  },
1238
  {
1239
  "epoch": 0.63,
1240
- "learning_rate": 0.0008,
1241
- "loss": 3.432,
1242
  "step": 1790
1243
  },
1244
  {
1245
  "epoch": 0.64,
1246
- "learning_rate": 0.0008,
1247
- "loss": 3.4294,
1248
  "step": 1800
1249
  },
1250
  {
1251
  "epoch": 0.64,
1252
- "eval_accuracy": 0.37978685491155284,
1253
- "eval_loss": 3.4203567504882812,
1254
- "eval_runtime": 135.08,
1255
- "eval_samples_per_second": 37.496,
1256
- "eval_steps_per_second": 4.694,
1257
  "step": 1800
1258
  },
1259
  {
1260
  "epoch": 0.64,
1261
- "learning_rate": 0.0008,
1262
- "loss": 3.452,
1263
  "step": 1810
1264
  },
1265
  {
1266
  "epoch": 0.64,
1267
- "learning_rate": 0.0008,
1268
- "loss": 3.4274,
1269
  "step": 1820
1270
  },
1271
  {
1272
  "epoch": 0.65,
1273
- "learning_rate": 0.0008,
1274
- "loss": 3.429,
1275
  "step": 1830
1276
  },
1277
  {
1278
  "epoch": 0.65,
1279
- "learning_rate": 0.0008,
1280
- "loss": 3.3932,
1281
  "step": 1840
1282
  },
1283
  {
1284
  "epoch": 0.65,
1285
- "learning_rate": 0.0008,
1286
- "loss": 3.4144,
1287
  "step": 1850
1288
  },
1289
  {
1290
  "epoch": 0.66,
1291
- "learning_rate": 0.0008,
1292
- "loss": 3.414,
1293
  "step": 1860
1294
  },
1295
  {
1296
  "epoch": 0.66,
1297
- "learning_rate": 0.0008,
1298
- "loss": 3.4192,
1299
  "step": 1870
1300
  },
1301
  {
1302
  "epoch": 0.66,
1303
- "learning_rate": 0.0008,
1304
- "loss": 3.4295,
1305
  "step": 1880
1306
  },
1307
  {
1308
  "epoch": 0.67,
1309
- "learning_rate": 0.0008,
1310
- "loss": 3.3573,
1311
  "step": 1890
1312
  },
1313
  {
1314
  "epoch": 0.67,
1315
- "learning_rate": 0.0008,
1316
- "loss": 3.3752,
1317
  "step": 1900
1318
  },
1319
  {
1320
  "epoch": 0.67,
1321
- "eval_accuracy": 0.3848666890752412,
1322
- "eval_loss": 3.381606340408325,
1323
- "eval_runtime": 136.2948,
1324
- "eval_samples_per_second": 37.162,
1325
- "eval_steps_per_second": 4.652,
1326
  "step": 1900
1327
  },
1328
  {
1329
  "epoch": 0.67,
1330
- "learning_rate": 0.0008,
1331
- "loss": 3.4002,
1332
  "step": 1910
1333
  },
1334
  {
1335
  "epoch": 0.68,
1336
- "learning_rate": 0.0008,
1337
- "loss": 3.4114,
1338
  "step": 1920
1339
  },
1340
  {
1341
  "epoch": 0.68,
1342
- "learning_rate": 0.0008,
1343
- "loss": 3.3961,
1344
  "step": 1930
1345
  },
1346
  {
1347
  "epoch": 0.68,
1348
- "learning_rate": 0.0008,
1349
- "loss": 3.4121,
1350
  "step": 1940
1351
  },
1352
  {
1353
  "epoch": 0.69,
1354
- "learning_rate": 0.0008,
1355
- "loss": 3.3707,
1356
  "step": 1950
1357
  },
1358
  {
1359
  "epoch": 0.69,
1360
- "learning_rate": 0.0008,
1361
- "loss": 3.4174,
1362
  "step": 1960
1363
  },
1364
  {
1365
  "epoch": 0.7,
1366
- "learning_rate": 0.0008,
1367
- "loss": 3.3769,
1368
  "step": 1970
1369
  },
1370
  {
1371
  "epoch": 0.7,
1372
- "learning_rate": 0.0008,
1373
- "loss": 3.3729,
1374
  "step": 1980
1375
  },
1376
  {
1377
  "epoch": 0.7,
1378
- "learning_rate": 0.0008,
1379
- "loss": 3.3652,
1380
  "step": 1990
1381
  },
1382
  {
1383
  "epoch": 0.71,
1384
- "learning_rate": 0.0008,
1385
- "loss": 3.3636,
1386
  "step": 2000
1387
  },
1388
  {
1389
  "epoch": 0.71,
1390
- "eval_accuracy": 0.38698338309355035,
1391
- "eval_loss": 3.359571695327759,
1392
- "eval_runtime": 135.0648,
1393
- "eval_samples_per_second": 37.501,
1394
- "eval_steps_per_second": 4.694,
1395
  "step": 2000
1396
  },
1397
  {
1398
  "epoch": 0.71,
1399
- "learning_rate": 0.0008,
1400
- "loss": 3.3801,
1401
  "step": 2010
1402
  },
1403
  {
1404
  "epoch": 0.71,
1405
- "learning_rate": 0.0008,
1406
- "loss": 3.3348,
1407
  "step": 2020
1408
  },
1409
  {
1410
  "epoch": 0.72,
1411
- "learning_rate": 0.0008,
1412
- "loss": 3.3398,
1413
  "step": 2030
1414
  },
1415
  {
1416
  "epoch": 0.72,
1417
- "learning_rate": 0.0008,
1418
- "loss": 3.3589,
1419
  "step": 2040
1420
  },
1421
  {
1422
  "epoch": 0.72,
1423
- "learning_rate": 0.0008,
1424
- "loss": 3.3357,
1425
  "step": 2050
1426
  },
1427
  {
1428
  "epoch": 0.73,
1429
- "learning_rate": 0.0008,
1430
- "loss": 3.3038,
1431
  "step": 2060
1432
  },
1433
  {
1434
  "epoch": 0.73,
1435
- "learning_rate": 0.0008,
1436
- "loss": 3.3231,
1437
  "step": 2070
1438
  },
1439
  {
1440
  "epoch": 0.73,
1441
- "learning_rate": 0.0008,
1442
- "loss": 3.348,
1443
  "step": 2080
1444
  },
1445
  {
1446
  "epoch": 0.74,
1447
- "learning_rate": 0.0008,
1448
- "loss": 3.3503,
1449
  "step": 2090
1450
  },
1451
  {
1452
  "epoch": 0.74,
1453
- "learning_rate": 0.0008,
1454
- "loss": 3.332,
1455
  "step": 2100
1456
  },
1457
  {
1458
  "epoch": 0.74,
1459
- "eval_accuracy": 0.3909963826387881,
1460
- "eval_loss": 3.3281729221343994,
1461
- "eval_runtime": 134.6874,
1462
- "eval_samples_per_second": 37.606,
1463
- "eval_steps_per_second": 4.707,
1464
  "step": 2100
1465
  },
1466
  {
1467
  "epoch": 0.74,
1468
- "learning_rate": 0.0008,
1469
- "loss": 3.3549,
1470
  "step": 2110
1471
  },
1472
  {
1473
  "epoch": 0.75,
1474
- "learning_rate": 0.0008,
1475
- "loss": 3.3504,
1476
  "step": 2120
1477
  },
1478
  {
1479
  "epoch": 0.75,
1480
- "learning_rate": 0.0008,
1481
- "loss": 3.3259,
1482
  "step": 2130
1483
  },
1484
  {
1485
  "epoch": 0.76,
1486
- "learning_rate": 0.0008,
1487
- "loss": 3.3455,
1488
  "step": 2140
1489
  },
1490
  {
1491
  "epoch": 0.76,
1492
- "learning_rate": 0.0008,
1493
- "loss": 3.3343,
1494
  "step": 2150
1495
  },
1496
  {
1497
  "epoch": 0.76,
1498
- "learning_rate": 0.0008,
1499
- "loss": 3.2952,
1500
  "step": 2160
1501
  },
1502
  {
1503
  "epoch": 0.77,
1504
- "learning_rate": 0.0008,
1505
- "loss": 3.3179,
1506
  "step": 2170
1507
  },
1508
  {
1509
  "epoch": 0.77,
1510
- "learning_rate": 0.0008,
1511
- "loss": 3.2569,
1512
  "step": 2180
1513
  },
1514
  {
1515
  "epoch": 0.77,
1516
- "learning_rate": 0.0008,
1517
- "loss": 3.3059,
1518
  "step": 2190
1519
  },
1520
  {
1521
  "epoch": 0.78,
1522
- "learning_rate": 0.0008,
1523
- "loss": 3.2884,
1524
  "step": 2200
1525
  },
1526
  {
1527
  "epoch": 0.78,
1528
- "eval_accuracy": 0.3946998737950368,
1529
- "eval_loss": 3.3027358055114746,
1530
- "eval_runtime": 135.1879,
1531
- "eval_samples_per_second": 37.466,
1532
- "eval_steps_per_second": 4.69,
1533
  "step": 2200
1534
  },
1535
  {
1536
  "epoch": 0.78,
1537
- "learning_rate": 0.0008,
1538
- "loss": 3.3218,
1539
  "step": 2210
1540
  },
1541
  {
1542
  "epoch": 0.78,
1543
- "learning_rate": 0.0008,
1544
- "loss": 3.2865,
1545
  "step": 2220
1546
  },
1547
  {
1548
  "epoch": 0.79,
1549
- "learning_rate": 0.0008,
1550
- "loss": 3.3339,
1551
  "step": 2230
1552
  },
1553
  {
1554
  "epoch": 0.79,
1555
- "learning_rate": 0.0008,
1556
- "loss": 3.2913,
1557
  "step": 2240
1558
  },
1559
  {
1560
  "epoch": 0.79,
1561
- "learning_rate": 0.0008,
1562
- "loss": 3.2425,
1563
  "step": 2250
1564
  },
1565
  {
1566
  "epoch": 0.8,
1567
- "learning_rate": 0.0008,
1568
- "loss": 3.263,
1569
  "step": 2260
1570
  },
1571
  {
1572
  "epoch": 0.8,
1573
- "learning_rate": 0.0008,
1574
- "loss": 3.2918,
1575
  "step": 2270
1576
  },
1577
  {
1578
  "epoch": 0.8,
1579
- "learning_rate": 0.0008,
1580
- "loss": 3.2684,
1581
  "step": 2280
1582
  },
1583
  {
1584
  "epoch": 0.81,
1585
- "learning_rate": 0.0008,
1586
- "loss": 3.279,
1587
  "step": 2290
1588
  },
1589
  {
1590
  "epoch": 0.81,
1591
- "learning_rate": 0.0008,
1592
- "loss": 3.2838,
1593
  "step": 2300
1594
  },
1595
  {
1596
  "epoch": 0.81,
1597
- "eval_accuracy": 0.3971535644824415,
1598
- "eval_loss": 3.280911445617676,
1599
- "eval_runtime": 135.9892,
1600
- "eval_samples_per_second": 37.246,
1601
- "eval_steps_per_second": 4.662,
1602
  "step": 2300
1603
  },
1604
  {
1605
  "epoch": 0.82,
1606
- "learning_rate": 0.0008,
1607
- "loss": 3.2524,
1608
  "step": 2310
1609
  },
1610
  {
1611
  "epoch": 0.82,
1612
- "learning_rate": 0.0008,
1613
- "loss": 3.2807,
1614
  "step": 2320
1615
  },
1616
  {
1617
  "epoch": 0.82,
1618
- "learning_rate": 0.0008,
1619
- "loss": 3.2276,
1620
  "step": 2330
1621
  },
1622
  {
1623
  "epoch": 0.83,
1624
- "learning_rate": 0.0008,
1625
- "loss": 3.3045,
1626
  "step": 2340
1627
  },
1628
  {
1629
  "epoch": 0.83,
1630
- "learning_rate": 0.0008,
1631
- "loss": 3.2835,
1632
  "step": 2350
1633
  },
1634
  {
1635
  "epoch": 0.83,
1636
- "learning_rate": 0.0008,
1637
- "loss": 3.2816,
1638
  "step": 2360
1639
  },
1640
  {
1641
  "epoch": 0.84,
1642
- "learning_rate": 0.0008,
1643
- "loss": 3.2464,
1644
  "step": 2370
1645
  },
1646
  {
1647
  "epoch": 0.84,
1648
- "learning_rate": 0.0008,
1649
- "loss": 3.2671,
1650
  "step": 2380
1651
  },
1652
  {
1653
  "epoch": 0.84,
1654
- "learning_rate": 0.0008,
1655
- "loss": 3.2733,
1656
  "step": 2390
1657
  },
1658
  {
1659
  "epoch": 0.85,
1660
- "learning_rate": 0.0008,
1661
- "loss": 3.2505,
1662
  "step": 2400
1663
  },
1664
  {
1665
  "epoch": 0.85,
1666
- "eval_accuracy": 0.4000816932394745,
1667
- "eval_loss": 3.256648302078247,
1668
- "eval_runtime": 136.2762,
1669
- "eval_samples_per_second": 37.167,
1670
- "eval_steps_per_second": 4.652,
1671
  "step": 2400
1672
  },
1673
  {
1674
  "epoch": 0.85,
1675
- "learning_rate": 0.0008,
1676
- "loss": 3.2053,
1677
  "step": 2410
1678
  },
1679
  {
1680
  "epoch": 0.85,
1681
- "learning_rate": 0.0008,
1682
- "loss": 3.2693,
1683
  "step": 2420
1684
  },
1685
  {
1686
  "epoch": 0.86,
1687
- "learning_rate": 0.0008,
1688
- "loss": 3.2955,
1689
  "step": 2430
1690
  },
1691
  {
1692
  "epoch": 0.86,
1693
- "learning_rate": 0.0008,
1694
- "loss": 3.2347,
1695
  "step": 2440
1696
  },
1697
  {
1698
  "epoch": 0.86,
1699
- "learning_rate": 0.0008,
1700
- "loss": 3.2516,
1701
  "step": 2450
1702
  },
1703
  {
1704
  "epoch": 0.87,
1705
- "learning_rate": 0.0008,
1706
- "loss": 3.2309,
1707
  "step": 2460
1708
  },
1709
  {
1710
  "epoch": 0.87,
1711
- "learning_rate": 0.0008,
1712
- "loss": 3.2451,
1713
  "step": 2470
1714
  },
1715
  {
1716
  "epoch": 0.88,
1717
- "learning_rate": 0.0008,
1718
- "loss": 3.2464,
1719
  "step": 2480
1720
  },
1721
  {
1722
  "epoch": 0.88,
1723
- "learning_rate": 0.0008,
1724
- "loss": 3.2484,
1725
  "step": 2490
1726
  },
1727
  {
1728
  "epoch": 0.88,
1729
- "learning_rate": 0.0008,
1730
- "loss": 3.2496,
1731
  "step": 2500
1732
  },
1733
  {
1734
  "epoch": 0.88,
1735
- "eval_accuracy": 0.4018057388777355,
1736
- "eval_loss": 3.2397472858428955,
1737
- "eval_runtime": 135.4382,
1738
- "eval_samples_per_second": 37.397,
1739
- "eval_steps_per_second": 4.681,
1740
  "step": 2500
1741
  },
1742
  {
1743
  "epoch": 0.89,
1744
- "learning_rate": 0.0008,
1745
- "loss": 3.2371,
1746
  "step": 2510
1747
  },
1748
  {
1749
  "epoch": 0.89,
1750
- "learning_rate": 0.0008,
1751
- "loss": 3.232,
1752
  "step": 2520
1753
  },
1754
  {
1755
  "epoch": 0.89,
1756
- "learning_rate": 0.0008,
1757
- "loss": 3.2479,
1758
  "step": 2530
1759
  },
1760
  {
1761
  "epoch": 0.9,
1762
- "learning_rate": 0.0008,
1763
- "loss": 3.2212,
1764
  "step": 2540
1765
  },
1766
  {
1767
  "epoch": 0.9,
1768
- "learning_rate": 0.0008,
1769
- "loss": 3.2145,
1770
  "step": 2550
1771
  },
1772
  {
1773
  "epoch": 0.9,
1774
- "learning_rate": 0.0008,
1775
- "loss": 3.2344,
1776
  "step": 2560
1777
  },
1778
  {
1779
  "epoch": 0.91,
1780
- "learning_rate": 0.0008,
1781
- "loss": 3.2176,
1782
  "step": 2570
1783
  },
1784
  {
1785
  "epoch": 0.91,
1786
- "learning_rate": 0.0008,
1787
- "loss": 3.1954,
1788
  "step": 2580
1789
  },
1790
  {
1791
  "epoch": 0.91,
1792
- "learning_rate": 0.0008,
1793
- "loss": 3.2363,
1794
  "step": 2590
1795
  },
1796
  {
1797
  "epoch": 0.92,
1798
- "learning_rate": 0.0008,
1799
- "loss": 3.2237,
1800
  "step": 2600
1801
  },
1802
  {
1803
  "epoch": 0.92,
1804
- "eval_accuracy": 0.4046295086204693,
1805
- "eval_loss": 3.2158236503601074,
1806
- "eval_runtime": 136.1181,
1807
- "eval_samples_per_second": 37.21,
1808
- "eval_steps_per_second": 4.658,
1809
  "step": 2600
1810
  },
1811
  {
1812
  "epoch": 0.92,
1813
- "learning_rate": 0.0008,
1814
- "loss": 3.2029,
1815
  "step": 2610
1816
  },
1817
  {
1818
  "epoch": 0.92,
1819
- "learning_rate": 0.0008,
1820
- "loss": 3.2231,
1821
  "step": 2620
1822
  },
1823
  {
1824
  "epoch": 0.93,
1825
- "learning_rate": 0.0008,
1826
- "loss": 3.2197,
1827
  "step": 2630
1828
  },
1829
  {
1830
  "epoch": 0.93,
1831
- "learning_rate": 0.0008,
1832
- "loss": 3.2342,
1833
  "step": 2640
1834
  },
1835
  {
1836
  "epoch": 0.94,
1837
- "learning_rate": 0.0008,
1838
- "loss": 3.1778,
1839
  "step": 2650
1840
  },
1841
  {
1842
  "epoch": 0.94,
1843
- "learning_rate": 0.0008,
1844
- "loss": 3.2043,
1845
  "step": 2660
1846
  },
1847
  {
1848
  "epoch": 0.94,
1849
- "learning_rate": 0.0008,
1850
- "loss": 3.2018,
1851
  "step": 2670
1852
  },
1853
  {
1854
  "epoch": 0.95,
1855
- "learning_rate": 0.0008,
1856
- "loss": 3.184,
1857
  "step": 2680
1858
  },
1859
  {
1860
  "epoch": 0.95,
1861
- "learning_rate": 0.0008,
1862
- "loss": 3.1926,
1863
  "step": 2690
1864
  },
1865
  {
1866
  "epoch": 0.95,
1867
- "learning_rate": 0.0008,
1868
- "loss": 3.2353,
1869
  "step": 2700
1870
  },
1871
  {
1872
  "epoch": 0.95,
1873
- "eval_accuracy": 0.40629597354566505,
1874
- "eval_loss": 3.2031331062316895,
1875
- "eval_runtime": 135.6589,
1876
- "eval_samples_per_second": 37.336,
1877
- "eval_steps_per_second": 4.673,
1878
  "step": 2700
1879
  },
1880
  {
1881
  "epoch": 0.96,
1882
- "learning_rate": 0.0008,
1883
- "loss": 3.1691,
1884
  "step": 2710
1885
  },
1886
  {
1887
  "epoch": 0.96,
1888
- "learning_rate": 0.0008,
1889
- "loss": 3.1855,
1890
  "step": 2720
1891
  },
1892
  {
1893
  "epoch": 0.96,
1894
- "learning_rate": 0.0008,
1895
- "loss": 3.168,
1896
  "step": 2730
1897
  },
1898
  {
1899
  "epoch": 0.97,
1900
- "learning_rate": 0.0008,
1901
- "loss": 3.2,
1902
  "step": 2740
1903
  },
1904
  {
1905
  "epoch": 0.97,
1906
- "learning_rate": 0.0008,
1907
- "loss": 3.1977,
1908
  "step": 2750
1909
  },
1910
  {
1911
  "epoch": 0.97,
1912
- "learning_rate": 0.0008,
1913
- "loss": 3.2194,
1914
  "step": 2760
1915
  },
1916
  {
1917
  "epoch": 0.98,
1918
- "learning_rate": 0.0008,
1919
- "loss": 3.155,
1920
  "step": 2770
1921
  },
1922
  {
1923
  "epoch": 0.98,
1924
- "learning_rate": 0.0008,
1925
- "loss": 3.184,
1926
  "step": 2780
1927
  },
1928
  {
1929
  "epoch": 0.98,
1930
- "learning_rate": 0.0008,
1931
- "loss": 3.1804,
1932
  "step": 2790
1933
  },
1934
  {
1935
  "epoch": 0.99,
1936
- "learning_rate": 0.0008,
1937
- "loss": 3.169,
1938
  "step": 2800
1939
  },
1940
  {
1941
  "epoch": 0.99,
1942
- "eval_accuracy": 0.4082295088133695,
1943
- "eval_loss": 3.183588743209839,
1944
- "eval_runtime": 136.3124,
1945
- "eval_samples_per_second": 37.157,
1946
- "eval_steps_per_second": 4.651,
1947
  "step": 2800
1948
  },
1949
  {
1950
  "epoch": 0.99,
1951
- "learning_rate": 0.0008,
1952
- "loss": 3.2064,
1953
  "step": 2810
1954
  },
1955
  {
1956
  "epoch": 1.0,
1957
- "learning_rate": 0.0008,
1958
- "loss": 3.1264,
1959
  "step": 2820
1960
  },
1961
  {
1962
  "epoch": 1.0,
1963
- "learning_rate": 0.0008,
1964
- "loss": 3.2367,
1965
  "step": 2830
1966
  },
1967
  {
1968
  "epoch": 1.0,
1969
  "step": 2833,
1970
- "total_flos": 2.0603056543707955e+17,
1971
- "train_loss": 3.8723633342356467,
1972
- "train_runtime": 7855.38,
1973
- "train_samples_per_second": 11.54,
1974
- "train_steps_per_second": 0.361
1975
  }
1976
  ],
1977
  "logging_steps": 10,
@@ -1979,8 +1979,8 @@
1979
  "num_input_tokens_seen": 0,
1980
  "num_train_epochs": 1,
1981
  "save_steps": 1000.0,
1982
- "total_flos": 2.0603056543707955e+17,
1983
- "train_batch_size": 16,
1984
  "trial_name": null,
1985
  "trial_params": null
1986
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 8.000000000000001e-06,
14
+ "loss": 10.5725,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.0,
19
+ "learning_rate": 8e-05,
20
+ "loss": 9.6876,
21
  "step": 10
22
  },
23
  {
24
  "epoch": 0.01,
25
+ "learning_rate": 0.00016,
26
+ "loss": 8.4164,
27
  "step": 20
28
  },
29
  {
30
  "epoch": 0.01,
31
+ "learning_rate": 0.00024,
32
+ "loss": 7.5148,
33
  "step": 30
34
  },
35
  {
36
  "epoch": 0.01,
37
+ "learning_rate": 0.00032,
38
+ "loss": 7.2623,
39
  "step": 40
40
  },
41
  {
42
  "epoch": 0.02,
43
+ "learning_rate": 0.0004,
44
+ "loss": 7.0235,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.02,
49
+ "learning_rate": 0.00048,
50
+ "loss": 6.7411,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.02,
55
+ "learning_rate": 0.00056,
56
+ "loss": 6.535,
57
  "step": 70
58
  },
59
  {
60
  "epoch": 0.03,
61
+ "learning_rate": 0.00064,
62
+ "loss": 6.3619,
63
  "step": 80
64
  },
65
  {
66
  "epoch": 0.03,
67
+ "learning_rate": 0.00072,
68
+ "loss": 6.285,
69
  "step": 90
70
  },
71
  {
72
  "epoch": 0.04,
73
  "learning_rate": 0.0008,
74
+ "loss": 6.1199,
75
  "step": 100
76
  },
77
  {
78
  "epoch": 0.04,
79
+ "eval_accuracy": 0.15415359968673006,
80
+ "eval_loss": 6.0748748779296875,
81
+ "eval_runtime": 149.9041,
82
+ "eval_samples_per_second": 33.788,
83
+ "eval_steps_per_second": 4.229,
84
  "step": 100
85
  },
86
  {
87
  "epoch": 0.04,
88
+ "learning_rate": 0.0007999735731319962,
89
+ "loss": 6.0192,
90
  "step": 110
91
  },
92
  {
93
  "epoch": 0.04,
94
+ "learning_rate": 0.0007998942960198819,
95
+ "loss": 5.9528,
96
  "step": 120
97
  },
98
  {
99
  "epoch": 0.05,
100
+ "learning_rate": 0.0007997621791388858,
101
+ "loss": 5.8808,
102
  "step": 130
103
  },
104
  {
105
  "epoch": 0.05,
106
+ "learning_rate": 0.0007995772399461845,
107
+ "loss": 5.7862,
108
  "step": 140
109
  },
110
  {
111
  "epoch": 0.05,
112
+ "learning_rate": 0.0007993395028785968,
113
+ "loss": 5.7041,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.06,
118
+ "learning_rate": 0.0007990489993493526,
119
+ "loss": 5.6365,
120
  "step": 160
121
  },
122
  {
123
  "epoch": 0.06,
124
+ "learning_rate": 0.0007987057677439444,
125
+ "loss": 5.5384,
126
  "step": 170
127
  },
128
  {
129
  "epoch": 0.06,
130
+ "learning_rate": 0.0007983098534150538,
131
+ "loss": 5.5325,
132
  "step": 180
133
  },
134
  {
135
  "epoch": 0.07,
136
+ "learning_rate": 0.0007978613086765592,
137
+ "loss": 5.4794,
138
  "step": 190
139
  },
140
  {
141
  "epoch": 0.07,
142
+ "learning_rate": 0.0007973601927966237,
143
+ "loss": 5.3869,
144
  "step": 200
145
  },
146
  {
147
  "epoch": 0.07,
148
+ "eval_accuracy": 0.20318275703591465,
149
+ "eval_loss": 5.326748371124268,
150
+ "eval_runtime": 149.8598,
151
+ "eval_samples_per_second": 33.798,
152
+ "eval_steps_per_second": 4.231,
153
  "step": 200
154
  },
155
  {
156
  "epoch": 0.07,
157
+ "learning_rate": 0.0007968065719898634,
158
+ "loss": 5.3202,
159
  "step": 210
160
  },
161
  {
162
  "epoch": 0.08,
163
+ "learning_rate": 0.0007962005194085981,
164
+ "loss": 5.2673,
165
  "step": 220
166
  },
167
  {
168
  "epoch": 0.08,
169
+ "learning_rate": 0.0007955421151331857,
170
+ "loss": 5.2441,
171
  "step": 230
172
  },
173
  {
174
  "epoch": 0.08,
175
+ "learning_rate": 0.0007948314461614408,
176
+ "loss": 5.1405,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.09,
181
+ "learning_rate": 0.0007940686063971387,
182
+ "loss": 5.1529,
183
  "step": 250
184
  },
185
  {
186
  "epoch": 0.09,
187
+ "learning_rate": 0.0007932536966376081,
188
+ "loss": 5.1036,
189
  "step": 260
190
  },
191
  {
192
  "epoch": 0.1,
193
+ "learning_rate": 0.0007923868245604124,
194
+ "loss": 4.9852,
195
  "step": 270
196
  },
197
  {
198
  "epoch": 0.1,
199
+ "learning_rate": 0.0007914681047091216,
200
+ "loss": 4.9747,
201
  "step": 280
202
  },
203
  {
204
  "epoch": 0.1,
205
+ "learning_rate": 0.0007904976584781766,
206
+ "loss": 4.9381,
207
  "step": 290
208
  },
209
  {
210
  "epoch": 0.11,
211
+ "learning_rate": 0.0007894756140968497,
212
+ "loss": 4.9187,
213
  "step": 300
214
  },
215
  {
216
  "epoch": 0.11,
217
+ "eval_accuracy": 0.23856663569010775,
218
+ "eval_loss": 4.856618404388428,
219
+ "eval_runtime": 149.2653,
220
+ "eval_samples_per_second": 33.933,
221
+ "eval_steps_per_second": 4.247,
222
  "step": 300
223
  },
224
  {
225
  "epoch": 0.11,
226
+ "learning_rate": 0.0007884021066123009,
227
+ "loss": 4.8608,
228
  "step": 310
229
  },
230
  {
231
  "epoch": 0.11,
232
+ "learning_rate": 0.0007872772778717331,
233
+ "loss": 4.8439,
234
  "step": 320
235
  },
236
  {
237
  "epoch": 0.12,
238
+ "learning_rate": 0.0007861012765036494,
239
+ "loss": 4.7976,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.12,
244
+ "learning_rate": 0.0007848742578982146,
245
+ "loss": 4.752,
246
  "step": 340
247
  },
248
  {
249
  "epoch": 0.12,
250
+ "learning_rate": 0.0007835963841867223,
251
+ "loss": 4.7569,
252
  "step": 350
253
  },
254
  {
255
  "epoch": 0.13,
256
+ "learning_rate": 0.0007822678242201718,
257
+ "loss": 4.698,
258
  "step": 360
259
  },
260
  {
261
  "epoch": 0.13,
262
+ "learning_rate": 0.0007808887535469578,
263
+ "loss": 4.6793,
264
  "step": 370
265
  },
266
  {
267
  "epoch": 0.13,
268
+ "learning_rate": 0.0007794593543896733,
269
+ "loss": 4.6234,
270
  "step": 380
271
  },
272
  {
273
  "epoch": 0.14,
274
+ "learning_rate": 0.0007779798156210327,
275
+ "loss": 4.6046,
276
  "step": 390
277
  },
278
  {
279
  "epoch": 0.14,
280
+ "learning_rate": 0.0007764503327389145,
281
+ "loss": 4.6185,
282
  "step": 400
283
  },
284
  {
285
  "epoch": 0.14,
286
+ "eval_accuracy": 0.2624355291325133,
287
+ "eval_loss": 4.553475379943848,
288
+ "eval_runtime": 149.5941,
289
+ "eval_samples_per_second": 33.858,
290
+ "eval_steps_per_second": 4.238,
291
  "step": 400
292
  },
293
  {
294
  "epoch": 0.14,
295
+ "learning_rate": 0.00077487110784053,
296
+ "loss": 4.551,
297
  "step": 410
298
  },
299
  {
300
  "epoch": 0.15,
301
+ "learning_rate": 0.0007732423495957192,
302
+ "loss": 4.5083,
303
  "step": 420
304
  },
305
  {
306
  "epoch": 0.15,
307
+ "learning_rate": 0.0007715642732193774,
308
+ "loss": 4.522,
309
  "step": 430
310
  },
311
  {
312
  "epoch": 0.16,
313
+ "learning_rate": 0.0007698371004430193,
314
+ "loss": 4.4942,
315
  "step": 440
316
  },
317
  {
318
  "epoch": 0.16,
319
+ "learning_rate": 0.0007680610594854798,
320
+ "loss": 4.5164,
321
  "step": 450
322
  },
323
  {
324
  "epoch": 0.16,
325
+ "learning_rate": 0.0007662363850227587,
326
+ "loss": 4.4405,
327
  "step": 460
328
  },
329
  {
330
  "epoch": 0.17,
331
+ "learning_rate": 0.0007643633181570117,
332
+ "loss": 4.3987,
333
  "step": 470
334
  },
335
  {
336
  "epoch": 0.17,
337
+ "learning_rate": 0.000762442106384693,
338
+ "loss": 4.4089,
339
  "step": 480
340
  },
341
  {
342
  "epoch": 0.17,
343
+ "learning_rate": 0.0007604730035638523,
344
+ "loss": 4.3598,
345
  "step": 490
346
  },
347
  {
348
  "epoch": 0.18,
349
+ "learning_rate": 0.0007584562698805911,
350
+ "loss": 4.3509,
351
  "step": 500
352
  },
353
  {
354
  "epoch": 0.18,
355
+ "eval_accuracy": 0.2801221637037998,
356
+ "eval_loss": 4.338791847229004,
357
+ "eval_runtime": 148.9964,
358
+ "eval_samples_per_second": 33.994,
359
+ "eval_steps_per_second": 4.255,
360
  "step": 500
361
  },
362
  {
363
  "epoch": 0.18,
364
+ "learning_rate": 0.0007563921718146838,
365
+ "loss": 4.3866,
366
  "step": 510
367
  },
368
  {
369
  "epoch": 0.18,
370
+ "learning_rate": 0.0007542809821043658,
371
+ "loss": 4.3586,
372
  "step": 520
373
  },
374
  {
375
  "epoch": 0.19,
376
+ "learning_rate": 0.0007521229797102965,
377
+ "loss": 4.319,
378
  "step": 530
379
  },
380
  {
381
  "epoch": 0.19,
382
+ "learning_rate": 0.0007499184497786977,
383
+ "loss": 4.3143,
384
  "step": 540
385
  },
386
  {
387
  "epoch": 0.19,
388
+ "learning_rate": 0.0007476676836036771,
389
+ "loss": 4.2326,
390
  "step": 550
391
  },
392
  {
393
  "epoch": 0.2,
394
+ "learning_rate": 0.0007453709785887376,
395
+ "loss": 4.2689,
396
  "step": 560
397
  },
398
  {
399
  "epoch": 0.2,
400
+ "learning_rate": 0.0007430286382074807,
401
+ "loss": 4.2383,
402
  "step": 570
403
  },
404
  {
405
  "epoch": 0.2,
406
+ "learning_rate": 0.0007406409719635068,
407
+ "loss": 4.232,
408
  "step": 580
409
  },
410
  {
411
  "epoch": 0.21,
412
+ "learning_rate": 0.0007382082953495193,
413
+ "loss": 4.1941,
414
  "step": 590
415
  },
416
  {
417
  "epoch": 0.21,
418
+ "learning_rate": 0.0007357309298056369,
419
+ "loss": 4.1666,
420
  "step": 600
421
  },
422
  {
423
  "epoch": 0.21,
424
+ "eval_accuracy": 0.2955689374718788,
425
+ "eval_loss": 4.16923713684082,
426
+ "eval_runtime": 148.9107,
427
+ "eval_samples_per_second": 34.014,
428
+ "eval_steps_per_second": 4.258,
429
  "step": 600
430
  },
431
  {
432
  "epoch": 0.22,
433
+ "learning_rate": 0.0007332092026769209,
434
+ "loss": 4.1266,
435
  "step": 610
436
  },
437
  {
438
  "epoch": 0.22,
439
+ "learning_rate": 0.0007306434471701209,
440
+ "loss": 4.1373,
441
  "step": 620
442
  },
443
  {
444
  "epoch": 0.22,
445
+ "learning_rate": 0.0007280340023096477,
446
+ "loss": 4.1767,
447
  "step": 630
448
  },
449
  {
450
  "epoch": 0.23,
451
+ "learning_rate": 0.0007253812128927756,
452
+ "loss": 4.139,
453
  "step": 640
454
  },
455
  {
456
  "epoch": 0.23,
457
+ "learning_rate": 0.0007226854294440834,
458
+ "loss": 4.0591,
459
  "step": 650
460
  },
461
  {
462
  "epoch": 0.23,
463
+ "learning_rate": 0.0007199470081691381,
464
+ "loss": 4.1488,
465
  "step": 660
466
  },
467
  {
468
  "epoch": 0.24,
469
+ "learning_rate": 0.0007171663109074274,
470
+ "loss": 4.1125,
471
  "step": 670
472
  },
473
  {
474
  "epoch": 0.24,
475
+ "learning_rate": 0.0007143437050845489,
476
+ "loss": 4.1009,
477
  "step": 680
478
  },
479
  {
480
  "epoch": 0.24,
481
+ "learning_rate": 0.0007114795636636599,
482
+ "loss": 4.085,
483
  "step": 690
484
  },
485
  {
486
  "epoch": 0.25,
487
+ "learning_rate": 0.000708574265096197,
488
+ "loss": 4.0456,
489
  "step": 700
490
  },
491
  {
492
  "epoch": 0.25,
493
+ "eval_accuracy": 0.3088623661815066,
494
+ "eval_loss": 4.03993558883667,
495
+ "eval_runtime": 148.8369,
496
+ "eval_samples_per_second": 34.031,
497
+ "eval_steps_per_second": 4.26,
498
  "step": 700
499
  },
500
  {
501
  "epoch": 0.25,
502
+ "learning_rate": 0.0007056281932718689,
503
+ "loss": 4.0732,
504
  "step": 710
505
  },
506
  {
507
  "epoch": 0.25,
508
+ "learning_rate": 0.0007026417374679316,
509
+ "loss": 4.0439,
510
  "step": 720
511
  },
512
  {
513
  "epoch": 0.26,
514
+ "learning_rate": 0.000699615292297752,
515
+ "loss": 4.0528,
516
  "step": 730
517
  },
518
  {
519
  "epoch": 0.26,
520
+ "learning_rate": 0.0006965492576586652,
521
+ "loss": 4.05,
522
  "step": 740
523
  },
524
  {
525
  "epoch": 0.26,
526
+ "learning_rate": 0.0006934440386791345,
527
+ "loss": 3.9947,
528
  "step": 750
529
  },
530
  {
531
  "epoch": 0.27,
532
+ "learning_rate": 0.0006903000456652207,
533
+ "loss": 4.002,
534
  "step": 760
535
  },
536
  {
537
  "epoch": 0.27,
538
+ "learning_rate": 0.0006871176940463655,
539
+ "loss": 3.937,
540
  "step": 770
541
  },
542
  {
543
  "epoch": 0.28,
544
+ "learning_rate": 0.0006838974043204999,
545
+ "loss": 3.949,
546
  "step": 780
547
  },
548
  {
549
  "epoch": 0.28,
550
+ "learning_rate": 0.0006806396019984811,
551
+ "loss": 3.9419,
552
  "step": 790
553
  },
554
  {
555
  "epoch": 0.28,
556
+ "learning_rate": 0.0006773447175478696,
557
+ "loss": 3.9273,
558
  "step": 800
559
  },
560
  {
561
  "epoch": 0.28,
562
+ "eval_accuracy": 0.31934504591266155,
563
+ "eval_loss": 3.9317612648010254,
564
+ "eval_runtime": 150.5837,
565
+ "eval_samples_per_second": 33.636,
566
+ "eval_steps_per_second": 4.21,
567
  "step": 800
568
  },
569
  {
570
  "epoch": 0.29,
571
+ "learning_rate": 0.000674013186336047,
572
+ "loss": 3.9558,
573
  "step": 810
574
  },
575
  {
576
  "epoch": 0.29,
577
+ "learning_rate": 0.0006706454485726915,
578
+ "loss": 3.9083,
579
  "step": 820
580
  },
581
  {
582
  "epoch": 0.29,
583
+ "learning_rate": 0.0006672419492516099,
584
+ "loss": 3.9169,
585
  "step": 830
586
  },
587
  {
588
  "epoch": 0.3,
589
+ "learning_rate": 0.0006638031380919385,
590
+ "loss": 3.9215,
591
  "step": 840
592
  },
593
  {
594
  "epoch": 0.3,
595
+ "learning_rate": 0.0006603294694787206,
596
+ "loss": 3.9422,
597
  "step": 850
598
  },
599
  {
600
  "epoch": 0.3,
601
+ "learning_rate": 0.0006568214024028656,
602
+ "loss": 3.9031,
603
  "step": 860
604
  },
605
  {
606
  "epoch": 0.31,
607
+ "learning_rate": 0.0006532794004005016,
608
+ "loss": 3.8931,
609
  "step": 870
610
  },
611
  {
612
  "epoch": 0.31,
613
+ "learning_rate": 0.0006497039314917254,
614
+ "loss": 3.871,
615
  "step": 880
616
  },
617
  {
618
  "epoch": 0.31,
619
+ "learning_rate": 0.0006460954681187614,
620
+ "loss": 3.878,
621
  "step": 890
622
  },
623
  {
624
  "epoch": 0.32,
625
+ "learning_rate": 0.0006424544870835359,
626
+ "loss": 3.8447,
627
  "step": 900
628
  },
629
  {
630
  "epoch": 0.32,
631
+ "eval_accuracy": 0.3326595971954238,
632
+ "eval_loss": 3.817286968231201,
633
+ "eval_runtime": 151.9107,
634
+ "eval_samples_per_second": 33.342,
635
+ "eval_steps_per_second": 4.174,
636
  "step": 900
637
  },
638
  {
639
  "epoch": 0.32,
640
+ "learning_rate": 0.0006387814694846751,
641
+ "loss": 3.7965,
642
  "step": 910
643
  },
644
  {
645
  "epoch": 0.32,
646
+ "learning_rate": 0.0006350769006539354,
647
+ "loss": 3.7753,
648
  "step": 920
649
  },
650
  {
651
  "epoch": 0.33,
652
+ "learning_rate": 0.000631341270092074,
653
+ "loss": 3.7734,
654
  "step": 930
655
  },
656
  {
657
  "epoch": 0.33,
658
+ "learning_rate": 0.00062757507140417,
659
+ "loss": 3.8001,
660
  "step": 940
661
  },
662
  {
663
  "epoch": 0.34,
664
+ "learning_rate": 0.0006237788022344014,
665
+ "loss": 3.7775,
666
  "step": 950
667
  },
668
  {
669
  "epoch": 0.34,
670
+ "learning_rate": 0.0006199529642002892,
671
+ "loss": 3.7659,
672
  "step": 960
673
  },
674
  {
675
  "epoch": 0.34,
676
+ "learning_rate": 0.0006160980628264175,
677
+ "loss": 3.7701,
678
  "step": 970
679
  },
680
  {
681
  "epoch": 0.35,
682
+ "learning_rate": 0.0006122146074776347,
683
+ "loss": 3.7496,
684
  "step": 980
685
  },
686
  {
687
  "epoch": 0.35,
688
+ "learning_rate": 0.0006083031112917506,
689
+ "loss": 3.7569,
690
  "step": 990
691
  },
692
  {
693
  "epoch": 0.35,
694
+ "learning_rate": 0.0006043640911117322,
695
+ "loss": 3.7143,
696
  "step": 1000
697
  },
698
  {
699
  "epoch": 0.35,
700
+ "eval_accuracy": 0.3461449616152692,
701
+ "eval_loss": 3.7108187675476074,
702
+ "eval_runtime": 150.717,
703
+ "eval_samples_per_second": 33.606,
704
+ "eval_steps_per_second": 4.207,
705
  "step": 1000
706
  },
707
  {
708
  "epoch": 0.36,
709
+ "learning_rate": 0.0006003980674174113,
710
+ "loss": 3.727,
711
  "step": 1010
712
  },
713
  {
714
  "epoch": 0.36,
715
+ "learning_rate": 0.0005964055642567111,
716
+ "loss": 3.7216,
717
  "step": 1020
718
  },
719
  {
720
  "epoch": 0.36,
721
+ "learning_rate": 0.0005923871091764019,
722
+ "loss": 3.6425,
723
  "step": 1030
724
  },
725
  {
726
  "epoch": 0.37,
727
+ "learning_rate": 0.0005883432331523935,
728
+ "loss": 3.656,
729
  "step": 1040
730
  },
731
  {
732
  "epoch": 0.37,
733
+ "learning_rate": 0.0005842744705195756,
734
+ "loss": 3.6711,
735
  "step": 1050
736
  },
737
  {
738
  "epoch": 0.37,
739
+ "learning_rate": 0.0005801813589012133,
740
+ "loss": 3.6739,
741
  "step": 1060
742
  },
743
  {
744
  "epoch": 0.38,
745
+ "learning_rate": 0.0005760644391379089,
746
+ "loss": 3.6481,
747
  "step": 1070
748
  },
749
  {
750
  "epoch": 0.38,
751
+ "learning_rate": 0.0005719242552161383,
752
+ "loss": 3.6327,
753
  "step": 1080
754
  },
755
  {
756
  "epoch": 0.38,
757
+ "learning_rate": 0.0005677613541963716,
758
+ "loss": 3.6286,
759
  "step": 1090
760
  },
761
  {
762
  "epoch": 0.39,
763
+ "learning_rate": 0.0005635762861407874,
764
+ "loss": 3.6485,
765
  "step": 1100
766
  },
767
  {
768
  "epoch": 0.39,
769
+ "eval_accuracy": 0.3589528604931205,
770
+ "eval_loss": 3.6115522384643555,
771
+ "eval_runtime": 151.483,
772
+ "eval_samples_per_second": 33.436,
773
+ "eval_steps_per_second": 4.185,
774
  "step": 1100
775
  },
776
  {
777
  "epoch": 0.39,
778
+ "learning_rate": 0.0005593696040405915,
779
+ "loss": 3.6201,
780
  "step": 1110
781
  },
782
  {
783
  "epoch": 0.4,
784
+ "learning_rate": 0.0005551418637429465,
785
+ "loss": 3.5593,
786
  "step": 1120
787
  },
788
  {
789
  "epoch": 0.4,
790
+ "learning_rate": 0.0005508936238775265,
791
+ "loss": 3.6036,
792
  "step": 1130
793
  },
794
  {
795
  "epoch": 0.4,
796
+ "learning_rate": 0.0005466254457827025,
797
+ "loss": 3.6029,
798
  "step": 1140
799
  },
800
  {
801
  "epoch": 0.41,
802
+ "learning_rate": 0.0005423378934313702,
803
+ "loss": 3.585,
804
  "step": 1150
805
  },
806
  {
807
  "epoch": 0.41,
808
+ "learning_rate": 0.0005380315333564296,
809
+ "loss": 3.5505,
810
  "step": 1160
811
  },
812
  {
813
  "epoch": 0.41,
814
+ "learning_rate": 0.0005337069345759272,
815
+ "loss": 3.5358,
816
  "step": 1170
817
  },
818
  {
819
  "epoch": 0.42,
820
+ "learning_rate": 0.0005293646685178686,
821
+ "loss": 3.5578,
822
  "step": 1180
823
  },
824
  {
825
  "epoch": 0.42,
826
+ "learning_rate": 0.0005250053089447138,
827
+ "loss": 3.5917,
828
  "step": 1190
829
  },
830
  {
831
  "epoch": 0.42,
832
+ "learning_rate": 0.0005206294318775628,
833
+ "loss": 3.5171,
834
  "step": 1200
835
  },
836
  {
837
  "epoch": 0.42,
838
+ "eval_accuracy": 0.36927514369860115,
839
+ "eval_loss": 3.530304431915283,
840
+ "eval_runtime": 151.4072,
841
+ "eval_samples_per_second": 33.453,
842
+ "eval_steps_per_second": 4.187,
843
  "step": 1200
844
  },
845
  {
846
  "epoch": 0.43,
847
+ "learning_rate": 0.0005162376155200437,
848
+ "loss": 3.5322,
849
  "step": 1210
850
  },
851
  {
852
  "epoch": 0.43,
853
+ "learning_rate": 0.0005118304401819125,
854
+ "loss": 3.5639,
855
  "step": 1220
856
  },
857
  {
858
  "epoch": 0.43,
859
+ "learning_rate": 0.0005074084882023739,
860
+ "loss": 3.5472,
861
  "step": 1230
862
  },
863
  {
864
  "epoch": 0.44,
865
+ "learning_rate": 0.0005029723438731346,
866
+ "loss": 3.4967,
867
  "step": 1240
868
  },
869
  {
870
  "epoch": 0.44,
871
+ "learning_rate": 0.0004985225933611971,
872
+ "loss": 3.466,
873
  "step": 1250
874
  },
875
  {
876
  "epoch": 0.44,
877
+ "learning_rate": 0.000494059824631409,
878
+ "loss": 3.4608,
879
  "step": 1260
880
  },
881
  {
882
  "epoch": 0.45,
883
+ "learning_rate": 0.0004895846273687709,
884
+ "loss": 3.5004,
885
  "step": 1270
886
  },
887
  {
888
  "epoch": 0.45,
889
+ "learning_rate": 0.0004850975929005197,
890
+ "loss": 3.4747,
891
  "step": 1280
892
  },
893
  {
894
  "epoch": 0.46,
895
+ "learning_rate": 0.00048059931411799335,
896
+ "loss": 3.5048,
897
  "step": 1290
898
  },
899
  {
900
  "epoch": 0.46,
901
+ "learning_rate": 0.00047609038539829,
902
+ "loss": 3.4464,
903
  "step": 1300
904
  },
905
  {
906
  "epoch": 0.46,
907
+ "eval_accuracy": 0.3779672272186056,
908
+ "eval_loss": 3.455420970916748,
909
+ "eval_runtime": 151.2649,
910
+ "eval_samples_per_second": 33.484,
911
+ "eval_steps_per_second": 4.191,
912
  "step": 1300
913
  },
914
  {
915
  "epoch": 0.46,
916
+ "learning_rate": 0.0004715714025257304,
917
+ "loss": 3.4953,
918
  "step": 1310
919
  },
920
  {
921
  "epoch": 0.47,
922
+ "learning_rate": 0.00046704296261313393,
923
+ "loss": 3.471,
924
  "step": 1320
925
  },
926
  {
927
  "epoch": 0.47,
928
+ "learning_rate": 0.0004625056640229197,
929
+ "loss": 3.4471,
930
  "step": 1330
931
  },
932
  {
933
  "epoch": 0.47,
934
+ "learning_rate": 0.0004579601062880422,
935
+ "loss": 3.4493,
936
  "step": 1340
937
  },
938
  {
939
  "epoch": 0.48,
940
+ "learning_rate": 0.00045340689003277285,
941
+ "loss": 3.4145,
942
  "step": 1350
943
  },
944
  {
945
  "epoch": 0.48,
946
+ "learning_rate": 0.0004488466168933368,
947
+ "loss": 3.4739,
948
  "step": 1360
949
  },
950
  {
951
  "epoch": 0.48,
952
+ "learning_rate": 0.00044427988943841534,
953
+ "loss": 3.3819,
954
  "step": 1370
955
  },
956
  {
957
  "epoch": 0.49,
958
+ "learning_rate": 0.0004397073110895268,
959
+ "loss": 3.3975,
960
  "step": 1380
961
  },
962
  {
963
  "epoch": 0.49,
964
+ "learning_rate": 0.0004351294860412936,
965
+ "loss": 3.4112,
966
  "step": 1390
967
  },
968
  {
969
  "epoch": 0.49,
970
+ "learning_rate": 0.000430547019181607,
971
+ "loss": 3.3955,
972
  "step": 1400
973
  },
974
  {
975
  "epoch": 0.49,
976
+ "eval_accuracy": 0.38511794160042556,
977
+ "eval_loss": 3.3999252319335938,
978
+ "eval_runtime": 150.5869,
979
+ "eval_samples_per_second": 33.635,
980
+ "eval_steps_per_second": 4.21,
981
  "step": 1400
982
  },
983
  {
984
  "epoch": 0.5,
985
+ "learning_rate": 0.00042596051601170143,
986
+ "loss": 3.3769,
987
  "step": 1410
988
  },
989
  {
990
  "epoch": 0.5,
991
+ "learning_rate": 0.00042137058256614605,
992
+ "loss": 3.389,
993
  "step": 1420
994
  },
995
  {
996
  "epoch": 0.5,
997
+ "learning_rate": 0.00041677782533276747,
998
+ "loss": 3.3465,
999
  "step": 1430
1000
  },
1001
  {
1002
  "epoch": 0.51,
1003
+ "learning_rate": 0.00041218285117251163,
1004
+ "loss": 3.3847,
1005
  "step": 1440
1006
  },
1007
  {
1008
  "epoch": 0.51,
1009
+ "learning_rate": 0.0004075862672392566,
1010
+ "loss": 3.3683,
1011
  "step": 1450
1012
  },
1013
  {
1014
  "epoch": 0.52,
1015
+ "learning_rate": 0.0004029886808995867,
1016
+ "loss": 3.3386,
1017
  "step": 1460
1018
  },
1019
  {
1020
  "epoch": 0.52,
1021
+ "learning_rate": 0.00039839069965253864,
1022
+ "loss": 3.3675,
1023
  "step": 1470
1024
  },
1025
  {
1026
  "epoch": 0.52,
1027
+ "learning_rate": 0.0003937929310493297,
1028
+ "loss": 3.3393,
1029
  "step": 1480
1030
  },
1031
  {
1032
  "epoch": 0.53,
1033
+ "learning_rate": 0.0003891959826130802,
1034
+ "loss": 3.4105,
1035
  "step": 1490
1036
  },
1037
  {
1038
  "epoch": 0.53,
1039
+ "learning_rate": 0.0003846004617585376,
1040
+ "loss": 3.3551,
1041
  "step": 1500
1042
  },
1043
  {
1044
  "epoch": 0.53,
1045
+ "eval_accuracy": 0.39192461845543836,
1046
+ "eval_loss": 3.3431735038757324,
1047
+ "eval_runtime": 150.786,
1048
+ "eval_samples_per_second": 33.591,
1049
+ "eval_steps_per_second": 4.205,
1050
  "step": 1500
1051
  },
1052
  {
1053
  "epoch": 0.53,
1054
+ "learning_rate": 0.00038000697571181723,
1055
+ "loss": 3.3163,
1056
  "step": 1510
1057
  },
1058
  {
1059
  "epoch": 0.54,
1060
+ "learning_rate": 0.00037541613143016596,
1061
+ "loss": 3.2978,
1062
  "step": 1520
1063
  },
1064
  {
1065
  "epoch": 0.54,
1066
+ "learning_rate": 0.00037082853552176324,
1067
+ "loss": 3.3012,
1068
  "step": 1530
1069
  },
1070
  {
1071
  "epoch": 0.54,
1072
+ "learning_rate": 0.0003662447941655669,
1073
+ "loss": 3.3617,
1074
  "step": 1540
1075
  },
1076
  {
1077
  "epoch": 0.55,
1078
+ "learning_rate": 0.00036166551303121566,
1079
+ "loss": 3.2746,
1080
  "step": 1550
1081
  },
1082
  {
1083
  "epoch": 0.55,
1084
+ "learning_rate": 0.00035709129719900003,
1085
+ "loss": 3.312,
1086
  "step": 1560
1087
  },
1088
  {
1089
  "epoch": 0.55,
1090
+ "learning_rate": 0.0003525227510799099,
1091
+ "loss": 3.3274,
1092
  "step": 1570
1093
  },
1094
  {
1095
  "epoch": 0.56,
1096
+ "learning_rate": 0.0003479604783357719,
1097
+ "loss": 3.2888,
1098
  "step": 1580
1099
  },
1100
  {
1101
  "epoch": 0.56,
1102
+ "learning_rate": 0.0003434050817994838,
1103
+ "loss": 3.3067,
1104
  "step": 1590
1105
  },
1106
  {
1107
  "epoch": 0.56,
1108
+ "learning_rate": 0.00033885716339536047,
1109
+ "loss": 3.2787,
1110
  "step": 1600
1111
  },
1112
  {
1113
  "epoch": 0.56,
1114
+ "eval_accuracy": 0.39735948545797645,
1115
+ "eval_loss": 3.2980780601501465,
1116
+ "eval_runtime": 151.955,
1117
+ "eval_samples_per_second": 33.332,
1118
+ "eval_steps_per_second": 4.172,
1119
  "step": 1600
1120
  },
1121
  {
1122
  "epoch": 0.57,
1123
+ "learning_rate": 0.00033431732405959886,
1124
+ "loss": 3.3245,
1125
  "step": 1610
1126
  },
1127
  {
1128
  "epoch": 0.57,
1129
+ "learning_rate": 0.0003297861636608732,
1130
+ "loss": 3.328,
1131
  "step": 1620
1132
  },
1133
  {
1134
  "epoch": 0.58,
1135
+ "learning_rate": 0.00032526428092107256,
1136
+ "loss": 3.2773,
1137
  "step": 1630
1138
  },
1139
  {
1140
  "epoch": 0.58,
1141
+ "learning_rate": 0.0003207522733361881,
1142
+ "loss": 3.2792,
1143
  "step": 1640
1144
  },
1145
  {
1146
  "epoch": 0.58,
1147
+ "learning_rate": 0.00031625073709736444,
1148
+ "loss": 3.2355,
1149
  "step": 1650
1150
  },
1151
  {
1152
  "epoch": 0.59,
1153
+ "learning_rate": 0.00031176026701212125,
1154
+ "loss": 3.2635,
1155
  "step": 1660
1156
  },
1157
  {
1158
  "epoch": 0.59,
1159
+ "learning_rate": 0.00030728145642576,
1160
+ "loss": 3.226,
1161
  "step": 1670
1162
  },
1163
  {
1164
  "epoch": 0.59,
1165
+ "learning_rate": 0.0003028148971429614,
1166
+ "loss": 3.2433,
1167
  "step": 1680
1168
  },
1169
  {
1170
  "epoch": 0.6,
1171
+ "learning_rate": 0.00029836117934958843,
1172
+ "loss": 3.2282,
1173
  "step": 1690
1174
  },
1175
  {
1176
  "epoch": 0.6,
1177
+ "learning_rate": 0.00029392089153470243,
1178
+ "loss": 3.2705,
1179
  "step": 1700
1180
  },
1181
  {
1182
  "epoch": 0.6,
1183
+ "eval_accuracy": 0.4022830704505329,
1184
+ "eval_loss": 3.2566046714782715,
1185
+ "eval_runtime": 151.1757,
1186
+ "eval_samples_per_second": 33.504,
1187
+ "eval_steps_per_second": 4.194,
1188
  "step": 1700
1189
  },
1190
  {
1191
  "epoch": 0.6,
1192
+ "learning_rate": 0.0002894946204128031,
1193
+ "loss": 3.2523,
1194
  "step": 1710
1195
  },
1196
  {
1197
  "epoch": 0.61,
1198
+ "learning_rate": 0.00028508295084630423,
1199
+ "loss": 3.2703,
1200
  "step": 1720
1201
  },
1202
  {
1203
  "epoch": 0.61,
1204
+ "learning_rate": 0.0002806864657682521,
1205
+ "loss": 3.2855,
1206
  "step": 1730
1207
  },
1208
  {
1209
  "epoch": 0.61,
1210
+ "learning_rate": 0.0002763057461053008,
1211
+ "loss": 3.2752,
1212
  "step": 1740
1213
  },
1214
  {
1215
  "epoch": 0.62,
1216
+ "learning_rate": 0.00027194137070095224,
1217
+ "loss": 3.2225,
1218
  "step": 1750
1219
  },
1220
  {
1221
  "epoch": 0.62,
1222
+ "learning_rate": 0.0002675939162390696,
1223
+ "loss": 3.2595,
1224
  "step": 1760
1225
  },
1226
  {
1227
  "epoch": 0.62,
1228
+ "learning_rate": 0.0002632639571676793,
1229
+ "loss": 3.2349,
1230
  "step": 1770
1231
  },
1232
  {
1233
  "epoch": 0.63,
1234
+ "learning_rate": 0.0002589520656230653,
1235
+ "loss": 3.1926,
1236
  "step": 1780
1237
  },
1238
  {
1239
  "epoch": 0.63,
1240
+ "learning_rate": 0.00025465881135417135,
1241
+ "loss": 3.2271,
1242
  "step": 1790
1243
  },
1244
  {
1245
  "epoch": 0.64,
1246
+ "learning_rate": 0.00025038476164731643,
1247
+ "loss": 3.2281,
1248
  "step": 1800
1249
  },
1250
  {
1251
  "epoch": 0.64,
1252
+ "eval_accuracy": 0.40748857910186626,
1253
+ "eval_loss": 3.217235803604126,
1254
+ "eval_runtime": 151.2208,
1255
+ "eval_samples_per_second": 33.494,
1256
+ "eval_steps_per_second": 4.193,
1257
  "step": 1800
1258
  },
1259
  {
1260
  "epoch": 0.64,
1261
+ "learning_rate": 0.00024613048125123803,
1262
+ "loss": 3.2461,
1263
  "step": 1810
1264
  },
1265
  {
1266
  "epoch": 0.64,
1267
+ "learning_rate": 0.00024189653230246853,
1268
+ "loss": 3.2236,
1269
  "step": 1820
1270
  },
1271
  {
1272
  "epoch": 0.65,
1273
+ "learning_rate": 0.0002376834742510578,
1274
+ "loss": 3.2269,
1275
  "step": 1830
1276
  },
1277
  {
1278
  "epoch": 0.65,
1279
+ "learning_rate": 0.00023349186378665126,
1280
+ "loss": 3.1916,
1281
  "step": 1840
1282
  },
1283
  {
1284
  "epoch": 0.65,
1285
+ "learning_rate": 0.00022932225476493065,
1286
+ "loss": 3.2088,
1287
  "step": 1850
1288
  },
1289
  {
1290
  "epoch": 0.66,
1291
+ "learning_rate": 0.00022517519813443292,
1292
+ "loss": 3.212,
1293
  "step": 1860
1294
  },
1295
  {
1296
  "epoch": 0.66,
1297
+ "learning_rate": 0.00022105124186374818,
1298
+ "loss": 3.2159,
1299
  "step": 1870
1300
  },
1301
  {
1302
  "epoch": 0.66,
1303
+ "learning_rate": 0.0002169509308691171,
1304
+ "loss": 3.2286,
1305
  "step": 1880
1306
  },
1307
  {
1308
  "epoch": 0.67,
1309
+ "learning_rate": 0.0002128748069424268,
1310
+ "loss": 3.1553,
1311
  "step": 1890
1312
  },
1313
  {
1314
  "epoch": 0.67,
1315
+ "learning_rate": 0.00020882340867962174,
1316
+ "loss": 3.1759,
1317
  "step": 1900
1318
  },
1319
  {
1320
  "epoch": 0.67,
1321
+ "eval_accuracy": 0.41184416942232654,
1322
+ "eval_loss": 3.1826136112213135,
1323
+ "eval_runtime": 150.8644,
1324
+ "eval_samples_per_second": 33.573,
1325
+ "eval_steps_per_second": 4.202,
1326
  "step": 1900
1327
  },
1328
  {
1329
  "epoch": 0.67,
1330
+ "learning_rate": 0.00020479727140953813,
1331
+ "loss": 3.1996,
1332
  "step": 1910
1333
  },
1334
  {
1335
  "epoch": 0.68,
1336
+ "learning_rate": 0.00020079692712316648,
1337
+ "loss": 3.207,
1338
  "step": 1920
1339
  },
1340
  {
1341
  "epoch": 0.68,
1342
+ "learning_rate": 0.00019682290440335907,
1343
+ "loss": 3.1934,
1344
  "step": 1930
1345
  },
1346
  {
1347
  "epoch": 0.68,
1348
+ "learning_rate": 0.00019287572835498522,
1349
+ "loss": 3.2055,
1350
  "step": 1940
1351
  },
1352
  {
1353
  "epoch": 0.69,
1354
+ "learning_rate": 0.0001889559205355469,
1355
+ "loss": 3.165,
1356
  "step": 1950
1357
  },
1358
  {
1359
  "epoch": 0.69,
1360
+ "learning_rate": 0.00018506399888626373,
1361
+ "loss": 3.2182,
1362
  "step": 1960
1363
  },
1364
  {
1365
  "epoch": 0.7,
1366
+ "learning_rate": 0.00018120047766363384,
1367
+ "loss": 3.173,
1368
  "step": 1970
1369
  },
1370
  {
1371
  "epoch": 0.7,
1372
+ "learning_rate": 0.0001773658673714842,
1373
+ "loss": 3.1718,
1374
  "step": 1980
1375
  },
1376
  {
1377
  "epoch": 0.7,
1378
+ "learning_rate": 0.0001735606746935151,
1379
+ "loss": 3.1621,
1380
  "step": 1990
1381
  },
1382
  {
1383
  "epoch": 0.71,
1384
+ "learning_rate": 0.00016978540242634958,
1385
+ "loss": 3.1603,
1386
  "step": 2000
1387
  },
1388
  {
1389
  "epoch": 0.71,
1390
+ "eval_accuracy": 0.4152190550686701,
1391
+ "eval_loss": 3.1547319889068604,
1392
+ "eval_runtime": 150.8119,
1393
+ "eval_samples_per_second": 33.585,
1394
+ "eval_steps_per_second": 4.204,
1395
  "step": 2000
1396
  },
1397
  {
1398
  "epoch": 0.71,
1399
+ "learning_rate": 0.00016604054941309713,
1400
+ "loss": 3.1781,
1401
  "step": 2010
1402
  },
1403
  {
1404
  "epoch": 0.71,
1405
+ "learning_rate": 0.0001623266104774391,
1406
+ "loss": 3.1261,
1407
  "step": 2020
1408
  },
1409
  {
1410
  "epoch": 0.72,
1411
+ "learning_rate": 0.00015864407635824562,
1412
+ "loss": 3.1293,
1413
  "step": 2030
1414
  },
1415
  {
1416
  "epoch": 0.72,
1417
+ "learning_rate": 0.0001549934336447321,
1418
+ "loss": 3.1486,
1419
  "step": 2040
1420
  },
1421
  {
1422
  "epoch": 0.72,
1423
+ "learning_rate": 0.00015137516471216422,
1424
+ "loss": 3.1247,
1425
  "step": 2050
1426
  },
1427
  {
1428
  "epoch": 0.73,
1429
+ "learning_rate": 0.00014778974765811928,
1430
+ "loss": 3.0984,
1431
  "step": 2060
1432
  },
1433
  {
1434
  "epoch": 0.73,
1435
+ "learning_rate": 0.00014423765623931364,
1436
+ "loss": 3.1173,
1437
  "step": 2070
1438
  },
1439
  {
1440
  "epoch": 0.73,
1441
+ "learning_rate": 0.0001407193598090021,
1442
+ "loss": 3.147,
1443
  "step": 2080
1444
  },
1445
  {
1446
  "epoch": 0.74,
1447
+ "learning_rate": 0.000137235323254962,
1448
+ "loss": 3.147,
1449
  "step": 2090
1450
  },
1451
  {
1452
  "epoch": 0.74,
1453
+ "learning_rate": 0.00013378600693806378,
1454
+ "loss": 3.1328,
1455
  "step": 2100
1456
  },
1457
  {
1458
  "epoch": 0.74,
1459
+ "eval_accuracy": 0.4185663559848014,
1460
+ "eval_loss": 3.1282718181610107,
1461
+ "eval_runtime": 150.9461,
1462
+ "eval_samples_per_second": 33.555,
1463
+ "eval_steps_per_second": 4.2,
1464
  "step": 2100
1465
  },
1466
  {
1467
  "epoch": 0.74,
1468
+ "learning_rate": 0.0001303718666314425,
1469
+ "loss": 3.1565,
1470
  "step": 2110
1471
  },
1472
  {
1473
  "epoch": 0.75,
1474
+ "learning_rate": 0.00012699335346027447,
1475
+ "loss": 3.1537,
1476
  "step": 2120
1477
  },
1478
  {
1479
  "epoch": 0.75,
1480
+ "learning_rate": 0.0001236509138421674,
1481
+ "loss": 3.1268,
1482
  "step": 2130
1483
  },
1484
  {
1485
  "epoch": 0.76,
1486
+ "learning_rate": 0.00012034498942817482,
1487
+ "loss": 3.1449,
1488
  "step": 2140
1489
  },
1490
  {
1491
  "epoch": 0.76,
1492
+ "learning_rate": 0.0001170760170444369,
1493
+ "loss": 3.1358,
1494
  "step": 2150
1495
  },
1496
  {
1497
  "epoch": 0.76,
1498
+ "learning_rate": 0.00011384442863446211,
1499
+ "loss": 3.0987,
1500
  "step": 2160
1501
  },
1502
  {
1503
  "epoch": 0.77,
1504
+ "learning_rate": 0.00011065065120205264,
1505
+ "loss": 3.1229,
1506
  "step": 2170
1507
  },
1508
  {
1509
  "epoch": 0.77,
1510
+ "learning_rate": 0.00010749510675488115,
1511
+ "loss": 3.0585,
1512
  "step": 2180
1513
  },
1514
  {
1515
  "epoch": 0.77,
1516
+ "learning_rate": 0.00010437821224873104,
1517
+ "loss": 3.1092,
1518
  "step": 2190
1519
  },
1520
  {
1521
  "epoch": 0.78,
1522
+ "learning_rate": 0.00010130037953240043,
1523
+ "loss": 3.0916,
1524
  "step": 2200
1525
  },
1526
  {
1527
  "epoch": 0.78,
1528
+ "eval_accuracy": 0.4215069268054616,
1529
+ "eval_loss": 3.105459690093994,
1530
+ "eval_runtime": 151.0796,
1531
+ "eval_samples_per_second": 33.525,
1532
+ "eval_steps_per_second": 4.196,
1533
  "step": 2200
1534
  },
1535
  {
1536
  "epoch": 0.78,
1537
+ "learning_rate": 9.826201529328414e-05,
1538
+ "loss": 3.1225,
1539
  "step": 2210
1540
  },
1541
  {
1542
  "epoch": 0.78,
1543
+ "learning_rate": 9.526352100363562e-05,
1544
+ "loss": 3.0914,
1545
  "step": 2220
1546
  },
1547
  {
1548
  "epoch": 0.79,
1549
+ "learning_rate": 9.230529286751886e-05,
1550
+ "loss": 3.1395,
1551
  "step": 2230
1552
  },
1553
  {
1554
  "epoch": 0.79,
1555
+ "learning_rate": 8.938772176845631e-05,
1556
+ "loss": 3.0953,
1557
  "step": 2240
1558
  },
1559
  {
1560
  "epoch": 0.79,
1561
+ "learning_rate": 8.651119321777952e-05,
1562
+ "loss": 3.0434,
1563
  "step": 2250
1564
  },
1565
  {
1566
  "epoch": 0.8,
1567
+ "learning_rate": 8.367608730369015e-05,
1568
+ "loss": 3.0658,
1569
  "step": 2260
1570
  },
1571
  {
1572
  "epoch": 0.8,
1573
+ "learning_rate": 8.088277864103697e-05,
1574
+ "loss": 3.0928,
1575
  "step": 2270
1576
  },
1577
  {
1578
  "epoch": 0.8,
1579
+ "learning_rate": 7.81316363218167e-05,
1580
+ "loss": 3.0746,
1581
  "step": 2280
1582
  },
1583
  {
1584
  "epoch": 0.81,
1585
+ "learning_rate": 7.542302386640385e-05,
1586
+ "loss": 3.0813,
1587
  "step": 2290
1588
  },
1589
  {
1590
  "epoch": 0.81,
1591
+ "learning_rate": 7.27572991755178e-05,
1592
+ "loss": 3.0939,
1593
  "step": 2300
1594
  },
1595
  {
1596
  "epoch": 0.81,
1597
+ "eval_accuracy": 0.42381410978240375,
1598
+ "eval_loss": 3.0875043869018555,
1599
+ "eval_runtime": 152.0694,
1600
+ "eval_samples_per_second": 33.307,
1601
+ "eval_steps_per_second": 4.169,
1602
  "step": 2300
1603
  },
1604
  {
1605
  "epoch": 0.82,
1606
+ "learning_rate": 7.013481448293085e-05,
1607
+ "loss": 3.0575,
1608
  "step": 2310
1609
  },
1610
  {
1611
  "epoch": 0.82,
1612
+ "learning_rate": 6.755591630892744e-05,
1613
+ "loss": 3.0907,
1614
  "step": 2320
1615
  },
1616
  {
1617
  "epoch": 0.82,
1618
+ "learning_rate": 6.502094541451573e-05,
1619
+ "loss": 3.0385,
1620
  "step": 2330
1621
  },
1622
  {
1623
  "epoch": 0.83,
1624
+ "learning_rate": 6.253023675640158e-05,
1625
+ "loss": 3.1125,
1626
  "step": 2340
1627
  },
1628
  {
1629
  "epoch": 0.83,
1630
+ "learning_rate": 6.008411944273e-05,
1631
+ "loss": 3.0955,
1632
  "step": 2350
1633
  },
1634
  {
1635
  "epoch": 0.83,
1636
+ "learning_rate": 5.7682916689597535e-05,
1637
+ "loss": 3.0929,
1638
  "step": 2360
1639
  },
1640
  {
1641
  "epoch": 0.84,
1642
+ "learning_rate": 5.5326945778345586e-05,
1643
+ "loss": 3.0564,
1644
  "step": 2370
1645
  },
1646
  {
1647
  "epoch": 0.84,
1648
+ "learning_rate": 5.301651801363528e-05,
1649
+ "loss": 3.0774,
1650
  "step": 2380
1651
  },
1652
  {
1653
  "epoch": 0.84,
1654
+ "learning_rate": 5.075193868231454e-05,
1655
+ "loss": 3.0838,
1656
  "step": 2390
1657
  },
1658
  {
1659
  "epoch": 0.85,
1660
+ "learning_rate": 4.853350701307897e-05,
1661
+ "loss": 3.0584,
1662
  "step": 2400
1663
  },
1664
  {
1665
  "epoch": 0.85,
1666
+ "eval_accuracy": 0.42569305429031773,
1667
+ "eval_loss": 3.073211431503296,
1668
+ "eval_runtime": 151.0975,
1669
+ "eval_samples_per_second": 33.521,
1670
+ "eval_steps_per_second": 4.196,
1671
  "step": 2400
1672
  },
1673
  {
1674
  "epoch": 0.85,
1675
+ "learning_rate": 4.636151613693276e-05,
1676
+ "loss": 3.0236,
1677
  "step": 2410
1678
  },
1679
  {
1680
  "epoch": 0.85,
1681
+ "learning_rate": 4.423625304845702e-05,
1682
+ "loss": 3.0852,
1683
  "step": 2420
1684
  },
1685
  {
1686
  "epoch": 0.86,
1687
+ "learning_rate": 4.215799856788727e-05,
1688
+ "loss": 3.1131,
1689
  "step": 2430
1690
  },
1691
  {
1692
  "epoch": 0.86,
1693
+ "learning_rate": 4.012702730400766e-05,
1694
+ "loss": 3.0559,
1695
  "step": 2440
1696
  },
1697
  {
1698
  "epoch": 0.86,
1699
+ "learning_rate": 3.8143607617865796e-05,
1700
+ "loss": 3.0686,
1701
  "step": 2450
1702
  },
1703
  {
1704
  "epoch": 0.87,
1705
+ "learning_rate": 3.620800158731288e-05,
1706
+ "loss": 3.0508,
1707
  "step": 2460
1708
  },
1709
  {
1710
  "epoch": 0.87,
1711
+ "learning_rate": 3.4320464972374246e-05,
1712
+ "loss": 3.0623,
1713
  "step": 2470
1714
  },
1715
  {
1716
  "epoch": 0.88,
1717
+ "learning_rate": 3.24812471814548e-05,
1718
+ "loss": 3.0673,
1719
  "step": 2480
1720
  },
1721
  {
1722
  "epoch": 0.88,
1723
+ "learning_rate": 3.069059123838347e-05,
1724
+ "loss": 3.0689,
1725
  "step": 2490
1726
  },
1727
  {
1728
  "epoch": 0.88,
1729
+ "learning_rate": 2.894873375030156e-05,
1730
+ "loss": 3.0711,
1731
  "step": 2500
1732
  },
1733
  {
1734
  "epoch": 0.88,
1735
+ "eval_accuracy": 0.42705097532758074,
1736
+ "eval_loss": 3.0630664825439453,
1737
+ "eval_runtime": 151.1382,
1738
+ "eval_samples_per_second": 33.512,
1739
+ "eval_steps_per_second": 4.195,
1740
  "step": 2500
1741
  },
1742
  {
1743
  "epoch": 0.89,
1744
+ "learning_rate": 2.7255904876398687e-05,
1745
+ "loss": 3.0574,
1746
  "step": 2510
1747
  },
1748
  {
1749
  "epoch": 0.89,
1750
+ "learning_rate": 2.5612328297500663e-05,
1751
+ "loss": 3.0566,
1752
  "step": 2520
1753
  },
1754
  {
1755
  "epoch": 0.89,
1756
+ "learning_rate": 2.4018221186514223e-05,
1757
+ "loss": 3.0702,
1758
  "step": 2530
1759
  },
1760
  {
1761
  "epoch": 0.9,
1762
+ "learning_rate": 2.2473794179730344e-05,
1763
+ "loss": 3.0446,
1764
  "step": 2540
1765
  },
1766
  {
1767
  "epoch": 0.9,
1768
+ "learning_rate": 2.0979251348992235e-05,
1769
+ "loss": 3.0475,
1770
  "step": 2550
1771
  },
1772
  {
1773
  "epoch": 0.9,
1774
+ "learning_rate": 1.953479017473052e-05,
1775
+ "loss": 3.065,
1776
  "step": 2560
1777
  },
1778
  {
1779
  "epoch": 0.91,
1780
+ "learning_rate": 1.8140601519869026e-05,
1781
+ "loss": 3.0562,
1782
  "step": 2570
1783
  },
1784
  {
1785
  "epoch": 0.91,
1786
+ "learning_rate": 1.6796869604605735e-05,
1787
+ "loss": 3.0301,
1788
  "step": 2580
1789
  },
1790
  {
1791
  "epoch": 0.91,
1792
+ "learning_rate": 1.5503771982070226e-05,
1793
+ "loss": 3.0694,
1794
  "step": 2590
1795
  },
1796
  {
1797
  "epoch": 0.92,
1798
+ "learning_rate": 1.4261479514863452e-05,
1799
+ "loss": 3.0612,
1800
  "step": 2600
1801
  },
1802
  {
1803
  "epoch": 0.92,
1804
+ "eval_accuracy": 0.4279909780571187,
1805
+ "eval_loss": 3.0565025806427,
1806
+ "eval_runtime": 151.9486,
1807
+ "eval_samples_per_second": 33.334,
1808
+ "eval_steps_per_second": 4.172,
1809
  "step": 2600
1810
  },
1811
  {
1812
  "epoch": 0.92,
1813
+ "learning_rate": 1.3070156352480877e-05,
1814
+ "loss": 3.0469,
1815
  "step": 2610
1816
  },
1817
  {
1818
  "epoch": 0.92,
1819
+ "learning_rate": 1.1929959909622045e-05,
1820
+ "loss": 3.0645,
1821
  "step": 2620
1822
  },
1823
  {
1824
  "epoch": 0.93,
1825
+ "learning_rate": 1.084104084539166e-05,
1826
+ "loss": 3.0639,
1827
  "step": 2630
1828
  },
1829
  {
1830
  "epoch": 0.93,
1831
+ "learning_rate": 9.803543043391417e-06,
1832
+ "loss": 3.0735,
1833
  "step": 2640
1834
  },
1835
  {
1836
  "epoch": 0.94,
1837
+ "learning_rate": 8.817603592708779e-06,
1838
+ "loss": 3.02,
1839
  "step": 2650
1840
  },
1841
  {
1842
  "epoch": 0.94,
1843
+ "learning_rate": 7.8833527698023e-06,
1844
+ "loss": 3.0485,
1845
  "step": 2660
1846
  },
1847
  {
1848
  "epoch": 0.94,
1849
+ "learning_rate": 7.0009140212878854e-06,
1850
+ "loss": 3.0469,
1851
  "step": 2670
1852
  },
1853
  {
1854
  "epoch": 0.95,
1855
+ "learning_rate": 6.170403947627179e-06,
1856
+ "loss": 3.033,
1857
  "step": 2680
1858
  },
1859
  {
1860
  "epoch": 0.95,
1861
+ "learning_rate": 5.39193228772068e-06,
1862
+ "loss": 3.0413,
1863
  "step": 2690
1864
  },
1865
  {
1866
  "epoch": 0.95,
1867
+ "learning_rate": 4.665601904407347e-06,
1868
+ "loss": 3.081,
1869
  "step": 2700
1870
  },
1871
  {
1872
  "epoch": 0.95,
1873
+ "eval_accuracy": 0.42835112275156717,
1874
+ "eval_loss": 3.0534024238586426,
1875
+ "eval_runtime": 151.3942,
1876
+ "eval_samples_per_second": 33.456,
1877
+ "eval_steps_per_second": 4.188,
1878
  "step": 2700
1879
  },
1880
  {
1881
  "epoch": 0.96,
1882
+ "learning_rate": 3.99150877087302e-06,
1883
+ "loss": 3.0195,
1884
  "step": 2710
1885
  },
1886
  {
1887
  "epoch": 0.96,
1888
+ "learning_rate": 3.3697419579690194e-06,
1889
+ "loss": 3.0411,
1890
  "step": 2720
1891
  },
1892
  {
1893
  "epoch": 0.96,
1894
+ "learning_rate": 2.800383622442837e-06,
1895
+ "loss": 3.0234,
1896
  "step": 2730
1897
  },
1898
  {
1899
  "epoch": 0.97,
1900
+ "learning_rate": 2.2835089960823395e-06,
1901
+ "loss": 3.0571,
1902
  "step": 2740
1903
  },
1904
  {
1905
  "epoch": 0.97,
1906
+ "learning_rate": 1.8191863757751392e-06,
1907
+ "loss": 3.0598,
1908
  "step": 2750
1909
  },
1910
  {
1911
  "epoch": 0.97,
1912
+ "learning_rate": 1.4074771144842568e-06,
1913
+ "loss": 3.0829,
1914
  "step": 2760
1915
  },
1916
  {
1917
  "epoch": 0.98,
1918
+ "learning_rate": 1.04843561314123e-06,
1919
+ "loss": 3.0236,
1920
  "step": 2770
1921
  },
1922
  {
1923
  "epoch": 0.98,
1924
+ "learning_rate": 7.421093134578616e-07,
1925
+ "loss": 3.0504,
1926
  "step": 2780
1927
  },
1928
  {
1929
  "epoch": 0.98,
1930
+ "learning_rate": 4.885386916575474e-07,
1931
+ "loss": 3.0446,
1932
  "step": 2790
1933
  },
1934
  {
1935
  "epoch": 0.99,
1936
+ "learning_rate": 2.877572531271078e-07,
1937
+ "loss": 3.0378,
1938
  "step": 2800
1939
  },
1940
  {
1941
  "epoch": 0.99,
1942
+ "eval_accuracy": 0.4284987878632974,
1943
+ "eval_loss": 3.052541494369507,
1944
+ "eval_runtime": 151.2777,
1945
+ "eval_samples_per_second": 33.481,
1946
+ "eval_steps_per_second": 4.191,
1947
  "step": 2800
1948
  },
1949
  {
1950
  "epoch": 0.99,
1951
+ "learning_rate": 1.3979152798935247e-07,
1952
+ "loss": 3.0729,
1953
  "step": 2810
1954
  },
1955
  {
1956
  "epoch": 1.0,
1957
+ "learning_rate": 4.4661067597751015e-08,
1958
+ "loss": 2.9979,
1959
  "step": 2820
1960
  },
1961
  {
1962
  "epoch": 1.0,
1963
+ "learning_rate": 2.3784419529437883e-09,
1964
+ "loss": 3.1122,
1965
  "step": 2830
1966
  },
1967
  {
1968
  "epoch": 1.0,
1969
  "step": 2833,
1970
+ "total_flos": 4.4316720769479475e+17,
1971
+ "train_loss": 3.7639108537323183,
1972
+ "train_runtime": 10013.9285,
1973
+ "train_samples_per_second": 9.052,
1974
+ "train_steps_per_second": 0.283
1975
  }
1976
  ],
1977
  "logging_steps": 10,
 
1979
  "num_input_tokens_seen": 0,
1980
  "num_train_epochs": 1,
1981
  "save_steps": 1000.0,
1982
+ "total_flos": 4.4316720769479475e+17,
1983
+ "train_batch_size": 8,
1984
  "trial_name": null,
1985
  "trial_params": null
1986
  }