amazingvince commited on
Commit
1f91365
1 Parent(s): fc5d451

Upload folder using huggingface_hub

Browse files
latest CHANGED
@@ -1 +1 @@
1
- global_step1600
 
1
+ global_step2000
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:12bcd55c8c5483c2da24f33e5437c9af6ef1e9f226291b0a1f759ccca73b1dc8
3
  size 4944210912
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ee84e03f44b623efe240c3597bb0a0dd775b052757823794d834266f22d2c46
3
  size 4944210912
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c2c606de25454f548d5a5f1e87c6c8fa8f46023382c6fa93b733220af67f053
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a83630f79f75d0d2bb42b192cb7ad0bc20d129202c371bc6dfed62ed617388c8
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d05616832d31132adf8fd752ad8b3ade0d48aa08c50eb2905a1ce8002cbac6fa
3
  size 4541564920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16118c183ee356e5fdd89c77c0b53fa90e510fdcc3c5fa15999020c2bd60d8b6
3
  size 4541564920
tokenizer.json CHANGED
@@ -1,11 +1,6 @@
1
  {
2
  "version": "1.0",
3
- "truncation": {
4
- "direction": "Right",
5
- "max_length": 4096,
6
- "strategy": "LongestFirst",
7
- "stride": 0
8
- },
9
  "padding": null,
10
  "added_tokens": [
11
  {
 
1
  {
2
  "version": "1.0",
3
+ "truncation": null,
 
 
 
 
 
4
  "padding": null,
5
  "added_tokens": [
6
  {
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.06498946459851235,
5
- "eval_steps": 400,
6
- "global_step": 1600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -17,43 +17,43 @@
17
  {
18
  "epoch": 0.0,
19
  "learning_rate": 4.048582995951417e-08,
20
- "loss": 3.6853,
21
  "step": 5
22
  },
23
  {
24
  "epoch": 0.0,
25
  "learning_rate": 8.097165991902834e-08,
26
- "loss": 3.6632,
27
  "step": 10
28
  },
29
  {
30
  "epoch": 0.0,
31
  "learning_rate": 1.214574898785425e-07,
32
- "loss": 3.393,
33
  "step": 15
34
  },
35
  {
36
  "epoch": 0.0,
37
  "learning_rate": 1.6194331983805668e-07,
38
- "loss": 3.0892,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 0.0,
43
  "learning_rate": 2.0242914979757083e-07,
44
- "loss": 2.659,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 0.0,
49
  "learning_rate": 2.42914979757085e-07,
50
- "loss": 2.1215,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.0,
55
  "learning_rate": 2.8340080971659917e-07,
56
- "loss": 1.7238,
57
  "step": 35
58
  },
59
  {
@@ -65,31 +65,31 @@
65
  {
66
  "epoch": 0.0,
67
  "learning_rate": 3.6437246963562754e-07,
68
- "loss": 1.187,
69
  "step": 45
70
  },
71
  {
72
  "epoch": 0.0,
73
  "learning_rate": 4.0485829959514166e-07,
74
- "loss": 1.1262,
75
  "step": 50
76
  },
77
  {
78
  "epoch": 0.0,
79
  "learning_rate": 4.4534412955465585e-07,
80
- "loss": 1.0247,
81
  "step": 55
82
  },
83
  {
84
  "epoch": 0.0,
85
  "learning_rate": 4.8582995951417e-07,
86
- "loss": 0.9772,
87
  "step": 60
88
  },
89
  {
90
  "epoch": 0.0,
91
  "learning_rate": 5.263157894736842e-07,
92
- "loss": 0.9556,
93
  "step": 65
94
  },
95
  {
@@ -101,13 +101,13 @@
101
  {
102
  "epoch": 0.0,
103
  "learning_rate": 6.072874493927125e-07,
104
- "loss": 0.8989,
105
  "step": 75
106
  },
107
  {
108
  "epoch": 0.0,
109
  "learning_rate": 6.477732793522267e-07,
110
- "loss": 0.8862,
111
  "step": 80
112
  },
113
  {
@@ -125,19 +125,19 @@
125
  {
126
  "epoch": 0.0,
127
  "learning_rate": 7.692307692307693e-07,
128
- "loss": 0.8479,
129
  "step": 95
130
  },
131
  {
132
  "epoch": 0.0,
133
  "learning_rate": 8.097165991902833e-07,
134
- "loss": 0.8216,
135
  "step": 100
136
  },
137
  {
138
  "epoch": 0.0,
139
  "learning_rate": 8.502024291497975e-07,
140
- "loss": 0.8435,
141
  "step": 105
142
  },
143
  {
@@ -167,19 +167,19 @@
167
  {
168
  "epoch": 0.01,
169
  "learning_rate": 1.0526315789473683e-06,
170
- "loss": 0.8037,
171
  "step": 130
172
  },
173
  {
174
  "epoch": 0.01,
175
  "learning_rate": 1.0931174089068826e-06,
176
- "loss": 0.8031,
177
  "step": 135
178
  },
179
  {
180
  "epoch": 0.01,
181
  "learning_rate": 1.1336032388663967e-06,
182
- "loss": 0.7934,
183
  "step": 140
184
  },
185
  {
@@ -203,13 +203,13 @@
203
  {
204
  "epoch": 0.01,
205
  "learning_rate": 1.2955465587044534e-06,
206
- "loss": 0.8285,
207
  "step": 160
208
  },
209
  {
210
  "epoch": 0.01,
211
  "learning_rate": 1.3360323886639675e-06,
212
- "loss": 0.7743,
213
  "step": 165
214
  },
215
  {
@@ -227,13 +227,13 @@
227
  {
228
  "epoch": 0.01,
229
  "learning_rate": 1.4574898785425101e-06,
230
- "loss": 0.772,
231
  "step": 180
232
  },
233
  {
234
  "epoch": 0.01,
235
  "learning_rate": 1.4979757085020242e-06,
236
- "loss": 0.7553,
237
  "step": 185
238
  },
239
  {
@@ -245,19 +245,19 @@
245
  {
246
  "epoch": 0.01,
247
  "learning_rate": 1.5789473684210526e-06,
248
- "loss": 0.7758,
249
  "step": 195
250
  },
251
  {
252
  "epoch": 0.01,
253
  "learning_rate": 1.6194331983805667e-06,
254
- "loss": 0.7248,
255
  "step": 200
256
  },
257
  {
258
  "epoch": 0.01,
259
  "learning_rate": 1.6599190283400807e-06,
260
- "loss": 0.7672,
261
  "step": 205
262
  },
263
  {
@@ -281,31 +281,31 @@
281
  {
282
  "epoch": 0.01,
283
  "learning_rate": 1.8218623481781377e-06,
284
- "loss": 0.7382,
285
  "step": 225
286
  },
287
  {
288
  "epoch": 0.01,
289
  "learning_rate": 1.8623481781376518e-06,
290
- "loss": 0.7609,
291
  "step": 230
292
  },
293
  {
294
  "epoch": 0.01,
295
  "learning_rate": 1.9028340080971658e-06,
296
- "loss": 0.7665,
297
  "step": 235
298
  },
299
  {
300
  "epoch": 0.01,
301
  "learning_rate": 1.94331983805668e-06,
302
- "loss": 0.7566,
303
  "step": 240
304
  },
305
  {
306
  "epoch": 0.01,
307
  "learning_rate": 1.983805668016194e-06,
308
- "loss": 0.7509,
309
  "step": 245
310
  },
311
  {
@@ -329,37 +329,37 @@
329
  {
330
  "epoch": 0.01,
331
  "learning_rate": 1.999997308265467e-06,
332
- "loss": 0.7509,
333
  "step": 265
334
  },
335
  {
336
  "epoch": 0.01,
337
  "learning_rate": 1.999995605163075e-06,
338
- "loss": 0.7695,
339
  "step": 270
340
  },
341
  {
342
  "epoch": 0.01,
343
  "learning_rate": 1.9999934866712048e-06,
344
- "loss": 0.7675,
345
  "step": 275
346
  },
347
  {
348
  "epoch": 0.01,
349
  "learning_rate": 1.9999909527907367e-06,
350
- "loss": 0.7599,
351
  "step": 280
352
  },
353
  {
354
  "epoch": 0.01,
355
  "learning_rate": 1.9999880035227236e-06,
356
- "loss": 0.778,
357
  "step": 285
358
  },
359
  {
360
  "epoch": 0.01,
361
  "learning_rate": 1.9999846388683895e-06,
362
- "loss": 0.7767,
363
  "step": 290
364
  },
365
  {
@@ -377,31 +377,31 @@
377
  {
378
  "epoch": 0.01,
379
  "learning_rate": 1.999972052602305e-06,
380
- "loss": 0.7417,
381
  "step": 305
382
  },
383
  {
384
  "epoch": 0.01,
385
  "learning_rate": 1.999967026418392e-06,
386
- "loss": 0.7828,
387
  "step": 310
388
  },
389
  {
390
  "epoch": 0.01,
391
  "learning_rate": 1.999961584856872e-06,
392
- "loss": 0.7268,
393
  "step": 315
394
  },
395
  {
396
  "epoch": 0.01,
397
  "learning_rate": 1.9999557279200056e-06,
398
- "loss": 0.7329,
399
  "step": 320
400
  },
401
  {
402
  "epoch": 0.01,
403
  "learning_rate": 1.9999494556102263e-06,
404
- "loss": 0.7068,
405
  "step": 325
406
  },
407
  {
@@ -413,153 +413,145 @@
413
  {
414
  "epoch": 0.01,
415
  "learning_rate": 1.999935664882522e-06,
416
- "loss": 0.7239,
417
  "step": 335
418
  },
419
  {
420
  "epoch": 0.01,
421
  "learning_rate": 1.9999281464703247e-06,
422
- "loss": 0.7189,
423
  "step": 340
424
  },
425
  {
426
  "epoch": 0.01,
427
  "learning_rate": 1.999920212696672e-06,
428
- "loss": 0.7475,
429
  "step": 345
430
  },
431
  {
432
  "epoch": 0.01,
433
  "learning_rate": 1.999911863564859e-06,
434
- "loss": 0.7164,
435
  "step": 350
436
  },
437
  {
438
  "epoch": 0.01,
439
  "learning_rate": 1.9999030990783527e-06,
440
- "loss": 0.7142,
441
  "step": 355
442
  },
443
  {
444
  "epoch": 0.01,
445
  "learning_rate": 1.999893919240795e-06,
446
- "loss": 0.709,
447
  "step": 360
448
  },
449
  {
450
  "epoch": 0.01,
451
  "learning_rate": 1.9998843240559986e-06,
452
- "loss": 0.7701,
453
  "step": 365
454
  },
455
  {
456
  "epoch": 0.02,
457
  "learning_rate": 1.9998743135279497e-06,
458
- "loss": 0.7455,
459
  "step": 370
460
  },
461
  {
462
  "epoch": 0.02,
463
  "learning_rate": 1.999863887660806e-06,
464
- "loss": 0.7528,
465
  "step": 375
466
  },
467
  {
468
  "epoch": 0.02,
469
  "learning_rate": 1.999853046458899e-06,
470
- "loss": 0.7012,
471
  "step": 380
472
  },
473
  {
474
  "epoch": 0.02,
475
  "learning_rate": 1.9998417899267313e-06,
476
- "loss": 0.7626,
477
  "step": 385
478
  },
479
  {
480
  "epoch": 0.02,
481
  "learning_rate": 1.999830118068979e-06,
482
- "loss": 0.7326,
483
  "step": 390
484
  },
485
  {
486
  "epoch": 0.02,
487
  "learning_rate": 1.999818030890491e-06,
488
- "loss": 0.7229,
489
  "step": 395
490
  },
491
  {
492
  "epoch": 0.02,
493
  "learning_rate": 1.999805528396288e-06,
494
- "loss": 0.7548,
495
- "step": 400
496
- },
497
- {
498
- "epoch": 0.02,
499
- "eval_loss": 0.7065751552581787,
500
- "eval_runtime": 145.6465,
501
- "eval_samples_per_second": 16.245,
502
- "eval_steps_per_second": 2.712,
503
  "step": 400
504
  },
505
  {
506
  "epoch": 0.02,
507
  "learning_rate": 1.9997926105915627e-06,
508
- "loss": 0.7118,
509
  "step": 405
510
  },
511
  {
512
  "epoch": 0.02,
513
  "learning_rate": 1.999779277481682e-06,
514
- "loss": 0.75,
515
  "step": 410
516
  },
517
  {
518
  "epoch": 0.02,
519
  "learning_rate": 1.9997655290721834e-06,
520
- "loss": 0.7281,
521
  "step": 415
522
  },
523
  {
524
  "epoch": 0.02,
525
  "learning_rate": 1.9997513653687786e-06,
526
- "loss": 0.7342,
527
  "step": 420
528
  },
529
  {
530
  "epoch": 0.02,
531
  "learning_rate": 1.999736786377351e-06,
532
- "loss": 0.7679,
533
  "step": 425
534
  },
535
  {
536
  "epoch": 0.02,
537
  "learning_rate": 1.9997217921039567e-06,
538
- "loss": 0.7422,
539
  "step": 430
540
  },
541
  {
542
  "epoch": 0.02,
543
  "learning_rate": 1.9997063825548237e-06,
544
- "loss": 0.7138,
545
  "step": 435
546
  },
547
  {
548
  "epoch": 0.02,
549
  "learning_rate": 1.9996905577363533e-06,
550
- "loss": 0.7608,
551
  "step": 440
552
  },
553
  {
554
  "epoch": 0.02,
555
  "learning_rate": 1.9996743176551186e-06,
556
- "loss": 0.7544,
557
  "step": 445
558
  },
559
  {
560
  "epoch": 0.02,
561
  "learning_rate": 1.999657662317866e-06,
562
- "loss": 0.7428,
563
  "step": 450
564
  },
565
  {
@@ -571,31 +563,31 @@
571
  {
572
  "epoch": 0.02,
573
  "learning_rate": 1.999623105903154e-06,
574
- "loss": 0.7283,
575
  "step": 460
576
  },
577
  {
578
  "epoch": 0.02,
579
  "learning_rate": 1.999605204840049e-06,
580
- "loss": 0.7598,
581
  "step": 465
582
  },
583
  {
584
  "epoch": 0.02,
585
  "learning_rate": 1.9995868885496343e-06,
586
- "loss": 0.7411,
587
  "step": 470
588
  },
589
  {
590
  "epoch": 0.02,
591
  "learning_rate": 1.9995681570395195e-06,
592
- "loss": 0.783,
593
  "step": 475
594
  },
595
  {
596
  "epoch": 0.02,
597
  "learning_rate": 1.9995490103174847e-06,
598
- "loss": 0.7341,
599
  "step": 480
600
  },
601
  {
@@ -607,55 +599,55 @@
607
  {
608
  "epoch": 0.02,
609
  "learning_rate": 1.9995094712696413e-06,
610
- "loss": 0.7664,
611
  "step": 490
612
  },
613
  {
614
  "epoch": 0.02,
615
  "learning_rate": 1.9994890789602576e-06,
616
- "loss": 0.7352,
617
  "step": 495
618
  },
619
  {
620
  "epoch": 0.02,
621
  "learning_rate": 1.999468271471802e-06,
622
- "loss": 0.7342,
623
  "step": 500
624
  },
625
  {
626
  "epoch": 0.02,
627
  "learning_rate": 1.9994470488129185e-06,
628
- "loss": 0.7475,
629
  "step": 505
630
  },
631
  {
632
  "epoch": 0.02,
633
  "learning_rate": 1.9994254109924223e-06,
634
- "loss": 0.7253,
635
  "step": 510
636
  },
637
  {
638
  "epoch": 0.02,
639
  "learning_rate": 1.9994033580193017e-06,
640
- "loss": 0.7301,
641
  "step": 515
642
  },
643
  {
644
  "epoch": 0.02,
645
  "learning_rate": 1.999380889902718e-06,
646
- "loss": 0.7114,
647
  "step": 520
648
  },
649
  {
650
  "epoch": 0.02,
651
  "learning_rate": 1.9993580066520034e-06,
652
- "loss": 0.7449,
653
  "step": 525
654
  },
655
  {
656
  "epoch": 0.02,
657
  "learning_rate": 1.9993347082766636e-06,
658
- "loss": 0.7522,
659
  "step": 530
660
  },
661
  {
@@ -673,241 +665,241 @@
673
  {
674
  "epoch": 0.02,
675
  "learning_rate": 1.999262322500535e-06,
676
- "loss": 0.7039,
677
  "step": 545
678
  },
679
  {
680
  "epoch": 0.02,
681
  "learning_rate": 1.9992373637251982e-06,
682
- "loss": 0.7095,
683
  "step": 550
684
  },
685
  {
686
  "epoch": 0.02,
687
  "learning_rate": 1.999211989875351e-06,
688
- "loss": 0.7138,
689
  "step": 555
690
  },
691
  {
692
  "epoch": 0.02,
693
  "learning_rate": 1.999186200961532e-06,
694
- "loss": 0.7423,
695
  "step": 560
696
  },
697
  {
698
  "epoch": 0.02,
699
  "learning_rate": 1.9991599969944552e-06,
700
- "loss": 0.7347,
701
  "step": 565
702
  },
703
  {
704
  "epoch": 0.02,
705
  "learning_rate": 1.9991333779850043e-06,
706
- "loss": 0.7125,
707
  "step": 570
708
  },
709
  {
710
  "epoch": 0.02,
711
  "learning_rate": 1.999106343944237e-06,
712
- "loss": 0.7339,
713
  "step": 575
714
  },
715
  {
716
  "epoch": 0.02,
717
  "learning_rate": 1.9990788948833833e-06,
718
- "loss": 0.7443,
719
  "step": 580
720
  },
721
  {
722
  "epoch": 0.02,
723
  "learning_rate": 1.999051030813845e-06,
724
- "loss": 0.7176,
725
  "step": 585
726
  },
727
  {
728
  "epoch": 0.02,
729
  "learning_rate": 1.999022751747197e-06,
730
- "loss": 0.729,
731
  "step": 590
732
  },
733
  {
734
  "epoch": 0.02,
735
  "learning_rate": 1.998994057695185e-06,
736
- "loss": 0.7155,
737
  "step": 595
738
  },
739
  {
740
  "epoch": 0.02,
741
  "learning_rate": 1.99896494866973e-06,
742
- "loss": 0.6842,
743
  "step": 600
744
  },
745
  {
746
  "epoch": 0.02,
747
  "learning_rate": 1.9989354246829222e-06,
748
- "loss": 0.7505,
749
  "step": 605
750
  },
751
  {
752
  "epoch": 0.02,
753
  "learning_rate": 1.9989054857470267e-06,
754
- "loss": 0.7319,
755
  "step": 610
756
  },
757
  {
758
  "epoch": 0.02,
759
  "learning_rate": 1.9988751318744787e-06,
760
- "loss": 0.7825,
761
  "step": 615
762
  },
763
  {
764
  "epoch": 0.03,
765
  "learning_rate": 1.998844363077888e-06,
766
- "loss": 0.7227,
767
  "step": 620
768
  },
769
  {
770
  "epoch": 0.03,
771
  "learning_rate": 1.998813179370035e-06,
772
- "loss": 0.7377,
773
  "step": 625
774
  },
775
  {
776
  "epoch": 0.03,
777
  "learning_rate": 1.9987815807638733e-06,
778
- "loss": 0.6932,
779
  "step": 630
780
  },
781
  {
782
  "epoch": 0.03,
783
  "learning_rate": 1.9987495672725294e-06,
784
- "loss": 0.7004,
785
  "step": 635
786
  },
787
  {
788
  "epoch": 0.03,
789
  "learning_rate": 1.9987171389093e-06,
790
- "loss": 0.7694,
791
  "step": 640
792
  },
793
  {
794
  "epoch": 0.03,
795
  "learning_rate": 1.998684295687657e-06,
796
- "loss": 0.7103,
797
  "step": 645
798
  },
799
  {
800
  "epoch": 0.03,
801
  "learning_rate": 1.998651037621242e-06,
802
- "loss": 0.7809,
803
  "step": 650
804
  },
805
  {
806
  "epoch": 0.03,
807
  "learning_rate": 1.9986173647238715e-06,
808
- "loss": 0.7522,
809
  "step": 655
810
  },
811
  {
812
  "epoch": 0.03,
813
  "learning_rate": 1.9985832770095313e-06,
814
- "loss": 0.7232,
815
  "step": 660
816
  },
817
  {
818
  "epoch": 0.03,
819
  "learning_rate": 1.998548774492382e-06,
820
- "loss": 0.7197,
821
  "step": 665
822
  },
823
  {
824
  "epoch": 0.03,
825
  "learning_rate": 1.9985138571867557e-06,
826
- "loss": 0.7298,
827
  "step": 670
828
  },
829
  {
830
  "epoch": 0.03,
831
  "learning_rate": 1.998478525107157e-06,
832
- "loss": 0.7372,
833
  "step": 675
834
  },
835
  {
836
  "epoch": 0.03,
837
  "learning_rate": 1.998442778268262e-06,
838
- "loss": 0.712,
839
  "step": 680
840
  },
841
  {
842
  "epoch": 0.03,
843
  "learning_rate": 1.99840661668492e-06,
844
- "loss": 0.7539,
845
  "step": 685
846
  },
847
  {
848
  "epoch": 0.03,
849
  "learning_rate": 1.998370040372151e-06,
850
- "loss": 0.768,
851
  "step": 690
852
  },
853
  {
854
  "epoch": 0.03,
855
  "learning_rate": 1.99833304934515e-06,
856
- "loss": 0.7025,
857
  "step": 695
858
  },
859
  {
860
  "epoch": 0.03,
861
  "learning_rate": 1.9982956436192827e-06,
862
- "loss": 0.7794,
863
  "step": 700
864
  },
865
  {
866
  "epoch": 0.03,
867
  "learning_rate": 1.9982578232100866e-06,
868
- "loss": 0.7322,
869
  "step": 705
870
  },
871
  {
872
  "epoch": 0.03,
873
  "learning_rate": 1.9982195881332714e-06,
874
- "loss": 0.7728,
875
  "step": 710
876
  },
877
  {
878
  "epoch": 0.03,
879
  "learning_rate": 1.9981809384047207e-06,
880
- "loss": 0.7405,
881
  "step": 715
882
  },
883
  {
884
  "epoch": 0.03,
885
  "learning_rate": 1.9981418740404886e-06,
886
- "loss": 0.7516,
887
  "step": 720
888
  },
889
  {
890
  "epoch": 0.03,
891
  "learning_rate": 1.998102395056802e-06,
892
- "loss": 0.7337,
893
  "step": 725
894
  },
895
  {
896
  "epoch": 0.03,
897
  "learning_rate": 1.998062501470061e-06,
898
- "loss": 0.7191,
899
  "step": 730
900
  },
901
  {
902
  "epoch": 0.03,
903
  "learning_rate": 1.998022193296836e-06,
904
- "loss": 0.7427,
905
  "step": 735
906
  },
907
  {
908
  "epoch": 0.03,
909
  "learning_rate": 1.9979814705538715e-06,
910
- "loss": 0.6949,
911
  "step": 740
912
  },
913
  {
@@ -931,81 +923,81 @@
931
  {
932
  "epoch": 0.03,
933
  "learning_rate": 1.9978144342255147e-06,
934
- "loss": 0.7009,
935
  "step": 760
936
  },
937
  {
938
  "epoch": 0.03,
939
  "learning_rate": 1.9977716388910325e-06,
940
- "loss": 0.73,
941
  "step": 765
942
  },
943
  {
944
  "epoch": 0.03,
945
  "learning_rate": 1.997728429090889e-06,
946
- "loss": 0.7655,
947
  "step": 770
948
  },
949
  {
950
  "epoch": 0.03,
951
  "learning_rate": 1.9976848048430323e-06,
952
- "loss": 0.7427,
953
  "step": 775
954
  },
955
  {
956
  "epoch": 0.03,
957
  "learning_rate": 1.9976407661655844e-06,
958
- "loss": 0.7059,
959
  "step": 780
960
  },
961
  {
962
  "epoch": 0.03,
963
  "learning_rate": 1.997596313076838e-06,
964
- "loss": 0.6849,
965
  "step": 785
966
  },
967
  {
968
  "epoch": 0.03,
969
  "learning_rate": 1.9975514455952584e-06,
970
- "loss": 0.7366,
971
  "step": 790
972
  },
973
  {
974
  "epoch": 0.03,
975
  "learning_rate": 1.9975061637394834e-06,
976
- "loss": 0.7218,
977
  "step": 795
978
  },
979
  {
980
  "epoch": 0.03,
981
  "learning_rate": 1.997460467528323e-06,
982
- "loss": 0.7159,
983
  "step": 800
984
  },
985
  {
986
  "epoch": 0.03,
987
- "eval_loss": 0.689598023891449,
988
- "eval_runtime": 144.3237,
989
- "eval_samples_per_second": 16.394,
990
- "eval_steps_per_second": 2.737,
991
  "step": 800
992
  },
993
  {
994
  "epoch": 0.03,
995
  "learning_rate": 1.997414356980759e-06,
996
- "loss": 0.7908,
997
  "step": 805
998
  },
999
  {
1000
  "epoch": 0.03,
1001
  "learning_rate": 1.9973678321159443e-06,
1002
- "loss": 0.7038,
1003
  "step": 810
1004
  },
1005
  {
1006
  "epoch": 0.03,
1007
  "learning_rate": 1.9973208929532063e-06,
1008
- "loss": 0.7079,
1009
  "step": 815
1010
  },
1011
  {
@@ -1017,13 +1009,13 @@
1017
  {
1018
  "epoch": 0.03,
1019
  "learning_rate": 1.997225771812122e-06,
1020
- "loss": 0.7225,
1021
  "step": 825
1022
  },
1023
  {
1024
  "epoch": 0.03,
1025
  "learning_rate": 1.9971775898732893e-06,
1026
- "loss": 0.727,
1027
  "step": 830
1028
  },
1029
  {
@@ -1041,7 +1033,7 @@
1041
  {
1042
  "epoch": 0.03,
1043
  "learning_rate": 1.9970305588243145e-06,
1044
- "loss": 0.7061,
1045
  "step": 845
1046
  },
1047
  {
@@ -1053,19 +1045,19 @@
1053
  {
1054
  "epoch": 0.03,
1055
  "learning_rate": 1.9969304673019494e-06,
1056
- "loss": 0.7163,
1057
  "step": 855
1058
  },
1059
  {
1060
  "epoch": 0.03,
1061
  "learning_rate": 1.99687980035596e-06,
1062
- "loss": 0.7295,
1063
  "step": 860
1064
  },
1065
  {
1066
  "epoch": 0.04,
1067
  "learning_rate": 1.996828719314771e-06,
1068
- "loss": 0.7198,
1069
  "step": 865
1070
  },
1071
  {
@@ -1077,67 +1069,67 @@
1077
  {
1078
  "epoch": 0.04,
1079
  "learning_rate": 1.99672531503184e-06,
1080
- "loss": 0.7348,
1081
  "step": 875
1082
  },
1083
  {
1084
  "epoch": 0.04,
1085
  "learning_rate": 1.996672991833051e-06,
1086
- "loss": 0.7151,
1087
  "step": 880
1088
  },
1089
  {
1090
  "epoch": 0.04,
1091
  "learning_rate": 1.996620254624969e-06,
1092
- "loss": 0.7141,
1093
  "step": 885
1094
  },
1095
  {
1096
  "epoch": 0.04,
1097
  "learning_rate": 1.9965671034295e-06,
1098
- "loss": 0.7306,
1099
  "step": 890
1100
  },
1101
  {
1102
  "epoch": 0.04,
1103
  "learning_rate": 1.996513538268723e-06,
1104
- "loss": 0.7806,
1105
  "step": 895
1106
  },
1107
  {
1108
  "epoch": 0.04,
1109
  "learning_rate": 1.9964595591648883e-06,
1110
- "loss": 0.7404,
1111
  "step": 900
1112
  },
1113
  {
1114
  "epoch": 0.04,
1115
  "learning_rate": 1.9964051661404185e-06,
1116
- "loss": 0.6829,
1117
  "step": 905
1118
  },
1119
  {
1120
  "epoch": 0.04,
1121
  "learning_rate": 1.9963503592179078e-06,
1122
- "loss": 0.7181,
1123
  "step": 910
1124
  },
1125
  {
1126
  "epoch": 0.04,
1127
  "learning_rate": 1.996295138420122e-06,
1128
- "loss": 0.7605,
1129
  "step": 915
1130
  },
1131
  {
1132
  "epoch": 0.04,
1133
  "learning_rate": 1.9962395037700007e-06,
1134
- "loss": 0.7469,
1135
  "step": 920
1136
  },
1137
  {
1138
  "epoch": 0.04,
1139
  "learning_rate": 1.996183455290653e-06,
1140
- "loss": 0.6913,
1141
  "step": 925
1142
  },
1143
  {
@@ -1149,73 +1141,73 @@
1149
  {
1150
  "epoch": 0.04,
1151
  "learning_rate": 1.996070116937579e-06,
1152
- "loss": 0.7193,
1153
  "step": 935
1154
  },
1155
  {
1156
  "epoch": 0.04,
1157
  "learning_rate": 1.9960128271109326e-06,
1158
- "loss": 0.6973,
1159
  "step": 940
1160
  },
1161
  {
1162
  "epoch": 0.04,
1163
  "learning_rate": 1.9959551235492195e-06,
1164
- "loss": 0.7398,
1165
  "step": 945
1166
  },
1167
  {
1168
  "epoch": 0.04,
1169
  "learning_rate": 1.9958970062764095e-06,
1170
- "loss": 0.7474,
1171
  "step": 950
1172
  },
1173
  {
1174
  "epoch": 0.04,
1175
  "learning_rate": 1.9958384753166437e-06,
1176
- "loss": 0.7088,
1177
  "step": 955
1178
  },
1179
  {
1180
  "epoch": 0.04,
1181
  "learning_rate": 1.995779530694236e-06,
1182
- "loss": 0.6909,
1183
  "step": 960
1184
  },
1185
  {
1186
  "epoch": 0.04,
1187
  "learning_rate": 1.9957201724336704e-06,
1188
- "loss": 0.705,
1189
  "step": 965
1190
  },
1191
  {
1192
  "epoch": 0.04,
1193
  "learning_rate": 1.9956604005596043e-06,
1194
- "loss": 0.6961,
1195
  "step": 970
1196
  },
1197
  {
1198
  "epoch": 0.04,
1199
  "learning_rate": 1.9956002150968667e-06,
1200
- "loss": 0.7066,
1201
  "step": 975
1202
  },
1203
  {
1204
  "epoch": 0.04,
1205
  "learning_rate": 1.9955396160704582e-06,
1206
- "loss": 0.6803,
1207
  "step": 980
1208
  },
1209
  {
1210
  "epoch": 0.04,
1211
  "learning_rate": 1.99547860350555e-06,
1212
- "loss": 0.6762,
1213
  "step": 985
1214
  },
1215
  {
1216
  "epoch": 0.04,
1217
  "learning_rate": 1.995417177427488e-06,
1218
- "loss": 0.7174,
1219
  "step": 990
1220
  },
1221
  {
@@ -1227,31 +1219,31 @@
1227
  {
1228
  "epoch": 0.04,
1229
  "learning_rate": 1.995293084834134e-06,
1230
- "loss": 0.7111,
1231
  "step": 1000
1232
  },
1233
  {
1234
  "epoch": 0.04,
1235
  "learning_rate": 1.9952304183703893e-06,
1236
- "loss": 0.7128,
1237
  "step": 1005
1238
  },
1239
  {
1240
  "epoch": 0.04,
1241
  "learning_rate": 1.9951673384965835e-06,
1242
- "loss": 0.7116,
1243
  "step": 1010
1244
  },
1245
  {
1246
  "epoch": 0.04,
1247
  "learning_rate": 1.99510384523892e-06,
1248
- "loss": 0.7691,
1249
  "step": 1015
1250
  },
1251
  {
1252
  "epoch": 0.04,
1253
  "learning_rate": 1.995039938623773e-06,
1254
- "loss": 0.7378,
1255
  "step": 1020
1256
  },
1257
  {
@@ -1263,13 +1255,13 @@
1263
  {
1264
  "epoch": 0.04,
1265
  "learning_rate": 1.9949108854273855e-06,
1266
- "loss": 0.7286,
1267
  "step": 1030
1268
  },
1269
  {
1270
  "epoch": 0.04,
1271
  "learning_rate": 1.9948457388997528e-06,
1272
- "loss": 0.7043,
1273
  "step": 1035
1274
  },
1275
  {
@@ -1281,67 +1273,67 @@
1281
  {
1282
  "epoch": 0.04,
1283
  "learning_rate": 1.994714206120914e-06,
1284
- "loss": 0.7251,
1285
  "step": 1045
1286
  },
1287
  {
1288
  "epoch": 0.04,
1289
  "learning_rate": 1.9946478199243466e-06,
1290
- "loss": 0.7202,
1291
  "step": 1050
1292
  },
1293
  {
1294
  "epoch": 0.04,
1295
  "learning_rate": 1.9945810205597246e-06,
1296
- "loss": 0.701,
1297
  "step": 1055
1298
  },
1299
  {
1300
  "epoch": 0.04,
1301
  "learning_rate": 1.9945138080547957e-06,
1302
- "loss": 0.6945,
1303
  "step": 1060
1304
  },
1305
  {
1306
  "epoch": 0.04,
1307
  "learning_rate": 1.99444618243748e-06,
1308
- "loss": 0.7149,
1309
  "step": 1065
1310
  },
1311
  {
1312
  "epoch": 0.04,
1313
  "learning_rate": 1.994378143735868e-06,
1314
- "loss": 0.707,
1315
  "step": 1070
1316
  },
1317
  {
1318
  "epoch": 0.04,
1319
  "learning_rate": 1.9943096919782225e-06,
1320
- "loss": 0.6998,
1321
  "step": 1075
1322
  },
1323
  {
1324
  "epoch": 0.04,
1325
  "learning_rate": 1.994240827192978e-06,
1326
- "loss": 0.696,
1327
  "step": 1080
1328
  },
1329
  {
1330
  "epoch": 0.04,
1331
  "learning_rate": 1.9941715494087408e-06,
1332
- "loss": 0.7346,
1333
  "step": 1085
1334
  },
1335
  {
1336
  "epoch": 0.04,
1337
  "learning_rate": 1.9941018586542866e-06,
1338
- "loss": 0.6981,
1339
  "step": 1090
1340
  },
1341
  {
1342
  "epoch": 0.04,
1343
  "learning_rate": 1.9940317549585665e-06,
1344
- "loss": 0.7253,
1345
  "step": 1095
1346
  },
1347
  {
@@ -1353,79 +1345,79 @@
1353
  {
1354
  "epoch": 0.04,
1355
  "learning_rate": 1.993890308859978e-06,
1356
- "loss": 0.7262,
1357
  "step": 1105
1358
  },
1359
  {
1360
  "epoch": 0.05,
1361
  "learning_rate": 1.9938189665158654e-06,
1362
- "loss": 0.6877,
1363
  "step": 1110
1364
  },
1365
  {
1366
  "epoch": 0.05,
1367
  "learning_rate": 1.9937472113479966e-06,
1368
- "loss": 0.7085,
1369
  "step": 1115
1370
  },
1371
  {
1372
  "epoch": 0.05,
1373
  "learning_rate": 1.9936750433861787e-06,
1374
- "loss": 0.7426,
1375
  "step": 1120
1376
  },
1377
  {
1378
  "epoch": 0.05,
1379
  "learning_rate": 1.993602462660389e-06,
1380
- "loss": 0.7109,
1381
  "step": 1125
1382
  },
1383
  {
1384
  "epoch": 0.05,
1385
  "learning_rate": 1.993529469200777e-06,
1386
- "loss": 0.7023,
1387
  "step": 1130
1388
  },
1389
  {
1390
  "epoch": 0.05,
1391
  "learning_rate": 1.993456063037664e-06,
1392
- "loss": 0.6973,
1393
  "step": 1135
1394
  },
1395
  {
1396
  "epoch": 0.05,
1397
  "learning_rate": 1.9933822442015416e-06,
1398
- "loss": 0.7345,
1399
  "step": 1140
1400
  },
1401
  {
1402
  "epoch": 0.05,
1403
  "learning_rate": 1.993308012723074e-06,
1404
- "loss": 0.7175,
1405
  "step": 1145
1406
  },
1407
  {
1408
  "epoch": 0.05,
1409
  "learning_rate": 1.993233368633096e-06,
1410
- "loss": 0.6995,
1411
  "step": 1150
1412
  },
1413
  {
1414
  "epoch": 0.05,
1415
  "learning_rate": 1.993158311962614e-06,
1416
- "loss": 0.6925,
1417
  "step": 1155
1418
  },
1419
  {
1420
  "epoch": 0.05,
1421
  "learning_rate": 1.9930828427428066e-06,
1422
- "loss": 0.7135,
1423
  "step": 1160
1424
  },
1425
  {
1426
  "epoch": 0.05,
1427
  "learning_rate": 1.9930069610050224e-06,
1428
- "loss": 0.7213,
1429
  "step": 1165
1430
  },
1431
  {
@@ -1437,63 +1429,55 @@
1437
  {
1438
  "epoch": 0.05,
1439
  "learning_rate": 1.992853960101778e-06,
1440
- "loss": 0.679,
1441
  "step": 1175
1442
  },
1443
  {
1444
  "epoch": 0.05,
1445
  "learning_rate": 1.9927768409998733e-06,
1446
- "loss": 0.7349,
1447
  "step": 1180
1448
  },
1449
  {
1450
  "epoch": 0.05,
1451
  "learning_rate": 1.992699309507102e-06,
1452
- "loss": 0.6724,
1453
  "step": 1185
1454
  },
1455
  {
1456
  "epoch": 0.05,
1457
  "learning_rate": 1.992621365655671e-06,
1458
- "loss": 0.7149,
1459
  "step": 1190
1460
  },
1461
  {
1462
  "epoch": 0.05,
1463
  "learning_rate": 1.9925430094779566e-06,
1464
- "loss": 0.6983,
1465
  "step": 1195
1466
  },
1467
  {
1468
  "epoch": 0.05,
1469
  "learning_rate": 1.9924642410065075e-06,
1470
- "loss": 0.7384,
1471
- "step": 1200
1472
- },
1473
- {
1474
- "epoch": 0.05,
1475
- "eval_loss": 0.6822034120559692,
1476
- "eval_runtime": 145.3328,
1477
- "eval_samples_per_second": 16.28,
1478
- "eval_steps_per_second": 2.718,
1479
  "step": 1200
1480
  },
1481
  {
1482
  "epoch": 0.05,
1483
  "learning_rate": 1.992385060274044e-06,
1484
- "loss": 0.6982,
1485
  "step": 1205
1486
  },
1487
  {
1488
  "epoch": 0.05,
1489
  "learning_rate": 1.9923054673134564e-06,
1490
- "loss": 0.7896,
1491
  "step": 1210
1492
  },
1493
  {
1494
  "epoch": 0.05,
1495
  "learning_rate": 1.992225462157807e-06,
1496
- "loss": 0.7141,
1497
  "step": 1215
1498
  },
1499
  {
@@ -1505,7 +1489,7 @@
1505
  {
1506
  "epoch": 0.05,
1507
  "learning_rate": 1.9920642153944288e-06,
1508
- "loss": 0.7244,
1509
  "step": 1225
1510
  },
1511
  {
@@ -1517,13 +1501,13 @@
1517
  {
1518
  "epoch": 0.05,
1519
  "learning_rate": 1.991901320251831e-06,
1520
- "loss": 0.6934,
1521
  "step": 1235
1522
  },
1523
  {
1524
  "epoch": 0.05,
1525
  "learning_rate": 1.9918192546227995e-06,
1526
- "loss": 0.7273,
1527
  "step": 1240
1528
  },
1529
  {
@@ -1535,91 +1519,91 @@
1535
  {
1536
  "epoch": 0.05,
1537
  "learning_rate": 1.9916538874197176e-06,
1538
- "loss": 0.7635,
1539
  "step": 1250
1540
  },
1541
  {
1542
  "epoch": 0.05,
1543
  "learning_rate": 1.9915705859143594e-06,
1544
- "loss": 0.6727,
1545
  "step": 1255
1546
  },
1547
  {
1548
  "epoch": 0.05,
1549
  "learning_rate": 1.9914868725192025e-06,
1550
- "loss": 0.6944,
1551
  "step": 1260
1552
  },
1553
  {
1554
  "epoch": 0.05,
1555
  "learning_rate": 1.991402747269022e-06,
1556
- "loss": 0.743,
1557
  "step": 1265
1558
  },
1559
  {
1560
  "epoch": 0.05,
1561
  "learning_rate": 1.991318210198761e-06,
1562
- "loss": 0.7016,
1563
  "step": 1270
1564
  },
1565
  {
1566
  "epoch": 0.05,
1567
  "learning_rate": 1.991233261343537e-06,
1568
- "loss": 0.6771,
1569
  "step": 1275
1570
  },
1571
  {
1572
  "epoch": 0.05,
1573
  "learning_rate": 1.9911479007386364e-06,
1574
- "loss": 0.7277,
1575
  "step": 1280
1576
  },
1577
  {
1578
  "epoch": 0.05,
1579
  "learning_rate": 1.991062128419517e-06,
1580
- "loss": 0.7478,
1581
  "step": 1285
1582
  },
1583
  {
1584
  "epoch": 0.05,
1585
  "learning_rate": 1.9909759444218085e-06,
1586
- "loss": 0.7236,
1587
  "step": 1290
1588
  },
1589
  {
1590
  "epoch": 0.05,
1591
  "learning_rate": 1.9908893487813106e-06,
1592
- "loss": 0.7117,
1593
  "step": 1295
1594
  },
1595
  {
1596
  "epoch": 0.05,
1597
  "learning_rate": 1.990802341533994e-06,
1598
- "loss": 0.7349,
1599
  "step": 1300
1600
  },
1601
  {
1602
  "epoch": 0.05,
1603
  "learning_rate": 1.9907149227160016e-06,
1604
- "loss": 0.7086,
1605
  "step": 1305
1606
  },
1607
  {
1608
  "epoch": 0.05,
1609
  "learning_rate": 1.9906270923636457e-06,
1610
- "loss": 0.7175,
1611
  "step": 1310
1612
  },
1613
  {
1614
  "epoch": 0.05,
1615
  "learning_rate": 1.9905388505134107e-06,
1616
- "loss": 0.6936,
1617
  "step": 1315
1618
  },
1619
  {
1620
  "epoch": 0.05,
1621
  "learning_rate": 1.990450197201951e-06,
1622
- "loss": 0.7008,
1623
  "step": 1320
1624
  },
1625
  {
@@ -1631,31 +1615,31 @@
1631
  {
1632
  "epoch": 0.05,
1633
  "learning_rate": 1.9902716563428335e-06,
1634
- "loss": 0.7222,
1635
  "step": 1330
1636
  },
1637
  {
1638
  "epoch": 0.05,
1639
  "learning_rate": 1.9901817688693395e-06,
1640
- "loss": 0.7026,
1641
  "step": 1335
1642
  },
1643
  {
1644
  "epoch": 0.05,
1645
  "learning_rate": 1.99009147008295e-06,
1646
- "loss": 0.714,
1647
  "step": 1340
1648
  },
1649
  {
1650
  "epoch": 0.05,
1651
  "learning_rate": 1.9900007600211735e-06,
1652
- "loss": 0.6605,
1653
  "step": 1345
1654
  },
1655
  {
1656
  "epoch": 0.05,
1657
  "learning_rate": 1.9899096387216914e-06,
1658
- "loss": 0.7451,
1659
  "step": 1350
1660
  },
1661
  {
@@ -1667,7 +1651,7 @@
1667
  {
1668
  "epoch": 0.06,
1669
  "learning_rate": 1.9897261625611822e-06,
1670
- "loss": 0.692,
1671
  "step": 1360
1672
  },
1673
  {
@@ -1679,13 +1663,13 @@
1679
  {
1680
  "epoch": 0.06,
1681
  "learning_rate": 1.989541041906281e-06,
1682
- "loss": 0.7147,
1683
  "step": 1370
1684
  },
1685
  {
1686
  "epoch": 0.06,
1687
  "learning_rate": 1.9894478649894484e-06,
1688
- "loss": 0.7042,
1689
  "step": 1375
1690
  },
1691
  {
@@ -1697,7 +1681,7 @@
1697
  {
1698
  "epoch": 0.06,
1699
  "learning_rate": 1.9892602781705427e-06,
1700
- "loss": 0.6959,
1701
  "step": 1385
1702
  },
1703
  {
@@ -1715,49 +1699,49 @@
1715
  {
1716
  "epoch": 0.06,
1717
  "learning_rate": 1.98897581606478e-06,
1718
- "loss": 0.6866,
1719
  "step": 1400
1720
  },
1721
  {
1722
  "epoch": 0.06,
1723
  "learning_rate": 1.988880173686265e-06,
1724
- "loss": 0.7433,
1725
  "step": 1405
1726
  },
1727
  {
1728
  "epoch": 0.06,
1729
  "learning_rate": 1.988784120535525e-06,
1730
- "loss": 0.7481,
1731
  "step": 1410
1732
  },
1733
  {
1734
  "epoch": 0.06,
1735
  "learning_rate": 1.988687656652461e-06,
1736
- "loss": 0.7067,
1737
  "step": 1415
1738
  },
1739
  {
1740
  "epoch": 0.06,
1741
  "learning_rate": 1.9885907820771415e-06,
1742
- "loss": 0.7131,
1743
  "step": 1420
1744
  },
1745
  {
1746
  "epoch": 0.06,
1747
  "learning_rate": 1.988493496849809e-06,
1748
- "loss": 0.7312,
1749
  "step": 1425
1750
  },
1751
  {
1752
  "epoch": 0.06,
1753
  "learning_rate": 1.9883958010108736e-06,
1754
- "loss": 0.6986,
1755
  "step": 1430
1756
  },
1757
  {
1758
  "epoch": 0.06,
1759
  "learning_rate": 1.9882976946009186e-06,
1760
- "loss": 0.7088,
1761
  "step": 1435
1762
  },
1763
  {
@@ -1769,31 +1753,31 @@
1769
  {
1770
  "epoch": 0.06,
1771
  "learning_rate": 1.9881002502311285e-06,
1772
- "loss": 0.6539,
1773
  "step": 1445
1774
  },
1775
  {
1776
  "epoch": 0.06,
1777
  "learning_rate": 1.9880009123533095e-06,
1778
- "loss": 0.7095,
1779
  "step": 1450
1780
  },
1781
  {
1782
  "epoch": 0.06,
1783
  "learning_rate": 1.9879011640685043e-06,
1784
- "loss": 0.733,
1785
  "step": 1455
1786
  },
1787
  {
1788
  "epoch": 0.06,
1789
  "learning_rate": 1.9878010054181463e-06,
1790
- "loss": 0.7415,
1791
  "step": 1460
1792
  },
1793
  {
1794
  "epoch": 0.06,
1795
  "learning_rate": 1.9877004364438414e-06,
1796
- "loss": 0.7086,
1797
  "step": 1465
1798
  },
1799
  {
@@ -1805,25 +1789,25 @@
1805
  {
1806
  "epoch": 0.06,
1807
  "learning_rate": 1.9874980676906617e-06,
1808
- "loss": 0.708,
1809
  "step": 1475
1810
  },
1811
  {
1812
  "epoch": 0.06,
1813
  "learning_rate": 1.9873962679958494e-06,
1814
- "loss": 0.6988,
1815
  "step": 1480
1816
  },
1817
  {
1818
  "epoch": 0.06,
1819
  "learning_rate": 1.987294058145214e-06,
1820
- "loss": 0.7455,
1821
  "step": 1485
1822
  },
1823
  {
1824
  "epoch": 0.06,
1825
  "learning_rate": 1.987191438181213e-06,
1826
- "loss": 0.7406,
1827
  "step": 1490
1828
  },
1829
  {
@@ -1835,25 +1819,25 @@
1835
  {
1836
  "epoch": 0.06,
1837
  "learning_rate": 1.986984968083793e-06,
1838
- "loss": 0.7195,
1839
  "step": 1500
1840
  },
1841
  {
1842
  "epoch": 0.06,
1843
  "learning_rate": 1.9868811180361402e-06,
1844
- "loss": 0.7384,
1845
  "step": 1505
1846
  },
1847
  {
1848
  "epoch": 0.06,
1849
  "learning_rate": 1.9867768580466536e-06,
1850
- "loss": 0.7021,
1851
  "step": 1510
1852
  },
1853
  {
1854
  "epoch": 0.06,
1855
  "learning_rate": 1.986672188158641e-06,
1856
- "loss": 0.724,
1857
  "step": 1515
1858
  },
1859
  {
@@ -1865,114 +1849,594 @@
1865
  {
1866
  "epoch": 0.06,
1867
  "learning_rate": 1.986461618861127e-06,
1868
- "loss": 0.7033,
1869
  "step": 1525
1870
  },
1871
  {
1872
  "epoch": 0.06,
1873
  "learning_rate": 1.986355719539093e-06,
1874
- "loss": 0.7234,
1875
  "step": 1530
1876
  },
1877
  {
1878
  "epoch": 0.06,
1879
  "learning_rate": 1.9862494104934717e-06,
1880
- "loss": 0.718,
1881
  "step": 1535
1882
  },
1883
  {
1884
  "epoch": 0.06,
1885
  "learning_rate": 1.9861426917684214e-06,
1886
- "loss": 0.7014,
1887
  "step": 1540
1888
  },
1889
  {
1890
  "epoch": 0.06,
1891
  "learning_rate": 1.986035563408273e-06,
1892
- "loss": 0.6941,
1893
  "step": 1545
1894
  },
1895
  {
1896
  "epoch": 0.06,
1897
  "learning_rate": 1.9859280254575268e-06,
1898
- "loss": 0.7435,
1899
  "step": 1550
1900
  },
1901
  {
1902
  "epoch": 0.06,
1903
  "learning_rate": 1.9858200779608526e-06,
1904
- "loss": 0.7123,
1905
  "step": 1555
1906
  },
1907
  {
1908
  "epoch": 0.06,
1909
  "learning_rate": 1.9857117209630913e-06,
1910
- "loss": 0.7182,
1911
  "step": 1560
1912
  },
1913
  {
1914
  "epoch": 0.06,
1915
  "learning_rate": 1.9856029545092536e-06,
1916
- "loss": 0.6819,
1917
  "step": 1565
1918
  },
1919
  {
1920
  "epoch": 0.06,
1921
  "learning_rate": 1.985493778644519e-06,
1922
- "loss": 0.6962,
1923
  "step": 1570
1924
  },
1925
  {
1926
  "epoch": 0.06,
1927
  "learning_rate": 1.9853841934142396e-06,
1928
- "loss": 0.7435,
1929
  "step": 1575
1930
  },
1931
  {
1932
  "epoch": 0.06,
1933
  "learning_rate": 1.9852741988639356e-06,
1934
- "loss": 0.7122,
1935
  "step": 1580
1936
  },
1937
  {
1938
  "epoch": 0.06,
1939
  "learning_rate": 1.9851637950392974e-06,
1940
- "loss": 0.7239,
1941
  "step": 1585
1942
  },
1943
  {
1944
  "epoch": 0.06,
1945
  "learning_rate": 1.9850529819861863e-06,
1946
- "loss": 0.7111,
1947
  "step": 1590
1948
  },
1949
  {
1950
  "epoch": 0.06,
1951
  "learning_rate": 1.984941759750633e-06,
1952
- "loss": 0.6724,
1953
  "step": 1595
1954
  },
1955
  {
1956
  "epoch": 0.06,
1957
  "learning_rate": 1.984830128378838e-06,
1958
- "loss": 0.7164,
1959
  "step": 1600
1960
  },
1961
  {
1962
  "epoch": 0.06,
1963
- "eval_loss": 0.6777188777923584,
1964
- "eval_runtime": 146.6927,
1965
- "eval_samples_per_second": 16.129,
1966
- "eval_steps_per_second": 2.693,
1967
  "step": 1600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1968
  }
1969
  ],
1970
  "logging_steps": 5,
1971
  "max_steps": 24619,
1972
  "num_input_tokens_seen": 0,
1973
  "num_train_epochs": 1,
1974
- "save_steps": 200,
1975
- "total_flos": 222925164388352.0,
1976
  "trial_name": null,
1977
  "trial_params": null
1978
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.08123683074814043,
5
+ "eval_steps": 800,
6
+ "global_step": 2000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
17
  {
18
  "epoch": 0.0,
19
  "learning_rate": 4.048582995951417e-08,
20
+ "loss": 3.6858,
21
  "step": 5
22
  },
23
  {
24
  "epoch": 0.0,
25
  "learning_rate": 8.097165991902834e-08,
26
+ "loss": 3.6628,
27
  "step": 10
28
  },
29
  {
30
  "epoch": 0.0,
31
  "learning_rate": 1.214574898785425e-07,
32
+ "loss": 3.3928,
33
  "step": 15
34
  },
35
  {
36
  "epoch": 0.0,
37
  "learning_rate": 1.6194331983805668e-07,
38
+ "loss": 3.0895,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 0.0,
43
  "learning_rate": 2.0242914979757083e-07,
44
+ "loss": 2.6585,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 0.0,
49
  "learning_rate": 2.42914979757085e-07,
50
+ "loss": 2.1214,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.0,
55
  "learning_rate": 2.8340080971659917e-07,
56
+ "loss": 1.7237,
57
  "step": 35
58
  },
59
  {
 
65
  {
66
  "epoch": 0.0,
67
  "learning_rate": 3.6437246963562754e-07,
68
+ "loss": 1.1867,
69
  "step": 45
70
  },
71
  {
72
  "epoch": 0.0,
73
  "learning_rate": 4.0485829959514166e-07,
74
+ "loss": 1.1259,
75
  "step": 50
76
  },
77
  {
78
  "epoch": 0.0,
79
  "learning_rate": 4.4534412955465585e-07,
80
+ "loss": 1.0245,
81
  "step": 55
82
  },
83
  {
84
  "epoch": 0.0,
85
  "learning_rate": 4.8582995951417e-07,
86
+ "loss": 0.977,
87
  "step": 60
88
  },
89
  {
90
  "epoch": 0.0,
91
  "learning_rate": 5.263157894736842e-07,
92
+ "loss": 0.9554,
93
  "step": 65
94
  },
95
  {
 
101
  {
102
  "epoch": 0.0,
103
  "learning_rate": 6.072874493927125e-07,
104
+ "loss": 0.8987,
105
  "step": 75
106
  },
107
  {
108
  "epoch": 0.0,
109
  "learning_rate": 6.477732793522267e-07,
110
+ "loss": 0.8863,
111
  "step": 80
112
  },
113
  {
 
125
  {
126
  "epoch": 0.0,
127
  "learning_rate": 7.692307692307693e-07,
128
+ "loss": 0.848,
129
  "step": 95
130
  },
131
  {
132
  "epoch": 0.0,
133
  "learning_rate": 8.097165991902833e-07,
134
+ "loss": 0.8213,
135
  "step": 100
136
  },
137
  {
138
  "epoch": 0.0,
139
  "learning_rate": 8.502024291497975e-07,
140
+ "loss": 0.8434,
141
  "step": 105
142
  },
143
  {
 
167
  {
168
  "epoch": 0.01,
169
  "learning_rate": 1.0526315789473683e-06,
170
+ "loss": 0.8038,
171
  "step": 130
172
  },
173
  {
174
  "epoch": 0.01,
175
  "learning_rate": 1.0931174089068826e-06,
176
+ "loss": 0.803,
177
  "step": 135
178
  },
179
  {
180
  "epoch": 0.01,
181
  "learning_rate": 1.1336032388663967e-06,
182
+ "loss": 0.7935,
183
  "step": 140
184
  },
185
  {
 
203
  {
204
  "epoch": 0.01,
205
  "learning_rate": 1.2955465587044534e-06,
206
+ "loss": 0.8286,
207
  "step": 160
208
  },
209
  {
210
  "epoch": 0.01,
211
  "learning_rate": 1.3360323886639675e-06,
212
+ "loss": 0.7742,
213
  "step": 165
214
  },
215
  {
 
227
  {
228
  "epoch": 0.01,
229
  "learning_rate": 1.4574898785425101e-06,
230
+ "loss": 0.7721,
231
  "step": 180
232
  },
233
  {
234
  "epoch": 0.01,
235
  "learning_rate": 1.4979757085020242e-06,
236
+ "loss": 0.7554,
237
  "step": 185
238
  },
239
  {
 
245
  {
246
  "epoch": 0.01,
247
  "learning_rate": 1.5789473684210526e-06,
248
+ "loss": 0.7759,
249
  "step": 195
250
  },
251
  {
252
  "epoch": 0.01,
253
  "learning_rate": 1.6194331983805667e-06,
254
+ "loss": 0.7249,
255
  "step": 200
256
  },
257
  {
258
  "epoch": 0.01,
259
  "learning_rate": 1.6599190283400807e-06,
260
+ "loss": 0.7673,
261
  "step": 205
262
  },
263
  {
 
281
  {
282
  "epoch": 0.01,
283
  "learning_rate": 1.8218623481781377e-06,
284
+ "loss": 0.7383,
285
  "step": 225
286
  },
287
  {
288
  "epoch": 0.01,
289
  "learning_rate": 1.8623481781376518e-06,
290
+ "loss": 0.7608,
291
  "step": 230
292
  },
293
  {
294
  "epoch": 0.01,
295
  "learning_rate": 1.9028340080971658e-06,
296
+ "loss": 0.7663,
297
  "step": 235
298
  },
299
  {
300
  "epoch": 0.01,
301
  "learning_rate": 1.94331983805668e-06,
302
+ "loss": 0.7567,
303
  "step": 240
304
  },
305
  {
306
  "epoch": 0.01,
307
  "learning_rate": 1.983805668016194e-06,
308
+ "loss": 0.751,
309
  "step": 245
310
  },
311
  {
 
329
  {
330
  "epoch": 0.01,
331
  "learning_rate": 1.999997308265467e-06,
332
+ "loss": 0.7508,
333
  "step": 265
334
  },
335
  {
336
  "epoch": 0.01,
337
  "learning_rate": 1.999995605163075e-06,
338
+ "loss": 0.7696,
339
  "step": 270
340
  },
341
  {
342
  "epoch": 0.01,
343
  "learning_rate": 1.9999934866712048e-06,
344
+ "loss": 0.7676,
345
  "step": 275
346
  },
347
  {
348
  "epoch": 0.01,
349
  "learning_rate": 1.9999909527907367e-06,
350
+ "loss": 0.7601,
351
  "step": 280
352
  },
353
  {
354
  "epoch": 0.01,
355
  "learning_rate": 1.9999880035227236e-06,
356
+ "loss": 0.7779,
357
  "step": 285
358
  },
359
  {
360
  "epoch": 0.01,
361
  "learning_rate": 1.9999846388683895e-06,
362
+ "loss": 0.7768,
363
  "step": 290
364
  },
365
  {
 
377
  {
378
  "epoch": 0.01,
379
  "learning_rate": 1.999972052602305e-06,
380
+ "loss": 0.742,
381
  "step": 305
382
  },
383
  {
384
  "epoch": 0.01,
385
  "learning_rate": 1.999967026418392e-06,
386
+ "loss": 0.783,
387
  "step": 310
388
  },
389
  {
390
  "epoch": 0.01,
391
  "learning_rate": 1.999961584856872e-06,
392
+ "loss": 0.7269,
393
  "step": 315
394
  },
395
  {
396
  "epoch": 0.01,
397
  "learning_rate": 1.9999557279200056e-06,
398
+ "loss": 0.7336,
399
  "step": 320
400
  },
401
  {
402
  "epoch": 0.01,
403
  "learning_rate": 1.9999494556102263e-06,
404
+ "loss": 0.7072,
405
  "step": 325
406
  },
407
  {
 
413
  {
414
  "epoch": 0.01,
415
  "learning_rate": 1.999935664882522e-06,
416
+ "loss": 0.7237,
417
  "step": 335
418
  },
419
  {
420
  "epoch": 0.01,
421
  "learning_rate": 1.9999281464703247e-06,
422
+ "loss": 0.719,
423
  "step": 340
424
  },
425
  {
426
  "epoch": 0.01,
427
  "learning_rate": 1.999920212696672e-06,
428
+ "loss": 0.748,
429
  "step": 345
430
  },
431
  {
432
  "epoch": 0.01,
433
  "learning_rate": 1.999911863564859e-06,
434
+ "loss": 0.7167,
435
  "step": 350
436
  },
437
  {
438
  "epoch": 0.01,
439
  "learning_rate": 1.9999030990783527e-06,
440
+ "loss": 0.7151,
441
  "step": 355
442
  },
443
  {
444
  "epoch": 0.01,
445
  "learning_rate": 1.999893919240795e-06,
446
+ "loss": 0.7095,
447
  "step": 360
448
  },
449
  {
450
  "epoch": 0.01,
451
  "learning_rate": 1.9998843240559986e-06,
452
+ "loss": 0.7703,
453
  "step": 365
454
  },
455
  {
456
  "epoch": 0.02,
457
  "learning_rate": 1.9998743135279497e-06,
458
+ "loss": 0.7456,
459
  "step": 370
460
  },
461
  {
462
  "epoch": 0.02,
463
  "learning_rate": 1.999863887660806e-06,
464
+ "loss": 0.7532,
465
  "step": 375
466
  },
467
  {
468
  "epoch": 0.02,
469
  "learning_rate": 1.999853046458899e-06,
470
+ "loss": 0.7014,
471
  "step": 380
472
  },
473
  {
474
  "epoch": 0.02,
475
  "learning_rate": 1.9998417899267313e-06,
476
+ "loss": 0.7629,
477
  "step": 385
478
  },
479
  {
480
  "epoch": 0.02,
481
  "learning_rate": 1.999830118068979e-06,
482
+ "loss": 0.7329,
483
  "step": 390
484
  },
485
  {
486
  "epoch": 0.02,
487
  "learning_rate": 1.999818030890491e-06,
488
+ "loss": 0.723,
489
  "step": 395
490
  },
491
  {
492
  "epoch": 0.02,
493
  "learning_rate": 1.999805528396288e-06,
494
+ "loss": 0.7549,
 
 
 
 
 
 
 
 
495
  "step": 400
496
  },
497
  {
498
  "epoch": 0.02,
499
  "learning_rate": 1.9997926105915627e-06,
500
+ "loss": 0.7121,
501
  "step": 405
502
  },
503
  {
504
  "epoch": 0.02,
505
  "learning_rate": 1.999779277481682e-06,
506
+ "loss": 0.7506,
507
  "step": 410
508
  },
509
  {
510
  "epoch": 0.02,
511
  "learning_rate": 1.9997655290721834e-06,
512
+ "loss": 0.7284,
513
  "step": 415
514
  },
515
  {
516
  "epoch": 0.02,
517
  "learning_rate": 1.9997513653687786e-06,
518
+ "loss": 0.7344,
519
  "step": 420
520
  },
521
  {
522
  "epoch": 0.02,
523
  "learning_rate": 1.999736786377351e-06,
524
+ "loss": 0.7684,
525
  "step": 425
526
  },
527
  {
528
  "epoch": 0.02,
529
  "learning_rate": 1.9997217921039567e-06,
530
+ "loss": 0.7427,
531
  "step": 430
532
  },
533
  {
534
  "epoch": 0.02,
535
  "learning_rate": 1.9997063825548237e-06,
536
+ "loss": 0.7139,
537
  "step": 435
538
  },
539
  {
540
  "epoch": 0.02,
541
  "learning_rate": 1.9996905577363533e-06,
542
+ "loss": 0.761,
543
  "step": 440
544
  },
545
  {
546
  "epoch": 0.02,
547
  "learning_rate": 1.9996743176551186e-06,
548
+ "loss": 0.7545,
549
  "step": 445
550
  },
551
  {
552
  "epoch": 0.02,
553
  "learning_rate": 1.999657662317866e-06,
554
+ "loss": 0.7431,
555
  "step": 450
556
  },
557
  {
 
563
  {
564
  "epoch": 0.02,
565
  "learning_rate": 1.999623105903154e-06,
566
+ "loss": 0.7284,
567
  "step": 460
568
  },
569
  {
570
  "epoch": 0.02,
571
  "learning_rate": 1.999605204840049e-06,
572
+ "loss": 0.76,
573
  "step": 465
574
  },
575
  {
576
  "epoch": 0.02,
577
  "learning_rate": 1.9995868885496343e-06,
578
+ "loss": 0.7413,
579
  "step": 470
580
  },
581
  {
582
  "epoch": 0.02,
583
  "learning_rate": 1.9995681570395195e-06,
584
+ "loss": 0.7837,
585
  "step": 475
586
  },
587
  {
588
  "epoch": 0.02,
589
  "learning_rate": 1.9995490103174847e-06,
590
+ "loss": 0.7347,
591
  "step": 480
592
  },
593
  {
 
599
  {
600
  "epoch": 0.02,
601
  "learning_rate": 1.9995094712696413e-06,
602
+ "loss": 0.7665,
603
  "step": 490
604
  },
605
  {
606
  "epoch": 0.02,
607
  "learning_rate": 1.9994890789602576e-06,
608
+ "loss": 0.7353,
609
  "step": 495
610
  },
611
  {
612
  "epoch": 0.02,
613
  "learning_rate": 1.999468271471802e-06,
614
+ "loss": 0.7344,
615
  "step": 500
616
  },
617
  {
618
  "epoch": 0.02,
619
  "learning_rate": 1.9994470488129185e-06,
620
+ "loss": 0.7476,
621
  "step": 505
622
  },
623
  {
624
  "epoch": 0.02,
625
  "learning_rate": 1.9994254109924223e-06,
626
+ "loss": 0.7257,
627
  "step": 510
628
  },
629
  {
630
  "epoch": 0.02,
631
  "learning_rate": 1.9994033580193017e-06,
632
+ "loss": 0.7306,
633
  "step": 515
634
  },
635
  {
636
  "epoch": 0.02,
637
  "learning_rate": 1.999380889902718e-06,
638
+ "loss": 0.7115,
639
  "step": 520
640
  },
641
  {
642
  "epoch": 0.02,
643
  "learning_rate": 1.9993580066520034e-06,
644
+ "loss": 0.7452,
645
  "step": 525
646
  },
647
  {
648
  "epoch": 0.02,
649
  "learning_rate": 1.9993347082766636e-06,
650
+ "loss": 0.7523,
651
  "step": 530
652
  },
653
  {
 
665
  {
666
  "epoch": 0.02,
667
  "learning_rate": 1.999262322500535e-06,
668
+ "loss": 0.7043,
669
  "step": 545
670
  },
671
  {
672
  "epoch": 0.02,
673
  "learning_rate": 1.9992373637251982e-06,
674
+ "loss": 0.7098,
675
  "step": 550
676
  },
677
  {
678
  "epoch": 0.02,
679
  "learning_rate": 1.999211989875351e-06,
680
+ "loss": 0.7142,
681
  "step": 555
682
  },
683
  {
684
  "epoch": 0.02,
685
  "learning_rate": 1.999186200961532e-06,
686
+ "loss": 0.7424,
687
  "step": 560
688
  },
689
  {
690
  "epoch": 0.02,
691
  "learning_rate": 1.9991599969944552e-06,
692
+ "loss": 0.7348,
693
  "step": 565
694
  },
695
  {
696
  "epoch": 0.02,
697
  "learning_rate": 1.9991333779850043e-06,
698
+ "loss": 0.7126,
699
  "step": 570
700
  },
701
  {
702
  "epoch": 0.02,
703
  "learning_rate": 1.999106343944237e-06,
704
+ "loss": 0.7341,
705
  "step": 575
706
  },
707
  {
708
  "epoch": 0.02,
709
  "learning_rate": 1.9990788948833833e-06,
710
+ "loss": 0.7445,
711
  "step": 580
712
  },
713
  {
714
  "epoch": 0.02,
715
  "learning_rate": 1.999051030813845e-06,
716
+ "loss": 0.7181,
717
  "step": 585
718
  },
719
  {
720
  "epoch": 0.02,
721
  "learning_rate": 1.999022751747197e-06,
722
+ "loss": 0.7295,
723
  "step": 590
724
  },
725
  {
726
  "epoch": 0.02,
727
  "learning_rate": 1.998994057695185e-06,
728
+ "loss": 0.7159,
729
  "step": 595
730
  },
731
  {
732
  "epoch": 0.02,
733
  "learning_rate": 1.99896494866973e-06,
734
+ "loss": 0.6844,
735
  "step": 600
736
  },
737
  {
738
  "epoch": 0.02,
739
  "learning_rate": 1.9989354246829222e-06,
740
+ "loss": 0.7511,
741
  "step": 605
742
  },
743
  {
744
  "epoch": 0.02,
745
  "learning_rate": 1.9989054857470267e-06,
746
+ "loss": 0.7322,
747
  "step": 610
748
  },
749
  {
750
  "epoch": 0.02,
751
  "learning_rate": 1.9988751318744787e-06,
752
+ "loss": 0.7829,
753
  "step": 615
754
  },
755
  {
756
  "epoch": 0.03,
757
  "learning_rate": 1.998844363077888e-06,
758
+ "loss": 0.7229,
759
  "step": 620
760
  },
761
  {
762
  "epoch": 0.03,
763
  "learning_rate": 1.998813179370035e-06,
764
+ "loss": 0.738,
765
  "step": 625
766
  },
767
  {
768
  "epoch": 0.03,
769
  "learning_rate": 1.9987815807638733e-06,
770
+ "loss": 0.6934,
771
  "step": 630
772
  },
773
  {
774
  "epoch": 0.03,
775
  "learning_rate": 1.9987495672725294e-06,
776
+ "loss": 0.7005,
777
  "step": 635
778
  },
779
  {
780
  "epoch": 0.03,
781
  "learning_rate": 1.9987171389093e-06,
782
+ "loss": 0.7692,
783
  "step": 640
784
  },
785
  {
786
  "epoch": 0.03,
787
  "learning_rate": 1.998684295687657e-06,
788
+ "loss": 0.7101,
789
  "step": 645
790
  },
791
  {
792
  "epoch": 0.03,
793
  "learning_rate": 1.998651037621242e-06,
794
+ "loss": 0.7813,
795
  "step": 650
796
  },
797
  {
798
  "epoch": 0.03,
799
  "learning_rate": 1.9986173647238715e-06,
800
+ "loss": 0.7526,
801
  "step": 655
802
  },
803
  {
804
  "epoch": 0.03,
805
  "learning_rate": 1.9985832770095313e-06,
806
+ "loss": 0.7235,
807
  "step": 660
808
  },
809
  {
810
  "epoch": 0.03,
811
  "learning_rate": 1.998548774492382e-06,
812
+ "loss": 0.7201,
813
  "step": 665
814
  },
815
  {
816
  "epoch": 0.03,
817
  "learning_rate": 1.9985138571867557e-06,
818
+ "loss": 0.7303,
819
  "step": 670
820
  },
821
  {
822
  "epoch": 0.03,
823
  "learning_rate": 1.998478525107157e-06,
824
+ "loss": 0.7375,
825
  "step": 675
826
  },
827
  {
828
  "epoch": 0.03,
829
  "learning_rate": 1.998442778268262e-06,
830
+ "loss": 0.7123,
831
  "step": 680
832
  },
833
  {
834
  "epoch": 0.03,
835
  "learning_rate": 1.99840661668492e-06,
836
+ "loss": 0.7541,
837
  "step": 685
838
  },
839
  {
840
  "epoch": 0.03,
841
  "learning_rate": 1.998370040372151e-06,
842
+ "loss": 0.7685,
843
  "step": 690
844
  },
845
  {
846
  "epoch": 0.03,
847
  "learning_rate": 1.99833304934515e-06,
848
+ "loss": 0.7029,
849
  "step": 695
850
  },
851
  {
852
  "epoch": 0.03,
853
  "learning_rate": 1.9982956436192827e-06,
854
+ "loss": 0.7797,
855
  "step": 700
856
  },
857
  {
858
  "epoch": 0.03,
859
  "learning_rate": 1.9982578232100866e-06,
860
+ "loss": 0.7326,
861
  "step": 705
862
  },
863
  {
864
  "epoch": 0.03,
865
  "learning_rate": 1.9982195881332714e-06,
866
+ "loss": 0.773,
867
  "step": 710
868
  },
869
  {
870
  "epoch": 0.03,
871
  "learning_rate": 1.9981809384047207e-06,
872
+ "loss": 0.741,
873
  "step": 715
874
  },
875
  {
876
  "epoch": 0.03,
877
  "learning_rate": 1.9981418740404886e-06,
878
+ "loss": 0.7518,
879
  "step": 720
880
  },
881
  {
882
  "epoch": 0.03,
883
  "learning_rate": 1.998102395056802e-06,
884
+ "loss": 0.7338,
885
  "step": 725
886
  },
887
  {
888
  "epoch": 0.03,
889
  "learning_rate": 1.998062501470061e-06,
890
+ "loss": 0.7192,
891
  "step": 730
892
  },
893
  {
894
  "epoch": 0.03,
895
  "learning_rate": 1.998022193296836e-06,
896
+ "loss": 0.7429,
897
  "step": 735
898
  },
899
  {
900
  "epoch": 0.03,
901
  "learning_rate": 1.9979814705538715e-06,
902
+ "loss": 0.6953,
903
  "step": 740
904
  },
905
  {
 
923
  {
924
  "epoch": 0.03,
925
  "learning_rate": 1.9978144342255147e-06,
926
+ "loss": 0.7008,
927
  "step": 760
928
  },
929
  {
930
  "epoch": 0.03,
931
  "learning_rate": 1.9977716388910325e-06,
932
+ "loss": 0.7301,
933
  "step": 765
934
  },
935
  {
936
  "epoch": 0.03,
937
  "learning_rate": 1.997728429090889e-06,
938
+ "loss": 0.7662,
939
  "step": 770
940
  },
941
  {
942
  "epoch": 0.03,
943
  "learning_rate": 1.9976848048430323e-06,
944
+ "loss": 0.7428,
945
  "step": 775
946
  },
947
  {
948
  "epoch": 0.03,
949
  "learning_rate": 1.9976407661655844e-06,
950
+ "loss": 0.706,
951
  "step": 780
952
  },
953
  {
954
  "epoch": 0.03,
955
  "learning_rate": 1.997596313076838e-06,
956
+ "loss": 0.6853,
957
  "step": 785
958
  },
959
  {
960
  "epoch": 0.03,
961
  "learning_rate": 1.9975514455952584e-06,
962
+ "loss": 0.7363,
963
  "step": 790
964
  },
965
  {
966
  "epoch": 0.03,
967
  "learning_rate": 1.9975061637394834e-06,
968
+ "loss": 0.7217,
969
  "step": 795
970
  },
971
  {
972
  "epoch": 0.03,
973
  "learning_rate": 1.997460467528323e-06,
974
+ "loss": 0.7161,
975
  "step": 800
976
  },
977
  {
978
  "epoch": 0.03,
979
+ "eval_loss": 0.6896045207977295,
980
+ "eval_runtime": 140.4315,
981
+ "eval_samples_per_second": 16.848,
982
+ "eval_steps_per_second": 2.813,
983
  "step": 800
984
  },
985
  {
986
  "epoch": 0.03,
987
  "learning_rate": 1.997414356980759e-06,
988
+ "loss": 0.7911,
989
  "step": 805
990
  },
991
  {
992
  "epoch": 0.03,
993
  "learning_rate": 1.9973678321159443e-06,
994
+ "loss": 0.7037,
995
  "step": 810
996
  },
997
  {
998
  "epoch": 0.03,
999
  "learning_rate": 1.9973208929532063e-06,
1000
+ "loss": 0.7083,
1001
  "step": 815
1002
  },
1003
  {
 
1009
  {
1010
  "epoch": 0.03,
1011
  "learning_rate": 1.997225771812122e-06,
1012
+ "loss": 0.7227,
1013
  "step": 825
1014
  },
1015
  {
1016
  "epoch": 0.03,
1017
  "learning_rate": 1.9971775898732893e-06,
1018
+ "loss": 0.7271,
1019
  "step": 830
1020
  },
1021
  {
 
1033
  {
1034
  "epoch": 0.03,
1035
  "learning_rate": 1.9970305588243145e-06,
1036
+ "loss": 0.706,
1037
  "step": 845
1038
  },
1039
  {
 
1045
  {
1046
  "epoch": 0.03,
1047
  "learning_rate": 1.9969304673019494e-06,
1048
+ "loss": 0.7165,
1049
  "step": 855
1050
  },
1051
  {
1052
  "epoch": 0.03,
1053
  "learning_rate": 1.99687980035596e-06,
1054
+ "loss": 0.729,
1055
  "step": 860
1056
  },
1057
  {
1058
  "epoch": 0.04,
1059
  "learning_rate": 1.996828719314771e-06,
1060
+ "loss": 0.7199,
1061
  "step": 865
1062
  },
1063
  {
 
1069
  {
1070
  "epoch": 0.04,
1071
  "learning_rate": 1.99672531503184e-06,
1072
+ "loss": 0.735,
1073
  "step": 875
1074
  },
1075
  {
1076
  "epoch": 0.04,
1077
  "learning_rate": 1.996672991833051e-06,
1078
+ "loss": 0.7153,
1079
  "step": 880
1080
  },
1081
  {
1082
  "epoch": 0.04,
1083
  "learning_rate": 1.996620254624969e-06,
1084
+ "loss": 0.714,
1085
  "step": 885
1086
  },
1087
  {
1088
  "epoch": 0.04,
1089
  "learning_rate": 1.9965671034295e-06,
1090
+ "loss": 0.7309,
1091
  "step": 890
1092
  },
1093
  {
1094
  "epoch": 0.04,
1095
  "learning_rate": 1.996513538268723e-06,
1096
+ "loss": 0.7808,
1097
  "step": 895
1098
  },
1099
  {
1100
  "epoch": 0.04,
1101
  "learning_rate": 1.9964595591648883e-06,
1102
+ "loss": 0.7407,
1103
  "step": 900
1104
  },
1105
  {
1106
  "epoch": 0.04,
1107
  "learning_rate": 1.9964051661404185e-06,
1108
+ "loss": 0.6831,
1109
  "step": 905
1110
  },
1111
  {
1112
  "epoch": 0.04,
1113
  "learning_rate": 1.9963503592179078e-06,
1114
+ "loss": 0.7178,
1115
  "step": 910
1116
  },
1117
  {
1118
  "epoch": 0.04,
1119
  "learning_rate": 1.996295138420122e-06,
1120
+ "loss": 0.7607,
1121
  "step": 915
1122
  },
1123
  {
1124
  "epoch": 0.04,
1125
  "learning_rate": 1.9962395037700007e-06,
1126
+ "loss": 0.747,
1127
  "step": 920
1128
  },
1129
  {
1130
  "epoch": 0.04,
1131
  "learning_rate": 1.996183455290653e-06,
1132
+ "loss": 0.6911,
1133
  "step": 925
1134
  },
1135
  {
 
1141
  {
1142
  "epoch": 0.04,
1143
  "learning_rate": 1.996070116937579e-06,
1144
+ "loss": 0.7195,
1145
  "step": 935
1146
  },
1147
  {
1148
  "epoch": 0.04,
1149
  "learning_rate": 1.9960128271109326e-06,
1150
+ "loss": 0.6974,
1151
  "step": 940
1152
  },
1153
  {
1154
  "epoch": 0.04,
1155
  "learning_rate": 1.9959551235492195e-06,
1156
+ "loss": 0.7399,
1157
  "step": 945
1158
  },
1159
  {
1160
  "epoch": 0.04,
1161
  "learning_rate": 1.9958970062764095e-06,
1162
+ "loss": 0.7475,
1163
  "step": 950
1164
  },
1165
  {
1166
  "epoch": 0.04,
1167
  "learning_rate": 1.9958384753166437e-06,
1168
+ "loss": 0.7091,
1169
  "step": 955
1170
  },
1171
  {
1172
  "epoch": 0.04,
1173
  "learning_rate": 1.995779530694236e-06,
1174
+ "loss": 0.6908,
1175
  "step": 960
1176
  },
1177
  {
1178
  "epoch": 0.04,
1179
  "learning_rate": 1.9957201724336704e-06,
1180
+ "loss": 0.7052,
1181
  "step": 965
1182
  },
1183
  {
1184
  "epoch": 0.04,
1185
  "learning_rate": 1.9956604005596043e-06,
1186
+ "loss": 0.6963,
1187
  "step": 970
1188
  },
1189
  {
1190
  "epoch": 0.04,
1191
  "learning_rate": 1.9956002150968667e-06,
1192
+ "loss": 0.7064,
1193
  "step": 975
1194
  },
1195
  {
1196
  "epoch": 0.04,
1197
  "learning_rate": 1.9955396160704582e-06,
1198
+ "loss": 0.6804,
1199
  "step": 980
1200
  },
1201
  {
1202
  "epoch": 0.04,
1203
  "learning_rate": 1.99547860350555e-06,
1204
+ "loss": 0.6759,
1205
  "step": 985
1206
  },
1207
  {
1208
  "epoch": 0.04,
1209
  "learning_rate": 1.995417177427488e-06,
1210
+ "loss": 0.7175,
1211
  "step": 990
1212
  },
1213
  {
 
1219
  {
1220
  "epoch": 0.04,
1221
  "learning_rate": 1.995293084834134e-06,
1222
+ "loss": 0.7109,
1223
  "step": 1000
1224
  },
1225
  {
1226
  "epoch": 0.04,
1227
  "learning_rate": 1.9952304183703893e-06,
1228
+ "loss": 0.7129,
1229
  "step": 1005
1230
  },
1231
  {
1232
  "epoch": 0.04,
1233
  "learning_rate": 1.9951673384965835e-06,
1234
+ "loss": 0.7117,
1235
  "step": 1010
1236
  },
1237
  {
1238
  "epoch": 0.04,
1239
  "learning_rate": 1.99510384523892e-06,
1240
+ "loss": 0.7694,
1241
  "step": 1015
1242
  },
1243
  {
1244
  "epoch": 0.04,
1245
  "learning_rate": 1.995039938623773e-06,
1246
+ "loss": 0.7381,
1247
  "step": 1020
1248
  },
1249
  {
 
1255
  {
1256
  "epoch": 0.04,
1257
  "learning_rate": 1.9949108854273855e-06,
1258
+ "loss": 0.7288,
1259
  "step": 1030
1260
  },
1261
  {
1262
  "epoch": 0.04,
1263
  "learning_rate": 1.9948457388997528e-06,
1264
+ "loss": 0.7045,
1265
  "step": 1035
1266
  },
1267
  {
 
1273
  {
1274
  "epoch": 0.04,
1275
  "learning_rate": 1.994714206120914e-06,
1276
+ "loss": 0.725,
1277
  "step": 1045
1278
  },
1279
  {
1280
  "epoch": 0.04,
1281
  "learning_rate": 1.9946478199243466e-06,
1282
+ "loss": 0.7203,
1283
  "step": 1050
1284
  },
1285
  {
1286
  "epoch": 0.04,
1287
  "learning_rate": 1.9945810205597246e-06,
1288
+ "loss": 0.7011,
1289
  "step": 1055
1290
  },
1291
  {
1292
  "epoch": 0.04,
1293
  "learning_rate": 1.9945138080547957e-06,
1294
+ "loss": 0.6946,
1295
  "step": 1060
1296
  },
1297
  {
1298
  "epoch": 0.04,
1299
  "learning_rate": 1.99444618243748e-06,
1300
+ "loss": 0.7151,
1301
  "step": 1065
1302
  },
1303
  {
1304
  "epoch": 0.04,
1305
  "learning_rate": 1.994378143735868e-06,
1306
+ "loss": 0.7074,
1307
  "step": 1070
1308
  },
1309
  {
1310
  "epoch": 0.04,
1311
  "learning_rate": 1.9943096919782225e-06,
1312
+ "loss": 0.7,
1313
  "step": 1075
1314
  },
1315
  {
1316
  "epoch": 0.04,
1317
  "learning_rate": 1.994240827192978e-06,
1318
+ "loss": 0.6957,
1319
  "step": 1080
1320
  },
1321
  {
1322
  "epoch": 0.04,
1323
  "learning_rate": 1.9941715494087408e-06,
1324
+ "loss": 0.7348,
1325
  "step": 1085
1326
  },
1327
  {
1328
  "epoch": 0.04,
1329
  "learning_rate": 1.9941018586542866e-06,
1330
+ "loss": 0.6984,
1331
  "step": 1090
1332
  },
1333
  {
1334
  "epoch": 0.04,
1335
  "learning_rate": 1.9940317549585665e-06,
1336
+ "loss": 0.7252,
1337
  "step": 1095
1338
  },
1339
  {
 
1345
  {
1346
  "epoch": 0.04,
1347
  "learning_rate": 1.993890308859978e-06,
1348
+ "loss": 0.7261,
1349
  "step": 1105
1350
  },
1351
  {
1352
  "epoch": 0.05,
1353
  "learning_rate": 1.9938189665158654e-06,
1354
+ "loss": 0.6879,
1355
  "step": 1110
1356
  },
1357
  {
1358
  "epoch": 0.05,
1359
  "learning_rate": 1.9937472113479966e-06,
1360
+ "loss": 0.7088,
1361
  "step": 1115
1362
  },
1363
  {
1364
  "epoch": 0.05,
1365
  "learning_rate": 1.9936750433861787e-06,
1366
+ "loss": 0.7428,
1367
  "step": 1120
1368
  },
1369
  {
1370
  "epoch": 0.05,
1371
  "learning_rate": 1.993602462660389e-06,
1372
+ "loss": 0.7111,
1373
  "step": 1125
1374
  },
1375
  {
1376
  "epoch": 0.05,
1377
  "learning_rate": 1.993529469200777e-06,
1378
+ "loss": 0.7027,
1379
  "step": 1130
1380
  },
1381
  {
1382
  "epoch": 0.05,
1383
  "learning_rate": 1.993456063037664e-06,
1384
+ "loss": 0.6969,
1385
  "step": 1135
1386
  },
1387
  {
1388
  "epoch": 0.05,
1389
  "learning_rate": 1.9933822442015416e-06,
1390
+ "loss": 0.7343,
1391
  "step": 1140
1392
  },
1393
  {
1394
  "epoch": 0.05,
1395
  "learning_rate": 1.993308012723074e-06,
1396
+ "loss": 0.7174,
1397
  "step": 1145
1398
  },
1399
  {
1400
  "epoch": 0.05,
1401
  "learning_rate": 1.993233368633096e-06,
1402
+ "loss": 0.6997,
1403
  "step": 1150
1404
  },
1405
  {
1406
  "epoch": 0.05,
1407
  "learning_rate": 1.993158311962614e-06,
1408
+ "loss": 0.693,
1409
  "step": 1155
1410
  },
1411
  {
1412
  "epoch": 0.05,
1413
  "learning_rate": 1.9930828427428066e-06,
1414
+ "loss": 0.7136,
1415
  "step": 1160
1416
  },
1417
  {
1418
  "epoch": 0.05,
1419
  "learning_rate": 1.9930069610050224e-06,
1420
+ "loss": 0.7211,
1421
  "step": 1165
1422
  },
1423
  {
 
1429
  {
1430
  "epoch": 0.05,
1431
  "learning_rate": 1.992853960101778e-06,
1432
+ "loss": 0.6787,
1433
  "step": 1175
1434
  },
1435
  {
1436
  "epoch": 0.05,
1437
  "learning_rate": 1.9927768409998733e-06,
1438
+ "loss": 0.7348,
1439
  "step": 1180
1440
  },
1441
  {
1442
  "epoch": 0.05,
1443
  "learning_rate": 1.992699309507102e-06,
1444
+ "loss": 0.6718,
1445
  "step": 1185
1446
  },
1447
  {
1448
  "epoch": 0.05,
1449
  "learning_rate": 1.992621365655671e-06,
1450
+ "loss": 0.7146,
1451
  "step": 1190
1452
  },
1453
  {
1454
  "epoch": 0.05,
1455
  "learning_rate": 1.9925430094779566e-06,
1456
+ "loss": 0.6982,
1457
  "step": 1195
1458
  },
1459
  {
1460
  "epoch": 0.05,
1461
  "learning_rate": 1.9924642410065075e-06,
1462
+ "loss": 0.7379,
 
 
 
 
 
 
 
 
1463
  "step": 1200
1464
  },
1465
  {
1466
  "epoch": 0.05,
1467
  "learning_rate": 1.992385060274044e-06,
1468
+ "loss": 0.6983,
1469
  "step": 1205
1470
  },
1471
  {
1472
  "epoch": 0.05,
1473
  "learning_rate": 1.9923054673134564e-06,
1474
+ "loss": 0.7893,
1475
  "step": 1210
1476
  },
1477
  {
1478
  "epoch": 0.05,
1479
  "learning_rate": 1.992225462157807e-06,
1480
+ "loss": 0.714,
1481
  "step": 1215
1482
  },
1483
  {
 
1489
  {
1490
  "epoch": 0.05,
1491
  "learning_rate": 1.9920642153944288e-06,
1492
+ "loss": 0.7238,
1493
  "step": 1225
1494
  },
1495
  {
 
1501
  {
1502
  "epoch": 0.05,
1503
  "learning_rate": 1.991901320251831e-06,
1504
+ "loss": 0.6936,
1505
  "step": 1235
1506
  },
1507
  {
1508
  "epoch": 0.05,
1509
  "learning_rate": 1.9918192546227995e-06,
1510
+ "loss": 0.7271,
1511
  "step": 1240
1512
  },
1513
  {
 
1519
  {
1520
  "epoch": 0.05,
1521
  "learning_rate": 1.9916538874197176e-06,
1522
+ "loss": 0.7637,
1523
  "step": 1250
1524
  },
1525
  {
1526
  "epoch": 0.05,
1527
  "learning_rate": 1.9915705859143594e-06,
1528
+ "loss": 0.6722,
1529
  "step": 1255
1530
  },
1531
  {
1532
  "epoch": 0.05,
1533
  "learning_rate": 1.9914868725192025e-06,
1534
+ "loss": 0.6943,
1535
  "step": 1260
1536
  },
1537
  {
1538
  "epoch": 0.05,
1539
  "learning_rate": 1.991402747269022e-06,
1540
+ "loss": 0.7433,
1541
  "step": 1265
1542
  },
1543
  {
1544
  "epoch": 0.05,
1545
  "learning_rate": 1.991318210198761e-06,
1546
+ "loss": 0.7015,
1547
  "step": 1270
1548
  },
1549
  {
1550
  "epoch": 0.05,
1551
  "learning_rate": 1.991233261343537e-06,
1552
+ "loss": 0.6772,
1553
  "step": 1275
1554
  },
1555
  {
1556
  "epoch": 0.05,
1557
  "learning_rate": 1.9911479007386364e-06,
1558
+ "loss": 0.7278,
1559
  "step": 1280
1560
  },
1561
  {
1562
  "epoch": 0.05,
1563
  "learning_rate": 1.991062128419517e-06,
1564
+ "loss": 0.7471,
1565
  "step": 1285
1566
  },
1567
  {
1568
  "epoch": 0.05,
1569
  "learning_rate": 1.9909759444218085e-06,
1570
+ "loss": 0.7234,
1571
  "step": 1290
1572
  },
1573
  {
1574
  "epoch": 0.05,
1575
  "learning_rate": 1.9908893487813106e-06,
1576
+ "loss": 0.7118,
1577
  "step": 1295
1578
  },
1579
  {
1580
  "epoch": 0.05,
1581
  "learning_rate": 1.990802341533994e-06,
1582
+ "loss": 0.7351,
1583
  "step": 1300
1584
  },
1585
  {
1586
  "epoch": 0.05,
1587
  "learning_rate": 1.9907149227160016e-06,
1588
+ "loss": 0.7084,
1589
  "step": 1305
1590
  },
1591
  {
1592
  "epoch": 0.05,
1593
  "learning_rate": 1.9906270923636457e-06,
1594
+ "loss": 0.7174,
1595
  "step": 1310
1596
  },
1597
  {
1598
  "epoch": 0.05,
1599
  "learning_rate": 1.9905388505134107e-06,
1600
+ "loss": 0.6935,
1601
  "step": 1315
1602
  },
1603
  {
1604
  "epoch": 0.05,
1605
  "learning_rate": 1.990450197201951e-06,
1606
+ "loss": 0.7004,
1607
  "step": 1320
1608
  },
1609
  {
 
1615
  {
1616
  "epoch": 0.05,
1617
  "learning_rate": 1.9902716563428335e-06,
1618
+ "loss": 0.7226,
1619
  "step": 1330
1620
  },
1621
  {
1622
  "epoch": 0.05,
1623
  "learning_rate": 1.9901817688693395e-06,
1624
+ "loss": 0.7025,
1625
  "step": 1335
1626
  },
1627
  {
1628
  "epoch": 0.05,
1629
  "learning_rate": 1.99009147008295e-06,
1630
+ "loss": 0.7139,
1631
  "step": 1340
1632
  },
1633
  {
1634
  "epoch": 0.05,
1635
  "learning_rate": 1.9900007600211735e-06,
1636
+ "loss": 0.6609,
1637
  "step": 1345
1638
  },
1639
  {
1640
  "epoch": 0.05,
1641
  "learning_rate": 1.9899096387216914e-06,
1642
+ "loss": 0.7452,
1643
  "step": 1350
1644
  },
1645
  {
 
1651
  {
1652
  "epoch": 0.06,
1653
  "learning_rate": 1.9897261625611822e-06,
1654
+ "loss": 0.6925,
1655
  "step": 1360
1656
  },
1657
  {
 
1663
  {
1664
  "epoch": 0.06,
1665
  "learning_rate": 1.989541041906281e-06,
1666
+ "loss": 0.7146,
1667
  "step": 1370
1668
  },
1669
  {
1670
  "epoch": 0.06,
1671
  "learning_rate": 1.9894478649894484e-06,
1672
+ "loss": 0.704,
1673
  "step": 1375
1674
  },
1675
  {
 
1681
  {
1682
  "epoch": 0.06,
1683
  "learning_rate": 1.9892602781705427e-06,
1684
+ "loss": 0.6958,
1685
  "step": 1385
1686
  },
1687
  {
 
1699
  {
1700
  "epoch": 0.06,
1701
  "learning_rate": 1.98897581606478e-06,
1702
+ "loss": 0.6868,
1703
  "step": 1400
1704
  },
1705
  {
1706
  "epoch": 0.06,
1707
  "learning_rate": 1.988880173686265e-06,
1708
+ "loss": 0.7437,
1709
  "step": 1405
1710
  },
1711
  {
1712
  "epoch": 0.06,
1713
  "learning_rate": 1.988784120535525e-06,
1714
+ "loss": 0.7484,
1715
  "step": 1410
1716
  },
1717
  {
1718
  "epoch": 0.06,
1719
  "learning_rate": 1.988687656652461e-06,
1720
+ "loss": 0.7063,
1721
  "step": 1415
1722
  },
1723
  {
1724
  "epoch": 0.06,
1725
  "learning_rate": 1.9885907820771415e-06,
1726
+ "loss": 0.713,
1727
  "step": 1420
1728
  },
1729
  {
1730
  "epoch": 0.06,
1731
  "learning_rate": 1.988493496849809e-06,
1732
+ "loss": 0.7313,
1733
  "step": 1425
1734
  },
1735
  {
1736
  "epoch": 0.06,
1737
  "learning_rate": 1.9883958010108736e-06,
1738
+ "loss": 0.6987,
1739
  "step": 1430
1740
  },
1741
  {
1742
  "epoch": 0.06,
1743
  "learning_rate": 1.9882976946009186e-06,
1744
+ "loss": 0.7089,
1745
  "step": 1435
1746
  },
1747
  {
 
1753
  {
1754
  "epoch": 0.06,
1755
  "learning_rate": 1.9881002502311285e-06,
1756
+ "loss": 0.6538,
1757
  "step": 1445
1758
  },
1759
  {
1760
  "epoch": 0.06,
1761
  "learning_rate": 1.9880009123533095e-06,
1762
+ "loss": 0.7096,
1763
  "step": 1450
1764
  },
1765
  {
1766
  "epoch": 0.06,
1767
  "learning_rate": 1.9879011640685043e-06,
1768
+ "loss": 0.7329,
1769
  "step": 1455
1770
  },
1771
  {
1772
  "epoch": 0.06,
1773
  "learning_rate": 1.9878010054181463e-06,
1774
+ "loss": 0.7414,
1775
  "step": 1460
1776
  },
1777
  {
1778
  "epoch": 0.06,
1779
  "learning_rate": 1.9877004364438414e-06,
1780
+ "loss": 0.7089,
1781
  "step": 1465
1782
  },
1783
  {
 
1789
  {
1790
  "epoch": 0.06,
1791
  "learning_rate": 1.9874980676906617e-06,
1792
+ "loss": 0.7078,
1793
  "step": 1475
1794
  },
1795
  {
1796
  "epoch": 0.06,
1797
  "learning_rate": 1.9873962679958494e-06,
1798
+ "loss": 0.6987,
1799
  "step": 1480
1800
  },
1801
  {
1802
  "epoch": 0.06,
1803
  "learning_rate": 1.987294058145214e-06,
1804
+ "loss": 0.7456,
1805
  "step": 1485
1806
  },
1807
  {
1808
  "epoch": 0.06,
1809
  "learning_rate": 1.987191438181213e-06,
1810
+ "loss": 0.7402,
1811
  "step": 1490
1812
  },
1813
  {
 
1819
  {
1820
  "epoch": 0.06,
1821
  "learning_rate": 1.986984968083793e-06,
1822
+ "loss": 0.7197,
1823
  "step": 1500
1824
  },
1825
  {
1826
  "epoch": 0.06,
1827
  "learning_rate": 1.9868811180361402e-06,
1828
+ "loss": 0.7386,
1829
  "step": 1505
1830
  },
1831
  {
1832
  "epoch": 0.06,
1833
  "learning_rate": 1.9867768580466536e-06,
1834
+ "loss": 0.7024,
1835
  "step": 1510
1836
  },
1837
  {
1838
  "epoch": 0.06,
1839
  "learning_rate": 1.986672188158641e-06,
1840
+ "loss": 0.7241,
1841
  "step": 1515
1842
  },
1843
  {
 
1849
  {
1850
  "epoch": 0.06,
1851
  "learning_rate": 1.986461618861127e-06,
1852
+ "loss": 0.7035,
1853
  "step": 1525
1854
  },
1855
  {
1856
  "epoch": 0.06,
1857
  "learning_rate": 1.986355719539093e-06,
1858
+ "loss": 0.723,
1859
  "step": 1530
1860
  },
1861
  {
1862
  "epoch": 0.06,
1863
  "learning_rate": 1.9862494104934717e-06,
1864
+ "loss": 0.7184,
1865
  "step": 1535
1866
  },
1867
  {
1868
  "epoch": 0.06,
1869
  "learning_rate": 1.9861426917684214e-06,
1870
+ "loss": 0.7018,
1871
  "step": 1540
1872
  },
1873
  {
1874
  "epoch": 0.06,
1875
  "learning_rate": 1.986035563408273e-06,
1876
+ "loss": 0.6943,
1877
  "step": 1545
1878
  },
1879
  {
1880
  "epoch": 0.06,
1881
  "learning_rate": 1.9859280254575268e-06,
1882
+ "loss": 0.7434,
1883
  "step": 1550
1884
  },
1885
  {
1886
  "epoch": 0.06,
1887
  "learning_rate": 1.9858200779608526e-06,
1888
+ "loss": 0.7122,
1889
  "step": 1555
1890
  },
1891
  {
1892
  "epoch": 0.06,
1893
  "learning_rate": 1.9857117209630913e-06,
1894
+ "loss": 0.7187,
1895
  "step": 1560
1896
  },
1897
  {
1898
  "epoch": 0.06,
1899
  "learning_rate": 1.9856029545092536e-06,
1900
+ "loss": 0.6825,
1901
  "step": 1565
1902
  },
1903
  {
1904
  "epoch": 0.06,
1905
  "learning_rate": 1.985493778644519e-06,
1906
+ "loss": 0.6964,
1907
  "step": 1570
1908
  },
1909
  {
1910
  "epoch": 0.06,
1911
  "learning_rate": 1.9853841934142396e-06,
1912
+ "loss": 0.7437,
1913
  "step": 1575
1914
  },
1915
  {
1916
  "epoch": 0.06,
1917
  "learning_rate": 1.9852741988639356e-06,
1918
+ "loss": 0.7125,
1919
  "step": 1580
1920
  },
1921
  {
1922
  "epoch": 0.06,
1923
  "learning_rate": 1.9851637950392974e-06,
1924
+ "loss": 0.7241,
1925
  "step": 1585
1926
  },
1927
  {
1928
  "epoch": 0.06,
1929
  "learning_rate": 1.9850529819861863e-06,
1930
+ "loss": 0.7113,
1931
  "step": 1590
1932
  },
1933
  {
1934
  "epoch": 0.06,
1935
  "learning_rate": 1.984941759750633e-06,
1936
+ "loss": 0.6725,
1937
  "step": 1595
1938
  },
1939
  {
1940
  "epoch": 0.06,
1941
  "learning_rate": 1.984830128378838e-06,
1942
+ "loss": 0.7166,
1943
  "step": 1600
1944
  },
1945
  {
1946
  "epoch": 0.06,
1947
+ "eval_loss": 0.6776626706123352,
1948
+ "eval_runtime": 140.3492,
1949
+ "eval_samples_per_second": 16.858,
1950
+ "eval_steps_per_second": 2.814,
1951
  "step": 1600
1952
+ },
1953
+ {
1954
+ "epoch": 0.07,
1955
+ "learning_rate": 1.9847180879171727e-06,
1956
+ "loss": 0.7111,
1957
+ "step": 1605
1958
+ },
1959
+ {
1960
+ "epoch": 0.07,
1961
+ "learning_rate": 1.9846056384121768e-06,
1962
+ "loss": 0.7004,
1963
+ "step": 1610
1964
+ },
1965
+ {
1966
+ "epoch": 0.07,
1967
+ "learning_rate": 1.9844927799105612e-06,
1968
+ "loss": 0.7221,
1969
+ "step": 1615
1970
+ },
1971
+ {
1972
+ "epoch": 0.07,
1973
+ "learning_rate": 1.984379512459207e-06,
1974
+ "loss": 0.7363,
1975
+ "step": 1620
1976
+ },
1977
+ {
1978
+ "epoch": 0.07,
1979
+ "learning_rate": 1.984265836105163e-06,
1980
+ "loss": 0.7107,
1981
+ "step": 1625
1982
+ },
1983
+ {
1984
+ "epoch": 0.07,
1985
+ "learning_rate": 1.9841517508956506e-06,
1986
+ "loss": 0.7081,
1987
+ "step": 1630
1988
+ },
1989
+ {
1990
+ "epoch": 0.07,
1991
+ "learning_rate": 1.9840372568780594e-06,
1992
+ "loss": 0.6796,
1993
+ "step": 1635
1994
+ },
1995
+ {
1996
+ "epoch": 0.07,
1997
+ "learning_rate": 1.9839223540999496e-06,
1998
+ "loss": 0.7207,
1999
+ "step": 1640
2000
+ },
2001
+ {
2002
+ "epoch": 0.07,
2003
+ "learning_rate": 1.9838070426090505e-06,
2004
+ "loss": 0.716,
2005
+ "step": 1645
2006
+ },
2007
+ {
2008
+ "epoch": 0.07,
2009
+ "learning_rate": 1.983691322453261e-06,
2010
+ "loss": 0.7306,
2011
+ "step": 1650
2012
+ },
2013
+ {
2014
+ "epoch": 0.07,
2015
+ "learning_rate": 1.983575193680651e-06,
2016
+ "loss": 0.724,
2017
+ "step": 1655
2018
+ },
2019
+ {
2020
+ "epoch": 0.07,
2021
+ "learning_rate": 1.983458656339459e-06,
2022
+ "loss": 0.7447,
2023
+ "step": 1660
2024
+ },
2025
+ {
2026
+ "epoch": 0.07,
2027
+ "learning_rate": 1.9833417104780942e-06,
2028
+ "loss": 0.6929,
2029
+ "step": 1665
2030
+ },
2031
+ {
2032
+ "epoch": 0.07,
2033
+ "learning_rate": 1.9832243561451346e-06,
2034
+ "loss": 0.7228,
2035
+ "step": 1670
2036
+ },
2037
+ {
2038
+ "epoch": 0.07,
2039
+ "learning_rate": 1.9831065933893275e-06,
2040
+ "loss": 0.6824,
2041
+ "step": 1675
2042
+ },
2043
+ {
2044
+ "epoch": 0.07,
2045
+ "learning_rate": 1.982988422259591e-06,
2046
+ "loss": 0.7056,
2047
+ "step": 1680
2048
+ },
2049
+ {
2050
+ "epoch": 0.07,
2051
+ "learning_rate": 1.9828698428050123e-06,
2052
+ "loss": 0.6943,
2053
+ "step": 1685
2054
+ },
2055
+ {
2056
+ "epoch": 0.07,
2057
+ "learning_rate": 1.982750855074849e-06,
2058
+ "loss": 0.7101,
2059
+ "step": 1690
2060
+ },
2061
+ {
2062
+ "epoch": 0.07,
2063
+ "learning_rate": 1.9826314591185263e-06,
2064
+ "loss": 0.6786,
2065
+ "step": 1695
2066
+ },
2067
+ {
2068
+ "epoch": 0.07,
2069
+ "learning_rate": 1.9825116549856408e-06,
2070
+ "loss": 0.6954,
2071
+ "step": 1700
2072
+ },
2073
+ {
2074
+ "epoch": 0.07,
2075
+ "learning_rate": 1.9823914427259584e-06,
2076
+ "loss": 0.7165,
2077
+ "step": 1705
2078
+ },
2079
+ {
2080
+ "epoch": 0.07,
2081
+ "learning_rate": 1.982270822389414e-06,
2082
+ "loss": 0.7208,
2083
+ "step": 1710
2084
+ },
2085
+ {
2086
+ "epoch": 0.07,
2087
+ "learning_rate": 1.9821497940261124e-06,
2088
+ "loss": 0.6981,
2089
+ "step": 1715
2090
+ },
2091
+ {
2092
+ "epoch": 0.07,
2093
+ "learning_rate": 1.982028357686327e-06,
2094
+ "loss": 0.6914,
2095
+ "step": 1720
2096
+ },
2097
+ {
2098
+ "epoch": 0.07,
2099
+ "learning_rate": 1.9819065134205026e-06,
2100
+ "loss": 0.7291,
2101
+ "step": 1725
2102
+ },
2103
+ {
2104
+ "epoch": 0.07,
2105
+ "learning_rate": 1.9817842612792513e-06,
2106
+ "loss": 0.6882,
2107
+ "step": 1730
2108
+ },
2109
+ {
2110
+ "epoch": 0.07,
2111
+ "learning_rate": 1.981661601313356e-06,
2112
+ "loss": 0.685,
2113
+ "step": 1735
2114
+ },
2115
+ {
2116
+ "epoch": 0.07,
2117
+ "learning_rate": 1.981538533573768e-06,
2118
+ "loss": 0.6954,
2119
+ "step": 1740
2120
+ },
2121
+ {
2122
+ "epoch": 0.07,
2123
+ "learning_rate": 1.9814150581116093e-06,
2124
+ "loss": 0.7104,
2125
+ "step": 1745
2126
+ },
2127
+ {
2128
+ "epoch": 0.07,
2129
+ "learning_rate": 1.9812911749781705e-06,
2130
+ "loss": 0.7026,
2131
+ "step": 1750
2132
+ },
2133
+ {
2134
+ "epoch": 0.07,
2135
+ "learning_rate": 1.981166884224911e-06,
2136
+ "loss": 0.6907,
2137
+ "step": 1755
2138
+ },
2139
+ {
2140
+ "epoch": 0.07,
2141
+ "learning_rate": 1.981042185903461e-06,
2142
+ "loss": 0.6988,
2143
+ "step": 1760
2144
+ },
2145
+ {
2146
+ "epoch": 0.07,
2147
+ "learning_rate": 1.980917080065618e-06,
2148
+ "loss": 0.6894,
2149
+ "step": 1765
2150
+ },
2151
+ {
2152
+ "epoch": 0.07,
2153
+ "learning_rate": 1.98079156676335e-06,
2154
+ "loss": 0.7308,
2155
+ "step": 1770
2156
+ },
2157
+ {
2158
+ "epoch": 0.07,
2159
+ "learning_rate": 1.9806656460487955e-06,
2160
+ "loss": 0.6688,
2161
+ "step": 1775
2162
+ },
2163
+ {
2164
+ "epoch": 0.07,
2165
+ "learning_rate": 1.9805393179742596e-06,
2166
+ "loss": 0.7028,
2167
+ "step": 1780
2168
+ },
2169
+ {
2170
+ "epoch": 0.07,
2171
+ "learning_rate": 1.980412582592218e-06,
2172
+ "loss": 0.6982,
2173
+ "step": 1785
2174
+ },
2175
+ {
2176
+ "epoch": 0.07,
2177
+ "learning_rate": 1.980285439955316e-06,
2178
+ "loss": 0.7326,
2179
+ "step": 1790
2180
+ },
2181
+ {
2182
+ "epoch": 0.07,
2183
+ "learning_rate": 1.980157890116367e-06,
2184
+ "loss": 0.7204,
2185
+ "step": 1795
2186
+ },
2187
+ {
2188
+ "epoch": 0.07,
2189
+ "learning_rate": 1.980029933128354e-06,
2190
+ "loss": 0.7016,
2191
+ "step": 1800
2192
+ },
2193
+ {
2194
+ "epoch": 0.07,
2195
+ "learning_rate": 1.9799015690444302e-06,
2196
+ "loss": 0.7076,
2197
+ "step": 1805
2198
+ },
2199
+ {
2200
+ "epoch": 0.07,
2201
+ "learning_rate": 1.9797727979179156e-06,
2202
+ "loss": 0.7121,
2203
+ "step": 1810
2204
+ },
2205
+ {
2206
+ "epoch": 0.07,
2207
+ "learning_rate": 1.9796436198023016e-06,
2208
+ "loss": 0.7204,
2209
+ "step": 1815
2210
+ },
2211
+ {
2212
+ "epoch": 0.07,
2213
+ "learning_rate": 1.9795140347512472e-06,
2214
+ "loss": 0.7178,
2215
+ "step": 1820
2216
+ },
2217
+ {
2218
+ "epoch": 0.07,
2219
+ "learning_rate": 1.979384042818581e-06,
2220
+ "loss": 0.7223,
2221
+ "step": 1825
2222
+ },
2223
+ {
2224
+ "epoch": 0.07,
2225
+ "learning_rate": 1.979253644058301e-06,
2226
+ "loss": 0.7066,
2227
+ "step": 1830
2228
+ },
2229
+ {
2230
+ "epoch": 0.07,
2231
+ "learning_rate": 1.979122838524573e-06,
2232
+ "loss": 0.6873,
2233
+ "step": 1835
2234
+ },
2235
+ {
2236
+ "epoch": 0.07,
2237
+ "learning_rate": 1.9789916262717328e-06,
2238
+ "loss": 0.6822,
2239
+ "step": 1840
2240
+ },
2241
+ {
2242
+ "epoch": 0.07,
2243
+ "learning_rate": 1.9788600073542848e-06,
2244
+ "loss": 0.6947,
2245
+ "step": 1845
2246
+ },
2247
+ {
2248
+ "epoch": 0.08,
2249
+ "learning_rate": 1.978727981826902e-06,
2250
+ "loss": 0.7092,
2251
+ "step": 1850
2252
+ },
2253
+ {
2254
+ "epoch": 0.08,
2255
+ "learning_rate": 1.978595549744427e-06,
2256
+ "loss": 0.7166,
2257
+ "step": 1855
2258
+ },
2259
+ {
2260
+ "epoch": 0.08,
2261
+ "learning_rate": 1.9784627111618715e-06,
2262
+ "loss": 0.6842,
2263
+ "step": 1860
2264
+ },
2265
+ {
2266
+ "epoch": 0.08,
2267
+ "learning_rate": 1.9783294661344145e-06,
2268
+ "loss": 0.7161,
2269
+ "step": 1865
2270
+ },
2271
+ {
2272
+ "epoch": 0.08,
2273
+ "learning_rate": 1.978195814717405e-06,
2274
+ "loss": 0.6881,
2275
+ "step": 1870
2276
+ },
2277
+ {
2278
+ "epoch": 0.08,
2279
+ "learning_rate": 1.978061756966361e-06,
2280
+ "loss": 0.7342,
2281
+ "step": 1875
2282
+ },
2283
+ {
2284
+ "epoch": 0.08,
2285
+ "learning_rate": 1.977927292936969e-06,
2286
+ "loss": 0.6767,
2287
+ "step": 1880
2288
+ },
2289
+ {
2290
+ "epoch": 0.08,
2291
+ "learning_rate": 1.9777924226850842e-06,
2292
+ "loss": 0.7096,
2293
+ "step": 1885
2294
+ },
2295
+ {
2296
+ "epoch": 0.08,
2297
+ "learning_rate": 1.97765714626673e-06,
2298
+ "loss": 0.694,
2299
+ "step": 1890
2300
+ },
2301
+ {
2302
+ "epoch": 0.08,
2303
+ "learning_rate": 1.977521463738099e-06,
2304
+ "loss": 0.7152,
2305
+ "step": 1895
2306
+ },
2307
+ {
2308
+ "epoch": 0.08,
2309
+ "learning_rate": 1.9773853751555537e-06,
2310
+ "loss": 0.6618,
2311
+ "step": 1900
2312
+ },
2313
+ {
2314
+ "epoch": 0.08,
2315
+ "learning_rate": 1.977248880575623e-06,
2316
+ "loss": 0.689,
2317
+ "step": 1905
2318
+ },
2319
+ {
2320
+ "epoch": 0.08,
2321
+ "learning_rate": 1.9771119800550054e-06,
2322
+ "loss": 0.6892,
2323
+ "step": 1910
2324
+ },
2325
+ {
2326
+ "epoch": 0.08,
2327
+ "learning_rate": 1.9769746736505694e-06,
2328
+ "loss": 0.7179,
2329
+ "step": 1915
2330
+ },
2331
+ {
2332
+ "epoch": 0.08,
2333
+ "learning_rate": 1.97683696141935e-06,
2334
+ "loss": 0.6888,
2335
+ "step": 1920
2336
+ },
2337
+ {
2338
+ "epoch": 0.08,
2339
+ "learning_rate": 1.9766988434185514e-06,
2340
+ "loss": 0.7041,
2341
+ "step": 1925
2342
+ },
2343
+ {
2344
+ "epoch": 0.08,
2345
+ "learning_rate": 1.976560319705547e-06,
2346
+ "loss": 0.6969,
2347
+ "step": 1930
2348
+ },
2349
+ {
2350
+ "epoch": 0.08,
2351
+ "learning_rate": 1.9764213903378786e-06,
2352
+ "loss": 0.7162,
2353
+ "step": 1935
2354
+ },
2355
+ {
2356
+ "epoch": 0.08,
2357
+ "learning_rate": 1.9762820553732563e-06,
2358
+ "loss": 0.7178,
2359
+ "step": 1940
2360
+ },
2361
+ {
2362
+ "epoch": 0.08,
2363
+ "learning_rate": 1.976142314869558e-06,
2364
+ "loss": 0.7309,
2365
+ "step": 1945
2366
+ },
2367
+ {
2368
+ "epoch": 0.08,
2369
+ "learning_rate": 1.976002168884831e-06,
2370
+ "loss": 0.7198,
2371
+ "step": 1950
2372
+ },
2373
+ {
2374
+ "epoch": 0.08,
2375
+ "learning_rate": 1.975861617477291e-06,
2376
+ "loss": 0.7131,
2377
+ "step": 1955
2378
+ },
2379
+ {
2380
+ "epoch": 0.08,
2381
+ "learning_rate": 1.9757206607053218e-06,
2382
+ "loss": 0.7087,
2383
+ "step": 1960
2384
+ },
2385
+ {
2386
+ "epoch": 0.08,
2387
+ "learning_rate": 1.9755792986274755e-06,
2388
+ "loss": 0.6708,
2389
+ "step": 1965
2390
+ },
2391
+ {
2392
+ "epoch": 0.08,
2393
+ "learning_rate": 1.975437531302472e-06,
2394
+ "loss": 0.7141,
2395
+ "step": 1970
2396
+ },
2397
+ {
2398
+ "epoch": 0.08,
2399
+ "learning_rate": 1.975295358789201e-06,
2400
+ "loss": 0.7152,
2401
+ "step": 1975
2402
+ },
2403
+ {
2404
+ "epoch": 0.08,
2405
+ "learning_rate": 1.9751527811467195e-06,
2406
+ "loss": 0.7172,
2407
+ "step": 1980
2408
+ },
2409
+ {
2410
+ "epoch": 0.08,
2411
+ "learning_rate": 1.9750097984342534e-06,
2412
+ "loss": 0.7472,
2413
+ "step": 1985
2414
+ },
2415
+ {
2416
+ "epoch": 0.08,
2417
+ "learning_rate": 1.9748664107111962e-06,
2418
+ "loss": 0.7129,
2419
+ "step": 1990
2420
+ },
2421
+ {
2422
+ "epoch": 0.08,
2423
+ "learning_rate": 1.9747226180371094e-06,
2424
+ "loss": 0.7066,
2425
+ "step": 1995
2426
+ },
2427
+ {
2428
+ "epoch": 0.08,
2429
+ "learning_rate": 1.974578420471724e-06,
2430
+ "loss": 0.7049,
2431
+ "step": 2000
2432
  }
2433
  ],
2434
  "logging_steps": 5,
2435
  "max_steps": 24619,
2436
  "num_input_tokens_seen": 0,
2437
  "num_train_epochs": 1,
2438
+ "save_steps": 400,
2439
+ "total_flos": 278443669200896.0,
2440
  "trial_name": null,
2441
  "trial_params": null
2442
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bde57295731524ffcd1fb395ad4bc588952a10b5811a7f7d30c1b52c6f6f1358
3
  size 5752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42c2be9c6a28ce5748825d5c0e95370dd7ccd6a17d959b497384bbfa41a17e35
3
  size 5752