jdannem6 commited on
Commit
0588207
1 Parent(s): 2bbb312

Uploaded checkpoint-27500

Browse files
Files changed (5) hide show
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1793 -3
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a1a61d604982debabd50f305447a20e849e6d906fb944a90bfb88ee03a35dd98
3
  size 2692969128
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40a3f22374b10f1dacc2052bbe1eeb9d3ec51c4d0215210d58e218ca693293da
3
  size 2692969128
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:63f96e1ef451a36c123151a3e8a6afe5e5f10261ae5123a4626d1fb00336a925
3
  size 5386075202
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dae1cc318abadbe97655b2e89cba3a93d2fadb650d801744bdfc7c6a4ccca5c0
3
  size 5386075202
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0dc3ee4325da0f20e64010c8e1fb9c1567edc642dd9ab4a2d4367d1009e4383e
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38fbaa4aaa427747240a6b65afd267f1edcb968fa67c3bf21b881737ad1b8da3
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0769304697a92d05b3f54a364ac1e52204140fdb95fd093b56a8d6138f45860
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30c1646ffd4f2e4e86a7c5c87af0949f3be46b7539d2a0137b1bb01bf3e8bbe5
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 0.6027244925498962,
3
  "best_model_checkpoint": "runs/deepseek_CMU-AIR2/math-deepseek_FULL_HardArith_Interm_20240424-065814/checkpoint-5000",
4
- "epoch": 0.125,
5
  "eval_steps": 500,
6
- "global_step": 5000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3587,6 +3587,1796 @@
3587
  "eval_samples_per_second": 26.216,
3588
  "eval_steps_per_second": 26.216,
3589
  "step": 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3590
  }
3591
  ],
3592
  "logging_steps": 10,
@@ -3594,7 +5384,7 @@
3594
  "num_input_tokens_seen": 0,
3595
  "num_train_epochs": 1,
3596
  "save_steps": 2500,
3597
- "total_flos": 7.866849165312e+16,
3598
  "train_batch_size": 1,
3599
  "trial_name": null,
3600
  "trial_params": null
 
1
  {
2
  "best_metric": 0.6027244925498962,
3
  "best_model_checkpoint": "runs/deepseek_CMU-AIR2/math-deepseek_FULL_HardArith_Interm_20240424-065814/checkpoint-5000",
4
+ "epoch": 0.1875,
5
  "eval_steps": 500,
6
+ "global_step": 7500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3587
  "eval_samples_per_second": 26.216,
3588
  "eval_steps_per_second": 26.216,
3589
  "step": 5000
3590
+ },
3591
+ {
3592
+ "epoch": 0.13,
3593
+ "grad_norm": 4.53125,
3594
+ "learning_rate": 1.0505263157894739e-05,
3595
+ "loss": 0.7573,
3596
+ "step": 5010
3597
+ },
3598
+ {
3599
+ "epoch": 0.13,
3600
+ "grad_norm": 1.8515625,
3601
+ "learning_rate": 1.048421052631579e-05,
3602
+ "loss": 0.5092,
3603
+ "step": 5020
3604
+ },
3605
+ {
3606
+ "epoch": 0.13,
3607
+ "grad_norm": 14.9375,
3608
+ "learning_rate": 1.0463157894736844e-05,
3609
+ "loss": 0.642,
3610
+ "step": 5030
3611
+ },
3612
+ {
3613
+ "epoch": 0.13,
3614
+ "grad_norm": 4.90625,
3615
+ "learning_rate": 1.0442105263157895e-05,
3616
+ "loss": 0.6185,
3617
+ "step": 5040
3618
+ },
3619
+ {
3620
+ "epoch": 0.13,
3621
+ "grad_norm": 4.5625,
3622
+ "learning_rate": 1.0421052631578948e-05,
3623
+ "loss": 0.6183,
3624
+ "step": 5050
3625
+ },
3626
+ {
3627
+ "epoch": 0.13,
3628
+ "grad_norm": 6.125,
3629
+ "learning_rate": 1.04e-05,
3630
+ "loss": 0.5496,
3631
+ "step": 5060
3632
+ },
3633
+ {
3634
+ "epoch": 0.13,
3635
+ "grad_norm": 4.5625,
3636
+ "learning_rate": 1.0378947368421053e-05,
3637
+ "loss": 0.5861,
3638
+ "step": 5070
3639
+ },
3640
+ {
3641
+ "epoch": 0.13,
3642
+ "grad_norm": 3.515625,
3643
+ "learning_rate": 1.0357894736842107e-05,
3644
+ "loss": 0.6365,
3645
+ "step": 5080
3646
+ },
3647
+ {
3648
+ "epoch": 0.13,
3649
+ "grad_norm": 4.75,
3650
+ "learning_rate": 1.0336842105263158e-05,
3651
+ "loss": 0.6688,
3652
+ "step": 5090
3653
+ },
3654
+ {
3655
+ "epoch": 0.13,
3656
+ "grad_norm": 2.703125,
3657
+ "learning_rate": 1.0315789473684213e-05,
3658
+ "loss": 0.5361,
3659
+ "step": 5100
3660
+ },
3661
+ {
3662
+ "epoch": 0.13,
3663
+ "grad_norm": 4.125,
3664
+ "learning_rate": 1.0294736842105264e-05,
3665
+ "loss": 0.6006,
3666
+ "step": 5110
3667
+ },
3668
+ {
3669
+ "epoch": 0.13,
3670
+ "grad_norm": 3.890625,
3671
+ "learning_rate": 1.0273684210526316e-05,
3672
+ "loss": 0.6353,
3673
+ "step": 5120
3674
+ },
3675
+ {
3676
+ "epoch": 0.13,
3677
+ "grad_norm": 6.25,
3678
+ "learning_rate": 1.0252631578947369e-05,
3679
+ "loss": 0.6475,
3680
+ "step": 5130
3681
+ },
3682
+ {
3683
+ "epoch": 0.13,
3684
+ "grad_norm": 4.625,
3685
+ "learning_rate": 1.0231578947368422e-05,
3686
+ "loss": 0.498,
3687
+ "step": 5140
3688
+ },
3689
+ {
3690
+ "epoch": 0.13,
3691
+ "grad_norm": 4.75,
3692
+ "learning_rate": 1.0210526315789476e-05,
3693
+ "loss": 0.6467,
3694
+ "step": 5150
3695
+ },
3696
+ {
3697
+ "epoch": 0.13,
3698
+ "grad_norm": 4.1875,
3699
+ "learning_rate": 1.0189473684210527e-05,
3700
+ "loss": 0.611,
3701
+ "step": 5160
3702
+ },
3703
+ {
3704
+ "epoch": 0.13,
3705
+ "grad_norm": 9.75,
3706
+ "learning_rate": 1.0168421052631581e-05,
3707
+ "loss": 0.5392,
3708
+ "step": 5170
3709
+ },
3710
+ {
3711
+ "epoch": 0.13,
3712
+ "grad_norm": 13.25,
3713
+ "learning_rate": 1.0147368421052632e-05,
3714
+ "loss": 0.645,
3715
+ "step": 5180
3716
+ },
3717
+ {
3718
+ "epoch": 0.13,
3719
+ "grad_norm": 4.125,
3720
+ "learning_rate": 1.0126315789473685e-05,
3721
+ "loss": 0.6287,
3722
+ "step": 5190
3723
+ },
3724
+ {
3725
+ "epoch": 0.13,
3726
+ "grad_norm": 4.21875,
3727
+ "learning_rate": 1.0105263157894738e-05,
3728
+ "loss": 0.5719,
3729
+ "step": 5200
3730
+ },
3731
+ {
3732
+ "epoch": 0.13,
3733
+ "grad_norm": 2.671875,
3734
+ "learning_rate": 1.008421052631579e-05,
3735
+ "loss": 0.621,
3736
+ "step": 5210
3737
+ },
3738
+ {
3739
+ "epoch": 0.13,
3740
+ "grad_norm": 2.984375,
3741
+ "learning_rate": 1.0063157894736843e-05,
3742
+ "loss": 0.5169,
3743
+ "step": 5220
3744
+ },
3745
+ {
3746
+ "epoch": 0.13,
3747
+ "grad_norm": 8.375,
3748
+ "learning_rate": 1.0042105263157896e-05,
3749
+ "loss": 0.6793,
3750
+ "step": 5230
3751
+ },
3752
+ {
3753
+ "epoch": 0.13,
3754
+ "grad_norm": 117.0,
3755
+ "learning_rate": 1.002105263157895e-05,
3756
+ "loss": 0.6005,
3757
+ "step": 5240
3758
+ },
3759
+ {
3760
+ "epoch": 0.13,
3761
+ "grad_norm": 3.5625,
3762
+ "learning_rate": 1e-05,
3763
+ "loss": 0.6674,
3764
+ "step": 5250
3765
+ },
3766
+ {
3767
+ "epoch": 0.13,
3768
+ "grad_norm": 3.9375,
3769
+ "learning_rate": 9.978947368421053e-06,
3770
+ "loss": 0.6105,
3771
+ "step": 5260
3772
+ },
3773
+ {
3774
+ "epoch": 0.13,
3775
+ "grad_norm": 4.75,
3776
+ "learning_rate": 9.957894736842106e-06,
3777
+ "loss": 0.5876,
3778
+ "step": 5270
3779
+ },
3780
+ {
3781
+ "epoch": 0.13,
3782
+ "grad_norm": 3.203125,
3783
+ "learning_rate": 9.936842105263159e-06,
3784
+ "loss": 0.5551,
3785
+ "step": 5280
3786
+ },
3787
+ {
3788
+ "epoch": 0.13,
3789
+ "grad_norm": 4.84375,
3790
+ "learning_rate": 9.915789473684211e-06,
3791
+ "loss": 0.5699,
3792
+ "step": 5290
3793
+ },
3794
+ {
3795
+ "epoch": 0.13,
3796
+ "grad_norm": 3.5625,
3797
+ "learning_rate": 9.894736842105264e-06,
3798
+ "loss": 0.5652,
3799
+ "step": 5300
3800
+ },
3801
+ {
3802
+ "epoch": 0.13,
3803
+ "grad_norm": 7.59375,
3804
+ "learning_rate": 9.873684210526317e-06,
3805
+ "loss": 0.6494,
3806
+ "step": 5310
3807
+ },
3808
+ {
3809
+ "epoch": 0.13,
3810
+ "grad_norm": 6.03125,
3811
+ "learning_rate": 9.85263157894737e-06,
3812
+ "loss": 0.6,
3813
+ "step": 5320
3814
+ },
3815
+ {
3816
+ "epoch": 0.13,
3817
+ "grad_norm": 3.3125,
3818
+ "learning_rate": 9.831578947368422e-06,
3819
+ "loss": 0.603,
3820
+ "step": 5330
3821
+ },
3822
+ {
3823
+ "epoch": 0.13,
3824
+ "grad_norm": 3.546875,
3825
+ "learning_rate": 9.810526315789475e-06,
3826
+ "loss": 0.6117,
3827
+ "step": 5340
3828
+ },
3829
+ {
3830
+ "epoch": 0.13,
3831
+ "grad_norm": 4.625,
3832
+ "learning_rate": 9.789473684210527e-06,
3833
+ "loss": 0.5883,
3834
+ "step": 5350
3835
+ },
3836
+ {
3837
+ "epoch": 0.13,
3838
+ "grad_norm": 3.234375,
3839
+ "learning_rate": 9.76842105263158e-06,
3840
+ "loss": 0.5722,
3841
+ "step": 5360
3842
+ },
3843
+ {
3844
+ "epoch": 0.13,
3845
+ "grad_norm": 3.171875,
3846
+ "learning_rate": 9.747368421052633e-06,
3847
+ "loss": 0.5692,
3848
+ "step": 5370
3849
+ },
3850
+ {
3851
+ "epoch": 0.13,
3852
+ "grad_norm": 3.515625,
3853
+ "learning_rate": 9.726315789473685e-06,
3854
+ "loss": 0.7724,
3855
+ "step": 5380
3856
+ },
3857
+ {
3858
+ "epoch": 0.13,
3859
+ "grad_norm": 4.25,
3860
+ "learning_rate": 9.705263157894738e-06,
3861
+ "loss": 0.6017,
3862
+ "step": 5390
3863
+ },
3864
+ {
3865
+ "epoch": 0.14,
3866
+ "grad_norm": 6.53125,
3867
+ "learning_rate": 9.68421052631579e-06,
3868
+ "loss": 0.6049,
3869
+ "step": 5400
3870
+ },
3871
+ {
3872
+ "epoch": 0.14,
3873
+ "grad_norm": 4.34375,
3874
+ "learning_rate": 9.663157894736843e-06,
3875
+ "loss": 0.5158,
3876
+ "step": 5410
3877
+ },
3878
+ {
3879
+ "epoch": 0.14,
3880
+ "grad_norm": 5.34375,
3881
+ "learning_rate": 9.642105263157896e-06,
3882
+ "loss": 0.6487,
3883
+ "step": 5420
3884
+ },
3885
+ {
3886
+ "epoch": 0.14,
3887
+ "grad_norm": 5.71875,
3888
+ "learning_rate": 9.621052631578947e-06,
3889
+ "loss": 0.5599,
3890
+ "step": 5430
3891
+ },
3892
+ {
3893
+ "epoch": 0.14,
3894
+ "grad_norm": 5.34375,
3895
+ "learning_rate": 9.600000000000001e-06,
3896
+ "loss": 0.5035,
3897
+ "step": 5440
3898
+ },
3899
+ {
3900
+ "epoch": 0.14,
3901
+ "grad_norm": 4.4375,
3902
+ "learning_rate": 9.578947368421054e-06,
3903
+ "loss": 0.5768,
3904
+ "step": 5450
3905
+ },
3906
+ {
3907
+ "epoch": 0.14,
3908
+ "grad_norm": 48.25,
3909
+ "learning_rate": 9.557894736842107e-06,
3910
+ "loss": 0.5815,
3911
+ "step": 5460
3912
+ },
3913
+ {
3914
+ "epoch": 0.14,
3915
+ "grad_norm": 3.40625,
3916
+ "learning_rate": 9.53684210526316e-06,
3917
+ "loss": 0.6133,
3918
+ "step": 5470
3919
+ },
3920
+ {
3921
+ "epoch": 0.14,
3922
+ "grad_norm": 2.40625,
3923
+ "learning_rate": 9.515789473684212e-06,
3924
+ "loss": 0.5997,
3925
+ "step": 5480
3926
+ },
3927
+ {
3928
+ "epoch": 0.14,
3929
+ "grad_norm": 3.578125,
3930
+ "learning_rate": 9.494736842105265e-06,
3931
+ "loss": 0.5689,
3932
+ "step": 5490
3933
+ },
3934
+ {
3935
+ "epoch": 0.14,
3936
+ "grad_norm": 7.0625,
3937
+ "learning_rate": 9.473684210526315e-06,
3938
+ "loss": 0.6511,
3939
+ "step": 5500
3940
+ },
3941
+ {
3942
+ "epoch": 0.14,
3943
+ "eval_loss": 0.6153059005737305,
3944
+ "eval_runtime": 38.2125,
3945
+ "eval_samples_per_second": 26.169,
3946
+ "eval_steps_per_second": 26.169,
3947
+ "step": 5500
3948
+ },
3949
+ {
3950
+ "epoch": 0.14,
3951
+ "grad_norm": 3.1875,
3952
+ "learning_rate": 9.452631578947368e-06,
3953
+ "loss": 0.5449,
3954
+ "step": 5510
3955
+ },
3956
+ {
3957
+ "epoch": 0.14,
3958
+ "grad_norm": 4.40625,
3959
+ "learning_rate": 9.43157894736842e-06,
3960
+ "loss": 0.6634,
3961
+ "step": 5520
3962
+ },
3963
+ {
3964
+ "epoch": 0.14,
3965
+ "grad_norm": 7.875,
3966
+ "learning_rate": 9.410526315789475e-06,
3967
+ "loss": 0.6222,
3968
+ "step": 5530
3969
+ },
3970
+ {
3971
+ "epoch": 0.14,
3972
+ "grad_norm": 5.4375,
3973
+ "learning_rate": 9.389473684210528e-06,
3974
+ "loss": 0.5707,
3975
+ "step": 5540
3976
+ },
3977
+ {
3978
+ "epoch": 0.14,
3979
+ "grad_norm": 3.828125,
3980
+ "learning_rate": 9.36842105263158e-06,
3981
+ "loss": 0.4959,
3982
+ "step": 5550
3983
+ },
3984
+ {
3985
+ "epoch": 0.14,
3986
+ "grad_norm": 6.0625,
3987
+ "learning_rate": 9.347368421052633e-06,
3988
+ "loss": 0.5941,
3989
+ "step": 5560
3990
+ },
3991
+ {
3992
+ "epoch": 0.14,
3993
+ "grad_norm": 1.6015625,
3994
+ "learning_rate": 9.326315789473684e-06,
3995
+ "loss": 0.6101,
3996
+ "step": 5570
3997
+ },
3998
+ {
3999
+ "epoch": 0.14,
4000
+ "grad_norm": 4.4375,
4001
+ "learning_rate": 9.305263157894737e-06,
4002
+ "loss": 0.5916,
4003
+ "step": 5580
4004
+ },
4005
+ {
4006
+ "epoch": 0.14,
4007
+ "grad_norm": 4.34375,
4008
+ "learning_rate": 9.28421052631579e-06,
4009
+ "loss": 0.5927,
4010
+ "step": 5590
4011
+ },
4012
+ {
4013
+ "epoch": 0.14,
4014
+ "grad_norm": 5.90625,
4015
+ "learning_rate": 9.263157894736842e-06,
4016
+ "loss": 0.5575,
4017
+ "step": 5600
4018
+ },
4019
+ {
4020
+ "epoch": 0.14,
4021
+ "grad_norm": 8.5,
4022
+ "learning_rate": 9.242105263157896e-06,
4023
+ "loss": 0.5709,
4024
+ "step": 5610
4025
+ },
4026
+ {
4027
+ "epoch": 0.14,
4028
+ "grad_norm": 3.5,
4029
+ "learning_rate": 9.221052631578949e-06,
4030
+ "loss": 0.6237,
4031
+ "step": 5620
4032
+ },
4033
+ {
4034
+ "epoch": 0.14,
4035
+ "grad_norm": 5.25,
4036
+ "learning_rate": 9.200000000000002e-06,
4037
+ "loss": 0.6183,
4038
+ "step": 5630
4039
+ },
4040
+ {
4041
+ "epoch": 0.14,
4042
+ "grad_norm": 6.3125,
4043
+ "learning_rate": 9.178947368421053e-06,
4044
+ "loss": 0.579,
4045
+ "step": 5640
4046
+ },
4047
+ {
4048
+ "epoch": 0.14,
4049
+ "grad_norm": 4.8125,
4050
+ "learning_rate": 9.157894736842105e-06,
4051
+ "loss": 0.5801,
4052
+ "step": 5650
4053
+ },
4054
+ {
4055
+ "epoch": 0.14,
4056
+ "grad_norm": 3.71875,
4057
+ "learning_rate": 9.136842105263158e-06,
4058
+ "loss": 0.5568,
4059
+ "step": 5660
4060
+ },
4061
+ {
4062
+ "epoch": 0.14,
4063
+ "grad_norm": 2.84375,
4064
+ "learning_rate": 9.11578947368421e-06,
4065
+ "loss": 0.6277,
4066
+ "step": 5670
4067
+ },
4068
+ {
4069
+ "epoch": 0.14,
4070
+ "grad_norm": 3.125,
4071
+ "learning_rate": 9.094736842105263e-06,
4072
+ "loss": 0.5189,
4073
+ "step": 5680
4074
+ },
4075
+ {
4076
+ "epoch": 0.14,
4077
+ "grad_norm": 2.3125,
4078
+ "learning_rate": 9.073684210526316e-06,
4079
+ "loss": 0.6304,
4080
+ "step": 5690
4081
+ },
4082
+ {
4083
+ "epoch": 0.14,
4084
+ "grad_norm": 2.359375,
4085
+ "learning_rate": 9.05263157894737e-06,
4086
+ "loss": 0.5956,
4087
+ "step": 5700
4088
+ },
4089
+ {
4090
+ "epoch": 0.14,
4091
+ "grad_norm": 11.9375,
4092
+ "learning_rate": 9.031578947368423e-06,
4093
+ "loss": 0.6483,
4094
+ "step": 5710
4095
+ },
4096
+ {
4097
+ "epoch": 0.14,
4098
+ "grad_norm": 5.09375,
4099
+ "learning_rate": 9.010526315789474e-06,
4100
+ "loss": 0.601,
4101
+ "step": 5720
4102
+ },
4103
+ {
4104
+ "epoch": 0.14,
4105
+ "grad_norm": 3.5,
4106
+ "learning_rate": 8.989473684210527e-06,
4107
+ "loss": 0.5887,
4108
+ "step": 5730
4109
+ },
4110
+ {
4111
+ "epoch": 0.14,
4112
+ "grad_norm": 3.09375,
4113
+ "learning_rate": 8.96842105263158e-06,
4114
+ "loss": 0.63,
4115
+ "step": 5740
4116
+ },
4117
+ {
4118
+ "epoch": 0.14,
4119
+ "grad_norm": 4.4375,
4120
+ "learning_rate": 8.947368421052632e-06,
4121
+ "loss": 0.5988,
4122
+ "step": 5750
4123
+ },
4124
+ {
4125
+ "epoch": 0.14,
4126
+ "grad_norm": 3.265625,
4127
+ "learning_rate": 8.926315789473685e-06,
4128
+ "loss": 0.5692,
4129
+ "step": 5760
4130
+ },
4131
+ {
4132
+ "epoch": 0.14,
4133
+ "grad_norm": 3.015625,
4134
+ "learning_rate": 8.905263157894737e-06,
4135
+ "loss": 0.5917,
4136
+ "step": 5770
4137
+ },
4138
+ {
4139
+ "epoch": 0.14,
4140
+ "grad_norm": 4.40625,
4141
+ "learning_rate": 8.884210526315792e-06,
4142
+ "loss": 0.565,
4143
+ "step": 5780
4144
+ },
4145
+ {
4146
+ "epoch": 0.14,
4147
+ "grad_norm": 4.15625,
4148
+ "learning_rate": 8.863157894736842e-06,
4149
+ "loss": 0.5388,
4150
+ "step": 5790
4151
+ },
4152
+ {
4153
+ "epoch": 0.14,
4154
+ "grad_norm": 7.875,
4155
+ "learning_rate": 8.842105263157895e-06,
4156
+ "loss": 0.6622,
4157
+ "step": 5800
4158
+ },
4159
+ {
4160
+ "epoch": 0.15,
4161
+ "grad_norm": 6.25,
4162
+ "learning_rate": 8.821052631578948e-06,
4163
+ "loss": 0.6043,
4164
+ "step": 5810
4165
+ },
4166
+ {
4167
+ "epoch": 0.15,
4168
+ "grad_norm": 3.953125,
4169
+ "learning_rate": 8.8e-06,
4170
+ "loss": 0.6431,
4171
+ "step": 5820
4172
+ },
4173
+ {
4174
+ "epoch": 0.15,
4175
+ "grad_norm": 2.421875,
4176
+ "learning_rate": 8.778947368421053e-06,
4177
+ "loss": 0.563,
4178
+ "step": 5830
4179
+ },
4180
+ {
4181
+ "epoch": 0.15,
4182
+ "grad_norm": 7.4375,
4183
+ "learning_rate": 8.757894736842106e-06,
4184
+ "loss": 0.6342,
4185
+ "step": 5840
4186
+ },
4187
+ {
4188
+ "epoch": 0.15,
4189
+ "grad_norm": 4.46875,
4190
+ "learning_rate": 8.736842105263158e-06,
4191
+ "loss": 0.6206,
4192
+ "step": 5850
4193
+ },
4194
+ {
4195
+ "epoch": 0.15,
4196
+ "grad_norm": 2.828125,
4197
+ "learning_rate": 8.715789473684211e-06,
4198
+ "loss": 0.5046,
4199
+ "step": 5860
4200
+ },
4201
+ {
4202
+ "epoch": 0.15,
4203
+ "grad_norm": 13.75,
4204
+ "learning_rate": 8.694736842105264e-06,
4205
+ "loss": 0.5405,
4206
+ "step": 5870
4207
+ },
4208
+ {
4209
+ "epoch": 0.15,
4210
+ "grad_norm": 4.6875,
4211
+ "learning_rate": 8.673684210526316e-06,
4212
+ "loss": 0.6021,
4213
+ "step": 5880
4214
+ },
4215
+ {
4216
+ "epoch": 0.15,
4217
+ "grad_norm": 4.0,
4218
+ "learning_rate": 8.652631578947369e-06,
4219
+ "loss": 0.7437,
4220
+ "step": 5890
4221
+ },
4222
+ {
4223
+ "epoch": 0.15,
4224
+ "grad_norm": 3.4375,
4225
+ "learning_rate": 8.631578947368422e-06,
4226
+ "loss": 0.6618,
4227
+ "step": 5900
4228
+ },
4229
+ {
4230
+ "epoch": 0.15,
4231
+ "grad_norm": 3.34375,
4232
+ "learning_rate": 8.610526315789474e-06,
4233
+ "loss": 0.6088,
4234
+ "step": 5910
4235
+ },
4236
+ {
4237
+ "epoch": 0.15,
4238
+ "grad_norm": 4.90625,
4239
+ "learning_rate": 8.589473684210527e-06,
4240
+ "loss": 0.5808,
4241
+ "step": 5920
4242
+ },
4243
+ {
4244
+ "epoch": 0.15,
4245
+ "grad_norm": 4.25,
4246
+ "learning_rate": 8.56842105263158e-06,
4247
+ "loss": 0.6744,
4248
+ "step": 5930
4249
+ },
4250
+ {
4251
+ "epoch": 0.15,
4252
+ "grad_norm": 4.4375,
4253
+ "learning_rate": 8.547368421052632e-06,
4254
+ "loss": 0.5432,
4255
+ "step": 5940
4256
+ },
4257
+ {
4258
+ "epoch": 0.15,
4259
+ "grad_norm": 6.65625,
4260
+ "learning_rate": 8.526315789473685e-06,
4261
+ "loss": 0.6601,
4262
+ "step": 5950
4263
+ },
4264
+ {
4265
+ "epoch": 0.15,
4266
+ "grad_norm": 4.3125,
4267
+ "learning_rate": 8.505263157894738e-06,
4268
+ "loss": 0.4648,
4269
+ "step": 5960
4270
+ },
4271
+ {
4272
+ "epoch": 0.15,
4273
+ "grad_norm": 2.953125,
4274
+ "learning_rate": 8.48421052631579e-06,
4275
+ "loss": 0.5452,
4276
+ "step": 5970
4277
+ },
4278
+ {
4279
+ "epoch": 0.15,
4280
+ "grad_norm": 5.09375,
4281
+ "learning_rate": 8.463157894736843e-06,
4282
+ "loss": 0.5761,
4283
+ "step": 5980
4284
+ },
4285
+ {
4286
+ "epoch": 0.15,
4287
+ "grad_norm": 2.828125,
4288
+ "learning_rate": 8.442105263157896e-06,
4289
+ "loss": 0.5394,
4290
+ "step": 5990
4291
+ },
4292
+ {
4293
+ "epoch": 0.15,
4294
+ "grad_norm": 5.65625,
4295
+ "learning_rate": 8.421052631578948e-06,
4296
+ "loss": 0.6339,
4297
+ "step": 6000
4298
+ },
4299
+ {
4300
+ "epoch": 0.15,
4301
+ "eval_loss": 0.6167545318603516,
4302
+ "eval_runtime": 38.2022,
4303
+ "eval_samples_per_second": 26.177,
4304
+ "eval_steps_per_second": 26.177,
4305
+ "step": 6000
4306
+ },
4307
+ {
4308
+ "epoch": 0.15,
4309
+ "grad_norm": 3.609375,
4310
+ "learning_rate": 8.400000000000001e-06,
4311
+ "loss": 0.5043,
4312
+ "step": 6010
4313
+ },
4314
+ {
4315
+ "epoch": 0.15,
4316
+ "grad_norm": 3.171875,
4317
+ "learning_rate": 8.378947368421054e-06,
4318
+ "loss": 0.6012,
4319
+ "step": 6020
4320
+ },
4321
+ {
4322
+ "epoch": 0.15,
4323
+ "grad_norm": 7.03125,
4324
+ "learning_rate": 8.357894736842106e-06,
4325
+ "loss": 0.5963,
4326
+ "step": 6030
4327
+ },
4328
+ {
4329
+ "epoch": 0.15,
4330
+ "grad_norm": 5.9375,
4331
+ "learning_rate": 8.336842105263159e-06,
4332
+ "loss": 0.6281,
4333
+ "step": 6040
4334
+ },
4335
+ {
4336
+ "epoch": 0.15,
4337
+ "grad_norm": 2.59375,
4338
+ "learning_rate": 8.315789473684212e-06,
4339
+ "loss": 0.4643,
4340
+ "step": 6050
4341
+ },
4342
+ {
4343
+ "epoch": 0.15,
4344
+ "grad_norm": 7.4375,
4345
+ "learning_rate": 8.294736842105264e-06,
4346
+ "loss": 0.6371,
4347
+ "step": 6060
4348
+ },
4349
+ {
4350
+ "epoch": 0.15,
4351
+ "grad_norm": 2.421875,
4352
+ "learning_rate": 8.273684210526317e-06,
4353
+ "loss": 0.5769,
4354
+ "step": 6070
4355
+ },
4356
+ {
4357
+ "epoch": 0.15,
4358
+ "grad_norm": 3.703125,
4359
+ "learning_rate": 8.25263157894737e-06,
4360
+ "loss": 0.7226,
4361
+ "step": 6080
4362
+ },
4363
+ {
4364
+ "epoch": 0.15,
4365
+ "grad_norm": 4.375,
4366
+ "learning_rate": 8.231578947368422e-06,
4367
+ "loss": 0.541,
4368
+ "step": 6090
4369
+ },
4370
+ {
4371
+ "epoch": 0.15,
4372
+ "grad_norm": 4.46875,
4373
+ "learning_rate": 8.210526315789475e-06,
4374
+ "loss": 0.5965,
4375
+ "step": 6100
4376
+ },
4377
+ {
4378
+ "epoch": 0.15,
4379
+ "grad_norm": 6.28125,
4380
+ "learning_rate": 8.189473684210527e-06,
4381
+ "loss": 0.5999,
4382
+ "step": 6110
4383
+ },
4384
+ {
4385
+ "epoch": 0.15,
4386
+ "grad_norm": 6.34375,
4387
+ "learning_rate": 8.16842105263158e-06,
4388
+ "loss": 0.6388,
4389
+ "step": 6120
4390
+ },
4391
+ {
4392
+ "epoch": 0.15,
4393
+ "grad_norm": 5.6875,
4394
+ "learning_rate": 8.147368421052633e-06,
4395
+ "loss": 0.5696,
4396
+ "step": 6130
4397
+ },
4398
+ {
4399
+ "epoch": 0.15,
4400
+ "grad_norm": 2.4375,
4401
+ "learning_rate": 8.126315789473684e-06,
4402
+ "loss": 0.4886,
4403
+ "step": 6140
4404
+ },
4405
+ {
4406
+ "epoch": 0.15,
4407
+ "grad_norm": 5.59375,
4408
+ "learning_rate": 8.105263157894736e-06,
4409
+ "loss": 0.5406,
4410
+ "step": 6150
4411
+ },
4412
+ {
4413
+ "epoch": 0.15,
4414
+ "grad_norm": 6.9375,
4415
+ "learning_rate": 8.08421052631579e-06,
4416
+ "loss": 0.6134,
4417
+ "step": 6160
4418
+ },
4419
+ {
4420
+ "epoch": 0.15,
4421
+ "grad_norm": 4.875,
4422
+ "learning_rate": 8.063157894736843e-06,
4423
+ "loss": 0.6944,
4424
+ "step": 6170
4425
+ },
4426
+ {
4427
+ "epoch": 0.15,
4428
+ "grad_norm": 7.4375,
4429
+ "learning_rate": 8.042105263157896e-06,
4430
+ "loss": 0.5848,
4431
+ "step": 6180
4432
+ },
4433
+ {
4434
+ "epoch": 0.15,
4435
+ "grad_norm": 3.40625,
4436
+ "learning_rate": 8.021052631578949e-06,
4437
+ "loss": 0.5441,
4438
+ "step": 6190
4439
+ },
4440
+ {
4441
+ "epoch": 0.15,
4442
+ "grad_norm": 3.6875,
4443
+ "learning_rate": 8.000000000000001e-06,
4444
+ "loss": 0.678,
4445
+ "step": 6200
4446
+ },
4447
+ {
4448
+ "epoch": 0.16,
4449
+ "grad_norm": 2.625,
4450
+ "learning_rate": 7.978947368421052e-06,
4451
+ "loss": 0.639,
4452
+ "step": 6210
4453
+ },
4454
+ {
4455
+ "epoch": 0.16,
4456
+ "grad_norm": 6.09375,
4457
+ "learning_rate": 7.957894736842105e-06,
4458
+ "loss": 0.6824,
4459
+ "step": 6220
4460
+ },
4461
+ {
4462
+ "epoch": 0.16,
4463
+ "grad_norm": 4.4375,
4464
+ "learning_rate": 7.936842105263158e-06,
4465
+ "loss": 0.6219,
4466
+ "step": 6230
4467
+ },
4468
+ {
4469
+ "epoch": 0.16,
4470
+ "grad_norm": 3.796875,
4471
+ "learning_rate": 7.915789473684212e-06,
4472
+ "loss": 0.6267,
4473
+ "step": 6240
4474
+ },
4475
+ {
4476
+ "epoch": 0.16,
4477
+ "grad_norm": 3.453125,
4478
+ "learning_rate": 7.894736842105265e-06,
4479
+ "loss": 0.5348,
4480
+ "step": 6250
4481
+ },
4482
+ {
4483
+ "epoch": 0.16,
4484
+ "grad_norm": 5.90625,
4485
+ "learning_rate": 7.873684210526317e-06,
4486
+ "loss": 0.6523,
4487
+ "step": 6260
4488
+ },
4489
+ {
4490
+ "epoch": 0.16,
4491
+ "grad_norm": 8.25,
4492
+ "learning_rate": 7.85263157894737e-06,
4493
+ "loss": 0.6024,
4494
+ "step": 6270
4495
+ },
4496
+ {
4497
+ "epoch": 0.16,
4498
+ "grad_norm": 2.875,
4499
+ "learning_rate": 7.831578947368421e-06,
4500
+ "loss": 0.5896,
4501
+ "step": 6280
4502
+ },
4503
+ {
4504
+ "epoch": 0.16,
4505
+ "grad_norm": 7.40625,
4506
+ "learning_rate": 7.810526315789474e-06,
4507
+ "loss": 0.7023,
4508
+ "step": 6290
4509
+ },
4510
+ {
4511
+ "epoch": 0.16,
4512
+ "grad_norm": 4.09375,
4513
+ "learning_rate": 7.789473684210526e-06,
4514
+ "loss": 0.5793,
4515
+ "step": 6300
4516
+ },
4517
+ {
4518
+ "epoch": 0.16,
4519
+ "grad_norm": 6.375,
4520
+ "learning_rate": 7.768421052631579e-06,
4521
+ "loss": 0.6332,
4522
+ "step": 6310
4523
+ },
4524
+ {
4525
+ "epoch": 0.16,
4526
+ "grad_norm": 5.3125,
4527
+ "learning_rate": 7.747368421052631e-06,
4528
+ "loss": 0.5124,
4529
+ "step": 6320
4530
+ },
4531
+ {
4532
+ "epoch": 0.16,
4533
+ "grad_norm": 2.203125,
4534
+ "learning_rate": 7.726315789473686e-06,
4535
+ "loss": 0.6329,
4536
+ "step": 6330
4537
+ },
4538
+ {
4539
+ "epoch": 0.16,
4540
+ "grad_norm": 9.0,
4541
+ "learning_rate": 7.705263157894738e-06,
4542
+ "loss": 0.7025,
4543
+ "step": 6340
4544
+ },
4545
+ {
4546
+ "epoch": 0.16,
4547
+ "grad_norm": 7.09375,
4548
+ "learning_rate": 7.68421052631579e-06,
4549
+ "loss": 0.5197,
4550
+ "step": 6350
4551
+ },
4552
+ {
4553
+ "epoch": 0.16,
4554
+ "grad_norm": 3.78125,
4555
+ "learning_rate": 7.663157894736842e-06,
4556
+ "loss": 0.6173,
4557
+ "step": 6360
4558
+ },
4559
+ {
4560
+ "epoch": 0.16,
4561
+ "grad_norm": 2.578125,
4562
+ "learning_rate": 7.642105263157895e-06,
4563
+ "loss": 0.5852,
4564
+ "step": 6370
4565
+ },
4566
+ {
4567
+ "epoch": 0.16,
4568
+ "grad_norm": 3.109375,
4569
+ "learning_rate": 7.621052631578948e-06,
4570
+ "loss": 0.5688,
4571
+ "step": 6380
4572
+ },
4573
+ {
4574
+ "epoch": 0.16,
4575
+ "grad_norm": 9.0625,
4576
+ "learning_rate": 7.600000000000001e-06,
4577
+ "loss": 0.5457,
4578
+ "step": 6390
4579
+ },
4580
+ {
4581
+ "epoch": 0.16,
4582
+ "grad_norm": 1.9296875,
4583
+ "learning_rate": 7.578947368421054e-06,
4584
+ "loss": 0.6564,
4585
+ "step": 6400
4586
+ },
4587
+ {
4588
+ "epoch": 0.16,
4589
+ "grad_norm": 2.15625,
4590
+ "learning_rate": 7.557894736842106e-06,
4591
+ "loss": 0.5177,
4592
+ "step": 6410
4593
+ },
4594
+ {
4595
+ "epoch": 0.16,
4596
+ "grad_norm": 3.6875,
4597
+ "learning_rate": 7.536842105263158e-06,
4598
+ "loss": 0.6287,
4599
+ "step": 6420
4600
+ },
4601
+ {
4602
+ "epoch": 0.16,
4603
+ "grad_norm": 3.578125,
4604
+ "learning_rate": 7.515789473684211e-06,
4605
+ "loss": 0.6026,
4606
+ "step": 6430
4607
+ },
4608
+ {
4609
+ "epoch": 0.16,
4610
+ "grad_norm": 2.8125,
4611
+ "learning_rate": 7.494736842105263e-06,
4612
+ "loss": 0.5795,
4613
+ "step": 6440
4614
+ },
4615
+ {
4616
+ "epoch": 0.16,
4617
+ "grad_norm": 3.09375,
4618
+ "learning_rate": 7.473684210526316e-06,
4619
+ "loss": 0.6848,
4620
+ "step": 6450
4621
+ },
4622
+ {
4623
+ "epoch": 0.16,
4624
+ "grad_norm": 3.421875,
4625
+ "learning_rate": 7.4526315789473695e-06,
4626
+ "loss": 0.4139,
4627
+ "step": 6460
4628
+ },
4629
+ {
4630
+ "epoch": 0.16,
4631
+ "grad_norm": 5.0,
4632
+ "learning_rate": 7.431578947368422e-06,
4633
+ "loss": 0.5696,
4634
+ "step": 6470
4635
+ },
4636
+ {
4637
+ "epoch": 0.16,
4638
+ "grad_norm": 3.953125,
4639
+ "learning_rate": 7.410526315789475e-06,
4640
+ "loss": 0.6651,
4641
+ "step": 6480
4642
+ },
4643
+ {
4644
+ "epoch": 0.16,
4645
+ "grad_norm": 2.40625,
4646
+ "learning_rate": 7.3894736842105275e-06,
4647
+ "loss": 0.6557,
4648
+ "step": 6490
4649
+ },
4650
+ {
4651
+ "epoch": 0.16,
4652
+ "grad_norm": 8.5625,
4653
+ "learning_rate": 7.368421052631579e-06,
4654
+ "loss": 0.5214,
4655
+ "step": 6500
4656
+ },
4657
+ {
4658
+ "epoch": 0.16,
4659
+ "eval_loss": 0.6133315563201904,
4660
+ "eval_runtime": 38.1535,
4661
+ "eval_samples_per_second": 26.21,
4662
+ "eval_steps_per_second": 26.21,
4663
+ "step": 6500
4664
+ },
4665
+ {
4666
+ "epoch": 0.16,
4667
+ "grad_norm": 2.3125,
4668
+ "learning_rate": 7.347368421052632e-06,
4669
+ "loss": 0.6287,
4670
+ "step": 6510
4671
+ },
4672
+ {
4673
+ "epoch": 0.16,
4674
+ "grad_norm": 6.75,
4675
+ "learning_rate": 7.326315789473685e-06,
4676
+ "loss": 0.6126,
4677
+ "step": 6520
4678
+ },
4679
+ {
4680
+ "epoch": 0.16,
4681
+ "grad_norm": 4.90625,
4682
+ "learning_rate": 7.305263157894737e-06,
4683
+ "loss": 0.68,
4684
+ "step": 6530
4685
+ },
4686
+ {
4687
+ "epoch": 0.16,
4688
+ "grad_norm": 4.96875,
4689
+ "learning_rate": 7.28421052631579e-06,
4690
+ "loss": 0.4831,
4691
+ "step": 6540
4692
+ },
4693
+ {
4694
+ "epoch": 0.16,
4695
+ "grad_norm": 3.328125,
4696
+ "learning_rate": 7.263157894736843e-06,
4697
+ "loss": 0.6983,
4698
+ "step": 6550
4699
+ },
4700
+ {
4701
+ "epoch": 0.16,
4702
+ "grad_norm": 2.640625,
4703
+ "learning_rate": 7.242105263157896e-06,
4704
+ "loss": 0.5214,
4705
+ "step": 6560
4706
+ },
4707
+ {
4708
+ "epoch": 0.16,
4709
+ "grad_norm": 4.5625,
4710
+ "learning_rate": 7.221052631578948e-06,
4711
+ "loss": 0.5789,
4712
+ "step": 6570
4713
+ },
4714
+ {
4715
+ "epoch": 0.16,
4716
+ "grad_norm": 11.3125,
4717
+ "learning_rate": 7.2000000000000005e-06,
4718
+ "loss": 0.6369,
4719
+ "step": 6580
4720
+ },
4721
+ {
4722
+ "epoch": 0.16,
4723
+ "grad_norm": 4.5625,
4724
+ "learning_rate": 7.178947368421053e-06,
4725
+ "loss": 0.608,
4726
+ "step": 6590
4727
+ },
4728
+ {
4729
+ "epoch": 0.17,
4730
+ "grad_norm": 3.25,
4731
+ "learning_rate": 7.157894736842106e-06,
4732
+ "loss": 0.6514,
4733
+ "step": 6600
4734
+ },
4735
+ {
4736
+ "epoch": 0.17,
4737
+ "grad_norm": 8.5625,
4738
+ "learning_rate": 7.1368421052631585e-06,
4739
+ "loss": 0.4964,
4740
+ "step": 6610
4741
+ },
4742
+ {
4743
+ "epoch": 0.17,
4744
+ "grad_norm": 5.40625,
4745
+ "learning_rate": 7.115789473684211e-06,
4746
+ "loss": 0.6726,
4747
+ "step": 6620
4748
+ },
4749
+ {
4750
+ "epoch": 0.17,
4751
+ "grad_norm": 2.65625,
4752
+ "learning_rate": 7.094736842105265e-06,
4753
+ "loss": 0.636,
4754
+ "step": 6630
4755
+ },
4756
+ {
4757
+ "epoch": 0.17,
4758
+ "grad_norm": 5.5625,
4759
+ "learning_rate": 7.073684210526316e-06,
4760
+ "loss": 0.69,
4761
+ "step": 6640
4762
+ },
4763
+ {
4764
+ "epoch": 0.17,
4765
+ "grad_norm": 15.375,
4766
+ "learning_rate": 7.052631578947369e-06,
4767
+ "loss": 0.5622,
4768
+ "step": 6650
4769
+ },
4770
+ {
4771
+ "epoch": 0.17,
4772
+ "grad_norm": 6.5625,
4773
+ "learning_rate": 7.031578947368422e-06,
4774
+ "loss": 0.5597,
4775
+ "step": 6660
4776
+ },
4777
+ {
4778
+ "epoch": 0.17,
4779
+ "grad_norm": 3.46875,
4780
+ "learning_rate": 7.010526315789474e-06,
4781
+ "loss": 0.6246,
4782
+ "step": 6670
4783
+ },
4784
+ {
4785
+ "epoch": 0.17,
4786
+ "grad_norm": 3.515625,
4787
+ "learning_rate": 6.989473684210527e-06,
4788
+ "loss": 0.5107,
4789
+ "step": 6680
4790
+ },
4791
+ {
4792
+ "epoch": 0.17,
4793
+ "grad_norm": 3.84375,
4794
+ "learning_rate": 6.96842105263158e-06,
4795
+ "loss": 0.646,
4796
+ "step": 6690
4797
+ },
4798
+ {
4799
+ "epoch": 0.17,
4800
+ "grad_norm": 3.5625,
4801
+ "learning_rate": 6.947368421052632e-06,
4802
+ "loss": 0.659,
4803
+ "step": 6700
4804
+ },
4805
+ {
4806
+ "epoch": 0.17,
4807
+ "grad_norm": 2.703125,
4808
+ "learning_rate": 6.926315789473684e-06,
4809
+ "loss": 0.4536,
4810
+ "step": 6710
4811
+ },
4812
+ {
4813
+ "epoch": 0.17,
4814
+ "grad_norm": 2.703125,
4815
+ "learning_rate": 6.905263157894737e-06,
4816
+ "loss": 0.6416,
4817
+ "step": 6720
4818
+ },
4819
+ {
4820
+ "epoch": 0.17,
4821
+ "grad_norm": 2.34375,
4822
+ "learning_rate": 6.8842105263157895e-06,
4823
+ "loss": 0.6546,
4824
+ "step": 6730
4825
+ },
4826
+ {
4827
+ "epoch": 0.17,
4828
+ "grad_norm": 3.046875,
4829
+ "learning_rate": 6.863157894736843e-06,
4830
+ "loss": 0.6834,
4831
+ "step": 6740
4832
+ },
4833
+ {
4834
+ "epoch": 0.17,
4835
+ "grad_norm": 5.53125,
4836
+ "learning_rate": 6.842105263157896e-06,
4837
+ "loss": 0.6329,
4838
+ "step": 6750
4839
+ },
4840
+ {
4841
+ "epoch": 0.17,
4842
+ "grad_norm": 4.75,
4843
+ "learning_rate": 6.821052631578948e-06,
4844
+ "loss": 0.5559,
4845
+ "step": 6760
4846
+ },
4847
+ {
4848
+ "epoch": 0.17,
4849
+ "grad_norm": 3.484375,
4850
+ "learning_rate": 6.800000000000001e-06,
4851
+ "loss": 0.5685,
4852
+ "step": 6770
4853
+ },
4854
+ {
4855
+ "epoch": 0.17,
4856
+ "grad_norm": 5.03125,
4857
+ "learning_rate": 6.778947368421053e-06,
4858
+ "loss": 0.6151,
4859
+ "step": 6780
4860
+ },
4861
+ {
4862
+ "epoch": 0.17,
4863
+ "grad_norm": 3.296875,
4864
+ "learning_rate": 6.7578947368421054e-06,
4865
+ "loss": 0.6197,
4866
+ "step": 6790
4867
+ },
4868
+ {
4869
+ "epoch": 0.17,
4870
+ "grad_norm": 3.578125,
4871
+ "learning_rate": 6.736842105263158e-06,
4872
+ "loss": 0.565,
4873
+ "step": 6800
4874
+ },
4875
+ {
4876
+ "epoch": 0.17,
4877
+ "grad_norm": 4.09375,
4878
+ "learning_rate": 6.715789473684211e-06,
4879
+ "loss": 0.5766,
4880
+ "step": 6810
4881
+ },
4882
+ {
4883
+ "epoch": 0.17,
4884
+ "grad_norm": 2.5625,
4885
+ "learning_rate": 6.694736842105264e-06,
4886
+ "loss": 0.5622,
4887
+ "step": 6820
4888
+ },
4889
+ {
4890
+ "epoch": 0.17,
4891
+ "grad_norm": 13.125,
4892
+ "learning_rate": 6.673684210526317e-06,
4893
+ "loss": 0.6691,
4894
+ "step": 6830
4895
+ },
4896
+ {
4897
+ "epoch": 0.17,
4898
+ "grad_norm": 9.375,
4899
+ "learning_rate": 6.6526315789473695e-06,
4900
+ "loss": 0.5872,
4901
+ "step": 6840
4902
+ },
4903
+ {
4904
+ "epoch": 0.17,
4905
+ "grad_norm": 3.296875,
4906
+ "learning_rate": 6.631578947368421e-06,
4907
+ "loss": 0.5883,
4908
+ "step": 6850
4909
+ },
4910
+ {
4911
+ "epoch": 0.17,
4912
+ "grad_norm": 12.0625,
4913
+ "learning_rate": 6.610526315789474e-06,
4914
+ "loss": 0.6339,
4915
+ "step": 6860
4916
+ },
4917
+ {
4918
+ "epoch": 0.17,
4919
+ "grad_norm": 2.96875,
4920
+ "learning_rate": 6.589473684210527e-06,
4921
+ "loss": 0.612,
4922
+ "step": 6870
4923
+ },
4924
+ {
4925
+ "epoch": 0.17,
4926
+ "grad_norm": 3.34375,
4927
+ "learning_rate": 6.568421052631579e-06,
4928
+ "loss": 0.5187,
4929
+ "step": 6880
4930
+ },
4931
+ {
4932
+ "epoch": 0.17,
4933
+ "grad_norm": 2.828125,
4934
+ "learning_rate": 6.547368421052632e-06,
4935
+ "loss": 0.5306,
4936
+ "step": 6890
4937
+ },
4938
+ {
4939
+ "epoch": 0.17,
4940
+ "grad_norm": 3.671875,
4941
+ "learning_rate": 6.526315789473685e-06,
4942
+ "loss": 0.5485,
4943
+ "step": 6900
4944
+ },
4945
+ {
4946
+ "epoch": 0.17,
4947
+ "grad_norm": 9.5625,
4948
+ "learning_rate": 6.505263157894738e-06,
4949
+ "loss": 0.6115,
4950
+ "step": 6910
4951
+ },
4952
+ {
4953
+ "epoch": 0.17,
4954
+ "grad_norm": 3.9375,
4955
+ "learning_rate": 6.484210526315789e-06,
4956
+ "loss": 0.5446,
4957
+ "step": 6920
4958
+ },
4959
+ {
4960
+ "epoch": 0.17,
4961
+ "grad_norm": 1.7109375,
4962
+ "learning_rate": 6.463157894736843e-06,
4963
+ "loss": 0.6284,
4964
+ "step": 6930
4965
+ },
4966
+ {
4967
+ "epoch": 0.17,
4968
+ "grad_norm": 8.5625,
4969
+ "learning_rate": 6.442105263157895e-06,
4970
+ "loss": 0.526,
4971
+ "step": 6940
4972
+ },
4973
+ {
4974
+ "epoch": 0.17,
4975
+ "grad_norm": 4.34375,
4976
+ "learning_rate": 6.421052631578948e-06,
4977
+ "loss": 0.6324,
4978
+ "step": 6950
4979
+ },
4980
+ {
4981
+ "epoch": 0.17,
4982
+ "grad_norm": 5.28125,
4983
+ "learning_rate": 6.4000000000000006e-06,
4984
+ "loss": 0.575,
4985
+ "step": 6960
4986
+ },
4987
+ {
4988
+ "epoch": 0.17,
4989
+ "grad_norm": 3.09375,
4990
+ "learning_rate": 6.378947368421053e-06,
4991
+ "loss": 0.5664,
4992
+ "step": 6970
4993
+ },
4994
+ {
4995
+ "epoch": 0.17,
4996
+ "grad_norm": 3.75,
4997
+ "learning_rate": 6.357894736842106e-06,
4998
+ "loss": 0.5779,
4999
+ "step": 6980
5000
+ },
5001
+ {
5002
+ "epoch": 0.17,
5003
+ "grad_norm": 5.15625,
5004
+ "learning_rate": 6.336842105263158e-06,
5005
+ "loss": 0.5258,
5006
+ "step": 6990
5007
+ },
5008
+ {
5009
+ "epoch": 0.17,
5010
+ "grad_norm": 5.0625,
5011
+ "learning_rate": 6.31578947368421e-06,
5012
+ "loss": 0.5891,
5013
+ "step": 7000
5014
+ },
5015
+ {
5016
+ "epoch": 0.17,
5017
+ "eval_loss": 0.6335042119026184,
5018
+ "eval_runtime": 38.1328,
5019
+ "eval_samples_per_second": 26.224,
5020
+ "eval_steps_per_second": 26.224,
5021
+ "step": 7000
5022
+ },
5023
+ {
5024
+ "epoch": 0.18,
5025
+ "grad_norm": 5.53125,
5026
+ "learning_rate": 6.294736842105264e-06,
5027
+ "loss": 0.524,
5028
+ "step": 7010
5029
+ },
5030
+ {
5031
+ "epoch": 0.18,
5032
+ "grad_norm": 4.46875,
5033
+ "learning_rate": 6.2736842105263165e-06,
5034
+ "loss": 0.6672,
5035
+ "step": 7020
5036
+ },
5037
+ {
5038
+ "epoch": 0.18,
5039
+ "grad_norm": 2.4375,
5040
+ "learning_rate": 6.252631578947369e-06,
5041
+ "loss": 0.5906,
5042
+ "step": 7030
5043
+ },
5044
+ {
5045
+ "epoch": 0.18,
5046
+ "grad_norm": 3.703125,
5047
+ "learning_rate": 6.231578947368422e-06,
5048
+ "loss": 0.6795,
5049
+ "step": 7040
5050
+ },
5051
+ {
5052
+ "epoch": 0.18,
5053
+ "grad_norm": 5.09375,
5054
+ "learning_rate": 6.2105263157894745e-06,
5055
+ "loss": 0.574,
5056
+ "step": 7050
5057
+ },
5058
+ {
5059
+ "epoch": 0.18,
5060
+ "grad_norm": 3.921875,
5061
+ "learning_rate": 6.189473684210526e-06,
5062
+ "loss": 0.6008,
5063
+ "step": 7060
5064
+ },
5065
+ {
5066
+ "epoch": 0.18,
5067
+ "grad_norm": 5.84375,
5068
+ "learning_rate": 6.168421052631579e-06,
5069
+ "loss": 0.5116,
5070
+ "step": 7070
5071
+ },
5072
+ {
5073
+ "epoch": 0.18,
5074
+ "grad_norm": 3.203125,
5075
+ "learning_rate": 6.1473684210526316e-06,
5076
+ "loss": 0.6047,
5077
+ "step": 7080
5078
+ },
5079
+ {
5080
+ "epoch": 0.18,
5081
+ "grad_norm": 2.15625,
5082
+ "learning_rate": 6.126315789473685e-06,
5083
+ "loss": 0.3864,
5084
+ "step": 7090
5085
+ },
5086
+ {
5087
+ "epoch": 0.18,
5088
+ "grad_norm": 3.65625,
5089
+ "learning_rate": 6.105263157894738e-06,
5090
+ "loss": 0.5869,
5091
+ "step": 7100
5092
+ },
5093
+ {
5094
+ "epoch": 0.18,
5095
+ "grad_norm": 9.875,
5096
+ "learning_rate": 6.08421052631579e-06,
5097
+ "loss": 0.5377,
5098
+ "step": 7110
5099
+ },
5100
+ {
5101
+ "epoch": 0.18,
5102
+ "grad_norm": 3.078125,
5103
+ "learning_rate": 6.063157894736843e-06,
5104
+ "loss": 0.5649,
5105
+ "step": 7120
5106
+ },
5107
+ {
5108
+ "epoch": 0.18,
5109
+ "grad_norm": 4.875,
5110
+ "learning_rate": 6.042105263157895e-06,
5111
+ "loss": 0.5955,
5112
+ "step": 7130
5113
+ },
5114
+ {
5115
+ "epoch": 0.18,
5116
+ "grad_norm": 3.53125,
5117
+ "learning_rate": 6.0210526315789475e-06,
5118
+ "loss": 0.5352,
5119
+ "step": 7140
5120
+ },
5121
+ {
5122
+ "epoch": 0.18,
5123
+ "grad_norm": 9.625,
5124
+ "learning_rate": 6e-06,
5125
+ "loss": 0.5801,
5126
+ "step": 7150
5127
+ },
5128
+ {
5129
+ "epoch": 0.18,
5130
+ "grad_norm": 4.375,
5131
+ "learning_rate": 5.978947368421053e-06,
5132
+ "loss": 0.4599,
5133
+ "step": 7160
5134
+ },
5135
+ {
5136
+ "epoch": 0.18,
5137
+ "grad_norm": 3.3125,
5138
+ "learning_rate": 5.9578947368421055e-06,
5139
+ "loss": 0.5337,
5140
+ "step": 7170
5141
+ },
5142
+ {
5143
+ "epoch": 0.18,
5144
+ "grad_norm": 3.40625,
5145
+ "learning_rate": 5.936842105263159e-06,
5146
+ "loss": 0.5677,
5147
+ "step": 7180
5148
+ },
5149
+ {
5150
+ "epoch": 0.18,
5151
+ "grad_norm": 2.90625,
5152
+ "learning_rate": 5.915789473684212e-06,
5153
+ "loss": 0.6926,
5154
+ "step": 7190
5155
+ },
5156
+ {
5157
+ "epoch": 0.18,
5158
+ "grad_norm": 3.15625,
5159
+ "learning_rate": 5.8947368421052634e-06,
5160
+ "loss": 0.6243,
5161
+ "step": 7200
5162
+ },
5163
+ {
5164
+ "epoch": 0.18,
5165
+ "grad_norm": 5.5,
5166
+ "learning_rate": 5.873684210526316e-06,
5167
+ "loss": 0.5837,
5168
+ "step": 7210
5169
+ },
5170
+ {
5171
+ "epoch": 0.18,
5172
+ "grad_norm": 4.53125,
5173
+ "learning_rate": 5.852631578947369e-06,
5174
+ "loss": 0.6096,
5175
+ "step": 7220
5176
+ },
5177
+ {
5178
+ "epoch": 0.18,
5179
+ "grad_norm": 3.984375,
5180
+ "learning_rate": 5.831578947368421e-06,
5181
+ "loss": 0.5598,
5182
+ "step": 7230
5183
+ },
5184
+ {
5185
+ "epoch": 0.18,
5186
+ "grad_norm": 4.40625,
5187
+ "learning_rate": 5.810526315789474e-06,
5188
+ "loss": 0.5448,
5189
+ "step": 7240
5190
+ },
5191
+ {
5192
+ "epoch": 0.18,
5193
+ "grad_norm": 3.671875,
5194
+ "learning_rate": 5.789473684210527e-06,
5195
+ "loss": 0.5487,
5196
+ "step": 7250
5197
+ },
5198
+ {
5199
+ "epoch": 0.18,
5200
+ "grad_norm": 10.5625,
5201
+ "learning_rate": 5.76842105263158e-06,
5202
+ "loss": 0.5739,
5203
+ "step": 7260
5204
+ },
5205
+ {
5206
+ "epoch": 0.18,
5207
+ "grad_norm": 9.4375,
5208
+ "learning_rate": 5.747368421052633e-06,
5209
+ "loss": 0.6278,
5210
+ "step": 7270
5211
+ },
5212
+ {
5213
+ "epoch": 0.18,
5214
+ "grad_norm": 3.578125,
5215
+ "learning_rate": 5.726315789473685e-06,
5216
+ "loss": 0.5801,
5217
+ "step": 7280
5218
+ },
5219
+ {
5220
+ "epoch": 0.18,
5221
+ "grad_norm": 3.796875,
5222
+ "learning_rate": 5.705263157894737e-06,
5223
+ "loss": 0.4992,
5224
+ "step": 7290
5225
+ },
5226
+ {
5227
+ "epoch": 0.18,
5228
+ "grad_norm": 4.8125,
5229
+ "learning_rate": 5.68421052631579e-06,
5230
+ "loss": 0.607,
5231
+ "step": 7300
5232
+ },
5233
+ {
5234
+ "epoch": 0.18,
5235
+ "grad_norm": 5.0,
5236
+ "learning_rate": 5.663157894736843e-06,
5237
+ "loss": 0.5189,
5238
+ "step": 7310
5239
+ },
5240
+ {
5241
+ "epoch": 0.18,
5242
+ "grad_norm": 5.78125,
5243
+ "learning_rate": 5.642105263157895e-06,
5244
+ "loss": 0.6886,
5245
+ "step": 7320
5246
+ },
5247
+ {
5248
+ "epoch": 0.18,
5249
+ "grad_norm": 3.46875,
5250
+ "learning_rate": 5.621052631578948e-06,
5251
+ "loss": 0.6127,
5252
+ "step": 7330
5253
+ },
5254
+ {
5255
+ "epoch": 0.18,
5256
+ "grad_norm": 3.875,
5257
+ "learning_rate": 5.600000000000001e-06,
5258
+ "loss": 0.625,
5259
+ "step": 7340
5260
+ },
5261
+ {
5262
+ "epoch": 0.18,
5263
+ "grad_norm": 16.25,
5264
+ "learning_rate": 5.578947368421052e-06,
5265
+ "loss": 0.537,
5266
+ "step": 7350
5267
+ },
5268
+ {
5269
+ "epoch": 0.18,
5270
+ "grad_norm": 3.90625,
5271
+ "learning_rate": 5.557894736842105e-06,
5272
+ "loss": 0.5933,
5273
+ "step": 7360
5274
+ },
5275
+ {
5276
+ "epoch": 0.18,
5277
+ "grad_norm": 13.375,
5278
+ "learning_rate": 5.5368421052631586e-06,
5279
+ "loss": 0.6001,
5280
+ "step": 7370
5281
+ },
5282
+ {
5283
+ "epoch": 0.18,
5284
+ "grad_norm": 14.75,
5285
+ "learning_rate": 5.515789473684211e-06,
5286
+ "loss": 0.6809,
5287
+ "step": 7380
5288
+ },
5289
+ {
5290
+ "epoch": 0.18,
5291
+ "grad_norm": 5.90625,
5292
+ "learning_rate": 5.494736842105264e-06,
5293
+ "loss": 0.5997,
5294
+ "step": 7390
5295
+ },
5296
+ {
5297
+ "epoch": 0.18,
5298
+ "grad_norm": 4.03125,
5299
+ "learning_rate": 5.4736842105263165e-06,
5300
+ "loss": 0.6418,
5301
+ "step": 7400
5302
+ },
5303
+ {
5304
+ "epoch": 0.19,
5305
+ "grad_norm": 3.65625,
5306
+ "learning_rate": 5.452631578947369e-06,
5307
+ "loss": 0.5285,
5308
+ "step": 7410
5309
+ },
5310
+ {
5311
+ "epoch": 0.19,
5312
+ "grad_norm": 10.0625,
5313
+ "learning_rate": 5.431578947368421e-06,
5314
+ "loss": 0.6465,
5315
+ "step": 7420
5316
+ },
5317
+ {
5318
+ "epoch": 0.19,
5319
+ "grad_norm": 3.359375,
5320
+ "learning_rate": 5.410526315789474e-06,
5321
+ "loss": 0.6697,
5322
+ "step": 7430
5323
+ },
5324
+ {
5325
+ "epoch": 0.19,
5326
+ "grad_norm": 4.21875,
5327
+ "learning_rate": 5.389473684210526e-06,
5328
+ "loss": 0.5943,
5329
+ "step": 7440
5330
+ },
5331
+ {
5332
+ "epoch": 0.19,
5333
+ "grad_norm": 5.875,
5334
+ "learning_rate": 5.36842105263158e-06,
5335
+ "loss": 0.6122,
5336
+ "step": 7450
5337
+ },
5338
+ {
5339
+ "epoch": 0.19,
5340
+ "grad_norm": 2.4375,
5341
+ "learning_rate": 5.3473684210526325e-06,
5342
+ "loss": 0.5825,
5343
+ "step": 7460
5344
+ },
5345
+ {
5346
+ "epoch": 0.19,
5347
+ "grad_norm": 27.75,
5348
+ "learning_rate": 5.326315789473685e-06,
5349
+ "loss": 0.6151,
5350
+ "step": 7470
5351
+ },
5352
+ {
5353
+ "epoch": 0.19,
5354
+ "grad_norm": 3.28125,
5355
+ "learning_rate": 5.305263157894738e-06,
5356
+ "loss": 0.621,
5357
+ "step": 7480
5358
+ },
5359
+ {
5360
+ "epoch": 0.19,
5361
+ "grad_norm": 3.265625,
5362
+ "learning_rate": 5.2842105263157896e-06,
5363
+ "loss": 0.5397,
5364
+ "step": 7490
5365
+ },
5366
+ {
5367
+ "epoch": 0.19,
5368
+ "grad_norm": 4.53125,
5369
+ "learning_rate": 5.263157894736842e-06,
5370
+ "loss": 0.6428,
5371
+ "step": 7500
5372
+ },
5373
+ {
5374
+ "epoch": 0.19,
5375
+ "eval_loss": 0.6119253039360046,
5376
+ "eval_runtime": 38.1677,
5377
+ "eval_samples_per_second": 26.2,
5378
+ "eval_steps_per_second": 26.2,
5379
+ "step": 7500
5380
  }
5381
  ],
5382
  "logging_steps": 10,
 
5384
  "num_input_tokens_seen": 0,
5385
  "num_train_epochs": 1,
5386
  "save_steps": 2500,
5387
+ "total_flos": 1.1800273747968e+17,
5388
  "train_batch_size": 1,
5389
  "trial_name": null,
5390
  "trial_params": null