RonanMcGovern commited on
Commit
cdad970
·
verified ·
1 Parent(s): 6abffdc

Upload folder using huggingface_hub

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d2a42cc56e07c013f73e56708f9890aeba1ef6f2507446949476e63f34240fa
3
  size 185097216
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b11b2ff539c63d3c6bac569f7766703b625a1ff96963c99b63259f1624298ab
3
  size 185097216
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9dd0b2a653496b95d07793491c50539b083d1eff242426d4d9cf66c7f12edaaa
3
  size 370329978
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa9d2a0ec897c5663a4be748f0fc34398833a23f3d592a7fbdfecc9b082648a1
3
  size 370329978
rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:72aed7659a4568e39ea9a56bdc92196603df7d730a90c6411d24926b5d12ad03
3
  size 16433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4c1665affd4ed287cc307c033aa9b83f5129c6f23f546aeee5169ca1b8994af
3
  size 16433
rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ccb7aad33c882b66b79a28ece740dc71664d087d1f12ad61b65b18df1beca55
3
  size 16433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2abd31d46a5b8aba640a7054a1f1886e89e9f6b46ac0ecd782b859dc4214d256
3
  size 16433
rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf3954ab65b1da5768888e4c50598bd4c761244f7027ce8da56d21582e829948
3
  size 16433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6356c5d3297eec0185fbb12716367c7cafc523337fc20be9926fc46325c429a
3
  size 16433
rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9373b1a76204778613e7dd6b7f31b34c6bb969dbb0b802f9f62ca911909492f2
3
  size 16433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:866ba9b91c55d95ef1da6feb07d2ffecceb23df8b5608e080221fc6dbac9ed2f
3
  size 16433
rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d1fc04b2ab887669d463a51ba4296461e18e6793bbf2221bc603bff1af03a7a6
3
  size 16433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:394d36bdefd8333a39a124a3477c46f23015ea9e02d59f4ba6dcce13ab46168f
3
  size 16433
rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81879e4ef695491cab2ffd4ab75ee6208dd18e8e7339d7b4aff8a08580e999d0
3
  size 16433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:577c7045aadc9972169583d0b06cfc68d51e9cd2d016cd56617f0749af75cb31
3
  size 16433
rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0af269f614eb79e3ee82816f8d169e28a4fa1006f684260473e683e7ccea9d58
3
  size 16433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fd7094860928ca612494a904481849cce54a143aaf53d511c3fdafa80f4fefa
3
  size 16433
rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e9f29cd3c33bc4028b8be6ed5e960831203bf34fb31356c09cdb29b4c851d3f
3
  size 16433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8e1cc2af93b92de277256d30ceb20997578b8263be191983b475c3ad46a0790
3
  size 16433
rng_state_8.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:62adbe536ff6dcfa7fb600a17a73fb0c108c33b017287d093401133eb29bb6d4
3
  size 16433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe4f2fce602372e0140548196138ac3dca07af36635f6b15963598f4bc25bf6b
3
  size 16433
rng_state_9.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b7dd71e7a0ba4defb86d5997709ee80f19a9326e37319a19a3ae8c437ad97fa
3
  size 16433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4790382837b89dc610979dcc6f00fc79c7847cb5b0f57da5641fc8d16ea0a4a
3
  size 16433
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4729d5e48ae081b5107dc5941bf9dd080c7d24c9e46db7051ab1bfcf68e98eee
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75fc12f97c880c2c5755340b9a95c8af52d0a5dc3399cb79b9acc0bfca303aae
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.49979917224036463,
5
  "eval_steps": 1431,
6
- "global_step": 7155,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3624,6 +3624,1450 @@
3624
  "eval_samples_per_second": 109.582,
3625
  "eval_steps_per_second": 1.425,
3626
  "step": 7155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3627
  }
3628
  ],
3629
  "logging_steps": 14,
@@ -3643,7 +5087,7 @@
3643
  "attributes": {}
3644
  }
3645
  },
3646
- "total_flos": 1.906493869248217e+18,
3647
  "train_batch_size": 8,
3648
  "trial_name": null,
3649
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.6997188411365105,
5
  "eval_steps": 1431,
6
+ "global_step": 10017,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3624
  "eval_samples_per_second": 109.582,
3625
  "eval_steps_per_second": 1.425,
3626
  "step": 7155
3627
+ },
3628
+ {
3629
+ "epoch": 0.5007072629795855,
3630
+ "grad_norm": 0.380859375,
3631
+ "learning_rate": 0.001,
3632
+ "loss": 1.2987,
3633
+ "step": 7168
3634
+ },
3635
+ {
3636
+ "epoch": 0.5016852068525924,
3637
+ "grad_norm": 0.267578125,
3638
+ "learning_rate": 0.001,
3639
+ "loss": 1.2759,
3640
+ "step": 7182
3641
+ },
3642
+ {
3643
+ "epoch": 0.5026631507255994,
3644
+ "grad_norm": 0.376953125,
3645
+ "learning_rate": 0.001,
3646
+ "loss": 1.2814,
3647
+ "step": 7196
3648
+ },
3649
+ {
3650
+ "epoch": 0.5036410945986064,
3651
+ "grad_norm": 0.359375,
3652
+ "learning_rate": 0.001,
3653
+ "loss": 1.2701,
3654
+ "step": 7210
3655
+ },
3656
+ {
3657
+ "epoch": 0.5046190384716134,
3658
+ "grad_norm": 0.314453125,
3659
+ "learning_rate": 0.001,
3660
+ "loss": 1.2857,
3661
+ "step": 7224
3662
+ },
3663
+ {
3664
+ "epoch": 0.5055969823446205,
3665
+ "grad_norm": 0.234375,
3666
+ "learning_rate": 0.001,
3667
+ "loss": 1.2707,
3668
+ "step": 7238
3669
+ },
3670
+ {
3671
+ "epoch": 0.5065749262176275,
3672
+ "grad_norm": 0.306640625,
3673
+ "learning_rate": 0.001,
3674
+ "loss": 1.2851,
3675
+ "step": 7252
3676
+ },
3677
+ {
3678
+ "epoch": 0.5075528700906344,
3679
+ "grad_norm": 0.337890625,
3680
+ "learning_rate": 0.001,
3681
+ "loss": 1.2722,
3682
+ "step": 7266
3683
+ },
3684
+ {
3685
+ "epoch": 0.5085308139636414,
3686
+ "grad_norm": 0.345703125,
3687
+ "learning_rate": 0.001,
3688
+ "loss": 1.277,
3689
+ "step": 7280
3690
+ },
3691
+ {
3692
+ "epoch": 0.5095087578366484,
3693
+ "grad_norm": 0.33203125,
3694
+ "learning_rate": 0.001,
3695
+ "loss": 1.3021,
3696
+ "step": 7294
3697
+ },
3698
+ {
3699
+ "epoch": 0.5104867017096555,
3700
+ "grad_norm": 0.2314453125,
3701
+ "learning_rate": 0.001,
3702
+ "loss": 1.2856,
3703
+ "step": 7308
3704
+ },
3705
+ {
3706
+ "epoch": 0.5114646455826625,
3707
+ "grad_norm": 0.25,
3708
+ "learning_rate": 0.001,
3709
+ "loss": 1.2704,
3710
+ "step": 7322
3711
+ },
3712
+ {
3713
+ "epoch": 0.5124425894556695,
3714
+ "grad_norm": 0.27734375,
3715
+ "learning_rate": 0.001,
3716
+ "loss": 1.2837,
3717
+ "step": 7336
3718
+ },
3719
+ {
3720
+ "epoch": 0.5134205333286764,
3721
+ "grad_norm": 0.283203125,
3722
+ "learning_rate": 0.001,
3723
+ "loss": 1.277,
3724
+ "step": 7350
3725
+ },
3726
+ {
3727
+ "epoch": 0.5143984772016834,
3728
+ "grad_norm": 0.2734375,
3729
+ "learning_rate": 0.001,
3730
+ "loss": 1.2838,
3731
+ "step": 7364
3732
+ },
3733
+ {
3734
+ "epoch": 0.5153764210746905,
3735
+ "grad_norm": 0.296875,
3736
+ "learning_rate": 0.001,
3737
+ "loss": 1.2762,
3738
+ "step": 7378
3739
+ },
3740
+ {
3741
+ "epoch": 0.5163543649476975,
3742
+ "grad_norm": 0.298828125,
3743
+ "learning_rate": 0.001,
3744
+ "loss": 1.2749,
3745
+ "step": 7392
3746
+ },
3747
+ {
3748
+ "epoch": 0.5173323088207045,
3749
+ "grad_norm": 0.28125,
3750
+ "learning_rate": 0.001,
3751
+ "loss": 1.2791,
3752
+ "step": 7406
3753
+ },
3754
+ {
3755
+ "epoch": 0.5183102526937114,
3756
+ "grad_norm": 0.400390625,
3757
+ "learning_rate": 0.001,
3758
+ "loss": 1.2708,
3759
+ "step": 7420
3760
+ },
3761
+ {
3762
+ "epoch": 0.5192881965667184,
3763
+ "grad_norm": 0.328125,
3764
+ "learning_rate": 0.001,
3765
+ "loss": 1.2773,
3766
+ "step": 7434
3767
+ },
3768
+ {
3769
+ "epoch": 0.5202661404397255,
3770
+ "grad_norm": 0.287109375,
3771
+ "learning_rate": 0.001,
3772
+ "loss": 1.2783,
3773
+ "step": 7448
3774
+ },
3775
+ {
3776
+ "epoch": 0.5212440843127325,
3777
+ "grad_norm": 0.341796875,
3778
+ "learning_rate": 0.001,
3779
+ "loss": 1.2944,
3780
+ "step": 7462
3781
+ },
3782
+ {
3783
+ "epoch": 0.5222220281857395,
3784
+ "grad_norm": 0.369140625,
3785
+ "learning_rate": 0.001,
3786
+ "loss": 1.2714,
3787
+ "step": 7476
3788
+ },
3789
+ {
3790
+ "epoch": 0.5231999720587465,
3791
+ "grad_norm": 0.2421875,
3792
+ "learning_rate": 0.001,
3793
+ "loss": 1.2711,
3794
+ "step": 7490
3795
+ },
3796
+ {
3797
+ "epoch": 0.5241779159317534,
3798
+ "grad_norm": 0.51953125,
3799
+ "learning_rate": 0.001,
3800
+ "loss": 1.2808,
3801
+ "step": 7504
3802
+ },
3803
+ {
3804
+ "epoch": 0.5251558598047605,
3805
+ "grad_norm": 0.271484375,
3806
+ "learning_rate": 0.001,
3807
+ "loss": 1.2765,
3808
+ "step": 7518
3809
+ },
3810
+ {
3811
+ "epoch": 0.5261338036777675,
3812
+ "grad_norm": 0.30078125,
3813
+ "learning_rate": 0.001,
3814
+ "loss": 1.2702,
3815
+ "step": 7532
3816
+ },
3817
+ {
3818
+ "epoch": 0.5271117475507745,
3819
+ "grad_norm": 0.26953125,
3820
+ "learning_rate": 0.001,
3821
+ "loss": 1.2802,
3822
+ "step": 7546
3823
+ },
3824
+ {
3825
+ "epoch": 0.5280896914237815,
3826
+ "grad_norm": 0.57421875,
3827
+ "learning_rate": 0.001,
3828
+ "loss": 1.2733,
3829
+ "step": 7560
3830
+ },
3831
+ {
3832
+ "epoch": 0.5290676352967885,
3833
+ "grad_norm": 0.494140625,
3834
+ "learning_rate": 0.001,
3835
+ "loss": 1.2575,
3836
+ "step": 7574
3837
+ },
3838
+ {
3839
+ "epoch": 0.5300455791697956,
3840
+ "grad_norm": 0.376953125,
3841
+ "learning_rate": 0.001,
3842
+ "loss": 1.2863,
3843
+ "step": 7588
3844
+ },
3845
+ {
3846
+ "epoch": 0.5310235230428025,
3847
+ "grad_norm": 0.369140625,
3848
+ "learning_rate": 0.001,
3849
+ "loss": 1.2815,
3850
+ "step": 7602
3851
+ },
3852
+ {
3853
+ "epoch": 0.5320014669158095,
3854
+ "grad_norm": 0.3046875,
3855
+ "learning_rate": 0.001,
3856
+ "loss": 1.2745,
3857
+ "step": 7616
3858
+ },
3859
+ {
3860
+ "epoch": 0.5329794107888165,
3861
+ "grad_norm": 0.271484375,
3862
+ "learning_rate": 0.001,
3863
+ "loss": 1.294,
3864
+ "step": 7630
3865
+ },
3866
+ {
3867
+ "epoch": 0.5339573546618235,
3868
+ "grad_norm": 0.2265625,
3869
+ "learning_rate": 0.001,
3870
+ "loss": 1.2797,
3871
+ "step": 7644
3872
+ },
3873
+ {
3874
+ "epoch": 0.5349352985348306,
3875
+ "grad_norm": 0.3203125,
3876
+ "learning_rate": 0.001,
3877
+ "loss": 1.2665,
3878
+ "step": 7658
3879
+ },
3880
+ {
3881
+ "epoch": 0.5359132424078376,
3882
+ "grad_norm": 0.458984375,
3883
+ "learning_rate": 0.001,
3884
+ "loss": 1.28,
3885
+ "step": 7672
3886
+ },
3887
+ {
3888
+ "epoch": 0.5368911862808445,
3889
+ "grad_norm": 0.359375,
3890
+ "learning_rate": 0.001,
3891
+ "loss": 1.3057,
3892
+ "step": 7686
3893
+ },
3894
+ {
3895
+ "epoch": 0.5378691301538515,
3896
+ "grad_norm": 0.37109375,
3897
+ "learning_rate": 0.001,
3898
+ "loss": 1.258,
3899
+ "step": 7700
3900
+ },
3901
+ {
3902
+ "epoch": 0.5388470740268585,
3903
+ "grad_norm": 0.33203125,
3904
+ "learning_rate": 0.001,
3905
+ "loss": 1.2742,
3906
+ "step": 7714
3907
+ },
3908
+ {
3909
+ "epoch": 0.5398250178998655,
3910
+ "grad_norm": 0.365234375,
3911
+ "learning_rate": 0.001,
3912
+ "loss": 1.277,
3913
+ "step": 7728
3914
+ },
3915
+ {
3916
+ "epoch": 0.5408029617728726,
3917
+ "grad_norm": 0.412109375,
3918
+ "learning_rate": 0.001,
3919
+ "loss": 1.2819,
3920
+ "step": 7742
3921
+ },
3922
+ {
3923
+ "epoch": 0.5417809056458796,
3924
+ "grad_norm": 0.263671875,
3925
+ "learning_rate": 0.001,
3926
+ "loss": 1.3018,
3927
+ "step": 7756
3928
+ },
3929
+ {
3930
+ "epoch": 0.5427588495188865,
3931
+ "grad_norm": 0.263671875,
3932
+ "learning_rate": 0.001,
3933
+ "loss": 1.2619,
3934
+ "step": 7770
3935
+ },
3936
+ {
3937
+ "epoch": 0.5437367933918935,
3938
+ "grad_norm": 0.296875,
3939
+ "learning_rate": 0.001,
3940
+ "loss": 1.2513,
3941
+ "step": 7784
3942
+ },
3943
+ {
3944
+ "epoch": 0.5447147372649005,
3945
+ "grad_norm": 0.271484375,
3946
+ "learning_rate": 0.001,
3947
+ "loss": 1.2456,
3948
+ "step": 7798
3949
+ },
3950
+ {
3951
+ "epoch": 0.5456926811379076,
3952
+ "grad_norm": 0.62109375,
3953
+ "learning_rate": 0.001,
3954
+ "loss": 1.2768,
3955
+ "step": 7812
3956
+ },
3957
+ {
3958
+ "epoch": 0.5466706250109146,
3959
+ "grad_norm": 0.44140625,
3960
+ "learning_rate": 0.001,
3961
+ "loss": 1.265,
3962
+ "step": 7826
3963
+ },
3964
+ {
3965
+ "epoch": 0.5476485688839215,
3966
+ "grad_norm": 0.306640625,
3967
+ "learning_rate": 0.001,
3968
+ "loss": 1.2683,
3969
+ "step": 7840
3970
+ },
3971
+ {
3972
+ "epoch": 0.5486265127569285,
3973
+ "grad_norm": 0.28515625,
3974
+ "learning_rate": 0.001,
3975
+ "loss": 1.2809,
3976
+ "step": 7854
3977
+ },
3978
+ {
3979
+ "epoch": 0.5496044566299355,
3980
+ "grad_norm": 0.263671875,
3981
+ "learning_rate": 0.001,
3982
+ "loss": 1.2498,
3983
+ "step": 7868
3984
+ },
3985
+ {
3986
+ "epoch": 0.5505824005029426,
3987
+ "grad_norm": 0.333984375,
3988
+ "learning_rate": 0.001,
3989
+ "loss": 1.2632,
3990
+ "step": 7882
3991
+ },
3992
+ {
3993
+ "epoch": 0.5515603443759496,
3994
+ "grad_norm": 0.29296875,
3995
+ "learning_rate": 0.001,
3996
+ "loss": 1.2711,
3997
+ "step": 7896
3998
+ },
3999
+ {
4000
+ "epoch": 0.5525382882489566,
4001
+ "grad_norm": 0.373046875,
4002
+ "learning_rate": 0.001,
4003
+ "loss": 1.2813,
4004
+ "step": 7910
4005
+ },
4006
+ {
4007
+ "epoch": 0.5535162321219635,
4008
+ "grad_norm": 0.427734375,
4009
+ "learning_rate": 0.001,
4010
+ "loss": 1.2993,
4011
+ "step": 7924
4012
+ },
4013
+ {
4014
+ "epoch": 0.5544941759949705,
4015
+ "grad_norm": 0.373046875,
4016
+ "learning_rate": 0.001,
4017
+ "loss": 1.3001,
4018
+ "step": 7938
4019
+ },
4020
+ {
4021
+ "epoch": 0.5554721198679776,
4022
+ "grad_norm": 0.416015625,
4023
+ "learning_rate": 0.001,
4024
+ "loss": 1.2786,
4025
+ "step": 7952
4026
+ },
4027
+ {
4028
+ "epoch": 0.5564500637409846,
4029
+ "grad_norm": 0.298828125,
4030
+ "learning_rate": 0.001,
4031
+ "loss": 1.2976,
4032
+ "step": 7966
4033
+ },
4034
+ {
4035
+ "epoch": 0.5574280076139916,
4036
+ "grad_norm": 0.30078125,
4037
+ "learning_rate": 0.001,
4038
+ "loss": 1.286,
4039
+ "step": 7980
4040
+ },
4041
+ {
4042
+ "epoch": 0.5584059514869986,
4043
+ "grad_norm": 0.59765625,
4044
+ "learning_rate": 0.001,
4045
+ "loss": 1.282,
4046
+ "step": 7994
4047
+ },
4048
+ {
4049
+ "epoch": 0.5593838953600055,
4050
+ "grad_norm": 0.244140625,
4051
+ "learning_rate": 0.001,
4052
+ "loss": 1.2853,
4053
+ "step": 8008
4054
+ },
4055
+ {
4056
+ "epoch": 0.5603618392330126,
4057
+ "grad_norm": 0.2265625,
4058
+ "learning_rate": 0.001,
4059
+ "loss": 1.2572,
4060
+ "step": 8022
4061
+ },
4062
+ {
4063
+ "epoch": 0.5613397831060196,
4064
+ "grad_norm": 0.306640625,
4065
+ "learning_rate": 0.001,
4066
+ "loss": 1.2572,
4067
+ "step": 8036
4068
+ },
4069
+ {
4070
+ "epoch": 0.5623177269790266,
4071
+ "grad_norm": 0.63671875,
4072
+ "learning_rate": 0.001,
4073
+ "loss": 1.315,
4074
+ "step": 8050
4075
+ },
4076
+ {
4077
+ "epoch": 0.5632956708520336,
4078
+ "grad_norm": 0.30859375,
4079
+ "learning_rate": 0.001,
4080
+ "loss": 1.3007,
4081
+ "step": 8064
4082
+ },
4083
+ {
4084
+ "epoch": 0.5642736147250406,
4085
+ "grad_norm": 0.271484375,
4086
+ "learning_rate": 0.001,
4087
+ "loss": 1.2737,
4088
+ "step": 8078
4089
+ },
4090
+ {
4091
+ "epoch": 0.5652515585980477,
4092
+ "grad_norm": 0.24609375,
4093
+ "learning_rate": 0.001,
4094
+ "loss": 1.2766,
4095
+ "step": 8092
4096
+ },
4097
+ {
4098
+ "epoch": 0.5662295024710546,
4099
+ "grad_norm": 0.314453125,
4100
+ "learning_rate": 0.001,
4101
+ "loss": 1.3102,
4102
+ "step": 8106
4103
+ },
4104
+ {
4105
+ "epoch": 0.5672074463440616,
4106
+ "grad_norm": 0.328125,
4107
+ "learning_rate": 0.001,
4108
+ "loss": 1.3044,
4109
+ "step": 8120
4110
+ },
4111
+ {
4112
+ "epoch": 0.5681853902170686,
4113
+ "grad_norm": 0.380859375,
4114
+ "learning_rate": 0.001,
4115
+ "loss": 1.2612,
4116
+ "step": 8134
4117
+ },
4118
+ {
4119
+ "epoch": 0.5691633340900756,
4120
+ "grad_norm": 0.255859375,
4121
+ "learning_rate": 0.001,
4122
+ "loss": 1.2701,
4123
+ "step": 8148
4124
+ },
4125
+ {
4126
+ "epoch": 0.5701412779630827,
4127
+ "grad_norm": 0.21875,
4128
+ "learning_rate": 0.001,
4129
+ "loss": 1.2649,
4130
+ "step": 8162
4131
+ },
4132
+ {
4133
+ "epoch": 0.5711192218360897,
4134
+ "grad_norm": 0.2236328125,
4135
+ "learning_rate": 0.001,
4136
+ "loss": 1.2761,
4137
+ "step": 8176
4138
+ },
4139
+ {
4140
+ "epoch": 0.5720971657090966,
4141
+ "grad_norm": 0.29296875,
4142
+ "learning_rate": 0.001,
4143
+ "loss": 1.2668,
4144
+ "step": 8190
4145
+ },
4146
+ {
4147
+ "epoch": 0.5730751095821036,
4148
+ "grad_norm": 0.31640625,
4149
+ "learning_rate": 0.001,
4150
+ "loss": 1.2847,
4151
+ "step": 8204
4152
+ },
4153
+ {
4154
+ "epoch": 0.5740530534551106,
4155
+ "grad_norm": 0.3203125,
4156
+ "learning_rate": 0.001,
4157
+ "loss": 1.2722,
4158
+ "step": 8218
4159
+ },
4160
+ {
4161
+ "epoch": 0.5750309973281177,
4162
+ "grad_norm": 0.224609375,
4163
+ "learning_rate": 0.001,
4164
+ "loss": 1.253,
4165
+ "step": 8232
4166
+ },
4167
+ {
4168
+ "epoch": 0.5760089412011247,
4169
+ "grad_norm": 0.2890625,
4170
+ "learning_rate": 0.001,
4171
+ "loss": 1.2454,
4172
+ "step": 8246
4173
+ },
4174
+ {
4175
+ "epoch": 0.5769868850741317,
4176
+ "grad_norm": 0.283203125,
4177
+ "learning_rate": 0.001,
4178
+ "loss": 1.2558,
4179
+ "step": 8260
4180
+ },
4181
+ {
4182
+ "epoch": 0.5779648289471386,
4183
+ "grad_norm": 0.265625,
4184
+ "learning_rate": 0.001,
4185
+ "loss": 1.2765,
4186
+ "step": 8274
4187
+ },
4188
+ {
4189
+ "epoch": 0.5789427728201456,
4190
+ "grad_norm": 0.314453125,
4191
+ "learning_rate": 0.001,
4192
+ "loss": 1.289,
4193
+ "step": 8288
4194
+ },
4195
+ {
4196
+ "epoch": 0.5799207166931526,
4197
+ "grad_norm": 0.333984375,
4198
+ "learning_rate": 0.001,
4199
+ "loss": 1.2724,
4200
+ "step": 8302
4201
+ },
4202
+ {
4203
+ "epoch": 0.5808986605661597,
4204
+ "grad_norm": 0.44140625,
4205
+ "learning_rate": 0.001,
4206
+ "loss": 1.2753,
4207
+ "step": 8316
4208
+ },
4209
+ {
4210
+ "epoch": 0.5818766044391667,
4211
+ "grad_norm": 0.326171875,
4212
+ "learning_rate": 0.001,
4213
+ "loss": 1.2558,
4214
+ "step": 8330
4215
+ },
4216
+ {
4217
+ "epoch": 0.5828545483121736,
4218
+ "grad_norm": 0.271484375,
4219
+ "learning_rate": 0.001,
4220
+ "loss": 1.2697,
4221
+ "step": 8344
4222
+ },
4223
+ {
4224
+ "epoch": 0.5838324921851806,
4225
+ "grad_norm": 0.34375,
4226
+ "learning_rate": 0.001,
4227
+ "loss": 1.2685,
4228
+ "step": 8358
4229
+ },
4230
+ {
4231
+ "epoch": 0.5848104360581876,
4232
+ "grad_norm": 0.275390625,
4233
+ "learning_rate": 0.001,
4234
+ "loss": 1.2724,
4235
+ "step": 8372
4236
+ },
4237
+ {
4238
+ "epoch": 0.5857883799311947,
4239
+ "grad_norm": 0.2255859375,
4240
+ "learning_rate": 0.001,
4241
+ "loss": 1.2287,
4242
+ "step": 8386
4243
+ },
4244
+ {
4245
+ "epoch": 0.5867663238042017,
4246
+ "grad_norm": 0.212890625,
4247
+ "learning_rate": 0.001,
4248
+ "loss": 1.2363,
4249
+ "step": 8400
4250
+ },
4251
+ {
4252
+ "epoch": 0.5877442676772087,
4253
+ "grad_norm": 0.279296875,
4254
+ "learning_rate": 0.001,
4255
+ "loss": 1.2648,
4256
+ "step": 8414
4257
+ },
4258
+ {
4259
+ "epoch": 0.5887222115502156,
4260
+ "grad_norm": 0.427734375,
4261
+ "learning_rate": 0.001,
4262
+ "loss": 1.2949,
4263
+ "step": 8428
4264
+ },
4265
+ {
4266
+ "epoch": 0.5897001554232226,
4267
+ "grad_norm": 0.23828125,
4268
+ "learning_rate": 0.001,
4269
+ "loss": 1.2571,
4270
+ "step": 8442
4271
+ },
4272
+ {
4273
+ "epoch": 0.5906780992962297,
4274
+ "grad_norm": 0.349609375,
4275
+ "learning_rate": 0.001,
4276
+ "loss": 1.2831,
4277
+ "step": 8456
4278
+ },
4279
+ {
4280
+ "epoch": 0.5916560431692367,
4281
+ "grad_norm": 0.34765625,
4282
+ "learning_rate": 0.001,
4283
+ "loss": 1.2965,
4284
+ "step": 8470
4285
+ },
4286
+ {
4287
+ "epoch": 0.5926339870422437,
4288
+ "grad_norm": 0.412109375,
4289
+ "learning_rate": 0.001,
4290
+ "loss": 1.2685,
4291
+ "step": 8484
4292
+ },
4293
+ {
4294
+ "epoch": 0.5936119309152507,
4295
+ "grad_norm": 0.439453125,
4296
+ "learning_rate": 0.001,
4297
+ "loss": 1.2637,
4298
+ "step": 8498
4299
+ },
4300
+ {
4301
+ "epoch": 0.5945898747882576,
4302
+ "grad_norm": 0.3671875,
4303
+ "learning_rate": 0.001,
4304
+ "loss": 1.28,
4305
+ "step": 8512
4306
+ },
4307
+ {
4308
+ "epoch": 0.5955678186612647,
4309
+ "grad_norm": 0.43359375,
4310
+ "learning_rate": 0.001,
4311
+ "loss": 1.2636,
4312
+ "step": 8526
4313
+ },
4314
+ {
4315
+ "epoch": 0.5965457625342717,
4316
+ "grad_norm": 0.333984375,
4317
+ "learning_rate": 0.001,
4318
+ "loss": 1.251,
4319
+ "step": 8540
4320
+ },
4321
+ {
4322
+ "epoch": 0.5975237064072787,
4323
+ "grad_norm": 0.328125,
4324
+ "learning_rate": 0.001,
4325
+ "loss": 1.262,
4326
+ "step": 8554
4327
+ },
4328
+ {
4329
+ "epoch": 0.5985016502802857,
4330
+ "grad_norm": 0.365234375,
4331
+ "learning_rate": 0.001,
4332
+ "loss": 1.2696,
4333
+ "step": 8568
4334
+ },
4335
+ {
4336
+ "epoch": 0.5994795941532927,
4337
+ "grad_norm": 0.28515625,
4338
+ "learning_rate": 0.001,
4339
+ "loss": 1.2872,
4340
+ "step": 8582
4341
+ },
4342
+ {
4343
+ "epoch": 0.5997590066884375,
4344
+ "eval_loss": 1.661841869354248,
4345
+ "eval_runtime": 9.1193,
4346
+ "eval_samples_per_second": 109.657,
4347
+ "eval_steps_per_second": 1.426,
4348
+ "step": 8586
4349
+ },
4350
+ {
4351
+ "epoch": 0.6004575380262998,
4352
+ "grad_norm": 0.2451171875,
4353
+ "learning_rate": 0.001,
4354
+ "loss": 1.2767,
4355
+ "step": 8596
4356
+ },
4357
+ {
4358
+ "epoch": 0.6014354818993067,
4359
+ "grad_norm": 0.33984375,
4360
+ "learning_rate": 0.001,
4361
+ "loss": 1.2623,
4362
+ "step": 8610
4363
+ },
4364
+ {
4365
+ "epoch": 0.6024134257723137,
4366
+ "grad_norm": 0.26953125,
4367
+ "learning_rate": 0.001,
4368
+ "loss": 1.2617,
4369
+ "step": 8624
4370
+ },
4371
+ {
4372
+ "epoch": 0.6033913696453207,
4373
+ "grad_norm": 0.25390625,
4374
+ "learning_rate": 0.001,
4375
+ "loss": 1.2514,
4376
+ "step": 8638
4377
+ },
4378
+ {
4379
+ "epoch": 0.6043693135183277,
4380
+ "grad_norm": 0.255859375,
4381
+ "learning_rate": 0.001,
4382
+ "loss": 1.2664,
4383
+ "step": 8652
4384
+ },
4385
+ {
4386
+ "epoch": 0.6053472573913348,
4387
+ "grad_norm": 0.357421875,
4388
+ "learning_rate": 0.001,
4389
+ "loss": 1.2421,
4390
+ "step": 8666
4391
+ },
4392
+ {
4393
+ "epoch": 0.6063252012643418,
4394
+ "grad_norm": 0.263671875,
4395
+ "learning_rate": 0.001,
4396
+ "loss": 1.2386,
4397
+ "step": 8680
4398
+ },
4399
+ {
4400
+ "epoch": 0.6073031451373487,
4401
+ "grad_norm": 0.259765625,
4402
+ "learning_rate": 0.001,
4403
+ "loss": 1.2601,
4404
+ "step": 8694
4405
+ },
4406
+ {
4407
+ "epoch": 0.6082810890103557,
4408
+ "grad_norm": 0.94921875,
4409
+ "learning_rate": 0.001,
4410
+ "loss": 1.2715,
4411
+ "step": 8708
4412
+ },
4413
+ {
4414
+ "epoch": 0.6092590328833627,
4415
+ "grad_norm": 0.43359375,
4416
+ "learning_rate": 0.001,
4417
+ "loss": 1.2848,
4418
+ "step": 8722
4419
+ },
4420
+ {
4421
+ "epoch": 0.6102369767563698,
4422
+ "grad_norm": 0.34375,
4423
+ "learning_rate": 0.001,
4424
+ "loss": 1.2632,
4425
+ "step": 8736
4426
+ },
4427
+ {
4428
+ "epoch": 0.6112149206293768,
4429
+ "grad_norm": 0.283203125,
4430
+ "learning_rate": 0.001,
4431
+ "loss": 1.2912,
4432
+ "step": 8750
4433
+ },
4434
+ {
4435
+ "epoch": 0.6121928645023837,
4436
+ "grad_norm": 0.388671875,
4437
+ "learning_rate": 0.001,
4438
+ "loss": 1.2613,
4439
+ "step": 8764
4440
+ },
4441
+ {
4442
+ "epoch": 0.6131708083753907,
4443
+ "grad_norm": 0.27734375,
4444
+ "learning_rate": 0.001,
4445
+ "loss": 1.2357,
4446
+ "step": 8778
4447
+ },
4448
+ {
4449
+ "epoch": 0.6141487522483977,
4450
+ "grad_norm": 0.30859375,
4451
+ "learning_rate": 0.001,
4452
+ "loss": 1.2541,
4453
+ "step": 8792
4454
+ },
4455
+ {
4456
+ "epoch": 0.6151266961214047,
4457
+ "grad_norm": 0.32421875,
4458
+ "learning_rate": 0.001,
4459
+ "loss": 1.2746,
4460
+ "step": 8806
4461
+ },
4462
+ {
4463
+ "epoch": 0.6161046399944118,
4464
+ "grad_norm": 0.271484375,
4465
+ "learning_rate": 0.001,
4466
+ "loss": 1.2445,
4467
+ "step": 8820
4468
+ },
4469
+ {
4470
+ "epoch": 0.6170825838674188,
4471
+ "grad_norm": 0.255859375,
4472
+ "learning_rate": 0.001,
4473
+ "loss": 1.2854,
4474
+ "step": 8834
4475
+ },
4476
+ {
4477
+ "epoch": 0.6180605277404257,
4478
+ "grad_norm": 0.54296875,
4479
+ "learning_rate": 0.001,
4480
+ "loss": 1.2746,
4481
+ "step": 8848
4482
+ },
4483
+ {
4484
+ "epoch": 0.6190384716134327,
4485
+ "grad_norm": 0.35546875,
4486
+ "learning_rate": 0.001,
4487
+ "loss": 1.2837,
4488
+ "step": 8862
4489
+ },
4490
+ {
4491
+ "epoch": 0.6200164154864397,
4492
+ "grad_norm": 0.341796875,
4493
+ "learning_rate": 0.001,
4494
+ "loss": 1.2682,
4495
+ "step": 8876
4496
+ },
4497
+ {
4498
+ "epoch": 0.6209943593594468,
4499
+ "grad_norm": 0.51953125,
4500
+ "learning_rate": 0.001,
4501
+ "loss": 1.2751,
4502
+ "step": 8890
4503
+ },
4504
+ {
4505
+ "epoch": 0.6219723032324538,
4506
+ "grad_norm": 0.41015625,
4507
+ "learning_rate": 0.001,
4508
+ "loss": 1.2666,
4509
+ "step": 8904
4510
+ },
4511
+ {
4512
+ "epoch": 0.6229502471054608,
4513
+ "grad_norm": 0.4140625,
4514
+ "learning_rate": 0.001,
4515
+ "loss": 1.2618,
4516
+ "step": 8918
4517
+ },
4518
+ {
4519
+ "epoch": 0.6239281909784677,
4520
+ "grad_norm": 0.267578125,
4521
+ "learning_rate": 0.001,
4522
+ "loss": 1.2721,
4523
+ "step": 8932
4524
+ },
4525
+ {
4526
+ "epoch": 0.6249061348514747,
4527
+ "grad_norm": 0.3515625,
4528
+ "learning_rate": 0.001,
4529
+ "loss": 1.2528,
4530
+ "step": 8946
4531
+ },
4532
+ {
4533
+ "epoch": 0.6258840787244818,
4534
+ "grad_norm": 0.34375,
4535
+ "learning_rate": 0.001,
4536
+ "loss": 1.2771,
4537
+ "step": 8960
4538
+ },
4539
+ {
4540
+ "epoch": 0.6268620225974888,
4541
+ "grad_norm": 0.275390625,
4542
+ "learning_rate": 0.001,
4543
+ "loss": 1.2751,
4544
+ "step": 8974
4545
+ },
4546
+ {
4547
+ "epoch": 0.6278399664704958,
4548
+ "grad_norm": 0.28515625,
4549
+ "learning_rate": 0.001,
4550
+ "loss": 1.2749,
4551
+ "step": 8988
4552
+ },
4553
+ {
4554
+ "epoch": 0.6288179103435028,
4555
+ "grad_norm": 0.27734375,
4556
+ "learning_rate": 0.001,
4557
+ "loss": 1.2851,
4558
+ "step": 9002
4559
+ },
4560
+ {
4561
+ "epoch": 0.6297958542165097,
4562
+ "grad_norm": 0.23828125,
4563
+ "learning_rate": 0.001,
4564
+ "loss": 1.2529,
4565
+ "step": 9016
4566
+ },
4567
+ {
4568
+ "epoch": 0.6307737980895168,
4569
+ "grad_norm": 0.2890625,
4570
+ "learning_rate": 0.001,
4571
+ "loss": 1.2673,
4572
+ "step": 9030
4573
+ },
4574
+ {
4575
+ "epoch": 0.6317517419625238,
4576
+ "grad_norm": 0.259765625,
4577
+ "learning_rate": 0.001,
4578
+ "loss": 1.2746,
4579
+ "step": 9044
4580
+ },
4581
+ {
4582
+ "epoch": 0.6327296858355308,
4583
+ "grad_norm": 0.380859375,
4584
+ "learning_rate": 0.001,
4585
+ "loss": 1.2649,
4586
+ "step": 9058
4587
+ },
4588
+ {
4589
+ "epoch": 0.6337076297085378,
4590
+ "grad_norm": 0.318359375,
4591
+ "learning_rate": 0.001,
4592
+ "loss": 1.2849,
4593
+ "step": 9072
4594
+ },
4595
+ {
4596
+ "epoch": 0.6346855735815448,
4597
+ "grad_norm": 0.30078125,
4598
+ "learning_rate": 0.001,
4599
+ "loss": 1.243,
4600
+ "step": 9086
4601
+ },
4602
+ {
4603
+ "epoch": 0.6356635174545519,
4604
+ "grad_norm": 0.375,
4605
+ "learning_rate": 0.001,
4606
+ "loss": 1.2641,
4607
+ "step": 9100
4608
+ },
4609
+ {
4610
+ "epoch": 0.6366414613275588,
4611
+ "grad_norm": 0.361328125,
4612
+ "learning_rate": 0.001,
4613
+ "loss": 1.2554,
4614
+ "step": 9114
4615
+ },
4616
+ {
4617
+ "epoch": 0.6376194052005658,
4618
+ "grad_norm": 0.396484375,
4619
+ "learning_rate": 0.001,
4620
+ "loss": 1.2396,
4621
+ "step": 9128
4622
+ },
4623
+ {
4624
+ "epoch": 0.6385973490735728,
4625
+ "grad_norm": 0.263671875,
4626
+ "learning_rate": 0.001,
4627
+ "loss": 1.2508,
4628
+ "step": 9142
4629
+ },
4630
+ {
4631
+ "epoch": 0.6395752929465798,
4632
+ "grad_norm": 0.33984375,
4633
+ "learning_rate": 0.001,
4634
+ "loss": 1.2772,
4635
+ "step": 9156
4636
+ },
4637
+ {
4638
+ "epoch": 0.6405532368195869,
4639
+ "grad_norm": 0.53515625,
4640
+ "learning_rate": 0.001,
4641
+ "loss": 1.2453,
4642
+ "step": 9170
4643
+ },
4644
+ {
4645
+ "epoch": 0.6415311806925938,
4646
+ "grad_norm": 0.2099609375,
4647
+ "learning_rate": 0.001,
4648
+ "loss": 1.2764,
4649
+ "step": 9184
4650
+ },
4651
+ {
4652
+ "epoch": 0.6425091245656008,
4653
+ "grad_norm": 0.2333984375,
4654
+ "learning_rate": 0.001,
4655
+ "loss": 1.251,
4656
+ "step": 9198
4657
+ },
4658
+ {
4659
+ "epoch": 0.6434870684386078,
4660
+ "grad_norm": 0.35546875,
4661
+ "learning_rate": 0.001,
4662
+ "loss": 1.2855,
4663
+ "step": 9212
4664
+ },
4665
+ {
4666
+ "epoch": 0.6444650123116148,
4667
+ "grad_norm": 1.1953125,
4668
+ "learning_rate": 0.001,
4669
+ "loss": 1.3198,
4670
+ "step": 9226
4671
+ },
4672
+ {
4673
+ "epoch": 0.6454429561846219,
4674
+ "grad_norm": 0.427734375,
4675
+ "learning_rate": 0.001,
4676
+ "loss": 1.2773,
4677
+ "step": 9240
4678
+ },
4679
+ {
4680
+ "epoch": 0.6464209000576289,
4681
+ "grad_norm": 0.5703125,
4682
+ "learning_rate": 0.001,
4683
+ "loss": 1.2786,
4684
+ "step": 9254
4685
+ },
4686
+ {
4687
+ "epoch": 0.6473988439306358,
4688
+ "grad_norm": 0.3125,
4689
+ "learning_rate": 0.001,
4690
+ "loss": 1.2389,
4691
+ "step": 9268
4692
+ },
4693
+ {
4694
+ "epoch": 0.6483767878036428,
4695
+ "grad_norm": 0.36328125,
4696
+ "learning_rate": 0.001,
4697
+ "loss": 1.2587,
4698
+ "step": 9282
4699
+ },
4700
+ {
4701
+ "epoch": 0.6493547316766498,
4702
+ "grad_norm": 0.380859375,
4703
+ "learning_rate": 0.001,
4704
+ "loss": 1.2806,
4705
+ "step": 9296
4706
+ },
4707
+ {
4708
+ "epoch": 0.6503326755496568,
4709
+ "grad_norm": 0.251953125,
4710
+ "learning_rate": 0.001,
4711
+ "loss": 1.2307,
4712
+ "step": 9310
4713
+ },
4714
+ {
4715
+ "epoch": 0.6513106194226639,
4716
+ "grad_norm": 0.263671875,
4717
+ "learning_rate": 0.001,
4718
+ "loss": 1.2657,
4719
+ "step": 9324
4720
+ },
4721
+ {
4722
+ "epoch": 0.6522885632956709,
4723
+ "grad_norm": 0.263671875,
4724
+ "learning_rate": 0.001,
4725
+ "loss": 1.2605,
4726
+ "step": 9338
4727
+ },
4728
+ {
4729
+ "epoch": 0.6532665071686778,
4730
+ "grad_norm": 0.2412109375,
4731
+ "learning_rate": 0.001,
4732
+ "loss": 1.2538,
4733
+ "step": 9352
4734
+ },
4735
+ {
4736
+ "epoch": 0.6542444510416848,
4737
+ "grad_norm": 0.3203125,
4738
+ "learning_rate": 0.001,
4739
+ "loss": 1.2633,
4740
+ "step": 9366
4741
+ },
4742
+ {
4743
+ "epoch": 0.6552223949146918,
4744
+ "grad_norm": 0.31640625,
4745
+ "learning_rate": 0.001,
4746
+ "loss": 1.2582,
4747
+ "step": 9380
4748
+ },
4749
+ {
4750
+ "epoch": 0.6562003387876989,
4751
+ "grad_norm": 0.328125,
4752
+ "learning_rate": 0.001,
4753
+ "loss": 1.2515,
4754
+ "step": 9394
4755
+ },
4756
+ {
4757
+ "epoch": 0.6571782826607059,
4758
+ "grad_norm": 0.33984375,
4759
+ "learning_rate": 0.001,
4760
+ "loss": 1.2679,
4761
+ "step": 9408
4762
+ },
4763
+ {
4764
+ "epoch": 0.6581562265337129,
4765
+ "grad_norm": 0.3828125,
4766
+ "learning_rate": 0.001,
4767
+ "loss": 1.2539,
4768
+ "step": 9422
4769
+ },
4770
+ {
4771
+ "epoch": 0.6591341704067198,
4772
+ "grad_norm": 0.26171875,
4773
+ "learning_rate": 0.001,
4774
+ "loss": 1.2632,
4775
+ "step": 9436
4776
+ },
4777
+ {
4778
+ "epoch": 0.6601121142797268,
4779
+ "grad_norm": 0.3203125,
4780
+ "learning_rate": 0.001,
4781
+ "loss": 1.2946,
4782
+ "step": 9450
4783
+ },
4784
+ {
4785
+ "epoch": 0.6610900581527339,
4786
+ "grad_norm": 0.38671875,
4787
+ "learning_rate": 0.001,
4788
+ "loss": 1.2691,
4789
+ "step": 9464
4790
+ },
4791
+ {
4792
+ "epoch": 0.6620680020257409,
4793
+ "grad_norm": 0.2890625,
4794
+ "learning_rate": 0.001,
4795
+ "loss": 1.246,
4796
+ "step": 9478
4797
+ },
4798
+ {
4799
+ "epoch": 0.6630459458987479,
4800
+ "grad_norm": 0.431640625,
4801
+ "learning_rate": 0.001,
4802
+ "loss": 1.2606,
4803
+ "step": 9492
4804
+ },
4805
+ {
4806
+ "epoch": 0.6640238897717549,
4807
+ "grad_norm": 0.8671875,
4808
+ "learning_rate": 0.001,
4809
+ "loss": 1.2782,
4810
+ "step": 9506
4811
+ },
4812
+ {
4813
+ "epoch": 0.6650018336447618,
4814
+ "grad_norm": 0.4375,
4815
+ "learning_rate": 0.001,
4816
+ "loss": 1.2687,
4817
+ "step": 9520
4818
+ },
4819
+ {
4820
+ "epoch": 0.6659797775177689,
4821
+ "grad_norm": 0.37109375,
4822
+ "learning_rate": 0.001,
4823
+ "loss": 1.2778,
4824
+ "step": 9534
4825
+ },
4826
+ {
4827
+ "epoch": 0.6669577213907759,
4828
+ "grad_norm": 0.2490234375,
4829
+ "learning_rate": 0.001,
4830
+ "loss": 1.2544,
4831
+ "step": 9548
4832
+ },
4833
+ {
4834
+ "epoch": 0.6679356652637829,
4835
+ "grad_norm": 0.291015625,
4836
+ "learning_rate": 0.001,
4837
+ "loss": 1.246,
4838
+ "step": 9562
4839
+ },
4840
+ {
4841
+ "epoch": 0.6689136091367899,
4842
+ "grad_norm": 0.287109375,
4843
+ "learning_rate": 0.001,
4844
+ "loss": 1.2577,
4845
+ "step": 9576
4846
+ },
4847
+ {
4848
+ "epoch": 0.6698915530097969,
4849
+ "grad_norm": 0.328125,
4850
+ "learning_rate": 0.001,
4851
+ "loss": 1.2639,
4852
+ "step": 9590
4853
+ },
4854
+ {
4855
+ "epoch": 0.670869496882804,
4856
+ "grad_norm": 0.271484375,
4857
+ "learning_rate": 0.001,
4858
+ "loss": 1.2493,
4859
+ "step": 9604
4860
+ },
4861
+ {
4862
+ "epoch": 0.6718474407558109,
4863
+ "grad_norm": 0.2119140625,
4864
+ "learning_rate": 0.001,
4865
+ "loss": 1.2586,
4866
+ "step": 9618
4867
+ },
4868
+ {
4869
+ "epoch": 0.6728253846288179,
4870
+ "grad_norm": 0.240234375,
4871
+ "learning_rate": 0.001,
4872
+ "loss": 1.2719,
4873
+ "step": 9632
4874
+ },
4875
+ {
4876
+ "epoch": 0.6738033285018249,
4877
+ "grad_norm": 0.294921875,
4878
+ "learning_rate": 0.001,
4879
+ "loss": 1.2272,
4880
+ "step": 9646
4881
+ },
4882
+ {
4883
+ "epoch": 0.6747812723748319,
4884
+ "grad_norm": 0.423828125,
4885
+ "learning_rate": 0.001,
4886
+ "loss": 1.2667,
4887
+ "step": 9660
4888
+ },
4889
+ {
4890
+ "epoch": 0.675759216247839,
4891
+ "grad_norm": 0.30859375,
4892
+ "learning_rate": 0.001,
4893
+ "loss": 1.2567,
4894
+ "step": 9674
4895
+ },
4896
+ {
4897
+ "epoch": 0.676737160120846,
4898
+ "grad_norm": 0.21875,
4899
+ "learning_rate": 0.001,
4900
+ "loss": 1.2403,
4901
+ "step": 9688
4902
+ },
4903
+ {
4904
+ "epoch": 0.6777151039938529,
4905
+ "grad_norm": 0.2490234375,
4906
+ "learning_rate": 0.001,
4907
+ "loss": 1.2642,
4908
+ "step": 9702
4909
+ },
4910
+ {
4911
+ "epoch": 0.6786930478668599,
4912
+ "grad_norm": 0.248046875,
4913
+ "learning_rate": 0.001,
4914
+ "loss": 1.2123,
4915
+ "step": 9716
4916
+ },
4917
+ {
4918
+ "epoch": 0.6796709917398669,
4919
+ "grad_norm": 0.271484375,
4920
+ "learning_rate": 0.001,
4921
+ "loss": 1.2413,
4922
+ "step": 9730
4923
+ },
4924
+ {
4925
+ "epoch": 0.680648935612874,
4926
+ "grad_norm": 0.259765625,
4927
+ "learning_rate": 0.001,
4928
+ "loss": 1.2442,
4929
+ "step": 9744
4930
+ },
4931
+ {
4932
+ "epoch": 0.681626879485881,
4933
+ "grad_norm": 0.271484375,
4934
+ "learning_rate": 0.001,
4935
+ "loss": 1.2298,
4936
+ "step": 9758
4937
+ },
4938
+ {
4939
+ "epoch": 0.682604823358888,
4940
+ "grad_norm": 0.232421875,
4941
+ "learning_rate": 0.001,
4942
+ "loss": 1.2421,
4943
+ "step": 9772
4944
+ },
4945
+ {
4946
+ "epoch": 0.6835827672318949,
4947
+ "grad_norm": 0.302734375,
4948
+ "learning_rate": 0.001,
4949
+ "loss": 1.2725,
4950
+ "step": 9786
4951
+ },
4952
+ {
4953
+ "epoch": 0.6845607111049019,
4954
+ "grad_norm": 0.267578125,
4955
+ "learning_rate": 0.001,
4956
+ "loss": 1.2417,
4957
+ "step": 9800
4958
+ },
4959
+ {
4960
+ "epoch": 0.685538654977909,
4961
+ "grad_norm": 0.232421875,
4962
+ "learning_rate": 0.001,
4963
+ "loss": 1.2526,
4964
+ "step": 9814
4965
+ },
4966
+ {
4967
+ "epoch": 0.686516598850916,
4968
+ "grad_norm": 0.265625,
4969
+ "learning_rate": 0.001,
4970
+ "loss": 1.2352,
4971
+ "step": 9828
4972
+ },
4973
+ {
4974
+ "epoch": 0.687494542723923,
4975
+ "grad_norm": 0.361328125,
4976
+ "learning_rate": 0.001,
4977
+ "loss": 1.2653,
4978
+ "step": 9842
4979
+ },
4980
+ {
4981
+ "epoch": 0.68847248659693,
4982
+ "grad_norm": 0.328125,
4983
+ "learning_rate": 0.001,
4984
+ "loss": 1.2569,
4985
+ "step": 9856
4986
+ },
4987
+ {
4988
+ "epoch": 0.6894504304699369,
4989
+ "grad_norm": 0.26171875,
4990
+ "learning_rate": 0.001,
4991
+ "loss": 1.248,
4992
+ "step": 9870
4993
+ },
4994
+ {
4995
+ "epoch": 0.6904283743429439,
4996
+ "grad_norm": 0.265625,
4997
+ "learning_rate": 0.001,
4998
+ "loss": 1.2864,
4999
+ "step": 9884
5000
+ },
5001
+ {
5002
+ "epoch": 0.691406318215951,
5003
+ "grad_norm": 0.345703125,
5004
+ "learning_rate": 0.001,
5005
+ "loss": 1.2663,
5006
+ "step": 9898
5007
+ },
5008
+ {
5009
+ "epoch": 0.692384262088958,
5010
+ "grad_norm": 0.36328125,
5011
+ "learning_rate": 0.001,
5012
+ "loss": 1.2534,
5013
+ "step": 9912
5014
+ },
5015
+ {
5016
+ "epoch": 0.693362205961965,
5017
+ "grad_norm": 0.333984375,
5018
+ "learning_rate": 0.001,
5019
+ "loss": 1.2439,
5020
+ "step": 9926
5021
+ },
5022
+ {
5023
+ "epoch": 0.6943401498349719,
5024
+ "grad_norm": 0.298828125,
5025
+ "learning_rate": 0.001,
5026
+ "loss": 1.2592,
5027
+ "step": 9940
5028
+ },
5029
+ {
5030
+ "epoch": 0.6953180937079789,
5031
+ "grad_norm": 0.3046875,
5032
+ "learning_rate": 0.001,
5033
+ "loss": 1.2655,
5034
+ "step": 9954
5035
+ },
5036
+ {
5037
+ "epoch": 0.696296037580986,
5038
+ "grad_norm": 0.265625,
5039
+ "learning_rate": 0.001,
5040
+ "loss": 1.2569,
5041
+ "step": 9968
5042
+ },
5043
+ {
5044
+ "epoch": 0.697273981453993,
5045
+ "grad_norm": 0.376953125,
5046
+ "learning_rate": 0.001,
5047
+ "loss": 1.2663,
5048
+ "step": 9982
5049
+ },
5050
+ {
5051
+ "epoch": 0.698251925327,
5052
+ "grad_norm": 0.291015625,
5053
+ "learning_rate": 0.001,
5054
+ "loss": 1.2419,
5055
+ "step": 9996
5056
+ },
5057
+ {
5058
+ "epoch": 0.699229869200007,
5059
+ "grad_norm": 0.251953125,
5060
+ "learning_rate": 0.001,
5061
+ "loss": 1.2406,
5062
+ "step": 10010
5063
+ },
5064
+ {
5065
+ "epoch": 0.6997188411365105,
5066
+ "eval_loss": 1.6404287815093994,
5067
+ "eval_runtime": 9.1224,
5068
+ "eval_samples_per_second": 109.621,
5069
+ "eval_steps_per_second": 1.425,
5070
+ "step": 10017
5071
  }
5072
  ],
5073
  "logging_steps": 14,
 
5087
  "attributes": {}
5088
  }
5089
  },
5090
+ "total_flos": 2.66909141699448e+18,
5091
  "train_batch_size": 8,
5092
  "trial_name": null,
5093
  "trial_params": null