gabrielaltay commited on
Commit
c2bb426
1 Parent(s): b1cea36

Training in progress, step 8256, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc14c82541dcf336920bd71d895536d6efea8914fde961d7492c52f5cc9445ca
3
  size 439648328
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8555c6c30b01b7b518e204262ad49bdfc8a647ffac30864d51ee8c8057b5b58b
3
  size 439648328
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:20f7a55feca53738b4038f4f0b136313282c8cd5a2504a1dbddc6a4ea258d90c
3
  size 879415866
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c65419cfd6a880659ccadbe3db88894c9a0bc93ac16d9c1b7a0c6cf2cbc2b395
3
  size 879415866
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0bf1f36b851a2d974317dd34c21b7dfaf3e62f7b39845e5315f2a309b7bb7d5d
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5528a8a5438254c67bb6f375f3876eeca26717fef489265e3b041c5387c9fb8f
3
  size 14512
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd7e66b68f061e5603332411190d26a9de796e51e2357e8d4f6ddade728d13e3
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc1d87868b3d95ab9fb053bc3e7b7216c1360a2d6ef559d5a4f71fdb1eb48e41
3
  size 14512
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f3f104878651357835ecfee02db6abafb4d96c7a0b56ef61cddd46d8721370a3
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c699c45754dba9f295f88b976126b3ed2ecc4605b1af134d5e1f2b88049fd75b
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7004072134962187,
5
  "eval_steps": 500,
6
- "global_step": 7224,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -12649,6 +12649,1812 @@
12649
  "learning_rate": 1.4979639325189065e-05,
12650
  "loss": 5.2557,
12651
  "step": 7224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12652
  }
12653
  ],
12654
  "logging_steps": 4,
@@ -12656,7 +14462,7 @@
12656
  "num_input_tokens_seen": 0,
12657
  "num_train_epochs": 1,
12658
  "save_steps": 1032,
12659
- "total_flos": 6.084453063878246e+16,
12660
  "train_batch_size": 8,
12661
  "trial_name": null,
12662
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.8004653868528214,
5
  "eval_steps": 500,
6
+ "global_step": 8256,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
12649
  "learning_rate": 1.4979639325189065e-05,
12650
  "loss": 5.2557,
12651
  "step": 7224
12652
+ },
12653
+ {
12654
+ "epoch": 0.7,
12655
+ "grad_norm": 1.0959160327911377,
12656
+ "learning_rate": 1.4960248206321506e-05,
12657
+ "loss": 5.2666,
12658
+ "step": 7228
12659
+ },
12660
+ {
12661
+ "epoch": 0.7,
12662
+ "grad_norm": 1.0157089233398438,
12663
+ "learning_rate": 1.4940857087453946e-05,
12664
+ "loss": 5.3451,
12665
+ "step": 7232
12666
+ },
12667
+ {
12668
+ "epoch": 0.7,
12669
+ "grad_norm": 1.1046866178512573,
12670
+ "learning_rate": 1.4921465968586387e-05,
12671
+ "loss": 5.2349,
12672
+ "step": 7236
12673
+ },
12674
+ {
12675
+ "epoch": 0.7,
12676
+ "grad_norm": 1.0781642198562622,
12677
+ "learning_rate": 1.4902074849718831e-05,
12678
+ "loss": 5.2609,
12679
+ "step": 7240
12680
+ },
12681
+ {
12682
+ "epoch": 0.7,
12683
+ "grad_norm": 1.0019387006759644,
12684
+ "learning_rate": 1.4882683730851271e-05,
12685
+ "loss": 5.3954,
12686
+ "step": 7244
12687
+ },
12688
+ {
12689
+ "epoch": 0.7,
12690
+ "grad_norm": 1.1083266735076904,
12691
+ "learning_rate": 1.4863292611983712e-05,
12692
+ "loss": 5.2636,
12693
+ "step": 7248
12694
+ },
12695
+ {
12696
+ "epoch": 0.7,
12697
+ "grad_norm": 1.2309002876281738,
12698
+ "learning_rate": 1.4843901493116152e-05,
12699
+ "loss": 5.2955,
12700
+ "step": 7252
12701
+ },
12702
+ {
12703
+ "epoch": 0.7,
12704
+ "grad_norm": 1.087774634361267,
12705
+ "learning_rate": 1.4824510374248596e-05,
12706
+ "loss": 5.3,
12707
+ "step": 7256
12708
+ },
12709
+ {
12710
+ "epoch": 0.7,
12711
+ "grad_norm": 1.075287938117981,
12712
+ "learning_rate": 1.4805119255381037e-05,
12713
+ "loss": 5.3727,
12714
+ "step": 7260
12715
+ },
12716
+ {
12717
+ "epoch": 0.7,
12718
+ "grad_norm": 1.0246081352233887,
12719
+ "learning_rate": 1.4785728136513477e-05,
12720
+ "loss": 5.3558,
12721
+ "step": 7264
12722
+ },
12723
+ {
12724
+ "epoch": 0.7,
12725
+ "grad_norm": 1.124543309211731,
12726
+ "learning_rate": 1.4766337017645918e-05,
12727
+ "loss": 5.3379,
12728
+ "step": 7268
12729
+ },
12730
+ {
12731
+ "epoch": 0.71,
12732
+ "grad_norm": 1.047892689704895,
12733
+ "learning_rate": 1.474694589877836e-05,
12734
+ "loss": 5.3975,
12735
+ "step": 7272
12736
+ },
12737
+ {
12738
+ "epoch": 0.71,
12739
+ "grad_norm": 1.0381947755813599,
12740
+ "learning_rate": 1.47275547799108e-05,
12741
+ "loss": 5.3515,
12742
+ "step": 7276
12743
+ },
12744
+ {
12745
+ "epoch": 0.71,
12746
+ "grad_norm": 1.0230307579040527,
12747
+ "learning_rate": 1.4708163661043243e-05,
12748
+ "loss": 5.2925,
12749
+ "step": 7280
12750
+ },
12751
+ {
12752
+ "epoch": 0.71,
12753
+ "grad_norm": 1.0596458911895752,
12754
+ "learning_rate": 1.4688772542175685e-05,
12755
+ "loss": 5.3191,
12756
+ "step": 7284
12757
+ },
12758
+ {
12759
+ "epoch": 0.71,
12760
+ "grad_norm": 1.1031346321105957,
12761
+ "learning_rate": 1.4669381423308126e-05,
12762
+ "loss": 5.402,
12763
+ "step": 7288
12764
+ },
12765
+ {
12766
+ "epoch": 0.71,
12767
+ "grad_norm": 1.0289580821990967,
12768
+ "learning_rate": 1.4649990304440566e-05,
12769
+ "loss": 5.3957,
12770
+ "step": 7292
12771
+ },
12772
+ {
12773
+ "epoch": 0.71,
12774
+ "grad_norm": 1.1469511985778809,
12775
+ "learning_rate": 1.4630599185573007e-05,
12776
+ "loss": 5.3244,
12777
+ "step": 7296
12778
+ },
12779
+ {
12780
+ "epoch": 0.71,
12781
+ "grad_norm": 1.0669410228729248,
12782
+ "learning_rate": 1.461120806670545e-05,
12783
+ "loss": 5.431,
12784
+ "step": 7300
12785
+ },
12786
+ {
12787
+ "epoch": 0.71,
12788
+ "grad_norm": 1.05574631690979,
12789
+ "learning_rate": 1.4591816947837891e-05,
12790
+ "loss": 5.3382,
12791
+ "step": 7304
12792
+ },
12793
+ {
12794
+ "epoch": 0.71,
12795
+ "grad_norm": 1.0296452045440674,
12796
+ "learning_rate": 1.4572425828970332e-05,
12797
+ "loss": 5.337,
12798
+ "step": 7308
12799
+ },
12800
+ {
12801
+ "epoch": 0.71,
12802
+ "grad_norm": 1.0180591344833374,
12803
+ "learning_rate": 1.4553034710102772e-05,
12804
+ "loss": 5.3239,
12805
+ "step": 7312
12806
+ },
12807
+ {
12808
+ "epoch": 0.71,
12809
+ "grad_norm": 1.0508371591567993,
12810
+ "learning_rate": 1.4533643591235216e-05,
12811
+ "loss": 5.2944,
12812
+ "step": 7316
12813
+ },
12814
+ {
12815
+ "epoch": 0.71,
12816
+ "grad_norm": 1.0255225896835327,
12817
+ "learning_rate": 1.4514252472367657e-05,
12818
+ "loss": 5.3574,
12819
+ "step": 7320
12820
+ },
12821
+ {
12822
+ "epoch": 0.71,
12823
+ "grad_norm": 1.0599967241287231,
12824
+ "learning_rate": 1.4494861353500097e-05,
12825
+ "loss": 5.3568,
12826
+ "step": 7324
12827
+ },
12828
+ {
12829
+ "epoch": 0.71,
12830
+ "grad_norm": 0.9832557439804077,
12831
+ "learning_rate": 1.4475470234632538e-05,
12832
+ "loss": 5.2891,
12833
+ "step": 7328
12834
+ },
12835
+ {
12836
+ "epoch": 0.71,
12837
+ "grad_norm": 1.0541869401931763,
12838
+ "learning_rate": 1.4456079115764982e-05,
12839
+ "loss": 5.2502,
12840
+ "step": 7332
12841
+ },
12842
+ {
12843
+ "epoch": 0.71,
12844
+ "grad_norm": 1.069973111152649,
12845
+ "learning_rate": 1.4436687996897422e-05,
12846
+ "loss": 5.3376,
12847
+ "step": 7336
12848
+ },
12849
+ {
12850
+ "epoch": 0.71,
12851
+ "grad_norm": 1.0768502950668335,
12852
+ "learning_rate": 1.4417296878029863e-05,
12853
+ "loss": 5.2392,
12854
+ "step": 7340
12855
+ },
12856
+ {
12857
+ "epoch": 0.71,
12858
+ "grad_norm": 1.000628113746643,
12859
+ "learning_rate": 1.4397905759162305e-05,
12860
+ "loss": 5.3293,
12861
+ "step": 7344
12862
+ },
12863
+ {
12864
+ "epoch": 0.71,
12865
+ "grad_norm": 0.9960778951644897,
12866
+ "learning_rate": 1.4378514640294746e-05,
12867
+ "loss": 5.2655,
12868
+ "step": 7348
12869
+ },
12870
+ {
12871
+ "epoch": 0.71,
12872
+ "grad_norm": 1.0327279567718506,
12873
+ "learning_rate": 1.4359123521427186e-05,
12874
+ "loss": 5.3477,
12875
+ "step": 7352
12876
+ },
12877
+ {
12878
+ "epoch": 0.71,
12879
+ "grad_norm": 1.0809035301208496,
12880
+ "learning_rate": 1.4339732402559627e-05,
12881
+ "loss": 5.331,
12882
+ "step": 7356
12883
+ },
12884
+ {
12885
+ "epoch": 0.71,
12886
+ "grad_norm": 1.0690807104110718,
12887
+ "learning_rate": 1.432034128369207e-05,
12888
+ "loss": 5.3986,
12889
+ "step": 7360
12890
+ },
12891
+ {
12892
+ "epoch": 0.71,
12893
+ "grad_norm": 1.0245548486709595,
12894
+ "learning_rate": 1.4300950164824511e-05,
12895
+ "loss": 5.3024,
12896
+ "step": 7364
12897
+ },
12898
+ {
12899
+ "epoch": 0.71,
12900
+ "grad_norm": 0.9999493956565857,
12901
+ "learning_rate": 1.4281559045956952e-05,
12902
+ "loss": 5.2878,
12903
+ "step": 7368
12904
+ },
12905
+ {
12906
+ "epoch": 0.71,
12907
+ "grad_norm": 1.0037769079208374,
12908
+ "learning_rate": 1.4262167927089392e-05,
12909
+ "loss": 5.2216,
12910
+ "step": 7372
12911
+ },
12912
+ {
12913
+ "epoch": 0.72,
12914
+ "grad_norm": 1.002113699913025,
12915
+ "learning_rate": 1.4242776808221836e-05,
12916
+ "loss": 5.4347,
12917
+ "step": 7376
12918
+ },
12919
+ {
12920
+ "epoch": 0.72,
12921
+ "grad_norm": 1.04192054271698,
12922
+ "learning_rate": 1.4223385689354277e-05,
12923
+ "loss": 5.3812,
12924
+ "step": 7380
12925
+ },
12926
+ {
12927
+ "epoch": 0.72,
12928
+ "grad_norm": 1.0271295309066772,
12929
+ "learning_rate": 1.4203994570486717e-05,
12930
+ "loss": 5.427,
12931
+ "step": 7384
12932
+ },
12933
+ {
12934
+ "epoch": 0.72,
12935
+ "grad_norm": 1.0071477890014648,
12936
+ "learning_rate": 1.4184603451619158e-05,
12937
+ "loss": 5.2727,
12938
+ "step": 7388
12939
+ },
12940
+ {
12941
+ "epoch": 0.72,
12942
+ "grad_norm": 1.0228255987167358,
12943
+ "learning_rate": 1.4165212332751602e-05,
12944
+ "loss": 5.3244,
12945
+ "step": 7392
12946
+ },
12947
+ {
12948
+ "epoch": 0.72,
12949
+ "grad_norm": 1.0034810304641724,
12950
+ "learning_rate": 1.4145821213884042e-05,
12951
+ "loss": 5.3883,
12952
+ "step": 7396
12953
+ },
12954
+ {
12955
+ "epoch": 0.72,
12956
+ "grad_norm": 1.0305688381195068,
12957
+ "learning_rate": 1.4126430095016483e-05,
12958
+ "loss": 5.3851,
12959
+ "step": 7400
12960
+ },
12961
+ {
12962
+ "epoch": 0.72,
12963
+ "grad_norm": 1.074646234512329,
12964
+ "learning_rate": 1.4107038976148925e-05,
12965
+ "loss": 5.2406,
12966
+ "step": 7404
12967
+ },
12968
+ {
12969
+ "epoch": 0.72,
12970
+ "grad_norm": 0.9961770176887512,
12971
+ "learning_rate": 1.4087647857281366e-05,
12972
+ "loss": 5.2848,
12973
+ "step": 7408
12974
+ },
12975
+ {
12976
+ "epoch": 0.72,
12977
+ "grad_norm": 1.034627079963684,
12978
+ "learning_rate": 1.4068256738413808e-05,
12979
+ "loss": 5.2984,
12980
+ "step": 7412
12981
+ },
12982
+ {
12983
+ "epoch": 0.72,
12984
+ "grad_norm": 1.072096824645996,
12985
+ "learning_rate": 1.4048865619546248e-05,
12986
+ "loss": 5.3182,
12987
+ "step": 7416
12988
+ },
12989
+ {
12990
+ "epoch": 0.72,
12991
+ "grad_norm": 1.0899096727371216,
12992
+ "learning_rate": 1.402947450067869e-05,
12993
+ "loss": 5.3017,
12994
+ "step": 7420
12995
+ },
12996
+ {
12997
+ "epoch": 0.72,
12998
+ "grad_norm": 1.1063376665115356,
12999
+ "learning_rate": 1.4010083381811131e-05,
13000
+ "loss": 5.311,
13001
+ "step": 7424
13002
+ },
13003
+ {
13004
+ "epoch": 0.72,
13005
+ "grad_norm": 1.0748202800750732,
13006
+ "learning_rate": 1.3990692262943572e-05,
13007
+ "loss": 5.318,
13008
+ "step": 7428
13009
+ },
13010
+ {
13011
+ "epoch": 0.72,
13012
+ "grad_norm": 1.0851057767868042,
13013
+ "learning_rate": 1.3971301144076012e-05,
13014
+ "loss": 5.3425,
13015
+ "step": 7432
13016
+ },
13017
+ {
13018
+ "epoch": 0.72,
13019
+ "grad_norm": 0.9942495822906494,
13020
+ "learning_rate": 1.3951910025208456e-05,
13021
+ "loss": 5.3258,
13022
+ "step": 7436
13023
+ },
13024
+ {
13025
+ "epoch": 0.72,
13026
+ "grad_norm": 1.0894228219985962,
13027
+ "learning_rate": 1.3932518906340897e-05,
13028
+ "loss": 5.2769,
13029
+ "step": 7440
13030
+ },
13031
+ {
13032
+ "epoch": 0.72,
13033
+ "grad_norm": 0.9603523015975952,
13034
+ "learning_rate": 1.3913127787473337e-05,
13035
+ "loss": 5.4087,
13036
+ "step": 7444
13037
+ },
13038
+ {
13039
+ "epoch": 0.72,
13040
+ "grad_norm": 1.1049365997314453,
13041
+ "learning_rate": 1.3893736668605778e-05,
13042
+ "loss": 5.339,
13043
+ "step": 7448
13044
+ },
13045
+ {
13046
+ "epoch": 0.72,
13047
+ "grad_norm": 1.021468162536621,
13048
+ "learning_rate": 1.3874345549738222e-05,
13049
+ "loss": 5.3447,
13050
+ "step": 7452
13051
+ },
13052
+ {
13053
+ "epoch": 0.72,
13054
+ "grad_norm": 1.088437557220459,
13055
+ "learning_rate": 1.3854954430870662e-05,
13056
+ "loss": 5.3546,
13057
+ "step": 7456
13058
+ },
13059
+ {
13060
+ "epoch": 0.72,
13061
+ "grad_norm": 1.0266107320785522,
13062
+ "learning_rate": 1.3835563312003103e-05,
13063
+ "loss": 5.1867,
13064
+ "step": 7460
13065
+ },
13066
+ {
13067
+ "epoch": 0.72,
13068
+ "grad_norm": 1.088911533355713,
13069
+ "learning_rate": 1.3816172193135547e-05,
13070
+ "loss": 5.3247,
13071
+ "step": 7464
13072
+ },
13073
+ {
13074
+ "epoch": 0.72,
13075
+ "grad_norm": 1.0675928592681885,
13076
+ "learning_rate": 1.3796781074267987e-05,
13077
+ "loss": 5.2807,
13078
+ "step": 7468
13079
+ },
13080
+ {
13081
+ "epoch": 0.72,
13082
+ "grad_norm": 0.9511438608169556,
13083
+ "learning_rate": 1.3777389955400428e-05,
13084
+ "loss": 5.3774,
13085
+ "step": 7472
13086
+ },
13087
+ {
13088
+ "epoch": 0.72,
13089
+ "grad_norm": 1.0546114444732666,
13090
+ "learning_rate": 1.3757998836532868e-05,
13091
+ "loss": 5.2856,
13092
+ "step": 7476
13093
+ },
13094
+ {
13095
+ "epoch": 0.73,
13096
+ "grad_norm": 1.0445231199264526,
13097
+ "learning_rate": 1.373860771766531e-05,
13098
+ "loss": 5.3805,
13099
+ "step": 7480
13100
+ },
13101
+ {
13102
+ "epoch": 0.73,
13103
+ "grad_norm": 1.0493693351745605,
13104
+ "learning_rate": 1.3719216598797751e-05,
13105
+ "loss": 5.3603,
13106
+ "step": 7484
13107
+ },
13108
+ {
13109
+ "epoch": 0.73,
13110
+ "grad_norm": 1.0047773122787476,
13111
+ "learning_rate": 1.3699825479930192e-05,
13112
+ "loss": 5.3224,
13113
+ "step": 7488
13114
+ },
13115
+ {
13116
+ "epoch": 0.73,
13117
+ "grad_norm": 1.0536508560180664,
13118
+ "learning_rate": 1.3680434361062632e-05,
13119
+ "loss": 5.4235,
13120
+ "step": 7492
13121
+ },
13122
+ {
13123
+ "epoch": 0.73,
13124
+ "grad_norm": 1.0347819328308105,
13125
+ "learning_rate": 1.3661043242195076e-05,
13126
+ "loss": 5.2528,
13127
+ "step": 7496
13128
+ },
13129
+ {
13130
+ "epoch": 0.73,
13131
+ "grad_norm": 1.0392394065856934,
13132
+ "learning_rate": 1.3641652123327517e-05,
13133
+ "loss": 5.4127,
13134
+ "step": 7500
13135
+ },
13136
+ {
13137
+ "epoch": 0.73,
13138
+ "grad_norm": 1.0930792093276978,
13139
+ "learning_rate": 1.3622261004459957e-05,
13140
+ "loss": 5.3221,
13141
+ "step": 7504
13142
+ },
13143
+ {
13144
+ "epoch": 0.73,
13145
+ "grad_norm": 0.9962918758392334,
13146
+ "learning_rate": 1.3602869885592398e-05,
13147
+ "loss": 5.3108,
13148
+ "step": 7508
13149
+ },
13150
+ {
13151
+ "epoch": 0.73,
13152
+ "grad_norm": 1.030331015586853,
13153
+ "learning_rate": 1.3583478766724842e-05,
13154
+ "loss": 5.2632,
13155
+ "step": 7512
13156
+ },
13157
+ {
13158
+ "epoch": 0.73,
13159
+ "grad_norm": 1.008636236190796,
13160
+ "learning_rate": 1.3564087647857282e-05,
13161
+ "loss": 5.3089,
13162
+ "step": 7516
13163
+ },
13164
+ {
13165
+ "epoch": 0.73,
13166
+ "grad_norm": 1.006934642791748,
13167
+ "learning_rate": 1.3544696528989723e-05,
13168
+ "loss": 5.2903,
13169
+ "step": 7520
13170
+ },
13171
+ {
13172
+ "epoch": 0.73,
13173
+ "grad_norm": 1.0194462537765503,
13174
+ "learning_rate": 1.3525305410122167e-05,
13175
+ "loss": 5.3827,
13176
+ "step": 7524
13177
+ },
13178
+ {
13179
+ "epoch": 0.73,
13180
+ "grad_norm": 0.9879323840141296,
13181
+ "learning_rate": 1.3505914291254607e-05,
13182
+ "loss": 5.4307,
13183
+ "step": 7528
13184
+ },
13185
+ {
13186
+ "epoch": 0.73,
13187
+ "grad_norm": 1.0651185512542725,
13188
+ "learning_rate": 1.3486523172387048e-05,
13189
+ "loss": 5.3134,
13190
+ "step": 7532
13191
+ },
13192
+ {
13193
+ "epoch": 0.73,
13194
+ "grad_norm": 1.0858125686645508,
13195
+ "learning_rate": 1.3467132053519488e-05,
13196
+ "loss": 5.3024,
13197
+ "step": 7536
13198
+ },
13199
+ {
13200
+ "epoch": 0.73,
13201
+ "grad_norm": 1.0240780115127563,
13202
+ "learning_rate": 1.344774093465193e-05,
13203
+ "loss": 5.3555,
13204
+ "step": 7540
13205
+ },
13206
+ {
13207
+ "epoch": 0.73,
13208
+ "grad_norm": 1.0629985332489014,
13209
+ "learning_rate": 1.3428349815784373e-05,
13210
+ "loss": 5.3652,
13211
+ "step": 7544
13212
+ },
13213
+ {
13214
+ "epoch": 0.73,
13215
+ "grad_norm": 1.1222716569900513,
13216
+ "learning_rate": 1.3408958696916813e-05,
13217
+ "loss": 5.3845,
13218
+ "step": 7548
13219
+ },
13220
+ {
13221
+ "epoch": 0.73,
13222
+ "grad_norm": 1.0008291006088257,
13223
+ "learning_rate": 1.3389567578049254e-05,
13224
+ "loss": 5.3552,
13225
+ "step": 7552
13226
+ },
13227
+ {
13228
+ "epoch": 0.73,
13229
+ "grad_norm": 1.0860753059387207,
13230
+ "learning_rate": 1.3370176459181696e-05,
13231
+ "loss": 5.3044,
13232
+ "step": 7556
13233
+ },
13234
+ {
13235
+ "epoch": 0.73,
13236
+ "grad_norm": 1.0495448112487793,
13237
+ "learning_rate": 1.3350785340314136e-05,
13238
+ "loss": 5.3259,
13239
+ "step": 7560
13240
+ },
13241
+ {
13242
+ "epoch": 0.73,
13243
+ "grad_norm": 1.0432664155960083,
13244
+ "learning_rate": 1.3331394221446577e-05,
13245
+ "loss": 5.3482,
13246
+ "step": 7564
13247
+ },
13248
+ {
13249
+ "epoch": 0.73,
13250
+ "grad_norm": 1.0440526008605957,
13251
+ "learning_rate": 1.3312003102579018e-05,
13252
+ "loss": 5.2362,
13253
+ "step": 7568
13254
+ },
13255
+ {
13256
+ "epoch": 0.73,
13257
+ "grad_norm": 1.0317118167877197,
13258
+ "learning_rate": 1.3292611983711461e-05,
13259
+ "loss": 5.3579,
13260
+ "step": 7572
13261
+ },
13262
+ {
13263
+ "epoch": 0.73,
13264
+ "grad_norm": 0.9613714218139648,
13265
+ "learning_rate": 1.3273220864843902e-05,
13266
+ "loss": 5.3631,
13267
+ "step": 7576
13268
+ },
13269
+ {
13270
+ "epoch": 0.73,
13271
+ "grad_norm": 1.1608860492706299,
13272
+ "learning_rate": 1.3253829745976343e-05,
13273
+ "loss": 5.3547,
13274
+ "step": 7580
13275
+ },
13276
+ {
13277
+ "epoch": 0.74,
13278
+ "grad_norm": 1.06599760055542,
13279
+ "learning_rate": 1.3234438627108786e-05,
13280
+ "loss": 5.3728,
13281
+ "step": 7584
13282
+ },
13283
+ {
13284
+ "epoch": 0.74,
13285
+ "grad_norm": 1.0115044116973877,
13286
+ "learning_rate": 1.3215047508241227e-05,
13287
+ "loss": 5.2954,
13288
+ "step": 7588
13289
+ },
13290
+ {
13291
+ "epoch": 0.74,
13292
+ "grad_norm": 1.0504167079925537,
13293
+ "learning_rate": 1.3195656389373668e-05,
13294
+ "loss": 5.3378,
13295
+ "step": 7592
13296
+ },
13297
+ {
13298
+ "epoch": 0.74,
13299
+ "grad_norm": 1.1047917604446411,
13300
+ "learning_rate": 1.3176265270506108e-05,
13301
+ "loss": 5.3153,
13302
+ "step": 7596
13303
+ },
13304
+ {
13305
+ "epoch": 0.74,
13306
+ "grad_norm": 1.0862175226211548,
13307
+ "learning_rate": 1.3156874151638552e-05,
13308
+ "loss": 5.4315,
13309
+ "step": 7600
13310
+ },
13311
+ {
13312
+ "epoch": 0.74,
13313
+ "grad_norm": 1.05397629737854,
13314
+ "learning_rate": 1.3137483032770993e-05,
13315
+ "loss": 5.3807,
13316
+ "step": 7604
13317
+ },
13318
+ {
13319
+ "epoch": 0.74,
13320
+ "grad_norm": 1.0143108367919922,
13321
+ "learning_rate": 1.3118091913903433e-05,
13322
+ "loss": 5.346,
13323
+ "step": 7608
13324
+ },
13325
+ {
13326
+ "epoch": 0.74,
13327
+ "grad_norm": 1.0487464666366577,
13328
+ "learning_rate": 1.3098700795035874e-05,
13329
+ "loss": 5.2966,
13330
+ "step": 7612
13331
+ },
13332
+ {
13333
+ "epoch": 0.74,
13334
+ "grad_norm": 1.156467318534851,
13335
+ "learning_rate": 1.3079309676168316e-05,
13336
+ "loss": 5.24,
13337
+ "step": 7616
13338
+ },
13339
+ {
13340
+ "epoch": 0.74,
13341
+ "grad_norm": 1.0586912631988525,
13342
+ "learning_rate": 1.3059918557300756e-05,
13343
+ "loss": 5.3162,
13344
+ "step": 7620
13345
+ },
13346
+ {
13347
+ "epoch": 0.74,
13348
+ "grad_norm": 1.0472930669784546,
13349
+ "learning_rate": 1.3040527438433197e-05,
13350
+ "loss": 5.4309,
13351
+ "step": 7624
13352
+ },
13353
+ {
13354
+ "epoch": 0.74,
13355
+ "grad_norm": 1.0239797830581665,
13356
+ "learning_rate": 1.3021136319565637e-05,
13357
+ "loss": 5.3124,
13358
+ "step": 7628
13359
+ },
13360
+ {
13361
+ "epoch": 0.74,
13362
+ "grad_norm": 1.084915041923523,
13363
+ "learning_rate": 1.3001745200698081e-05,
13364
+ "loss": 5.382,
13365
+ "step": 7632
13366
+ },
13367
+ {
13368
+ "epoch": 0.74,
13369
+ "grad_norm": 1.121639370918274,
13370
+ "learning_rate": 1.2982354081830522e-05,
13371
+ "loss": 5.2888,
13372
+ "step": 7636
13373
+ },
13374
+ {
13375
+ "epoch": 0.74,
13376
+ "grad_norm": 1.06790030002594,
13377
+ "learning_rate": 1.2962962962962962e-05,
13378
+ "loss": 5.3484,
13379
+ "step": 7640
13380
+ },
13381
+ {
13382
+ "epoch": 0.74,
13383
+ "grad_norm": 1.0664012432098389,
13384
+ "learning_rate": 1.2943571844095406e-05,
13385
+ "loss": 5.3624,
13386
+ "step": 7644
13387
+ },
13388
+ {
13389
+ "epoch": 0.74,
13390
+ "grad_norm": 1.032593846321106,
13391
+ "learning_rate": 1.2924180725227847e-05,
13392
+ "loss": 5.3109,
13393
+ "step": 7648
13394
+ },
13395
+ {
13396
+ "epoch": 0.74,
13397
+ "grad_norm": 1.0182029008865356,
13398
+ "learning_rate": 1.2904789606360287e-05,
13399
+ "loss": 5.3785,
13400
+ "step": 7652
13401
+ },
13402
+ {
13403
+ "epoch": 0.74,
13404
+ "grad_norm": 0.9787065982818604,
13405
+ "learning_rate": 1.2885398487492728e-05,
13406
+ "loss": 5.2586,
13407
+ "step": 7656
13408
+ },
13409
+ {
13410
+ "epoch": 0.74,
13411
+ "grad_norm": 1.14923095703125,
13412
+ "learning_rate": 1.2866007368625172e-05,
13413
+ "loss": 5.3378,
13414
+ "step": 7660
13415
+ },
13416
+ {
13417
+ "epoch": 0.74,
13418
+ "grad_norm": 1.0064685344696045,
13419
+ "learning_rate": 1.2846616249757612e-05,
13420
+ "loss": 5.3339,
13421
+ "step": 7664
13422
+ },
13423
+ {
13424
+ "epoch": 0.74,
13425
+ "grad_norm": 1.015594720840454,
13426
+ "learning_rate": 1.2827225130890053e-05,
13427
+ "loss": 5.2412,
13428
+ "step": 7668
13429
+ },
13430
+ {
13431
+ "epoch": 0.74,
13432
+ "grad_norm": 1.1527953147888184,
13433
+ "learning_rate": 1.2807834012022494e-05,
13434
+ "loss": 5.4026,
13435
+ "step": 7672
13436
+ },
13437
+ {
13438
+ "epoch": 0.74,
13439
+ "grad_norm": 1.024170994758606,
13440
+ "learning_rate": 1.2788442893154936e-05,
13441
+ "loss": 5.2431,
13442
+ "step": 7676
13443
+ },
13444
+ {
13445
+ "epoch": 0.74,
13446
+ "grad_norm": 1.0834672451019287,
13447
+ "learning_rate": 1.2769051774287378e-05,
13448
+ "loss": 5.3018,
13449
+ "step": 7680
13450
+ },
13451
+ {
13452
+ "epoch": 0.75,
13453
+ "grad_norm": 1.00787353515625,
13454
+ "learning_rate": 1.2749660655419819e-05,
13455
+ "loss": 5.3129,
13456
+ "step": 7684
13457
+ },
13458
+ {
13459
+ "epoch": 0.75,
13460
+ "grad_norm": 1.0804412364959717,
13461
+ "learning_rate": 1.2730269536552259e-05,
13462
+ "loss": 5.3974,
13463
+ "step": 7688
13464
+ },
13465
+ {
13466
+ "epoch": 0.75,
13467
+ "grad_norm": 1.0361813306808472,
13468
+ "learning_rate": 1.2710878417684701e-05,
13469
+ "loss": 5.291,
13470
+ "step": 7692
13471
+ },
13472
+ {
13473
+ "epoch": 0.75,
13474
+ "grad_norm": 0.9639879465103149,
13475
+ "learning_rate": 1.2691487298817142e-05,
13476
+ "loss": 5.3649,
13477
+ "step": 7696
13478
+ },
13479
+ {
13480
+ "epoch": 0.75,
13481
+ "grad_norm": 1.035768985748291,
13482
+ "learning_rate": 1.2672096179949582e-05,
13483
+ "loss": 5.3159,
13484
+ "step": 7700
13485
+ },
13486
+ {
13487
+ "epoch": 0.75,
13488
+ "grad_norm": 1.002968430519104,
13489
+ "learning_rate": 1.2652705061082026e-05,
13490
+ "loss": 5.1975,
13491
+ "step": 7704
13492
+ },
13493
+ {
13494
+ "epoch": 0.75,
13495
+ "grad_norm": 1.0666626691818237,
13496
+ "learning_rate": 1.2633313942214467e-05,
13497
+ "loss": 5.2216,
13498
+ "step": 7708
13499
+ },
13500
+ {
13501
+ "epoch": 0.75,
13502
+ "grad_norm": 1.031752347946167,
13503
+ "learning_rate": 1.2613922823346907e-05,
13504
+ "loss": 5.4174,
13505
+ "step": 7712
13506
+ },
13507
+ {
13508
+ "epoch": 0.75,
13509
+ "grad_norm": 1.0436227321624756,
13510
+ "learning_rate": 1.2594531704479348e-05,
13511
+ "loss": 5.2295,
13512
+ "step": 7716
13513
+ },
13514
+ {
13515
+ "epoch": 0.75,
13516
+ "grad_norm": 1.0823688507080078,
13517
+ "learning_rate": 1.2575140585611792e-05,
13518
+ "loss": 5.3274,
13519
+ "step": 7720
13520
+ },
13521
+ {
13522
+ "epoch": 0.75,
13523
+ "grad_norm": 1.0667200088500977,
13524
+ "learning_rate": 1.2555749466744232e-05,
13525
+ "loss": 5.3149,
13526
+ "step": 7724
13527
+ },
13528
+ {
13529
+ "epoch": 0.75,
13530
+ "grad_norm": 1.0479573011398315,
13531
+ "learning_rate": 1.2536358347876673e-05,
13532
+ "loss": 5.2653,
13533
+ "step": 7728
13534
+ },
13535
+ {
13536
+ "epoch": 0.75,
13537
+ "grad_norm": 1.1973553895950317,
13538
+ "learning_rate": 1.2516967229009113e-05,
13539
+ "loss": 5.2504,
13540
+ "step": 7732
13541
+ },
13542
+ {
13543
+ "epoch": 0.75,
13544
+ "grad_norm": 1.0791184902191162,
13545
+ "learning_rate": 1.2497576110141556e-05,
13546
+ "loss": 5.2797,
13547
+ "step": 7736
13548
+ },
13549
+ {
13550
+ "epoch": 0.75,
13551
+ "grad_norm": 1.091112732887268,
13552
+ "learning_rate": 1.2478184991273998e-05,
13553
+ "loss": 5.3129,
13554
+ "step": 7740
13555
+ },
13556
+ {
13557
+ "epoch": 0.75,
13558
+ "grad_norm": 1.0607527494430542,
13559
+ "learning_rate": 1.2458793872406438e-05,
13560
+ "loss": 5.2997,
13561
+ "step": 7744
13562
+ },
13563
+ {
13564
+ "epoch": 0.75,
13565
+ "grad_norm": 1.0152305364608765,
13566
+ "learning_rate": 1.243940275353888e-05,
13567
+ "loss": 5.3655,
13568
+ "step": 7748
13569
+ },
13570
+ {
13571
+ "epoch": 0.75,
13572
+ "grad_norm": 1.009064793586731,
13573
+ "learning_rate": 1.2420011634671321e-05,
13574
+ "loss": 5.3058,
13575
+ "step": 7752
13576
+ },
13577
+ {
13578
+ "epoch": 0.75,
13579
+ "grad_norm": 1.0338549613952637,
13580
+ "learning_rate": 1.2400620515803762e-05,
13581
+ "loss": 5.2527,
13582
+ "step": 7756
13583
+ },
13584
+ {
13585
+ "epoch": 0.75,
13586
+ "grad_norm": 1.0737717151641846,
13587
+ "learning_rate": 1.2381229396936204e-05,
13588
+ "loss": 5.3859,
13589
+ "step": 7760
13590
+ },
13591
+ {
13592
+ "epoch": 0.75,
13593
+ "grad_norm": 1.0524028539657593,
13594
+ "learning_rate": 1.2361838278068645e-05,
13595
+ "loss": 5.407,
13596
+ "step": 7764
13597
+ },
13598
+ {
13599
+ "epoch": 0.75,
13600
+ "grad_norm": 1.0182230472564697,
13601
+ "learning_rate": 1.2342447159201087e-05,
13602
+ "loss": 5.3243,
13603
+ "step": 7768
13604
+ },
13605
+ {
13606
+ "epoch": 0.75,
13607
+ "grad_norm": 1.042325496673584,
13608
+ "learning_rate": 1.2323056040333527e-05,
13609
+ "loss": 5.372,
13610
+ "step": 7772
13611
+ },
13612
+ {
13613
+ "epoch": 0.75,
13614
+ "grad_norm": 1.099138855934143,
13615
+ "learning_rate": 1.230366492146597e-05,
13616
+ "loss": 5.3577,
13617
+ "step": 7776
13618
+ },
13619
+ {
13620
+ "epoch": 0.75,
13621
+ "grad_norm": 1.007309913635254,
13622
+ "learning_rate": 1.228427380259841e-05,
13623
+ "loss": 5.252,
13624
+ "step": 7780
13625
+ },
13626
+ {
13627
+ "epoch": 0.75,
13628
+ "grad_norm": 1.1510696411132812,
13629
+ "learning_rate": 1.2264882683730852e-05,
13630
+ "loss": 5.3726,
13631
+ "step": 7784
13632
+ },
13633
+ {
13634
+ "epoch": 0.76,
13635
+ "grad_norm": 1.0110242366790771,
13636
+ "learning_rate": 1.2245491564863293e-05,
13637
+ "loss": 5.3932,
13638
+ "step": 7788
13639
+ },
13640
+ {
13641
+ "epoch": 0.76,
13642
+ "grad_norm": 1.0620808601379395,
13643
+ "learning_rate": 1.2226100445995735e-05,
13644
+ "loss": 5.2691,
13645
+ "step": 7792
13646
+ },
13647
+ {
13648
+ "epoch": 0.76,
13649
+ "grad_norm": 1.052786946296692,
13650
+ "learning_rate": 1.2206709327128176e-05,
13651
+ "loss": 5.3597,
13652
+ "step": 7796
13653
+ },
13654
+ {
13655
+ "epoch": 0.76,
13656
+ "grad_norm": 1.1259844303131104,
13657
+ "learning_rate": 1.2187318208260618e-05,
13658
+ "loss": 5.3324,
13659
+ "step": 7800
13660
+ },
13661
+ {
13662
+ "epoch": 0.76,
13663
+ "grad_norm": 1.0244724750518799,
13664
+ "learning_rate": 1.2167927089393058e-05,
13665
+ "loss": 5.3643,
13666
+ "step": 7804
13667
+ },
13668
+ {
13669
+ "epoch": 0.76,
13670
+ "grad_norm": 1.1225011348724365,
13671
+ "learning_rate": 1.21485359705255e-05,
13672
+ "loss": 5.3501,
13673
+ "step": 7808
13674
+ },
13675
+ {
13676
+ "epoch": 0.76,
13677
+ "grad_norm": 0.9966182112693787,
13678
+ "learning_rate": 1.2129144851657941e-05,
13679
+ "loss": 5.3966,
13680
+ "step": 7812
13681
+ },
13682
+ {
13683
+ "epoch": 0.76,
13684
+ "grad_norm": 1.102308988571167,
13685
+ "learning_rate": 1.2109753732790383e-05,
13686
+ "loss": 5.2625,
13687
+ "step": 7816
13688
+ },
13689
+ {
13690
+ "epoch": 0.76,
13691
+ "grad_norm": 1.0900803804397583,
13692
+ "learning_rate": 1.2095210393639714e-05,
13693
+ "loss": 5.365,
13694
+ "step": 7820
13695
+ },
13696
+ {
13697
+ "epoch": 0.76,
13698
+ "grad_norm": 1.1339807510375977,
13699
+ "learning_rate": 1.2075819274772154e-05,
13700
+ "loss": 5.3816,
13701
+ "step": 7824
13702
+ },
13703
+ {
13704
+ "epoch": 0.76,
13705
+ "grad_norm": 1.077379822731018,
13706
+ "learning_rate": 1.2056428155904597e-05,
13707
+ "loss": 5.1332,
13708
+ "step": 7828
13709
+ },
13710
+ {
13711
+ "epoch": 0.76,
13712
+ "grad_norm": 1.1201417446136475,
13713
+ "learning_rate": 1.2037037037037037e-05,
13714
+ "loss": 5.3037,
13715
+ "step": 7832
13716
+ },
13717
+ {
13718
+ "epoch": 0.76,
13719
+ "grad_norm": 1.1047320365905762,
13720
+ "learning_rate": 1.201764591816948e-05,
13721
+ "loss": 5.4453,
13722
+ "step": 7836
13723
+ },
13724
+ {
13725
+ "epoch": 0.76,
13726
+ "grad_norm": 1.0625344514846802,
13727
+ "learning_rate": 1.1998254799301922e-05,
13728
+ "loss": 5.2676,
13729
+ "step": 7840
13730
+ },
13731
+ {
13732
+ "epoch": 0.76,
13733
+ "grad_norm": 1.0774505138397217,
13734
+ "learning_rate": 1.1978863680434362e-05,
13735
+ "loss": 5.3222,
13736
+ "step": 7844
13737
+ },
13738
+ {
13739
+ "epoch": 0.76,
13740
+ "grad_norm": 1.0557003021240234,
13741
+ "learning_rate": 1.1959472561566804e-05,
13742
+ "loss": 5.3235,
13743
+ "step": 7848
13744
+ },
13745
+ {
13746
+ "epoch": 0.76,
13747
+ "grad_norm": 0.9856312274932861,
13748
+ "learning_rate": 1.1940081442699245e-05,
13749
+ "loss": 5.2947,
13750
+ "step": 7852
13751
+ },
13752
+ {
13753
+ "epoch": 0.76,
13754
+ "grad_norm": 1.0185786485671997,
13755
+ "learning_rate": 1.1920690323831685e-05,
13756
+ "loss": 5.3262,
13757
+ "step": 7856
13758
+ },
13759
+ {
13760
+ "epoch": 0.76,
13761
+ "grad_norm": 1.0777360200881958,
13762
+ "learning_rate": 1.1901299204964126e-05,
13763
+ "loss": 5.254,
13764
+ "step": 7860
13765
+ },
13766
+ {
13767
+ "epoch": 0.76,
13768
+ "grad_norm": 1.0238205194473267,
13769
+ "learning_rate": 1.1881908086096568e-05,
13770
+ "loss": 5.356,
13771
+ "step": 7864
13772
+ },
13773
+ {
13774
+ "epoch": 0.76,
13775
+ "grad_norm": 1.0025471448898315,
13776
+ "learning_rate": 1.1862516967229009e-05,
13777
+ "loss": 5.2541,
13778
+ "step": 7868
13779
+ },
13780
+ {
13781
+ "epoch": 0.76,
13782
+ "grad_norm": 1.030316710472107,
13783
+ "learning_rate": 1.1843125848361451e-05,
13784
+ "loss": 5.2578,
13785
+ "step": 7872
13786
+ },
13787
+ {
13788
+ "epoch": 0.76,
13789
+ "grad_norm": 1.091535210609436,
13790
+ "learning_rate": 1.1823734729493891e-05,
13791
+ "loss": 5.3234,
13792
+ "step": 7876
13793
+ },
13794
+ {
13795
+ "epoch": 0.76,
13796
+ "grad_norm": 1.0827471017837524,
13797
+ "learning_rate": 1.1804343610626334e-05,
13798
+ "loss": 5.2876,
13799
+ "step": 7880
13800
+ },
13801
+ {
13802
+ "epoch": 0.76,
13803
+ "grad_norm": 1.0991337299346924,
13804
+ "learning_rate": 1.1784952491758774e-05,
13805
+ "loss": 5.473,
13806
+ "step": 7884
13807
+ },
13808
+ {
13809
+ "epoch": 0.76,
13810
+ "grad_norm": 1.0421675443649292,
13811
+ "learning_rate": 1.1765561372891216e-05,
13812
+ "loss": 5.3059,
13813
+ "step": 7888
13814
+ },
13815
+ {
13816
+ "epoch": 0.77,
13817
+ "grad_norm": 1.1006362438201904,
13818
+ "learning_rate": 1.1746170254023657e-05,
13819
+ "loss": 5.2836,
13820
+ "step": 7892
13821
+ },
13822
+ {
13823
+ "epoch": 0.77,
13824
+ "grad_norm": 1.0210435390472412,
13825
+ "learning_rate": 1.17267791351561e-05,
13826
+ "loss": 5.3632,
13827
+ "step": 7896
13828
+ },
13829
+ {
13830
+ "epoch": 0.77,
13831
+ "grad_norm": 0.9662442207336426,
13832
+ "learning_rate": 1.1707388016288541e-05,
13833
+ "loss": 5.3246,
13834
+ "step": 7900
13835
+ },
13836
+ {
13837
+ "epoch": 0.77,
13838
+ "grad_norm": 1.1299954652786255,
13839
+ "learning_rate": 1.1687996897420982e-05,
13840
+ "loss": 5.2499,
13841
+ "step": 7904
13842
+ },
13843
+ {
13844
+ "epoch": 0.77,
13845
+ "grad_norm": 1.0942116975784302,
13846
+ "learning_rate": 1.1668605778553424e-05,
13847
+ "loss": 5.2834,
13848
+ "step": 7908
13849
+ },
13850
+ {
13851
+ "epoch": 0.77,
13852
+ "grad_norm": 1.0262593030929565,
13853
+ "learning_rate": 1.1649214659685865e-05,
13854
+ "loss": 5.3228,
13855
+ "step": 7912
13856
+ },
13857
+ {
13858
+ "epoch": 0.77,
13859
+ "grad_norm": 1.0624995231628418,
13860
+ "learning_rate": 1.1629823540818307e-05,
13861
+ "loss": 5.2603,
13862
+ "step": 7916
13863
+ },
13864
+ {
13865
+ "epoch": 0.77,
13866
+ "grad_norm": 1.0851109027862549,
13867
+ "learning_rate": 1.1610432421950747e-05,
13868
+ "loss": 5.2343,
13869
+ "step": 7920
13870
+ },
13871
+ {
13872
+ "epoch": 0.77,
13873
+ "grad_norm": 1.0987358093261719,
13874
+ "learning_rate": 1.1591041303083188e-05,
13875
+ "loss": 5.2615,
13876
+ "step": 7924
13877
+ },
13878
+ {
13879
+ "epoch": 0.77,
13880
+ "grad_norm": 1.0852704048156738,
13881
+ "learning_rate": 1.1571650184215629e-05,
13882
+ "loss": 5.2886,
13883
+ "step": 7928
13884
+ },
13885
+ {
13886
+ "epoch": 0.77,
13887
+ "grad_norm": 1.0782544612884521,
13888
+ "learning_rate": 1.155225906534807e-05,
13889
+ "loss": 5.3585,
13890
+ "step": 7932
13891
+ },
13892
+ {
13893
+ "epoch": 0.77,
13894
+ "grad_norm": 1.0928773880004883,
13895
+ "learning_rate": 1.1532867946480511e-05,
13896
+ "loss": 5.2083,
13897
+ "step": 7936
13898
+ },
13899
+ {
13900
+ "epoch": 0.77,
13901
+ "grad_norm": 0.9779551029205322,
13902
+ "learning_rate": 1.1513476827612954e-05,
13903
+ "loss": 5.2969,
13904
+ "step": 7940
13905
+ },
13906
+ {
13907
+ "epoch": 0.77,
13908
+ "grad_norm": 1.0859659910202026,
13909
+ "learning_rate": 1.1494085708745394e-05,
13910
+ "loss": 5.3044,
13911
+ "step": 7944
13912
+ },
13913
+ {
13914
+ "epoch": 0.77,
13915
+ "grad_norm": 1.0626839399337769,
13916
+ "learning_rate": 1.1474694589877836e-05,
13917
+ "loss": 5.3455,
13918
+ "step": 7948
13919
+ },
13920
+ {
13921
+ "epoch": 0.77,
13922
+ "grad_norm": 1.1160836219787598,
13923
+ "learning_rate": 1.1455303471010277e-05,
13924
+ "loss": 5.3082,
13925
+ "step": 7952
13926
+ },
13927
+ {
13928
+ "epoch": 0.77,
13929
+ "grad_norm": 1.089357614517212,
13930
+ "learning_rate": 1.1435912352142719e-05,
13931
+ "loss": 5.3013,
13932
+ "step": 7956
13933
+ },
13934
+ {
13935
+ "epoch": 0.77,
13936
+ "grad_norm": 0.9616773128509521,
13937
+ "learning_rate": 1.1416521233275161e-05,
13938
+ "loss": 5.2995,
13939
+ "step": 7960
13940
+ },
13941
+ {
13942
+ "epoch": 0.77,
13943
+ "grad_norm": 1.0657833814620972,
13944
+ "learning_rate": 1.1397130114407602e-05,
13945
+ "loss": 5.3208,
13946
+ "step": 7964
13947
+ },
13948
+ {
13949
+ "epoch": 0.77,
13950
+ "grad_norm": 1.0845454931259155,
13951
+ "learning_rate": 1.1377738995540044e-05,
13952
+ "loss": 5.3213,
13953
+ "step": 7968
13954
+ },
13955
+ {
13956
+ "epoch": 0.77,
13957
+ "grad_norm": 1.0332484245300293,
13958
+ "learning_rate": 1.1358347876672485e-05,
13959
+ "loss": 5.2833,
13960
+ "step": 7972
13961
+ },
13962
+ {
13963
+ "epoch": 0.77,
13964
+ "grad_norm": 1.0877047777175903,
13965
+ "learning_rate": 1.1338956757804927e-05,
13966
+ "loss": 5.2889,
13967
+ "step": 7976
13968
+ },
13969
+ {
13970
+ "epoch": 0.77,
13971
+ "grad_norm": 1.0782090425491333,
13972
+ "learning_rate": 1.1319565638937367e-05,
13973
+ "loss": 5.2914,
13974
+ "step": 7980
13975
+ },
13976
+ {
13977
+ "epoch": 0.77,
13978
+ "grad_norm": 1.0773468017578125,
13979
+ "learning_rate": 1.130017452006981e-05,
13980
+ "loss": 5.2853,
13981
+ "step": 7984
13982
+ },
13983
+ {
13984
+ "epoch": 0.77,
13985
+ "grad_norm": 1.1031116247177124,
13986
+ "learning_rate": 1.128078340120225e-05,
13987
+ "loss": 5.2285,
13988
+ "step": 7988
13989
+ },
13990
+ {
13991
+ "epoch": 0.77,
13992
+ "grad_norm": 1.0579017400741577,
13993
+ "learning_rate": 1.126139228233469e-05,
13994
+ "loss": 5.2608,
13995
+ "step": 7992
13996
+ },
13997
+ {
13998
+ "epoch": 0.78,
13999
+ "grad_norm": 1.0039610862731934,
14000
+ "learning_rate": 1.1242001163467133e-05,
14001
+ "loss": 5.2801,
14002
+ "step": 7996
14003
+ },
14004
+ {
14005
+ "epoch": 0.78,
14006
+ "grad_norm": 1.1101170778274536,
14007
+ "learning_rate": 1.1222610044599573e-05,
14008
+ "loss": 5.3003,
14009
+ "step": 8000
14010
+ },
14011
+ {
14012
+ "epoch": 0.78,
14013
+ "grad_norm": 1.1231920719146729,
14014
+ "learning_rate": 1.1203218925732014e-05,
14015
+ "loss": 5.2951,
14016
+ "step": 8004
14017
+ },
14018
+ {
14019
+ "epoch": 0.78,
14020
+ "grad_norm": 1.104062557220459,
14021
+ "learning_rate": 1.1183827806864456e-05,
14022
+ "loss": 5.2419,
14023
+ "step": 8008
14024
+ },
14025
+ {
14026
+ "epoch": 0.78,
14027
+ "grad_norm": 1.0631533861160278,
14028
+ "learning_rate": 1.1164436687996897e-05,
14029
+ "loss": 5.2927,
14030
+ "step": 8012
14031
+ },
14032
+ {
14033
+ "epoch": 0.78,
14034
+ "grad_norm": 1.0923023223876953,
14035
+ "learning_rate": 1.1145045569129339e-05,
14036
+ "loss": 5.3566,
14037
+ "step": 8016
14038
+ },
14039
+ {
14040
+ "epoch": 0.78,
14041
+ "grad_norm": 1.1646323204040527,
14042
+ "learning_rate": 1.1125654450261781e-05,
14043
+ "loss": 5.229,
14044
+ "step": 8020
14045
+ },
14046
+ {
14047
+ "epoch": 0.78,
14048
+ "grad_norm": 1.0775333642959595,
14049
+ "learning_rate": 1.1106263331394222e-05,
14050
+ "loss": 5.3121,
14051
+ "step": 8024
14052
+ },
14053
+ {
14054
+ "epoch": 0.78,
14055
+ "grad_norm": 1.101682186126709,
14056
+ "learning_rate": 1.1086872212526664e-05,
14057
+ "loss": 5.3746,
14058
+ "step": 8028
14059
+ },
14060
+ {
14061
+ "epoch": 0.78,
14062
+ "grad_norm": 1.06061589717865,
14063
+ "learning_rate": 1.1067481093659105e-05,
14064
+ "loss": 5.2878,
14065
+ "step": 8032
14066
+ },
14067
+ {
14068
+ "epoch": 0.78,
14069
+ "grad_norm": 1.0500963926315308,
14070
+ "learning_rate": 1.1048089974791547e-05,
14071
+ "loss": 5.3188,
14072
+ "step": 8036
14073
+ },
14074
+ {
14075
+ "epoch": 0.78,
14076
+ "grad_norm": 1.1280819177627563,
14077
+ "learning_rate": 1.1028698855923987e-05,
14078
+ "loss": 5.2829,
14079
+ "step": 8040
14080
+ },
14081
+ {
14082
+ "epoch": 0.78,
14083
+ "grad_norm": 1.0025454759597778,
14084
+ "learning_rate": 1.100930773705643e-05,
14085
+ "loss": 5.3121,
14086
+ "step": 8044
14087
+ },
14088
+ {
14089
+ "epoch": 0.78,
14090
+ "grad_norm": 1.0445626974105835,
14091
+ "learning_rate": 1.098991661818887e-05,
14092
+ "loss": 5.3264,
14093
+ "step": 8048
14094
+ },
14095
+ {
14096
+ "epoch": 0.78,
14097
+ "grad_norm": 1.0949641466140747,
14098
+ "learning_rate": 1.0970525499321312e-05,
14099
+ "loss": 5.3306,
14100
+ "step": 8052
14101
+ },
14102
+ {
14103
+ "epoch": 0.78,
14104
+ "grad_norm": 1.044668197631836,
14105
+ "learning_rate": 1.0951134380453753e-05,
14106
+ "loss": 5.3335,
14107
+ "step": 8056
14108
+ },
14109
+ {
14110
+ "epoch": 0.78,
14111
+ "grad_norm": 1.079334020614624,
14112
+ "learning_rate": 1.0931743261586193e-05,
14113
+ "loss": 5.3383,
14114
+ "step": 8060
14115
+ },
14116
+ {
14117
+ "epoch": 0.78,
14118
+ "grad_norm": 1.0334477424621582,
14119
+ "learning_rate": 1.0912352142718636e-05,
14120
+ "loss": 5.3128,
14121
+ "step": 8064
14122
+ },
14123
+ {
14124
+ "epoch": 0.78,
14125
+ "grad_norm": 0.9899519681930542,
14126
+ "learning_rate": 1.0892961023851076e-05,
14127
+ "loss": 5.3245,
14128
+ "step": 8068
14129
+ },
14130
+ {
14131
+ "epoch": 0.78,
14132
+ "grad_norm": 1.0130146741867065,
14133
+ "learning_rate": 1.0873569904983517e-05,
14134
+ "loss": 5.2891,
14135
+ "step": 8072
14136
+ },
14137
+ {
14138
+ "epoch": 0.78,
14139
+ "grad_norm": 1.0125179290771484,
14140
+ "learning_rate": 1.0854178786115959e-05,
14141
+ "loss": 5.3007,
14142
+ "step": 8076
14143
+ },
14144
+ {
14145
+ "epoch": 0.78,
14146
+ "grad_norm": 1.0482258796691895,
14147
+ "learning_rate": 1.0834787667248401e-05,
14148
+ "loss": 5.2417,
14149
+ "step": 8080
14150
+ },
14151
+ {
14152
+ "epoch": 0.78,
14153
+ "grad_norm": 1.055640697479248,
14154
+ "learning_rate": 1.0815396548380842e-05,
14155
+ "loss": 5.3779,
14156
+ "step": 8084
14157
+ },
14158
+ {
14159
+ "epoch": 0.78,
14160
+ "grad_norm": 1.0579723119735718,
14161
+ "learning_rate": 1.0796005429513284e-05,
14162
+ "loss": 5.232,
14163
+ "step": 8088
14164
+ },
14165
+ {
14166
+ "epoch": 0.78,
14167
+ "grad_norm": 1.083598017692566,
14168
+ "learning_rate": 1.0776614310645724e-05,
14169
+ "loss": 5.4277,
14170
+ "step": 8092
14171
+ },
14172
+ {
14173
+ "epoch": 0.78,
14174
+ "grad_norm": 0.9834340214729309,
14175
+ "learning_rate": 1.0757223191778167e-05,
14176
+ "loss": 5.2498,
14177
+ "step": 8096
14178
+ },
14179
+ {
14180
+ "epoch": 0.79,
14181
+ "grad_norm": 1.0400574207305908,
14182
+ "learning_rate": 1.0737832072910607e-05,
14183
+ "loss": 5.3321,
14184
+ "step": 8100
14185
+ },
14186
+ {
14187
+ "epoch": 0.79,
14188
+ "grad_norm": 1.083775281906128,
14189
+ "learning_rate": 1.071844095404305e-05,
14190
+ "loss": 5.1589,
14191
+ "step": 8104
14192
+ },
14193
+ {
14194
+ "epoch": 0.79,
14195
+ "grad_norm": 1.035290241241455,
14196
+ "learning_rate": 1.069904983517549e-05,
14197
+ "loss": 5.4106,
14198
+ "step": 8108
14199
+ },
14200
+ {
14201
+ "epoch": 0.79,
14202
+ "grad_norm": 1.0348436832427979,
14203
+ "learning_rate": 1.0679658716307932e-05,
14204
+ "loss": 5.2728,
14205
+ "step": 8112
14206
+ },
14207
+ {
14208
+ "epoch": 0.79,
14209
+ "grad_norm": 1.058597445487976,
14210
+ "learning_rate": 1.0660267597440373e-05,
14211
+ "loss": 5.2081,
14212
+ "step": 8116
14213
+ },
14214
+ {
14215
+ "epoch": 0.79,
14216
+ "grad_norm": 1.0467309951782227,
14217
+ "learning_rate": 1.0640876478572815e-05,
14218
+ "loss": 5.232,
14219
+ "step": 8120
14220
+ },
14221
+ {
14222
+ "epoch": 0.79,
14223
+ "grad_norm": 0.9697101712226868,
14224
+ "learning_rate": 1.0621485359705256e-05,
14225
+ "loss": 5.2641,
14226
+ "step": 8124
14227
+ },
14228
+ {
14229
+ "epoch": 0.79,
14230
+ "grad_norm": 1.0596665143966675,
14231
+ "learning_rate": 1.0602094240837698e-05,
14232
+ "loss": 5.2865,
14233
+ "step": 8128
14234
+ },
14235
+ {
14236
+ "epoch": 0.79,
14237
+ "grad_norm": 1.0948309898376465,
14238
+ "learning_rate": 1.0582703121970138e-05,
14239
+ "loss": 5.2826,
14240
+ "step": 8132
14241
+ },
14242
+ {
14243
+ "epoch": 0.79,
14244
+ "grad_norm": 1.0270111560821533,
14245
+ "learning_rate": 1.0563312003102579e-05,
14246
+ "loss": 5.2726,
14247
+ "step": 8136
14248
+ },
14249
+ {
14250
+ "epoch": 0.79,
14251
+ "grad_norm": 1.0012414455413818,
14252
+ "learning_rate": 1.0543920884235021e-05,
14253
+ "loss": 5.3184,
14254
+ "step": 8140
14255
+ },
14256
+ {
14257
+ "epoch": 0.79,
14258
+ "grad_norm": 1.019332766532898,
14259
+ "learning_rate": 1.0524529765367462e-05,
14260
+ "loss": 5.2908,
14261
+ "step": 8144
14262
+ },
14263
+ {
14264
+ "epoch": 0.79,
14265
+ "grad_norm": 1.0300483703613281,
14266
+ "learning_rate": 1.0505138646499904e-05,
14267
+ "loss": 5.3333,
14268
+ "step": 8148
14269
+ },
14270
+ {
14271
+ "epoch": 0.79,
14272
+ "grad_norm": 1.0536975860595703,
14273
+ "learning_rate": 1.0485747527632344e-05,
14274
+ "loss": 5.2944,
14275
+ "step": 8152
14276
+ },
14277
+ {
14278
+ "epoch": 0.79,
14279
+ "grad_norm": 0.9881764650344849,
14280
+ "learning_rate": 1.0466356408764787e-05,
14281
+ "loss": 5.2767,
14282
+ "step": 8156
14283
+ },
14284
+ {
14285
+ "epoch": 0.79,
14286
+ "grad_norm": 1.0163639783859253,
14287
+ "learning_rate": 1.0446965289897227e-05,
14288
+ "loss": 5.2723,
14289
+ "step": 8160
14290
+ },
14291
+ {
14292
+ "epoch": 0.79,
14293
+ "grad_norm": 1.0919207334518433,
14294
+ "learning_rate": 1.042757417102967e-05,
14295
+ "loss": 5.3327,
14296
+ "step": 8164
14297
+ },
14298
+ {
14299
+ "epoch": 0.79,
14300
+ "grad_norm": 1.0306917428970337,
14301
+ "learning_rate": 1.040818305216211e-05,
14302
+ "loss": 5.2565,
14303
+ "step": 8168
14304
+ },
14305
+ {
14306
+ "epoch": 0.79,
14307
+ "grad_norm": 1.0125017166137695,
14308
+ "learning_rate": 1.0388791933294552e-05,
14309
+ "loss": 5.342,
14310
+ "step": 8172
14311
+ },
14312
+ {
14313
+ "epoch": 0.79,
14314
+ "grad_norm": 1.0879104137420654,
14315
+ "learning_rate": 1.0369400814426993e-05,
14316
+ "loss": 5.3122,
14317
+ "step": 8176
14318
+ },
14319
+ {
14320
+ "epoch": 0.79,
14321
+ "grad_norm": 0.9527262449264526,
14322
+ "learning_rate": 1.0350009695559435e-05,
14323
+ "loss": 5.284,
14324
+ "step": 8180
14325
+ },
14326
+ {
14327
+ "epoch": 0.79,
14328
+ "grad_norm": 1.0486680269241333,
14329
+ "learning_rate": 1.0330618576691875e-05,
14330
+ "loss": 5.2428,
14331
+ "step": 8184
14332
+ },
14333
+ {
14334
+ "epoch": 0.79,
14335
+ "grad_norm": 1.0278397798538208,
14336
+ "learning_rate": 1.0311227457824318e-05,
14337
+ "loss": 5.3466,
14338
+ "step": 8188
14339
+ },
14340
+ {
14341
+ "epoch": 0.79,
14342
+ "grad_norm": 1.0635344982147217,
14343
+ "learning_rate": 1.0291836338956758e-05,
14344
+ "loss": 5.2645,
14345
+ "step": 8192
14346
+ },
14347
+ {
14348
+ "epoch": 0.79,
14349
+ "grad_norm": 1.0180613994598389,
14350
+ "learning_rate": 1.02724452200892e-05,
14351
+ "loss": 5.3552,
14352
+ "step": 8196
14353
+ },
14354
+ {
14355
+ "epoch": 0.8,
14356
+ "grad_norm": 1.0265159606933594,
14357
+ "learning_rate": 1.0253054101221641e-05,
14358
+ "loss": 5.3422,
14359
+ "step": 8200
14360
+ },
14361
+ {
14362
+ "epoch": 0.8,
14363
+ "grad_norm": 1.029842495918274,
14364
+ "learning_rate": 1.0233662982354082e-05,
14365
+ "loss": 5.4009,
14366
+ "step": 8204
14367
+ },
14368
+ {
14369
+ "epoch": 0.8,
14370
+ "grad_norm": 1.0848013162612915,
14371
+ "learning_rate": 1.0214271863486524e-05,
14372
+ "loss": 5.3982,
14373
+ "step": 8208
14374
+ },
14375
+ {
14376
+ "epoch": 0.8,
14377
+ "grad_norm": 1.0277658700942993,
14378
+ "learning_rate": 1.0194880744618964e-05,
14379
+ "loss": 5.3567,
14380
+ "step": 8212
14381
+ },
14382
+ {
14383
+ "epoch": 0.8,
14384
+ "grad_norm": 1.0273760557174683,
14385
+ "learning_rate": 1.0175489625751407e-05,
14386
+ "loss": 5.2762,
14387
+ "step": 8216
14388
+ },
14389
+ {
14390
+ "epoch": 0.8,
14391
+ "grad_norm": 0.9951087832450867,
14392
+ "learning_rate": 1.0156098506883847e-05,
14393
+ "loss": 5.3267,
14394
+ "step": 8220
14395
+ },
14396
+ {
14397
+ "epoch": 0.8,
14398
+ "grad_norm": 1.0347890853881836,
14399
+ "learning_rate": 1.013670738801629e-05,
14400
+ "loss": 5.2681,
14401
+ "step": 8224
14402
+ },
14403
+ {
14404
+ "epoch": 0.8,
14405
+ "grad_norm": 1.076242446899414,
14406
+ "learning_rate": 1.011731626914873e-05,
14407
+ "loss": 5.3621,
14408
+ "step": 8228
14409
+ },
14410
+ {
14411
+ "epoch": 0.8,
14412
+ "grad_norm": 1.02762770652771,
14413
+ "learning_rate": 1.0097925150281172e-05,
14414
+ "loss": 5.3412,
14415
+ "step": 8232
14416
+ },
14417
+ {
14418
+ "epoch": 0.8,
14419
+ "grad_norm": 1.0394989252090454,
14420
+ "learning_rate": 1.0078534031413613e-05,
14421
+ "loss": 5.2555,
14422
+ "step": 8236
14423
+ },
14424
+ {
14425
+ "epoch": 0.8,
14426
+ "grad_norm": 1.0912150144577026,
14427
+ "learning_rate": 1.0059142912546055e-05,
14428
+ "loss": 5.3784,
14429
+ "step": 8240
14430
+ },
14431
+ {
14432
+ "epoch": 0.8,
14433
+ "grad_norm": 1.0681991577148438,
14434
+ "learning_rate": 1.0039751793678495e-05,
14435
+ "loss": 5.3226,
14436
+ "step": 8244
14437
+ },
14438
+ {
14439
+ "epoch": 0.8,
14440
+ "grad_norm": 1.0572381019592285,
14441
+ "learning_rate": 1.0020360674810938e-05,
14442
+ "loss": 5.2935,
14443
+ "step": 8248
14444
+ },
14445
+ {
14446
+ "epoch": 0.8,
14447
+ "grad_norm": 1.0432649850845337,
14448
+ "learning_rate": 1.0000969555943378e-05,
14449
+ "loss": 5.3087,
14450
+ "step": 8252
14451
+ },
14452
+ {
14453
+ "epoch": 0.8,
14454
+ "grad_norm": 1.0084688663482666,
14455
+ "learning_rate": 9.98157843707582e-06,
14456
+ "loss": 5.2778,
14457
+ "step": 8256
14458
  }
14459
  ],
14460
  "logging_steps": 4,
 
14462
  "num_input_tokens_seen": 0,
14463
  "num_train_epochs": 1,
14464
  "save_steps": 1032,
14465
+ "total_flos": 6.953660644432282e+16,
14466
  "train_batch_size": 8,
14467
  "trial_name": null,
14468
  "trial_params": null