DeepDream2045 commited on
Commit
331168c
·
verified ·
1 Parent(s): bf3f6ae

Training in progress, step 881, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e6a1b0a4006de1973994b50c9ff26401b561b77221b5b8a2eaef01f9a5fca18
3
  size 100059752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:963d789019eec055b55d8d0df77a361b9d00860e685194befed461d6ac6a82ce
3
  size 100059752
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d056eb98a06a287a0a189d97249dfb2e51ce4d875835c20f88c88c3e24c775e
3
  size 51244404
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37bd0c6ada81cd6cdd2769c64bfc8a35423409aa0a478994e558c9d683ca82f2
3
  size 51244404
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:11d65710d9956feee7a6cb68218d916e5532a64f50c66c2fa0799195f0fe1dd0
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e3e9e31194f80109fdbafd3c1e2d72c76110c2f31392f20db890a3fe3baca3d
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e39ddd079ab2dd8ee3a7f4fbe772c0b95a43be19bf8f2952f47d806834a0447f
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aff26be62fe19bc1ef53bf2f0f6ceb3378a857f001e55c5a2722905806c357ff
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.752874378992193,
5
  "eval_steps": 221,
6
- "global_step": 663,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4672,6 +4672,1532 @@
4672
  "eval_samples_per_second": 15.589,
4673
  "eval_steps_per_second": 7.794,
4674
  "step": 663
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4675
  }
4676
  ],
4677
  "logging_steps": 1,
@@ -4686,12 +6212,12 @@
4686
  "should_evaluate": false,
4687
  "should_log": false,
4688
  "should_save": true,
4689
- "should_training_stop": false
4690
  },
4691
  "attributes": {}
4692
  }
4693
  },
4694
- "total_flos": 1.0136428871889715e+18,
4695
  "train_batch_size": 2,
4696
  "trial_name": null,
4697
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0009226401703335,
5
  "eval_steps": 221,
6
+ "global_step": 881,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4672
  "eval_samples_per_second": 15.589,
4673
  "eval_steps_per_second": 7.794,
4674
  "step": 663
4675
+ },
4676
+ {
4677
+ "epoch": 0.7540099361249113,
4678
+ "grad_norm": 0.7721088528633118,
4679
+ "learning_rate": 2.9098297132430265e-05,
4680
+ "loss": 1.6602,
4681
+ "step": 664
4682
+ },
4683
+ {
4684
+ "epoch": 0.7551454932576295,
4685
+ "grad_norm": 0.7854897379875183,
4686
+ "learning_rate": 2.8844404809282978e-05,
4687
+ "loss": 1.6649,
4688
+ "step": 665
4689
+ },
4690
+ {
4691
+ "epoch": 0.7562810503903478,
4692
+ "grad_norm": 0.8402066230773926,
4693
+ "learning_rate": 2.8591438189879748e-05,
4694
+ "loss": 1.6524,
4695
+ "step": 666
4696
+ },
4697
+ {
4698
+ "epoch": 0.757416607523066,
4699
+ "grad_norm": 0.7977060079574585,
4700
+ "learning_rate": 2.833940056520772e-05,
4701
+ "loss": 1.6691,
4702
+ "step": 667
4703
+ },
4704
+ {
4705
+ "epoch": 0.7585521646557842,
4706
+ "grad_norm": 0.822481095790863,
4707
+ "learning_rate": 2.8088295214168147e-05,
4708
+ "loss": 1.6679,
4709
+ "step": 668
4710
+ },
4711
+ {
4712
+ "epoch": 0.7596877217885025,
4713
+ "grad_norm": 0.8438048958778381,
4714
+ "learning_rate": 2.7838125403533854e-05,
4715
+ "loss": 1.5296,
4716
+ "step": 669
4717
+ },
4718
+ {
4719
+ "epoch": 0.7608232789212207,
4720
+ "grad_norm": 0.8359370231628418,
4721
+ "learning_rate": 2.7588894387906585e-05,
4722
+ "loss": 1.588,
4723
+ "step": 670
4724
+ },
4725
+ {
4726
+ "epoch": 0.761958836053939,
4727
+ "grad_norm": 0.8355525135993958,
4728
+ "learning_rate": 2.734060540967499e-05,
4729
+ "loss": 1.6429,
4730
+ "step": 671
4731
+ },
4732
+ {
4733
+ "epoch": 0.7630943931866572,
4734
+ "grad_norm": 0.8638067245483398,
4735
+ "learning_rate": 2.7093261698972005e-05,
4736
+ "loss": 1.5093,
4737
+ "step": 672
4738
+ },
4739
+ {
4740
+ "epoch": 0.7642299503193755,
4741
+ "grad_norm": 0.8546352982521057,
4742
+ "learning_rate": 2.6846866473633125e-05,
4743
+ "loss": 1.5595,
4744
+ "step": 673
4745
+ },
4746
+ {
4747
+ "epoch": 0.7653655074520936,
4748
+ "grad_norm": 0.9073834419250488,
4749
+ "learning_rate": 2.6601422939154407e-05,
4750
+ "loss": 1.6569,
4751
+ "step": 674
4752
+ },
4753
+ {
4754
+ "epoch": 0.7665010645848119,
4755
+ "grad_norm": 0.8955768346786499,
4756
+ "learning_rate": 2.6356934288650903e-05,
4757
+ "loss": 1.6477,
4758
+ "step": 675
4759
+ },
4760
+ {
4761
+ "epoch": 0.7676366217175301,
4762
+ "grad_norm": 0.8899676203727722,
4763
+ "learning_rate": 2.6113403702814998e-05,
4764
+ "loss": 1.5093,
4765
+ "step": 676
4766
+ },
4767
+ {
4768
+ "epoch": 0.7687721788502484,
4769
+ "grad_norm": 0.872996985912323,
4770
+ "learning_rate": 2.587083434987505e-05,
4771
+ "loss": 1.4281,
4772
+ "step": 677
4773
+ },
4774
+ {
4775
+ "epoch": 0.7699077359829667,
4776
+ "grad_norm": 0.9623343348503113,
4777
+ "learning_rate": 2.5629229385554142e-05,
4778
+ "loss": 1.5363,
4779
+ "step": 678
4780
+ },
4781
+ {
4782
+ "epoch": 0.7710432931156849,
4783
+ "grad_norm": 0.9317876696586609,
4784
+ "learning_rate": 2.538859195302922e-05,
4785
+ "loss": 1.451,
4786
+ "step": 679
4787
+ },
4788
+ {
4789
+ "epoch": 0.7721788502484032,
4790
+ "grad_norm": 1.059510350227356,
4791
+ "learning_rate": 2.514892518288988e-05,
4792
+ "loss": 1.5472,
4793
+ "step": 680
4794
+ },
4795
+ {
4796
+ "epoch": 0.7733144073811213,
4797
+ "grad_norm": 0.9734624624252319,
4798
+ "learning_rate": 2.4910232193097994e-05,
4799
+ "loss": 1.4922,
4800
+ "step": 681
4801
+ },
4802
+ {
4803
+ "epoch": 0.7744499645138396,
4804
+ "grad_norm": 0.951866090297699,
4805
+ "learning_rate": 2.467251608894683e-05,
4806
+ "loss": 1.3553,
4807
+ "step": 682
4808
+ },
4809
+ {
4810
+ "epoch": 0.7755855216465578,
4811
+ "grad_norm": 1.0854949951171875,
4812
+ "learning_rate": 2.443577996302081e-05,
4813
+ "loss": 1.5345,
4814
+ "step": 683
4815
+ },
4816
+ {
4817
+ "epoch": 0.7767210787792761,
4818
+ "grad_norm": 1.0024741888046265,
4819
+ "learning_rate": 2.420002689515537e-05,
4820
+ "loss": 1.3282,
4821
+ "step": 684
4822
+ },
4823
+ {
4824
+ "epoch": 0.7778566359119943,
4825
+ "grad_norm": 1.0837198495864868,
4826
+ "learning_rate": 2.3965259952396646e-05,
4827
+ "loss": 1.5353,
4828
+ "step": 685
4829
+ },
4830
+ {
4831
+ "epoch": 0.7789921930447126,
4832
+ "grad_norm": 1.062203049659729,
4833
+ "learning_rate": 2.3731482188961818e-05,
4834
+ "loss": 1.4269,
4835
+ "step": 686
4836
+ },
4837
+ {
4838
+ "epoch": 0.7801277501774309,
4839
+ "grad_norm": 1.1522090435028076,
4840
+ "learning_rate": 2.349869664619917e-05,
4841
+ "loss": 1.4395,
4842
+ "step": 687
4843
+ },
4844
+ {
4845
+ "epoch": 0.781263307310149,
4846
+ "grad_norm": 1.122259259223938,
4847
+ "learning_rate": 2.326690635254872e-05,
4848
+ "loss": 1.4416,
4849
+ "step": 688
4850
+ },
4851
+ {
4852
+ "epoch": 0.7823988644428673,
4853
+ "grad_norm": 1.2164281606674194,
4854
+ "learning_rate": 2.3036114323502655e-05,
4855
+ "loss": 1.4305,
4856
+ "step": 689
4857
+ },
4858
+ {
4859
+ "epoch": 0.7835344215755855,
4860
+ "grad_norm": 1.2222996950149536,
4861
+ "learning_rate": 2.2806323561566146e-05,
4862
+ "loss": 1.403,
4863
+ "step": 690
4864
+ },
4865
+ {
4866
+ "epoch": 0.7846699787083038,
4867
+ "grad_norm": 1.3191629648208618,
4868
+ "learning_rate": 2.257753705621839e-05,
4869
+ "loss": 1.3559,
4870
+ "step": 691
4871
+ },
4872
+ {
4873
+ "epoch": 0.785805535841022,
4874
+ "grad_norm": 1.3164441585540771,
4875
+ "learning_rate": 2.2349757783873627e-05,
4876
+ "loss": 1.2384,
4877
+ "step": 692
4878
+ },
4879
+ {
4880
+ "epoch": 0.7869410929737403,
4881
+ "grad_norm": 1.3931750059127808,
4882
+ "learning_rate": 2.2122988707842353e-05,
4883
+ "loss": 1.4181,
4884
+ "step": 693
4885
+ },
4886
+ {
4887
+ "epoch": 0.7880766501064584,
4888
+ "grad_norm": 1.4053387641906738,
4889
+ "learning_rate": 2.18972327782929e-05,
4890
+ "loss": 1.2748,
4891
+ "step": 694
4892
+ },
4893
+ {
4894
+ "epoch": 0.7892122072391767,
4895
+ "grad_norm": 1.4738166332244873,
4896
+ "learning_rate": 2.167249293221293e-05,
4897
+ "loss": 1.2564,
4898
+ "step": 695
4899
+ },
4900
+ {
4901
+ "epoch": 0.7903477643718949,
4902
+ "grad_norm": 1.6781202554702759,
4903
+ "learning_rate": 2.144877209337145e-05,
4904
+ "loss": 1.3486,
4905
+ "step": 696
4906
+ },
4907
+ {
4908
+ "epoch": 0.7914833215046132,
4909
+ "grad_norm": 1.7127809524536133,
4910
+ "learning_rate": 2.122607317228049e-05,
4911
+ "loss": 1.1686,
4912
+ "step": 697
4913
+ },
4914
+ {
4915
+ "epoch": 0.7926188786373315,
4916
+ "grad_norm": 1.9314239025115967,
4917
+ "learning_rate": 2.100439906615739e-05,
4918
+ "loss": 1.192,
4919
+ "step": 698
4920
+ },
4921
+ {
4922
+ "epoch": 0.7937544357700497,
4923
+ "grad_norm": 2.266965389251709,
4924
+ "learning_rate": 2.0783752658887066e-05,
4925
+ "loss": 0.9822,
4926
+ "step": 699
4927
+ },
4928
+ {
4929
+ "epoch": 0.794889992902768,
4930
+ "grad_norm": 3.223053216934204,
4931
+ "learning_rate": 2.056413682098459e-05,
4932
+ "loss": 1.1714,
4933
+ "step": 700
4934
+ },
4935
+ {
4936
+ "epoch": 0.7960255500354861,
4937
+ "grad_norm": 0.675081193447113,
4938
+ "learning_rate": 2.034555440955773e-05,
4939
+ "loss": 1.9009,
4940
+ "step": 701
4941
+ },
4942
+ {
4943
+ "epoch": 0.7971611071682044,
4944
+ "grad_norm": 0.6774374842643738,
4945
+ "learning_rate": 2.0128008268269815e-05,
4946
+ "loss": 1.8244,
4947
+ "step": 702
4948
+ },
4949
+ {
4950
+ "epoch": 0.7982966643009226,
4951
+ "grad_norm": 0.686852216720581,
4952
+ "learning_rate": 1.9911501227302687e-05,
4953
+ "loss": 1.7523,
4954
+ "step": 703
4955
+ },
4956
+ {
4957
+ "epoch": 0.7994322214336409,
4958
+ "grad_norm": 0.7346818447113037,
4959
+ "learning_rate": 1.969603610332007e-05,
4960
+ "loss": 1.7847,
4961
+ "step": 704
4962
+ },
4963
+ {
4964
+ "epoch": 0.8005677785663591,
4965
+ "grad_norm": 0.7139765620231628,
4966
+ "learning_rate": 1.9481615699430654e-05,
4967
+ "loss": 1.6362,
4968
+ "step": 705
4969
+ },
4970
+ {
4971
+ "epoch": 0.8017033356990774,
4972
+ "grad_norm": 0.7906776070594788,
4973
+ "learning_rate": 1.9268242805151902e-05,
4974
+ "loss": 1.8393,
4975
+ "step": 706
4976
+ },
4977
+ {
4978
+ "epoch": 0.8028388928317955,
4979
+ "grad_norm": 0.7764397859573364,
4980
+ "learning_rate": 1.9055920196373523e-05,
4981
+ "loss": 1.8045,
4982
+ "step": 707
4983
+ },
4984
+ {
4985
+ "epoch": 0.8039744499645138,
4986
+ "grad_norm": 0.7411718368530273,
4987
+ "learning_rate": 1.8844650635321483e-05,
4988
+ "loss": 1.6908,
4989
+ "step": 708
4990
+ },
4991
+ {
4992
+ "epoch": 0.8051100070972321,
4993
+ "grad_norm": 0.7711935639381409,
4994
+ "learning_rate": 1.863443687052211e-05,
4995
+ "loss": 1.7031,
4996
+ "step": 709
4997
+ },
4998
+ {
4999
+ "epoch": 0.8062455642299503,
5000
+ "grad_norm": 0.7640191912651062,
5001
+ "learning_rate": 1.842528163676619e-05,
5002
+ "loss": 1.7085,
5003
+ "step": 710
5004
+ },
5005
+ {
5006
+ "epoch": 0.8073811213626686,
5007
+ "grad_norm": 0.8145517110824585,
5008
+ "learning_rate": 1.8217187655073564e-05,
5009
+ "loss": 1.7607,
5010
+ "step": 711
5011
+ },
5012
+ {
5013
+ "epoch": 0.8085166784953868,
5014
+ "grad_norm": 0.7967603206634521,
5015
+ "learning_rate": 1.8010157632657543e-05,
5016
+ "loss": 1.6411,
5017
+ "step": 712
5018
+ },
5019
+ {
5020
+ "epoch": 0.8096522356281051,
5021
+ "grad_norm": 0.8301655054092407,
5022
+ "learning_rate": 1.7804194262889874e-05,
5023
+ "loss": 1.6753,
5024
+ "step": 713
5025
+ },
5026
+ {
5027
+ "epoch": 0.8107877927608232,
5028
+ "grad_norm": 0.7865291237831116,
5029
+ "learning_rate": 1.759930022526556e-05,
5030
+ "loss": 1.641,
5031
+ "step": 714
5032
+ },
5033
+ {
5034
+ "epoch": 0.8119233498935415,
5035
+ "grad_norm": 0.835369884967804,
5036
+ "learning_rate": 1.739547818536804e-05,
5037
+ "loss": 1.6512,
5038
+ "step": 715
5039
+ },
5040
+ {
5041
+ "epoch": 0.8130589070262597,
5042
+ "grad_norm": 0.8319673538208008,
5043
+ "learning_rate": 1.7192730794834556e-05,
5044
+ "loss": 1.651,
5045
+ "step": 716
5046
+ },
5047
+ {
5048
+ "epoch": 0.814194464158978,
5049
+ "grad_norm": 0.8641321659088135,
5050
+ "learning_rate": 1.699106069132165e-05,
5051
+ "loss": 1.674,
5052
+ "step": 717
5053
+ },
5054
+ {
5055
+ "epoch": 0.8153300212916962,
5056
+ "grad_norm": 0.8129961490631104,
5057
+ "learning_rate": 1.6790470498470744e-05,
5058
+ "loss": 1.5624,
5059
+ "step": 718
5060
+ },
5061
+ {
5062
+ "epoch": 0.8164655784244145,
5063
+ "grad_norm": 0.8343265056610107,
5064
+ "learning_rate": 1.6590962825874146e-05,
5065
+ "loss": 1.6264,
5066
+ "step": 719
5067
+ },
5068
+ {
5069
+ "epoch": 0.8176011355571328,
5070
+ "grad_norm": 0.8597946763038635,
5071
+ "learning_rate": 1.639254026904099e-05,
5072
+ "loss": 1.5274,
5073
+ "step": 720
5074
+ },
5075
+ {
5076
+ "epoch": 0.8187366926898509,
5077
+ "grad_norm": 0.8967453837394714,
5078
+ "learning_rate": 1.6195205409363577e-05,
5079
+ "loss": 1.5423,
5080
+ "step": 721
5081
+ },
5082
+ {
5083
+ "epoch": 0.8198722498225692,
5084
+ "grad_norm": 0.8765518069267273,
5085
+ "learning_rate": 1.599896081408373e-05,
5086
+ "loss": 1.6939,
5087
+ "step": 722
5088
+ },
5089
+ {
5090
+ "epoch": 0.8210078069552874,
5091
+ "grad_norm": 0.886623740196228,
5092
+ "learning_rate": 1.5803809036259364e-05,
5093
+ "loss": 1.558,
5094
+ "step": 723
5095
+ },
5096
+ {
5097
+ "epoch": 0.8221433640880057,
5098
+ "grad_norm": 0.9663245677947998,
5099
+ "learning_rate": 1.5609752614731288e-05,
5100
+ "loss": 1.6809,
5101
+ "step": 724
5102
+ },
5103
+ {
5104
+ "epoch": 0.8232789212207239,
5105
+ "grad_norm": 0.940026044845581,
5106
+ "learning_rate": 1.5416794074090258e-05,
5107
+ "loss": 1.5393,
5108
+ "step": 725
5109
+ },
5110
+ {
5111
+ "epoch": 0.8244144783534422,
5112
+ "grad_norm": 0.9838845133781433,
5113
+ "learning_rate": 1.5224935924644069e-05,
5114
+ "loss": 1.5611,
5115
+ "step": 726
5116
+ },
5117
+ {
5118
+ "epoch": 0.8255500354861603,
5119
+ "grad_norm": 1.0032014846801758,
5120
+ "learning_rate": 1.5034180662384857e-05,
5121
+ "loss": 1.5944,
5122
+ "step": 727
5123
+ },
5124
+ {
5125
+ "epoch": 0.8266855926188786,
5126
+ "grad_norm": 0.991060197353363,
5127
+ "learning_rate": 1.4844530768956656e-05,
5128
+ "loss": 1.5018,
5129
+ "step": 728
5130
+ },
5131
+ {
5132
+ "epoch": 0.8278211497515969,
5133
+ "grad_norm": 0.9556703567504883,
5134
+ "learning_rate": 1.4655988711623203e-05,
5135
+ "loss": 1.5761,
5136
+ "step": 729
5137
+ },
5138
+ {
5139
+ "epoch": 0.8289567068843151,
5140
+ "grad_norm": 1.0259032249450684,
5141
+ "learning_rate": 1.4468556943235678e-05,
5142
+ "loss": 1.5824,
5143
+ "step": 730
5144
+ },
5145
+ {
5146
+ "epoch": 0.8300922640170334,
5147
+ "grad_norm": 1.0417308807373047,
5148
+ "learning_rate": 1.4282237902200957e-05,
5149
+ "loss": 1.4678,
5150
+ "step": 731
5151
+ },
5152
+ {
5153
+ "epoch": 0.8312278211497516,
5154
+ "grad_norm": 1.0187368392944336,
5155
+ "learning_rate": 1.409703401244975e-05,
5156
+ "loss": 1.5475,
5157
+ "step": 732
5158
+ },
5159
+ {
5160
+ "epoch": 0.8323633782824699,
5161
+ "grad_norm": 1.1188650131225586,
5162
+ "learning_rate": 1.391294768340513e-05,
5163
+ "loss": 1.6506,
5164
+ "step": 733
5165
+ },
5166
+ {
5167
+ "epoch": 0.833498935415188,
5168
+ "grad_norm": 1.173802375793457,
5169
+ "learning_rate": 1.3729981309951245e-05,
5170
+ "loss": 1.5189,
5171
+ "step": 734
5172
+ },
5173
+ {
5174
+ "epoch": 0.8346344925479063,
5175
+ "grad_norm": 1.0794363021850586,
5176
+ "learning_rate": 1.3548137272402006e-05,
5177
+ "loss": 1.4119,
5178
+ "step": 735
5179
+ },
5180
+ {
5181
+ "epoch": 0.8357700496806245,
5182
+ "grad_norm": 1.0807902812957764,
5183
+ "learning_rate": 1.3367417936470328e-05,
5184
+ "loss": 1.3185,
5185
+ "step": 736
5186
+ },
5187
+ {
5188
+ "epoch": 0.8369056068133428,
5189
+ "grad_norm": 1.0555070638656616,
5190
+ "learning_rate": 1.318782565323714e-05,
5191
+ "loss": 1.2576,
5192
+ "step": 737
5193
+ },
5194
+ {
5195
+ "epoch": 0.838041163946061,
5196
+ "grad_norm": 1.1013679504394531,
5197
+ "learning_rate": 1.300936275912098e-05,
5198
+ "loss": 1.2131,
5199
+ "step": 738
5200
+ },
5201
+ {
5202
+ "epoch": 0.8391767210787793,
5203
+ "grad_norm": 1.1725547313690186,
5204
+ "learning_rate": 1.2832031575847448e-05,
5205
+ "loss": 1.3464,
5206
+ "step": 739
5207
+ },
5208
+ {
5209
+ "epoch": 0.8403122782114976,
5210
+ "grad_norm": 1.2003037929534912,
5211
+ "learning_rate": 1.265583441041911e-05,
5212
+ "loss": 1.3536,
5213
+ "step": 740
5214
+ },
5215
+ {
5216
+ "epoch": 0.8414478353442157,
5217
+ "grad_norm": 1.1799437999725342,
5218
+ "learning_rate": 1.2480773555085434e-05,
5219
+ "loss": 1.2299,
5220
+ "step": 741
5221
+ },
5222
+ {
5223
+ "epoch": 0.842583392476934,
5224
+ "grad_norm": 1.3554902076721191,
5225
+ "learning_rate": 1.2306851287313025e-05,
5226
+ "loss": 1.4439,
5227
+ "step": 742
5228
+ },
5229
+ {
5230
+ "epoch": 0.8437189496096522,
5231
+ "grad_norm": 1.3761941194534302,
5232
+ "learning_rate": 1.2134069869755893e-05,
5233
+ "loss": 1.3434,
5234
+ "step": 743
5235
+ },
5236
+ {
5237
+ "epoch": 0.8448545067423705,
5238
+ "grad_norm": 1.4420794248580933,
5239
+ "learning_rate": 1.1962431550226105e-05,
5240
+ "loss": 1.315,
5241
+ "step": 744
5242
+ },
5243
+ {
5244
+ "epoch": 0.8459900638750887,
5245
+ "grad_norm": 1.3914544582366943,
5246
+ "learning_rate": 1.1791938561664485e-05,
5247
+ "loss": 1.1473,
5248
+ "step": 745
5249
+ },
5250
+ {
5251
+ "epoch": 0.847125621007807,
5252
+ "grad_norm": 1.5766267776489258,
5253
+ "learning_rate": 1.1622593122111624e-05,
5254
+ "loss": 1.3514,
5255
+ "step": 746
5256
+ },
5257
+ {
5258
+ "epoch": 0.8482611781405252,
5259
+ "grad_norm": 1.5784944295883179,
5260
+ "learning_rate": 1.1454397434679021e-05,
5261
+ "loss": 1.2336,
5262
+ "step": 747
5263
+ },
5264
+ {
5265
+ "epoch": 0.8493967352732434,
5266
+ "grad_norm": 1.8584637641906738,
5267
+ "learning_rate": 1.128735368752033e-05,
5268
+ "loss": 1.1117,
5269
+ "step": 748
5270
+ },
5271
+ {
5272
+ "epoch": 0.8505322924059616,
5273
+ "grad_norm": 2.064455270767212,
5274
+ "learning_rate": 1.1121464053802965e-05,
5275
+ "loss": 1.0516,
5276
+ "step": 749
5277
+ },
5278
+ {
5279
+ "epoch": 0.8516678495386799,
5280
+ "grad_norm": 2.9727470874786377,
5281
+ "learning_rate": 1.0956730691679861e-05,
5282
+ "loss": 1.0157,
5283
+ "step": 750
5284
+ },
5285
+ {
5286
+ "epoch": 0.8528034066713982,
5287
+ "grad_norm": 0.5627864599227905,
5288
+ "learning_rate": 1.0793155744261351e-05,
5289
+ "loss": 1.816,
5290
+ "step": 751
5291
+ },
5292
+ {
5293
+ "epoch": 0.8539389638041164,
5294
+ "grad_norm": 0.6203926205635071,
5295
+ "learning_rate": 1.0630741339587257e-05,
5296
+ "loss": 1.8486,
5297
+ "step": 752
5298
+ },
5299
+ {
5300
+ "epoch": 0.8550745209368347,
5301
+ "grad_norm": 0.6811462044715881,
5302
+ "learning_rate": 1.0469489590599257e-05,
5303
+ "loss": 1.764,
5304
+ "step": 753
5305
+ },
5306
+ {
5307
+ "epoch": 0.8562100780695528,
5308
+ "grad_norm": 0.6945324540138245,
5309
+ "learning_rate": 1.0309402595113338e-05,
5310
+ "loss": 1.7796,
5311
+ "step": 754
5312
+ },
5313
+ {
5314
+ "epoch": 0.8573456352022711,
5315
+ "grad_norm": 0.6891714334487915,
5316
+ "learning_rate": 1.0150482435792618e-05,
5317
+ "loss": 1.7623,
5318
+ "step": 755
5319
+ },
5320
+ {
5321
+ "epoch": 0.8584811923349893,
5322
+ "grad_norm": 0.7250452637672424,
5323
+ "learning_rate": 9.992731180120164e-06,
5324
+ "loss": 1.7345,
5325
+ "step": 756
5326
+ },
5327
+ {
5328
+ "epoch": 0.8596167494677076,
5329
+ "grad_norm": 0.7625033259391785,
5330
+ "learning_rate": 9.836150880372041e-06,
5331
+ "loss": 1.7901,
5332
+ "step": 757
5333
+ },
5334
+ {
5335
+ "epoch": 0.8607523066004258,
5336
+ "grad_norm": 0.746115505695343,
5337
+ "learning_rate": 9.680743573590733e-06,
5338
+ "loss": 1.7118,
5339
+ "step": 758
5340
+ },
5341
+ {
5342
+ "epoch": 0.8618878637331441,
5343
+ "grad_norm": 0.7322573661804199,
5344
+ "learning_rate": 9.526511281558593e-06,
5345
+ "loss": 1.6324,
5346
+ "step": 759
5347
+ },
5348
+ {
5349
+ "epoch": 0.8630234208658624,
5350
+ "grad_norm": 0.7643139362335205,
5351
+ "learning_rate": 9.373456010771509e-06,
5352
+ "loss": 1.6827,
5353
+ "step": 760
5354
+ },
5355
+ {
5356
+ "epoch": 0.8641589779985805,
5357
+ "grad_norm": 0.7623146772384644,
5358
+ "learning_rate": 9.221579752412856e-06,
5359
+ "loss": 1.5998,
5360
+ "step": 761
5361
+ },
5362
+ {
5363
+ "epoch": 0.8652945351312988,
5364
+ "grad_norm": 0.7586025595664978,
5365
+ "learning_rate": 9.070884482327524e-06,
5366
+ "loss": 1.6652,
5367
+ "step": 762
5368
+ },
5369
+ {
5370
+ "epoch": 0.866430092264017,
5371
+ "grad_norm": 0.7911468744277954,
5372
+ "learning_rate": 8.921372160996322e-06,
5373
+ "loss": 1.6911,
5374
+ "step": 763
5375
+ },
5376
+ {
5377
+ "epoch": 0.8675656493967353,
5378
+ "grad_norm": 0.7929603457450867,
5379
+ "learning_rate": 8.773044733510338e-06,
5380
+ "loss": 1.6144,
5381
+ "step": 764
5382
+ },
5383
+ {
5384
+ "epoch": 0.8687012065294535,
5385
+ "grad_norm": 0.8179545402526855,
5386
+ "learning_rate": 8.625904129545692e-06,
5387
+ "loss": 1.6333,
5388
+ "step": 765
5389
+ },
5390
+ {
5391
+ "epoch": 0.8698367636621718,
5392
+ "grad_norm": 0.8298175930976868,
5393
+ "learning_rate": 8.479952263338509e-06,
5394
+ "loss": 1.6375,
5395
+ "step": 766
5396
+ },
5397
+ {
5398
+ "epoch": 0.87097232079489,
5399
+ "grad_norm": 0.8845016956329346,
5400
+ "learning_rate": 8.335191033659907e-06,
5401
+ "loss": 1.6268,
5402
+ "step": 767
5403
+ },
5404
+ {
5405
+ "epoch": 0.8721078779276082,
5406
+ "grad_norm": 0.8354196548461914,
5407
+ "learning_rate": 8.191622323791315e-06,
5408
+ "loss": 1.6815,
5409
+ "step": 768
5410
+ },
5411
+ {
5412
+ "epoch": 0.8732434350603264,
5413
+ "grad_norm": 0.8513210415840149,
5414
+ "learning_rate": 8.049248001500021e-06,
5415
+ "loss": 1.6325,
5416
+ "step": 769
5417
+ },
5418
+ {
5419
+ "epoch": 0.8743789921930447,
5420
+ "grad_norm": 0.8357150554656982,
5421
+ "learning_rate": 7.908069919014815e-06,
5422
+ "loss": 1.5492,
5423
+ "step": 770
5424
+ },
5425
+ {
5426
+ "epoch": 0.875514549325763,
5427
+ "grad_norm": 0.923727810382843,
5428
+ "learning_rate": 7.768089913001941e-06,
5429
+ "loss": 1.6026,
5430
+ "step": 771
5431
+ },
5432
+ {
5433
+ "epoch": 0.8766501064584812,
5434
+ "grad_norm": 0.9044084548950195,
5435
+ "learning_rate": 7.629309804541207e-06,
5436
+ "loss": 1.577,
5437
+ "step": 772
5438
+ },
5439
+ {
5440
+ "epoch": 0.8777856635911995,
5441
+ "grad_norm": 0.8825026750564575,
5442
+ "learning_rate": 7.491731399102231e-06,
5443
+ "loss": 1.5215,
5444
+ "step": 773
5445
+ },
5446
+ {
5447
+ "epoch": 0.8789212207239177,
5448
+ "grad_norm": 0.9344937205314636,
5449
+ "learning_rate": 7.355356486520959e-06,
5450
+ "loss": 1.515,
5451
+ "step": 774
5452
+ },
5453
+ {
5454
+ "epoch": 0.8800567778566359,
5455
+ "grad_norm": 0.9843137264251709,
5456
+ "learning_rate": 7.220186840976495e-06,
5457
+ "loss": 1.5535,
5458
+ "step": 775
5459
+ },
5460
+ {
5461
+ "epoch": 0.8811923349893541,
5462
+ "grad_norm": 0.9081253409385681,
5463
+ "learning_rate": 7.086224220967907e-06,
5464
+ "loss": 1.5453,
5465
+ "step": 776
5466
+ },
5467
+ {
5468
+ "epoch": 0.8823278921220724,
5469
+ "grad_norm": 0.9501760005950928,
5470
+ "learning_rate": 6.953470369291348e-06,
5471
+ "loss": 1.6231,
5472
+ "step": 777
5473
+ },
5474
+ {
5475
+ "epoch": 0.8834634492547906,
5476
+ "grad_norm": 0.9582067131996155,
5477
+ "learning_rate": 6.821927013017426e-06,
5478
+ "loss": 1.503,
5479
+ "step": 778
5480
+ },
5481
+ {
5482
+ "epoch": 0.8845990063875089,
5483
+ "grad_norm": 1.0171207189559937,
5484
+ "learning_rate": 6.691595863468703e-06,
5485
+ "loss": 1.6333,
5486
+ "step": 779
5487
+ },
5488
+ {
5489
+ "epoch": 0.8857345635202271,
5490
+ "grad_norm": 0.9824504256248474,
5491
+ "learning_rate": 6.562478616197554e-06,
5492
+ "loss": 1.4158,
5493
+ "step": 780
5494
+ },
5495
+ {
5496
+ "epoch": 0.8868701206529453,
5497
+ "grad_norm": 1.0205291509628296,
5498
+ "learning_rate": 6.4345769509638776e-06,
5499
+ "loss": 1.5389,
5500
+ "step": 781
5501
+ },
5502
+ {
5503
+ "epoch": 0.8880056777856636,
5504
+ "grad_norm": 0.9647020101547241,
5505
+ "learning_rate": 6.307892531713444e-06,
5506
+ "loss": 1.4057,
5507
+ "step": 782
5508
+ },
5509
+ {
5510
+ "epoch": 0.8891412349183818,
5511
+ "grad_norm": 1.0226200819015503,
5512
+ "learning_rate": 6.182427006556135e-06,
5513
+ "loss": 1.4316,
5514
+ "step": 783
5515
+ },
5516
+ {
5517
+ "epoch": 0.8902767920511001,
5518
+ "grad_norm": 1.0292648077011108,
5519
+ "learning_rate": 6.058182007744584e-06,
5520
+ "loss": 1.3882,
5521
+ "step": 784
5522
+ },
5523
+ {
5524
+ "epoch": 0.8914123491838183,
5525
+ "grad_norm": 1.1634976863861084,
5526
+ "learning_rate": 5.935159151652902e-06,
5527
+ "loss": 1.4706,
5528
+ "step": 785
5529
+ },
5530
+ {
5531
+ "epoch": 0.8925479063165366,
5532
+ "grad_norm": 0.993560791015625,
5533
+ "learning_rate": 5.813360038755611e-06,
5534
+ "loss": 1.2897,
5535
+ "step": 786
5536
+ },
5537
+ {
5538
+ "epoch": 0.8936834634492548,
5539
+ "grad_norm": 1.1387815475463867,
5540
+ "learning_rate": 5.6927862536068635e-06,
5541
+ "loss": 1.5279,
5542
+ "step": 787
5543
+ },
5544
+ {
5545
+ "epoch": 0.894819020581973,
5546
+ "grad_norm": 1.2450308799743652,
5547
+ "learning_rate": 5.573439364819855e-06,
5548
+ "loss": 1.4883,
5549
+ "step": 788
5550
+ },
5551
+ {
5552
+ "epoch": 0.8959545777146912,
5553
+ "grad_norm": 1.23203444480896,
5554
+ "learning_rate": 5.455320925046359e-06,
5555
+ "loss": 1.4462,
5556
+ "step": 789
5557
+ },
5558
+ {
5559
+ "epoch": 0.8970901348474095,
5560
+ "grad_norm": 1.1987688541412354,
5561
+ "learning_rate": 5.338432470956589e-06,
5562
+ "loss": 1.3306,
5563
+ "step": 790
5564
+ },
5565
+ {
5566
+ "epoch": 0.8982256919801278,
5567
+ "grad_norm": 1.291505217552185,
5568
+ "learning_rate": 5.222775523219125e-06,
5569
+ "loss": 1.2691,
5570
+ "step": 791
5571
+ },
5572
+ {
5573
+ "epoch": 0.899361249112846,
5574
+ "grad_norm": 1.3048465251922607,
5575
+ "learning_rate": 5.108351586481197e-06,
5576
+ "loss": 1.4038,
5577
+ "step": 792
5578
+ },
5579
+ {
5580
+ "epoch": 0.9004968062455643,
5581
+ "grad_norm": 1.4124550819396973,
5582
+ "learning_rate": 4.99516214934912e-06,
5583
+ "loss": 1.2919,
5584
+ "step": 793
5585
+ },
5586
+ {
5587
+ "epoch": 0.9016323633782825,
5588
+ "grad_norm": 1.405280590057373,
5589
+ "learning_rate": 4.8832086843688564e-06,
5590
+ "loss": 1.3023,
5591
+ "step": 794
5592
+ },
5593
+ {
5594
+ "epoch": 0.9027679205110007,
5595
+ "grad_norm": 1.6001452207565308,
5596
+ "learning_rate": 4.772492648006932e-06,
5597
+ "loss": 1.3003,
5598
+ "step": 795
5599
+ },
5600
+ {
5601
+ "epoch": 0.9039034776437189,
5602
+ "grad_norm": 1.7145951986312866,
5603
+ "learning_rate": 4.663015480631428e-06,
5604
+ "loss": 1.2998,
5605
+ "step": 796
5606
+ },
5607
+ {
5608
+ "epoch": 0.9050390347764372,
5609
+ "grad_norm": 1.6823632717132568,
5610
+ "learning_rate": 4.554778606493315e-06,
5611
+ "loss": 1.2216,
5612
+ "step": 797
5613
+ },
5614
+ {
5615
+ "epoch": 0.9061745919091554,
5616
+ "grad_norm": 1.7903900146484375,
5617
+ "learning_rate": 4.447783433707842e-06,
5618
+ "loss": 0.927,
5619
+ "step": 798
5620
+ },
5621
+ {
5622
+ "epoch": 0.9073101490418737,
5623
+ "grad_norm": 2.1676876544952393,
5624
+ "learning_rate": 4.342031354236265e-06,
5625
+ "loss": 1.1205,
5626
+ "step": 799
5627
+ },
5628
+ {
5629
+ "epoch": 0.9084457061745919,
5630
+ "grad_norm": 3.2513821125030518,
5631
+ "learning_rate": 4.237523743867744e-06,
5632
+ "loss": 1.2247,
5633
+ "step": 800
5634
+ },
5635
+ {
5636
+ "epoch": 0.9095812633073102,
5637
+ "grad_norm": 0.5881676077842712,
5638
+ "learning_rate": 4.134261962201425e-06,
5639
+ "loss": 1.8548,
5640
+ "step": 801
5641
+ },
5642
+ {
5643
+ "epoch": 0.9107168204400284,
5644
+ "grad_norm": 0.6239389777183533,
5645
+ "learning_rate": 4.032247352628748e-06,
5646
+ "loss": 1.8171,
5647
+ "step": 802
5648
+ },
5649
+ {
5650
+ "epoch": 0.9118523775727466,
5651
+ "grad_norm": 0.667490541934967,
5652
+ "learning_rate": 3.931481242315993e-06,
5653
+ "loss": 1.8324,
5654
+ "step": 803
5655
+ },
5656
+ {
5657
+ "epoch": 0.9129879347054649,
5658
+ "grad_norm": 0.7072851061820984,
5659
+ "learning_rate": 3.8319649421869495e-06,
5660
+ "loss": 1.8454,
5661
+ "step": 804
5662
+ },
5663
+ {
5664
+ "epoch": 0.9141234918381831,
5665
+ "grad_norm": 0.695285975933075,
5666
+ "learning_rate": 3.7336997469060276e-06,
5667
+ "loss": 1.6505,
5668
+ "step": 805
5669
+ },
5670
+ {
5671
+ "epoch": 0.9152590489709014,
5672
+ "grad_norm": 0.6985710263252258,
5673
+ "learning_rate": 3.6366869348611887e-06,
5674
+ "loss": 1.7544,
5675
+ "step": 806
5676
+ },
5677
+ {
5678
+ "epoch": 0.9163946061036196,
5679
+ "grad_norm": 0.7601833939552307,
5680
+ "learning_rate": 3.540927768147484e-06,
5681
+ "loss": 1.794,
5682
+ "step": 807
5683
+ },
5684
+ {
5685
+ "epoch": 0.9175301632363378,
5686
+ "grad_norm": 0.7707709670066833,
5687
+ "learning_rate": 3.4464234925505213e-06,
5688
+ "loss": 1.6881,
5689
+ "step": 808
5690
+ },
5691
+ {
5692
+ "epoch": 0.918665720369056,
5693
+ "grad_norm": 0.7650094032287598,
5694
+ "learning_rate": 3.3531753375303897e-06,
5695
+ "loss": 1.6556,
5696
+ "step": 809
5697
+ },
5698
+ {
5699
+ "epoch": 0.9198012775017743,
5700
+ "grad_norm": 0.7413115501403809,
5701
+ "learning_rate": 3.261184516205551e-06,
5702
+ "loss": 1.6912,
5703
+ "step": 810
5704
+ },
5705
+ {
5706
+ "epoch": 0.9209368346344925,
5707
+ "grad_norm": 0.8219679594039917,
5708
+ "learning_rate": 3.1704522253370947e-06,
5709
+ "loss": 1.6756,
5710
+ "step": 811
5711
+ },
5712
+ {
5713
+ "epoch": 0.9220723917672108,
5714
+ "grad_norm": 0.7987946271896362,
5715
+ "learning_rate": 3.080979645313142e-06,
5716
+ "loss": 1.7316,
5717
+ "step": 812
5718
+ },
5719
+ {
5720
+ "epoch": 0.9232079488999291,
5721
+ "grad_norm": 0.7636008858680725,
5722
+ "learning_rate": 2.9927679401335785e-06,
5723
+ "loss": 1.6475,
5724
+ "step": 813
5725
+ },
5726
+ {
5727
+ "epoch": 0.9243435060326473,
5728
+ "grad_norm": 0.8155189752578735,
5729
+ "learning_rate": 2.905818257394799e-06,
5730
+ "loss": 1.7,
5731
+ "step": 814
5732
+ },
5733
+ {
5734
+ "epoch": 0.9254790631653655,
5735
+ "grad_norm": 0.7985911965370178,
5736
+ "learning_rate": 2.8201317282748552e-06,
5737
+ "loss": 1.5697,
5738
+ "step": 815
5739
+ },
5740
+ {
5741
+ "epoch": 0.9266146202980837,
5742
+ "grad_norm": 0.8054267764091492,
5743
+ "learning_rate": 2.735709467518699e-06,
5744
+ "loss": 1.5872,
5745
+ "step": 816
5746
+ },
5747
+ {
5748
+ "epoch": 0.927750177430802,
5749
+ "grad_norm": 0.8176944851875305,
5750
+ "learning_rate": 2.6525525734236944e-06,
5751
+ "loss": 1.5912,
5752
+ "step": 817
5753
+ },
5754
+ {
5755
+ "epoch": 0.9288857345635202,
5756
+ "grad_norm": 0.8318896293640137,
5757
+ "learning_rate": 2.5706621278253406e-06,
5758
+ "loss": 1.5092,
5759
+ "step": 818
5760
+ },
5761
+ {
5762
+ "epoch": 0.9300212916962385,
5763
+ "grad_norm": 0.8194354772567749,
5764
+ "learning_rate": 2.49003919608316e-06,
5765
+ "loss": 1.5934,
5766
+ "step": 819
5767
+ },
5768
+ {
5769
+ "epoch": 0.9311568488289567,
5770
+ "grad_norm": 0.9440639615058899,
5771
+ "learning_rate": 2.4106848270669e-06,
5772
+ "loss": 1.6014,
5773
+ "step": 820
5774
+ },
5775
+ {
5776
+ "epoch": 0.932292405961675,
5777
+ "grad_norm": 0.9147624969482422,
5778
+ "learning_rate": 2.3326000531428195e-06,
5779
+ "loss": 1.6544,
5780
+ "step": 821
5781
+ },
5782
+ {
5783
+ "epoch": 0.9334279630943931,
5784
+ "grad_norm": 0.8831505179405212,
5785
+ "learning_rate": 2.255785890160311e-06,
5786
+ "loss": 1.5767,
5787
+ "step": 822
5788
+ },
5789
+ {
5790
+ "epoch": 0.9345635202271114,
5791
+ "grad_norm": 0.8842580318450928,
5792
+ "learning_rate": 2.1802433374386588e-06,
5793
+ "loss": 1.57,
5794
+ "step": 823
5795
+ },
5796
+ {
5797
+ "epoch": 0.9356990773598297,
5798
+ "grad_norm": 0.9388715028762817,
5799
+ "learning_rate": 2.1059733777540225e-06,
5800
+ "loss": 1.5907,
5801
+ "step": 824
5802
+ },
5803
+ {
5804
+ "epoch": 0.9368346344925479,
5805
+ "grad_norm": 1.0418598651885986,
5806
+ "learning_rate": 2.032976977326706e-06,
5807
+ "loss": 1.6648,
5808
+ "step": 825
5809
+ },
5810
+ {
5811
+ "epoch": 0.9379701916252662,
5812
+ "grad_norm": 0.9417559504508972,
5813
+ "learning_rate": 1.9612550858085334e-06,
5814
+ "loss": 1.5464,
5815
+ "step": 826
5816
+ },
5817
+ {
5818
+ "epoch": 0.9391057487579844,
5819
+ "grad_norm": 0.9899365305900574,
5820
+ "learning_rate": 1.8908086362705357e-06,
5821
+ "loss": 1.6173,
5822
+ "step": 827
5823
+ },
5824
+ {
5825
+ "epoch": 0.9402413058907026,
5826
+ "grad_norm": 0.9272261261940002,
5827
+ "learning_rate": 1.8216385451907624e-06,
5828
+ "loss": 1.3998,
5829
+ "step": 828
5830
+ },
5831
+ {
5832
+ "epoch": 0.9413768630234208,
5833
+ "grad_norm": 1.0163697004318237,
5834
+ "learning_rate": 1.7537457124423895e-06,
5835
+ "loss": 1.4026,
5836
+ "step": 829
5837
+ },
5838
+ {
5839
+ "epoch": 0.9425124201561391,
5840
+ "grad_norm": 0.9311161041259766,
5841
+ "learning_rate": 1.68713102128204e-06,
5842
+ "loss": 1.3825,
5843
+ "step": 830
5844
+ },
5845
+ {
5846
+ "epoch": 0.9436479772888573,
5847
+ "grad_norm": 1.0091522932052612,
5848
+ "learning_rate": 1.62179533833825e-06,
5849
+ "loss": 1.4272,
5850
+ "step": 831
5851
+ },
5852
+ {
5853
+ "epoch": 0.9447835344215756,
5854
+ "grad_norm": 1.104512095451355,
5855
+ "learning_rate": 1.5577395136001982e-06,
5856
+ "loss": 1.6002,
5857
+ "step": 832
5858
+ },
5859
+ {
5860
+ "epoch": 0.9459190915542939,
5861
+ "grad_norm": 1.098473310470581,
5862
+ "learning_rate": 1.4949643804066493e-06,
5863
+ "loss": 1.4385,
5864
+ "step": 833
5865
+ },
5866
+ {
5867
+ "epoch": 0.9470546486870121,
5868
+ "grad_norm": 1.0314809083938599,
5869
+ "learning_rate": 1.4334707554351511e-06,
5870
+ "loss": 1.3949,
5871
+ "step": 834
5872
+ },
5873
+ {
5874
+ "epoch": 0.9481902058197303,
5875
+ "grad_norm": 1.039817452430725,
5876
+ "learning_rate": 1.3732594386913655e-06,
5877
+ "loss": 1.4388,
5878
+ "step": 835
5879
+ },
5880
+ {
5881
+ "epoch": 0.9493257629524485,
5882
+ "grad_norm": 1.1446057558059692,
5883
+ "learning_rate": 1.3143312134986651e-06,
5884
+ "loss": 1.5356,
5885
+ "step": 836
5886
+ },
5887
+ {
5888
+ "epoch": 0.9504613200851668,
5889
+ "grad_norm": 1.1136023998260498,
5890
+ "learning_rate": 1.2566868464879533e-06,
5891
+ "loss": 1.4069,
5892
+ "step": 837
5893
+ },
5894
+ {
5895
+ "epoch": 0.951596877217885,
5896
+ "grad_norm": 1.1392496824264526,
5897
+ "learning_rate": 1.200327087587716e-06,
5898
+ "loss": 1.311,
5899
+ "step": 838
5900
+ },
5901
+ {
5902
+ "epoch": 0.9527324343506033,
5903
+ "grad_norm": 1.1572725772857666,
5904
+ "learning_rate": 1.1452526700141964e-06,
5905
+ "loss": 1.2519,
5906
+ "step": 839
5907
+ },
5908
+ {
5909
+ "epoch": 0.9538679914833215,
5910
+ "grad_norm": 1.2496389150619507,
5911
+ "learning_rate": 1.091464310261947e-06,
5912
+ "loss": 1.3081,
5913
+ "step": 840
5914
+ },
5915
+ {
5916
+ "epoch": 0.9550035486160398,
5917
+ "grad_norm": 1.2605552673339844,
5918
+ "learning_rate": 1.0389627080944153e-06,
5919
+ "loss": 1.3012,
5920
+ "step": 841
5921
+ },
5922
+ {
5923
+ "epoch": 0.9561391057487579,
5924
+ "grad_norm": 1.4142736196517944,
5925
+ "learning_rate": 9.877485465349058e-07,
5926
+ "loss": 1.4079,
5927
+ "step": 842
5928
+ },
5929
+ {
5930
+ "epoch": 0.9572746628814762,
5931
+ "grad_norm": 1.5376014709472656,
5932
+ "learning_rate": 9.378224918576872e-07,
5933
+ "loss": 1.5231,
5934
+ "step": 843
5935
+ },
5936
+ {
5937
+ "epoch": 0.9584102200141945,
5938
+ "grad_norm": 1.5845942497253418,
5939
+ "learning_rate": 8.891851935792673e-07,
5940
+ "loss": 1.2707,
5941
+ "step": 844
5942
+ },
5943
+ {
5944
+ "epoch": 0.9595457771469127,
5945
+ "grad_norm": 1.518162488937378,
5946
+ "learning_rate": 8.418372844500532e-07,
5947
+ "loss": 1.2705,
5948
+ "step": 845
5949
+ },
5950
+ {
5951
+ "epoch": 0.960681334279631,
5952
+ "grad_norm": 1.7950546741485596,
5953
+ "learning_rate": 7.957793804459824e-07,
5954
+ "loss": 1.2607,
5955
+ "step": 846
5956
+ },
5957
+ {
5958
+ "epoch": 0.9618168914123492,
5959
+ "grad_norm": 2.0692031383514404,
5960
+ "learning_rate": 7.51012080760638e-07,
5961
+ "loss": 1.4285,
5962
+ "step": 847
5963
+ },
5964
+ {
5965
+ "epoch": 0.9629524485450675,
5966
+ "grad_norm": 2.0200307369232178,
5967
+ "learning_rate": 7.075359677973569e-07,
5968
+ "loss": 1.1366,
5969
+ "step": 848
5970
+ },
5971
+ {
5972
+ "epoch": 0.9640880056777856,
5973
+ "grad_norm": 2.89170503616333,
5974
+ "learning_rate": 6.653516071616906e-07,
5975
+ "loss": 1.2507,
5976
+ "step": 849
5977
+ },
5978
+ {
5979
+ "epoch": 0.9652235628105039,
5980
+ "grad_norm": 4.06318998336792,
5981
+ "learning_rate": 6.24459547654066e-07,
5982
+ "loss": 1.4431,
5983
+ "step": 850
5984
+ },
5985
+ {
5986
+ "epoch": 0.9663591199432221,
5987
+ "grad_norm": 0.6128149628639221,
5988
+ "learning_rate": 5.84860321262648e-07,
5989
+ "loss": 1.8312,
5990
+ "step": 851
5991
+ },
5992
+ {
5993
+ "epoch": 0.9674946770759404,
5994
+ "grad_norm": 0.7000106573104858,
5995
+ "learning_rate": 5.46554443156333e-07,
5996
+ "loss": 1.8172,
5997
+ "step": 852
5998
+ },
5999
+ {
6000
+ "epoch": 0.9686302342086586,
6001
+ "grad_norm": 0.7351566553115845,
6002
+ "learning_rate": 5.095424116781767e-07,
6003
+ "loss": 1.6679,
6004
+ "step": 853
6005
+ },
6006
+ {
6007
+ "epoch": 0.9697657913413769,
6008
+ "grad_norm": 0.7055298089981079,
6009
+ "learning_rate": 4.738247083387992e-07,
6010
+ "loss": 1.71,
6011
+ "step": 854
6012
+ },
6013
+ {
6014
+ "epoch": 0.9709013484740951,
6015
+ "grad_norm": 0.7372353076934814,
6016
+ "learning_rate": 4.3940179781019055e-07,
6017
+ "loss": 1.7349,
6018
+ "step": 855
6019
+ },
6020
+ {
6021
+ "epoch": 0.9720369056068133,
6022
+ "grad_norm": 0.7767588496208191,
6023
+ "learning_rate": 4.06274127919648e-07,
6024
+ "loss": 1.7337,
6025
+ "step": 856
6026
+ },
6027
+ {
6028
+ "epoch": 0.9731724627395316,
6029
+ "grad_norm": 0.765540599822998,
6030
+ "learning_rate": 3.74442129643926e-07,
6031
+ "loss": 1.646,
6032
+ "step": 857
6033
+ },
6034
+ {
6035
+ "epoch": 0.9743080198722498,
6036
+ "grad_norm": 0.7871925234794617,
6037
+ "learning_rate": 3.439062171036511e-07,
6038
+ "loss": 1.5931,
6039
+ "step": 858
6040
+ },
6041
+ {
6042
+ "epoch": 0.9754435770049681,
6043
+ "grad_norm": 0.8065288066864014,
6044
+ "learning_rate": 3.14666787557949e-07,
6045
+ "loss": 1.5187,
6046
+ "step": 859
6047
+ },
6048
+ {
6049
+ "epoch": 0.9765791341376863,
6050
+ "grad_norm": 0.8028612732887268,
6051
+ "learning_rate": 2.8672422139923715e-07,
6052
+ "loss": 1.5718,
6053
+ "step": 860
6054
+ },
6055
+ {
6056
+ "epoch": 0.9777146912704046,
6057
+ "grad_norm": 0.8129795789718628,
6058
+ "learning_rate": 2.600788821483069e-07,
6059
+ "loss": 1.6316,
6060
+ "step": 861
6061
+ },
6062
+ {
6063
+ "epoch": 0.9788502484031227,
6064
+ "grad_norm": 0.8282390832901001,
6065
+ "learning_rate": 2.3473111644957135e-07,
6066
+ "loss": 1.5771,
6067
+ "step": 862
6068
+ },
6069
+ {
6070
+ "epoch": 0.979985805535841,
6071
+ "grad_norm": 0.8801934719085693,
6072
+ "learning_rate": 2.1068125406659145e-07,
6073
+ "loss": 1.6199,
6074
+ "step": 863
6075
+ },
6076
+ {
6077
+ "epoch": 0.9811213626685593,
6078
+ "grad_norm": 0.9070190191268921,
6079
+ "learning_rate": 1.8792960787774593e-07,
6080
+ "loss": 1.6148,
6081
+ "step": 864
6082
+ },
6083
+ {
6084
+ "epoch": 0.9822569198012775,
6085
+ "grad_norm": 0.9105101823806763,
6086
+ "learning_rate": 1.6647647387219023e-07,
6087
+ "loss": 1.5872,
6088
+ "step": 865
6089
+ },
6090
+ {
6091
+ "epoch": 0.9833924769339958,
6092
+ "grad_norm": 0.97794109582901,
6093
+ "learning_rate": 1.463221311459817e-07,
6094
+ "loss": 1.5238,
6095
+ "step": 866
6096
+ },
6097
+ {
6098
+ "epoch": 0.984528034066714,
6099
+ "grad_norm": 1.0621849298477173,
6100
+ "learning_rate": 1.2746684189846036e-07,
6101
+ "loss": 1.6259,
6102
+ "step": 867
6103
+ },
6104
+ {
6105
+ "epoch": 0.9856635911994323,
6106
+ "grad_norm": 1.1340172290802002,
6107
+ "learning_rate": 1.0991085142886271e-07,
6108
+ "loss": 1.7028,
6109
+ "step": 868
6110
+ },
6111
+ {
6112
+ "epoch": 0.9867991483321504,
6113
+ "grad_norm": 1.1114485263824463,
6114
+ "learning_rate": 9.365438813306871e-08,
6115
+ "loss": 1.5325,
6116
+ "step": 869
6117
+ },
6118
+ {
6119
+ "epoch": 0.9879347054648687,
6120
+ "grad_norm": 1.0111680030822754,
6121
+ "learning_rate": 7.869766350069308e-08,
6122
+ "loss": 1.4184,
6123
+ "step": 870
6124
+ },
6125
+ {
6126
+ "epoch": 0.9890702625975869,
6127
+ "grad_norm": 1.1053588390350342,
6128
+ "learning_rate": 6.504087211229859e-08,
6129
+ "loss": 1.5139,
6130
+ "step": 871
6131
+ },
6132
+ {
6133
+ "epoch": 0.9902058197303052,
6134
+ "grad_norm": 1.0900163650512695,
6135
+ "learning_rate": 5.268419163688698e-08,
6136
+ "loss": 1.4397,
6137
+ "step": 872
6138
+ },
6139
+ {
6140
+ "epoch": 0.9913413768630234,
6141
+ "grad_norm": 1.1527384519577026,
6142
+ "learning_rate": 4.1627782829567476e-08,
6143
+ "loss": 1.2827,
6144
+ "step": 873
6145
+ },
6146
+ {
6147
+ "epoch": 0.9924769339957417,
6148
+ "grad_norm": 1.1613572835922241,
6149
+ "learning_rate": 3.187178952945846e-08,
6150
+ "loss": 1.3983,
6151
+ "step": 874
6152
+ },
6153
+ {
6154
+ "epoch": 0.99361249112846,
6155
+ "grad_norm": 1.2781202793121338,
6156
+ "learning_rate": 2.341633865784454e-08,
6157
+ "loss": 1.3543,
6158
+ "step": 875
6159
+ },
6160
+ {
6161
+ "epoch": 0.9947480482611781,
6162
+ "grad_norm": 1.6025346517562866,
6163
+ "learning_rate": 1.6261540216522264e-08,
6164
+ "loss": 1.4821,
6165
+ "step": 876
6166
+ },
6167
+ {
6168
+ "epoch": 0.9958836053938964,
6169
+ "grad_norm": 1.4461325407028198,
6170
+ "learning_rate": 1.0407487286345774e-08,
6171
+ "loss": 1.2549,
6172
+ "step": 877
6173
+ },
6174
+ {
6175
+ "epoch": 0.9970191625266146,
6176
+ "grad_norm": 1.5728317499160767,
6177
+ "learning_rate": 5.854256026027738e-09,
6178
+ "loss": 1.1668,
6179
+ "step": 878
6180
+ },
6181
+ {
6182
+ "epoch": 0.9981547196593329,
6183
+ "grad_norm": 2.2646923065185547,
6184
+ "learning_rate": 2.6019056711512614e-09,
6185
+ "loss": 1.2263,
6186
+ "step": 879
6187
+ },
6188
+ {
6189
+ "epoch": 0.9992902767920511,
6190
+ "grad_norm": 2.1974527835845947,
6191
+ "learning_rate": 6.50478533403831e-10,
6192
+ "loss": 0.9378,
6193
+ "step": 880
6194
+ },
6195
+ {
6196
+ "epoch": 1.0009226401703335,
6197
+ "grad_norm": 5.330093860626221,
6198
+ "learning_rate": 0.0,
6199
+ "loss": 2.3759,
6200
+ "step": 881
6201
  }
6202
  ],
6203
  "logging_steps": 1,
 
6212
  "should_evaluate": false,
6213
  "should_log": false,
6214
  "should_save": true,
6215
+ "should_training_stop": true
6216
  },
6217
  "attributes": {}
6218
  }
6219
  },
6220
+ "total_flos": 1.3469372301862502e+18,
6221
  "train_batch_size": 2,
6222
  "trial_name": null,
6223
  "trial_params": null