jdannem6 commited on
Commit
a2148d5
1 Parent(s): 1136924

Uploaded checkpoint-30000

Browse files
Files changed (5) hide show
  1. adapter_model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1793 -3
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d97fc9daed8fc42b8286be9a22db8d8f0c98b367d6f684f8724075b9c509868c
3
  size 119975656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f74097a448e18def20ac64c3351b03daf39ecbb7772ee021bb94d78882a822e8
3
  size 119975656
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:da5ec1b3aa5d12330bda5fc479f69a184c2145592be7a96f6bd1ace39646aaf8
3
  size 240145026
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bac2bf27f76ba10b8bb7d0aea6171dd39cb800cb897cebf7d6a638cfd591dce7
3
  size 240145026
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42ed1734c5823abfe806343a4de18dcccd1e9ad5af5349e08097c7bde2aa7437
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc4e81d8710419a2e714a6936530b076f37f0580ef4ada57c8cd6f905915e300
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bae572518ab53ddc674f52a5ef01613875bea64a8d9c53d4b7d4a9aedc712f19
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b36093e06845c6146f3175c64f0e8bdb441d4f7fc67a6962ed0b80b6725daf1
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 1.262895941734314,
3
  "best_model_checkpoint": "runs/deepseek_lora_20240422-165831/checkpoint-27500",
4
- "epoch": 0.6875,
5
  "eval_steps": 500,
6
- "global_step": 27500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -19697,6 +19697,1796 @@
19697
  "eval_samples_per_second": 15.103,
19698
  "eval_steps_per_second": 15.103,
19699
  "step": 27500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19700
  }
19701
  ],
19702
  "logging_steps": 10,
@@ -19704,7 +21494,7 @@
19704
  "num_input_tokens_seen": 0,
19705
  "num_train_epochs": 1,
19706
  "save_steps": 2500,
19707
- "total_flos": 4.4280846483456e+17,
19708
  "train_batch_size": 1,
19709
  "trial_name": null,
19710
  "trial_params": null
 
1
  {
2
  "best_metric": 1.262895941734314,
3
  "best_model_checkpoint": "runs/deepseek_lora_20240422-165831/checkpoint-27500",
4
+ "epoch": 0.75,
5
  "eval_steps": 500,
6
+ "global_step": 30000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
19697
  "eval_samples_per_second": 15.103,
19698
  "eval_steps_per_second": 15.103,
19699
  "step": 27500
19700
+ },
19701
+ {
19702
+ "epoch": 0.69,
19703
+ "grad_norm": 5.283616542816162,
19704
+ "learning_rate": 1.6881355932203391e-06,
19705
+ "loss": 1.3504,
19706
+ "step": 27510
19707
+ },
19708
+ {
19709
+ "epoch": 0.69,
19710
+ "grad_norm": 3.6427159309387207,
19711
+ "learning_rate": 1.6813559322033901e-06,
19712
+ "loss": 1.3028,
19713
+ "step": 27520
19714
+ },
19715
+ {
19716
+ "epoch": 0.69,
19717
+ "grad_norm": 5.701483249664307,
19718
+ "learning_rate": 1.6745762711864409e-06,
19719
+ "loss": 1.4436,
19720
+ "step": 27530
19721
+ },
19722
+ {
19723
+ "epoch": 0.69,
19724
+ "grad_norm": 9.295339584350586,
19725
+ "learning_rate": 1.6677966101694916e-06,
19726
+ "loss": 1.4106,
19727
+ "step": 27540
19728
+ },
19729
+ {
19730
+ "epoch": 0.69,
19731
+ "grad_norm": 3.1163859367370605,
19732
+ "learning_rate": 1.6610169491525424e-06,
19733
+ "loss": 1.2698,
19734
+ "step": 27550
19735
+ },
19736
+ {
19737
+ "epoch": 0.69,
19738
+ "grad_norm": 6.673079967498779,
19739
+ "learning_rate": 1.6542372881355934e-06,
19740
+ "loss": 1.2202,
19741
+ "step": 27560
19742
+ },
19743
+ {
19744
+ "epoch": 0.69,
19745
+ "grad_norm": 4.866211414337158,
19746
+ "learning_rate": 1.6474576271186444e-06,
19747
+ "loss": 1.2538,
19748
+ "step": 27570
19749
+ },
19750
+ {
19751
+ "epoch": 0.69,
19752
+ "grad_norm": 3.319688081741333,
19753
+ "learning_rate": 1.640677966101695e-06,
19754
+ "loss": 1.4981,
19755
+ "step": 27580
19756
+ },
19757
+ {
19758
+ "epoch": 0.69,
19759
+ "grad_norm": 5.170316219329834,
19760
+ "learning_rate": 1.6338983050847459e-06,
19761
+ "loss": 1.3682,
19762
+ "step": 27590
19763
+ },
19764
+ {
19765
+ "epoch": 0.69,
19766
+ "grad_norm": 9.018121719360352,
19767
+ "learning_rate": 1.6271186440677967e-06,
19768
+ "loss": 1.3149,
19769
+ "step": 27600
19770
+ },
19771
+ {
19772
+ "epoch": 0.69,
19773
+ "grad_norm": 6.896349906921387,
19774
+ "learning_rate": 1.6203389830508476e-06,
19775
+ "loss": 1.317,
19776
+ "step": 27610
19777
+ },
19778
+ {
19779
+ "epoch": 0.69,
19780
+ "grad_norm": 7.63593053817749,
19781
+ "learning_rate": 1.6135593220338986e-06,
19782
+ "loss": 1.3238,
19783
+ "step": 27620
19784
+ },
19785
+ {
19786
+ "epoch": 0.69,
19787
+ "grad_norm": 8.823158264160156,
19788
+ "learning_rate": 1.6067796610169492e-06,
19789
+ "loss": 1.4479,
19790
+ "step": 27630
19791
+ },
19792
+ {
19793
+ "epoch": 0.69,
19794
+ "grad_norm": 8.540557861328125,
19795
+ "learning_rate": 1.6000000000000001e-06,
19796
+ "loss": 1.3663,
19797
+ "step": 27640
19798
+ },
19799
+ {
19800
+ "epoch": 0.69,
19801
+ "grad_norm": 8.03847885131836,
19802
+ "learning_rate": 1.593220338983051e-06,
19803
+ "loss": 1.2248,
19804
+ "step": 27650
19805
+ },
19806
+ {
19807
+ "epoch": 0.69,
19808
+ "grad_norm": 5.210377216339111,
19809
+ "learning_rate": 1.5864406779661019e-06,
19810
+ "loss": 1.3373,
19811
+ "step": 27660
19812
+ },
19813
+ {
19814
+ "epoch": 0.69,
19815
+ "grad_norm": 6.994758129119873,
19816
+ "learning_rate": 1.5796610169491526e-06,
19817
+ "loss": 1.3166,
19818
+ "step": 27670
19819
+ },
19820
+ {
19821
+ "epoch": 0.69,
19822
+ "grad_norm": 7.343669414520264,
19823
+ "learning_rate": 1.5728813559322034e-06,
19824
+ "loss": 1.3406,
19825
+ "step": 27680
19826
+ },
19827
+ {
19828
+ "epoch": 0.69,
19829
+ "grad_norm": 12.689948081970215,
19830
+ "learning_rate": 1.5661016949152544e-06,
19831
+ "loss": 1.2073,
19832
+ "step": 27690
19833
+ },
19834
+ {
19835
+ "epoch": 0.69,
19836
+ "grad_norm": 7.018815040588379,
19837
+ "learning_rate": 1.5593220338983054e-06,
19838
+ "loss": 1.2738,
19839
+ "step": 27700
19840
+ },
19841
+ {
19842
+ "epoch": 0.69,
19843
+ "grad_norm": 5.334643840789795,
19844
+ "learning_rate": 1.552542372881356e-06,
19845
+ "loss": 1.2211,
19846
+ "step": 27710
19847
+ },
19848
+ {
19849
+ "epoch": 0.69,
19850
+ "grad_norm": 1.1950844526290894,
19851
+ "learning_rate": 1.545762711864407e-06,
19852
+ "loss": 1.268,
19853
+ "step": 27720
19854
+ },
19855
+ {
19856
+ "epoch": 0.69,
19857
+ "grad_norm": 11.679058074951172,
19858
+ "learning_rate": 1.5389830508474577e-06,
19859
+ "loss": 1.1873,
19860
+ "step": 27730
19861
+ },
19862
+ {
19863
+ "epoch": 0.69,
19864
+ "grad_norm": 7.686078071594238,
19865
+ "learning_rate": 1.5322033898305086e-06,
19866
+ "loss": 1.277,
19867
+ "step": 27740
19868
+ },
19869
+ {
19870
+ "epoch": 0.69,
19871
+ "grad_norm": 2.579845905303955,
19872
+ "learning_rate": 1.5254237288135596e-06,
19873
+ "loss": 1.3752,
19874
+ "step": 27750
19875
+ },
19876
+ {
19877
+ "epoch": 0.69,
19878
+ "grad_norm": 11.227088928222656,
19879
+ "learning_rate": 1.5186440677966102e-06,
19880
+ "loss": 1.2593,
19881
+ "step": 27760
19882
+ },
19883
+ {
19884
+ "epoch": 0.69,
19885
+ "grad_norm": 6.903045654296875,
19886
+ "learning_rate": 1.5118644067796611e-06,
19887
+ "loss": 1.2463,
19888
+ "step": 27770
19889
+ },
19890
+ {
19891
+ "epoch": 0.69,
19892
+ "grad_norm": 12.534771919250488,
19893
+ "learning_rate": 1.505084745762712e-06,
19894
+ "loss": 1.2949,
19895
+ "step": 27780
19896
+ },
19897
+ {
19898
+ "epoch": 0.69,
19899
+ "grad_norm": 4.901615619659424,
19900
+ "learning_rate": 1.4983050847457629e-06,
19901
+ "loss": 1.2975,
19902
+ "step": 27790
19903
+ },
19904
+ {
19905
+ "epoch": 0.69,
19906
+ "grad_norm": 9.669910430908203,
19907
+ "learning_rate": 1.4915254237288139e-06,
19908
+ "loss": 1.3476,
19909
+ "step": 27800
19910
+ },
19911
+ {
19912
+ "epoch": 0.7,
19913
+ "grad_norm": 6.982202529907227,
19914
+ "learning_rate": 1.4847457627118644e-06,
19915
+ "loss": 1.3839,
19916
+ "step": 27810
19917
+ },
19918
+ {
19919
+ "epoch": 0.7,
19920
+ "grad_norm": 9.172724723815918,
19921
+ "learning_rate": 1.4779661016949154e-06,
19922
+ "loss": 1.3519,
19923
+ "step": 27820
19924
+ },
19925
+ {
19926
+ "epoch": 0.7,
19927
+ "grad_norm": 8.561583518981934,
19928
+ "learning_rate": 1.4711864406779664e-06,
19929
+ "loss": 1.3402,
19930
+ "step": 27830
19931
+ },
19932
+ {
19933
+ "epoch": 0.7,
19934
+ "grad_norm": 11.40105152130127,
19935
+ "learning_rate": 1.464406779661017e-06,
19936
+ "loss": 1.2805,
19937
+ "step": 27840
19938
+ },
19939
+ {
19940
+ "epoch": 0.7,
19941
+ "grad_norm": 11.502050399780273,
19942
+ "learning_rate": 1.457627118644068e-06,
19943
+ "loss": 1.4351,
19944
+ "step": 27850
19945
+ },
19946
+ {
19947
+ "epoch": 0.7,
19948
+ "grad_norm": 3.5157549381256104,
19949
+ "learning_rate": 1.4508474576271187e-06,
19950
+ "loss": 1.2948,
19951
+ "step": 27860
19952
+ },
19953
+ {
19954
+ "epoch": 0.7,
19955
+ "grad_norm": 8.663883209228516,
19956
+ "learning_rate": 1.4440677966101696e-06,
19957
+ "loss": 1.3285,
19958
+ "step": 27870
19959
+ },
19960
+ {
19961
+ "epoch": 0.7,
19962
+ "grad_norm": 4.502950191497803,
19963
+ "learning_rate": 1.4372881355932206e-06,
19964
+ "loss": 1.4507,
19965
+ "step": 27880
19966
+ },
19967
+ {
19968
+ "epoch": 0.7,
19969
+ "grad_norm": 2.098318338394165,
19970
+ "learning_rate": 1.4305084745762712e-06,
19971
+ "loss": 1.1753,
19972
+ "step": 27890
19973
+ },
19974
+ {
19975
+ "epoch": 0.7,
19976
+ "grad_norm": 2.721998453140259,
19977
+ "learning_rate": 1.4237288135593222e-06,
19978
+ "loss": 1.3162,
19979
+ "step": 27900
19980
+ },
19981
+ {
19982
+ "epoch": 0.7,
19983
+ "grad_norm": 6.501703262329102,
19984
+ "learning_rate": 1.416949152542373e-06,
19985
+ "loss": 1.2442,
19986
+ "step": 27910
19987
+ },
19988
+ {
19989
+ "epoch": 0.7,
19990
+ "grad_norm": 4.597460746765137,
19991
+ "learning_rate": 1.410169491525424e-06,
19992
+ "loss": 1.2659,
19993
+ "step": 27920
19994
+ },
19995
+ {
19996
+ "epoch": 0.7,
19997
+ "grad_norm": 1.399740219116211,
19998
+ "learning_rate": 1.4033898305084749e-06,
19999
+ "loss": 1.2234,
20000
+ "step": 27930
20001
+ },
20002
+ {
20003
+ "epoch": 0.7,
20004
+ "grad_norm": 1.2826783657073975,
20005
+ "learning_rate": 1.3966101694915254e-06,
20006
+ "loss": 1.3775,
20007
+ "step": 27940
20008
+ },
20009
+ {
20010
+ "epoch": 0.7,
20011
+ "grad_norm": 9.264411926269531,
20012
+ "learning_rate": 1.3898305084745764e-06,
20013
+ "loss": 1.299,
20014
+ "step": 27950
20015
+ },
20016
+ {
20017
+ "epoch": 0.7,
20018
+ "grad_norm": 11.41451644897461,
20019
+ "learning_rate": 1.3830508474576274e-06,
20020
+ "loss": 1.2851,
20021
+ "step": 27960
20022
+ },
20023
+ {
20024
+ "epoch": 0.7,
20025
+ "grad_norm": 4.351644515991211,
20026
+ "learning_rate": 1.376271186440678e-06,
20027
+ "loss": 1.1979,
20028
+ "step": 27970
20029
+ },
20030
+ {
20031
+ "epoch": 0.7,
20032
+ "grad_norm": 9.026026725769043,
20033
+ "learning_rate": 1.369491525423729e-06,
20034
+ "loss": 1.3119,
20035
+ "step": 27980
20036
+ },
20037
+ {
20038
+ "epoch": 0.7,
20039
+ "grad_norm": 6.000504970550537,
20040
+ "learning_rate": 1.3627118644067797e-06,
20041
+ "loss": 1.2543,
20042
+ "step": 27990
20043
+ },
20044
+ {
20045
+ "epoch": 0.7,
20046
+ "grad_norm": 22.265581130981445,
20047
+ "learning_rate": 1.3559322033898307e-06,
20048
+ "loss": 1.34,
20049
+ "step": 28000
20050
+ },
20051
+ {
20052
+ "epoch": 0.7,
20053
+ "eval_loss": 1.3058114051818848,
20054
+ "eval_runtime": 66.2214,
20055
+ "eval_samples_per_second": 15.101,
20056
+ "eval_steps_per_second": 15.101,
20057
+ "step": 28000
20058
+ },
20059
+ {
20060
+ "epoch": 0.7,
20061
+ "grad_norm": 4.122670650482178,
20062
+ "learning_rate": 1.3491525423728816e-06,
20063
+ "loss": 1.2058,
20064
+ "step": 28010
20065
+ },
20066
+ {
20067
+ "epoch": 0.7,
20068
+ "grad_norm": 3.766960859298706,
20069
+ "learning_rate": 1.3423728813559322e-06,
20070
+ "loss": 1.3474,
20071
+ "step": 28020
20072
+ },
20073
+ {
20074
+ "epoch": 0.7,
20075
+ "grad_norm": 2.3636386394500732,
20076
+ "learning_rate": 1.3355932203389832e-06,
20077
+ "loss": 1.1637,
20078
+ "step": 28030
20079
+ },
20080
+ {
20081
+ "epoch": 0.7,
20082
+ "grad_norm": 9.471797943115234,
20083
+ "learning_rate": 1.328813559322034e-06,
20084
+ "loss": 1.3953,
20085
+ "step": 28040
20086
+ },
20087
+ {
20088
+ "epoch": 0.7,
20089
+ "grad_norm": 8.232218742370605,
20090
+ "learning_rate": 1.322033898305085e-06,
20091
+ "loss": 1.4269,
20092
+ "step": 28050
20093
+ },
20094
+ {
20095
+ "epoch": 0.7,
20096
+ "grad_norm": 1.1584389209747314,
20097
+ "learning_rate": 1.3152542372881359e-06,
20098
+ "loss": 1.3231,
20099
+ "step": 28060
20100
+ },
20101
+ {
20102
+ "epoch": 0.7,
20103
+ "grad_norm": 4.911566257476807,
20104
+ "learning_rate": 1.3084745762711864e-06,
20105
+ "loss": 1.241,
20106
+ "step": 28070
20107
+ },
20108
+ {
20109
+ "epoch": 0.7,
20110
+ "grad_norm": 18.741342544555664,
20111
+ "learning_rate": 1.3016949152542374e-06,
20112
+ "loss": 1.3838,
20113
+ "step": 28080
20114
+ },
20115
+ {
20116
+ "epoch": 0.7,
20117
+ "grad_norm": 8.622126579284668,
20118
+ "learning_rate": 1.2949152542372884e-06,
20119
+ "loss": 1.2792,
20120
+ "step": 28090
20121
+ },
20122
+ {
20123
+ "epoch": 0.7,
20124
+ "grad_norm": 4.402095794677734,
20125
+ "learning_rate": 1.288135593220339e-06,
20126
+ "loss": 1.2308,
20127
+ "step": 28100
20128
+ },
20129
+ {
20130
+ "epoch": 0.7,
20131
+ "grad_norm": 5.238518714904785,
20132
+ "learning_rate": 1.28135593220339e-06,
20133
+ "loss": 1.3537,
20134
+ "step": 28110
20135
+ },
20136
+ {
20137
+ "epoch": 0.7,
20138
+ "grad_norm": 5.757815837860107,
20139
+ "learning_rate": 1.2745762711864407e-06,
20140
+ "loss": 1.3987,
20141
+ "step": 28120
20142
+ },
20143
+ {
20144
+ "epoch": 0.7,
20145
+ "grad_norm": 9.999316215515137,
20146
+ "learning_rate": 1.2677966101694917e-06,
20147
+ "loss": 1.1397,
20148
+ "step": 28130
20149
+ },
20150
+ {
20151
+ "epoch": 0.7,
20152
+ "grad_norm": 6.276950359344482,
20153
+ "learning_rate": 1.2610169491525426e-06,
20154
+ "loss": 1.5024,
20155
+ "step": 28140
20156
+ },
20157
+ {
20158
+ "epoch": 0.7,
20159
+ "grad_norm": 5.9979634284973145,
20160
+ "learning_rate": 1.2542372881355932e-06,
20161
+ "loss": 1.4296,
20162
+ "step": 28150
20163
+ },
20164
+ {
20165
+ "epoch": 0.7,
20166
+ "grad_norm": 7.124503135681152,
20167
+ "learning_rate": 1.2474576271186442e-06,
20168
+ "loss": 1.279,
20169
+ "step": 28160
20170
+ },
20171
+ {
20172
+ "epoch": 0.7,
20173
+ "grad_norm": 10.153301239013672,
20174
+ "learning_rate": 1.240677966101695e-06,
20175
+ "loss": 1.3013,
20176
+ "step": 28170
20177
+ },
20178
+ {
20179
+ "epoch": 0.7,
20180
+ "grad_norm": 6.391571521759033,
20181
+ "learning_rate": 1.233898305084746e-06,
20182
+ "loss": 1.2782,
20183
+ "step": 28180
20184
+ },
20185
+ {
20186
+ "epoch": 0.7,
20187
+ "grad_norm": 10.13975715637207,
20188
+ "learning_rate": 1.2271186440677967e-06,
20189
+ "loss": 1.4136,
20190
+ "step": 28190
20191
+ },
20192
+ {
20193
+ "epoch": 0.7,
20194
+ "grad_norm": 5.052265167236328,
20195
+ "learning_rate": 1.2203389830508477e-06,
20196
+ "loss": 1.3246,
20197
+ "step": 28200
20198
+ },
20199
+ {
20200
+ "epoch": 0.71,
20201
+ "grad_norm": 6.8834638595581055,
20202
+ "learning_rate": 1.2135593220338984e-06,
20203
+ "loss": 1.3436,
20204
+ "step": 28210
20205
+ },
20206
+ {
20207
+ "epoch": 0.71,
20208
+ "grad_norm": 10.564448356628418,
20209
+ "learning_rate": 1.2067796610169492e-06,
20210
+ "loss": 1.1619,
20211
+ "step": 28220
20212
+ },
20213
+ {
20214
+ "epoch": 0.71,
20215
+ "grad_norm": 3.8192801475524902,
20216
+ "learning_rate": 1.2000000000000002e-06,
20217
+ "loss": 1.2328,
20218
+ "step": 28230
20219
+ },
20220
+ {
20221
+ "epoch": 0.71,
20222
+ "grad_norm": 5.635645866394043,
20223
+ "learning_rate": 1.193220338983051e-06,
20224
+ "loss": 1.3258,
20225
+ "step": 28240
20226
+ },
20227
+ {
20228
+ "epoch": 0.71,
20229
+ "grad_norm": 6.429792881011963,
20230
+ "learning_rate": 1.186440677966102e-06,
20231
+ "loss": 1.3368,
20232
+ "step": 28250
20233
+ },
20234
+ {
20235
+ "epoch": 0.71,
20236
+ "grad_norm": 11.116402626037598,
20237
+ "learning_rate": 1.1796610169491527e-06,
20238
+ "loss": 1.2527,
20239
+ "step": 28260
20240
+ },
20241
+ {
20242
+ "epoch": 0.71,
20243
+ "grad_norm": 2.7472074031829834,
20244
+ "learning_rate": 1.1728813559322034e-06,
20245
+ "loss": 1.4633,
20246
+ "step": 28270
20247
+ },
20248
+ {
20249
+ "epoch": 0.71,
20250
+ "grad_norm": 4.118687629699707,
20251
+ "learning_rate": 1.1661016949152542e-06,
20252
+ "loss": 1.4852,
20253
+ "step": 28280
20254
+ },
20255
+ {
20256
+ "epoch": 0.71,
20257
+ "grad_norm": 14.502837181091309,
20258
+ "learning_rate": 1.1593220338983052e-06,
20259
+ "loss": 1.1964,
20260
+ "step": 28290
20261
+ },
20262
+ {
20263
+ "epoch": 0.71,
20264
+ "grad_norm": 3.8546032905578613,
20265
+ "learning_rate": 1.152542372881356e-06,
20266
+ "loss": 1.1832,
20267
+ "step": 28300
20268
+ },
20269
+ {
20270
+ "epoch": 0.71,
20271
+ "grad_norm": 2.377305507659912,
20272
+ "learning_rate": 1.145762711864407e-06,
20273
+ "loss": 1.3164,
20274
+ "step": 28310
20275
+ },
20276
+ {
20277
+ "epoch": 0.71,
20278
+ "grad_norm": 2.172879219055176,
20279
+ "learning_rate": 1.1389830508474577e-06,
20280
+ "loss": 1.3163,
20281
+ "step": 28320
20282
+ },
20283
+ {
20284
+ "epoch": 0.71,
20285
+ "grad_norm": 4.290719032287598,
20286
+ "learning_rate": 1.1322033898305087e-06,
20287
+ "loss": 1.2505,
20288
+ "step": 28330
20289
+ },
20290
+ {
20291
+ "epoch": 0.71,
20292
+ "grad_norm": 15.109819412231445,
20293
+ "learning_rate": 1.1254237288135594e-06,
20294
+ "loss": 1.2868,
20295
+ "step": 28340
20296
+ },
20297
+ {
20298
+ "epoch": 0.71,
20299
+ "grad_norm": 11.07304859161377,
20300
+ "learning_rate": 1.1186440677966102e-06,
20301
+ "loss": 1.2118,
20302
+ "step": 28350
20303
+ },
20304
+ {
20305
+ "epoch": 0.71,
20306
+ "grad_norm": 14.64116096496582,
20307
+ "learning_rate": 1.1118644067796612e-06,
20308
+ "loss": 1.2904,
20309
+ "step": 28360
20310
+ },
20311
+ {
20312
+ "epoch": 0.71,
20313
+ "grad_norm": 12.49071216583252,
20314
+ "learning_rate": 1.105084745762712e-06,
20315
+ "loss": 1.286,
20316
+ "step": 28370
20317
+ },
20318
+ {
20319
+ "epoch": 0.71,
20320
+ "grad_norm": 3.750757932662964,
20321
+ "learning_rate": 1.098305084745763e-06,
20322
+ "loss": 1.2723,
20323
+ "step": 28380
20324
+ },
20325
+ {
20326
+ "epoch": 0.71,
20327
+ "grad_norm": 11.984790802001953,
20328
+ "learning_rate": 1.0915254237288137e-06,
20329
+ "loss": 1.1732,
20330
+ "step": 28390
20331
+ },
20332
+ {
20333
+ "epoch": 0.71,
20334
+ "grad_norm": 13.987404823303223,
20335
+ "learning_rate": 1.0847457627118644e-06,
20336
+ "loss": 1.2168,
20337
+ "step": 28400
20338
+ },
20339
+ {
20340
+ "epoch": 0.71,
20341
+ "grad_norm": 10.837672233581543,
20342
+ "learning_rate": 1.0779661016949152e-06,
20343
+ "loss": 1.4036,
20344
+ "step": 28410
20345
+ },
20346
+ {
20347
+ "epoch": 0.71,
20348
+ "grad_norm": 12.291699409484863,
20349
+ "learning_rate": 1.0711864406779662e-06,
20350
+ "loss": 1.1172,
20351
+ "step": 28420
20352
+ },
20353
+ {
20354
+ "epoch": 0.71,
20355
+ "grad_norm": 2.2296297550201416,
20356
+ "learning_rate": 1.064406779661017e-06,
20357
+ "loss": 1.3241,
20358
+ "step": 28430
20359
+ },
20360
+ {
20361
+ "epoch": 0.71,
20362
+ "grad_norm": 6.434119701385498,
20363
+ "learning_rate": 1.057627118644068e-06,
20364
+ "loss": 1.3096,
20365
+ "step": 28440
20366
+ },
20367
+ {
20368
+ "epoch": 0.71,
20369
+ "grad_norm": 3.2857985496520996,
20370
+ "learning_rate": 1.0508474576271187e-06,
20371
+ "loss": 1.3744,
20372
+ "step": 28450
20373
+ },
20374
+ {
20375
+ "epoch": 0.71,
20376
+ "grad_norm": 6.822338104248047,
20377
+ "learning_rate": 1.0440677966101697e-06,
20378
+ "loss": 1.3724,
20379
+ "step": 28460
20380
+ },
20381
+ {
20382
+ "epoch": 0.71,
20383
+ "grad_norm": 6.319363117218018,
20384
+ "learning_rate": 1.0372881355932204e-06,
20385
+ "loss": 1.2181,
20386
+ "step": 28470
20387
+ },
20388
+ {
20389
+ "epoch": 0.71,
20390
+ "grad_norm": 3.777921676635742,
20391
+ "learning_rate": 1.0305084745762712e-06,
20392
+ "loss": 1.3495,
20393
+ "step": 28480
20394
+ },
20395
+ {
20396
+ "epoch": 0.71,
20397
+ "grad_norm": 6.540971755981445,
20398
+ "learning_rate": 1.0237288135593222e-06,
20399
+ "loss": 1.4589,
20400
+ "step": 28490
20401
+ },
20402
+ {
20403
+ "epoch": 0.71,
20404
+ "grad_norm": 7.9154744148254395,
20405
+ "learning_rate": 1.016949152542373e-06,
20406
+ "loss": 1.3015,
20407
+ "step": 28500
20408
+ },
20409
+ {
20410
+ "epoch": 0.71,
20411
+ "eval_loss": 1.3162422180175781,
20412
+ "eval_runtime": 66.2512,
20413
+ "eval_samples_per_second": 15.094,
20414
+ "eval_steps_per_second": 15.094,
20415
+ "step": 28500
20416
+ },
20417
+ {
20418
+ "epoch": 0.71,
20419
+ "grad_norm": 15.148056983947754,
20420
+ "learning_rate": 1.010169491525424e-06,
20421
+ "loss": 1.2363,
20422
+ "step": 28510
20423
+ },
20424
+ {
20425
+ "epoch": 0.71,
20426
+ "grad_norm": 10.743448257446289,
20427
+ "learning_rate": 1.0033898305084747e-06,
20428
+ "loss": 1.1965,
20429
+ "step": 28520
20430
+ },
20431
+ {
20432
+ "epoch": 0.71,
20433
+ "grad_norm": 12.191396713256836,
20434
+ "learning_rate": 9.966101694915254e-07,
20435
+ "loss": 1.4387,
20436
+ "step": 28530
20437
+ },
20438
+ {
20439
+ "epoch": 0.71,
20440
+ "grad_norm": 4.78171968460083,
20441
+ "learning_rate": 9.898305084745762e-07,
20442
+ "loss": 1.4977,
20443
+ "step": 28540
20444
+ },
20445
+ {
20446
+ "epoch": 0.71,
20447
+ "grad_norm": 5.24019718170166,
20448
+ "learning_rate": 9.830508474576272e-07,
20449
+ "loss": 1.5177,
20450
+ "step": 28550
20451
+ },
20452
+ {
20453
+ "epoch": 0.71,
20454
+ "grad_norm": 12.000500679016113,
20455
+ "learning_rate": 9.762711864406782e-07,
20456
+ "loss": 1.1637,
20457
+ "step": 28560
20458
+ },
20459
+ {
20460
+ "epoch": 0.71,
20461
+ "grad_norm": 10.612434387207031,
20462
+ "learning_rate": 9.69491525423729e-07,
20463
+ "loss": 1.334,
20464
+ "step": 28570
20465
+ },
20466
+ {
20467
+ "epoch": 0.71,
20468
+ "grad_norm": 6.343203544616699,
20469
+ "learning_rate": 9.627118644067797e-07,
20470
+ "loss": 1.3152,
20471
+ "step": 28580
20472
+ },
20473
+ {
20474
+ "epoch": 0.71,
20475
+ "grad_norm": 5.986273288726807,
20476
+ "learning_rate": 9.559322033898307e-07,
20477
+ "loss": 1.3724,
20478
+ "step": 28590
20479
+ },
20480
+ {
20481
+ "epoch": 0.71,
20482
+ "grad_norm": 12.447896003723145,
20483
+ "learning_rate": 9.491525423728814e-07,
20484
+ "loss": 1.2404,
20485
+ "step": 28600
20486
+ },
20487
+ {
20488
+ "epoch": 0.72,
20489
+ "grad_norm": 18.211698532104492,
20490
+ "learning_rate": 9.423728813559323e-07,
20491
+ "loss": 1.3091,
20492
+ "step": 28610
20493
+ },
20494
+ {
20495
+ "epoch": 0.72,
20496
+ "grad_norm": 5.227106094360352,
20497
+ "learning_rate": 9.355932203389831e-07,
20498
+ "loss": 1.1275,
20499
+ "step": 28620
20500
+ },
20501
+ {
20502
+ "epoch": 0.72,
20503
+ "grad_norm": 4.874502182006836,
20504
+ "learning_rate": 9.28813559322034e-07,
20505
+ "loss": 1.2959,
20506
+ "step": 28630
20507
+ },
20508
+ {
20509
+ "epoch": 0.72,
20510
+ "grad_norm": 7.699239253997803,
20511
+ "learning_rate": 9.220338983050848e-07,
20512
+ "loss": 1.3687,
20513
+ "step": 28640
20514
+ },
20515
+ {
20516
+ "epoch": 0.72,
20517
+ "grad_norm": 7.712405681610107,
20518
+ "learning_rate": 9.152542372881357e-07,
20519
+ "loss": 1.3746,
20520
+ "step": 28650
20521
+ },
20522
+ {
20523
+ "epoch": 0.72,
20524
+ "grad_norm": 8.854902267456055,
20525
+ "learning_rate": 9.084745762711864e-07,
20526
+ "loss": 1.3399,
20527
+ "step": 28660
20528
+ },
20529
+ {
20530
+ "epoch": 0.72,
20531
+ "grad_norm": 5.26395320892334,
20532
+ "learning_rate": 9.016949152542373e-07,
20533
+ "loss": 1.2296,
20534
+ "step": 28670
20535
+ },
20536
+ {
20537
+ "epoch": 0.72,
20538
+ "grad_norm": 5.966766357421875,
20539
+ "learning_rate": 8.949152542372883e-07,
20540
+ "loss": 1.2806,
20541
+ "step": 28680
20542
+ },
20543
+ {
20544
+ "epoch": 0.72,
20545
+ "grad_norm": 5.988807201385498,
20546
+ "learning_rate": 8.881355932203391e-07,
20547
+ "loss": 1.4652,
20548
+ "step": 28690
20549
+ },
20550
+ {
20551
+ "epoch": 0.72,
20552
+ "grad_norm": 1.523200273513794,
20553
+ "learning_rate": 8.813559322033899e-07,
20554
+ "loss": 1.4848,
20555
+ "step": 28700
20556
+ },
20557
+ {
20558
+ "epoch": 0.72,
20559
+ "grad_norm": 3.4972944259643555,
20560
+ "learning_rate": 8.745762711864407e-07,
20561
+ "loss": 1.296,
20562
+ "step": 28710
20563
+ },
20564
+ {
20565
+ "epoch": 0.72,
20566
+ "grad_norm": 4.062095642089844,
20567
+ "learning_rate": 8.677966101694917e-07,
20568
+ "loss": 1.3716,
20569
+ "step": 28720
20570
+ },
20571
+ {
20572
+ "epoch": 0.72,
20573
+ "grad_norm": 1.065338134765625,
20574
+ "learning_rate": 8.610169491525424e-07,
20575
+ "loss": 1.2375,
20576
+ "step": 28730
20577
+ },
20578
+ {
20579
+ "epoch": 0.72,
20580
+ "grad_norm": 6.1202006340026855,
20581
+ "learning_rate": 8.542372881355933e-07,
20582
+ "loss": 1.3413,
20583
+ "step": 28740
20584
+ },
20585
+ {
20586
+ "epoch": 0.72,
20587
+ "grad_norm": 17.116790771484375,
20588
+ "learning_rate": 8.474576271186441e-07,
20589
+ "loss": 1.2014,
20590
+ "step": 28750
20591
+ },
20592
+ {
20593
+ "epoch": 0.72,
20594
+ "grad_norm": 6.310455799102783,
20595
+ "learning_rate": 8.406779661016951e-07,
20596
+ "loss": 1.2509,
20597
+ "step": 28760
20598
+ },
20599
+ {
20600
+ "epoch": 0.72,
20601
+ "grad_norm": 19.508827209472656,
20602
+ "learning_rate": 8.338983050847458e-07,
20603
+ "loss": 1.4035,
20604
+ "step": 28770
20605
+ },
20606
+ {
20607
+ "epoch": 0.72,
20608
+ "grad_norm": 10.587761878967285,
20609
+ "learning_rate": 8.271186440677967e-07,
20610
+ "loss": 1.3374,
20611
+ "step": 28780
20612
+ },
20613
+ {
20614
+ "epoch": 0.72,
20615
+ "grad_norm": 15.715071678161621,
20616
+ "learning_rate": 8.203389830508475e-07,
20617
+ "loss": 1.3229,
20618
+ "step": 28790
20619
+ },
20620
+ {
20621
+ "epoch": 0.72,
20622
+ "grad_norm": 3.0983753204345703,
20623
+ "learning_rate": 8.135593220338983e-07,
20624
+ "loss": 1.4297,
20625
+ "step": 28800
20626
+ },
20627
+ {
20628
+ "epoch": 0.72,
20629
+ "grad_norm": 4.541349411010742,
20630
+ "learning_rate": 8.067796610169493e-07,
20631
+ "loss": 1.3327,
20632
+ "step": 28810
20633
+ },
20634
+ {
20635
+ "epoch": 0.72,
20636
+ "grad_norm": 4.102433204650879,
20637
+ "learning_rate": 8.000000000000001e-07,
20638
+ "loss": 1.3362,
20639
+ "step": 28820
20640
+ },
20641
+ {
20642
+ "epoch": 0.72,
20643
+ "grad_norm": 6.362680435180664,
20644
+ "learning_rate": 7.932203389830509e-07,
20645
+ "loss": 1.2275,
20646
+ "step": 28830
20647
+ },
20648
+ {
20649
+ "epoch": 0.72,
20650
+ "grad_norm": 9.411408424377441,
20651
+ "learning_rate": 7.864406779661017e-07,
20652
+ "loss": 1.4023,
20653
+ "step": 28840
20654
+ },
20655
+ {
20656
+ "epoch": 0.72,
20657
+ "grad_norm": 6.113147258758545,
20658
+ "learning_rate": 7.796610169491527e-07,
20659
+ "loss": 1.2319,
20660
+ "step": 28850
20661
+ },
20662
+ {
20663
+ "epoch": 0.72,
20664
+ "grad_norm": 6.580341815948486,
20665
+ "learning_rate": 7.728813559322034e-07,
20666
+ "loss": 1.3559,
20667
+ "step": 28860
20668
+ },
20669
+ {
20670
+ "epoch": 0.72,
20671
+ "grad_norm": 10.778473854064941,
20672
+ "learning_rate": 7.661016949152543e-07,
20673
+ "loss": 1.2247,
20674
+ "step": 28870
20675
+ },
20676
+ {
20677
+ "epoch": 0.72,
20678
+ "grad_norm": 4.692105770111084,
20679
+ "learning_rate": 7.593220338983051e-07,
20680
+ "loss": 1.2254,
20681
+ "step": 28880
20682
+ },
20683
+ {
20684
+ "epoch": 0.72,
20685
+ "grad_norm": 8.076458930969238,
20686
+ "learning_rate": 7.52542372881356e-07,
20687
+ "loss": 1.3264,
20688
+ "step": 28890
20689
+ },
20690
+ {
20691
+ "epoch": 0.72,
20692
+ "grad_norm": 11.618599891662598,
20693
+ "learning_rate": 7.457627118644069e-07,
20694
+ "loss": 1.174,
20695
+ "step": 28900
20696
+ },
20697
+ {
20698
+ "epoch": 0.72,
20699
+ "grad_norm": 10.995185852050781,
20700
+ "learning_rate": 7.389830508474577e-07,
20701
+ "loss": 1.21,
20702
+ "step": 28910
20703
+ },
20704
+ {
20705
+ "epoch": 0.72,
20706
+ "grad_norm": 12.011213302612305,
20707
+ "learning_rate": 7.322033898305085e-07,
20708
+ "loss": 1.3272,
20709
+ "step": 28920
20710
+ },
20711
+ {
20712
+ "epoch": 0.72,
20713
+ "grad_norm": 4.367415428161621,
20714
+ "learning_rate": 7.254237288135593e-07,
20715
+ "loss": 1.3997,
20716
+ "step": 28930
20717
+ },
20718
+ {
20719
+ "epoch": 0.72,
20720
+ "grad_norm": 11.401775360107422,
20721
+ "learning_rate": 7.186440677966103e-07,
20722
+ "loss": 1.3507,
20723
+ "step": 28940
20724
+ },
20725
+ {
20726
+ "epoch": 0.72,
20727
+ "grad_norm": 8.856404304504395,
20728
+ "learning_rate": 7.118644067796611e-07,
20729
+ "loss": 1.3274,
20730
+ "step": 28950
20731
+ },
20732
+ {
20733
+ "epoch": 0.72,
20734
+ "grad_norm": 5.873079776763916,
20735
+ "learning_rate": 7.05084745762712e-07,
20736
+ "loss": 1.2987,
20737
+ "step": 28960
20738
+ },
20739
+ {
20740
+ "epoch": 0.72,
20741
+ "grad_norm": 7.142796993255615,
20742
+ "learning_rate": 6.983050847457627e-07,
20743
+ "loss": 1.2049,
20744
+ "step": 28970
20745
+ },
20746
+ {
20747
+ "epoch": 0.72,
20748
+ "grad_norm": 5.9704179763793945,
20749
+ "learning_rate": 6.915254237288137e-07,
20750
+ "loss": 1.3481,
20751
+ "step": 28980
20752
+ },
20753
+ {
20754
+ "epoch": 0.72,
20755
+ "grad_norm": 10.48417854309082,
20756
+ "learning_rate": 6.847457627118645e-07,
20757
+ "loss": 1.3498,
20758
+ "step": 28990
20759
+ },
20760
+ {
20761
+ "epoch": 0.72,
20762
+ "grad_norm": 12.131574630737305,
20763
+ "learning_rate": 6.779661016949153e-07,
20764
+ "loss": 1.3529,
20765
+ "step": 29000
20766
+ },
20767
+ {
20768
+ "epoch": 0.72,
20769
+ "eval_loss": 1.3368879556655884,
20770
+ "eval_runtime": 66.2441,
20771
+ "eval_samples_per_second": 15.096,
20772
+ "eval_steps_per_second": 15.096,
20773
+ "step": 29000
20774
+ },
20775
+ {
20776
+ "epoch": 0.73,
20777
+ "grad_norm": 7.354034423828125,
20778
+ "learning_rate": 6.711864406779661e-07,
20779
+ "loss": 1.2698,
20780
+ "step": 29010
20781
+ },
20782
+ {
20783
+ "epoch": 0.73,
20784
+ "grad_norm": 3.9224977493286133,
20785
+ "learning_rate": 6.64406779661017e-07,
20786
+ "loss": 1.3377,
20787
+ "step": 29020
20788
+ },
20789
+ {
20790
+ "epoch": 0.73,
20791
+ "grad_norm": 8.08880615234375,
20792
+ "learning_rate": 6.576271186440679e-07,
20793
+ "loss": 1.4448,
20794
+ "step": 29030
20795
+ },
20796
+ {
20797
+ "epoch": 0.73,
20798
+ "grad_norm": 3.625216245651245,
20799
+ "learning_rate": 6.508474576271187e-07,
20800
+ "loss": 1.3561,
20801
+ "step": 29040
20802
+ },
20803
+ {
20804
+ "epoch": 0.73,
20805
+ "grad_norm": 13.905680656433105,
20806
+ "learning_rate": 6.440677966101695e-07,
20807
+ "loss": 1.2956,
20808
+ "step": 29050
20809
+ },
20810
+ {
20811
+ "epoch": 0.73,
20812
+ "grad_norm": 3.773040294647217,
20813
+ "learning_rate": 6.372881355932203e-07,
20814
+ "loss": 1.2934,
20815
+ "step": 29060
20816
+ },
20817
+ {
20818
+ "epoch": 0.73,
20819
+ "grad_norm": 5.38674259185791,
20820
+ "learning_rate": 6.305084745762713e-07,
20821
+ "loss": 1.3364,
20822
+ "step": 29070
20823
+ },
20824
+ {
20825
+ "epoch": 0.73,
20826
+ "grad_norm": 4.701632022857666,
20827
+ "learning_rate": 6.237288135593221e-07,
20828
+ "loss": 1.1754,
20829
+ "step": 29080
20830
+ },
20831
+ {
20832
+ "epoch": 0.73,
20833
+ "grad_norm": 8.243915557861328,
20834
+ "learning_rate": 6.16949152542373e-07,
20835
+ "loss": 1.4113,
20836
+ "step": 29090
20837
+ },
20838
+ {
20839
+ "epoch": 0.73,
20840
+ "grad_norm": 4.2352213859558105,
20841
+ "learning_rate": 6.101694915254238e-07,
20842
+ "loss": 1.4673,
20843
+ "step": 29100
20844
+ },
20845
+ {
20846
+ "epoch": 0.73,
20847
+ "grad_norm": 19.758262634277344,
20848
+ "learning_rate": 6.033898305084746e-07,
20849
+ "loss": 1.0756,
20850
+ "step": 29110
20851
+ },
20852
+ {
20853
+ "epoch": 0.73,
20854
+ "grad_norm": 9.024798393249512,
20855
+ "learning_rate": 5.966101694915255e-07,
20856
+ "loss": 1.1835,
20857
+ "step": 29120
20858
+ },
20859
+ {
20860
+ "epoch": 0.73,
20861
+ "grad_norm": 2.7573533058166504,
20862
+ "learning_rate": 5.898305084745763e-07,
20863
+ "loss": 1.3342,
20864
+ "step": 29130
20865
+ },
20866
+ {
20867
+ "epoch": 0.73,
20868
+ "grad_norm": 2.6472461223602295,
20869
+ "learning_rate": 5.830508474576271e-07,
20870
+ "loss": 1.478,
20871
+ "step": 29140
20872
+ },
20873
+ {
20874
+ "epoch": 0.73,
20875
+ "grad_norm": 8.194090843200684,
20876
+ "learning_rate": 5.76271186440678e-07,
20877
+ "loss": 1.2781,
20878
+ "step": 29150
20879
+ },
20880
+ {
20881
+ "epoch": 0.73,
20882
+ "grad_norm": 7.336125373840332,
20883
+ "learning_rate": 5.694915254237288e-07,
20884
+ "loss": 1.3871,
20885
+ "step": 29160
20886
+ },
20887
+ {
20888
+ "epoch": 0.73,
20889
+ "grad_norm": 9.800869941711426,
20890
+ "learning_rate": 5.627118644067797e-07,
20891
+ "loss": 1.237,
20892
+ "step": 29170
20893
+ },
20894
+ {
20895
+ "epoch": 0.73,
20896
+ "grad_norm": 3.7578229904174805,
20897
+ "learning_rate": 5.559322033898306e-07,
20898
+ "loss": 1.3408,
20899
+ "step": 29180
20900
+ },
20901
+ {
20902
+ "epoch": 0.73,
20903
+ "grad_norm": 11.04594612121582,
20904
+ "learning_rate": 5.491525423728815e-07,
20905
+ "loss": 1.4338,
20906
+ "step": 29190
20907
+ },
20908
+ {
20909
+ "epoch": 0.73,
20910
+ "grad_norm": 9.137775421142578,
20911
+ "learning_rate": 5.423728813559322e-07,
20912
+ "loss": 1.2292,
20913
+ "step": 29200
20914
+ },
20915
+ {
20916
+ "epoch": 0.73,
20917
+ "grad_norm": 6.266888618469238,
20918
+ "learning_rate": 5.355932203389831e-07,
20919
+ "loss": 1.3781,
20920
+ "step": 29210
20921
+ },
20922
+ {
20923
+ "epoch": 0.73,
20924
+ "grad_norm": 10.63198184967041,
20925
+ "learning_rate": 5.28813559322034e-07,
20926
+ "loss": 1.2394,
20927
+ "step": 29220
20928
+ },
20929
+ {
20930
+ "epoch": 0.73,
20931
+ "grad_norm": 16.352039337158203,
20932
+ "learning_rate": 5.220338983050848e-07,
20933
+ "loss": 1.3784,
20934
+ "step": 29230
20935
+ },
20936
+ {
20937
+ "epoch": 0.73,
20938
+ "grad_norm": 9.548080444335938,
20939
+ "learning_rate": 5.152542372881356e-07,
20940
+ "loss": 1.3201,
20941
+ "step": 29240
20942
+ },
20943
+ {
20944
+ "epoch": 0.73,
20945
+ "grad_norm": 2.56595516204834,
20946
+ "learning_rate": 5.084745762711865e-07,
20947
+ "loss": 1.2335,
20948
+ "step": 29250
20949
+ },
20950
+ {
20951
+ "epoch": 0.73,
20952
+ "grad_norm": 3.092132568359375,
20953
+ "learning_rate": 5.016949152542373e-07,
20954
+ "loss": 1.2999,
20955
+ "step": 29260
20956
+ },
20957
+ {
20958
+ "epoch": 0.73,
20959
+ "grad_norm": 12.341950416564941,
20960
+ "learning_rate": 4.949152542372881e-07,
20961
+ "loss": 1.2719,
20962
+ "step": 29270
20963
+ },
20964
+ {
20965
+ "epoch": 0.73,
20966
+ "grad_norm": 16.05048179626465,
20967
+ "learning_rate": 4.881355932203391e-07,
20968
+ "loss": 1.1151,
20969
+ "step": 29280
20970
+ },
20971
+ {
20972
+ "epoch": 0.73,
20973
+ "grad_norm": 8.888206481933594,
20974
+ "learning_rate": 4.813559322033898e-07,
20975
+ "loss": 1.0549,
20976
+ "step": 29290
20977
+ },
20978
+ {
20979
+ "epoch": 0.73,
20980
+ "grad_norm": 16.083812713623047,
20981
+ "learning_rate": 4.745762711864407e-07,
20982
+ "loss": 1.2443,
20983
+ "step": 29300
20984
+ },
20985
+ {
20986
+ "epoch": 0.73,
20987
+ "grad_norm": 5.3232741355896,
20988
+ "learning_rate": 4.6779661016949154e-07,
20989
+ "loss": 1.2953,
20990
+ "step": 29310
20991
+ },
20992
+ {
20993
+ "epoch": 0.73,
20994
+ "grad_norm": 15.959102630615234,
20995
+ "learning_rate": 4.610169491525424e-07,
20996
+ "loss": 1.282,
20997
+ "step": 29320
20998
+ },
20999
+ {
21000
+ "epoch": 0.73,
21001
+ "grad_norm": 8.539166450500488,
21002
+ "learning_rate": 4.542372881355932e-07,
21003
+ "loss": 1.2538,
21004
+ "step": 29330
21005
+ },
21006
+ {
21007
+ "epoch": 0.73,
21008
+ "grad_norm": 4.907639980316162,
21009
+ "learning_rate": 4.4745762711864415e-07,
21010
+ "loss": 1.3666,
21011
+ "step": 29340
21012
+ },
21013
+ {
21014
+ "epoch": 0.73,
21015
+ "grad_norm": 2.455517530441284,
21016
+ "learning_rate": 4.4067796610169497e-07,
21017
+ "loss": 1.2269,
21018
+ "step": 29350
21019
+ },
21020
+ {
21021
+ "epoch": 0.73,
21022
+ "grad_norm": 9.032910346984863,
21023
+ "learning_rate": 4.3389830508474584e-07,
21024
+ "loss": 1.2813,
21025
+ "step": 29360
21026
+ },
21027
+ {
21028
+ "epoch": 0.73,
21029
+ "grad_norm": 2.8847286701202393,
21030
+ "learning_rate": 4.2711864406779666e-07,
21031
+ "loss": 1.3336,
21032
+ "step": 29370
21033
+ },
21034
+ {
21035
+ "epoch": 0.73,
21036
+ "grad_norm": 11.284420013427734,
21037
+ "learning_rate": 4.2033898305084753e-07,
21038
+ "loss": 1.286,
21039
+ "step": 29380
21040
+ },
21041
+ {
21042
+ "epoch": 0.73,
21043
+ "grad_norm": 11.412368774414062,
21044
+ "learning_rate": 4.1355932203389835e-07,
21045
+ "loss": 1.3317,
21046
+ "step": 29390
21047
+ },
21048
+ {
21049
+ "epoch": 0.73,
21050
+ "grad_norm": 4.6027302742004395,
21051
+ "learning_rate": 4.0677966101694916e-07,
21052
+ "loss": 1.3263,
21053
+ "step": 29400
21054
+ },
21055
+ {
21056
+ "epoch": 0.74,
21057
+ "grad_norm": 2.39072322845459,
21058
+ "learning_rate": 4.0000000000000003e-07,
21059
+ "loss": 1.3174,
21060
+ "step": 29410
21061
+ },
21062
+ {
21063
+ "epoch": 0.74,
21064
+ "grad_norm": 2.7101926803588867,
21065
+ "learning_rate": 3.9322033898305085e-07,
21066
+ "loss": 1.2673,
21067
+ "step": 29420
21068
+ },
21069
+ {
21070
+ "epoch": 0.74,
21071
+ "grad_norm": 10.152178764343262,
21072
+ "learning_rate": 3.864406779661017e-07,
21073
+ "loss": 1.3241,
21074
+ "step": 29430
21075
+ },
21076
+ {
21077
+ "epoch": 0.74,
21078
+ "grad_norm": 4.905571937561035,
21079
+ "learning_rate": 3.7966101694915254e-07,
21080
+ "loss": 1.2261,
21081
+ "step": 29440
21082
+ },
21083
+ {
21084
+ "epoch": 0.74,
21085
+ "grad_norm": 10.35471248626709,
21086
+ "learning_rate": 3.7288135593220347e-07,
21087
+ "loss": 1.2083,
21088
+ "step": 29450
21089
+ },
21090
+ {
21091
+ "epoch": 0.74,
21092
+ "grad_norm": 11.7713623046875,
21093
+ "learning_rate": 3.6610169491525423e-07,
21094
+ "loss": 1.2647,
21095
+ "step": 29460
21096
+ },
21097
+ {
21098
+ "epoch": 0.74,
21099
+ "grad_norm": 3.181910276412964,
21100
+ "learning_rate": 3.5932203389830516e-07,
21101
+ "loss": 1.4678,
21102
+ "step": 29470
21103
+ },
21104
+ {
21105
+ "epoch": 0.74,
21106
+ "grad_norm": 6.756598472595215,
21107
+ "learning_rate": 3.52542372881356e-07,
21108
+ "loss": 1.273,
21109
+ "step": 29480
21110
+ },
21111
+ {
21112
+ "epoch": 0.74,
21113
+ "grad_norm": 12.660133361816406,
21114
+ "learning_rate": 3.4576271186440684e-07,
21115
+ "loss": 1.3497,
21116
+ "step": 29490
21117
+ },
21118
+ {
21119
+ "epoch": 0.74,
21120
+ "grad_norm": 11.418837547302246,
21121
+ "learning_rate": 3.3898305084745766e-07,
21122
+ "loss": 1.3025,
21123
+ "step": 29500
21124
+ },
21125
+ {
21126
+ "epoch": 0.74,
21127
+ "eval_loss": 1.2983591556549072,
21128
+ "eval_runtime": 66.2327,
21129
+ "eval_samples_per_second": 15.098,
21130
+ "eval_steps_per_second": 15.098,
21131
+ "step": 29500
21132
+ },
21133
+ {
21134
+ "epoch": 0.74,
21135
+ "grad_norm": 5.526192665100098,
21136
+ "learning_rate": 3.322033898305085e-07,
21137
+ "loss": 1.1364,
21138
+ "step": 29510
21139
+ },
21140
+ {
21141
+ "epoch": 0.74,
21142
+ "grad_norm": 4.229588031768799,
21143
+ "learning_rate": 3.2542372881355935e-07,
21144
+ "loss": 1.3351,
21145
+ "step": 29520
21146
+ },
21147
+ {
21148
+ "epoch": 0.74,
21149
+ "grad_norm": 3.8703114986419678,
21150
+ "learning_rate": 3.1864406779661017e-07,
21151
+ "loss": 1.547,
21152
+ "step": 29530
21153
+ },
21154
+ {
21155
+ "epoch": 0.74,
21156
+ "grad_norm": 4.0574445724487305,
21157
+ "learning_rate": 3.1186440677966104e-07,
21158
+ "loss": 1.5269,
21159
+ "step": 29540
21160
+ },
21161
+ {
21162
+ "epoch": 0.74,
21163
+ "grad_norm": 2.609175443649292,
21164
+ "learning_rate": 3.050847457627119e-07,
21165
+ "loss": 1.3574,
21166
+ "step": 29550
21167
+ },
21168
+ {
21169
+ "epoch": 0.74,
21170
+ "grad_norm": 5.445046901702881,
21171
+ "learning_rate": 2.9830508474576273e-07,
21172
+ "loss": 1.3081,
21173
+ "step": 29560
21174
+ },
21175
+ {
21176
+ "epoch": 0.74,
21177
+ "grad_norm": 8.432710647583008,
21178
+ "learning_rate": 2.9152542372881355e-07,
21179
+ "loss": 1.2343,
21180
+ "step": 29570
21181
+ },
21182
+ {
21183
+ "epoch": 0.74,
21184
+ "grad_norm": 10.050646781921387,
21185
+ "learning_rate": 2.847457627118644e-07,
21186
+ "loss": 1.2665,
21187
+ "step": 29580
21188
+ },
21189
+ {
21190
+ "epoch": 0.74,
21191
+ "grad_norm": 6.895621299743652,
21192
+ "learning_rate": 2.779661016949153e-07,
21193
+ "loss": 1.1077,
21194
+ "step": 29590
21195
+ },
21196
+ {
21197
+ "epoch": 0.74,
21198
+ "grad_norm": 6.045702934265137,
21199
+ "learning_rate": 2.711864406779661e-07,
21200
+ "loss": 1.0306,
21201
+ "step": 29600
21202
+ },
21203
+ {
21204
+ "epoch": 0.74,
21205
+ "grad_norm": 2.18096661567688,
21206
+ "learning_rate": 2.64406779661017e-07,
21207
+ "loss": 1.167,
21208
+ "step": 29610
21209
+ },
21210
+ {
21211
+ "epoch": 0.74,
21212
+ "grad_norm": 8.721917152404785,
21213
+ "learning_rate": 2.576271186440678e-07,
21214
+ "loss": 1.2476,
21215
+ "step": 29620
21216
+ },
21217
+ {
21218
+ "epoch": 0.74,
21219
+ "grad_norm": 5.546863079071045,
21220
+ "learning_rate": 2.5084745762711867e-07,
21221
+ "loss": 1.3273,
21222
+ "step": 29630
21223
+ },
21224
+ {
21225
+ "epoch": 0.74,
21226
+ "grad_norm": 7.264005661010742,
21227
+ "learning_rate": 2.4406779661016954e-07,
21228
+ "loss": 1.4333,
21229
+ "step": 29640
21230
+ },
21231
+ {
21232
+ "epoch": 0.74,
21233
+ "grad_norm": 4.217822551727295,
21234
+ "learning_rate": 2.3728813559322036e-07,
21235
+ "loss": 1.5513,
21236
+ "step": 29650
21237
+ },
21238
+ {
21239
+ "epoch": 0.74,
21240
+ "grad_norm": 21.425151824951172,
21241
+ "learning_rate": 2.305084745762712e-07,
21242
+ "loss": 1.15,
21243
+ "step": 29660
21244
+ },
21245
+ {
21246
+ "epoch": 0.74,
21247
+ "grad_norm": 17.36193084716797,
21248
+ "learning_rate": 2.2372881355932207e-07,
21249
+ "loss": 1.4419,
21250
+ "step": 29670
21251
+ },
21252
+ {
21253
+ "epoch": 0.74,
21254
+ "grad_norm": 9.74802017211914,
21255
+ "learning_rate": 2.1694915254237292e-07,
21256
+ "loss": 1.2543,
21257
+ "step": 29680
21258
+ },
21259
+ {
21260
+ "epoch": 0.74,
21261
+ "grad_norm": 12.033125877380371,
21262
+ "learning_rate": 2.1016949152542376e-07,
21263
+ "loss": 1.378,
21264
+ "step": 29690
21265
+ },
21266
+ {
21267
+ "epoch": 0.74,
21268
+ "grad_norm": 14.735278129577637,
21269
+ "learning_rate": 2.0338983050847458e-07,
21270
+ "loss": 1.2712,
21271
+ "step": 29700
21272
+ },
21273
+ {
21274
+ "epoch": 0.74,
21275
+ "grad_norm": 11.541913986206055,
21276
+ "learning_rate": 1.9661016949152543e-07,
21277
+ "loss": 1.0699,
21278
+ "step": 29710
21279
+ },
21280
+ {
21281
+ "epoch": 0.74,
21282
+ "grad_norm": 3.0062718391418457,
21283
+ "learning_rate": 1.8983050847457627e-07,
21284
+ "loss": 1.3865,
21285
+ "step": 29720
21286
+ },
21287
+ {
21288
+ "epoch": 0.74,
21289
+ "grad_norm": 7.821793079376221,
21290
+ "learning_rate": 1.8305084745762712e-07,
21291
+ "loss": 1.1679,
21292
+ "step": 29730
21293
+ },
21294
+ {
21295
+ "epoch": 0.74,
21296
+ "grad_norm": 12.209012031555176,
21297
+ "learning_rate": 1.76271186440678e-07,
21298
+ "loss": 1.1567,
21299
+ "step": 29740
21300
+ },
21301
+ {
21302
+ "epoch": 0.74,
21303
+ "grad_norm": 3.285269021987915,
21304
+ "learning_rate": 1.6949152542372883e-07,
21305
+ "loss": 1.3667,
21306
+ "step": 29750
21307
+ },
21308
+ {
21309
+ "epoch": 0.74,
21310
+ "grad_norm": 5.812708377838135,
21311
+ "learning_rate": 1.6271186440677968e-07,
21312
+ "loss": 1.2521,
21313
+ "step": 29760
21314
+ },
21315
+ {
21316
+ "epoch": 0.74,
21317
+ "grad_norm": 3.4106059074401855,
21318
+ "learning_rate": 1.5593220338983052e-07,
21319
+ "loss": 1.3847,
21320
+ "step": 29770
21321
+ },
21322
+ {
21323
+ "epoch": 0.74,
21324
+ "grad_norm": 9.550897598266602,
21325
+ "learning_rate": 1.4915254237288137e-07,
21326
+ "loss": 1.2853,
21327
+ "step": 29780
21328
+ },
21329
+ {
21330
+ "epoch": 0.74,
21331
+ "grad_norm": 13.644095420837402,
21332
+ "learning_rate": 1.423728813559322e-07,
21333
+ "loss": 1.3305,
21334
+ "step": 29790
21335
+ },
21336
+ {
21337
+ "epoch": 0.74,
21338
+ "grad_norm": 6.250187397003174,
21339
+ "learning_rate": 1.3559322033898305e-07,
21340
+ "loss": 1.2896,
21341
+ "step": 29800
21342
+ },
21343
+ {
21344
+ "epoch": 0.75,
21345
+ "grad_norm": 11.07441234588623,
21346
+ "learning_rate": 1.288135593220339e-07,
21347
+ "loss": 1.2263,
21348
+ "step": 29810
21349
+ },
21350
+ {
21351
+ "epoch": 0.75,
21352
+ "grad_norm": 13.597912788391113,
21353
+ "learning_rate": 1.2203389830508477e-07,
21354
+ "loss": 1.4195,
21355
+ "step": 29820
21356
+ },
21357
+ {
21358
+ "epoch": 0.75,
21359
+ "grad_norm": 4.825064659118652,
21360
+ "learning_rate": 1.152542372881356e-07,
21361
+ "loss": 1.3549,
21362
+ "step": 29830
21363
+ },
21364
+ {
21365
+ "epoch": 0.75,
21366
+ "grad_norm": 3.6931655406951904,
21367
+ "learning_rate": 1.0847457627118646e-07,
21368
+ "loss": 1.3675,
21369
+ "step": 29840
21370
+ },
21371
+ {
21372
+ "epoch": 0.75,
21373
+ "grad_norm": 5.718669414520264,
21374
+ "learning_rate": 1.0169491525423729e-07,
21375
+ "loss": 1.4715,
21376
+ "step": 29850
21377
+ },
21378
+ {
21379
+ "epoch": 0.75,
21380
+ "grad_norm": 6.913152694702148,
21381
+ "learning_rate": 9.491525423728814e-08,
21382
+ "loss": 1.4853,
21383
+ "step": 29860
21384
+ },
21385
+ {
21386
+ "epoch": 0.75,
21387
+ "grad_norm": 5.6454644203186035,
21388
+ "learning_rate": 8.8135593220339e-08,
21389
+ "loss": 1.3638,
21390
+ "step": 29870
21391
+ },
21392
+ {
21393
+ "epoch": 0.75,
21394
+ "grad_norm": 7.006107330322266,
21395
+ "learning_rate": 8.135593220338984e-08,
21396
+ "loss": 1.387,
21397
+ "step": 29880
21398
+ },
21399
+ {
21400
+ "epoch": 0.75,
21401
+ "grad_norm": 7.425577163696289,
21402
+ "learning_rate": 7.457627118644068e-08,
21403
+ "loss": 1.314,
21404
+ "step": 29890
21405
+ },
21406
+ {
21407
+ "epoch": 0.75,
21408
+ "grad_norm": 12.527627944946289,
21409
+ "learning_rate": 6.779661016949153e-08,
21410
+ "loss": 1.3822,
21411
+ "step": 29900
21412
+ },
21413
+ {
21414
+ "epoch": 0.75,
21415
+ "grad_norm": 7.276275634765625,
21416
+ "learning_rate": 6.101694915254239e-08,
21417
+ "loss": 1.1887,
21418
+ "step": 29910
21419
+ },
21420
+ {
21421
+ "epoch": 0.75,
21422
+ "grad_norm": 1.8828225135803223,
21423
+ "learning_rate": 5.423728813559323e-08,
21424
+ "loss": 1.3262,
21425
+ "step": 29920
21426
+ },
21427
+ {
21428
+ "epoch": 0.75,
21429
+ "grad_norm": 3.2337329387664795,
21430
+ "learning_rate": 4.745762711864407e-08,
21431
+ "loss": 1.3487,
21432
+ "step": 29930
21433
+ },
21434
+ {
21435
+ "epoch": 0.75,
21436
+ "grad_norm": 6.47509765625,
21437
+ "learning_rate": 4.067796610169492e-08,
21438
+ "loss": 1.3835,
21439
+ "step": 29940
21440
+ },
21441
+ {
21442
+ "epoch": 0.75,
21443
+ "grad_norm": 8.018141746520996,
21444
+ "learning_rate": 3.3898305084745764e-08,
21445
+ "loss": 1.1818,
21446
+ "step": 29950
21447
+ },
21448
+ {
21449
+ "epoch": 0.75,
21450
+ "grad_norm": 6.676591396331787,
21451
+ "learning_rate": 2.7118644067796615e-08,
21452
+ "loss": 1.2067,
21453
+ "step": 29960
21454
+ },
21455
+ {
21456
+ "epoch": 0.75,
21457
+ "grad_norm": 3.872598648071289,
21458
+ "learning_rate": 2.033898305084746e-08,
21459
+ "loss": 1.2989,
21460
+ "step": 29970
21461
+ },
21462
+ {
21463
+ "epoch": 0.75,
21464
+ "grad_norm": 2.7663283348083496,
21465
+ "learning_rate": 1.3559322033898307e-08,
21466
+ "loss": 1.321,
21467
+ "step": 29980
21468
+ },
21469
+ {
21470
+ "epoch": 0.75,
21471
+ "grad_norm": 6.447133541107178,
21472
+ "learning_rate": 6.779661016949154e-09,
21473
+ "loss": 1.255,
21474
+ "step": 29990
21475
+ },
21476
+ {
21477
+ "epoch": 0.75,
21478
+ "grad_norm": 5.224142074584961,
21479
+ "learning_rate": 0.0,
21480
+ "loss": 1.2453,
21481
+ "step": 30000
21482
+ },
21483
+ {
21484
+ "epoch": 0.75,
21485
+ "eval_loss": 1.307248592376709,
21486
+ "eval_runtime": 66.2202,
21487
+ "eval_samples_per_second": 15.101,
21488
+ "eval_steps_per_second": 15.101,
21489
+ "step": 30000
21490
  }
21491
  ],
21492
  "logging_steps": 10,
 
21494
  "num_input_tokens_seen": 0,
21495
  "num_train_epochs": 1,
21496
  "save_steps": 2500,
21497
+ "total_flos": 4.8306377981952e+17,
21498
  "train_batch_size": 1,
21499
  "trial_name": null,
21500
  "trial_params": null