jdannem6 committed on
Commit 1136924
1 Parent(s): 4f748ac

Uploaded checkpoint-27500

Files changed (5):
  1. adapter_model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1795 -5
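
The five files above are the standard artifacts Hugging Face Trainer writes for a LoRA checkpoint: adapter_model.safetensors holds the updated adapter weights, optimizer.pt and scheduler.pt the optimizer and LR-scheduler state, rng_state.pth the random-number state, and trainer_state.json the training log. As a minimal sketch only, here is how such a checkpoint is typically loaded for inference; the base model id is an assumption (the run name only suggests a DeepSeek base), and the checkpoint path is the one recorded in trainer_state.json.

# Hypothetical sketch: load the uploaded LoRA adapter for inference.
# BASE is an assumed model id, not stated anywhere in this commit.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE = "deepseek-ai/deepseek-coder-1.3b-base"                  # assumption
CKPT = "runs/deepseek_lora_20240422-165831/checkpoint-27500"   # path from trainer_state.json

tokenizer = AutoTokenizer.from_pretrained(BASE)
base_model = AutoModelForCausalLM.from_pretrained(BASE, torch_dtype=torch.float16)
model = PeftModel.from_pretrained(base_model, CKPT)            # reads adapter_model.safetensors
model.eval()

inputs = tokenizer("def fibonacci(n):", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Resuming training rather than running inference would instead go through Trainer(...).train(resume_from_checkpoint=CKPT), which is what optimizer.pt, scheduler.pt and rng_state.pth are saved for.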
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:58ea76fcf16a2912a570cf295dd1757cd9562cb7f7f8e74d37938855d31dc866
+ oid sha256:d97fc9daed8fc42b8286be9a22db8d8f0c98b367d6f684f8724075b9c509868c
  size 119975656
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:231c6b24970ef58291d1980aeb742ace763101289d628ec3f4ac808335924d18
+ oid sha256:da5ec1b3aa5d12330bda5fc479f69a184c2145592be7a96f6bd1ace39646aaf8
  size 240145026
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c2bd031f30ceb89483d2d8b5eb187850133dcc5a689162e8975b2cc0e61b4001
+ oid sha256:42ed1734c5823abfe806343a4de18dcccd1e9ad5af5349e08097c7bde2aa7437
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e3bdbaa37c77733a3ea9eb90a36bc290f4f5b9f56abe23cc6586cbaa459f92c6
+ oid sha256:bae572518ab53ddc674f52a5ef01613875bea64a8d9c53d4b7d4a9aedc712f19
  size 1064
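
Each of the four binary diffs above changes only a Git LFS pointer: a three-line text stub giving the spec version, the SHA-256 of the actual payload, and its size in bytes (the sizes stay fixed here, so only the content hashes move). A minimal sketch of checking a downloaded payload against such a pointer, with placeholder file names:

# Minimal sketch: verify an LFS payload against its pointer file.
# File names below are placeholders, not paths taken from this commit.
import hashlib
import os

def parse_lfs_pointer(path):
    # Pointer files are plain "key value" lines: version, oid, size.
    fields = {}
    with open(path, "r", encoding="utf-8") as fh:
        for line in fh:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

def matches_pointer(pointer_path, payload_path):
    fields = parse_lfs_pointer(pointer_path)
    expected_hash = fields["oid"].split(":", 1)[1]       # "sha256:<hex>" -> <hex>
    expected_size = int(fields["size"])

    digest = hashlib.sha256()
    with open(payload_path, "rb") as fh:
        for chunk in iter(lambda: fh.read(1 << 20), b""):
            digest.update(chunk)

    return (digest.hexdigest() == expected_hash
            and os.path.getsize(payload_path) == expected_size)

print(matches_pointer("optimizer.pt.pointer", "optimizer.pt"))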
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
- "best_metric": 1.3409814834594727,
- "best_model_checkpoint": "runs/deepseek_lora_20240422-165831/checkpoint-25000",
- "epoch": 0.625,
  "eval_steps": 500,
- "global_step": 25000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -17907,6 +17907,1796 @@
  "eval_samples_per_second": 15.125,
  "eval_steps_per_second": 15.125,
  "step": 25000
  }
  ],
  "logging_steps": 10,
@@ -17914,7 +19704,7 @@
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 2500,
- "total_flos": 4.025531498496e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
 
  {
+ "best_metric": 1.262895941734314,
+ "best_model_checkpoint": "runs/deepseek_lora_20240422-165831/checkpoint-27500",
+ "epoch": 0.6875,
  "eval_steps": 500,
+ "global_step": 27500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
 
17907
  "eval_samples_per_second": 15.125,
17908
  "eval_steps_per_second": 15.125,
17909
  "step": 25000
17910
+ },
17911
+ {
17912
+ "epoch": 0.63,
17913
+ "grad_norm": 7.775777339935303,
17914
+ "learning_rate": 3.3830508474576273e-06,
17915
+ "loss": 1.5704,
17916
+ "step": 25010
17917
+ },
17918
+ {
17919
+ "epoch": 0.63,
17920
+ "grad_norm": 6.8910698890686035,
17921
+ "learning_rate": 3.3762711864406783e-06,
17922
+ "loss": 1.2296,
17923
+ "step": 25020
17924
+ },
17925
+ {
17926
+ "epoch": 0.63,
17927
+ "grad_norm": 3.11647367477417,
17928
+ "learning_rate": 3.3694915254237292e-06,
17929
+ "loss": 1.1465,
17930
+ "step": 25030
17931
+ },
17932
+ {
17933
+ "epoch": 0.63,
17934
+ "grad_norm": 4.026238441467285,
17935
+ "learning_rate": 3.3627118644067802e-06,
17936
+ "loss": 1.4266,
17937
+ "step": 25040
17938
+ },
17939
+ {
17940
+ "epoch": 0.63,
17941
+ "grad_norm": 3.3097872734069824,
17942
+ "learning_rate": 3.3559322033898308e-06,
17943
+ "loss": 1.4,
17944
+ "step": 25050
17945
+ },
17946
+ {
17947
+ "epoch": 0.63,
17948
+ "grad_norm": 3.2518560886383057,
17949
+ "learning_rate": 3.3491525423728817e-06,
17950
+ "loss": 1.3535,
17951
+ "step": 25060
17952
+ },
17953
+ {
17954
+ "epoch": 0.63,
17955
+ "grad_norm": 4.915974140167236,
17956
+ "learning_rate": 3.3423728813559327e-06,
17957
+ "loss": 1.3441,
17958
+ "step": 25070
17959
+ },
17960
+ {
17961
+ "epoch": 0.63,
17962
+ "grad_norm": 4.099907398223877,
17963
+ "learning_rate": 3.3355932203389833e-06,
17964
+ "loss": 1.4649,
17965
+ "step": 25080
17966
+ },
17967
+ {
17968
+ "epoch": 0.63,
17969
+ "grad_norm": 7.00651741027832,
17970
+ "learning_rate": 3.3288135593220343e-06,
17971
+ "loss": 1.4445,
17972
+ "step": 25090
17973
+ },
17974
+ {
17975
+ "epoch": 0.63,
17976
+ "grad_norm": 2.7813143730163574,
17977
+ "learning_rate": 3.322033898305085e-06,
17978
+ "loss": 1.2997,
17979
+ "step": 25100
17980
+ },
17981
+ {
17982
+ "epoch": 0.63,
17983
+ "grad_norm": 4.583659648895264,
17984
+ "learning_rate": 3.3152542372881358e-06,
17985
+ "loss": 1.3112,
17986
+ "step": 25110
17987
+ },
17988
+ {
17989
+ "epoch": 0.63,
17990
+ "grad_norm": 9.5046968460083,
17991
+ "learning_rate": 3.3084745762711868e-06,
17992
+ "loss": 1.3488,
17993
+ "step": 25120
17994
+ },
17995
+ {
17996
+ "epoch": 0.63,
17997
+ "grad_norm": 4.829354286193848,
17998
+ "learning_rate": 3.3016949152542377e-06,
17999
+ "loss": 1.2801,
18000
+ "step": 25130
18001
+ },
18002
+ {
18003
+ "epoch": 0.63,
18004
+ "grad_norm": 8.664487838745117,
18005
+ "learning_rate": 3.2949152542372887e-06,
18006
+ "loss": 1.3001,
18007
+ "step": 25140
18008
+ },
18009
+ {
18010
+ "epoch": 0.63,
18011
+ "grad_norm": 1.5184565782546997,
18012
+ "learning_rate": 3.288135593220339e-06,
18013
+ "loss": 1.3178,
18014
+ "step": 25150
18015
+ },
18016
+ {
18017
+ "epoch": 0.63,
18018
+ "grad_norm": 2.527219295501709,
18019
+ "learning_rate": 3.28135593220339e-06,
18020
+ "loss": 1.1742,
18021
+ "step": 25160
18022
+ },
18023
+ {
18024
+ "epoch": 0.63,
18025
+ "grad_norm": 1.9469351768493652,
18026
+ "learning_rate": 3.274576271186441e-06,
18027
+ "loss": 1.3338,
18028
+ "step": 25170
18029
+ },
18030
+ {
18031
+ "epoch": 0.63,
18032
+ "grad_norm": 9.183582305908203,
18033
+ "learning_rate": 3.2677966101694918e-06,
18034
+ "loss": 1.2689,
18035
+ "step": 25180
18036
+ },
18037
+ {
18038
+ "epoch": 0.63,
18039
+ "grad_norm": 8.95080280303955,
18040
+ "learning_rate": 3.2610169491525428e-06,
18041
+ "loss": 1.4399,
18042
+ "step": 25190
18043
+ },
18044
+ {
18045
+ "epoch": 0.63,
18046
+ "grad_norm": 3.5885729789733887,
18047
+ "learning_rate": 3.2542372881355933e-06,
18048
+ "loss": 1.2411,
18049
+ "step": 25200
18050
+ },
18051
+ {
18052
+ "epoch": 0.63,
18053
+ "grad_norm": 6.78897762298584,
18054
+ "learning_rate": 3.2474576271186443e-06,
18055
+ "loss": 1.1516,
18056
+ "step": 25210
18057
+ },
18058
+ {
18059
+ "epoch": 0.63,
18060
+ "grad_norm": 16.358980178833008,
18061
+ "learning_rate": 3.2406779661016953e-06,
18062
+ "loss": 1.3401,
18063
+ "step": 25220
18064
+ },
18065
+ {
18066
+ "epoch": 0.63,
18067
+ "grad_norm": 13.469403266906738,
18068
+ "learning_rate": 3.2338983050847462e-06,
18069
+ "loss": 1.1778,
18070
+ "step": 25230
18071
+ },
18072
+ {
18073
+ "epoch": 0.63,
18074
+ "grad_norm": 9.235873222351074,
18075
+ "learning_rate": 3.2271186440677972e-06,
18076
+ "loss": 1.463,
18077
+ "step": 25240
18078
+ },
18079
+ {
18080
+ "epoch": 0.63,
18081
+ "grad_norm": 5.634754657745361,
18082
+ "learning_rate": 3.2203389830508473e-06,
18083
+ "loss": 1.4592,
18084
+ "step": 25250
18085
+ },
18086
+ {
18087
+ "epoch": 0.63,
18088
+ "grad_norm": 7.916177749633789,
18089
+ "learning_rate": 3.2135593220338983e-06,
18090
+ "loss": 1.2284,
18091
+ "step": 25260
18092
+ },
18093
+ {
18094
+ "epoch": 0.63,
18095
+ "grad_norm": 3.810818672180176,
18096
+ "learning_rate": 3.2067796610169493e-06,
18097
+ "loss": 1.3948,
18098
+ "step": 25270
18099
+ },
18100
+ {
18101
+ "epoch": 0.63,
18102
+ "grad_norm": 3.151276111602783,
18103
+ "learning_rate": 3.2000000000000003e-06,
18104
+ "loss": 1.3252,
18105
+ "step": 25280
18106
+ },
18107
+ {
18108
+ "epoch": 0.63,
18109
+ "grad_norm": 4.654315948486328,
18110
+ "learning_rate": 3.1932203389830513e-06,
18111
+ "loss": 1.3577,
18112
+ "step": 25290
18113
+ },
18114
+ {
18115
+ "epoch": 0.63,
18116
+ "grad_norm": 7.808940410614014,
18117
+ "learning_rate": 3.186440677966102e-06,
18118
+ "loss": 1.419,
18119
+ "step": 25300
18120
+ },
18121
+ {
18122
+ "epoch": 0.63,
18123
+ "grad_norm": 6.512801647186279,
18124
+ "learning_rate": 3.1796610169491528e-06,
18125
+ "loss": 1.2751,
18126
+ "step": 25310
18127
+ },
18128
+ {
18129
+ "epoch": 0.63,
18130
+ "grad_norm": 8.710541725158691,
18131
+ "learning_rate": 3.1728813559322038e-06,
18132
+ "loss": 1.2923,
18133
+ "step": 25320
18134
+ },
18135
+ {
18136
+ "epoch": 0.63,
18137
+ "grad_norm": 3.160752296447754,
18138
+ "learning_rate": 3.1661016949152547e-06,
18139
+ "loss": 1.3357,
18140
+ "step": 25330
18141
+ },
18142
+ {
18143
+ "epoch": 0.63,
18144
+ "grad_norm": 2.4420571327209473,
18145
+ "learning_rate": 3.1593220338983053e-06,
18146
+ "loss": 1.4122,
18147
+ "step": 25340
18148
+ },
18149
+ {
18150
+ "epoch": 0.63,
18151
+ "grad_norm": 12.090134620666504,
18152
+ "learning_rate": 3.1525423728813563e-06,
18153
+ "loss": 1.1703,
18154
+ "step": 25350
18155
+ },
18156
+ {
18157
+ "epoch": 0.63,
18158
+ "grad_norm": 5.109529972076416,
18159
+ "learning_rate": 3.145762711864407e-06,
18160
+ "loss": 1.4391,
18161
+ "step": 25360
18162
+ },
18163
+ {
18164
+ "epoch": 0.63,
18165
+ "grad_norm": 3.0033655166625977,
18166
+ "learning_rate": 3.138983050847458e-06,
18167
+ "loss": 1.2308,
18168
+ "step": 25370
18169
+ },
18170
+ {
18171
+ "epoch": 0.63,
18172
+ "grad_norm": 6.253593921661377,
18173
+ "learning_rate": 3.1322033898305088e-06,
18174
+ "loss": 1.1566,
18175
+ "step": 25380
18176
+ },
18177
+ {
18178
+ "epoch": 0.63,
18179
+ "grad_norm": 5.59571647644043,
18180
+ "learning_rate": 3.1254237288135598e-06,
18181
+ "loss": 1.3234,
18182
+ "step": 25390
18183
+ },
18184
+ {
18185
+ "epoch": 0.64,
18186
+ "grad_norm": 15.45341968536377,
18187
+ "learning_rate": 3.1186440677966107e-06,
18188
+ "loss": 1.3298,
18189
+ "step": 25400
18190
+ },
18191
+ {
18192
+ "epoch": 0.64,
18193
+ "grad_norm": 4.012663841247559,
18194
+ "learning_rate": 3.111864406779661e-06,
18195
+ "loss": 1.3515,
18196
+ "step": 25410
18197
+ },
18198
+ {
18199
+ "epoch": 0.64,
18200
+ "grad_norm": 11.210050582885742,
18201
+ "learning_rate": 3.105084745762712e-06,
18202
+ "loss": 1.5219,
18203
+ "step": 25420
18204
+ },
18205
+ {
18206
+ "epoch": 0.64,
18207
+ "grad_norm": 12.784728050231934,
18208
+ "learning_rate": 3.098305084745763e-06,
18209
+ "loss": 1.1988,
18210
+ "step": 25430
18211
+ },
18212
+ {
18213
+ "epoch": 0.64,
18214
+ "grad_norm": 4.20169734954834,
18215
+ "learning_rate": 3.091525423728814e-06,
18216
+ "loss": 1.421,
18217
+ "step": 25440
18218
+ },
18219
+ {
18220
+ "epoch": 0.64,
18221
+ "grad_norm": 6.456515312194824,
18222
+ "learning_rate": 3.0847457627118648e-06,
18223
+ "loss": 1.1633,
18224
+ "step": 25450
18225
+ },
18226
+ {
18227
+ "epoch": 0.64,
18228
+ "grad_norm": 6.611431121826172,
18229
+ "learning_rate": 3.0779661016949153e-06,
18230
+ "loss": 1.2853,
18231
+ "step": 25460
18232
+ },
18233
+ {
18234
+ "epoch": 0.64,
18235
+ "grad_norm": 4.3997015953063965,
18236
+ "learning_rate": 3.0711864406779663e-06,
18237
+ "loss": 1.2609,
18238
+ "step": 25470
18239
+ },
18240
+ {
18241
+ "epoch": 0.64,
18242
+ "grad_norm": 12.016313552856445,
18243
+ "learning_rate": 3.0644067796610173e-06,
18244
+ "loss": 1.1744,
18245
+ "step": 25480
18246
+ },
18247
+ {
18248
+ "epoch": 0.64,
18249
+ "grad_norm": 5.508640289306641,
18250
+ "learning_rate": 3.0576271186440683e-06,
18251
+ "loss": 1.2545,
18252
+ "step": 25490
18253
+ },
18254
+ {
18255
+ "epoch": 0.64,
18256
+ "grad_norm": 7.7658305168151855,
18257
+ "learning_rate": 3.0508474576271192e-06,
18258
+ "loss": 1.132,
18259
+ "step": 25500
18260
+ },
18261
+ {
18262
+ "epoch": 0.64,
18263
+ "eval_loss": 1.3264555931091309,
18264
+ "eval_runtime": 66.1247,
18265
+ "eval_samples_per_second": 15.123,
18266
+ "eval_steps_per_second": 15.123,
18267
+ "step": 25500
18268
+ },
18269
+ {
18270
+ "epoch": 0.64,
18271
+ "grad_norm": 4.297203063964844,
18272
+ "learning_rate": 3.0440677966101694e-06,
18273
+ "loss": 1.0663,
18274
+ "step": 25510
18275
+ },
18276
+ {
18277
+ "epoch": 0.64,
18278
+ "grad_norm": 12.412454605102539,
18279
+ "learning_rate": 3.0372881355932203e-06,
18280
+ "loss": 1.1782,
18281
+ "step": 25520
18282
+ },
18283
+ {
18284
+ "epoch": 0.64,
18285
+ "grad_norm": 10.640459060668945,
18286
+ "learning_rate": 3.0305084745762713e-06,
18287
+ "loss": 1.4982,
18288
+ "step": 25530
18289
+ },
18290
+ {
18291
+ "epoch": 0.64,
18292
+ "grad_norm": 12.518424987792969,
18293
+ "learning_rate": 3.0237288135593223e-06,
18294
+ "loss": 1.4771,
18295
+ "step": 25540
18296
+ },
18297
+ {
18298
+ "epoch": 0.64,
18299
+ "grad_norm": 7.445399284362793,
18300
+ "learning_rate": 3.0169491525423733e-06,
18301
+ "loss": 1.2959,
18302
+ "step": 25550
18303
+ },
18304
+ {
18305
+ "epoch": 0.64,
18306
+ "grad_norm": 3.960470676422119,
18307
+ "learning_rate": 3.010169491525424e-06,
18308
+ "loss": 1.4464,
18309
+ "step": 25560
18310
+ },
18311
+ {
18312
+ "epoch": 0.64,
18313
+ "grad_norm": 11.259561538696289,
18314
+ "learning_rate": 3.003389830508475e-06,
18315
+ "loss": 1.2178,
18316
+ "step": 25570
18317
+ },
18318
+ {
18319
+ "epoch": 0.64,
18320
+ "grad_norm": 7.605580806732178,
18321
+ "learning_rate": 2.9966101694915258e-06,
18322
+ "loss": 1.3286,
18323
+ "step": 25580
18324
+ },
18325
+ {
18326
+ "epoch": 0.64,
18327
+ "grad_norm": 4.82462215423584,
18328
+ "learning_rate": 2.9898305084745768e-06,
18329
+ "loss": 1.4727,
18330
+ "step": 25590
18331
+ },
18332
+ {
18333
+ "epoch": 0.64,
18334
+ "grad_norm": 4.511000633239746,
18335
+ "learning_rate": 2.9830508474576277e-06,
18336
+ "loss": 1.3572,
18337
+ "step": 25600
18338
+ },
18339
+ {
18340
+ "epoch": 0.64,
18341
+ "grad_norm": 6.090604305267334,
18342
+ "learning_rate": 2.9762711864406783e-06,
18343
+ "loss": 1.4802,
18344
+ "step": 25610
18345
+ },
18346
+ {
18347
+ "epoch": 0.64,
18348
+ "grad_norm": 9.687544822692871,
18349
+ "learning_rate": 2.969491525423729e-06,
18350
+ "loss": 1.3652,
18351
+ "step": 25620
18352
+ },
18353
+ {
18354
+ "epoch": 0.64,
18355
+ "grad_norm": 12.760902404785156,
18356
+ "learning_rate": 2.96271186440678e-06,
18357
+ "loss": 1.3634,
18358
+ "step": 25630
18359
+ },
18360
+ {
18361
+ "epoch": 0.64,
18362
+ "grad_norm": 6.688235759735107,
18363
+ "learning_rate": 2.955932203389831e-06,
18364
+ "loss": 1.209,
18365
+ "step": 25640
18366
+ },
18367
+ {
18368
+ "epoch": 0.64,
18369
+ "grad_norm": 5.238013744354248,
18370
+ "learning_rate": 2.9491525423728818e-06,
18371
+ "loss": 1.5462,
18372
+ "step": 25650
18373
+ },
18374
+ {
18375
+ "epoch": 0.64,
18376
+ "grad_norm": 18.444650650024414,
18377
+ "learning_rate": 2.9423728813559327e-06,
18378
+ "loss": 1.2963,
18379
+ "step": 25660
18380
+ },
18381
+ {
18382
+ "epoch": 0.64,
18383
+ "grad_norm": 5.625192642211914,
18384
+ "learning_rate": 2.935593220338983e-06,
18385
+ "loss": 1.3514,
18386
+ "step": 25670
18387
+ },
18388
+ {
18389
+ "epoch": 0.64,
18390
+ "grad_norm": 5.223139762878418,
18391
+ "learning_rate": 2.928813559322034e-06,
18392
+ "loss": 1.2339,
18393
+ "step": 25680
18394
+ },
18395
+ {
18396
+ "epoch": 0.64,
18397
+ "grad_norm": 4.5006422996521,
18398
+ "learning_rate": 2.922033898305085e-06,
18399
+ "loss": 1.4424,
18400
+ "step": 25690
18401
+ },
18402
+ {
18403
+ "epoch": 0.64,
18404
+ "grad_norm": 9.127293586730957,
18405
+ "learning_rate": 2.915254237288136e-06,
18406
+ "loss": 1.2334,
18407
+ "step": 25700
18408
+ },
18409
+ {
18410
+ "epoch": 0.64,
18411
+ "grad_norm": 5.78049373626709,
18412
+ "learning_rate": 2.9084745762711868e-06,
18413
+ "loss": 1.301,
18414
+ "step": 25710
18415
+ },
18416
+ {
18417
+ "epoch": 0.64,
18418
+ "grad_norm": 5.395204544067383,
18419
+ "learning_rate": 2.9016949152542373e-06,
18420
+ "loss": 1.1889,
18421
+ "step": 25720
18422
+ },
18423
+ {
18424
+ "epoch": 0.64,
18425
+ "grad_norm": 2.3502519130706787,
18426
+ "learning_rate": 2.8949152542372883e-06,
18427
+ "loss": 1.41,
18428
+ "step": 25730
18429
+ },
18430
+ {
18431
+ "epoch": 0.64,
18432
+ "grad_norm": 3.3766727447509766,
18433
+ "learning_rate": 2.8881355932203393e-06,
18434
+ "loss": 1.2768,
18435
+ "step": 25740
18436
+ },
18437
+ {
18438
+ "epoch": 0.64,
18439
+ "grad_norm": 6.30269718170166,
18440
+ "learning_rate": 2.8813559322033903e-06,
18441
+ "loss": 1.3432,
18442
+ "step": 25750
18443
+ },
18444
+ {
18445
+ "epoch": 0.64,
18446
+ "grad_norm": 7.727334022521973,
18447
+ "learning_rate": 2.8745762711864412e-06,
18448
+ "loss": 1.3465,
18449
+ "step": 25760
18450
+ },
18451
+ {
18452
+ "epoch": 0.64,
18453
+ "grad_norm": 8.983928680419922,
18454
+ "learning_rate": 2.8677966101694914e-06,
18455
+ "loss": 1.1202,
18456
+ "step": 25770
18457
+ },
18458
+ {
18459
+ "epoch": 0.64,
18460
+ "grad_norm": 4.910791397094727,
18461
+ "learning_rate": 2.8610169491525424e-06,
18462
+ "loss": 1.2038,
18463
+ "step": 25780
18464
+ },
18465
+ {
18466
+ "epoch": 0.64,
18467
+ "grad_norm": 12.079176902770996,
18468
+ "learning_rate": 2.8542372881355933e-06,
18469
+ "loss": 1.1599,
18470
+ "step": 25790
18471
+ },
18472
+ {
18473
+ "epoch": 0.65,
18474
+ "grad_norm": 3.6942214965820312,
18475
+ "learning_rate": 2.8474576271186443e-06,
18476
+ "loss": 1.2242,
18477
+ "step": 25800
18478
+ },
18479
+ {
18480
+ "epoch": 0.65,
18481
+ "grad_norm": 2.4002459049224854,
18482
+ "learning_rate": 2.8406779661016953e-06,
18483
+ "loss": 1.2288,
18484
+ "step": 25810
18485
+ },
18486
+ {
18487
+ "epoch": 0.65,
18488
+ "grad_norm": 4.8168840408325195,
18489
+ "learning_rate": 2.833898305084746e-06,
18490
+ "loss": 1.4489,
18491
+ "step": 25820
18492
+ },
18493
+ {
18494
+ "epoch": 0.65,
18495
+ "grad_norm": 10.90778636932373,
18496
+ "learning_rate": 2.827118644067797e-06,
18497
+ "loss": 1.4108,
18498
+ "step": 25830
18499
+ },
18500
+ {
18501
+ "epoch": 0.65,
18502
+ "grad_norm": 9.852614402770996,
18503
+ "learning_rate": 2.820338983050848e-06,
18504
+ "loss": 1.3685,
18505
+ "step": 25840
18506
+ },
18507
+ {
18508
+ "epoch": 0.65,
18509
+ "grad_norm": 8.391303062438965,
18510
+ "learning_rate": 2.8135593220338988e-06,
18511
+ "loss": 1.2921,
18512
+ "step": 25850
18513
+ },
18514
+ {
18515
+ "epoch": 0.65,
18516
+ "grad_norm": 5.348249912261963,
18517
+ "learning_rate": 2.8067796610169497e-06,
18518
+ "loss": 1.3463,
18519
+ "step": 25860
18520
+ },
18521
+ {
18522
+ "epoch": 0.65,
18523
+ "grad_norm": 3.3569209575653076,
18524
+ "learning_rate": 2.8000000000000003e-06,
18525
+ "loss": 1.2877,
18526
+ "step": 25870
18527
+ },
18528
+ {
18529
+ "epoch": 0.65,
18530
+ "grad_norm": 9.246468544006348,
18531
+ "learning_rate": 2.793220338983051e-06,
18532
+ "loss": 1.3478,
18533
+ "step": 25880
18534
+ },
18535
+ {
18536
+ "epoch": 0.65,
18537
+ "grad_norm": 7.3121771812438965,
18538
+ "learning_rate": 2.786440677966102e-06,
18539
+ "loss": 1.3116,
18540
+ "step": 25890
18541
+ },
18542
+ {
18543
+ "epoch": 0.65,
18544
+ "grad_norm": 7.324717044830322,
18545
+ "learning_rate": 2.779661016949153e-06,
18546
+ "loss": 1.4986,
18547
+ "step": 25900
18548
+ },
18549
+ {
18550
+ "epoch": 0.65,
18551
+ "grad_norm": 6.7373738288879395,
18552
+ "learning_rate": 2.7728813559322038e-06,
18553
+ "loss": 1.5417,
18554
+ "step": 25910
18555
+ },
18556
+ {
18557
+ "epoch": 0.65,
18558
+ "grad_norm": 10.05711555480957,
18559
+ "learning_rate": 2.7661016949152548e-06,
18560
+ "loss": 1.3439,
18561
+ "step": 25920
18562
+ },
18563
+ {
18564
+ "epoch": 0.65,
18565
+ "grad_norm": 3.843493700027466,
18566
+ "learning_rate": 2.7593220338983053e-06,
18567
+ "loss": 1.1626,
18568
+ "step": 25930
18569
+ },
18570
+ {
18571
+ "epoch": 0.65,
18572
+ "grad_norm": 9.513094902038574,
18573
+ "learning_rate": 2.752542372881356e-06,
18574
+ "loss": 1.3412,
18575
+ "step": 25940
18576
+ },
18577
+ {
18578
+ "epoch": 0.65,
18579
+ "grad_norm": 8.064391136169434,
18580
+ "learning_rate": 2.745762711864407e-06,
18581
+ "loss": 1.3483,
18582
+ "step": 25950
18583
+ },
18584
+ {
18585
+ "epoch": 0.65,
18586
+ "grad_norm": 10.491897583007812,
18587
+ "learning_rate": 2.738983050847458e-06,
18588
+ "loss": 1.3214,
18589
+ "step": 25960
18590
+ },
18591
+ {
18592
+ "epoch": 0.65,
18593
+ "grad_norm": 3.9402971267700195,
18594
+ "learning_rate": 2.732203389830509e-06,
18595
+ "loss": 1.3803,
18596
+ "step": 25970
18597
+ },
18598
+ {
18599
+ "epoch": 0.65,
18600
+ "grad_norm": 9.979386329650879,
18601
+ "learning_rate": 2.7254237288135593e-06,
18602
+ "loss": 1.1408,
18603
+ "step": 25980
18604
+ },
18605
+ {
18606
+ "epoch": 0.65,
18607
+ "grad_norm": 7.461246490478516,
18608
+ "learning_rate": 2.7186440677966103e-06,
18609
+ "loss": 1.3098,
18610
+ "step": 25990
18611
+ },
18612
+ {
18613
+ "epoch": 0.65,
18614
+ "grad_norm": 4.412223815917969,
18615
+ "learning_rate": 2.7118644067796613e-06,
18616
+ "loss": 1.457,
18617
+ "step": 26000
18618
+ },
18619
+ {
18620
+ "epoch": 0.65,
18621
+ "eval_loss": 1.3418877124786377,
18622
+ "eval_runtime": 66.1458,
18623
+ "eval_samples_per_second": 15.118,
18624
+ "eval_steps_per_second": 15.118,
18625
+ "step": 26000
18626
+ },
18627
+ {
18628
+ "epoch": 0.65,
18629
+ "grad_norm": 4.992754936218262,
18630
+ "learning_rate": 2.7050847457627123e-06,
18631
+ "loss": 1.1937,
18632
+ "step": 26010
18633
+ },
18634
+ {
18635
+ "epoch": 0.65,
18636
+ "grad_norm": 5.497439861297607,
18637
+ "learning_rate": 2.6983050847457633e-06,
18638
+ "loss": 1.3435,
18639
+ "step": 26020
18640
+ },
18641
+ {
18642
+ "epoch": 0.65,
18643
+ "grad_norm": 4.283881664276123,
18644
+ "learning_rate": 2.6915254237288134e-06,
18645
+ "loss": 1.3997,
18646
+ "step": 26030
18647
+ },
18648
+ {
18649
+ "epoch": 0.65,
18650
+ "grad_norm": 6.579436779022217,
18651
+ "learning_rate": 2.6847457627118644e-06,
18652
+ "loss": 1.4047,
18653
+ "step": 26040
18654
+ },
18655
+ {
18656
+ "epoch": 0.65,
18657
+ "grad_norm": 4.805014133453369,
18658
+ "learning_rate": 2.6779661016949153e-06,
18659
+ "loss": 1.4021,
18660
+ "step": 26050
18661
+ },
18662
+ {
18663
+ "epoch": 0.65,
18664
+ "grad_norm": 4.568453788757324,
18665
+ "learning_rate": 2.6711864406779663e-06,
18666
+ "loss": 1.4761,
18667
+ "step": 26060
18668
+ },
18669
+ {
18670
+ "epoch": 0.65,
18671
+ "grad_norm": 5.559253692626953,
18672
+ "learning_rate": 2.6644067796610173e-06,
18673
+ "loss": 1.2958,
18674
+ "step": 26070
18675
+ },
18676
+ {
18677
+ "epoch": 0.65,
18678
+ "grad_norm": 14.279343605041504,
18679
+ "learning_rate": 2.657627118644068e-06,
18680
+ "loss": 1.194,
18681
+ "step": 26080
18682
+ },
18683
+ {
18684
+ "epoch": 0.65,
18685
+ "grad_norm": 15.45864486694336,
18686
+ "learning_rate": 2.650847457627119e-06,
18687
+ "loss": 1.2134,
18688
+ "step": 26090
18689
+ },
18690
+ {
18691
+ "epoch": 0.65,
18692
+ "grad_norm": 17.310068130493164,
18693
+ "learning_rate": 2.64406779661017e-06,
18694
+ "loss": 1.2946,
18695
+ "step": 26100
18696
+ },
18697
+ {
18698
+ "epoch": 0.65,
18699
+ "grad_norm": 4.31659460067749,
18700
+ "learning_rate": 2.6372881355932208e-06,
18701
+ "loss": 1.3783,
18702
+ "step": 26110
18703
+ },
18704
+ {
18705
+ "epoch": 0.65,
18706
+ "grad_norm": 14.684484481811523,
18707
+ "learning_rate": 2.6305084745762718e-06,
18708
+ "loss": 1.3414,
18709
+ "step": 26120
18710
+ },
18711
+ {
18712
+ "epoch": 0.65,
18713
+ "grad_norm": 5.934599876403809,
18714
+ "learning_rate": 2.6237288135593223e-06,
18715
+ "loss": 1.3266,
18716
+ "step": 26130
18717
+ },
18718
+ {
18719
+ "epoch": 0.65,
18720
+ "grad_norm": 7.535737037658691,
18721
+ "learning_rate": 2.616949152542373e-06,
18722
+ "loss": 1.2579,
18723
+ "step": 26140
18724
+ },
18725
+ {
18726
+ "epoch": 0.65,
18727
+ "grad_norm": 5.8215556144714355,
18728
+ "learning_rate": 2.610169491525424e-06,
18729
+ "loss": 1.304,
18730
+ "step": 26150
18731
+ },
18732
+ {
18733
+ "epoch": 0.65,
18734
+ "grad_norm": 5.646538734436035,
18735
+ "learning_rate": 2.603389830508475e-06,
18736
+ "loss": 1.2792,
18737
+ "step": 26160
18738
+ },
18739
+ {
18740
+ "epoch": 0.65,
18741
+ "grad_norm": 3.834282398223877,
18742
+ "learning_rate": 2.596610169491526e-06,
18743
+ "loss": 1.4717,
18744
+ "step": 26170
18745
+ },
18746
+ {
18747
+ "epoch": 0.65,
18748
+ "grad_norm": 3.313835620880127,
18749
+ "learning_rate": 2.5898305084745768e-06,
18750
+ "loss": 1.3312,
18751
+ "step": 26180
18752
+ },
18753
+ {
18754
+ "epoch": 0.65,
18755
+ "grad_norm": 20.453479766845703,
18756
+ "learning_rate": 2.5830508474576273e-06,
18757
+ "loss": 1.3784,
18758
+ "step": 26190
18759
+ },
18760
+ {
18761
+ "epoch": 0.66,
18762
+ "grad_norm": 1.0731091499328613,
18763
+ "learning_rate": 2.576271186440678e-06,
18764
+ "loss": 1.4162,
18765
+ "step": 26200
18766
+ },
18767
+ {
18768
+ "epoch": 0.66,
18769
+ "grad_norm": 7.5899577140808105,
18770
+ "learning_rate": 2.569491525423729e-06,
18771
+ "loss": 1.2536,
18772
+ "step": 26210
18773
+ },
18774
+ {
18775
+ "epoch": 0.66,
18776
+ "grad_norm": 4.527303218841553,
18777
+ "learning_rate": 2.56271186440678e-06,
18778
+ "loss": 1.5048,
18779
+ "step": 26220
18780
+ },
18781
+ {
18782
+ "epoch": 0.66,
18783
+ "grad_norm": 3.702897787094116,
18784
+ "learning_rate": 2.555932203389831e-06,
18785
+ "loss": 1.1666,
18786
+ "step": 26230
18787
+ },
18788
+ {
18789
+ "epoch": 0.66,
18790
+ "grad_norm": 3.448979139328003,
18791
+ "learning_rate": 2.5491525423728814e-06,
18792
+ "loss": 1.2812,
18793
+ "step": 26240
18794
+ },
18795
+ {
18796
+ "epoch": 0.66,
18797
+ "grad_norm": 15.188081741333008,
18798
+ "learning_rate": 2.5423728813559323e-06,
18799
+ "loss": 1.3521,
18800
+ "step": 26250
18801
+ },
18802
+ {
18803
+ "epoch": 0.66,
18804
+ "grad_norm": 8.675294876098633,
18805
+ "learning_rate": 2.5355932203389833e-06,
18806
+ "loss": 1.4845,
18807
+ "step": 26260
18808
+ },
18809
+ {
18810
+ "epoch": 0.66,
18811
+ "grad_norm": 5.673226356506348,
18812
+ "learning_rate": 2.5288135593220343e-06,
18813
+ "loss": 1.2264,
18814
+ "step": 26270
18815
+ },
18816
+ {
18817
+ "epoch": 0.66,
18818
+ "grad_norm": 6.015854358673096,
18819
+ "learning_rate": 2.5220338983050853e-06,
18820
+ "loss": 1.3036,
18821
+ "step": 26280
18822
+ },
18823
+ {
18824
+ "epoch": 0.66,
18825
+ "grad_norm": 12.492931365966797,
18826
+ "learning_rate": 2.5152542372881354e-06,
18827
+ "loss": 1.4434,
18828
+ "step": 26290
18829
+ },
18830
+ {
18831
+ "epoch": 0.66,
18832
+ "grad_norm": 5.928922176361084,
18833
+ "learning_rate": 2.5084745762711864e-06,
18834
+ "loss": 1.3118,
18835
+ "step": 26300
18836
+ },
18837
+ {
18838
+ "epoch": 0.66,
18839
+ "grad_norm": 5.230995178222656,
18840
+ "learning_rate": 2.5016949152542374e-06,
18841
+ "loss": 1.3524,
18842
+ "step": 26310
18843
+ },
18844
+ {
18845
+ "epoch": 0.66,
18846
+ "grad_norm": 6.7296671867370605,
18847
+ "learning_rate": 2.4949152542372883e-06,
18848
+ "loss": 1.3651,
18849
+ "step": 26320
18850
+ },
18851
+ {
18852
+ "epoch": 0.66,
18853
+ "grad_norm": 2.378596544265747,
18854
+ "learning_rate": 2.488135593220339e-06,
18855
+ "loss": 1.4236,
18856
+ "step": 26330
18857
+ },
18858
+ {
18859
+ "epoch": 0.66,
18860
+ "grad_norm": 5.858238697052002,
18861
+ "learning_rate": 2.48135593220339e-06,
18862
+ "loss": 1.1553,
18863
+ "step": 26340
18864
+ },
18865
+ {
18866
+ "epoch": 0.66,
18867
+ "grad_norm": 3.7590560913085938,
18868
+ "learning_rate": 2.474576271186441e-06,
18869
+ "loss": 1.4302,
18870
+ "step": 26350
18871
+ },
18872
+ {
18873
+ "epoch": 0.66,
18874
+ "grad_norm": 10.852778434753418,
18875
+ "learning_rate": 2.467796610169492e-06,
18876
+ "loss": 1.1943,
18877
+ "step": 26360
18878
+ },
18879
+ {
18880
+ "epoch": 0.66,
18881
+ "grad_norm": 11.347558975219727,
18882
+ "learning_rate": 2.461016949152543e-06,
18883
+ "loss": 1.2559,
18884
+ "step": 26370
18885
+ },
18886
+ {
18887
+ "epoch": 0.66,
18888
+ "grad_norm": 6.081493854522705,
18889
+ "learning_rate": 2.4542372881355933e-06,
18890
+ "loss": 1.3532,
18891
+ "step": 26380
18892
+ },
18893
+ {
18894
+ "epoch": 0.66,
18895
+ "grad_norm": 8.263628959655762,
18896
+ "learning_rate": 2.4474576271186443e-06,
18897
+ "loss": 1.2041,
18898
+ "step": 26390
18899
+ },
18900
+ {
18901
+ "epoch": 0.66,
18902
+ "grad_norm": 7.144092082977295,
18903
+ "learning_rate": 2.4406779661016953e-06,
18904
+ "loss": 1.2193,
18905
+ "step": 26400
18906
+ },
18907
+ {
18908
+ "epoch": 0.66,
18909
+ "grad_norm": 6.578696250915527,
18910
+ "learning_rate": 2.433898305084746e-06,
18911
+ "loss": 1.3139,
18912
+ "step": 26410
18913
+ },
18914
+ {
18915
+ "epoch": 0.66,
18916
+ "grad_norm": 3.019033193588257,
18917
+ "learning_rate": 2.427118644067797e-06,
18918
+ "loss": 1.4642,
18919
+ "step": 26420
18920
+ },
18921
+ {
18922
+ "epoch": 0.66,
18923
+ "grad_norm": 5.443304538726807,
18924
+ "learning_rate": 2.4203389830508474e-06,
18925
+ "loss": 1.3738,
18926
+ "step": 26430
18927
+ },
18928
+ {
18929
+ "epoch": 0.66,
18930
+ "grad_norm": 10.051207542419434,
18931
+ "learning_rate": 2.4135593220338984e-06,
18932
+ "loss": 1.4396,
18933
+ "step": 26440
18934
+ },
18935
+ {
18936
+ "epoch": 0.66,
18937
+ "grad_norm": 10.604415893554688,
18938
+ "learning_rate": 2.4067796610169493e-06,
18939
+ "loss": 1.2971,
18940
+ "step": 26450
18941
+ },
18942
+ {
18943
+ "epoch": 0.66,
18944
+ "grad_norm": 9.754619598388672,
18945
+ "learning_rate": 2.4000000000000003e-06,
18946
+ "loss": 1.3421,
18947
+ "step": 26460
18948
+ },
18949
+ {
18950
+ "epoch": 0.66,
18951
+ "grad_norm": 6.159849166870117,
18952
+ "learning_rate": 2.393220338983051e-06,
18953
+ "loss": 1.3461,
18954
+ "step": 26470
18955
+ },
18956
+ {
18957
+ "epoch": 0.66,
18958
+ "grad_norm": 1.3210265636444092,
18959
+ "learning_rate": 2.386440677966102e-06,
18960
+ "loss": 1.2615,
18961
+ "step": 26480
18962
+ },
18963
+ {
18964
+ "epoch": 0.66,
18965
+ "grad_norm": 5.67736291885376,
18966
+ "learning_rate": 2.379661016949153e-06,
18967
+ "loss": 1.3296,
18968
+ "step": 26490
18969
+ },
18970
+ {
18971
+ "epoch": 0.66,
18972
+ "grad_norm": 4.055379867553711,
18973
+ "learning_rate": 2.372881355932204e-06,
18974
+ "loss": 1.4705,
18975
+ "step": 26500
18976
+ },
18977
+ {
18978
+ "epoch": 0.66,
18979
+ "eval_loss": 1.2714667320251465,
18980
+ "eval_runtime": 66.1328,
18981
+ "eval_samples_per_second": 15.121,
18982
+ "eval_steps_per_second": 15.121,
18983
+ "step": 26500
18984
+ },
18985
+ {
18986
+ "epoch": 0.66,
18987
+ "grad_norm": 7.19535493850708,
18988
+ "learning_rate": 2.3661016949152544e-06,
18989
+ "loss": 1.2956,
18990
+ "step": 26510
18991
+ },
18992
+ {
18993
+ "epoch": 0.66,
18994
+ "grad_norm": 7.952252388000488,
18995
+ "learning_rate": 2.3593220338983053e-06,
18996
+ "loss": 1.2952,
18997
+ "step": 26520
18998
+ },
18999
+ {
19000
+ "epoch": 0.66,
19001
+ "grad_norm": 1.3878239393234253,
19002
+ "learning_rate": 2.3525423728813563e-06,
19003
+ "loss": 1.3957,
19004
+ "step": 26530
19005
+ },
19006
+ {
19007
+ "epoch": 0.66,
19008
+ "grad_norm": 9.453435897827148,
19009
+ "learning_rate": 2.345762711864407e-06,
19010
+ "loss": 1.3314,
19011
+ "step": 26540
19012
+ },
19013
+ {
19014
+ "epoch": 0.66,
19015
+ "grad_norm": 2.3859434127807617,
19016
+ "learning_rate": 2.338983050847458e-06,
19017
+ "loss": 1.3959,
19018
+ "step": 26550
19019
+ },
19020
+ {
19021
+ "epoch": 0.66,
19022
+ "grad_norm": 4.052861213684082,
19023
+ "learning_rate": 2.3322033898305084e-06,
19024
+ "loss": 1.3097,
19025
+ "step": 26560
19026
+ },
19027
+ {
19028
+ "epoch": 0.66,
19029
+ "grad_norm": 14.907791137695312,
19030
+ "learning_rate": 2.3254237288135594e-06,
19031
+ "loss": 1.3706,
19032
+ "step": 26570
19033
+ },
19034
+ {
19035
+ "epoch": 0.66,
19036
+ "grad_norm": 7.020768642425537,
19037
+ "learning_rate": 2.3186440677966103e-06,
19038
+ "loss": 1.3536,
19039
+ "step": 26580
19040
+ },
19041
+ {
19042
+ "epoch": 0.66,
19043
+ "grad_norm": 15.225879669189453,
19044
+ "learning_rate": 2.3118644067796613e-06,
19045
+ "loss": 1.354,
19046
+ "step": 26590
19047
+ },
19048
+ {
19049
+ "epoch": 0.67,
19050
+ "grad_norm": 6.249582767486572,
19051
+ "learning_rate": 2.305084745762712e-06,
19052
+ "loss": 1.2721,
19053
+ "step": 26600
19054
+ },
19055
+ {
19056
+ "epoch": 0.67,
19057
+ "grad_norm": 4.756400108337402,
19058
+ "learning_rate": 2.298305084745763e-06,
19059
+ "loss": 1.3044,
19060
+ "step": 26610
19061
+ },
19062
+ {
19063
+ "epoch": 0.67,
19064
+ "grad_norm": 9.848575592041016,
19065
+ "learning_rate": 2.291525423728814e-06,
19066
+ "loss": 1.2471,
19067
+ "step": 26620
19068
+ },
19069
+ {
19070
+ "epoch": 0.67,
19071
+ "grad_norm": 9.997851371765137,
19072
+ "learning_rate": 2.284745762711865e-06,
19073
+ "loss": 1.1673,
19074
+ "step": 26630
19075
+ },
19076
+ {
19077
+ "epoch": 0.67,
19078
+ "grad_norm": 11.57490062713623,
19079
+ "learning_rate": 2.2779661016949154e-06,
19080
+ "loss": 1.3241,
19081
+ "step": 26640
19082
+ },
19083
+ {
19084
+ "epoch": 0.67,
19085
+ "grad_norm": 8.282011032104492,
19086
+ "learning_rate": 2.2711864406779663e-06,
19087
+ "loss": 1.2821,
19088
+ "step": 26650
19089
+ },
19090
+ {
19091
+ "epoch": 0.67,
19092
+ "grad_norm": 6.8680644035339355,
19093
+ "learning_rate": 2.2644067796610173e-06,
19094
+ "loss": 1.2355,
19095
+ "step": 26660
19096
+ },
19097
+ {
19098
+ "epoch": 0.67,
19099
+ "grad_norm": 15.327707290649414,
19100
+ "learning_rate": 2.257627118644068e-06,
19101
+ "loss": 1.2088,
19102
+ "step": 26670
19103
+ },
19104
+ {
19105
+ "epoch": 0.67,
19106
+ "grad_norm": 6.978293418884277,
19107
+ "learning_rate": 2.250847457627119e-06,
19108
+ "loss": 1.3372,
19109
+ "step": 26680
19110
+ },
19111
+ {
19112
+ "epoch": 0.67,
19113
+ "grad_norm": 10.799981117248535,
19114
+ "learning_rate": 2.2440677966101694e-06,
19115
+ "loss": 1.4839,
19116
+ "step": 26690
19117
+ },
19118
+ {
19119
+ "epoch": 0.67,
19120
+ "grad_norm": 4.861376762390137,
19121
+ "learning_rate": 2.2372881355932204e-06,
19122
+ "loss": 1.3361,
19123
+ "step": 26700
19124
+ },
19125
+ {
19126
+ "epoch": 0.67,
19127
+ "grad_norm": 5.07356595993042,
19128
+ "learning_rate": 2.2305084745762714e-06,
19129
+ "loss": 1.1581,
19130
+ "step": 26710
19131
+ },
19132
+ {
19133
+ "epoch": 0.67,
19134
+ "grad_norm": 16.509151458740234,
19135
+ "learning_rate": 2.2237288135593223e-06,
19136
+ "loss": 1.3014,
19137
+ "step": 26720
19138
+ },
19139
+ {
19140
+ "epoch": 0.67,
19141
+ "grad_norm": 8.232512474060059,
19142
+ "learning_rate": 2.216949152542373e-06,
19143
+ "loss": 1.4513,
19144
+ "step": 26730
19145
+ },
19146
+ {
19147
+ "epoch": 0.67,
19148
+ "grad_norm": 4.673596382141113,
19149
+ "learning_rate": 2.210169491525424e-06,
19150
+ "loss": 1.3162,
19151
+ "step": 26740
19152
+ },
19153
+ {
19154
+ "epoch": 0.67,
19155
+ "grad_norm": 3.565079927444458,
19156
+ "learning_rate": 2.203389830508475e-06,
19157
+ "loss": 1.3423,
19158
+ "step": 26750
19159
+ },
19160
+ {
19161
+ "epoch": 0.67,
19162
+ "grad_norm": 8.415205001831055,
19163
+ "learning_rate": 2.196610169491526e-06,
19164
+ "loss": 1.3341,
19165
+ "step": 26760
19166
+ },
19167
+ {
19168
+ "epoch": 0.67,
19169
+ "grad_norm": 13.892396926879883,
19170
+ "learning_rate": 2.1898305084745764e-06,
19171
+ "loss": 1.3268,
19172
+ "step": 26770
19173
+ },
19174
+ {
19175
+ "epoch": 0.67,
19176
+ "grad_norm": 12.167963981628418,
19177
+ "learning_rate": 2.1830508474576273e-06,
19178
+ "loss": 1.3763,
19179
+ "step": 26780
19180
+ },
19181
+ {
19182
+ "epoch": 0.67,
19183
+ "grad_norm": 9.574051856994629,
19184
+ "learning_rate": 2.1762711864406783e-06,
19185
+ "loss": 1.5376,
19186
+ "step": 26790
19187
+ },
19188
+ {
19189
+ "epoch": 0.67,
19190
+ "grad_norm": 12.77506160736084,
19191
+ "learning_rate": 2.169491525423729e-06,
19192
+ "loss": 1.2457,
19193
+ "step": 26800
19194
+ },
19195
+ {
19196
+ "epoch": 0.67,
19197
+ "grad_norm": 7.312320232391357,
19198
+ "learning_rate": 2.16271186440678e-06,
19199
+ "loss": 1.3077,
19200
+ "step": 26810
19201
+ },
19202
+ {
19203
+ "epoch": 0.67,
19204
+ "grad_norm": 10.53618335723877,
19205
+ "learning_rate": 2.1559322033898304e-06,
19206
+ "loss": 1.3706,
19207
+ "step": 26820
19208
+ },
19209
+ {
19210
+ "epoch": 0.67,
19211
+ "grad_norm": 4.399540424346924,
19212
+ "learning_rate": 2.1491525423728814e-06,
19213
+ "loss": 1.3237,
19214
+ "step": 26830
19215
+ },
19216
+ {
19217
+ "epoch": 0.67,
19218
+ "grad_norm": 8.757078170776367,
19219
+ "learning_rate": 2.1423728813559324e-06,
19220
+ "loss": 1.2385,
19221
+ "step": 26840
19222
+ },
19223
+ {
19224
+ "epoch": 0.67,
19225
+ "grad_norm": 3.866237163543701,
19226
+ "learning_rate": 2.1355932203389833e-06,
19227
+ "loss": 1.4038,
19228
+ "step": 26850
19229
+ },
19230
+ {
19231
+ "epoch": 0.67,
19232
+ "grad_norm": 4.188089370727539,
19233
+ "learning_rate": 2.128813559322034e-06,
19234
+ "loss": 1.5465,
19235
+ "step": 26860
19236
+ },
19237
+ {
19238
+ "epoch": 0.67,
19239
+ "grad_norm": 10.617339134216309,
19240
+ "learning_rate": 2.122033898305085e-06,
19241
+ "loss": 1.1873,
19242
+ "step": 26870
19243
+ },
19244
+ {
19245
+ "epoch": 0.67,
19246
+ "grad_norm": 3.4754912853240967,
19247
+ "learning_rate": 2.115254237288136e-06,
19248
+ "loss": 1.2539,
19249
+ "step": 26880
19250
+ },
19251
+ {
19252
+ "epoch": 0.67,
19253
+ "grad_norm": 6.057491302490234,
19254
+ "learning_rate": 2.108474576271187e-06,
19255
+ "loss": 1.2034,
19256
+ "step": 26890
19257
+ },
19258
+ {
19259
+ "epoch": 0.67,
19260
+ "grad_norm": 9.13399887084961,
19261
+ "learning_rate": 2.1016949152542374e-06,
19262
+ "loss": 1.4333,
19263
+ "step": 26900
19264
+ },
19265
+ {
19266
+ "epoch": 0.67,
19267
+ "grad_norm": 14.797123908996582,
19268
+ "learning_rate": 2.0949152542372883e-06,
19269
+ "loss": 1.1961,
19270
+ "step": 26910
19271
+ },
19272
+ {
19273
+ "epoch": 0.67,
19274
+ "grad_norm": 6.3039398193359375,
19275
+ "learning_rate": 2.0881355932203393e-06,
19276
+ "loss": 1.0776,
19277
+ "step": 26920
19278
+ },
19279
+ {
19280
+ "epoch": 0.67,
19281
+ "grad_norm": 5.700479507446289,
19282
+ "learning_rate": 2.08135593220339e-06,
19283
+ "loss": 1.5808,
19284
+ "step": 26930
19285
+ },
19286
+ {
19287
+ "epoch": 0.67,
19288
+ "grad_norm": 17.28398895263672,
19289
+ "learning_rate": 2.074576271186441e-06,
19290
+ "loss": 1.4131,
19291
+ "step": 26940
19292
+ },
19293
+ {
19294
+ "epoch": 0.67,
19295
+ "grad_norm": 5.036632061004639,
19296
+ "learning_rate": 2.0677966101694914e-06,
19297
+ "loss": 1.1761,
19298
+ "step": 26950
19299
+ },
19300
+ {
19301
+ "epoch": 0.67,
19302
+ "grad_norm": 4.72507381439209,
19303
+ "learning_rate": 2.0610169491525424e-06,
19304
+ "loss": 1.3633,
19305
+ "step": 26960
19306
+ },
19307
+ {
19308
+ "epoch": 0.67,
19309
+ "grad_norm": 18.702543258666992,
19310
+ "learning_rate": 2.0542372881355934e-06,
19311
+ "loss": 1.3922,
19312
+ "step": 26970
19313
+ },
19314
+ {
19315
+ "epoch": 0.67,
19316
+ "grad_norm": 11.218758583068848,
19317
+ "learning_rate": 2.0474576271186443e-06,
19318
+ "loss": 1.3427,
19319
+ "step": 26980
19320
+ },
19321
+ {
19322
+ "epoch": 0.67,
19323
+ "grad_norm": 5.716217517852783,
19324
+ "learning_rate": 2.0406779661016953e-06,
19325
+ "loss": 1.2603,
19326
+ "step": 26990
19327
+ },
19328
+ {
19329
+ "epoch": 0.68,
19330
+ "grad_norm": 8.085409164428711,
19331
+ "learning_rate": 2.033898305084746e-06,
19332
+ "loss": 1.4003,
19333
+ "step": 27000
19334
+ },
19335
+ {
19336
+ "epoch": 0.68,
19337
+ "eval_loss": 1.3196759223937988,
19338
+ "eval_runtime": 66.1008,
19339
+ "eval_samples_per_second": 15.128,
19340
+ "eval_steps_per_second": 15.128,
19341
+ "step": 27000
19342
+ },
19343
+ {
19344
+ "epoch": 0.68,
19345
+ "grad_norm": 2.983642578125,
19346
+ "learning_rate": 2.027118644067797e-06,
19347
+ "loss": 1.2987,
19348
+ "step": 27010
19349
+ },
19350
+ {
19351
+ "epoch": 0.68,
19352
+ "grad_norm": 3.0962276458740234,
19353
+ "learning_rate": 2.020338983050848e-06,
19354
+ "loss": 1.3189,
19355
+ "step": 27020
19356
+ },
19357
+ {
19358
+ "epoch": 0.68,
19359
+ "grad_norm": 4.857142925262451,
19360
+ "learning_rate": 2.0135593220338984e-06,
19361
+ "loss": 1.3651,
19362
+ "step": 27030
19363
+ },
19364
+ {
19365
+ "epoch": 0.68,
19366
+ "grad_norm": 4.3937482833862305,
19367
+ "learning_rate": 2.0067796610169494e-06,
19368
+ "loss": 1.4407,
19369
+ "step": 27040
19370
+ },
19371
+ {
19372
+ "epoch": 0.68,
19373
+ "grad_norm": 4.091692924499512,
19374
+ "learning_rate": 2.0000000000000003e-06,
19375
+ "loss": 1.2065,
19376
+ "step": 27050
19377
+ },
19378
+ {
19379
+ "epoch": 0.68,
19380
+ "grad_norm": 7.725543975830078,
19381
+ "learning_rate": 1.993220338983051e-06,
19382
+ "loss": 1.1836,
19383
+ "step": 27060
19384
+ },
19385
+ {
19386
+ "epoch": 0.68,
19387
+ "grad_norm": 5.124769687652588,
19388
+ "learning_rate": 1.986440677966102e-06,
19389
+ "loss": 1.3262,
19390
+ "step": 27070
19391
+ },
19392
+ {
19393
+ "epoch": 0.68,
19394
+ "grad_norm": 5.097384452819824,
19395
+ "learning_rate": 1.9796610169491524e-06,
19396
+ "loss": 1.3057,
19397
+ "step": 27080
19398
+ },
19399
+ {
19400
+ "epoch": 0.68,
19401
+ "grad_norm": 3.207469940185547,
19402
+ "learning_rate": 1.9728813559322034e-06,
19403
+ "loss": 1.241,
19404
+ "step": 27090
19405
+ },
19406
+ {
19407
+ "epoch": 0.68,
19408
+ "grad_norm": 8.80706787109375,
19409
+ "learning_rate": 1.9661016949152544e-06,
19410
+ "loss": 1.4066,
19411
+ "step": 27100
19412
+ },
19413
+ {
19414
+ "epoch": 0.68,
19415
+ "grad_norm": 2.1298069953918457,
19416
+ "learning_rate": 1.9593220338983053e-06,
19417
+ "loss": 1.3394,
19418
+ "step": 27110
19419
+ },
19420
+ {
19421
+ "epoch": 0.68,
19422
+ "grad_norm": 9.768424034118652,
19423
+ "learning_rate": 1.9525423728813563e-06,
19424
+ "loss": 1.4714,
19425
+ "step": 27120
19426
+ },
19427
+ {
19428
+ "epoch": 0.68,
19429
+ "grad_norm": 16.757766723632812,
19430
+ "learning_rate": 1.945762711864407e-06,
19431
+ "loss": 1.4322,
19432
+ "step": 27130
19433
+ },
19434
+ {
19435
+ "epoch": 0.68,
19436
+ "grad_norm": 4.268533229827881,
19437
+ "learning_rate": 1.938983050847458e-06,
19438
+ "loss": 1.3988,
19439
+ "step": 27140
19440
+ },
19441
+ {
19442
+ "epoch": 0.68,
19443
+ "grad_norm": 7.853205680847168,
19444
+ "learning_rate": 1.932203389830509e-06,
19445
+ "loss": 1.2837,
19446
+ "step": 27150
19447
+ },
19448
+ {
19449
+ "epoch": 0.68,
19450
+ "grad_norm": 3.1779301166534424,
19451
+ "learning_rate": 1.9254237288135594e-06,
19452
+ "loss": 1.3075,
19453
+ "step": 27160
19454
+ },
19455
+ {
19456
+ "epoch": 0.68,
19457
+ "grad_norm": 7.716470718383789,
19458
+ "learning_rate": 1.9186440677966104e-06,
19459
+ "loss": 1.2207,
19460
+ "step": 27170
19461
+ },
19462
+ {
19463
+ "epoch": 0.68,
19464
+ "grad_norm": 4.0963029861450195,
19465
+ "learning_rate": 1.9118644067796613e-06,
19466
+ "loss": 1.3241,
19467
+ "step": 27180
19468
+ },
19469
+ {
19470
+ "epoch": 0.68,
19471
+ "grad_norm": 10.319551467895508,
19472
+ "learning_rate": 1.9050847457627119e-06,
19473
+ "loss": 1.4133,
19474
+ "step": 27190
19475
+ },
19476
+ {
19477
+ "epoch": 0.68,
19478
+ "grad_norm": 4.663591384887695,
19479
+ "learning_rate": 1.8983050847457629e-06,
19480
+ "loss": 1.2505,
19481
+ "step": 27200
19482
+ },
19483
+ {
19484
+ "epoch": 0.68,
19485
+ "grad_norm": 8.912657737731934,
19486
+ "learning_rate": 1.8915254237288136e-06,
19487
+ "loss": 1.2057,
19488
+ "step": 27210
19489
+ },
19490
+ {
19491
+ "epoch": 0.68,
19492
+ "grad_norm": 11.967384338378906,
19493
+ "learning_rate": 1.8847457627118646e-06,
19494
+ "loss": 1.4043,
19495
+ "step": 27220
19496
+ },
19497
+ {
19498
+ "epoch": 0.68,
19499
+ "grad_norm": 19.304821014404297,
19500
+ "learning_rate": 1.8779661016949156e-06,
19501
+ "loss": 1.3083,
19502
+ "step": 27230
19503
+ },
19504
+ {
19505
+ "epoch": 0.68,
19506
+ "grad_norm": 4.1671600341796875,
19507
+ "learning_rate": 1.8711864406779661e-06,
19508
+ "loss": 1.415,
19509
+ "step": 27240
19510
+ },
19511
+ {
19512
+ "epoch": 0.68,
19513
+ "grad_norm": 3.430443525314331,
19514
+ "learning_rate": 1.8644067796610171e-06,
19515
+ "loss": 1.3154,
19516
+ "step": 27250
19517
+ },
19518
+ {
19519
+ "epoch": 0.68,
19520
+ "grad_norm": 8.876851081848145,
19521
+ "learning_rate": 1.857627118644068e-06,
19522
+ "loss": 1.2701,
19523
+ "step": 27260
19524
+ },
19525
+ {
19526
+ "epoch": 0.68,
19527
+ "grad_norm": 12.077353477478027,
19528
+ "learning_rate": 1.8508474576271189e-06,
19529
+ "loss": 1.3744,
19530
+ "step": 27270
19531
+ },
19532
+ {
19533
+ "epoch": 0.68,
19534
+ "grad_norm": 1.86147940158844,
19535
+ "learning_rate": 1.8440677966101696e-06,
19536
+ "loss": 1.4091,
19537
+ "step": 27280
19538
+ },
19539
+ {
19540
+ "epoch": 0.68,
19541
+ "grad_norm": 6.873291969299316,
19542
+ "learning_rate": 1.8372881355932204e-06,
19543
+ "loss": 1.3488,
19544
+ "step": 27290
19545
+ },
19546
+ {
19547
+ "epoch": 0.68,
19548
+ "grad_norm": 5.088196754455566,
19549
+ "learning_rate": 1.8305084745762714e-06,
19550
+ "loss": 1.437,
19551
+ "step": 27300
19552
+ },
19553
+ {
19554
+ "epoch": 0.68,
19555
+ "grad_norm": 3.1845881938934326,
19556
+ "learning_rate": 1.8237288135593223e-06,
19557
+ "loss": 1.3961,
19558
+ "step": 27310
19559
+ },
19560
+ {
19561
+ "epoch": 0.68,
19562
+ "grad_norm": 8.228582382202148,
19563
+ "learning_rate": 1.816949152542373e-06,
19564
+ "loss": 1.1022,
19565
+ "step": 27320
19566
+ },
19567
+ {
19568
+ "epoch": 0.68,
19569
+ "grad_norm": 15.932997703552246,
19570
+ "learning_rate": 1.8101694915254239e-06,
19571
+ "loss": 1.3529,
19572
+ "step": 27330
19573
+ },
19574
+ {
19575
+ "epoch": 0.68,
19576
+ "grad_norm": 1.3720171451568604,
19577
+ "learning_rate": 1.8033898305084746e-06,
19578
+ "loss": 1.2631,
19579
+ "step": 27340
19580
+ },
19581
+ {
19582
+ "epoch": 0.68,
19583
+ "grad_norm": 3.995771646499634,
19584
+ "learning_rate": 1.7966101694915256e-06,
19585
+ "loss": 1.2678,
19586
+ "step": 27350
19587
+ },
19588
+ {
19589
+ "epoch": 0.68,
19590
+ "grad_norm": 5.7671895027160645,
19591
+ "learning_rate": 1.7898305084745766e-06,
19592
+ "loss": 1.329,
19593
+ "step": 27360
19594
+ },
19595
+ {
19596
+ "epoch": 0.68,
19597
+ "grad_norm": 3.57239031791687,
19598
+ "learning_rate": 1.7830508474576271e-06,
19599
+ "loss": 1.2557,
19600
+ "step": 27370
19601
+ },
19602
+ {
19603
+ "epoch": 0.68,
19604
+ "grad_norm": 8.127071380615234,
19605
+ "learning_rate": 1.7762711864406781e-06,
19606
+ "loss": 1.3917,
19607
+ "step": 27380
19608
+ },
19609
+ {
19610
+ "epoch": 0.68,
19611
+ "grad_norm": 11.008809089660645,
19612
+ "learning_rate": 1.769491525423729e-06,
19613
+ "loss": 1.4125,
19614
+ "step": 27390
19615
+ },
19616
+ {
19617
+ "epoch": 0.69,
19618
+ "grad_norm": 3.305449962615967,
19619
+ "learning_rate": 1.7627118644067799e-06,
19620
+ "loss": 1.4367,
19621
+ "step": 27400
19622
+ },
19623
+ {
19624
+ "epoch": 0.69,
19625
+ "grad_norm": 4.471165180206299,
19626
+ "learning_rate": 1.7559322033898306e-06,
19627
+ "loss": 1.1336,
19628
+ "step": 27410
19629
+ },
19630
+ {
19631
+ "epoch": 0.69,
19632
+ "grad_norm": 4.836517810821533,
19633
+ "learning_rate": 1.7491525423728814e-06,
19634
+ "loss": 1.37,
19635
+ "step": 27420
19636
+ },
19637
+ {
19638
+ "epoch": 0.69,
19639
+ "grad_norm": 6.102312088012695,
19640
+ "learning_rate": 1.7423728813559324e-06,
19641
+ "loss": 1.3035,
19642
+ "step": 27430
19643
+ },
19644
+ {
19645
+ "epoch": 0.69,
19646
+ "grad_norm": 4.232480049133301,
19647
+ "learning_rate": 1.7355932203389834e-06,
19648
+ "loss": 1.2197,
19649
+ "step": 27440
19650
+ },
19651
+ {
19652
+ "epoch": 0.69,
19653
+ "grad_norm": 9.582504272460938,
19654
+ "learning_rate": 1.728813559322034e-06,
19655
+ "loss": 1.2699,
19656
+ "step": 27450
19657
+ },
19658
+ {
19659
+ "epoch": 0.69,
19660
+ "grad_norm": 9.52001667022705,
19661
+ "learning_rate": 1.7220338983050849e-06,
19662
+ "loss": 1.356,
19663
+ "step": 27460
19664
+ },
19665
+ {
19666
+ "epoch": 0.69,
19667
+ "grad_norm": 13.017762184143066,
19668
+ "learning_rate": 1.7152542372881356e-06,
19669
+ "loss": 1.2246,
19670
+ "step": 27470
19671
+ },
19672
+ {
19673
+ "epoch": 0.69,
19674
+ "grad_norm": 7.059675693511963,
19675
+ "learning_rate": 1.7084745762711866e-06,
19676
+ "loss": 1.3978,
19677
+ "step": 27480
19678
+ },
19679
+ {
19680
+ "epoch": 0.69,
19681
+ "grad_norm": 4.05330753326416,
19682
+ "learning_rate": 1.7016949152542376e-06,
19683
+ "loss": 1.3678,
19684
+ "step": 27490
19685
+ },
19686
+ {
19687
+ "epoch": 0.69,
19688
+ "grad_norm": 8.122435569763184,
19689
+ "learning_rate": 1.6949152542372882e-06,
19690
+ "loss": 1.3062,
19691
+ "step": 27500
19692
+ },
19693
+ {
19694
+ "epoch": 0.69,
19695
+ "eval_loss": 1.262895941734314,
19696
+ "eval_runtime": 66.2138,
19697
+ "eval_samples_per_second": 15.103,
19698
+ "eval_steps_per_second": 15.103,
19699
+ "step": 27500
19700
  }
19701
  ],
19702
  "logging_steps": 10,
 
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 2500,
+ "total_flos": 4.4280846483456e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null