Joemgu commited on
Commit
bacdb22
1 Parent(s): ac13e60

Training in progress, step 1000

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2b93cb0f5af4a676996d113ae67c14903f845be14efc3d75962cc5c86990b4be
3
  size 4736616809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae99f5b9471b1b88e22d76dfd54369c904f7fbe448047cf0c5671464d7f6274a
3
  size 4736616809
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b06061cdd59a61c03b74896e78f938e36d6d587093dddf2c3beb4c518798564
3
  size 2368281769
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc924070c94c5b619e2669e03e1453bcf47c8ac1d6edf146fae93888c7686dcc
3
  size 2368281769
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:78177ec8a4e9181f496a71815f95534c1ccdd07dd8b38c39977074346212cd45
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2958aedd09b3f2455955cd165d68fd32ae680af19bdad126f498dd665ace09f
3
  size 14575
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc38c7f71de03e75da249f8cf736366cbbae8af7a495e0547a8a03a22691e8a0
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af3121ec18c8bd79a8c50e67e6a316e1da89b03afea7df26186a8c09abf682cd
3
  size 627
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "best_metric": 2.0316452980041504,
3
- "best_model_checkpoint": "output/checkpoint-800",
4
- "epoch": 0.5469471907531741,
5
- "global_step": 800,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -4838,11 +4838,1219 @@
4838
  "eval_samples_per_second": 5.918,
4839
  "eval_steps_per_second": 5.918,
4840
  "step": 800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4841
  }
4842
  ],
4843
  "max_steps": 4386,
4844
  "num_train_epochs": 3,
4845
- "total_flos": 9.813169375148851e+17,
4846
  "trial_name": null,
4847
  "trial_params": null
4848
  }
 
1
  {
2
+ "best_metric": 2.0208685398101807,
3
+ "best_model_checkpoint": "output/checkpoint-1000",
4
+ "epoch": 0.6836839884414676,
5
+ "global_step": 1000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
4838
  "eval_samples_per_second": 5.918,
4839
  "eval_steps_per_second": 5.918,
4840
  "step": 800
4841
+ },
4842
+ {
4843
+ "epoch": 0.55,
4844
+ "learning_rate": 0.0005138557095078833,
4845
+ "loss": 2.1501,
4846
+ "step": 801
4847
+ },
4848
+ {
4849
+ "epoch": 0.55,
4850
+ "learning_rate": 0.0005137123745819397,
4851
+ "loss": 2.0977,
4852
+ "step": 802
4853
+ },
4854
+ {
4855
+ "epoch": 0.55,
4856
+ "learning_rate": 0.0005135690396559961,
4857
+ "loss": 2.2608,
4858
+ "step": 803
4859
+ },
4860
+ {
4861
+ "epoch": 0.55,
4862
+ "learning_rate": 0.0005134257047300525,
4863
+ "loss": 2.0765,
4864
+ "step": 804
4865
+ },
4866
+ {
4867
+ "epoch": 0.55,
4868
+ "learning_rate": 0.0005132823698041089,
4869
+ "loss": 2.1414,
4870
+ "step": 805
4871
+ },
4872
+ {
4873
+ "epoch": 0.55,
4874
+ "learning_rate": 0.0005131390348781653,
4875
+ "loss": 2.2186,
4876
+ "step": 806
4877
+ },
4878
+ {
4879
+ "epoch": 0.55,
4880
+ "learning_rate": 0.0005129956999522216,
4881
+ "loss": 2.1596,
4882
+ "step": 807
4883
+ },
4884
+ {
4885
+ "epoch": 0.55,
4886
+ "learning_rate": 0.000512852365026278,
4887
+ "loss": 2.1241,
4888
+ "step": 808
4889
+ },
4890
+ {
4891
+ "epoch": 0.55,
4892
+ "learning_rate": 0.0005127090301003344,
4893
+ "loss": 2.0742,
4894
+ "step": 809
4895
+ },
4896
+ {
4897
+ "epoch": 0.55,
4898
+ "learning_rate": 0.0005125656951743908,
4899
+ "loss": 2.0797,
4900
+ "step": 810
4901
+ },
4902
+ {
4903
+ "epoch": 0.55,
4904
+ "learning_rate": 0.0005124223602484472,
4905
+ "loss": 2.1866,
4906
+ "step": 811
4907
+ },
4908
+ {
4909
+ "epoch": 0.56,
4910
+ "learning_rate": 0.0005122790253225035,
4911
+ "loss": 2.1895,
4912
+ "step": 812
4913
+ },
4914
+ {
4915
+ "epoch": 0.56,
4916
+ "learning_rate": 0.00051213569039656,
4917
+ "loss": 2.0698,
4918
+ "step": 813
4919
+ },
4920
+ {
4921
+ "epoch": 0.56,
4922
+ "learning_rate": 0.0005119923554706162,
4923
+ "loss": 2.1982,
4924
+ "step": 814
4925
+ },
4926
+ {
4927
+ "epoch": 0.56,
4928
+ "learning_rate": 0.0005118490205446727,
4929
+ "loss": 2.1768,
4930
+ "step": 815
4931
+ },
4932
+ {
4933
+ "epoch": 0.56,
4934
+ "learning_rate": 0.000511705685618729,
4935
+ "loss": 2.0943,
4936
+ "step": 816
4937
+ },
4938
+ {
4939
+ "epoch": 0.56,
4940
+ "learning_rate": 0.0005115623506927855,
4941
+ "loss": 2.067,
4942
+ "step": 817
4943
+ },
4944
+ {
4945
+ "epoch": 0.56,
4946
+ "learning_rate": 0.0005114190157668418,
4947
+ "loss": 2.0416,
4948
+ "step": 818
4949
+ },
4950
+ {
4951
+ "epoch": 0.56,
4952
+ "learning_rate": 0.0005112756808408981,
4953
+ "loss": 2.1266,
4954
+ "step": 819
4955
+ },
4956
+ {
4957
+ "epoch": 0.56,
4958
+ "learning_rate": 0.0005111323459149545,
4959
+ "loss": 2.1835,
4960
+ "step": 820
4961
+ },
4962
+ {
4963
+ "epoch": 0.56,
4964
+ "learning_rate": 0.0005109890109890109,
4965
+ "loss": 2.1033,
4966
+ "step": 821
4967
+ },
4968
+ {
4969
+ "epoch": 0.56,
4970
+ "learning_rate": 0.0005108456760630673,
4971
+ "loss": 2.1953,
4972
+ "step": 822
4973
+ },
4974
+ {
4975
+ "epoch": 0.56,
4976
+ "learning_rate": 0.0005107023411371237,
4977
+ "loss": 2.2002,
4978
+ "step": 823
4979
+ },
4980
+ {
4981
+ "epoch": 0.56,
4982
+ "learning_rate": 0.0005105590062111801,
4983
+ "loss": 2.0887,
4984
+ "step": 824
4985
+ },
4986
+ {
4987
+ "epoch": 0.56,
4988
+ "learning_rate": 0.0005104156712852364,
4989
+ "loss": 2.1419,
4990
+ "step": 825
4991
+ },
4992
+ {
4993
+ "epoch": 0.56,
4994
+ "learning_rate": 0.0005102723363592928,
4995
+ "loss": 2.0993,
4996
+ "step": 826
4997
+ },
4998
+ {
4999
+ "epoch": 0.57,
5000
+ "learning_rate": 0.0005101290014333492,
5001
+ "loss": 2.1632,
5002
+ "step": 827
5003
+ },
5004
+ {
5005
+ "epoch": 0.57,
5006
+ "learning_rate": 0.0005099856665074056,
5007
+ "loss": 2.11,
5008
+ "step": 828
5009
+ },
5010
+ {
5011
+ "epoch": 0.57,
5012
+ "learning_rate": 0.000509842331581462,
5013
+ "loss": 2.1375,
5014
+ "step": 829
5015
+ },
5016
+ {
5017
+ "epoch": 0.57,
5018
+ "learning_rate": 0.0005096989966555184,
5019
+ "loss": 2.0241,
5020
+ "step": 830
5021
+ },
5022
+ {
5023
+ "epoch": 0.57,
5024
+ "learning_rate": 0.0005095556617295748,
5025
+ "loss": 1.9997,
5026
+ "step": 831
5027
+ },
5028
+ {
5029
+ "epoch": 0.57,
5030
+ "learning_rate": 0.0005094123268036311,
5031
+ "loss": 2.095,
5032
+ "step": 832
5033
+ },
5034
+ {
5035
+ "epoch": 0.57,
5036
+ "learning_rate": 0.0005092689918776874,
5037
+ "loss": 2.114,
5038
+ "step": 833
5039
+ },
5040
+ {
5041
+ "epoch": 0.57,
5042
+ "learning_rate": 0.0005091256569517439,
5043
+ "loss": 2.0887,
5044
+ "step": 834
5045
+ },
5046
+ {
5047
+ "epoch": 0.57,
5048
+ "learning_rate": 0.0005089823220258002,
5049
+ "loss": 2.1666,
5050
+ "step": 835
5051
+ },
5052
+ {
5053
+ "epoch": 0.57,
5054
+ "learning_rate": 0.0005088389870998567,
5055
+ "loss": 2.1407,
5056
+ "step": 836
5057
+ },
5058
+ {
5059
+ "epoch": 0.57,
5060
+ "learning_rate": 0.0005086956521739129,
5061
+ "loss": 2.2634,
5062
+ "step": 837
5063
+ },
5064
+ {
5065
+ "epoch": 0.57,
5066
+ "learning_rate": 0.0005085523172479694,
5067
+ "loss": 2.2145,
5068
+ "step": 838
5069
+ },
5070
+ {
5071
+ "epoch": 0.57,
5072
+ "learning_rate": 0.0005084089823220257,
5073
+ "loss": 2.0569,
5074
+ "step": 839
5075
+ },
5076
+ {
5077
+ "epoch": 0.57,
5078
+ "learning_rate": 0.0005082656473960821,
5079
+ "loss": 2.1156,
5080
+ "step": 840
5081
+ },
5082
+ {
5083
+ "epoch": 0.57,
5084
+ "learning_rate": 0.0005081223124701385,
5085
+ "loss": 2.1948,
5086
+ "step": 841
5087
+ },
5088
+ {
5089
+ "epoch": 0.58,
5090
+ "learning_rate": 0.0005079789775441949,
5091
+ "loss": 2.1315,
5092
+ "step": 842
5093
+ },
5094
+ {
5095
+ "epoch": 0.58,
5096
+ "learning_rate": 0.0005078356426182512,
5097
+ "loss": 2.1392,
5098
+ "step": 843
5099
+ },
5100
+ {
5101
+ "epoch": 0.58,
5102
+ "learning_rate": 0.0005076923076923076,
5103
+ "loss": 2.1225,
5104
+ "step": 844
5105
+ },
5106
+ {
5107
+ "epoch": 0.58,
5108
+ "learning_rate": 0.000507548972766364,
5109
+ "loss": 2.1076,
5110
+ "step": 845
5111
+ },
5112
+ {
5113
+ "epoch": 0.58,
5114
+ "learning_rate": 0.0005074056378404204,
5115
+ "loss": 2.0865,
5116
+ "step": 846
5117
+ },
5118
+ {
5119
+ "epoch": 0.58,
5120
+ "learning_rate": 0.0005072623029144768,
5121
+ "loss": 2.159,
5122
+ "step": 847
5123
+ },
5124
+ {
5125
+ "epoch": 0.58,
5126
+ "learning_rate": 0.0005071189679885332,
5127
+ "loss": 2.0958,
5128
+ "step": 848
5129
+ },
5130
+ {
5131
+ "epoch": 0.58,
5132
+ "learning_rate": 0.0005069756330625896,
5133
+ "loss": 2.1292,
5134
+ "step": 849
5135
+ },
5136
+ {
5137
+ "epoch": 0.58,
5138
+ "learning_rate": 0.0005068322981366459,
5139
+ "loss": 2.1155,
5140
+ "step": 850
5141
+ },
5142
+ {
5143
+ "epoch": 0.58,
5144
+ "learning_rate": 0.0005066889632107023,
5145
+ "loss": 2.0926,
5146
+ "step": 851
5147
+ },
5148
+ {
5149
+ "epoch": 0.58,
5150
+ "learning_rate": 0.0005065456282847586,
5151
+ "loss": 2.1385,
5152
+ "step": 852
5153
+ },
5154
+ {
5155
+ "epoch": 0.58,
5156
+ "learning_rate": 0.0005064022933588151,
5157
+ "loss": 2.1187,
5158
+ "step": 853
5159
+ },
5160
+ {
5161
+ "epoch": 0.58,
5162
+ "learning_rate": 0.0005062589584328714,
5163
+ "loss": 2.0747,
5164
+ "step": 854
5165
+ },
5166
+ {
5167
+ "epoch": 0.58,
5168
+ "learning_rate": 0.0005061156235069279,
5169
+ "loss": 2.147,
5170
+ "step": 855
5171
+ },
5172
+ {
5173
+ "epoch": 0.59,
5174
+ "learning_rate": 0.0005059722885809841,
5175
+ "loss": 2.2023,
5176
+ "step": 856
5177
+ },
5178
+ {
5179
+ "epoch": 0.59,
5180
+ "learning_rate": 0.0005058289536550406,
5181
+ "loss": 2.0863,
5182
+ "step": 857
5183
+ },
5184
+ {
5185
+ "epoch": 0.59,
5186
+ "learning_rate": 0.0005056856187290969,
5187
+ "loss": 2.1402,
5188
+ "step": 858
5189
+ },
5190
+ {
5191
+ "epoch": 0.59,
5192
+ "learning_rate": 0.0005055422838031533,
5193
+ "loss": 2.1758,
5194
+ "step": 859
5195
+ },
5196
+ {
5197
+ "epoch": 0.59,
5198
+ "learning_rate": 0.0005053989488772097,
5199
+ "loss": 2.1767,
5200
+ "step": 860
5201
+ },
5202
+ {
5203
+ "epoch": 0.59,
5204
+ "learning_rate": 0.000505255613951266,
5205
+ "loss": 2.2041,
5206
+ "step": 861
5207
+ },
5208
+ {
5209
+ "epoch": 0.59,
5210
+ "learning_rate": 0.0005051122790253224,
5211
+ "loss": 2.1227,
5212
+ "step": 862
5213
+ },
5214
+ {
5215
+ "epoch": 0.59,
5216
+ "learning_rate": 0.0005049689440993788,
5217
+ "loss": 2.1992,
5218
+ "step": 863
5219
+ },
5220
+ {
5221
+ "epoch": 0.59,
5222
+ "learning_rate": 0.0005048256091734352,
5223
+ "loss": 2.0532,
5224
+ "step": 864
5225
+ },
5226
+ {
5227
+ "epoch": 0.59,
5228
+ "learning_rate": 0.0005046822742474916,
5229
+ "loss": 2.1845,
5230
+ "step": 865
5231
+ },
5232
+ {
5233
+ "epoch": 0.59,
5234
+ "learning_rate": 0.000504538939321548,
5235
+ "loss": 2.1466,
5236
+ "step": 866
5237
+ },
5238
+ {
5239
+ "epoch": 0.59,
5240
+ "learning_rate": 0.0005043956043956043,
5241
+ "loss": 2.0371,
5242
+ "step": 867
5243
+ },
5244
+ {
5245
+ "epoch": 0.59,
5246
+ "learning_rate": 0.0005042522694696607,
5247
+ "loss": 2.0987,
5248
+ "step": 868
5249
+ },
5250
+ {
5251
+ "epoch": 0.59,
5252
+ "learning_rate": 0.0005041089345437171,
5253
+ "loss": 2.1345,
5254
+ "step": 869
5255
+ },
5256
+ {
5257
+ "epoch": 0.59,
5258
+ "learning_rate": 0.0005039655996177735,
5259
+ "loss": 2.2204,
5260
+ "step": 870
5261
+ },
5262
+ {
5263
+ "epoch": 0.6,
5264
+ "learning_rate": 0.0005038222646918299,
5265
+ "loss": 2.1403,
5266
+ "step": 871
5267
+ },
5268
+ {
5269
+ "epoch": 0.6,
5270
+ "learning_rate": 0.0005036789297658863,
5271
+ "loss": 2.0582,
5272
+ "step": 872
5273
+ },
5274
+ {
5275
+ "epoch": 0.6,
5276
+ "learning_rate": 0.0005035355948399425,
5277
+ "loss": 2.1603,
5278
+ "step": 873
5279
+ },
5280
+ {
5281
+ "epoch": 0.6,
5282
+ "learning_rate": 0.000503392259913999,
5283
+ "loss": 2.0686,
5284
+ "step": 874
5285
+ },
5286
+ {
5287
+ "epoch": 0.6,
5288
+ "learning_rate": 0.0005032489249880553,
5289
+ "loss": 2.0596,
5290
+ "step": 875
5291
+ },
5292
+ {
5293
+ "epoch": 0.6,
5294
+ "learning_rate": 0.0005031055900621118,
5295
+ "loss": 2.1168,
5296
+ "step": 876
5297
+ },
5298
+ {
5299
+ "epoch": 0.6,
5300
+ "learning_rate": 0.0005029622551361681,
5301
+ "loss": 2.19,
5302
+ "step": 877
5303
+ },
5304
+ {
5305
+ "epoch": 0.6,
5306
+ "learning_rate": 0.0005028189202102246,
5307
+ "loss": 2.2216,
5308
+ "step": 878
5309
+ },
5310
+ {
5311
+ "epoch": 0.6,
5312
+ "learning_rate": 0.0005026755852842808,
5313
+ "loss": 2.1723,
5314
+ "step": 879
5315
+ },
5316
+ {
5317
+ "epoch": 0.6,
5318
+ "learning_rate": 0.0005025322503583372,
5319
+ "loss": 2.1189,
5320
+ "step": 880
5321
+ },
5322
+ {
5323
+ "epoch": 0.6,
5324
+ "learning_rate": 0.0005023889154323936,
5325
+ "loss": 2.1425,
5326
+ "step": 881
5327
+ },
5328
+ {
5329
+ "epoch": 0.6,
5330
+ "learning_rate": 0.00050224558050645,
5331
+ "loss": 2.2003,
5332
+ "step": 882
5333
+ },
5334
+ {
5335
+ "epoch": 0.6,
5336
+ "learning_rate": 0.0005021022455805064,
5337
+ "loss": 2.211,
5338
+ "step": 883
5339
+ },
5340
+ {
5341
+ "epoch": 0.6,
5342
+ "learning_rate": 0.0005019589106545628,
5343
+ "loss": 2.1302,
5344
+ "step": 884
5345
+ },
5346
+ {
5347
+ "epoch": 0.61,
5348
+ "learning_rate": 0.0005018155757286191,
5349
+ "loss": 2.1729,
5350
+ "step": 885
5351
+ },
5352
+ {
5353
+ "epoch": 0.61,
5354
+ "learning_rate": 0.0005016722408026755,
5355
+ "loss": 2.1203,
5356
+ "step": 886
5357
+ },
5358
+ {
5359
+ "epoch": 0.61,
5360
+ "learning_rate": 0.0005015289058767319,
5361
+ "loss": 2.0511,
5362
+ "step": 887
5363
+ },
5364
+ {
5365
+ "epoch": 0.61,
5366
+ "learning_rate": 0.0005013855709507883,
5367
+ "loss": 1.9758,
5368
+ "step": 888
5369
+ },
5370
+ {
5371
+ "epoch": 0.61,
5372
+ "learning_rate": 0.0005012422360248447,
5373
+ "loss": 2.1151,
5374
+ "step": 889
5375
+ },
5376
+ {
5377
+ "epoch": 0.61,
5378
+ "learning_rate": 0.0005010989010989011,
5379
+ "loss": 2.0998,
5380
+ "step": 890
5381
+ },
5382
+ {
5383
+ "epoch": 0.61,
5384
+ "learning_rate": 0.0005009555661729575,
5385
+ "loss": 2.0739,
5386
+ "step": 891
5387
+ },
5388
+ {
5389
+ "epoch": 0.61,
5390
+ "learning_rate": 0.0005008122312470138,
5391
+ "loss": 2.0421,
5392
+ "step": 892
5393
+ },
5394
+ {
5395
+ "epoch": 0.61,
5396
+ "learning_rate": 0.0005006688963210702,
5397
+ "loss": 2.0808,
5398
+ "step": 893
5399
+ },
5400
+ {
5401
+ "epoch": 0.61,
5402
+ "learning_rate": 0.0005005255613951265,
5403
+ "loss": 2.1301,
5404
+ "step": 894
5405
+ },
5406
+ {
5407
+ "epoch": 0.61,
5408
+ "learning_rate": 0.000500382226469183,
5409
+ "loss": 2.1207,
5410
+ "step": 895
5411
+ },
5412
+ {
5413
+ "epoch": 0.61,
5414
+ "learning_rate": 0.0005002388915432393,
5415
+ "loss": 2.1089,
5416
+ "step": 896
5417
+ },
5418
+ {
5419
+ "epoch": 0.61,
5420
+ "learning_rate": 0.0005000955566172958,
5421
+ "loss": 2.1796,
5422
+ "step": 897
5423
+ },
5424
+ {
5425
+ "epoch": 0.61,
5426
+ "learning_rate": 0.000499952221691352,
5427
+ "loss": 2.1123,
5428
+ "step": 898
5429
+ },
5430
+ {
5431
+ "epoch": 0.61,
5432
+ "learning_rate": 0.0004998088867654085,
5433
+ "loss": 2.1304,
5434
+ "step": 899
5435
+ },
5436
+ {
5437
+ "epoch": 0.62,
5438
+ "learning_rate": 0.0004996655518394648,
5439
+ "loss": 2.0691,
5440
+ "step": 900
5441
+ },
5442
+ {
5443
+ "epoch": 0.62,
5444
+ "learning_rate": 0.0004995222169135212,
5445
+ "loss": 2.215,
5446
+ "step": 901
5447
+ },
5448
+ {
5449
+ "epoch": 0.62,
5450
+ "learning_rate": 0.0004993788819875776,
5451
+ "loss": 2.1303,
5452
+ "step": 902
5453
+ },
5454
+ {
5455
+ "epoch": 0.62,
5456
+ "learning_rate": 0.000499235547061634,
5457
+ "loss": 2.0983,
5458
+ "step": 903
5459
+ },
5460
+ {
5461
+ "epoch": 0.62,
5462
+ "learning_rate": 0.0004990922121356903,
5463
+ "loss": 2.0761,
5464
+ "step": 904
5465
+ },
5466
+ {
5467
+ "epoch": 0.62,
5468
+ "learning_rate": 0.0004989488772097467,
5469
+ "loss": 2.1889,
5470
+ "step": 905
5471
+ },
5472
+ {
5473
+ "epoch": 0.62,
5474
+ "learning_rate": 0.0004988055422838031,
5475
+ "loss": 2.0557,
5476
+ "step": 906
5477
+ },
5478
+ {
5479
+ "epoch": 0.62,
5480
+ "learning_rate": 0.0004986622073578595,
5481
+ "loss": 2.1591,
5482
+ "step": 907
5483
+ },
5484
+ {
5485
+ "epoch": 0.62,
5486
+ "learning_rate": 0.0004985188724319159,
5487
+ "loss": 2.1217,
5488
+ "step": 908
5489
+ },
5490
+ {
5491
+ "epoch": 0.62,
5492
+ "learning_rate": 0.0004983755375059723,
5493
+ "loss": 2.0165,
5494
+ "step": 909
5495
+ },
5496
+ {
5497
+ "epoch": 0.62,
5498
+ "learning_rate": 0.0004982322025800286,
5499
+ "loss": 2.2102,
5500
+ "step": 910
5501
+ },
5502
+ {
5503
+ "epoch": 0.62,
5504
+ "learning_rate": 0.000498088867654085,
5505
+ "loss": 2.187,
5506
+ "step": 911
5507
+ },
5508
+ {
5509
+ "epoch": 0.62,
5510
+ "learning_rate": 0.0004979455327281414,
5511
+ "loss": 2.0758,
5512
+ "step": 912
5513
+ },
5514
+ {
5515
+ "epoch": 0.62,
5516
+ "learning_rate": 0.0004978021978021978,
5517
+ "loss": 2.2606,
5518
+ "step": 913
5519
+ },
5520
+ {
5521
+ "epoch": 0.62,
5522
+ "learning_rate": 0.0004976588628762542,
5523
+ "loss": 1.9795,
5524
+ "step": 914
5525
+ },
5526
+ {
5527
+ "epoch": 0.63,
5528
+ "learning_rate": 0.0004975155279503104,
5529
+ "loss": 2.1235,
5530
+ "step": 915
5531
+ },
5532
+ {
5533
+ "epoch": 0.63,
5534
+ "learning_rate": 0.0004973721930243669,
5535
+ "loss": 2.2042,
5536
+ "step": 916
5537
+ },
5538
+ {
5539
+ "epoch": 0.63,
5540
+ "learning_rate": 0.0004972288580984232,
5541
+ "loss": 2.1169,
5542
+ "step": 917
5543
+ },
5544
+ {
5545
+ "epoch": 0.63,
5546
+ "learning_rate": 0.0004970855231724797,
5547
+ "loss": 2.082,
5548
+ "step": 918
5549
+ },
5550
+ {
5551
+ "epoch": 0.63,
5552
+ "learning_rate": 0.000496942188246536,
5553
+ "loss": 2.0924,
5554
+ "step": 919
5555
+ },
5556
+ {
5557
+ "epoch": 0.63,
5558
+ "learning_rate": 0.0004967988533205925,
5559
+ "loss": 2.1969,
5560
+ "step": 920
5561
+ },
5562
+ {
5563
+ "epoch": 0.63,
5564
+ "learning_rate": 0.0004966555183946487,
5565
+ "loss": 2.1238,
5566
+ "step": 921
5567
+ },
5568
+ {
5569
+ "epoch": 0.63,
5570
+ "learning_rate": 0.0004965121834687051,
5571
+ "loss": 2.1776,
5572
+ "step": 922
5573
+ },
5574
+ {
5575
+ "epoch": 0.63,
5576
+ "learning_rate": 0.0004963688485427615,
5577
+ "loss": 2.1328,
5578
+ "step": 923
5579
+ },
5580
+ {
5581
+ "epoch": 0.63,
5582
+ "learning_rate": 0.0004962255136168179,
5583
+ "loss": 2.0723,
5584
+ "step": 924
5585
+ },
5586
+ {
5587
+ "epoch": 0.63,
5588
+ "learning_rate": 0.0004960821786908743,
5589
+ "loss": 2.2225,
5590
+ "step": 925
5591
+ },
5592
+ {
5593
+ "epoch": 0.63,
5594
+ "learning_rate": 0.0004959388437649307,
5595
+ "loss": 2.1698,
5596
+ "step": 926
5597
+ },
5598
+ {
5599
+ "epoch": 0.63,
5600
+ "learning_rate": 0.000495795508838987,
5601
+ "loss": 2.1764,
5602
+ "step": 927
5603
+ },
5604
+ {
5605
+ "epoch": 0.63,
5606
+ "learning_rate": 0.0004956521739130434,
5607
+ "loss": 2.2167,
5608
+ "step": 928
5609
+ },
5610
+ {
5611
+ "epoch": 0.64,
5612
+ "learning_rate": 0.0004955088389870998,
5613
+ "loss": 2.2408,
5614
+ "step": 929
5615
+ },
5616
+ {
5617
+ "epoch": 0.64,
5618
+ "learning_rate": 0.0004953655040611562,
5619
+ "loss": 2.049,
5620
+ "step": 930
5621
+ },
5622
+ {
5623
+ "epoch": 0.64,
5624
+ "learning_rate": 0.0004952221691352126,
5625
+ "loss": 2.0461,
5626
+ "step": 931
5627
+ },
5628
+ {
5629
+ "epoch": 0.64,
5630
+ "learning_rate": 0.000495078834209269,
5631
+ "loss": 2.1895,
5632
+ "step": 932
5633
+ },
5634
+ {
5635
+ "epoch": 0.64,
5636
+ "learning_rate": 0.0004949354992833254,
5637
+ "loss": 2.1358,
5638
+ "step": 933
5639
+ },
5640
+ {
5641
+ "epoch": 0.64,
5642
+ "learning_rate": 0.0004947921643573817,
5643
+ "loss": 2.1586,
5644
+ "step": 934
5645
+ },
5646
+ {
5647
+ "epoch": 0.64,
5648
+ "learning_rate": 0.0004946488294314381,
5649
+ "loss": 2.0463,
5650
+ "step": 935
5651
+ },
5652
+ {
5653
+ "epoch": 0.64,
5654
+ "learning_rate": 0.0004945054945054944,
5655
+ "loss": 2.006,
5656
+ "step": 936
5657
+ },
5658
+ {
5659
+ "epoch": 0.64,
5660
+ "learning_rate": 0.0004943621595795509,
5661
+ "loss": 2.0742,
5662
+ "step": 937
5663
+ },
5664
+ {
5665
+ "epoch": 0.64,
5666
+ "learning_rate": 0.0004942188246536072,
5667
+ "loss": 2.2008,
5668
+ "step": 938
5669
+ },
5670
+ {
5671
+ "epoch": 0.64,
5672
+ "learning_rate": 0.0004940754897276637,
5673
+ "loss": 2.1771,
5674
+ "step": 939
5675
+ },
5676
+ {
5677
+ "epoch": 0.64,
5678
+ "learning_rate": 0.0004939321548017199,
5679
+ "loss": 2.1418,
5680
+ "step": 940
5681
+ },
5682
+ {
5683
+ "epoch": 0.64,
5684
+ "learning_rate": 0.0004937888198757764,
5685
+ "loss": 2.1089,
5686
+ "step": 941
5687
+ },
5688
+ {
5689
+ "epoch": 0.64,
5690
+ "learning_rate": 0.0004936454849498327,
5691
+ "loss": 2.1641,
5692
+ "step": 942
5693
+ },
5694
+ {
5695
+ "epoch": 0.64,
5696
+ "learning_rate": 0.0004935021500238891,
5697
+ "loss": 2.2123,
5698
+ "step": 943
5699
+ },
5700
+ {
5701
+ "epoch": 0.65,
5702
+ "learning_rate": 0.0004933588150979455,
5703
+ "loss": 2.083,
5704
+ "step": 944
5705
+ },
5706
+ {
5707
+ "epoch": 0.65,
5708
+ "learning_rate": 0.0004932154801720019,
5709
+ "loss": 2.0856,
5710
+ "step": 945
5711
+ },
5712
+ {
5713
+ "epoch": 0.65,
5714
+ "learning_rate": 0.0004930721452460582,
5715
+ "loss": 2.0576,
5716
+ "step": 946
5717
+ },
5718
+ {
5719
+ "epoch": 0.65,
5720
+ "learning_rate": 0.0004929288103201146,
5721
+ "loss": 1.9976,
5722
+ "step": 947
5723
+ },
5724
+ {
5725
+ "epoch": 0.65,
5726
+ "learning_rate": 0.000492785475394171,
5727
+ "loss": 2.1001,
5728
+ "step": 948
5729
+ },
5730
+ {
5731
+ "epoch": 0.65,
5732
+ "learning_rate": 0.0004926421404682274,
5733
+ "loss": 2.1942,
5734
+ "step": 949
5735
+ },
5736
+ {
5737
+ "epoch": 0.65,
5738
+ "learning_rate": 0.0004924988055422838,
5739
+ "loss": 2.0722,
5740
+ "step": 950
5741
+ },
5742
+ {
5743
+ "epoch": 0.65,
5744
+ "learning_rate": 0.0004923554706163402,
5745
+ "loss": 2.1542,
5746
+ "step": 951
5747
+ },
5748
+ {
5749
+ "epoch": 0.65,
5750
+ "learning_rate": 0.0004922121356903965,
5751
+ "loss": 2.1446,
5752
+ "step": 952
5753
+ },
5754
+ {
5755
+ "epoch": 0.65,
5756
+ "learning_rate": 0.0004920688007644529,
5757
+ "loss": 2.1166,
5758
+ "step": 953
5759
+ },
5760
+ {
5761
+ "epoch": 0.65,
5762
+ "learning_rate": 0.0004919254658385093,
5763
+ "loss": 2.1406,
5764
+ "step": 954
5765
+ },
5766
+ {
5767
+ "epoch": 0.65,
5768
+ "learning_rate": 0.0004917821309125656,
5769
+ "loss": 2.1346,
5770
+ "step": 955
5771
+ },
5772
+ {
5773
+ "epoch": 0.65,
5774
+ "learning_rate": 0.0004916387959866221,
5775
+ "loss": 2.063,
5776
+ "step": 956
5777
+ },
5778
+ {
5779
+ "epoch": 0.65,
5780
+ "learning_rate": 0.0004914954610606783,
5781
+ "loss": 2.0332,
5782
+ "step": 957
5783
+ },
5784
+ {
5785
+ "epoch": 0.65,
5786
+ "learning_rate": 0.0004913521261347348,
5787
+ "loss": 2.0845,
5788
+ "step": 958
5789
+ },
5790
+ {
5791
+ "epoch": 0.66,
5792
+ "learning_rate": 0.0004912087912087911,
5793
+ "loss": 2.2177,
5794
+ "step": 959
5795
+ },
5796
+ {
5797
+ "epoch": 0.66,
5798
+ "learning_rate": 0.0004910654562828476,
5799
+ "loss": 2.1375,
5800
+ "step": 960
5801
+ },
5802
+ {
5803
+ "epoch": 0.66,
5804
+ "learning_rate": 0.0004909221213569039,
5805
+ "loss": 2.1171,
5806
+ "step": 961
5807
+ },
5808
+ {
5809
+ "epoch": 0.66,
5810
+ "learning_rate": 0.0004907787864309603,
5811
+ "loss": 2.0483,
5812
+ "step": 962
5813
+ },
5814
+ {
5815
+ "epoch": 0.66,
5816
+ "learning_rate": 0.0004906354515050167,
5817
+ "loss": 2.1325,
5818
+ "step": 963
5819
+ },
5820
+ {
5821
+ "epoch": 0.66,
5822
+ "learning_rate": 0.000490492116579073,
5823
+ "loss": 2.0789,
5824
+ "step": 964
5825
+ },
5826
+ {
5827
+ "epoch": 0.66,
5828
+ "learning_rate": 0.0004903487816531294,
5829
+ "loss": 2.1548,
5830
+ "step": 965
5831
+ },
5832
+ {
5833
+ "epoch": 0.66,
5834
+ "learning_rate": 0.0004902054467271858,
5835
+ "loss": 2.2216,
5836
+ "step": 966
5837
+ },
5838
+ {
5839
+ "epoch": 0.66,
5840
+ "learning_rate": 0.0004900621118012422,
5841
+ "loss": 2.0969,
5842
+ "step": 967
5843
+ },
5844
+ {
5845
+ "epoch": 0.66,
5846
+ "learning_rate": 0.0004899187768752986,
5847
+ "loss": 2.025,
5848
+ "step": 968
5849
+ },
5850
+ {
5851
+ "epoch": 0.66,
5852
+ "learning_rate": 0.000489775441949355,
5853
+ "loss": 2.2016,
5854
+ "step": 969
5855
+ },
5856
+ {
5857
+ "epoch": 0.66,
5858
+ "learning_rate": 0.0004896321070234113,
5859
+ "loss": 2.1138,
5860
+ "step": 970
5861
+ },
5862
+ {
5863
+ "epoch": 0.66,
5864
+ "learning_rate": 0.0004894887720974677,
5865
+ "loss": 2.0992,
5866
+ "step": 971
5867
+ },
5868
+ {
5869
+ "epoch": 0.66,
5870
+ "learning_rate": 0.0004893454371715241,
5871
+ "loss": 2.1381,
5872
+ "step": 972
5873
+ },
5874
+ {
5875
+ "epoch": 0.67,
5876
+ "learning_rate": 0.0004892021022455805,
5877
+ "loss": 2.0633,
5878
+ "step": 973
5879
+ },
5880
+ {
5881
+ "epoch": 0.67,
5882
+ "learning_rate": 0.0004890587673196369,
5883
+ "loss": 2.1169,
5884
+ "step": 974
5885
+ },
5886
+ {
5887
+ "epoch": 0.67,
5888
+ "learning_rate": 0.0004889154323936933,
5889
+ "loss": 2.0574,
5890
+ "step": 975
5891
+ },
5892
+ {
5893
+ "epoch": 0.67,
5894
+ "learning_rate": 0.0004887720974677495,
5895
+ "loss": 2.1371,
5896
+ "step": 976
5897
+ },
5898
+ {
5899
+ "epoch": 0.67,
5900
+ "learning_rate": 0.000488628762541806,
5901
+ "loss": 2.178,
5902
+ "step": 977
5903
+ },
5904
+ {
5905
+ "epoch": 0.67,
5906
+ "learning_rate": 0.0004884854276158623,
5907
+ "loss": 2.1797,
5908
+ "step": 978
5909
+ },
5910
+ {
5911
+ "epoch": 0.67,
5912
+ "learning_rate": 0.0004883420926899188,
5913
+ "loss": 2.0527,
5914
+ "step": 979
5915
+ },
5916
+ {
5917
+ "epoch": 0.67,
5918
+ "learning_rate": 0.0004881987577639751,
5919
+ "loss": 2.2112,
5920
+ "step": 980
5921
+ },
5922
+ {
5923
+ "epoch": 0.67,
5924
+ "learning_rate": 0.0004880554228380315,
5925
+ "loss": 2.1351,
5926
+ "step": 981
5927
+ },
5928
+ {
5929
+ "epoch": 0.67,
5930
+ "learning_rate": 0.0004879120879120879,
5931
+ "loss": 2.0178,
5932
+ "step": 982
5933
+ },
5934
+ {
5935
+ "epoch": 0.67,
5936
+ "learning_rate": 0.0004877687529861442,
5937
+ "loss": 2.1084,
5938
+ "step": 983
5939
+ },
5940
+ {
5941
+ "epoch": 0.67,
5942
+ "learning_rate": 0.00048762541806020066,
5943
+ "loss": 2.0922,
5944
+ "step": 984
5945
+ },
5946
+ {
5947
+ "epoch": 0.67,
5948
+ "learning_rate": 0.000487482083134257,
5949
+ "loss": 2.027,
5950
+ "step": 985
5951
+ },
5952
+ {
5953
+ "epoch": 0.67,
5954
+ "learning_rate": 0.0004873387482083134,
5955
+ "loss": 2.1061,
5956
+ "step": 986
5957
+ },
5958
+ {
5959
+ "epoch": 0.67,
5960
+ "learning_rate": 0.00048719541328236975,
5961
+ "loss": 2.0671,
5962
+ "step": 987
5963
+ },
5964
+ {
5965
+ "epoch": 0.68,
5966
+ "learning_rate": 0.0004870520783564262,
5967
+ "loss": 2.112,
5968
+ "step": 988
5969
+ },
5970
+ {
5971
+ "epoch": 0.68,
5972
+ "learning_rate": 0.0004869087434304825,
5973
+ "loss": 2.1656,
5974
+ "step": 989
5975
+ },
5976
+ {
5977
+ "epoch": 0.68,
5978
+ "learning_rate": 0.00048676540850453885,
5979
+ "loss": 2.0472,
5980
+ "step": 990
5981
+ },
5982
+ {
5983
+ "epoch": 0.68,
5984
+ "learning_rate": 0.0004866220735785953,
5985
+ "loss": 2.0411,
5986
+ "step": 991
5987
+ },
5988
+ {
5989
+ "epoch": 0.68,
5990
+ "learning_rate": 0.0004864787386526516,
5991
+ "loss": 2.0884,
5992
+ "step": 992
5993
+ },
5994
+ {
5995
+ "epoch": 0.68,
5996
+ "learning_rate": 0.00048633540372670806,
5997
+ "loss": 2.1037,
5998
+ "step": 993
5999
+ },
6000
+ {
6001
+ "epoch": 0.68,
6002
+ "learning_rate": 0.0004861920688007644,
6003
+ "loss": 2.1009,
6004
+ "step": 994
6005
+ },
6006
+ {
6007
+ "epoch": 0.68,
6008
+ "learning_rate": 0.0004860487338748208,
6009
+ "loss": 2.1671,
6010
+ "step": 995
6011
+ },
6012
+ {
6013
+ "epoch": 0.68,
6014
+ "learning_rate": 0.00048590539894887715,
6015
+ "loss": 2.0654,
6016
+ "step": 996
6017
+ },
6018
+ {
6019
+ "epoch": 0.68,
6020
+ "learning_rate": 0.00048576206402293354,
6021
+ "loss": 2.0784,
6022
+ "step": 997
6023
+ },
6024
+ {
6025
+ "epoch": 0.68,
6026
+ "learning_rate": 0.0004856187290969899,
6027
+ "loss": 2.1642,
6028
+ "step": 998
6029
+ },
6030
+ {
6031
+ "epoch": 0.68,
6032
+ "learning_rate": 0.0004854753941710463,
6033
+ "loss": 2.1117,
6034
+ "step": 999
6035
+ },
6036
+ {
6037
+ "epoch": 0.68,
6038
+ "learning_rate": 0.0004853320592451027,
6039
+ "loss": 2.0927,
6040
+ "step": 1000
6041
+ },
6042
+ {
6043
+ "epoch": 0.68,
6044
+ "eval_loss": 2.0208685398101807,
6045
+ "eval_runtime": 1688.1492,
6046
+ "eval_samples_per_second": 5.924,
6047
+ "eval_steps_per_second": 5.924,
6048
+ "step": 1000
6049
  }
6050
  ],
6051
  "max_steps": 4386,
6052
  "num_train_epochs": 3,
6053
+ "total_flos": 1.2243065569981379e+18,
6054
  "trial_name": null,
6055
  "trial_params": null
6056
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b06061cdd59a61c03b74896e78f938e36d6d587093dddf2c3beb4c518798564
3
  size 2368281769
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc924070c94c5b619e2669e03e1453bcf47c8ac1d6edf146fae93888c7686dcc
3
  size 2368281769