mohammadmahdinouri commited on
Commit
947d0c7
·
verified ·
1 Parent(s): 484a886

Training in progress, step 15000, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a8af821523f7f828a05693158f90c4eb3a0034faa0c2293ad1124f5f93f2750
3
  size 487156538
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6597692baaaecd56cc3119530c69265f378d69e034a383c06ed9343d53fe0ea
3
  size 487156538
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91553bbc60905f9dec7e2b273a7b78b1aaa7bfa9c652a40341722951f2285fa2
3
  size 1059459406
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75301060d0614bae90398039500c97ff95194395b69fb1d587d503e800901639
3
  size 1059459406
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4523bbe94ce68cf422359680d501e02156c5a468572eaddf29b6fc30a80a5c85
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58725715e3a81fd995f3eb6fcd80daea9b56d116fca9b35a744c99f27b82bcc1
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4775592d06656304b14fa76806a517bea34547605af51b0919af58d9e3ad34f6
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a83fee6b01b58aa2d91d3e7341c75d9a4ecceb467333bab32df1edbfad5b705
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5cc0d600480a1d336ef5ed5d595520ccc7fd9075dda439dbae6adbb69ff279e7
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba3120a58a84b571dd9da9df7ca01c8edbdf1e3273712bd70e4eea91eb7a2a07
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2f1d1a83fbbd54a97f127d07293b97435077087576259edb01b1d629c65d3ad
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f845f06854dde41c169725cc192fbd9612a41feb786e9ebb38d8ffe1fdb6a40a
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf353f402a8187c44395ca6064b65c6f690bca29a45070f40a2616e51dfc5dd0
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f1eef33864701acffbe1e463e988c61c385ad81d27a75b2f5773dcc59db84b1
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.020738405749871125,
6
  "eval_steps": 500,
7
- "global_step": 14000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4908,6 +4908,356 @@
4908
  "learning_rate": 0.0004966664658166431,
4909
  "loss": 18.6651,
4910
  "step": 14000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4911
  }
4912
  ],
4913
  "logging_steps": 20,
@@ -4927,7 +5277,7 @@
4927
  "attributes": {}
4928
  }
4929
  },
4930
- "total_flos": 3.1157911970490876e+19,
4931
  "train_batch_size": 48,
4932
  "trial_name": null,
4933
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.022219720446290493,
6
  "eval_steps": 500,
7
+ "global_step": 15000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4908
  "learning_rate": 0.0004966664658166431,
4909
  "loss": 18.6651,
4910
  "step": 14000
4911
+ },
4912
+ {
4913
+ "epoch": 0.02076803204379951,
4914
+ "grad_norm": 8.6875,
4915
+ "learning_rate": 0.0004966615268816355,
4916
+ "loss": 18.6887,
4917
+ "step": 14020
4918
+ },
4919
+ {
4920
+ "epoch": 0.0207976583377279,
4921
+ "grad_norm": 8.5,
4922
+ "learning_rate": 0.0004966565879466278,
4923
+ "loss": 18.6563,
4924
+ "step": 14040
4925
+ },
4926
+ {
4927
+ "epoch": 0.020827284631656288,
4928
+ "grad_norm": 7.4375,
4929
+ "learning_rate": 0.0004966516490116204,
4930
+ "loss": 18.6696,
4931
+ "step": 14060
4932
+ },
4933
+ {
4934
+ "epoch": 0.020856910925584674,
4935
+ "grad_norm": 13.0,
4936
+ "learning_rate": 0.0004966467100766127,
4937
+ "loss": 18.7075,
4938
+ "step": 14080
4939
+ },
4940
+ {
4941
+ "epoch": 0.02088653721951306,
4942
+ "grad_norm": 8.25,
4943
+ "learning_rate": 0.0004966417711416051,
4944
+ "loss": 18.7774,
4945
+ "step": 14100
4946
+ },
4947
+ {
4948
+ "epoch": 0.02091616351344145,
4949
+ "grad_norm": 8.0625,
4950
+ "learning_rate": 0.0004966368322065976,
4951
+ "loss": 18.7288,
4952
+ "step": 14120
4953
+ },
4954
+ {
4955
+ "epoch": 0.020945789807369837,
4956
+ "grad_norm": 6.9375,
4957
+ "learning_rate": 0.00049663189327159,
4958
+ "loss": 18.7008,
4959
+ "step": 14140
4960
+ },
4961
+ {
4962
+ "epoch": 0.020975416101298223,
4963
+ "grad_norm": 8.25,
4964
+ "learning_rate": 0.0004966269543365825,
4965
+ "loss": 18.6659,
4966
+ "step": 14160
4967
+ },
4968
+ {
4969
+ "epoch": 0.02100504239522661,
4970
+ "grad_norm": 7.625,
4971
+ "learning_rate": 0.0004966220154015749,
4972
+ "loss": 18.5981,
4973
+ "step": 14180
4974
+ },
4975
+ {
4976
+ "epoch": 0.021034668689155,
4977
+ "grad_norm": 7.53125,
4978
+ "learning_rate": 0.0004966170764665674,
4979
+ "loss": 18.6925,
4980
+ "step": 14200
4981
+ },
4982
+ {
4983
+ "epoch": 0.021064294983083386,
4984
+ "grad_norm": 8.125,
4985
+ "learning_rate": 0.0004966121375315598,
4986
+ "loss": 18.7061,
4987
+ "step": 14220
4988
+ },
4989
+ {
4990
+ "epoch": 0.021093921277011773,
4991
+ "grad_norm": 10.3125,
4992
+ "learning_rate": 0.0004966071985965522,
4993
+ "loss": 18.6827,
4994
+ "step": 14240
4995
+ },
4996
+ {
4997
+ "epoch": 0.02112354757094016,
4998
+ "grad_norm": 7.875,
4999
+ "learning_rate": 0.0004966022596615446,
5000
+ "loss": 18.6601,
5001
+ "step": 14260
5002
+ },
5003
+ {
5004
+ "epoch": 0.02115317386486855,
5005
+ "grad_norm": 9.5625,
5006
+ "learning_rate": 0.0004965973207265371,
5007
+ "loss": 18.6155,
5008
+ "step": 14280
5009
+ },
5010
+ {
5011
+ "epoch": 0.021182800158796936,
5012
+ "grad_norm": 9.375,
5013
+ "learning_rate": 0.0004965923817915295,
5014
+ "loss": 18.6963,
5015
+ "step": 14300
5016
+ },
5017
+ {
5018
+ "epoch": 0.021212426452725322,
5019
+ "grad_norm": 7.34375,
5020
+ "learning_rate": 0.000496587442856522,
5021
+ "loss": 18.5988,
5022
+ "step": 14320
5023
+ },
5024
+ {
5025
+ "epoch": 0.02124205274665371,
5026
+ "grad_norm": 7.78125,
5027
+ "learning_rate": 0.0004965825039215144,
5028
+ "loss": 18.6373,
5029
+ "step": 14340
5030
+ },
5031
+ {
5032
+ "epoch": 0.0212716790405821,
5033
+ "grad_norm": 8.125,
5034
+ "learning_rate": 0.0004965775649865069,
5035
+ "loss": 18.6575,
5036
+ "step": 14360
5037
+ },
5038
+ {
5039
+ "epoch": 0.021301305334510485,
5040
+ "grad_norm": 13.1875,
5041
+ "learning_rate": 0.0004965726260514993,
5042
+ "loss": 18.6643,
5043
+ "step": 14380
5044
+ },
5045
+ {
5046
+ "epoch": 0.02133093162843887,
5047
+ "grad_norm": 8.25,
5048
+ "learning_rate": 0.0004965676871164917,
5049
+ "loss": 18.7713,
5050
+ "step": 14400
5051
+ },
5052
+ {
5053
+ "epoch": 0.021360557922367258,
5054
+ "grad_norm": 7.8125,
5055
+ "learning_rate": 0.0004965627481814841,
5056
+ "loss": 18.66,
5057
+ "step": 14420
5058
+ },
5059
+ {
5060
+ "epoch": 0.021390184216295648,
5061
+ "grad_norm": 7.40625,
5062
+ "learning_rate": 0.0004965578092464766,
5063
+ "loss": 18.6699,
5064
+ "step": 14440
5065
+ },
5066
+ {
5067
+ "epoch": 0.021419810510224034,
5068
+ "grad_norm": 8.5625,
5069
+ "learning_rate": 0.000496552870311469,
5070
+ "loss": 18.617,
5071
+ "step": 14460
5072
+ },
5073
+ {
5074
+ "epoch": 0.02144943680415242,
5075
+ "grad_norm": 8.1875,
5076
+ "learning_rate": 0.0004965479313764614,
5077
+ "loss": 18.6727,
5078
+ "step": 14480
5079
+ },
5080
+ {
5081
+ "epoch": 0.021479063098080807,
5082
+ "grad_norm": 7.5625,
5083
+ "learning_rate": 0.0004965429924414539,
5084
+ "loss": 18.5574,
5085
+ "step": 14500
5086
+ },
5087
+ {
5088
+ "epoch": 0.021508689392009197,
5089
+ "grad_norm": 8.875,
5090
+ "learning_rate": 0.0004965380535064463,
5091
+ "loss": 18.7566,
5092
+ "step": 14520
5093
+ },
5094
+ {
5095
+ "epoch": 0.021538315685937583,
5096
+ "grad_norm": 8.3125,
5097
+ "learning_rate": 0.0004965331145714388,
5098
+ "loss": 18.7108,
5099
+ "step": 14540
5100
+ },
5101
+ {
5102
+ "epoch": 0.02156794197986597,
5103
+ "grad_norm": 7.09375,
5104
+ "learning_rate": 0.0004965281756364312,
5105
+ "loss": 18.5922,
5106
+ "step": 14560
5107
+ },
5108
+ {
5109
+ "epoch": 0.021597568273794356,
5110
+ "grad_norm": 7.125,
5111
+ "learning_rate": 0.0004965232367014237,
5112
+ "loss": 18.6615,
5113
+ "step": 14580
5114
+ },
5115
+ {
5116
+ "epoch": 0.021627194567722746,
5117
+ "grad_norm": 7.5625,
5118
+ "learning_rate": 0.0004965182977664161,
5119
+ "loss": 18.6649,
5120
+ "step": 14600
5121
+ },
5122
+ {
5123
+ "epoch": 0.021656820861651133,
5124
+ "grad_norm": 8.3125,
5125
+ "learning_rate": 0.0004965133588314085,
5126
+ "loss": 18.7153,
5127
+ "step": 14620
5128
+ },
5129
+ {
5130
+ "epoch": 0.02168644715557952,
5131
+ "grad_norm": 7.6875,
5132
+ "learning_rate": 0.0004965084198964008,
5133
+ "loss": 18.6446,
5134
+ "step": 14640
5135
+ },
5136
+ {
5137
+ "epoch": 0.021716073449507906,
5138
+ "grad_norm": 6.90625,
5139
+ "learning_rate": 0.0004965034809613934,
5140
+ "loss": 18.6951,
5141
+ "step": 14660
5142
+ },
5143
+ {
5144
+ "epoch": 0.021745699743436295,
5145
+ "grad_norm": 7.875,
5146
+ "learning_rate": 0.0004964985420263857,
5147
+ "loss": 18.6594,
5148
+ "step": 14680
5149
+ },
5150
+ {
5151
+ "epoch": 0.021775326037364682,
5152
+ "grad_norm": 9.25,
5153
+ "learning_rate": 0.0004964936030913782,
5154
+ "loss": 18.6275,
5155
+ "step": 14700
5156
+ },
5157
+ {
5158
+ "epoch": 0.02180495233129307,
5159
+ "grad_norm": 16.5,
5160
+ "learning_rate": 0.0004964886641563706,
5161
+ "loss": 18.6842,
5162
+ "step": 14720
5163
+ },
5164
+ {
5165
+ "epoch": 0.021834578625221458,
5166
+ "grad_norm": 8.5625,
5167
+ "learning_rate": 0.0004964837252213631,
5168
+ "loss": 18.5828,
5169
+ "step": 14740
5170
+ },
5171
+ {
5172
+ "epoch": 0.021864204919149845,
5173
+ "grad_norm": 8.0,
5174
+ "learning_rate": 0.0004964787862863555,
5175
+ "loss": 18.5667,
5176
+ "step": 14760
5177
+ },
5178
+ {
5179
+ "epoch": 0.02189383121307823,
5180
+ "grad_norm": 7.8125,
5181
+ "learning_rate": 0.0004964738473513479,
5182
+ "loss": 18.575,
5183
+ "step": 14780
5184
+ },
5185
+ {
5186
+ "epoch": 0.021923457507006618,
5187
+ "grad_norm": 14.8125,
5188
+ "learning_rate": 0.0004964689084163404,
5189
+ "loss": 18.5941,
5190
+ "step": 14800
5191
+ },
5192
+ {
5193
+ "epoch": 0.021953083800935008,
5194
+ "grad_norm": 7.875,
5195
+ "learning_rate": 0.0004964639694813328,
5196
+ "loss": 18.5917,
5197
+ "step": 14820
5198
+ },
5199
+ {
5200
+ "epoch": 0.021982710094863394,
5201
+ "grad_norm": 9.75,
5202
+ "learning_rate": 0.0004964590305463252,
5203
+ "loss": 18.6876,
5204
+ "step": 14840
5205
+ },
5206
+ {
5207
+ "epoch": 0.02201233638879178,
5208
+ "grad_norm": 6.8125,
5209
+ "learning_rate": 0.0004964540916113176,
5210
+ "loss": 18.5439,
5211
+ "step": 14860
5212
+ },
5213
+ {
5214
+ "epoch": 0.022041962682720167,
5215
+ "grad_norm": 7.96875,
5216
+ "learning_rate": 0.0004964491526763101,
5217
+ "loss": 18.5891,
5218
+ "step": 14880
5219
+ },
5220
+ {
5221
+ "epoch": 0.022071588976648557,
5222
+ "grad_norm": 7.4375,
5223
+ "learning_rate": 0.0004964442137413025,
5224
+ "loss": 18.585,
5225
+ "step": 14900
5226
+ },
5227
+ {
5228
+ "epoch": 0.022101215270576943,
5229
+ "grad_norm": 8.3125,
5230
+ "learning_rate": 0.000496439274806295,
5231
+ "loss": 18.6264,
5232
+ "step": 14920
5233
+ },
5234
+ {
5235
+ "epoch": 0.02213084156450533,
5236
+ "grad_norm": 8.125,
5237
+ "learning_rate": 0.0004964343358712874,
5238
+ "loss": 18.6338,
5239
+ "step": 14940
5240
+ },
5241
+ {
5242
+ "epoch": 0.022160467858433716,
5243
+ "grad_norm": 8.1875,
5244
+ "learning_rate": 0.0004964293969362799,
5245
+ "loss": 18.5912,
5246
+ "step": 14960
5247
+ },
5248
+ {
5249
+ "epoch": 0.022190094152362106,
5250
+ "grad_norm": 7.6875,
5251
+ "learning_rate": 0.0004964244580012723,
5252
+ "loss": 18.6101,
5253
+ "step": 14980
5254
+ },
5255
+ {
5256
+ "epoch": 0.022219720446290493,
5257
+ "grad_norm": 7.59375,
5258
+ "learning_rate": 0.0004964195190662647,
5259
+ "loss": 18.5823,
5260
+ "step": 15000
5261
  }
5262
  ],
5263
  "logging_steps": 20,
 
5277
  "attributes": {}
5278
  }
5279
  },
5280
+ "total_flos": 3.33834741124327e+19,
5281
  "train_batch_size": 48,
5282
  "trial_name": null,
5283
  "trial_params": null