PewterZz committed on
Commit
05e1848
·
1 Parent(s): 750a292
{checkpoint-7000 → checkpoint-8000}/README.md RENAMED
File without changes
{checkpoint-7000 → checkpoint-8000}/adapter_config.json RENAMED
File without changes
{checkpoint-7000 → checkpoint-8000}/adapter_model.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:904a4ef92c61ccc6c50c07a0afb16c79eca8345d63d05b6fba20d672883caf9c
3
  size 8535970848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad06aee126b0d1bc6ad5f7300e9f866b52d5ddfa03c94459458c802bca279d27
3
  size 8535970848
{checkpoint-7000 → checkpoint-8000}/optimizer.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2496581a1d322ee8cdb74b4d9913aa030f52ad250c7a36328009cb9a925d20b3
3
  size 6576969753
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d92dea01c8c29de2a4005cc9690e2f9d72caf688c782c20775dc59d995cfa7b
3
  size 6576969753
{checkpoint-7000 → checkpoint-8000}/rng_state.pth RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b98ab2f9046186de382cffe0ac794b64835264151053a24e157ecc237266a430
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51c2939c18e59171b164ef03eea9767004735b3684fc33ed54d0af9e9aa5a8ec
3
  size 14645
{checkpoint-7000 → checkpoint-8000}/scheduler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb8edb6711b4ae2f588fdfdebd7cb5e4b8458aa95884c8a213e06a55271fdc66
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ef9fdb1f0fce4718b7fd8d7a72f390bbaa45bb8e37f4d0f2d8b474443eb5ef2
3
  size 1465
{checkpoint-7000 → checkpoint-8000}/trainer_state.json RENAMED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 8.951406649616368,
6
  "eval_steps": 500,
7
- "global_step": 7000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4908,6 +4908,706 @@
4908
  "learning_rate": 0.00017794528798804519,
4909
  "loss": 5.3009,
4910
  "step": 7000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4911
  }
4912
  ],
4913
  "logging_steps": 10,
@@ -4927,7 +5627,7 @@
4927
  "attributes": {}
4928
  }
4929
  },
4930
- "total_flos": 9.267749345144734e+18,
4931
  "train_batch_size": 32,
4932
  "trial_name": null,
4933
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 10.230179028132993,
6
  "eval_steps": 500,
7
+ "global_step": 8000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4908
  "learning_rate": 0.00017794528798804519,
4909
  "loss": 5.3009,
4910
  "step": 7000
4911
+ },
4912
+ {
4913
+ "epoch": 8.964194373401535,
4914
+ "grad_norm": 1.296583652496338,
4915
+ "learning_rate": 0.00017764536719815918,
4916
+ "loss": 5.3158,
4917
+ "step": 7010
4918
+ },
4919
+ {
4920
+ "epoch": 8.976982097186701,
4921
+ "grad_norm": 1.0986963510513306,
4922
+ "learning_rate": 0.00017734533195522424,
4923
+ "loss": 5.3068,
4924
+ "step": 7020
4925
+ },
4926
+ {
4927
+ "epoch": 8.989769820971867,
4928
+ "grad_norm": 1.0342364311218262,
4929
+ "learning_rate": 0.00017704518350139965,
4930
+ "loss": 5.2997,
4931
+ "step": 7030
4932
+ },
4933
+ {
4934
+ "epoch": 9.002557544757034,
4935
+ "grad_norm": 1.445008635520935,
4936
+ "learning_rate": 0.0001767449230793133,
4937
+ "loss": 5.3086,
4938
+ "step": 7040
4939
+ },
4940
+ {
4941
+ "epoch": 9.0153452685422,
4942
+ "grad_norm": 1.7098116874694824,
4943
+ "learning_rate": 0.00017644455193205666,
4944
+ "loss": 5.3005,
4945
+ "step": 7050
4946
+ },
4947
+ {
4948
+ "epoch": 9.028132992327366,
4949
+ "grad_norm": 0.9708366990089417,
4950
+ "learning_rate": 0.00017614407130317968,
4951
+ "loss": 5.2737,
4952
+ "step": 7060
4953
+ },
4954
+ {
4955
+ "epoch": 9.040920716112533,
4956
+ "grad_norm": 1.089858889579773,
4957
+ "learning_rate": 0.00017584348243668556,
4958
+ "loss": 5.262,
4959
+ "step": 7070
4960
+ },
4961
+ {
4962
+ "epoch": 9.053708439897699,
4963
+ "grad_norm": 2.534782886505127,
4964
+ "learning_rate": 0.00017554278657702549,
4965
+ "loss": 5.2854,
4966
+ "step": 7080
4967
+ },
4968
+ {
4969
+ "epoch": 9.066496163682864,
4970
+ "grad_norm": 1.3707194328308105,
4971
+ "learning_rate": 0.00017524198496909373,
4972
+ "loss": 5.2936,
4973
+ "step": 7090
4974
+ },
4975
+ {
4976
+ "epoch": 9.07928388746803,
4977
+ "grad_norm": 1.4009782075881958,
4978
+ "learning_rate": 0.0001749410788582223,
4979
+ "loss": 5.2795,
4980
+ "step": 7100
4981
+ },
4982
+ {
4983
+ "epoch": 9.092071611253196,
4984
+ "grad_norm": 1.1263126134872437,
4985
+ "learning_rate": 0.00017464006949017584,
4986
+ "loss": 5.2806,
4987
+ "step": 7110
4988
+ },
4989
+ {
4990
+ "epoch": 9.104859335038363,
4991
+ "grad_norm": 0.9099717140197754,
4992
+ "learning_rate": 0.00017433895811114658,
4993
+ "loss": 5.3049,
4994
+ "step": 7120
4995
+ },
4996
+ {
4997
+ "epoch": 9.117647058823529,
4998
+ "grad_norm": 1.3195571899414062,
4999
+ "learning_rate": 0.00017403774596774893,
5000
+ "loss": 5.2803,
5001
+ "step": 7130
5002
+ },
5003
+ {
5004
+ "epoch": 9.130434782608695,
5005
+ "grad_norm": 1.1069055795669556,
5006
+ "learning_rate": 0.00017373643430701463,
5007
+ "loss": 5.2579,
5008
+ "step": 7140
5009
+ },
5010
+ {
5011
+ "epoch": 9.143222506393862,
5012
+ "grad_norm": 1.1975563764572144,
5013
+ "learning_rate": 0.00017343502437638727,
5014
+ "loss": 5.2795,
5015
+ "step": 7150
5016
+ },
5017
+ {
5018
+ "epoch": 9.156010230179028,
5019
+ "grad_norm": 1.6220418214797974,
5020
+ "learning_rate": 0.00017313351742371746,
5021
+ "loss": 5.2797,
5022
+ "step": 7160
5023
+ },
5024
+ {
5025
+ "epoch": 9.168797953964194,
5026
+ "grad_norm": 1.490394115447998,
5027
+ "learning_rate": 0.00017283191469725728,
5028
+ "loss": 5.2768,
5029
+ "step": 7170
5030
+ },
5031
+ {
5032
+ "epoch": 9.18158567774936,
5033
+ "grad_norm": 1.4838827848434448,
5034
+ "learning_rate": 0.00017253021744565548,
5035
+ "loss": 5.2871,
5036
+ "step": 7180
5037
+ },
5038
+ {
5039
+ "epoch": 9.194373401534527,
5040
+ "grad_norm": 5.531613826751709,
5041
+ "learning_rate": 0.0001722284269179521,
5042
+ "loss": 5.2537,
5043
+ "step": 7190
5044
+ },
5045
+ {
5046
+ "epoch": 9.207161125319693,
5047
+ "grad_norm": 1.5700544118881226,
5048
+ "learning_rate": 0.0001719265443635733,
5049
+ "loss": 5.2875,
5050
+ "step": 7200
5051
+ },
5052
+ {
5053
+ "epoch": 9.21994884910486,
5054
+ "grad_norm": 1.2028346061706543,
5055
+ "learning_rate": 0.00017162457103232632,
5056
+ "loss": 5.2707,
5057
+ "step": 7210
5058
+ },
5059
+ {
5060
+ "epoch": 9.232736572890026,
5061
+ "grad_norm": 1.9414821863174438,
5062
+ "learning_rate": 0.00017132250817439412,
5063
+ "loss": 5.2918,
5064
+ "step": 7220
5065
+ },
5066
+ {
5067
+ "epoch": 9.245524296675192,
5068
+ "grad_norm": 3.8366823196411133,
5069
+ "learning_rate": 0.00017102035704033038,
5070
+ "loss": 5.277,
5071
+ "step": 7230
5072
+ },
5073
+ {
5074
+ "epoch": 9.258312020460359,
5075
+ "grad_norm": 41.69551086425781,
5076
+ "learning_rate": 0.0001707181188810542,
5077
+ "loss": 5.2691,
5078
+ "step": 7240
5079
+ },
5080
+ {
5081
+ "epoch": 9.271099744245525,
5082
+ "grad_norm": 2.0435402393341064,
5083
+ "learning_rate": 0.00017041579494784506,
5084
+ "loss": 5.3075,
5085
+ "step": 7250
5086
+ },
5087
+ {
5088
+ "epoch": 9.28388746803069,
5089
+ "grad_norm": 2.1493489742279053,
5090
+ "learning_rate": 0.00017011338649233743,
5091
+ "loss": 5.3234,
5092
+ "step": 7260
5093
+ },
5094
+ {
5095
+ "epoch": 9.296675191815856,
5096
+ "grad_norm": 3.627615213394165,
5097
+ "learning_rate": 0.0001698108947665158,
5098
+ "loss": 5.3018,
5099
+ "step": 7270
5100
+ },
5101
+ {
5102
+ "epoch": 9.309462915601022,
5103
+ "grad_norm": 24.722545623779297,
5104
+ "learning_rate": 0.00016950832102270927,
5105
+ "loss": 5.3123,
5106
+ "step": 7280
5107
+ },
5108
+ {
5109
+ "epoch": 9.322250639386189,
5110
+ "grad_norm": 19.32564353942871,
5111
+ "learning_rate": 0.00016920566651358666,
5112
+ "loss": 5.346,
5113
+ "step": 7290
5114
+ },
5115
+ {
5116
+ "epoch": 9.335038363171355,
5117
+ "grad_norm": 173.1358184814453,
5118
+ "learning_rate": 0.00016890293249215109,
5119
+ "loss": 5.3385,
5120
+ "step": 7300
5121
+ },
5122
+ {
5123
+ "epoch": 9.347826086956522,
5124
+ "grad_norm": 139.24111938476562,
5125
+ "learning_rate": 0.0001686001202117348,
5126
+ "loss": 5.3411,
5127
+ "step": 7310
5128
+ },
5129
+ {
5130
+ "epoch": 9.360613810741688,
5131
+ "grad_norm": 12987.6923828125,
5132
+ "learning_rate": 0.00016829723092599418,
5133
+ "loss": 5.3288,
5134
+ "step": 7320
5135
+ },
5136
+ {
5137
+ "epoch": 9.373401534526854,
5138
+ "grad_norm": 53.43489074707031,
5139
+ "learning_rate": 0.00016799426588890427,
5140
+ "loss": 5.3403,
5141
+ "step": 7330
5142
+ },
5143
+ {
5144
+ "epoch": 9.38618925831202,
5145
+ "grad_norm": 29.375526428222656,
5146
+ "learning_rate": 0.00016769122635475385,
5147
+ "loss": 5.3186,
5148
+ "step": 7340
5149
+ },
5150
+ {
5151
+ "epoch": 9.398976982097187,
5152
+ "grad_norm": 47.31606674194336,
5153
+ "learning_rate": 0.00016738811357813998,
5154
+ "loss": 5.3178,
5155
+ "step": 7350
5156
+ },
5157
+ {
5158
+ "epoch": 9.411764705882353,
5159
+ "grad_norm": 12.416561126708984,
5160
+ "learning_rate": 0.00016708492881396307,
5161
+ "loss": 5.3385,
5162
+ "step": 7360
5163
+ },
5164
+ {
5165
+ "epoch": 9.42455242966752,
5166
+ "grad_norm": 9.562813758850098,
5167
+ "learning_rate": 0.0001667816733174215,
5168
+ "loss": 5.3481,
5169
+ "step": 7370
5170
+ },
5171
+ {
5172
+ "epoch": 9.437340153452686,
5173
+ "grad_norm": 3.935084819793701,
5174
+ "learning_rate": 0.00016647834834400654,
5175
+ "loss": 5.3439,
5176
+ "step": 7380
5177
+ },
5178
+ {
5179
+ "epoch": 9.450127877237852,
5180
+ "grad_norm": 2.569079875946045,
5181
+ "learning_rate": 0.00016617495514949704,
5182
+ "loss": 5.3225,
5183
+ "step": 7390
5184
+ },
5185
+ {
5186
+ "epoch": 9.462915601023019,
5187
+ "grad_norm": 1.732500672340393,
5188
+ "learning_rate": 0.0001658714949899543,
5189
+ "loss": 5.3235,
5190
+ "step": 7400
5191
+ },
5192
+ {
5193
+ "epoch": 9.475703324808185,
5194
+ "grad_norm": 1.2219961881637573,
5195
+ "learning_rate": 0.00016556796912171689,
5196
+ "loss": 5.3413,
5197
+ "step": 7410
5198
+ },
5199
+ {
5200
+ "epoch": 9.48849104859335,
5201
+ "grad_norm": 0.9043082594871521,
5202
+ "learning_rate": 0.00016526437880139537,
5203
+ "loss": 5.288,
5204
+ "step": 7420
5205
+ },
5206
+ {
5207
+ "epoch": 9.501278772378516,
5208
+ "grad_norm": 1.0241756439208984,
5209
+ "learning_rate": 0.0001649607252858672,
5210
+ "loss": 5.302,
5211
+ "step": 7430
5212
+ },
5213
+ {
5214
+ "epoch": 9.514066496163682,
5215
+ "grad_norm": 1.5669431686401367,
5216
+ "learning_rate": 0.00016465700983227138,
5217
+ "loss": 5.2899,
5218
+ "step": 7440
5219
+ },
5220
+ {
5221
+ "epoch": 9.526854219948849,
5222
+ "grad_norm": 1.5617988109588623,
5223
+ "learning_rate": 0.00016435323369800344,
5224
+ "loss": 5.2868,
5225
+ "step": 7450
5226
+ },
5227
+ {
5228
+ "epoch": 9.539641943734015,
5229
+ "grad_norm": 1.1847914457321167,
5230
+ "learning_rate": 0.00016404939814071003,
5231
+ "loss": 5.2617,
5232
+ "step": 7460
5233
+ },
5234
+ {
5235
+ "epoch": 9.552429667519181,
5236
+ "grad_norm": 0.8560781478881836,
5237
+ "learning_rate": 0.0001637455044182839,
5238
+ "loss": 5.2855,
5239
+ "step": 7470
5240
+ },
5241
+ {
5242
+ "epoch": 9.565217391304348,
5243
+ "grad_norm": 0.9926068782806396,
5244
+ "learning_rate": 0.0001634415537888585,
5245
+ "loss": 5.265,
5246
+ "step": 7480
5247
+ },
5248
+ {
5249
+ "epoch": 9.578005115089514,
5250
+ "grad_norm": 2.2098798751831055,
5251
+ "learning_rate": 0.00016313754751080302,
5252
+ "loss": 5.2773,
5253
+ "step": 7490
5254
+ },
5255
+ {
5256
+ "epoch": 9.59079283887468,
5257
+ "grad_norm": 1.3162308931350708,
5258
+ "learning_rate": 0.00016283348684271694,
5259
+ "loss": 5.276,
5260
+ "step": 7500
5261
+ },
5262
+ {
5263
+ "epoch": 9.603580562659847,
5264
+ "grad_norm": 1.2072679996490479,
5265
+ "learning_rate": 0.00016252937304342494,
5266
+ "loss": 5.2825,
5267
+ "step": 7510
5268
+ },
5269
+ {
5270
+ "epoch": 9.616368286445013,
5271
+ "grad_norm": 1.212632656097412,
5272
+ "learning_rate": 0.0001622252073719717,
5273
+ "loss": 5.2609,
5274
+ "step": 7520
5275
+ },
5276
+ {
5277
+ "epoch": 9.62915601023018,
5278
+ "grad_norm": 1.244361400604248,
5279
+ "learning_rate": 0.0001619209910876165,
5280
+ "loss": 5.247,
5281
+ "step": 7530
5282
+ },
5283
+ {
5284
+ "epoch": 9.641943734015346,
5285
+ "grad_norm": 1.2935197353363037,
5286
+ "learning_rate": 0.00016161672544982842,
5287
+ "loss": 5.2666,
5288
+ "step": 7540
5289
+ },
5290
+ {
5291
+ "epoch": 9.654731457800512,
5292
+ "grad_norm": 1.270849347114563,
5293
+ "learning_rate": 0.00016131241171828063,
5294
+ "loss": 5.2556,
5295
+ "step": 7550
5296
+ },
5297
+ {
5298
+ "epoch": 9.667519181585678,
5299
+ "grad_norm": 1.5244646072387695,
5300
+ "learning_rate": 0.00016100805115284555,
5301
+ "loss": 5.2594,
5302
+ "step": 7560
5303
+ },
5304
+ {
5305
+ "epoch": 9.680306905370845,
5306
+ "grad_norm": 1.245428204536438,
5307
+ "learning_rate": 0.00016070364501358944,
5308
+ "loss": 5.2452,
5309
+ "step": 7570
5310
+ },
5311
+ {
5312
+ "epoch": 9.693094629156011,
5313
+ "grad_norm": 2.1988797187805176,
5314
+ "learning_rate": 0.00016039919456076727,
5315
+ "loss": 5.289,
5316
+ "step": 7580
5317
+ },
5318
+ {
5319
+ "epoch": 9.705882352941176,
5320
+ "grad_norm": 4.7878031730651855,
5321
+ "learning_rate": 0.00016009470105481736,
5322
+ "loss": 5.2933,
5323
+ "step": 7590
5324
+ },
5325
+ {
5326
+ "epoch": 9.718670076726342,
5327
+ "grad_norm": 1.0919615030288696,
5328
+ "learning_rate": 0.00015979016575635644,
5329
+ "loss": 5.2634,
5330
+ "step": 7600
5331
+ },
5332
+ {
5333
+ "epoch": 9.731457800511508,
5334
+ "grad_norm": 1.3478410243988037,
5335
+ "learning_rate": 0.00015948558992617416,
5336
+ "loss": 5.2808,
5337
+ "step": 7610
5338
+ },
5339
+ {
5340
+ "epoch": 9.744245524296675,
5341
+ "grad_norm": 1.088616132736206,
5342
+ "learning_rate": 0.00015918097482522798,
5343
+ "loss": 5.2656,
5344
+ "step": 7620
5345
+ },
5346
+ {
5347
+ "epoch": 9.757033248081841,
5348
+ "grad_norm": 1.4594988822937012,
5349
+ "learning_rate": 0.00015887632171463794,
5350
+ "loss": 5.2422,
5351
+ "step": 7630
5352
+ },
5353
+ {
5354
+ "epoch": 9.769820971867007,
5355
+ "grad_norm": 1.346463918685913,
5356
+ "learning_rate": 0.00015857163185568153,
5357
+ "loss": 5.2668,
5358
+ "step": 7640
5359
+ },
5360
+ {
5361
+ "epoch": 9.782608695652174,
5362
+ "grad_norm": 2.3005871772766113,
5363
+ "learning_rate": 0.00015826690650978825,
5364
+ "loss": 5.2295,
5365
+ "step": 7650
5366
+ },
5367
+ {
5368
+ "epoch": 9.79539641943734,
5369
+ "grad_norm": 1.372439980506897,
5370
+ "learning_rate": 0.0001579621469385346,
5371
+ "loss": 5.2419,
5372
+ "step": 7660
5373
+ },
5374
+ {
5375
+ "epoch": 9.808184143222507,
5376
+ "grad_norm": 1.0069390535354614,
5377
+ "learning_rate": 0.00015765735440363872,
5378
+ "loss": 5.2692,
5379
+ "step": 7670
5380
+ },
5381
+ {
5382
+ "epoch": 9.820971867007673,
5383
+ "grad_norm": 1.3591490983963013,
5384
+ "learning_rate": 0.00015735253016695527,
5385
+ "loss": 5.269,
5386
+ "step": 7680
5387
+ },
5388
+ {
5389
+ "epoch": 9.83375959079284,
5390
+ "grad_norm": 1.47946298122406,
5391
+ "learning_rate": 0.00015704767549047015,
5392
+ "loss": 5.2615,
5393
+ "step": 7690
5394
+ },
5395
+ {
5396
+ "epoch": 9.846547314578006,
5397
+ "grad_norm": 2.2882235050201416,
5398
+ "learning_rate": 0.00015674279163629528,
5399
+ "loss": 5.2452,
5400
+ "step": 7700
5401
+ },
5402
+ {
5403
+ "epoch": 9.859335038363172,
5404
+ "grad_norm": 1.406764268875122,
5405
+ "learning_rate": 0.00015643787986666333,
5406
+ "loss": 5.2515,
5407
+ "step": 7710
5408
+ },
5409
+ {
5410
+ "epoch": 9.872122762148338,
5411
+ "grad_norm": 0.9985896348953247,
5412
+ "learning_rate": 0.00015613294144392256,
5413
+ "loss": 5.2536,
5414
+ "step": 7720
5415
+ },
5416
+ {
5417
+ "epoch": 9.884910485933505,
5418
+ "grad_norm": 1.1201996803283691,
5419
+ "learning_rate": 0.00015582797763053166,
5420
+ "loss": 5.2459,
5421
+ "step": 7730
5422
+ },
5423
+ {
5424
+ "epoch": 9.89769820971867,
5425
+ "grad_norm": 1.2289462089538574,
5426
+ "learning_rate": 0.00015552298968905432,
5427
+ "loss": 5.2162,
5428
+ "step": 7740
5429
+ },
5430
+ {
5431
+ "epoch": 9.910485933503836,
5432
+ "grad_norm": 1.3729525804519653,
5433
+ "learning_rate": 0.00015521797888215424,
5434
+ "loss": 5.2488,
5435
+ "step": 7750
5436
+ },
5437
+ {
5438
+ "epoch": 9.923273657289002,
5439
+ "grad_norm": 1.3408769369125366,
5440
+ "learning_rate": 0.00015491294647258967,
5441
+ "loss": 5.2608,
5442
+ "step": 7760
5443
+ },
5444
+ {
5445
+ "epoch": 9.936061381074168,
5446
+ "grad_norm": 1.3026928901672363,
5447
+ "learning_rate": 0.0001546078937232083,
5448
+ "loss": 5.227,
5449
+ "step": 7770
5450
+ },
5451
+ {
5452
+ "epoch": 9.948849104859335,
5453
+ "grad_norm": 1.3715628385543823,
5454
+ "learning_rate": 0.00015430282189694212,
5455
+ "loss": 5.2677,
5456
+ "step": 7780
5457
+ },
5458
+ {
5459
+ "epoch": 9.961636828644501,
5460
+ "grad_norm": 1.0734012126922607,
5461
+ "learning_rate": 0.00015399773225680208,
5462
+ "loss": 5.2575,
5463
+ "step": 7790
5464
+ },
5465
+ {
5466
+ "epoch": 9.974424552429667,
5467
+ "grad_norm": 1.0179634094238281,
5468
+ "learning_rate": 0.00015369262606587281,
5469
+ "loss": 5.2117,
5470
+ "step": 7800
5471
+ },
5472
+ {
5473
+ "epoch": 9.987212276214834,
5474
+ "grad_norm": 1.2135021686553955,
5475
+ "learning_rate": 0.00015338750458730746,
5476
+ "loss": 5.2387,
5477
+ "step": 7810
5478
+ },
5479
+ {
5480
+ "epoch": 10.0,
5481
+ "grad_norm": 1.7299256324768066,
5482
+ "learning_rate": 0.00015308236908432264,
5483
+ "loss": 5.2517,
5484
+ "step": 7820
5485
+ },
5486
+ {
5487
+ "epoch": 10.012787723785166,
5488
+ "grad_norm": 1.3696908950805664,
5489
+ "learning_rate": 0.00015277722082019272,
5490
+ "loss": 5.2373,
5491
+ "step": 7830
5492
+ },
5493
+ {
5494
+ "epoch": 10.025575447570333,
5495
+ "grad_norm": 1.4967882633209229,
5496
+ "learning_rate": 0.00015247206105824522,
5497
+ "loss": 5.2045,
5498
+ "step": 7840
5499
+ },
5500
+ {
5501
+ "epoch": 10.038363171355499,
5502
+ "grad_norm": 1.4495280981063843,
5503
+ "learning_rate": 0.00015216689106185505,
5504
+ "loss": 5.2286,
5505
+ "step": 7850
5506
+ },
5507
+ {
5508
+ "epoch": 10.051150895140665,
5509
+ "grad_norm": 2.255189895629883,
5510
+ "learning_rate": 0.00015186171209443958,
5511
+ "loss": 5.1753,
5512
+ "step": 7860
5513
+ },
5514
+ {
5515
+ "epoch": 10.063938618925832,
5516
+ "grad_norm": 1.22813081741333,
5517
+ "learning_rate": 0.00015155652541945326,
5518
+ "loss": 5.2259,
5519
+ "step": 7870
5520
+ },
5521
+ {
5522
+ "epoch": 10.076726342710998,
5523
+ "grad_norm": 1.1559652090072632,
5524
+ "learning_rate": 0.00015125133230038256,
5525
+ "loss": 5.1997,
5526
+ "step": 7880
5527
+ },
5528
+ {
5529
+ "epoch": 10.089514066496164,
5530
+ "grad_norm": 1.5692169666290283,
5531
+ "learning_rate": 0.00015094613400074052,
5532
+ "loss": 5.2184,
5533
+ "step": 7890
5534
+ },
5535
+ {
5536
+ "epoch": 10.10230179028133,
5537
+ "grad_norm": 1.143264651298523,
5538
+ "learning_rate": 0.00015064093178406165,
5539
+ "loss": 5.2217,
5540
+ "step": 7900
5541
+ },
5542
+ {
5543
+ "epoch": 10.115089514066495,
5544
+ "grad_norm": 1.3026010990142822,
5545
+ "learning_rate": 0.00015033572691389673,
5546
+ "loss": 5.2219,
5547
+ "step": 7910
5548
+ },
5549
+ {
5550
+ "epoch": 10.127877237851662,
5551
+ "grad_norm": 1.255898356437683,
5552
+ "learning_rate": 0.00015003052065380742,
5553
+ "loss": 5.1964,
5554
+ "step": 7920
5555
+ },
5556
+ {
5557
+ "epoch": 10.140664961636828,
5558
+ "grad_norm": 1.5015469789505005,
5559
+ "learning_rate": 0.00014972531426736131,
5560
+ "loss": 5.241,
5561
+ "step": 7930
5562
+ },
5563
+ {
5564
+ "epoch": 10.153452685421994,
5565
+ "grad_norm": 1.5134257078170776,
5566
+ "learning_rate": 0.0001494201090181263,
5567
+ "loss": 5.2255,
5568
+ "step": 7940
5569
+ },
5570
+ {
5571
+ "epoch": 10.16624040920716,
5572
+ "grad_norm": 1.295350432395935,
5573
+ "learning_rate": 0.00014911490616966575,
5574
+ "loss": 5.2201,
5575
+ "step": 7950
5576
+ },
5577
+ {
5578
+ "epoch": 10.179028132992327,
5579
+ "grad_norm": 1.8612785339355469,
5580
+ "learning_rate": 0.000148809706985533,
5581
+ "loss": 5.2109,
5582
+ "step": 7960
5583
+ },
5584
+ {
5585
+ "epoch": 10.191815856777493,
5586
+ "grad_norm": 1.5443718433380127,
5587
+ "learning_rate": 0.0001485045127292662,
5588
+ "loss": 5.2247,
5589
+ "step": 7970
5590
+ },
5591
+ {
5592
+ "epoch": 10.20460358056266,
5593
+ "grad_norm": 1.7104902267456055,
5594
+ "learning_rate": 0.00014819932466438317,
5595
+ "loss": 5.2141,
5596
+ "step": 7980
5597
+ },
5598
+ {
5599
+ "epoch": 10.217391304347826,
5600
+ "grad_norm": 1.7787065505981445,
5601
+ "learning_rate": 0.00014789414405437607,
5602
+ "loss": 5.1911,
5603
+ "step": 7990
5604
+ },
5605
+ {
5606
+ "epoch": 10.230179028132993,
5607
+ "grad_norm": 1.0850389003753662,
5608
+ "learning_rate": 0.0001475889721627062,
5609
+ "loss": 5.2156,
5610
+ "step": 8000
5611
  }
5612
  ],
5613
  "logging_steps": 10,
 
5627
  "attributes": {}
5628
  }
5629
  },
5630
+ "total_flos": 1.0590849988794814e+19,
5631
  "train_batch_size": 32,
5632
  "trial_name": null,
5633
  "trial_params": null
{checkpoint-7000 → checkpoint-8000}/training_args.bin RENAMED
File without changes