irodkin committed on
Commit a03ccaa · verified · 1 Parent(s): 0f0a241

Training checkpoint at step 15000

Files changed (1)
  1. trainer_state.json +366 -6
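For context, trainer_state.json is the Hugging Face Trainer's running record of training progress: the best checkpoint tracked so far plus a per-step log_history, which is what this commit extends through step 15000. A minimal sketch of how the updated fields can be read back, assuming the checkpoint folder has been downloaded locally (the path below is hypothetical):

import json

# Hypothetical local path; adjust to wherever checkpoint-15000 was downloaded.
with open("checkpoint-15000/trainer_state.json") as f:
    state = json.load(f)

# Top-level fields changed in this commit.
print(state["best_global_step"])        # 15000
print(state["best_metric"])             # 2.397136688232422 (lowest eval_loss so far)
print(state["best_model_checkpoint"])   # .../run_21/checkpoint-15000
print(state["global_step"], state["epoch"])  # 15000 0.3

# log_history holds the appended records: train entries every 25 steps
# ("logging_steps": 25) and eval entries every 100 steps ("eval_steps": 100).
history = state["log_history"]
eval_curve = [(e["step"], e["eval_loss"]) for e in history if "eval_loss" in e]
print(eval_curve[-1])                   # (15000, 2.397136688232422)

# The logged learning rates drop by a constant amount per step, consistent
# with the linear schedule named in the run directory.
lrs = [(e["step"], e["learning_rate"]) for e in history if "learning_rate" in e]
(s0, r0), (s1, r1) = lrs[-2], lrs[-1]
print((r0 - r1) / (s1 - s0))            # ~2.2e-10 per step
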
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
  {
- "best_global_step": 13900,
- "best_metric": 2.3990118503570557,
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-13000",
- "epoch": 0.28,
+ "best_global_step": 15000,
+ "best_metric": 2.397136688232422,
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-15000",
+ "epoch": 0.3,
  "eval_steps": 100,
- "global_step": 14000,
+ "global_step": 15000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -5048,6 +5048,366 @@
  "eval_samples_per_second": 3.222,
  "eval_steps_per_second": 1.611,
  "step": 14000
+ },
+ {
+ "epoch": 0.2805,
+ "grad_norm": 0.5680524398621669,
+ "learning_rate": 7.994666666666666e-06,
+ "loss": 2.382,
+ "step": 14025
+ },
+ {
+ "epoch": 0.281,
+ "grad_norm": 0.5577808062612865,
+ "learning_rate": 7.989111111111112e-06,
+ "loss": 2.3817,
+ "step": 14050
+ },
+ {
+ "epoch": 0.2815,
+ "grad_norm": 0.5609272583996402,
+ "learning_rate": 7.983555555555557e-06,
+ "loss": 2.3807,
+ "step": 14075
+ },
+ {
+ "epoch": 0.282,
+ "grad_norm": 0.5572862450140419,
+ "learning_rate": 7.978e-06,
+ "loss": 2.3883,
+ "step": 14100
+ },
+ {
+ "epoch": 0.282,
+ "eval_loss": 2.399045467376709,
+ "eval_runtime": 31.4262,
+ "eval_samples_per_second": 3.246,
+ "eval_steps_per_second": 1.623,
+ "step": 14100
+ },
+ {
+ "epoch": 0.2825,
+ "grad_norm": 0.5548825232758766,
+ "learning_rate": 7.972444444444444e-06,
+ "loss": 2.3906,
+ "step": 14125
+ },
+ {
+ "epoch": 0.283,
+ "grad_norm": 0.5699464235282781,
+ "learning_rate": 7.96688888888889e-06,
+ "loss": 2.3985,
+ "step": 14150
+ },
+ {
+ "epoch": 0.2835,
+ "grad_norm": 0.5949860745449153,
+ "learning_rate": 7.961333333333335e-06,
+ "loss": 2.384,
+ "step": 14175
+ },
+ {
+ "epoch": 0.284,
+ "grad_norm": 1.207767068552352,
+ "learning_rate": 7.955777777777778e-06,
+ "loss": 2.3897,
+ "step": 14200
+ },
+ {
+ "epoch": 0.284,
+ "eval_loss": 2.3988163471221924,
+ "eval_runtime": 31.5331,
+ "eval_samples_per_second": 3.235,
+ "eval_steps_per_second": 1.617,
+ "step": 14200
+ },
+ {
+ "epoch": 0.2845,
+ "grad_norm": 0.5734778733619218,
+ "learning_rate": 7.950222222222222e-06,
+ "loss": 2.3995,
+ "step": 14225
+ },
+ {
+ "epoch": 0.285,
+ "grad_norm": 0.5809053174835214,
+ "learning_rate": 7.944666666666667e-06,
+ "loss": 2.3935,
+ "step": 14250
+ },
+ {
+ "epoch": 0.2855,
+ "grad_norm": 0.5721177604701749,
+ "learning_rate": 7.939111111111112e-06,
+ "loss": 2.3831,
+ "step": 14275
+ },
+ {
+ "epoch": 0.286,
+ "grad_norm": 0.5870187369085319,
+ "learning_rate": 7.933555555555556e-06,
+ "loss": 2.3876,
+ "step": 14300
+ },
+ {
+ "epoch": 0.286,
+ "eval_loss": 2.3985910415649414,
+ "eval_runtime": 31.8276,
+ "eval_samples_per_second": 3.205,
+ "eval_steps_per_second": 1.602,
+ "step": 14300
+ },
+ {
+ "epoch": 0.2865,
+ "grad_norm": 0.5540420732959112,
+ "learning_rate": 7.928e-06,
+ "loss": 2.3894,
+ "step": 14325
+ },
+ {
+ "epoch": 0.287,
+ "grad_norm": 0.5771375830109964,
+ "learning_rate": 7.922444444444445e-06,
+ "loss": 2.3919,
+ "step": 14350
+ },
+ {
+ "epoch": 0.2875,
+ "grad_norm": 0.558274829145414,
+ "learning_rate": 7.91688888888889e-06,
+ "loss": 2.3792,
+ "step": 14375
+ },
+ {
+ "epoch": 0.288,
+ "grad_norm": 0.5489382411994304,
+ "learning_rate": 7.911333333333333e-06,
+ "loss": 2.382,
+ "step": 14400
+ },
+ {
+ "epoch": 0.288,
+ "eval_loss": 2.398547887802124,
+ "eval_runtime": 31.7859,
+ "eval_samples_per_second": 3.209,
+ "eval_steps_per_second": 1.604,
+ "step": 14400
+ },
+ {
+ "epoch": 0.2885,
+ "grad_norm": 0.5437020470565486,
+ "learning_rate": 7.905777777777779e-06,
+ "loss": 2.391,
+ "step": 14425
+ },
+ {
+ "epoch": 0.289,
+ "grad_norm": 0.5822012645571201,
+ "learning_rate": 7.900222222222222e-06,
+ "loss": 2.3774,
+ "step": 14450
+ },
+ {
+ "epoch": 0.2895,
+ "grad_norm": 0.5662409547337693,
+ "learning_rate": 7.894666666666667e-06,
+ "loss": 2.3754,
+ "step": 14475
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 0.574336415517884,
+ "learning_rate": 7.889111111111113e-06,
+ "loss": 2.3696,
+ "step": 14500
+ },
+ {
+ "epoch": 0.29,
+ "eval_loss": 2.3984858989715576,
+ "eval_runtime": 31.7473,
+ "eval_samples_per_second": 3.213,
+ "eval_steps_per_second": 1.606,
+ "step": 14500
+ },
+ {
+ "epoch": 0.2905,
+ "grad_norm": 0.5564392509678192,
+ "learning_rate": 7.883555555555556e-06,
+ "loss": 2.3856,
+ "step": 14525
+ },
+ {
+ "epoch": 0.291,
+ "grad_norm": 0.5518394045498354,
+ "learning_rate": 7.878e-06,
+ "loss": 2.3972,
+ "step": 14550
+ },
+ {
+ "epoch": 0.2915,
+ "grad_norm": 0.5795808696759357,
+ "learning_rate": 7.872444444444445e-06,
+ "loss": 2.3831,
+ "step": 14575
+ },
+ {
+ "epoch": 0.292,
+ "grad_norm": 0.5601055983017486,
+ "learning_rate": 7.86688888888889e-06,
+ "loss": 2.3844,
+ "step": 14600
+ },
+ {
+ "epoch": 0.292,
+ "eval_loss": 2.3982439041137695,
+ "eval_runtime": 31.6763,
+ "eval_samples_per_second": 3.22,
+ "eval_steps_per_second": 1.61,
+ "step": 14600
+ },
+ {
+ "epoch": 0.2925,
+ "grad_norm": 0.5964235234322374,
+ "learning_rate": 7.861333333333334e-06,
+ "loss": 2.3899,
+ "step": 14625
+ },
+ {
+ "epoch": 0.293,
+ "grad_norm": 0.5610795516162878,
+ "learning_rate": 7.855777777777779e-06,
+ "loss": 2.3838,
+ "step": 14650
+ },
+ {
+ "epoch": 0.2935,
+ "grad_norm": 0.5670881867616083,
+ "learning_rate": 7.850222222222223e-06,
+ "loss": 2.3825,
+ "step": 14675
+ },
+ {
+ "epoch": 0.294,
+ "grad_norm": 0.5643624181789829,
+ "learning_rate": 7.844666666666668e-06,
+ "loss": 2.3882,
+ "step": 14700
+ },
+ {
+ "epoch": 0.294,
+ "eval_loss": 2.398089647293091,
+ "eval_runtime": 31.7677,
+ "eval_samples_per_second": 3.211,
+ "eval_steps_per_second": 1.605,
+ "step": 14700
+ },
+ {
+ "epoch": 0.2945,
+ "grad_norm": 0.5686315690402087,
+ "learning_rate": 7.839111111111111e-06,
+ "loss": 2.3745,
+ "step": 14725
+ },
+ {
+ "epoch": 0.295,
+ "grad_norm": 0.5893983725540548,
+ "learning_rate": 7.833555555555557e-06,
+ "loss": 2.378,
+ "step": 14750
+ },
+ {
+ "epoch": 0.2955,
+ "grad_norm": 0.5972901998200331,
+ "learning_rate": 7.828000000000002e-06,
+ "loss": 2.377,
+ "step": 14775
+ },
+ {
+ "epoch": 0.296,
+ "grad_norm": 0.5804879541179684,
+ "learning_rate": 7.822444444444446e-06,
+ "loss": 2.3911,
+ "step": 14800
+ },
+ {
+ "epoch": 0.296,
+ "eval_loss": 2.397839069366455,
+ "eval_runtime": 31.7602,
+ "eval_samples_per_second": 3.212,
+ "eval_steps_per_second": 1.606,
+ "step": 14800
+ },
+ {
+ "epoch": 0.2965,
+ "grad_norm": 0.577463980570899,
+ "learning_rate": 7.816888888888889e-06,
+ "loss": 2.3896,
+ "step": 14825
+ },
+ {
+ "epoch": 0.297,
+ "grad_norm": 0.5800702741538564,
+ "learning_rate": 7.811333333333334e-06,
+ "loss": 2.3838,
+ "step": 14850
+ },
+ {
+ "epoch": 0.2975,
+ "grad_norm": 0.6037725626202978,
+ "learning_rate": 7.80577777777778e-06,
+ "loss": 2.3827,
+ "step": 14875
+ },
+ {
+ "epoch": 0.298,
+ "grad_norm": 0.5862145198472817,
+ "learning_rate": 7.800222222222223e-06,
+ "loss": 2.3801,
+ "step": 14900
+ },
+ {
+ "epoch": 0.298,
+ "eval_loss": 2.3976035118103027,
+ "eval_runtime": 31.751,
+ "eval_samples_per_second": 3.212,
+ "eval_steps_per_second": 1.606,
+ "step": 14900
+ },
+ {
+ "epoch": 0.2985,
+ "grad_norm": 0.5670781074548332,
+ "learning_rate": 7.794666666666667e-06,
+ "loss": 2.3819,
+ "step": 14925
+ },
+ {
+ "epoch": 0.299,
+ "grad_norm": 0.5571823653622203,
+ "learning_rate": 7.789111111111112e-06,
+ "loss": 2.3835,
+ "step": 14950
+ },
+ {
+ "epoch": 0.2995,
+ "grad_norm": 0.5733242457342494,
+ "learning_rate": 7.783555555555557e-06,
+ "loss": 2.3728,
+ "step": 14975
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 0.5619677124489769,
+ "learning_rate": 7.778e-06,
+ "loss": 2.3794,
+ "step": 15000
+ },
+ {
+ "epoch": 0.3,
+ "eval_loss": 2.397136688232422,
+ "eval_runtime": 31.7183,
+ "eval_samples_per_second": 3.216,
+ "eval_steps_per_second": 1.608,
+ "step": 15000
  }
  ],
  "logging_steps": 25,
 
@@ -5067,7 +5427,7 @@
  "attributes": {}
  }
  },
- "total_flos": 4.456483217658085e+19,
+ "total_flos": 4.774803447490806e+19,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null