Plofski commited on
Commit
00fc286
·
verified ·
1 Parent(s): 4ae0828

Training in progress, step 14000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5a9bd42305a39ea10e14897e10ee483294601df6c8b6bb20eb9acc7de3a5b74
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:071ff40e66008578cff6a11839a98b3bd55870fb4ecd78b520fd649a835f02e1
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1fd3300583dc98302b4bc1805b201303b140f489f169bc005adefa8fde0fce38
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0402536afc76b268263c8a44f7565c5d35ba54094497cf95e3c11e92a054cd5
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5ce5bfd25fb939a324385a4adfd5b1d29fedc6793352a13b276f53eccc661d15
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7200e211c4af21388df4ea9729221c37205d2f4defca496f0d1b43ecbe09b628
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.7201289542615354,
6
  "eval_steps": 500,
7
- "global_step": 13500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -12158,6 +12158,456 @@
12158
  "mean_token_accuracy": 0.7941052973270416,
12159
  "num_tokens": 14956201.0,
12160
  "step": 13500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12161
  }
12162
  ],
12163
  "logging_steps": 10,
@@ -12177,7 +12627,7 @@
12177
  "attributes": {}
12178
  }
12179
  },
12180
- "total_flos": 1.807875931971379e+16,
12181
  "train_batch_size": 8,
12182
  "trial_name": null,
12183
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.8208744710860367,
6
  "eval_steps": 500,
7
+ "global_step": 14000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
12158
  "mean_token_accuracy": 0.7941052973270416,
12159
  "num_tokens": 14956201.0,
12160
  "step": 13500
12161
+ },
12162
+ {
12163
+ "epoch": 2.722143864598025,
12164
+ "grad_norm": 13.3125,
12165
+ "learning_rate": 1.8537175095708242e-06,
12166
+ "loss": 0.8932,
12167
+ "mean_token_accuracy": 0.7903563916683197,
12168
+ "num_tokens": 14968110.0,
12169
+ "step": 13510
12170
+ },
12171
+ {
12172
+ "epoch": 2.7241587749345153,
12173
+ "grad_norm": 12.4375,
12174
+ "learning_rate": 1.8402847739942239e-06,
12175
+ "loss": 0.8239,
12176
+ "mean_token_accuracy": 0.7959101080894471,
12177
+ "num_tokens": 14979544.0,
12178
+ "step": 13520
12179
+ },
12180
+ {
12181
+ "epoch": 2.7261736852710055,
12182
+ "grad_norm": 14.25,
12183
+ "learning_rate": 1.8268520384176238e-06,
12184
+ "loss": 0.8986,
12185
+ "mean_token_accuracy": 0.7841361403465271,
12186
+ "num_tokens": 14990853.0,
12187
+ "step": 13530
12188
+ },
12189
+ {
12190
+ "epoch": 2.7281885956074956,
12191
+ "grad_norm": 9.5,
12192
+ "learning_rate": 1.8134193028410239e-06,
12193
+ "loss": 0.7908,
12194
+ "mean_token_accuracy": 0.7991403341293335,
12195
+ "num_tokens": 15002448.0,
12196
+ "step": 13540
12197
+ },
12198
+ {
12199
+ "epoch": 2.7302035059439858,
12200
+ "grad_norm": 11.8125,
12201
+ "learning_rate": 1.7999865672644234e-06,
12202
+ "loss": 0.8348,
12203
+ "mean_token_accuracy": 0.797630226612091,
12204
+ "num_tokens": 15012426.0,
12205
+ "step": 13550
12206
+ },
12207
+ {
12208
+ "epoch": 2.7322184162804755,
12209
+ "grad_norm": 12.375,
12210
+ "learning_rate": 1.7865538316878235e-06,
12211
+ "loss": 0.829,
12212
+ "mean_token_accuracy": 0.8008688688278198,
12213
+ "num_tokens": 15023568.0,
12214
+ "step": 13560
12215
+ },
12216
+ {
12217
+ "epoch": 2.7342333266169656,
12218
+ "grad_norm": 11.875,
12219
+ "learning_rate": 1.7731210961112234e-06,
12220
+ "loss": 0.8301,
12221
+ "mean_token_accuracy": 0.7909713625907898,
12222
+ "num_tokens": 15033450.0,
12223
+ "step": 13570
12224
+ },
12225
+ {
12226
+ "epoch": 2.7362482369534558,
12227
+ "grad_norm": 10.875,
12228
+ "learning_rate": 1.759688360534623e-06,
12229
+ "loss": 0.797,
12230
+ "mean_token_accuracy": 0.8022173583507538,
12231
+ "num_tokens": 15045140.0,
12232
+ "step": 13580
12233
+ },
12234
+ {
12235
+ "epoch": 2.7382631472899455,
12236
+ "grad_norm": 10.4375,
12237
+ "learning_rate": 1.746255624958023e-06,
12238
+ "loss": 0.8381,
12239
+ "mean_token_accuracy": 0.7926445186138154,
12240
+ "num_tokens": 15056256.0,
12241
+ "step": 13590
12242
+ },
12243
+ {
12244
+ "epoch": 2.7402780576264356,
12245
+ "grad_norm": 11.1875,
12246
+ "learning_rate": 1.7328228893814228e-06,
12247
+ "loss": 0.8104,
12248
+ "mean_token_accuracy": 0.7977364182472229,
12249
+ "num_tokens": 15068134.0,
12250
+ "step": 13600
12251
+ },
12252
+ {
12253
+ "epoch": 2.7422929679629258,
12254
+ "grad_norm": 10.125,
12255
+ "learning_rate": 1.7193901538048225e-06,
12256
+ "loss": 0.8133,
12257
+ "mean_token_accuracy": 0.8040676951408386,
12258
+ "num_tokens": 15079578.0,
12259
+ "step": 13610
12260
+ },
12261
+ {
12262
+ "epoch": 2.7443078782994155,
12263
+ "grad_norm": 11.0,
12264
+ "learning_rate": 1.7059574182282224e-06,
12265
+ "loss": 0.9289,
12266
+ "mean_token_accuracy": 0.7738179624080658,
12267
+ "num_tokens": 15090034.0,
12268
+ "step": 13620
12269
+ },
12270
+ {
12271
+ "epoch": 2.7463227886359056,
12272
+ "grad_norm": 11.375,
12273
+ "learning_rate": 1.692524682651622e-06,
12274
+ "loss": 0.8635,
12275
+ "mean_token_accuracy": 0.7958697319030762,
12276
+ "num_tokens": 15101919.0,
12277
+ "step": 13630
12278
+ },
12279
+ {
12280
+ "epoch": 2.7483376989723958,
12281
+ "grad_norm": 13.0625,
12282
+ "learning_rate": 1.679091947075022e-06,
12283
+ "loss": 0.8911,
12284
+ "mean_token_accuracy": 0.7814191520214081,
12285
+ "num_tokens": 15114084.0,
12286
+ "step": 13640
12287
+ },
12288
+ {
12289
+ "epoch": 2.750352609308886,
12290
+ "grad_norm": 12.75,
12291
+ "learning_rate": 1.6656592114984219e-06,
12292
+ "loss": 0.7362,
12293
+ "mean_token_accuracy": 0.8138824105262756,
12294
+ "num_tokens": 15124878.0,
12295
+ "step": 13650
12296
+ },
12297
+ {
12298
+ "epoch": 2.7523675196453756,
12299
+ "grad_norm": 11.75,
12300
+ "learning_rate": 1.6522264759218216e-06,
12301
+ "loss": 0.8195,
12302
+ "mean_token_accuracy": 0.793831080198288,
12303
+ "num_tokens": 15135525.0,
12304
+ "step": 13660
12305
+ },
12306
+ {
12307
+ "epoch": 2.7543824299818658,
12308
+ "grad_norm": 9.8125,
12309
+ "learning_rate": 1.6387937403452214e-06,
12310
+ "loss": 0.7857,
12311
+ "mean_token_accuracy": 0.8074389100074768,
12312
+ "num_tokens": 15147692.0,
12313
+ "step": 13670
12314
+ },
12315
+ {
12316
+ "epoch": 2.756397340318356,
12317
+ "grad_norm": 10.125,
12318
+ "learning_rate": 1.6253610047686213e-06,
12319
+ "loss": 0.9199,
12320
+ "mean_token_accuracy": 0.7814192116260529,
12321
+ "num_tokens": 15159592.0,
12322
+ "step": 13680
12323
+ },
12324
+ {
12325
+ "epoch": 2.758412250654846,
12326
+ "grad_norm": 10.5625,
12327
+ "learning_rate": 1.611928269192021e-06,
12328
+ "loss": 0.7825,
12329
+ "mean_token_accuracy": 0.7981011807918549,
12330
+ "num_tokens": 15171601.0,
12331
+ "step": 13690
12332
+ },
12333
+ {
12334
+ "epoch": 2.760427160991336,
12335
+ "grad_norm": 14.9375,
12336
+ "learning_rate": 1.598495533615421e-06,
12337
+ "loss": 0.9254,
12338
+ "mean_token_accuracy": 0.777032095193863,
12339
+ "num_tokens": 15182890.0,
12340
+ "step": 13700
12341
+ },
12342
+ {
12343
+ "epoch": 2.762442071327826,
12344
+ "grad_norm": 12.125,
12345
+ "learning_rate": 1.5850627980388208e-06,
12346
+ "loss": 0.7658,
12347
+ "mean_token_accuracy": 0.8108864903450013,
12348
+ "num_tokens": 15193434.0,
12349
+ "step": 13710
12350
+ },
12351
+ {
12352
+ "epoch": 2.764456981664316,
12353
+ "grad_norm": 12.6875,
12354
+ "learning_rate": 1.5716300624622205e-06,
12355
+ "loss": 0.7604,
12356
+ "mean_token_accuracy": 0.8065372705459595,
12357
+ "num_tokens": 15204253.0,
12358
+ "step": 13720
12359
+ },
12360
+ {
12361
+ "epoch": 2.766471892000806,
12362
+ "grad_norm": 12.75,
12363
+ "learning_rate": 1.5581973268856204e-06,
12364
+ "loss": 0.7993,
12365
+ "mean_token_accuracy": 0.8044365346431732,
12366
+ "num_tokens": 15214089.0,
12367
+ "step": 13730
12368
+ },
12369
+ {
12370
+ "epoch": 2.768486802337296,
12371
+ "grad_norm": 11.4375,
12372
+ "learning_rate": 1.5447645913090203e-06,
12373
+ "loss": 0.8261,
12374
+ "mean_token_accuracy": 0.7986261487007141,
12375
+ "num_tokens": 15224909.0,
12376
+ "step": 13740
12377
+ },
12378
+ {
12379
+ "epoch": 2.770501712673786,
12380
+ "grad_norm": 16.125,
12381
+ "learning_rate": 1.53133185573242e-06,
12382
+ "loss": 0.9516,
12383
+ "mean_token_accuracy": 0.7706651806831359,
12384
+ "num_tokens": 15237744.0,
12385
+ "step": 13750
12386
+ },
12387
+ {
12388
+ "epoch": 2.772516623010276,
12389
+ "grad_norm": 18.25,
12390
+ "learning_rate": 1.5178991201558199e-06,
12391
+ "loss": 0.8024,
12392
+ "mean_token_accuracy": 0.8005965650081635,
12393
+ "num_tokens": 15248510.0,
12394
+ "step": 13760
12395
+ },
12396
+ {
12397
+ "epoch": 2.774531533346766,
12398
+ "grad_norm": 11.8125,
12399
+ "learning_rate": 1.5044663845792195e-06,
12400
+ "loss": 0.7795,
12401
+ "mean_token_accuracy": 0.8093705713748932,
12402
+ "num_tokens": 15258924.0,
12403
+ "step": 13770
12404
+ },
12405
+ {
12406
+ "epoch": 2.776546443683256,
12407
+ "grad_norm": 8.9375,
12408
+ "learning_rate": 1.4910336490026194e-06,
12409
+ "loss": 0.822,
12410
+ "mean_token_accuracy": 0.8004359900951385,
12411
+ "num_tokens": 15270012.0,
12412
+ "step": 13780
12413
+ },
12414
+ {
12415
+ "epoch": 2.7785613540197462,
12416
+ "grad_norm": 13.6875,
12417
+ "learning_rate": 1.4776009134260193e-06,
12418
+ "loss": 0.7118,
12419
+ "mean_token_accuracy": 0.8202294111251831,
12420
+ "num_tokens": 15280170.0,
12421
+ "step": 13790
12422
+ },
12423
+ {
12424
+ "epoch": 2.7805762643562364,
12425
+ "grad_norm": 10.4375,
12426
+ "learning_rate": 1.464168177849419e-06,
12427
+ "loss": 0.7994,
12428
+ "mean_token_accuracy": 0.8046676278114319,
12429
+ "num_tokens": 15291487.0,
12430
+ "step": 13800
12431
+ },
12432
+ {
12433
+ "epoch": 2.782591174692726,
12434
+ "grad_norm": 11.375,
12435
+ "learning_rate": 1.450735442272819e-06,
12436
+ "loss": 0.7917,
12437
+ "mean_token_accuracy": 0.7997995793819428,
12438
+ "num_tokens": 15302931.0,
12439
+ "step": 13810
12440
+ },
12441
+ {
12442
+ "epoch": 2.7846060850292162,
12443
+ "grad_norm": 11.75,
12444
+ "learning_rate": 1.437302706696219e-06,
12445
+ "loss": 0.8199,
12446
+ "mean_token_accuracy": 0.7960925221443176,
12447
+ "num_tokens": 15313041.0,
12448
+ "step": 13820
12449
+ },
12450
+ {
12451
+ "epoch": 2.7866209953657064,
12452
+ "grad_norm": 11.6875,
12453
+ "learning_rate": 1.4238699711196185e-06,
12454
+ "loss": 0.7861,
12455
+ "mean_token_accuracy": 0.8075309932231903,
12456
+ "num_tokens": 15324592.0,
12457
+ "step": 13830
12458
+ },
12459
+ {
12460
+ "epoch": 2.788635905702196,
12461
+ "grad_norm": 10.75,
12462
+ "learning_rate": 1.4104372355430186e-06,
12463
+ "loss": 0.9482,
12464
+ "mean_token_accuracy": 0.7787281274795532,
12465
+ "num_tokens": 15336954.0,
12466
+ "step": 13840
12467
+ },
12468
+ {
12469
+ "epoch": 2.7906508160386863,
12470
+ "grad_norm": 10.8125,
12471
+ "learning_rate": 1.3970044999664185e-06,
12472
+ "loss": 0.7294,
12473
+ "mean_token_accuracy": 0.81562819480896,
12474
+ "num_tokens": 15346970.0,
12475
+ "step": 13850
12476
+ },
12477
+ {
12478
+ "epoch": 2.7926657263751764,
12479
+ "grad_norm": 12.5,
12480
+ "learning_rate": 1.3835717643898182e-06,
12481
+ "loss": 0.8401,
12482
+ "mean_token_accuracy": 0.7911224365234375,
12483
+ "num_tokens": 15357988.0,
12484
+ "step": 13860
12485
+ },
12486
+ {
12487
+ "epoch": 2.794680636711666,
12488
+ "grad_norm": 11.6875,
12489
+ "learning_rate": 1.370139028813218e-06,
12490
+ "loss": 0.8417,
12491
+ "mean_token_accuracy": 0.7968161761760711,
12492
+ "num_tokens": 15368806.0,
12493
+ "step": 13870
12494
+ },
12495
+ {
12496
+ "epoch": 2.7966955470481563,
12497
+ "grad_norm": 12.875,
12498
+ "learning_rate": 1.3567062932366175e-06,
12499
+ "loss": 0.8506,
12500
+ "mean_token_accuracy": 0.7901014566421509,
12501
+ "num_tokens": 15378192.0,
12502
+ "step": 13880
12503
+ },
12504
+ {
12505
+ "epoch": 2.7987104573846464,
12506
+ "grad_norm": 12.1875,
12507
+ "learning_rate": 1.3432735576600176e-06,
12508
+ "loss": 0.7323,
12509
+ "mean_token_accuracy": 0.812350469827652,
12510
+ "num_tokens": 15388640.0,
12511
+ "step": 13890
12512
+ },
12513
+ {
12514
+ "epoch": 2.8007253677211366,
12515
+ "grad_norm": 13.0625,
12516
+ "learning_rate": 1.3298408220834175e-06,
12517
+ "loss": 0.7982,
12518
+ "mean_token_accuracy": 0.8071064949035645,
12519
+ "num_tokens": 15398587.0,
12520
+ "step": 13900
12521
+ },
12522
+ {
12523
+ "epoch": 2.8027402780576267,
12524
+ "grad_norm": 10.25,
12525
+ "learning_rate": 1.3164080865068172e-06,
12526
+ "loss": 0.9217,
12527
+ "mean_token_accuracy": 0.7764141440391541,
12528
+ "num_tokens": 15408343.0,
12529
+ "step": 13910
12530
+ },
12531
+ {
12532
+ "epoch": 2.8047551883941164,
12533
+ "grad_norm": 12.25,
12534
+ "learning_rate": 1.302975350930217e-06,
12535
+ "loss": 0.7961,
12536
+ "mean_token_accuracy": 0.799578857421875,
12537
+ "num_tokens": 15419485.0,
12538
+ "step": 13920
12539
+ },
12540
+ {
12541
+ "epoch": 2.8067700987306066,
12542
+ "grad_norm": 11.3125,
12543
+ "learning_rate": 1.289542615353617e-06,
12544
+ "loss": 0.8453,
12545
+ "mean_token_accuracy": 0.794361412525177,
12546
+ "num_tokens": 15431130.0,
12547
+ "step": 13930
12548
+ },
12549
+ {
12550
+ "epoch": 2.8087850090670967,
12551
+ "grad_norm": 14.75,
12552
+ "learning_rate": 1.2761098797770167e-06,
12553
+ "loss": 0.8325,
12554
+ "mean_token_accuracy": 0.7931640625,
12555
+ "num_tokens": 15442377.0,
12556
+ "step": 13940
12557
+ },
12558
+ {
12559
+ "epoch": 2.8107999194035864,
12560
+ "grad_norm": 11.5625,
12561
+ "learning_rate": 1.2626771442004166e-06,
12562
+ "loss": 0.7471,
12563
+ "mean_token_accuracy": 0.8172047972679138,
12564
+ "num_tokens": 15453500.0,
12565
+ "step": 13950
12566
+ },
12567
+ {
12568
+ "epoch": 2.8128148297400766,
12569
+ "grad_norm": 11.6875,
12570
+ "learning_rate": 1.2492444086238162e-06,
12571
+ "loss": 0.8275,
12572
+ "mean_token_accuracy": 0.7963262915611267,
12573
+ "num_tokens": 15465388.0,
12574
+ "step": 13960
12575
+ },
12576
+ {
12577
+ "epoch": 2.8148297400765667,
12578
+ "grad_norm": 12.875,
12579
+ "learning_rate": 1.2358116730472161e-06,
12580
+ "loss": 0.7675,
12581
+ "mean_token_accuracy": 0.8086236357688904,
12582
+ "num_tokens": 15475972.0,
12583
+ "step": 13970
12584
+ },
12585
+ {
12586
+ "epoch": 2.8168446504130564,
12587
+ "grad_norm": 10.0,
12588
+ "learning_rate": 1.222378937470616e-06,
12589
+ "loss": 0.7866,
12590
+ "mean_token_accuracy": 0.8052810370922089,
12591
+ "num_tokens": 15489313.0,
12592
+ "step": 13980
12593
+ },
12594
+ {
12595
+ "epoch": 2.8188595607495466,
12596
+ "grad_norm": 11.8125,
12597
+ "learning_rate": 1.208946201894016e-06,
12598
+ "loss": 0.7903,
12599
+ "mean_token_accuracy": 0.805169427394867,
12600
+ "num_tokens": 15500110.0,
12601
+ "step": 13990
12602
+ },
12603
+ {
12604
+ "epoch": 2.8208744710860367,
12605
+ "grad_norm": 13.5,
12606
+ "learning_rate": 1.1955134663174156e-06,
12607
+ "loss": 0.8045,
12608
+ "mean_token_accuracy": 0.7995685517787934,
12609
+ "num_tokens": 15509702.0,
12610
+ "step": 14000
12611
  }
12612
  ],
12613
  "logging_steps": 10,
 
12627
  "attributes": {}
12628
  }
12629
  },
12630
+ "total_flos": 1.874850530342093e+16,
12631
  "train_batch_size": 8,
12632
  "trial_name": null,
12633
  "trial_params": null