Plofski commited on
Commit
2486be3
·
verified ·
1 Parent(s): 96dc39d

Training in progress, step 13000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:18518c164df026440f068fac8233b3bff2d8d4502ff38a32a862597f23f6b7c0
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4e31724b0cf74835ae0b9aaeff5c05e7e852cb9e158de0e35d8a673c930d429
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c73b91ebf8be54d28c1c49c244582f7f70def8a8258d400992d104200bbf23d2
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98033794262f4774a192ebe69b4dfddba3edee43a3cce40cedfd5c1785391e67
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:113d12b5af2a861076397bdce257b8a1e5a1daabe8a5aaee5bfcbdb6024fca69
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3526295826c2a8db767925a5ee2fce15661c2f21ba999bd2bc96732400f36f2d
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.518637920612533,
6
  "eval_steps": 500,
7
- "global_step": 12500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -11258,6 +11258,456 @@
11258
  "mean_token_accuracy": 0.8074711799621582,
11259
  "num_tokens": 13840892.0,
11260
  "step": 12500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11261
  }
11262
  ],
11263
  "logging_steps": 10,
@@ -11277,7 +11727,7 @@
11277
  "attributes": {}
11278
  }
11279
  },
11280
- "total_flos": 1.6741415131650048e+16,
11281
  "train_batch_size": 8,
11282
  "trial_name": null,
11283
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.619383437437034,
6
  "eval_steps": 500,
7
+ "global_step": 13000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
11258
  "mean_token_accuracy": 0.8074711799621582,
11259
  "num_tokens": 13840892.0,
11260
  "step": 12500
11261
+ },
11262
+ {
11263
+ "epoch": 2.5206528309490226,
11264
+ "grad_norm": 14.0625,
11265
+ "learning_rate": 3.196991067230842e-06,
11266
+ "loss": 0.8395,
11267
+ "mean_token_accuracy": 0.797085040807724,
11268
+ "num_tokens": 13852682.0,
11269
+ "step": 12510
11270
+ },
11271
+ {
11272
+ "epoch": 2.5226677412855127,
11273
+ "grad_norm": 10.4375,
11274
+ "learning_rate": 3.183558331654242e-06,
11275
+ "loss": 0.7477,
11276
+ "mean_token_accuracy": 0.8105276763439179,
11277
+ "num_tokens": 13864663.0,
11278
+ "step": 12520
11279
+ },
11280
+ {
11281
+ "epoch": 2.524682651622003,
11282
+ "grad_norm": 10.5625,
11283
+ "learning_rate": 3.1701255960776416e-06,
11284
+ "loss": 0.6768,
11285
+ "mean_token_accuracy": 0.8293303847312927,
11286
+ "num_tokens": 13876171.0,
11287
+ "step": 12530
11288
+ },
11289
+ {
11290
+ "epoch": 2.5266975619584926,
11291
+ "grad_norm": 9.625,
11292
+ "learning_rate": 3.156692860501041e-06,
11293
+ "loss": 0.72,
11294
+ "mean_token_accuracy": 0.8201279520988465,
11295
+ "num_tokens": 13887876.0,
11296
+ "step": 12540
11297
+ },
11298
+ {
11299
+ "epoch": 2.5287124722949827,
11300
+ "grad_norm": 15.6875,
11301
+ "learning_rate": 3.143260124924441e-06,
11302
+ "loss": 0.8625,
11303
+ "mean_token_accuracy": 0.7869448184967041,
11304
+ "num_tokens": 13898022.0,
11305
+ "step": 12550
11306
+ },
11307
+ {
11308
+ "epoch": 2.530727382631473,
11309
+ "grad_norm": 12.625,
11310
+ "learning_rate": 3.129827389347841e-06,
11311
+ "loss": 0.8983,
11312
+ "mean_token_accuracy": 0.7845638215541839,
11313
+ "num_tokens": 13909748.0,
11314
+ "step": 12560
11315
+ },
11316
+ {
11317
+ "epoch": 2.532742292967963,
11318
+ "grad_norm": 10.25,
11319
+ "learning_rate": 3.1163946537712408e-06,
11320
+ "loss": 0.8229,
11321
+ "mean_token_accuracy": 0.7928247213363647,
11322
+ "num_tokens": 13922277.0,
11323
+ "step": 12570
11324
+ },
11325
+ {
11326
+ "epoch": 2.534757203304453,
11327
+ "grad_norm": 11.125,
11328
+ "learning_rate": 3.102961918194641e-06,
11329
+ "loss": 0.7876,
11330
+ "mean_token_accuracy": 0.7989083111286164,
11331
+ "num_tokens": 13934598.0,
11332
+ "step": 12580
11333
+ },
11334
+ {
11335
+ "epoch": 2.536772113640943,
11336
+ "grad_norm": 12.9375,
11337
+ "learning_rate": 3.0895291826180406e-06,
11338
+ "loss": 0.7705,
11339
+ "mean_token_accuracy": 0.8085067272186279,
11340
+ "num_tokens": 13944920.0,
11341
+ "step": 12590
11342
+ },
11343
+ {
11344
+ "epoch": 2.538787023977433,
11345
+ "grad_norm": 12.1875,
11346
+ "learning_rate": 3.0760964470414402e-06,
11347
+ "loss": 0.8116,
11348
+ "mean_token_accuracy": 0.7995778679847717,
11349
+ "num_tokens": 13956061.0,
11350
+ "step": 12600
11351
+ },
11352
+ {
11353
+ "epoch": 2.540801934313923,
11354
+ "grad_norm": 9.9375,
11355
+ "learning_rate": 3.06266371146484e-06,
11356
+ "loss": 0.767,
11357
+ "mean_token_accuracy": 0.8065134942531585,
11358
+ "num_tokens": 13967029.0,
11359
+ "step": 12610
11360
+ },
11361
+ {
11362
+ "epoch": 2.542816844650413,
11363
+ "grad_norm": 11.25,
11364
+ "learning_rate": 3.04923097588824e-06,
11365
+ "loss": 0.8058,
11366
+ "mean_token_accuracy": 0.7920451164245605,
11367
+ "num_tokens": 13978641.0,
11368
+ "step": 12620
11369
+ },
11370
+ {
11371
+ "epoch": 2.544831754986903,
11372
+ "grad_norm": 11.1875,
11373
+ "learning_rate": 3.0357982403116397e-06,
11374
+ "loss": 0.7992,
11375
+ "mean_token_accuracy": 0.7988592565059662,
11376
+ "num_tokens": 13989687.0,
11377
+ "step": 12630
11378
+ },
11379
+ {
11380
+ "epoch": 2.546846665323393,
11381
+ "grad_norm": 14.125,
11382
+ "learning_rate": 3.02236550473504e-06,
11383
+ "loss": 0.6931,
11384
+ "mean_token_accuracy": 0.8216106593608856,
11385
+ "num_tokens": 14001448.0,
11386
+ "step": 12640
11387
+ },
11388
+ {
11389
+ "epoch": 2.548861575659883,
11390
+ "grad_norm": 12.6875,
11391
+ "learning_rate": 3.008932769158439e-06,
11392
+ "loss": 0.8851,
11393
+ "mean_token_accuracy": 0.7797039806842804,
11394
+ "num_tokens": 14012660.0,
11395
+ "step": 12650
11396
+ },
11397
+ {
11398
+ "epoch": 2.550876485996373,
11399
+ "grad_norm": 11.0625,
11400
+ "learning_rate": 2.995500033581839e-06,
11401
+ "loss": 0.8223,
11402
+ "mean_token_accuracy": 0.7970936000347137,
11403
+ "num_tokens": 14023087.0,
11404
+ "step": 12660
11405
+ },
11406
+ {
11407
+ "epoch": 2.552891396332863,
11408
+ "grad_norm": 10.25,
11409
+ "learning_rate": 2.982067298005239e-06,
11410
+ "loss": 0.7795,
11411
+ "mean_token_accuracy": 0.8052306652069092,
11412
+ "num_tokens": 14035227.0,
11413
+ "step": 12670
11414
+ },
11415
+ {
11416
+ "epoch": 2.5549063066693534,
11417
+ "grad_norm": 10.6875,
11418
+ "learning_rate": 2.968634562428639e-06,
11419
+ "loss": 0.8418,
11420
+ "mean_token_accuracy": 0.7952195703983307,
11421
+ "num_tokens": 14046084.0,
11422
+ "step": 12680
11423
+ },
11424
+ {
11425
+ "epoch": 2.5569212170058435,
11426
+ "grad_norm": 11.625,
11427
+ "learning_rate": 2.9552018268520386e-06,
11428
+ "loss": 0.7606,
11429
+ "mean_token_accuracy": 0.8044079065322876,
11430
+ "num_tokens": 14055858.0,
11431
+ "step": 12690
11432
+ },
11433
+ {
11434
+ "epoch": 2.5589361273423332,
11435
+ "grad_norm": 11.375,
11436
+ "learning_rate": 2.9417690912754388e-06,
11437
+ "loss": 0.7828,
11438
+ "mean_token_accuracy": 0.8071886241436005,
11439
+ "num_tokens": 14067097.0,
11440
+ "step": 12700
11441
+ },
11442
+ {
11443
+ "epoch": 2.5609510376788234,
11444
+ "grad_norm": 14.1875,
11445
+ "learning_rate": 2.928336355698838e-06,
11446
+ "loss": 0.8814,
11447
+ "mean_token_accuracy": 0.7862110197544098,
11448
+ "num_tokens": 14077884.0,
11449
+ "step": 12710
11450
+ },
11451
+ {
11452
+ "epoch": 2.5629659480153135,
11453
+ "grad_norm": 11.8125,
11454
+ "learning_rate": 2.914903620122238e-06,
11455
+ "loss": 0.8627,
11456
+ "mean_token_accuracy": 0.7814090967178344,
11457
+ "num_tokens": 14088843.0,
11458
+ "step": 12720
11459
+ },
11460
+ {
11461
+ "epoch": 2.5649808583518032,
11462
+ "grad_norm": 14.6875,
11463
+ "learning_rate": 2.901470884545638e-06,
11464
+ "loss": 0.8469,
11465
+ "mean_token_accuracy": 0.7937661349773407,
11466
+ "num_tokens": 14100309.0,
11467
+ "step": 12730
11468
+ },
11469
+ {
11470
+ "epoch": 2.5669957686882934,
11471
+ "grad_norm": 11.1875,
11472
+ "learning_rate": 2.888038148969038e-06,
11473
+ "loss": 0.8517,
11474
+ "mean_token_accuracy": 0.790552693605423,
11475
+ "num_tokens": 14110323.0,
11476
+ "step": 12740
11477
+ },
11478
+ {
11479
+ "epoch": 2.5690106790247835,
11480
+ "grad_norm": 12.625,
11481
+ "learning_rate": 2.874605413392438e-06,
11482
+ "loss": 0.8309,
11483
+ "mean_token_accuracy": 0.795288497209549,
11484
+ "num_tokens": 14121179.0,
11485
+ "step": 12750
11486
+ },
11487
+ {
11488
+ "epoch": 2.5710255893612732,
11489
+ "grad_norm": 11.9375,
11490
+ "learning_rate": 2.8611726778158373e-06,
11491
+ "loss": 0.8162,
11492
+ "mean_token_accuracy": 0.7960000455379486,
11493
+ "num_tokens": 14132215.0,
11494
+ "step": 12760
11495
+ },
11496
+ {
11497
+ "epoch": 2.5730404996977634,
11498
+ "grad_norm": 11.875,
11499
+ "learning_rate": 2.847739942239237e-06,
11500
+ "loss": 0.794,
11501
+ "mean_token_accuracy": 0.8040944337844849,
11502
+ "num_tokens": 14142028.0,
11503
+ "step": 12770
11504
+ },
11505
+ {
11506
+ "epoch": 2.5750554100342535,
11507
+ "grad_norm": 9.6875,
11508
+ "learning_rate": 2.834307206662637e-06,
11509
+ "loss": 0.9474,
11510
+ "mean_token_accuracy": 0.7681374192237854,
11511
+ "num_tokens": 14153369.0,
11512
+ "step": 12780
11513
+ },
11514
+ {
11515
+ "epoch": 2.5770703203707432,
11516
+ "grad_norm": 11.5625,
11517
+ "learning_rate": 2.820874471086037e-06,
11518
+ "loss": 0.8301,
11519
+ "mean_token_accuracy": 0.7957022428512573,
11520
+ "num_tokens": 14165045.0,
11521
+ "step": 12790
11522
+ },
11523
+ {
11524
+ "epoch": 2.5790852307072334,
11525
+ "grad_norm": 13.9375,
11526
+ "learning_rate": 2.807441735509437e-06,
11527
+ "loss": 0.7298,
11528
+ "mean_token_accuracy": 0.812953507900238,
11529
+ "num_tokens": 14175171.0,
11530
+ "step": 12800
11531
+ },
11532
+ {
11533
+ "epoch": 2.5811001410437235,
11534
+ "grad_norm": 10.0625,
11535
+ "learning_rate": 2.794008999932837e-06,
11536
+ "loss": 0.8874,
11537
+ "mean_token_accuracy": 0.7833206593990326,
11538
+ "num_tokens": 14186567.0,
11539
+ "step": 12810
11540
+ },
11541
+ {
11542
+ "epoch": 2.5831150513802137,
11543
+ "grad_norm": 14.4375,
11544
+ "learning_rate": 2.780576264356236e-06,
11545
+ "loss": 0.7494,
11546
+ "mean_token_accuracy": 0.8073345363140106,
11547
+ "num_tokens": 14196603.0,
11548
+ "step": 12820
11549
+ },
11550
+ {
11551
+ "epoch": 2.585129961716704,
11552
+ "grad_norm": 12.6875,
11553
+ "learning_rate": 2.7671435287796363e-06,
11554
+ "loss": 0.7582,
11555
+ "mean_token_accuracy": 0.8067417740821838,
11556
+ "num_tokens": 14207335.0,
11557
+ "step": 12830
11558
+ },
11559
+ {
11560
+ "epoch": 2.5871448720531935,
11561
+ "grad_norm": 12.375,
11562
+ "learning_rate": 2.753710793203036e-06,
11563
+ "loss": 0.7523,
11564
+ "mean_token_accuracy": 0.8145627319812775,
11565
+ "num_tokens": 14218474.0,
11566
+ "step": 12840
11567
+ },
11568
+ {
11569
+ "epoch": 2.5891597823896837,
11570
+ "grad_norm": 10.375,
11571
+ "learning_rate": 2.740278057626436e-06,
11572
+ "loss": 0.8045,
11573
+ "mean_token_accuracy": 0.8010720014572144,
11574
+ "num_tokens": 14229469.0,
11575
+ "step": 12850
11576
+ },
11577
+ {
11578
+ "epoch": 2.591174692726174,
11579
+ "grad_norm": 12.625,
11580
+ "learning_rate": 2.7268453220498358e-06,
11581
+ "loss": 0.8266,
11582
+ "mean_token_accuracy": 0.7978542387485504,
11583
+ "num_tokens": 14240757.0,
11584
+ "step": 12860
11585
+ },
11586
+ {
11587
+ "epoch": 2.5931896030626636,
11588
+ "grad_norm": 11.3125,
11589
+ "learning_rate": 2.713412586473235e-06,
11590
+ "loss": 0.8082,
11591
+ "mean_token_accuracy": 0.7974193513393402,
11592
+ "num_tokens": 14251148.0,
11593
+ "step": 12870
11594
+ },
11595
+ {
11596
+ "epoch": 2.5952045133991537,
11597
+ "grad_norm": 11.3125,
11598
+ "learning_rate": 2.699979850896635e-06,
11599
+ "loss": 0.8217,
11600
+ "mean_token_accuracy": 0.7950396835803986,
11601
+ "num_tokens": 14263499.0,
11602
+ "step": 12880
11603
+ },
11604
+ {
11605
+ "epoch": 2.597219423735644,
11606
+ "grad_norm": 12.4375,
11607
+ "learning_rate": 2.6865471153200352e-06,
11608
+ "loss": 0.7426,
11609
+ "mean_token_accuracy": 0.8107175350189209,
11610
+ "num_tokens": 14273600.0,
11611
+ "step": 12890
11612
+ },
11613
+ {
11614
+ "epoch": 2.5992343340721336,
11615
+ "grad_norm": 12.1875,
11616
+ "learning_rate": 2.673114379743435e-06,
11617
+ "loss": 0.7092,
11618
+ "mean_token_accuracy": 0.8177358627319335,
11619
+ "num_tokens": 14284136.0,
11620
+ "step": 12900
11621
+ },
11622
+ {
11623
+ "epoch": 2.6012492444086237,
11624
+ "grad_norm": 12.625,
11625
+ "learning_rate": 2.659681644166835e-06,
11626
+ "loss": 0.7701,
11627
+ "mean_token_accuracy": 0.8068889915943146,
11628
+ "num_tokens": 14294590.0,
11629
+ "step": 12910
11630
+ },
11631
+ {
11632
+ "epoch": 2.603264154745114,
11633
+ "grad_norm": 11.375,
11634
+ "learning_rate": 2.6462489085902347e-06,
11635
+ "loss": 0.8433,
11636
+ "mean_token_accuracy": 0.7921142339706421,
11637
+ "num_tokens": 14305206.0,
11638
+ "step": 12920
11639
+ },
11640
+ {
11641
+ "epoch": 2.605279065081604,
11642
+ "grad_norm": 9.75,
11643
+ "learning_rate": 2.6328161730136344e-06,
11644
+ "loss": 0.7931,
11645
+ "mean_token_accuracy": 0.7983499586582183,
11646
+ "num_tokens": 14315998.0,
11647
+ "step": 12930
11648
+ },
11649
+ {
11650
+ "epoch": 2.607293975418094,
11651
+ "grad_norm": 14.4375,
11652
+ "learning_rate": 2.619383437437034e-06,
11653
+ "loss": 0.8605,
11654
+ "mean_token_accuracy": 0.7901061117649079,
11655
+ "num_tokens": 14326408.0,
11656
+ "step": 12940
11657
+ },
11658
+ {
11659
+ "epoch": 2.609308885754584,
11660
+ "grad_norm": 8.6875,
11661
+ "learning_rate": 2.605950701860434e-06,
11662
+ "loss": 0.8868,
11663
+ "mean_token_accuracy": 0.7825572431087494,
11664
+ "num_tokens": 14337843.0,
11665
+ "step": 12950
11666
+ },
11667
+ {
11668
+ "epoch": 2.611323796091074,
11669
+ "grad_norm": 11.0,
11670
+ "learning_rate": 2.592517966283834e-06,
11671
+ "loss": 0.7792,
11672
+ "mean_token_accuracy": 0.7979351162910462,
11673
+ "num_tokens": 14348649.0,
11674
+ "step": 12960
11675
+ },
11676
+ {
11677
+ "epoch": 2.613338706427564,
11678
+ "grad_norm": 10.5,
11679
+ "learning_rate": 2.579085230707234e-06,
11680
+ "loss": 0.832,
11681
+ "mean_token_accuracy": 0.7962626338005065,
11682
+ "num_tokens": 14360145.0,
11683
+ "step": 12970
11684
+ },
11685
+ {
11686
+ "epoch": 2.615353616764054,
11687
+ "grad_norm": 9.5,
11688
+ "learning_rate": 2.5656524951306332e-06,
11689
+ "loss": 0.8188,
11690
+ "mean_token_accuracy": 0.7971078157424927,
11691
+ "num_tokens": 14371903.0,
11692
+ "step": 12980
11693
+ },
11694
+ {
11695
+ "epoch": 2.617368527100544,
11696
+ "grad_norm": 10.1875,
11697
+ "learning_rate": 2.5522197595540333e-06,
11698
+ "loss": 0.837,
11699
+ "mean_token_accuracy": 0.7964988470077514,
11700
+ "num_tokens": 14381687.0,
11701
+ "step": 12990
11702
+ },
11703
+ {
11704
+ "epoch": 2.619383437437034,
11705
+ "grad_norm": 12.8125,
11706
+ "learning_rate": 2.538787023977433e-06,
11707
+ "loss": 0.9483,
11708
+ "mean_token_accuracy": 0.7685989677906037,
11709
+ "num_tokens": 14393395.0,
11710
+ "step": 13000
11711
  }
11712
  ],
11713
  "logging_steps": 10,
 
11727
  "attributes": {}
11728
  }
11729
  },
11730
+ "total_flos": 1.7403253820080128e+16,
11731
  "train_batch_size": 8,
11732
  "trial_name": null,
11733
  "trial_params": null