Plofski commited on
Commit
6421f08
·
verified ·
1 Parent(s): 94557ad

Training in progress, step 12000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c78f458d11eae9e4154eb728cce06719d74e09c423918147e47d15f28937e92f
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3e424925fa2b2770536f70d1899af46260c1bbb5c290c98396f2248352c7add
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:036c2c1f7cdbea44cbf7137c6b1c3cf16b5447a1c3d590934dbf649691bc4729
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be8d0890e7228cd98f10766bc63bebe515a3fa05be0c7762618a01f87fa2799c
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f9a4928c3c29f8d8ffe6d8c80c93af4c98237f714bf32b55ba4f3d5d67a23da
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdff80ed8983588a862f2109bcc080c93759e076260079b20d08888071ee3452
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.3171468869635303,
6
  "eval_steps": 500,
7
- "global_step": 11500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -10358,6 +10358,456 @@
10358
  "mean_token_accuracy": 0.8073502600193023,
10359
  "num_tokens": 12733862.0,
10360
  "step": 11500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10361
  }
10362
  ],
10363
  "logging_steps": 10,
@@ -10377,7 +10827,7 @@
10377
  "attributes": {}
10378
  }
10379
  },
10380
- "total_flos": 1.5401013006618624e+16,
10381
  "train_batch_size": 8,
10382
  "trial_name": null,
10383
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.4178924037880316,
6
  "eval_steps": 500,
7
+ "global_step": 12000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
10358
  "mean_token_accuracy": 0.8073502600193023,
10359
  "num_tokens": 12733862.0,
10360
  "step": 11500
10361
+ },
10362
+ {
10363
+ "epoch": 2.31916179730002,
10364
+ "grad_norm": 11.9375,
10365
+ "learning_rate": 4.540264624890859e-06,
10366
+ "loss": 0.8758,
10367
+ "mean_token_accuracy": 0.7853485405445099,
10368
+ "num_tokens": 12745243.0,
10369
+ "step": 11510
10370
+ },
10371
+ {
10372
+ "epoch": 2.32117670763651,
10373
+ "grad_norm": 13.8125,
10374
+ "learning_rate": 4.526831889314259e-06,
10375
+ "loss": 0.8738,
10376
+ "mean_token_accuracy": 0.7857240378856659,
10377
+ "num_tokens": 12756282.0,
10378
+ "step": 11520
10379
+ },
10380
+ {
10381
+ "epoch": 2.3231916179730003,
10382
+ "grad_norm": 9.3125,
10383
+ "learning_rate": 4.5133991537376595e-06,
10384
+ "loss": 0.8261,
10385
+ "mean_token_accuracy": 0.7956898987293244,
10386
+ "num_tokens": 12766669.0,
10387
+ "step": 11530
10388
+ },
10389
+ {
10390
+ "epoch": 2.32520652830949,
10391
+ "grad_norm": 11.25,
10392
+ "learning_rate": 4.499966418161058e-06,
10393
+ "loss": 0.8651,
10394
+ "mean_token_accuracy": 0.7886692404747009,
10395
+ "num_tokens": 12777269.0,
10396
+ "step": 11540
10397
+ },
10398
+ {
10399
+ "epoch": 2.32722143864598,
10400
+ "grad_norm": 9.9375,
10401
+ "learning_rate": 4.486533682584459e-06,
10402
+ "loss": 0.7734,
10403
+ "mean_token_accuracy": 0.805381816625595,
10404
+ "num_tokens": 12788362.0,
10405
+ "step": 11550
10406
+ },
10407
+ {
10408
+ "epoch": 2.3292363489824703,
10409
+ "grad_norm": 11.125,
10410
+ "learning_rate": 4.4731009470078585e-06,
10411
+ "loss": 0.7621,
10412
+ "mean_token_accuracy": 0.8127647817134858,
10413
+ "num_tokens": 12798806.0,
10414
+ "step": 11560
10415
+ },
10416
+ {
10417
+ "epoch": 2.3312512593189605,
10418
+ "grad_norm": 11.375,
10419
+ "learning_rate": 4.459668211431258e-06,
10420
+ "loss": 0.8961,
10421
+ "mean_token_accuracy": 0.7812471866607666,
10422
+ "num_tokens": 12809177.0,
10423
+ "step": 11570
10424
+ },
10425
+ {
10426
+ "epoch": 2.33326616965545,
10427
+ "grad_norm": 11.5625,
10428
+ "learning_rate": 4.446235475854659e-06,
10429
+ "loss": 0.8318,
10430
+ "mean_token_accuracy": 0.791858333349228,
10431
+ "num_tokens": 12819801.0,
10432
+ "step": 11580
10433
+ },
10434
+ {
10435
+ "epoch": 2.3352810799919403,
10436
+ "grad_norm": 12.9375,
10437
+ "learning_rate": 4.432802740278058e-06,
10438
+ "loss": 0.8065,
10439
+ "mean_token_accuracy": 0.8027865469455719,
10440
+ "num_tokens": 12830608.0,
10441
+ "step": 11590
10442
+ },
10443
+ {
10444
+ "epoch": 2.3372959903284305,
10445
+ "grad_norm": 13.5,
10446
+ "learning_rate": 4.419370004701457e-06,
10447
+ "loss": 0.8616,
10448
+ "mean_token_accuracy": 0.7896072804927826,
10449
+ "num_tokens": 12841034.0,
10450
+ "step": 11600
10451
+ },
10452
+ {
10453
+ "epoch": 2.3393109006649206,
10454
+ "grad_norm": 12.0625,
10455
+ "learning_rate": 4.405937269124858e-06,
10456
+ "loss": 0.8395,
10457
+ "mean_token_accuracy": 0.7913760662078857,
10458
+ "num_tokens": 12852500.0,
10459
+ "step": 11610
10460
+ },
10461
+ {
10462
+ "epoch": 2.3413258110014104,
10463
+ "grad_norm": 12.9375,
10464
+ "learning_rate": 4.3925045335482574e-06,
10465
+ "loss": 0.7748,
10466
+ "mean_token_accuracy": 0.8028417646884918,
10467
+ "num_tokens": 12862957.0,
10468
+ "step": 11620
10469
+ },
10470
+ {
10471
+ "epoch": 2.3433407213379005,
10472
+ "grad_norm": 11.9375,
10473
+ "learning_rate": 4.379071797971657e-06,
10474
+ "loss": 0.726,
10475
+ "mean_token_accuracy": 0.8169633626937867,
10476
+ "num_tokens": 12873810.0,
10477
+ "step": 11630
10478
+ },
10479
+ {
10480
+ "epoch": 2.3453556316743907,
10481
+ "grad_norm": 14.25,
10482
+ "learning_rate": 4.365639062395058e-06,
10483
+ "loss": 0.7605,
10484
+ "mean_token_accuracy": 0.8124097108840942,
10485
+ "num_tokens": 12884782.0,
10486
+ "step": 11640
10487
+ },
10488
+ {
10489
+ "epoch": 2.3473705420108804,
10490
+ "grad_norm": 10.125,
10491
+ "learning_rate": 4.3522063268184565e-06,
10492
+ "loss": 0.8192,
10493
+ "mean_token_accuracy": 0.7956154048442841,
10494
+ "num_tokens": 12896291.0,
10495
+ "step": 11650
10496
+ },
10497
+ {
10498
+ "epoch": 2.3493854523473705,
10499
+ "grad_norm": 10.5625,
10500
+ "learning_rate": 4.338773591241857e-06,
10501
+ "loss": 0.7772,
10502
+ "mean_token_accuracy": 0.803158450126648,
10503
+ "num_tokens": 12908167.0,
10504
+ "step": 11660
10505
+ },
10506
+ {
10507
+ "epoch": 2.3514003626838607,
10508
+ "grad_norm": 10.4375,
10509
+ "learning_rate": 4.325340855665257e-06,
10510
+ "loss": 0.9174,
10511
+ "mean_token_accuracy": 0.7823165059089661,
10512
+ "num_tokens": 12920439.0,
10513
+ "step": 11670
10514
+ },
10515
+ {
10516
+ "epoch": 2.3534152730203504,
10517
+ "grad_norm": 12.375,
10518
+ "learning_rate": 4.311908120088656e-06,
10519
+ "loss": 0.9018,
10520
+ "mean_token_accuracy": 0.7804082155227661,
10521
+ "num_tokens": 12932185.0,
10522
+ "step": 11680
10523
+ },
10524
+ {
10525
+ "epoch": 2.3554301833568405,
10526
+ "grad_norm": 13.25,
10527
+ "learning_rate": 4.298475384512056e-06,
10528
+ "loss": 0.7592,
10529
+ "mean_token_accuracy": 0.8126965999603272,
10530
+ "num_tokens": 12942868.0,
10531
+ "step": 11690
10532
+ },
10533
+ {
10534
+ "epoch": 2.3574450936933307,
10535
+ "grad_norm": 12.0,
10536
+ "learning_rate": 4.285042648935457e-06,
10537
+ "loss": 0.78,
10538
+ "mean_token_accuracy": 0.7975714325904846,
10539
+ "num_tokens": 12954738.0,
10540
+ "step": 11700
10541
+ },
10542
+ {
10543
+ "epoch": 2.359460004029821,
10544
+ "grad_norm": 12.375,
10545
+ "learning_rate": 4.271609913358855e-06,
10546
+ "loss": 0.8378,
10547
+ "mean_token_accuracy": 0.7859510540962219,
10548
+ "num_tokens": 12967446.0,
10549
+ "step": 11710
10550
+ },
10551
+ {
10552
+ "epoch": 2.3614749143663105,
10553
+ "grad_norm": 11.9375,
10554
+ "learning_rate": 4.258177177782256e-06,
10555
+ "loss": 0.7837,
10556
+ "mean_token_accuracy": 0.8069138765335083,
10557
+ "num_tokens": 12978934.0,
10558
+ "step": 11720
10559
+ },
10560
+ {
10561
+ "epoch": 2.3634898247028007,
10562
+ "grad_norm": 11.9375,
10563
+ "learning_rate": 4.244744442205656e-06,
10564
+ "loss": 0.8833,
10565
+ "mean_token_accuracy": 0.786394476890564,
10566
+ "num_tokens": 12989739.0,
10567
+ "step": 11730
10568
+ },
10569
+ {
10570
+ "epoch": 2.365504735039291,
10571
+ "grad_norm": 11.75,
10572
+ "learning_rate": 4.231311706629055e-06,
10573
+ "loss": 0.8046,
10574
+ "mean_token_accuracy": 0.8040765285491943,
10575
+ "num_tokens": 13001096.0,
10576
+ "step": 11740
10577
+ },
10578
+ {
10579
+ "epoch": 2.367519645375781,
10580
+ "grad_norm": 10.75,
10581
+ "learning_rate": 4.217878971052455e-06,
10582
+ "loss": 0.7969,
10583
+ "mean_token_accuracy": 0.8002257823944092,
10584
+ "num_tokens": 13013888.0,
10585
+ "step": 11750
10586
+ },
10587
+ {
10588
+ "epoch": 2.3695345557122707,
10589
+ "grad_norm": 11.875,
10590
+ "learning_rate": 4.204446235475855e-06,
10591
+ "loss": 0.7773,
10592
+ "mean_token_accuracy": 0.8048622369766235,
10593
+ "num_tokens": 13024639.0,
10594
+ "step": 11760
10595
+ },
10596
+ {
10597
+ "epoch": 2.371549466048761,
10598
+ "grad_norm": 10.0,
10599
+ "learning_rate": 4.191013499899254e-06,
10600
+ "loss": 0.7608,
10601
+ "mean_token_accuracy": 0.8129175007343292,
10602
+ "num_tokens": 13035841.0,
10603
+ "step": 11770
10604
+ },
10605
+ {
10606
+ "epoch": 2.373564376385251,
10607
+ "grad_norm": 10.0,
10608
+ "learning_rate": 4.177580764322655e-06,
10609
+ "loss": 0.7931,
10610
+ "mean_token_accuracy": 0.8056257784366607,
10611
+ "num_tokens": 13046314.0,
10612
+ "step": 11780
10613
+ },
10614
+ {
10615
+ "epoch": 2.3755792867217407,
10616
+ "grad_norm": 11.625,
10617
+ "learning_rate": 4.1641480287460546e-06,
10618
+ "loss": 0.7639,
10619
+ "mean_token_accuracy": 0.8064506113529205,
10620
+ "num_tokens": 13057027.0,
10621
+ "step": 11790
10622
+ },
10623
+ {
10624
+ "epoch": 2.377594197058231,
10625
+ "grad_norm": 13.0,
10626
+ "learning_rate": 4.150715293169454e-06,
10627
+ "loss": 0.8375,
10628
+ "mean_token_accuracy": 0.7984327495098114,
10629
+ "num_tokens": 13068328.0,
10630
+ "step": 11800
10631
+ },
10632
+ {
10633
+ "epoch": 2.379609107394721,
10634
+ "grad_norm": 10.625,
10635
+ "learning_rate": 4.137282557592854e-06,
10636
+ "loss": 0.7661,
10637
+ "mean_token_accuracy": 0.8164677619934082,
10638
+ "num_tokens": 13079532.0,
10639
+ "step": 11810
10640
+ },
10641
+ {
10642
+ "epoch": 2.381624017731211,
10643
+ "grad_norm": 12.0,
10644
+ "learning_rate": 4.123849822016254e-06,
10645
+ "loss": 0.8284,
10646
+ "mean_token_accuracy": 0.7940251708030701,
10647
+ "num_tokens": 13091655.0,
10648
+ "step": 11820
10649
+ },
10650
+ {
10651
+ "epoch": 2.383638928067701,
10652
+ "grad_norm": 9.4375,
10653
+ "learning_rate": 4.110417086439653e-06,
10654
+ "loss": 0.778,
10655
+ "mean_token_accuracy": 0.8088996291160584,
10656
+ "num_tokens": 13103593.0,
10657
+ "step": 11830
10658
+ },
10659
+ {
10660
+ "epoch": 2.385653838404191,
10661
+ "grad_norm": 11.875,
10662
+ "learning_rate": 4.096984350863054e-06,
10663
+ "loss": 0.9015,
10664
+ "mean_token_accuracy": 0.7831744253635406,
10665
+ "num_tokens": 13115707.0,
10666
+ "step": 11840
10667
+ },
10668
+ {
10669
+ "epoch": 2.387668748740681,
10670
+ "grad_norm": 11.0625,
10671
+ "learning_rate": 4.0835516152864535e-06,
10672
+ "loss": 0.7765,
10673
+ "mean_token_accuracy": 0.8058106303215027,
10674
+ "num_tokens": 13126526.0,
10675
+ "step": 11850
10676
+ },
10677
+ {
10678
+ "epoch": 2.3896836590771713,
10679
+ "grad_norm": 11.4375,
10680
+ "learning_rate": 4.070118879709853e-06,
10681
+ "loss": 0.8312,
10682
+ "mean_token_accuracy": 0.7961088418960571,
10683
+ "num_tokens": 13138535.0,
10684
+ "step": 11860
10685
+ },
10686
+ {
10687
+ "epoch": 2.391698569413661,
10688
+ "grad_norm": 12.5625,
10689
+ "learning_rate": 4.056686144133254e-06,
10690
+ "loss": 0.7469,
10691
+ "mean_token_accuracy": 0.8120961427688599,
10692
+ "num_tokens": 13149421.0,
10693
+ "step": 11870
10694
+ },
10695
+ {
10696
+ "epoch": 2.393713479750151,
10697
+ "grad_norm": 12.0,
10698
+ "learning_rate": 4.0432534085566526e-06,
10699
+ "loss": 0.7109,
10700
+ "mean_token_accuracy": 0.8213139772415161,
10701
+ "num_tokens": 13159892.0,
10702
+ "step": 11880
10703
+ },
10704
+ {
10705
+ "epoch": 2.3957283900866413,
10706
+ "grad_norm": 11.75,
10707
+ "learning_rate": 4.029820672980052e-06,
10708
+ "loss": 0.7591,
10709
+ "mean_token_accuracy": 0.8050274133682251,
10710
+ "num_tokens": 13171470.0,
10711
+ "step": 11890
10712
+ },
10713
+ {
10714
+ "epoch": 2.397743300423131,
10715
+ "grad_norm": 10.8125,
10716
+ "learning_rate": 4.016387937403453e-06,
10717
+ "loss": 0.7032,
10718
+ "mean_token_accuracy": 0.8176105141639709,
10719
+ "num_tokens": 13182185.0,
10720
+ "step": 11900
10721
+ },
10722
+ {
10723
+ "epoch": 2.399758210759621,
10724
+ "grad_norm": 14.75,
10725
+ "learning_rate": 4.0029552018268524e-06,
10726
+ "loss": 0.8339,
10727
+ "mean_token_accuracy": 0.7909499406814575,
10728
+ "num_tokens": 13193599.0,
10729
+ "step": 11910
10730
+ },
10731
+ {
10732
+ "epoch": 2.4017731210961113,
10733
+ "grad_norm": 11.875,
10734
+ "learning_rate": 3.989522466250252e-06,
10735
+ "loss": 0.8541,
10736
+ "mean_token_accuracy": 0.7976760566234589,
10737
+ "num_tokens": 13204212.0,
10738
+ "step": 11920
10739
+ },
10740
+ {
10741
+ "epoch": 2.403788031432601,
10742
+ "grad_norm": 10.625,
10743
+ "learning_rate": 3.976089730673652e-06,
10744
+ "loss": 0.759,
10745
+ "mean_token_accuracy": 0.8070417881011963,
10746
+ "num_tokens": 13214466.0,
10747
+ "step": 11930
10748
+ },
10749
+ {
10750
+ "epoch": 2.405802941769091,
10751
+ "grad_norm": 11.5,
10752
+ "learning_rate": 3.9626569950970515e-06,
10753
+ "loss": 0.7853,
10754
+ "mean_token_accuracy": 0.8054651498794556,
10755
+ "num_tokens": 13226936.0,
10756
+ "step": 11940
10757
+ },
10758
+ {
10759
+ "epoch": 2.4078178521055813,
10760
+ "grad_norm": 11.625,
10761
+ "learning_rate": 3.949224259520452e-06,
10762
+ "loss": 1.0198,
10763
+ "mean_token_accuracy": 0.7604237377643586,
10764
+ "num_tokens": 13239461.0,
10765
+ "step": 11950
10766
+ },
10767
+ {
10768
+ "epoch": 2.4098327624420715,
10769
+ "grad_norm": 10.9375,
10770
+ "learning_rate": 3.935791523943852e-06,
10771
+ "loss": 0.7993,
10772
+ "mean_token_accuracy": 0.8001957833766937,
10773
+ "num_tokens": 13250850.0,
10774
+ "step": 11960
10775
+ },
10776
+ {
10777
+ "epoch": 2.4118476727785616,
10778
+ "grad_norm": 11.5,
10779
+ "learning_rate": 3.922358788367251e-06,
10780
+ "loss": 0.728,
10781
+ "mean_token_accuracy": 0.8160501599311829,
10782
+ "num_tokens": 13261351.0,
10783
+ "step": 11970
10784
+ },
10785
+ {
10786
+ "epoch": 2.4138625831150513,
10787
+ "grad_norm": 11.125,
10788
+ "learning_rate": 3.908926052790651e-06,
10789
+ "loss": 0.7994,
10790
+ "mean_token_accuracy": 0.8003806352615357,
10791
+ "num_tokens": 13272623.0,
10792
+ "step": 11980
10793
+ },
10794
+ {
10795
+ "epoch": 2.4158774934515415,
10796
+ "grad_norm": 10.5625,
10797
+ "learning_rate": 3.895493317214051e-06,
10798
+ "loss": 0.8576,
10799
+ "mean_token_accuracy": 0.7920307397842408,
10800
+ "num_tokens": 13284043.0,
10801
+ "step": 11990
10802
+ },
10803
+ {
10804
+ "epoch": 2.4178924037880316,
10805
+ "grad_norm": 11.5,
10806
+ "learning_rate": 3.8820605816374504e-06,
10807
+ "loss": 0.7156,
10808
+ "mean_token_accuracy": 0.8190572082996368,
10809
+ "num_tokens": 13294166.0,
10810
+ "step": 12000
10811
  }
10812
  ],
10813
  "logging_steps": 10,
 
10827
  "attributes": {}
10828
  }
10829
  },
10830
+ "total_flos": 1.6084473958017024e+16,
10831
  "train_batch_size": 8,
10832
  "trial_name": null,
10833
  "trial_params": null