Plofski commited on
Commit
704448f
·
verified ·
1 Parent(s): d75a2fc

Training in progress, step 11000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4cc6d2ac14b136a0c5c39d3842c8290195765d0231c31019222880ab2ada323a
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:653a7bb4c0270ae2dd03d344965c51599b26df08817400d9611fe8bd0497aa7e
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e6c4e658acbdc5e0bc6eda245ab297a40c16a3c1814b13d63c1d7cae82962a95
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49ab3488ed04a08a6119dd62c223dc3bd691b1d8c04575c9d55a422631b4cec4
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f1b6e95985cf829ad61f7f680a73f323339cc556ff96e0fd4cb8e86a2237898
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a92df46ff7ec03358cd9241260e8a718523df24a66e616bac3dad8000c153e0c
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.1156558533145273,
6
  "eval_steps": 500,
7
- "global_step": 10500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9458,6 +9458,456 @@
9458
  "mean_token_accuracy": 0.807522964477539,
9459
  "num_tokens": 11623915.0,
9460
  "step": 10500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9461
  }
9462
  ],
9463
  "logging_steps": 10,
@@ -9477,7 +9927,7 @@
9477
  "attributes": {}
9478
  }
9479
  },
9480
- "total_flos": 1.4062792370479104e+16,
9481
  "train_batch_size": 8,
9482
  "trial_name": null,
9483
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.216401370139029,
6
  "eval_steps": 500,
7
+ "global_step": 11000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9458
  "mean_token_accuracy": 0.807522964477539,
9459
  "num_tokens": 11623915.0,
9460
  "step": 10500
9461
+ },
9462
+ {
9463
+ "epoch": 2.1176707636510175,
9464
+ "grad_norm": 10.1875,
9465
+ "learning_rate": 5.8835381825508775e-06,
9466
+ "loss": 0.8048,
9467
+ "mean_token_accuracy": 0.8046258687973022,
9468
+ "num_tokens": 11634260.0,
9469
+ "step": 10510
9470
+ },
9471
+ {
9472
+ "epoch": 2.1196856739875076,
9473
+ "grad_norm": 8.875,
9474
+ "learning_rate": 5.870105446974277e-06,
9475
+ "loss": 0.8054,
9476
+ "mean_token_accuracy": 0.7993561148643493,
9477
+ "num_tokens": 11644984.0,
9478
+ "step": 10520
9479
+ },
9480
+ {
9481
+ "epoch": 2.1217005843239978,
9482
+ "grad_norm": 11.1875,
9483
+ "learning_rate": 5.856672711397676e-06,
9484
+ "loss": 0.7494,
9485
+ "mean_token_accuracy": 0.8139807939529419,
9486
+ "num_tokens": 11656539.0,
9487
+ "step": 10530
9488
+ },
9489
+ {
9490
+ "epoch": 2.1237154946604875,
9491
+ "grad_norm": 8.5,
9492
+ "learning_rate": 5.8432399758210765e-06,
9493
+ "loss": 0.7947,
9494
+ "mean_token_accuracy": 0.8058351814746857,
9495
+ "num_tokens": 11667616.0,
9496
+ "step": 10540
9497
+ },
9498
+ {
9499
+ "epoch": 2.1257304049969776,
9500
+ "grad_norm": 13.625,
9501
+ "learning_rate": 5.829807240244476e-06,
9502
+ "loss": 0.7885,
9503
+ "mean_token_accuracy": 0.8079341351985931,
9504
+ "num_tokens": 11678470.0,
9505
+ "step": 10550
9506
+ },
9507
+ {
9508
+ "epoch": 2.127745315333468,
9509
+ "grad_norm": 10.25,
9510
+ "learning_rate": 5.816374504667877e-06,
9511
+ "loss": 0.733,
9512
+ "mean_token_accuracy": 0.8124136865139008,
9513
+ "num_tokens": 11690615.0,
9514
+ "step": 10560
9515
+ },
9516
+ {
9517
+ "epoch": 2.1297602256699575,
9518
+ "grad_norm": 12.9375,
9519
+ "learning_rate": 5.802941769091276e-06,
9520
+ "loss": 0.8069,
9521
+ "mean_token_accuracy": 0.7959172546863555,
9522
+ "num_tokens": 11700806.0,
9523
+ "step": 10570
9524
+ },
9525
+ {
9526
+ "epoch": 2.1317751360064476,
9527
+ "grad_norm": 12.125,
9528
+ "learning_rate": 5.789509033514675e-06,
9529
+ "loss": 0.7666,
9530
+ "mean_token_accuracy": 0.8081447660923005,
9531
+ "num_tokens": 11712546.0,
9532
+ "step": 10580
9533
+ },
9534
+ {
9535
+ "epoch": 2.133790046342938,
9536
+ "grad_norm": 14.8125,
9537
+ "learning_rate": 5.776076297938076e-06,
9538
+ "loss": 0.7609,
9539
+ "mean_token_accuracy": 0.8059535026550293,
9540
+ "num_tokens": 11722798.0,
9541
+ "step": 10590
9542
+ },
9543
+ {
9544
+ "epoch": 2.135804956679428,
9545
+ "grad_norm": 12.0,
9546
+ "learning_rate": 5.7626435623614755e-06,
9547
+ "loss": 0.8261,
9548
+ "mean_token_accuracy": 0.7918490886688232,
9549
+ "num_tokens": 11733979.0,
9550
+ "step": 10600
9551
+ },
9552
+ {
9553
+ "epoch": 2.1378198670159176,
9554
+ "grad_norm": 10.6875,
9555
+ "learning_rate": 5.749210826784876e-06,
9556
+ "loss": 0.8653,
9557
+ "mean_token_accuracy": 0.7918577075004578,
9558
+ "num_tokens": 11746313.0,
9559
+ "step": 10610
9560
+ },
9561
+ {
9562
+ "epoch": 2.139834777352408,
9563
+ "grad_norm": 13.8125,
9564
+ "learning_rate": 5.735778091208275e-06,
9565
+ "loss": 0.7597,
9566
+ "mean_token_accuracy": 0.8129185199737549,
9567
+ "num_tokens": 11756847.0,
9568
+ "step": 10620
9569
+ },
9570
+ {
9571
+ "epoch": 2.141849687688898,
9572
+ "grad_norm": 11.3125,
9573
+ "learning_rate": 5.7223453556316745e-06,
9574
+ "loss": 0.8895,
9575
+ "mean_token_accuracy": 0.7831692516803741,
9576
+ "num_tokens": 11768092.0,
9577
+ "step": 10630
9578
+ },
9579
+ {
9580
+ "epoch": 2.143864598025388,
9581
+ "grad_norm": 9.75,
9582
+ "learning_rate": 5.708912620055075e-06,
9583
+ "loss": 0.8293,
9584
+ "mean_token_accuracy": 0.7959823906421661,
9585
+ "num_tokens": 11779092.0,
9586
+ "step": 10640
9587
+ },
9588
+ {
9589
+ "epoch": 2.145879508361878,
9590
+ "grad_norm": 13.625,
9591
+ "learning_rate": 5.695479884478474e-06,
9592
+ "loss": 0.7806,
9593
+ "mean_token_accuracy": 0.8032085597515106,
9594
+ "num_tokens": 11789301.0,
9595
+ "step": 10650
9596
+ },
9597
+ {
9598
+ "epoch": 2.147894418698368,
9599
+ "grad_norm": 10.25,
9600
+ "learning_rate": 5.6820471489018744e-06,
9601
+ "loss": 0.7285,
9602
+ "mean_token_accuracy": 0.8135238766670227,
9603
+ "num_tokens": 11799827.0,
9604
+ "step": 10660
9605
+ },
9606
+ {
9607
+ "epoch": 2.149909329034858,
9608
+ "grad_norm": 11.75,
9609
+ "learning_rate": 5.668614413325274e-06,
9610
+ "loss": 0.8109,
9611
+ "mean_token_accuracy": 0.8031542479991913,
9612
+ "num_tokens": 11810095.0,
9613
+ "step": 10670
9614
+ },
9615
+ {
9616
+ "epoch": 2.151924239371348,
9617
+ "grad_norm": 10.9375,
9618
+ "learning_rate": 5.655181677748674e-06,
9619
+ "loss": 0.8062,
9620
+ "mean_token_accuracy": 0.7998530924320221,
9621
+ "num_tokens": 11821701.0,
9622
+ "step": 10680
9623
+ },
9624
+ {
9625
+ "epoch": 2.153939149707838,
9626
+ "grad_norm": 16.25,
9627
+ "learning_rate": 5.641748942172074e-06,
9628
+ "loss": 0.7909,
9629
+ "mean_token_accuracy": 0.8020996809005737,
9630
+ "num_tokens": 11833622.0,
9631
+ "step": 10690
9632
+ },
9633
+ {
9634
+ "epoch": 2.155954060044328,
9635
+ "grad_norm": 11.6875,
9636
+ "learning_rate": 5.628316206595473e-06,
9637
+ "loss": 0.8753,
9638
+ "mean_token_accuracy": 0.7875764667987823,
9639
+ "num_tokens": 11844025.0,
9640
+ "step": 10700
9641
+ },
9642
+ {
9643
+ "epoch": 2.1579689703808183,
9644
+ "grad_norm": 15.1875,
9645
+ "learning_rate": 5.614883471018874e-06,
9646
+ "loss": 0.8975,
9647
+ "mean_token_accuracy": 0.7894319653511047,
9648
+ "num_tokens": 11855329.0,
9649
+ "step": 10710
9650
+ },
9651
+ {
9652
+ "epoch": 2.159983880717308,
9653
+ "grad_norm": 12.1875,
9654
+ "learning_rate": 5.601450735442273e-06,
9655
+ "loss": 0.847,
9656
+ "mean_token_accuracy": 0.7901701211929322,
9657
+ "num_tokens": 11866697.0,
9658
+ "step": 10720
9659
+ },
9660
+ {
9661
+ "epoch": 2.161998791053798,
9662
+ "grad_norm": 12.3125,
9663
+ "learning_rate": 5.588017999865674e-06,
9664
+ "loss": 0.8007,
9665
+ "mean_token_accuracy": 0.805288553237915,
9666
+ "num_tokens": 11877358.0,
9667
+ "step": 10730
9668
+ },
9669
+ {
9670
+ "epoch": 2.1640137013902883,
9671
+ "grad_norm": 11.375,
9672
+ "learning_rate": 5.574585264289073e-06,
9673
+ "loss": 0.8334,
9674
+ "mean_token_accuracy": 0.8021558821201324,
9675
+ "num_tokens": 11888098.0,
9676
+ "step": 10740
9677
+ },
9678
+ {
9679
+ "epoch": 2.166028611726778,
9680
+ "grad_norm": 10.1875,
9681
+ "learning_rate": 5.561152528712472e-06,
9682
+ "loss": 0.7298,
9683
+ "mean_token_accuracy": 0.8173341572284698,
9684
+ "num_tokens": 11900343.0,
9685
+ "step": 10750
9686
+ },
9687
+ {
9688
+ "epoch": 2.168043522063268,
9689
+ "grad_norm": 11.875,
9690
+ "learning_rate": 5.547719793135873e-06,
9691
+ "loss": 0.7146,
9692
+ "mean_token_accuracy": 0.8224671244621277,
9693
+ "num_tokens": 11911403.0,
9694
+ "step": 10760
9695
+ },
9696
+ {
9697
+ "epoch": 2.1700584323997583,
9698
+ "grad_norm": 12.125,
9699
+ "learning_rate": 5.534287057559273e-06,
9700
+ "loss": 0.8245,
9701
+ "mean_token_accuracy": 0.7936823606491089,
9702
+ "num_tokens": 11922991.0,
9703
+ "step": 10770
9704
+ },
9705
+ {
9706
+ "epoch": 2.1720733427362484,
9707
+ "grad_norm": 10.9375,
9708
+ "learning_rate": 5.520854321982672e-06,
9709
+ "loss": 0.8443,
9710
+ "mean_token_accuracy": 0.788495534658432,
9711
+ "num_tokens": 11934105.0,
9712
+ "step": 10780
9713
+ },
9714
+ {
9715
+ "epoch": 2.174088253072738,
9716
+ "grad_norm": 14.3125,
9717
+ "learning_rate": 5.507421586406072e-06,
9718
+ "loss": 0.8389,
9719
+ "mean_token_accuracy": 0.7919258952140809,
9720
+ "num_tokens": 11944878.0,
9721
+ "step": 10790
9722
+ },
9723
+ {
9724
+ "epoch": 2.1761031634092283,
9725
+ "grad_norm": 10.8125,
9726
+ "learning_rate": 5.493988850829472e-06,
9727
+ "loss": 0.8987,
9728
+ "mean_token_accuracy": 0.7812518179416656,
9729
+ "num_tokens": 11956600.0,
9730
+ "step": 10800
9731
+ },
9732
+ {
9733
+ "epoch": 2.1781180737457184,
9734
+ "grad_norm": 11.625,
9735
+ "learning_rate": 5.480556115252872e-06,
9736
+ "loss": 0.8744,
9737
+ "mean_token_accuracy": 0.7875288486480713,
9738
+ "num_tokens": 11966645.0,
9739
+ "step": 10810
9740
+ },
9741
+ {
9742
+ "epoch": 2.180132984082208,
9743
+ "grad_norm": 11.1875,
9744
+ "learning_rate": 5.467123379676271e-06,
9745
+ "loss": 0.7598,
9746
+ "mean_token_accuracy": 0.8071795523166656,
9747
+ "num_tokens": 11977516.0,
9748
+ "step": 10820
9749
+ },
9750
+ {
9751
+ "epoch": 2.1821478944186983,
9752
+ "grad_norm": 11.125,
9753
+ "learning_rate": 5.4536906440996716e-06,
9754
+ "loss": 0.7946,
9755
+ "mean_token_accuracy": 0.7999853491783142,
9756
+ "num_tokens": 11987823.0,
9757
+ "step": 10830
9758
+ },
9759
+ {
9760
+ "epoch": 2.1841628047551884,
9761
+ "grad_norm": 9.9375,
9762
+ "learning_rate": 5.440257908523071e-06,
9763
+ "loss": 0.7951,
9764
+ "mean_token_accuracy": 0.8064453899860382,
9765
+ "num_tokens": 11999675.0,
9766
+ "step": 10840
9767
+ },
9768
+ {
9769
+ "epoch": 2.1861777150916786,
9770
+ "grad_norm": 10.0,
9771
+ "learning_rate": 5.42682517294647e-06,
9772
+ "loss": 0.8071,
9773
+ "mean_token_accuracy": 0.7993614792823791,
9774
+ "num_tokens": 12010690.0,
9775
+ "step": 10850
9776
+ },
9777
+ {
9778
+ "epoch": 2.1881926254281683,
9779
+ "grad_norm": 11.625,
9780
+ "learning_rate": 5.413392437369871e-06,
9781
+ "loss": 0.8318,
9782
+ "mean_token_accuracy": 0.7873802423477173,
9783
+ "num_tokens": 12021657.0,
9784
+ "step": 10860
9785
+ },
9786
+ {
9787
+ "epoch": 2.1902075357646584,
9788
+ "grad_norm": 10.9375,
9789
+ "learning_rate": 5.39995970179327e-06,
9790
+ "loss": 0.8989,
9791
+ "mean_token_accuracy": 0.7851345241069794,
9792
+ "num_tokens": 12033302.0,
9793
+ "step": 10870
9794
+ },
9795
+ {
9796
+ "epoch": 2.1922224461011486,
9797
+ "grad_norm": 10.9375,
9798
+ "learning_rate": 5.386526966216671e-06,
9799
+ "loss": 0.7589,
9800
+ "mean_token_accuracy": 0.805691534280777,
9801
+ "num_tokens": 12043229.0,
9802
+ "step": 10880
9803
+ },
9804
+ {
9805
+ "epoch": 2.1942373564376387,
9806
+ "grad_norm": 11.9375,
9807
+ "learning_rate": 5.3730942306400705e-06,
9808
+ "loss": 0.8026,
9809
+ "mean_token_accuracy": 0.8073262214660645,
9810
+ "num_tokens": 12052950.0,
9811
+ "step": 10890
9812
+ },
9813
+ {
9814
+ "epoch": 2.1962522667741284,
9815
+ "grad_norm": 11.0,
9816
+ "learning_rate": 5.359661495063471e-06,
9817
+ "loss": 0.8301,
9818
+ "mean_token_accuracy": 0.7973912358283997,
9819
+ "num_tokens": 12063314.0,
9820
+ "step": 10900
9821
+ },
9822
+ {
9823
+ "epoch": 2.1982671771106186,
9824
+ "grad_norm": 11.5625,
9825
+ "learning_rate": 5.34622875948687e-06,
9826
+ "loss": 0.7227,
9827
+ "mean_token_accuracy": 0.8158142805099488,
9828
+ "num_tokens": 12074240.0,
9829
+ "step": 10910
9830
+ },
9831
+ {
9832
+ "epoch": 2.2002820874471087,
9833
+ "grad_norm": 10.9375,
9834
+ "learning_rate": 5.3327960239102695e-06,
9835
+ "loss": 0.7952,
9836
+ "mean_token_accuracy": 0.8007366359233856,
9837
+ "num_tokens": 12086925.0,
9838
+ "step": 10920
9839
+ },
9840
+ {
9841
+ "epoch": 2.2022969977835984,
9842
+ "grad_norm": 12.6875,
9843
+ "learning_rate": 5.31936328833367e-06,
9844
+ "loss": 0.9782,
9845
+ "mean_token_accuracy": 0.7690042972564697,
9846
+ "num_tokens": 12098664.0,
9847
+ "step": 10930
9848
+ },
9849
+ {
9850
+ "epoch": 2.2043119081200886,
9851
+ "grad_norm": 10.375,
9852
+ "learning_rate": 5.305930552757069e-06,
9853
+ "loss": 0.8007,
9854
+ "mean_token_accuracy": 0.8038599193096161,
9855
+ "num_tokens": 12109170.0,
9856
+ "step": 10940
9857
+ },
9858
+ {
9859
+ "epoch": 2.2063268184565787,
9860
+ "grad_norm": 11.5,
9861
+ "learning_rate": 5.2924978171804694e-06,
9862
+ "loss": 0.752,
9863
+ "mean_token_accuracy": 0.8126667857170105,
9864
+ "num_tokens": 12120379.0,
9865
+ "step": 10950
9866
+ },
9867
+ {
9868
+ "epoch": 2.208341728793069,
9869
+ "grad_norm": 10.0625,
9870
+ "learning_rate": 5.279065081603869e-06,
9871
+ "loss": 0.8548,
9872
+ "mean_token_accuracy": 0.793574595451355,
9873
+ "num_tokens": 12132062.0,
9874
+ "step": 10960
9875
+ },
9876
+ {
9877
+ "epoch": 2.2103566391295586,
9878
+ "grad_norm": 12.1875,
9879
+ "learning_rate": 5.265632346027269e-06,
9880
+ "loss": 0.7465,
9881
+ "mean_token_accuracy": 0.8148365259170532,
9882
+ "num_tokens": 12143111.0,
9883
+ "step": 10970
9884
+ },
9885
+ {
9886
+ "epoch": 2.2123715494660487,
9887
+ "grad_norm": 11.3125,
9888
+ "learning_rate": 5.252199610450669e-06,
9889
+ "loss": 0.822,
9890
+ "mean_token_accuracy": 0.7944930195808411,
9891
+ "num_tokens": 12155572.0,
9892
+ "step": 10980
9893
+ },
9894
+ {
9895
+ "epoch": 2.214386459802539,
9896
+ "grad_norm": 10.9375,
9897
+ "learning_rate": 5.238766874874068e-06,
9898
+ "loss": 0.9171,
9899
+ "mean_token_accuracy": 0.7774474084377289,
9900
+ "num_tokens": 12167871.0,
9901
+ "step": 10990
9902
+ },
9903
+ {
9904
+ "epoch": 2.216401370139029,
9905
+ "grad_norm": 12.9375,
9906
+ "learning_rate": 5.225334139297469e-06,
9907
+ "loss": 0.9178,
9908
+ "mean_token_accuracy": 0.7765659749507904,
9909
+ "num_tokens": 12178091.0,
9910
+ "step": 11000
9911
  }
9912
  ],
9913
  "logging_steps": 10,
 
9927
  "attributes": {}
9928
  }
9929
  },
9930
+ "total_flos": 1.4726200960407552e+16,
9931
  "train_batch_size": 8,
9932
  "trial_name": null,
9933
  "trial_params": null