Plofski commited on
Commit
1302f69
·
verified ·
1 Parent(s): 00fc286

Training in progress, step 14500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:071ff40e66008578cff6a11839a98b3bd55870fb4ecd78b520fd649a835f02e1
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87d564460f84baac9ace9dc44cd612f3da4c9738f97e9806a8457bb9462e95db
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0402536afc76b268263c8a44f7565c5d35ba54094497cf95e3c11e92a054cd5
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ace8d39e9d75867a54c7c346772698f7c6e42165925320fb3b2367daa7c674e
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7200e211c4af21388df4ea9729221c37205d2f4defca496f0d1b43ecbe09b628
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e3f275449dfbc8efc7d2d2f06d134c7b39e55b8e539f36e09b007c731c81c65
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.8208744710860367,
6
  "eval_steps": 500,
7
- "global_step": 14000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -12608,6 +12608,456 @@
12608
  "mean_token_accuracy": 0.7995685517787934,
12609
  "num_tokens": 15509702.0,
12610
  "step": 14000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12611
  }
12612
  ],
12613
  "logging_steps": 10,
@@ -12627,7 +13077,7 @@
12627
  "attributes": {}
12628
  }
12629
  },
12630
- "total_flos": 1.874850530342093e+16,
12631
  "train_batch_size": 8,
12632
  "trial_name": null,
12633
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.921619987910538,
6
  "eval_steps": 500,
7
+ "global_step": 14500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
12608
  "mean_token_accuracy": 0.7995685517787934,
12609
  "num_tokens": 15509702.0,
12610
  "step": 14000
12611
+ },
12612
+ {
12613
+ "epoch": 2.8228893814225264,
12614
+ "grad_norm": 12.5625,
12615
+ "learning_rate": 1.1820807307408155e-06,
12616
+ "loss": 0.7108,
12617
+ "mean_token_accuracy": 0.8189300537109375,
12618
+ "num_tokens": 15519975.0,
12619
+ "step": 14010
12620
+ },
12621
+ {
12622
+ "epoch": 2.8249042917590166,
12623
+ "grad_norm": 12.75,
12624
+ "learning_rate": 1.1686479951642154e-06,
12625
+ "loss": 0.817,
12626
+ "mean_token_accuracy": 0.8028945684432983,
12627
+ "num_tokens": 15530042.0,
12628
+ "step": 14020
12629
+ },
12630
+ {
12631
+ "epoch": 2.8269192020955067,
12632
+ "grad_norm": 11.0,
12633
+ "learning_rate": 1.155215259587615e-06,
12634
+ "loss": 0.8255,
12635
+ "mean_token_accuracy": 0.7976077675819397,
12636
+ "num_tokens": 15540964.0,
12637
+ "step": 14030
12638
+ },
12639
+ {
12640
+ "epoch": 2.828934112431997,
12641
+ "grad_norm": 11.0625,
12642
+ "learning_rate": 1.141782524011015e-06,
12643
+ "loss": 0.8116,
12644
+ "mean_token_accuracy": 0.7950972735881805,
12645
+ "num_tokens": 15551879.0,
12646
+ "step": 14040
12647
+ },
12648
+ {
12649
+ "epoch": 2.830949022768487,
12650
+ "grad_norm": 15.5625,
12651
+ "learning_rate": 1.1283497884344149e-06,
12652
+ "loss": 0.8752,
12653
+ "mean_token_accuracy": 0.7869309186935425,
12654
+ "num_tokens": 15563299.0,
12655
+ "step": 14050
12656
+ },
12657
+ {
12658
+ "epoch": 2.8329639331049767,
12659
+ "grad_norm": 13.1875,
12660
+ "learning_rate": 1.1149170528578145e-06,
12661
+ "loss": 0.842,
12662
+ "mean_token_accuracy": 0.7949115037918091,
12663
+ "num_tokens": 15573972.0,
12664
+ "step": 14060
12665
+ },
12666
+ {
12667
+ "epoch": 2.834978843441467,
12668
+ "grad_norm": 12.5,
12669
+ "learning_rate": 1.1014843172812144e-06,
12670
+ "loss": 0.9261,
12671
+ "mean_token_accuracy": 0.778299605846405,
12672
+ "num_tokens": 15587253.0,
12673
+ "step": 14070
12674
+ },
12675
+ {
12676
+ "epoch": 2.836993753777957,
12677
+ "grad_norm": 11.5625,
12678
+ "learning_rate": 1.0880515817046141e-06,
12679
+ "loss": 0.8781,
12680
+ "mean_token_accuracy": 0.7811039209365844,
12681
+ "num_tokens": 15598147.0,
12682
+ "step": 14080
12683
+ },
12684
+ {
12685
+ "epoch": 2.8390086641144467,
12686
+ "grad_norm": 14.375,
12687
+ "learning_rate": 1.074618846128014e-06,
12688
+ "loss": 0.887,
12689
+ "mean_token_accuracy": 0.7856419622898102,
12690
+ "num_tokens": 15609722.0,
12691
+ "step": 14090
12692
+ },
12693
+ {
12694
+ "epoch": 2.841023574450937,
12695
+ "grad_norm": 14.1875,
12696
+ "learning_rate": 1.061186110551414e-06,
12697
+ "loss": 0.8371,
12698
+ "mean_token_accuracy": 0.7899275839328765,
12699
+ "num_tokens": 15620950.0,
12700
+ "step": 14100
12701
+ },
12702
+ {
12703
+ "epoch": 2.843038484787427,
12704
+ "grad_norm": 13.625,
12705
+ "learning_rate": 1.0477533749748136e-06,
12706
+ "loss": 0.8213,
12707
+ "mean_token_accuracy": 0.8016888916492462,
12708
+ "num_tokens": 15632384.0,
12709
+ "step": 14110
12710
+ },
12711
+ {
12712
+ "epoch": 2.8450533951239168,
12713
+ "grad_norm": 10.1875,
12714
+ "learning_rate": 1.0343206393982135e-06,
12715
+ "loss": 0.8916,
12716
+ "mean_token_accuracy": 0.781292325258255,
12717
+ "num_tokens": 15643502.0,
12718
+ "step": 14120
12719
+ },
12720
+ {
12721
+ "epoch": 2.847068305460407,
12722
+ "grad_norm": 10.75,
12723
+ "learning_rate": 1.0208879038216134e-06,
12724
+ "loss": 0.8747,
12725
+ "mean_token_accuracy": 0.7830813884735107,
12726
+ "num_tokens": 15655219.0,
12727
+ "step": 14130
12728
+ },
12729
+ {
12730
+ "epoch": 2.849083215796897,
12731
+ "grad_norm": 11.875,
12732
+ "learning_rate": 1.007455168245013e-06,
12733
+ "loss": 0.8008,
12734
+ "mean_token_accuracy": 0.8025586724281311,
12735
+ "num_tokens": 15665158.0,
12736
+ "step": 14140
12737
+ },
12738
+ {
12739
+ "epoch": 2.851098126133387,
12740
+ "grad_norm": 10.625,
12741
+ "learning_rate": 9.94022432668413e-07,
12742
+ "loss": 0.8352,
12743
+ "mean_token_accuracy": 0.7977402985095978,
12744
+ "num_tokens": 15677733.0,
12745
+ "step": 14150
12746
+ },
12747
+ {
12748
+ "epoch": 2.8531130364698774,
12749
+ "grad_norm": 10.5625,
12750
+ "learning_rate": 9.805896970918128e-07,
12751
+ "loss": 0.8434,
12752
+ "mean_token_accuracy": 0.7951288640499115,
12753
+ "num_tokens": 15688430.0,
12754
+ "step": 14160
12755
+ },
12756
+ {
12757
+ "epoch": 2.855127946806367,
12758
+ "grad_norm": 11.75,
12759
+ "learning_rate": 9.671569615152127e-07,
12760
+ "loss": 0.8963,
12761
+ "mean_token_accuracy": 0.7789243698120117,
12762
+ "num_tokens": 15700376.0,
12763
+ "step": 14170
12764
+ },
12765
+ {
12766
+ "epoch": 2.857142857142857,
12767
+ "grad_norm": 13.0,
12768
+ "learning_rate": 9.537242259386124e-07,
12769
+ "loss": 0.8915,
12770
+ "mean_token_accuracy": 0.7802317202091217,
12771
+ "num_tokens": 15710761.0,
12772
+ "step": 14180
12773
+ },
12774
+ {
12775
+ "epoch": 2.8591577674793474,
12776
+ "grad_norm": 12.125,
12777
+ "learning_rate": 9.402914903620123e-07,
12778
+ "loss": 0.6912,
12779
+ "mean_token_accuracy": 0.8160697996616364,
12780
+ "num_tokens": 15721644.0,
12781
+ "step": 14190
12782
+ },
12783
+ {
12784
+ "epoch": 2.861172677815837,
12785
+ "grad_norm": 12.0,
12786
+ "learning_rate": 9.268587547854121e-07,
12787
+ "loss": 0.7238,
12788
+ "mean_token_accuracy": 0.8155933260917664,
12789
+ "num_tokens": 15732607.0,
12790
+ "step": 14200
12791
+ },
12792
+ {
12793
+ "epoch": 2.863187588152327,
12794
+ "grad_norm": 9.125,
12795
+ "learning_rate": 9.134260192088119e-07,
12796
+ "loss": 0.8317,
12797
+ "mean_token_accuracy": 0.7980758368968963,
12798
+ "num_tokens": 15745252.0,
12799
+ "step": 14210
12800
+ },
12801
+ {
12802
+ "epoch": 2.8652024984888174,
12803
+ "grad_norm": 11.0625,
12804
+ "learning_rate": 8.999932836322117e-07,
12805
+ "loss": 0.7692,
12806
+ "mean_token_accuracy": 0.812389326095581,
12807
+ "num_tokens": 15756570.0,
12808
+ "step": 14220
12809
+ },
12810
+ {
12811
+ "epoch": 2.867217408825307,
12812
+ "grad_norm": 12.0,
12813
+ "learning_rate": 8.865605480556117e-07,
12814
+ "loss": 0.807,
12815
+ "mean_token_accuracy": 0.8013573944568634,
12816
+ "num_tokens": 15768196.0,
12817
+ "step": 14230
12818
+ },
12819
+ {
12820
+ "epoch": 2.869232319161797,
12821
+ "grad_norm": 10.1875,
12822
+ "learning_rate": 8.731278124790115e-07,
12823
+ "loss": 0.8102,
12824
+ "mean_token_accuracy": 0.7977238118648529,
12825
+ "num_tokens": 15780108.0,
12826
+ "step": 14240
12827
+ },
12828
+ {
12829
+ "epoch": 2.8712472294982874,
12830
+ "grad_norm": 10.75,
12831
+ "learning_rate": 8.596950769024113e-07,
12832
+ "loss": 0.7232,
12833
+ "mean_token_accuracy": 0.8186571359634399,
12834
+ "num_tokens": 15790323.0,
12835
+ "step": 14250
12836
+ },
12837
+ {
12838
+ "epoch": 2.8732621398347775,
12839
+ "grad_norm": 10.75,
12840
+ "learning_rate": 8.46262341325811e-07,
12841
+ "loss": 0.7311,
12842
+ "mean_token_accuracy": 0.8196884751319885,
12843
+ "num_tokens": 15801035.0,
12844
+ "step": 14260
12845
+ },
12846
+ {
12847
+ "epoch": 2.8752770501712672,
12848
+ "grad_norm": 12.5625,
12849
+ "learning_rate": 8.328296057492109e-07,
12850
+ "loss": 0.9671,
12851
+ "mean_token_accuracy": 0.7726804137229919,
12852
+ "num_tokens": 15812082.0,
12853
+ "step": 14270
12854
+ },
12855
+ {
12856
+ "epoch": 2.8772919605077574,
12857
+ "grad_norm": 13.75,
12858
+ "learning_rate": 8.193968701726107e-07,
12859
+ "loss": 0.7606,
12860
+ "mean_token_accuracy": 0.8072145521640778,
12861
+ "num_tokens": 15822853.0,
12862
+ "step": 14280
12863
+ },
12864
+ {
12865
+ "epoch": 2.8793068708442475,
12866
+ "grad_norm": 14.75,
12867
+ "learning_rate": 8.059641345960105e-07,
12868
+ "loss": 0.8093,
12869
+ "mean_token_accuracy": 0.8010785162448884,
12870
+ "num_tokens": 15832947.0,
12871
+ "step": 14290
12872
+ },
12873
+ {
12874
+ "epoch": 2.8813217811807377,
12875
+ "grad_norm": 11.5,
12876
+ "learning_rate": 7.925313990194104e-07,
12877
+ "loss": 0.8572,
12878
+ "mean_token_accuracy": 0.7934750914573669,
12879
+ "num_tokens": 15843708.0,
12880
+ "step": 14300
12881
+ },
12882
+ {
12883
+ "epoch": 2.8833366915172274,
12884
+ "grad_norm": 10.625,
12885
+ "learning_rate": 7.790986634428102e-07,
12886
+ "loss": 0.7406,
12887
+ "mean_token_accuracy": 0.813157856464386,
12888
+ "num_tokens": 15855097.0,
12889
+ "step": 14310
12890
+ },
12891
+ {
12892
+ "epoch": 2.8853516018537175,
12893
+ "grad_norm": 13.875,
12894
+ "learning_rate": 7.6566592786621e-07,
12895
+ "loss": 0.8571,
12896
+ "mean_token_accuracy": 0.7906988859176636,
12897
+ "num_tokens": 15866641.0,
12898
+ "step": 14320
12899
+ },
12900
+ {
12901
+ "epoch": 2.8873665121902077,
12902
+ "grad_norm": 12.0625,
12903
+ "learning_rate": 7.522331922896098e-07,
12904
+ "loss": 0.7257,
12905
+ "mean_token_accuracy": 0.815925520658493,
12906
+ "num_tokens": 15877191.0,
12907
+ "step": 14330
12908
+ },
12909
+ {
12910
+ "epoch": 2.8893814225266974,
12911
+ "grad_norm": 10.6875,
12912
+ "learning_rate": 7.388004567130097e-07,
12913
+ "loss": 0.8654,
12914
+ "mean_token_accuracy": 0.7846165299415588,
12915
+ "num_tokens": 15888129.0,
12916
+ "step": 14340
12917
+ },
12918
+ {
12919
+ "epoch": 2.8913963328631875,
12920
+ "grad_norm": 11.625,
12921
+ "learning_rate": 7.253677211364094e-07,
12922
+ "loss": 0.7777,
12923
+ "mean_token_accuracy": 0.807235324382782,
12924
+ "num_tokens": 15899237.0,
12925
+ "step": 14350
12926
+ },
12927
+ {
12928
+ "epoch": 2.8934112431996777,
12929
+ "grad_norm": 14.625,
12930
+ "learning_rate": 7.119349855598092e-07,
12931
+ "loss": 0.769,
12932
+ "mean_token_accuracy": 0.8052566349506378,
12933
+ "num_tokens": 15910090.0,
12934
+ "step": 14360
12935
+ },
12936
+ {
12937
+ "epoch": 2.8954261535361674,
12938
+ "grad_norm": 9.5625,
12939
+ "learning_rate": 6.985022499832092e-07,
12940
+ "loss": 0.7232,
12941
+ "mean_token_accuracy": 0.821067851781845,
12942
+ "num_tokens": 15920709.0,
12943
+ "step": 14370
12944
+ },
12945
+ {
12946
+ "epoch": 2.8974410638726575,
12947
+ "grad_norm": 11.9375,
12948
+ "learning_rate": 6.85069514406609e-07,
12949
+ "loss": 0.7402,
12950
+ "mean_token_accuracy": 0.8163648307323456,
12951
+ "num_tokens": 15933274.0,
12952
+ "step": 14380
12953
+ },
12954
+ {
12955
+ "epoch": 2.8994559742091477,
12956
+ "grad_norm": 13.75,
12957
+ "learning_rate": 6.716367788300088e-07,
12958
+ "loss": 0.8013,
12959
+ "mean_token_accuracy": 0.8017265141010285,
12960
+ "num_tokens": 15943313.0,
12961
+ "step": 14390
12962
+ },
12963
+ {
12964
+ "epoch": 2.901470884545638,
12965
+ "grad_norm": 13.25,
12966
+ "learning_rate": 6.582040432534086e-07,
12967
+ "loss": 0.8565,
12968
+ "mean_token_accuracy": 0.786570030450821,
12969
+ "num_tokens": 15952883.0,
12970
+ "step": 14400
12971
+ },
12972
+ {
12973
+ "epoch": 2.903485794882128,
12974
+ "grad_norm": 14.5,
12975
+ "learning_rate": 6.447713076768085e-07,
12976
+ "loss": 0.7816,
12977
+ "mean_token_accuracy": 0.8096172749996186,
12978
+ "num_tokens": 15964351.0,
12979
+ "step": 14410
12980
+ },
12981
+ {
12982
+ "epoch": 2.9055007052186177,
12983
+ "grad_norm": 11.8125,
12984
+ "learning_rate": 6.313385721002083e-07,
12985
+ "loss": 0.8196,
12986
+ "mean_token_accuracy": 0.7991693377494812,
12987
+ "num_tokens": 15975245.0,
12988
+ "step": 14420
12989
+ },
12990
+ {
12991
+ "epoch": 2.907515615555108,
12992
+ "grad_norm": 11.875,
12993
+ "learning_rate": 6.179058365236081e-07,
12994
+ "loss": 0.7624,
12995
+ "mean_token_accuracy": 0.8095822989940643,
12996
+ "num_tokens": 15986457.0,
12997
+ "step": 14430
12998
+ },
12999
+ {
13000
+ "epoch": 2.909530525891598,
13001
+ "grad_norm": 11.125,
13002
+ "learning_rate": 6.04473100947008e-07,
13003
+ "loss": 0.7871,
13004
+ "mean_token_accuracy": 0.8019815146923065,
13005
+ "num_tokens": 15997870.0,
13006
+ "step": 14440
13007
+ },
13008
+ {
13009
+ "epoch": 2.9115454362280877,
13010
+ "grad_norm": 12.6875,
13011
+ "learning_rate": 5.910403653704078e-07,
13012
+ "loss": 0.7562,
13013
+ "mean_token_accuracy": 0.8092824459075928,
13014
+ "num_tokens": 16008778.0,
13015
+ "step": 14450
13016
+ },
13017
+ {
13018
+ "epoch": 2.913560346564578,
13019
+ "grad_norm": 10.4375,
13020
+ "learning_rate": 5.776076297938075e-07,
13021
+ "loss": 0.7719,
13022
+ "mean_token_accuracy": 0.8073769569396972,
13023
+ "num_tokens": 16020048.0,
13024
+ "step": 14460
13025
+ },
13026
+ {
13027
+ "epoch": 2.915575256901068,
13028
+ "grad_norm": 11.5,
13029
+ "learning_rate": 5.641748942172074e-07,
13030
+ "loss": 0.8207,
13031
+ "mean_token_accuracy": 0.7947039902210236,
13032
+ "num_tokens": 16032290.0,
13033
+ "step": 14470
13034
+ },
13035
+ {
13036
+ "epoch": 2.9175901672375577,
13037
+ "grad_norm": 13.75,
13038
+ "learning_rate": 5.507421586406072e-07,
13039
+ "loss": 0.7469,
13040
+ "mean_token_accuracy": 0.8104640543460846,
13041
+ "num_tokens": 16043678.0,
13042
+ "step": 14480
13043
+ },
13044
+ {
13045
+ "epoch": 2.919605077574048,
13046
+ "grad_norm": 11.375,
13047
+ "learning_rate": 5.37309423064007e-07,
13048
+ "loss": 0.9164,
13049
+ "mean_token_accuracy": 0.7859593093395233,
13050
+ "num_tokens": 16055184.0,
13051
+ "step": 14490
13052
+ },
13053
+ {
13054
+ "epoch": 2.921619987910538,
13055
+ "grad_norm": 15.125,
13056
+ "learning_rate": 5.238766874874068e-07,
13057
+ "loss": 0.8604,
13058
+ "mean_token_accuracy": 0.7894056618213654,
13059
+ "num_tokens": 16065206.0,
13060
+ "step": 14500
13061
  }
13062
  ],
13063
  "logging_steps": 10,
 
13077
  "attributes": {}
13078
  }
13079
  },
13080
+ "total_flos": 1.9417933454309376e+16,
13081
  "train_batch_size": 8,
13082
  "trial_name": null,
13083
  "trial_params": null