Plofski commited on
Commit
4ae0828
·
verified ·
1 Parent(s): 2486be3

Training in progress, step 13500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c4e31724b0cf74835ae0b9aaeff5c05e7e852cb9e158de0e35d8a673c930d429
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5a9bd42305a39ea10e14897e10ee483294601df6c8b6bb20eb9acc7de3a5b74
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:98033794262f4774a192ebe69b4dfddba3edee43a3cce40cedfd5c1785391e67
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fd3300583dc98302b4bc1805b201303b140f489f169bc005adefa8fde0fce38
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3526295826c2a8db767925a5ee2fce15661c2f21ba999bd2bc96732400f36f2d
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ce5bfd25fb939a324385a4adfd5b1d29fedc6793352a13b276f53eccc661d15
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.619383437437034,
6
  "eval_steps": 500,
7
- "global_step": 13000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -11708,6 +11708,456 @@
11708
  "mean_token_accuracy": 0.7685989677906037,
11709
  "num_tokens": 14393395.0,
11710
  "step": 13000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11711
  }
11712
  ],
11713
  "logging_steps": 10,
@@ -11727,7 +12177,7 @@
11727
  "attributes": {}
11728
  }
11729
  },
11730
- "total_flos": 1.7403253820080128e+16,
11731
  "train_batch_size": 8,
11732
  "trial_name": null,
11733
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.7201289542615354,
6
  "eval_steps": 500,
7
+ "global_step": 13500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
11708
  "mean_token_accuracy": 0.7685989677906037,
11709
  "num_tokens": 14393395.0,
11710
  "step": 13000
11711
+ },
11712
+ {
11713
+ "epoch": 2.621398347773524,
11714
+ "grad_norm": 10.5,
11715
+ "learning_rate": 2.525354288400833e-06,
11716
+ "loss": 0.7726,
11717
+ "mean_token_accuracy": 0.8112367451190948,
11718
+ "num_tokens": 14405357.0,
11719
+ "step": 13010
11720
+ },
11721
+ {
11722
+ "epoch": 2.623413258110014,
11723
+ "grad_norm": 10.5,
11724
+ "learning_rate": 2.511921552824233e-06,
11725
+ "loss": 0.8883,
11726
+ "mean_token_accuracy": 0.7785914719104767,
11727
+ "num_tokens": 14416733.0,
11728
+ "step": 13020
11729
+ },
11730
+ {
11731
+ "epoch": 2.625428168446504,
11732
+ "grad_norm": 11.0625,
11733
+ "learning_rate": 2.4984888172476325e-06,
11734
+ "loss": 0.8215,
11735
+ "mean_token_accuracy": 0.7950002431869507,
11736
+ "num_tokens": 14428394.0,
11737
+ "step": 13030
11738
+ },
11739
+ {
11740
+ "epoch": 2.627443078782994,
11741
+ "grad_norm": 11.75,
11742
+ "learning_rate": 2.4850560816710326e-06,
11743
+ "loss": 0.7974,
11744
+ "mean_token_accuracy": 0.8010411977767944,
11745
+ "num_tokens": 14439149.0,
11746
+ "step": 13040
11747
+ },
11748
+ {
11749
+ "epoch": 2.629457989119484,
11750
+ "grad_norm": 11.75,
11751
+ "learning_rate": 2.4716233460944323e-06,
11752
+ "loss": 0.7182,
11753
+ "mean_token_accuracy": 0.8175202190876008,
11754
+ "num_tokens": 14449655.0,
11755
+ "step": 13050
11756
+ },
11757
+ {
11758
+ "epoch": 2.631472899455974,
11759
+ "grad_norm": 10.625,
11760
+ "learning_rate": 2.458190610517832e-06,
11761
+ "loss": 0.7572,
11762
+ "mean_token_accuracy": 0.8045202255249023,
11763
+ "num_tokens": 14459975.0,
11764
+ "step": 13060
11765
+ },
11766
+ {
11767
+ "epoch": 2.6334878097924643,
11768
+ "grad_norm": 13.0,
11769
+ "learning_rate": 2.444757874941232e-06,
11770
+ "loss": 0.7044,
11771
+ "mean_token_accuracy": 0.81912060379982,
11772
+ "num_tokens": 14470362.0,
11773
+ "step": 13070
11774
+ },
11775
+ {
11776
+ "epoch": 2.6355027201289545,
11777
+ "grad_norm": 10.25,
11778
+ "learning_rate": 2.4313251393646317e-06,
11779
+ "loss": 0.8873,
11780
+ "mean_token_accuracy": 0.7796810269355774,
11781
+ "num_tokens": 14482051.0,
11782
+ "step": 13080
11783
+ },
11784
+ {
11785
+ "epoch": 2.637517630465444,
11786
+ "grad_norm": 12.4375,
11787
+ "learning_rate": 2.417892403788032e-06,
11788
+ "loss": 0.8495,
11789
+ "mean_token_accuracy": 0.7927891492843628,
11790
+ "num_tokens": 14493266.0,
11791
+ "step": 13090
11792
+ },
11793
+ {
11794
+ "epoch": 2.6395325408019343,
11795
+ "grad_norm": 9.125,
11796
+ "learning_rate": 2.4044596682114315e-06,
11797
+ "loss": 0.7857,
11798
+ "mean_token_accuracy": 0.7974917531013489,
11799
+ "num_tokens": 14505920.0,
11800
+ "step": 13100
11801
+ },
11802
+ {
11803
+ "epoch": 2.6415474511384245,
11804
+ "grad_norm": 11.0,
11805
+ "learning_rate": 2.3910269326348312e-06,
11806
+ "loss": 0.7904,
11807
+ "mean_token_accuracy": 0.7985158562660217,
11808
+ "num_tokens": 14517589.0,
11809
+ "step": 13110
11810
+ },
11811
+ {
11812
+ "epoch": 2.643562361474914,
11813
+ "grad_norm": 12.4375,
11814
+ "learning_rate": 2.3775941970582313e-06,
11815
+ "loss": 0.7822,
11816
+ "mean_token_accuracy": 0.7983147978782654,
11817
+ "num_tokens": 14528137.0,
11818
+ "step": 13120
11819
+ },
11820
+ {
11821
+ "epoch": 2.6455772718114043,
11822
+ "grad_norm": 11.5,
11823
+ "learning_rate": 2.364161461481631e-06,
11824
+ "loss": 0.7523,
11825
+ "mean_token_accuracy": 0.8122865617275238,
11826
+ "num_tokens": 14539021.0,
11827
+ "step": 13130
11828
+ },
11829
+ {
11830
+ "epoch": 2.6475921821478945,
11831
+ "grad_norm": 11.875,
11832
+ "learning_rate": 2.3507287259050307e-06,
11833
+ "loss": 0.8472,
11834
+ "mean_token_accuracy": 0.7906874716281891,
11835
+ "num_tokens": 14551105.0,
11836
+ "step": 13140
11837
+ },
11838
+ {
11839
+ "epoch": 2.649607092484384,
11840
+ "grad_norm": 11.875,
11841
+ "learning_rate": 2.337295990328431e-06,
11842
+ "loss": 0.8266,
11843
+ "mean_token_accuracy": 0.7899503231048584,
11844
+ "num_tokens": 14561956.0,
11845
+ "step": 13150
11846
+ },
11847
+ {
11848
+ "epoch": 2.6516220028208743,
11849
+ "grad_norm": 13.75,
11850
+ "learning_rate": 2.3238632547518305e-06,
11851
+ "loss": 0.6823,
11852
+ "mean_token_accuracy": 0.8246838212013244,
11853
+ "num_tokens": 14572323.0,
11854
+ "step": 13160
11855
+ },
11856
+ {
11857
+ "epoch": 2.6536369131573645,
11858
+ "grad_norm": 12.75,
11859
+ "learning_rate": 2.31043051917523e-06,
11860
+ "loss": 0.8422,
11861
+ "mean_token_accuracy": 0.7883449614048004,
11862
+ "num_tokens": 14583428.0,
11863
+ "step": 13170
11864
+ },
11865
+ {
11866
+ "epoch": 2.6556518234938546,
11867
+ "grad_norm": 12.125,
11868
+ "learning_rate": 2.2969977835986303e-06,
11869
+ "loss": 0.7448,
11870
+ "mean_token_accuracy": 0.8116320252418519,
11871
+ "num_tokens": 14593189.0,
11872
+ "step": 13180
11873
+ },
11874
+ {
11875
+ "epoch": 2.657666733830345,
11876
+ "grad_norm": 12.875,
11877
+ "learning_rate": 2.28356504802203e-06,
11878
+ "loss": 0.7905,
11879
+ "mean_token_accuracy": 0.8049618184566498,
11880
+ "num_tokens": 14604815.0,
11881
+ "step": 13190
11882
+ },
11883
+ {
11884
+ "epoch": 2.6596816441668345,
11885
+ "grad_norm": 10.625,
11886
+ "learning_rate": 2.2701323124454296e-06,
11887
+ "loss": 0.8403,
11888
+ "mean_token_accuracy": 0.7927229404449463,
11889
+ "num_tokens": 14616104.0,
11890
+ "step": 13200
11891
+ },
11892
+ {
11893
+ "epoch": 2.6616965545033247,
11894
+ "grad_norm": 15.75,
11895
+ "learning_rate": 2.2566995768688297e-06,
11896
+ "loss": 0.7988,
11897
+ "mean_token_accuracy": 0.8040601491928101,
11898
+ "num_tokens": 14626600.0,
11899
+ "step": 13210
11900
+ },
11901
+ {
11902
+ "epoch": 2.663711464839815,
11903
+ "grad_norm": 11.6875,
11904
+ "learning_rate": 2.2432668412922294e-06,
11905
+ "loss": 0.7137,
11906
+ "mean_token_accuracy": 0.8165888667106629,
11907
+ "num_tokens": 14637101.0,
11908
+ "step": 13220
11909
+ },
11910
+ {
11911
+ "epoch": 2.6657263751763045,
11912
+ "grad_norm": 12.5,
11913
+ "learning_rate": 2.229834105715629e-06,
11914
+ "loss": 0.7265,
11915
+ "mean_token_accuracy": 0.8147784769535065,
11916
+ "num_tokens": 14647761.0,
11917
+ "step": 13230
11918
+ },
11919
+ {
11920
+ "epoch": 2.6677412855127947,
11921
+ "grad_norm": 10.4375,
11922
+ "learning_rate": 2.216401370139029e-06,
11923
+ "loss": 0.7454,
11924
+ "mean_token_accuracy": 0.8097371995449066,
11925
+ "num_tokens": 14658218.0,
11926
+ "step": 13240
11927
+ },
11928
+ {
11929
+ "epoch": 2.669756195849285,
11930
+ "grad_norm": 10.375,
11931
+ "learning_rate": 2.202968634562429e-06,
11932
+ "loss": 0.7277,
11933
+ "mean_token_accuracy": 0.8119274914264679,
11934
+ "num_tokens": 14669077.0,
11935
+ "step": 13250
11936
+ },
11937
+ {
11938
+ "epoch": 2.6717711061857745,
11939
+ "grad_norm": 11.0625,
11940
+ "learning_rate": 2.1895358989858286e-06,
11941
+ "loss": 0.8152,
11942
+ "mean_token_accuracy": 0.7921769440174102,
11943
+ "num_tokens": 14680555.0,
11944
+ "step": 13260
11945
+ },
11946
+ {
11947
+ "epoch": 2.6737860165222647,
11948
+ "grad_norm": 15.3125,
11949
+ "learning_rate": 2.1761031634092282e-06,
11950
+ "loss": 0.776,
11951
+ "mean_token_accuracy": 0.8004971742630005,
11952
+ "num_tokens": 14691228.0,
11953
+ "step": 13270
11954
+ },
11955
+ {
11956
+ "epoch": 2.675800926858755,
11957
+ "grad_norm": 10.5625,
11958
+ "learning_rate": 2.1626704278326283e-06,
11959
+ "loss": 0.7987,
11960
+ "mean_token_accuracy": 0.7978484213352204,
11961
+ "num_tokens": 14702555.0,
11962
+ "step": 13280
11963
+ },
11964
+ {
11965
+ "epoch": 2.677815837195245,
11966
+ "grad_norm": 10.6875,
11967
+ "learning_rate": 2.149237692256028e-06,
11968
+ "loss": 0.9027,
11969
+ "mean_token_accuracy": 0.7809522151947021,
11970
+ "num_tokens": 14713437.0,
11971
+ "step": 13290
11972
+ },
11973
+ {
11974
+ "epoch": 2.679830747531735,
11975
+ "grad_norm": 16.125,
11976
+ "learning_rate": 2.1358049566794277e-06,
11977
+ "loss": 0.7883,
11978
+ "mean_token_accuracy": 0.805361670255661,
11979
+ "num_tokens": 14725318.0,
11980
+ "step": 13300
11981
+ },
11982
+ {
11983
+ "epoch": 2.681845657868225,
11984
+ "grad_norm": 12.0625,
11985
+ "learning_rate": 2.122372221102828e-06,
11986
+ "loss": 0.8495,
11987
+ "mean_token_accuracy": 0.7887078404426575,
11988
+ "num_tokens": 14736729.0,
11989
+ "step": 13310
11990
+ },
11991
+ {
11992
+ "epoch": 2.683860568204715,
11993
+ "grad_norm": 10.3125,
11994
+ "learning_rate": 2.1089394855262275e-06,
11995
+ "loss": 0.7301,
11996
+ "mean_token_accuracy": 0.8114965260028839,
11997
+ "num_tokens": 14747781.0,
11998
+ "step": 13320
11999
+ },
12000
+ {
12001
+ "epoch": 2.685875478541205,
12002
+ "grad_norm": 9.8125,
12003
+ "learning_rate": 2.095506749949627e-06,
12004
+ "loss": 0.7964,
12005
+ "mean_token_accuracy": 0.7989574909210205,
12006
+ "num_tokens": 14758609.0,
12007
+ "step": 13330
12008
+ },
12009
+ {
12010
+ "epoch": 2.687890388877695,
12011
+ "grad_norm": 11.1875,
12012
+ "learning_rate": 2.0820740143730273e-06,
12013
+ "loss": 0.9205,
12014
+ "mean_token_accuracy": 0.779743617773056,
12015
+ "num_tokens": 14770464.0,
12016
+ "step": 13340
12017
+ },
12018
+ {
12019
+ "epoch": 2.689905299214185,
12020
+ "grad_norm": 12.0625,
12021
+ "learning_rate": 2.068641278796427e-06,
12022
+ "loss": 0.8432,
12023
+ "mean_token_accuracy": 0.788221025466919,
12024
+ "num_tokens": 14783187.0,
12025
+ "step": 13350
12026
+ },
12027
+ {
12028
+ "epoch": 2.691920209550675,
12029
+ "grad_norm": 11.25,
12030
+ "learning_rate": 2.0552085432198266e-06,
12031
+ "loss": 0.7166,
12032
+ "mean_token_accuracy": 0.8211326837539673,
12033
+ "num_tokens": 14794959.0,
12034
+ "step": 13360
12035
+ },
12036
+ {
12037
+ "epoch": 2.693935119887165,
12038
+ "grad_norm": 12.5625,
12039
+ "learning_rate": 2.0417758076432268e-06,
12040
+ "loss": 0.7359,
12041
+ "mean_token_accuracy": 0.8126482903957367,
12042
+ "num_tokens": 14805848.0,
12043
+ "step": 13370
12044
+ },
12045
+ {
12046
+ "epoch": 2.695950030223655,
12047
+ "grad_norm": 10.5625,
12048
+ "learning_rate": 2.028343072066627e-06,
12049
+ "loss": 0.7513,
12050
+ "mean_token_accuracy": 0.8137486338615417,
12051
+ "num_tokens": 14817111.0,
12052
+ "step": 13380
12053
+ },
12054
+ {
12055
+ "epoch": 2.697964940560145,
12056
+ "grad_norm": 13.8125,
12057
+ "learning_rate": 2.014910336490026e-06,
12058
+ "loss": 0.827,
12059
+ "mean_token_accuracy": 0.7981631934642792,
12060
+ "num_tokens": 14827755.0,
12061
+ "step": 13390
12062
+ },
12063
+ {
12064
+ "epoch": 2.699979850896635,
12065
+ "grad_norm": 10.5,
12066
+ "learning_rate": 2.0014776009134262e-06,
12067
+ "loss": 0.7427,
12068
+ "mean_token_accuracy": 0.8144878685474396,
12069
+ "num_tokens": 14840080.0,
12070
+ "step": 13400
12071
+ },
12072
+ {
12073
+ "epoch": 2.701994761233125,
12074
+ "grad_norm": 11.5625,
12075
+ "learning_rate": 1.988044865336826e-06,
12076
+ "loss": 0.8199,
12077
+ "mean_token_accuracy": 0.7990254759788513,
12078
+ "num_tokens": 14852571.0,
12079
+ "step": 13410
12080
+ },
12081
+ {
12082
+ "epoch": 2.704009671569615,
12083
+ "grad_norm": 10.1875,
12084
+ "learning_rate": 1.974612129760226e-06,
12085
+ "loss": 0.8714,
12086
+ "mean_token_accuracy": 0.788819420337677,
12087
+ "num_tokens": 14865165.0,
12088
+ "step": 13420
12089
+ },
12090
+ {
12091
+ "epoch": 2.7060245819061053,
12092
+ "grad_norm": 10.75,
12093
+ "learning_rate": 1.9611793941836257e-06,
12094
+ "loss": 0.8413,
12095
+ "mean_token_accuracy": 0.7880795717239379,
12096
+ "num_tokens": 14875987.0,
12097
+ "step": 13430
12098
+ },
12099
+ {
12100
+ "epoch": 2.7080394922425954,
12101
+ "grad_norm": 13.4375,
12102
+ "learning_rate": 1.9477466586070254e-06,
12103
+ "loss": 0.8133,
12104
+ "mean_token_accuracy": 0.7958886742591857,
12105
+ "num_tokens": 14887222.0,
12106
+ "step": 13440
12107
+ },
12108
+ {
12109
+ "epoch": 2.710054402579085,
12110
+ "grad_norm": 11.625,
12111
+ "learning_rate": 1.9343139230304255e-06,
12112
+ "loss": 0.8375,
12113
+ "mean_token_accuracy": 0.7916811347007752,
12114
+ "num_tokens": 14897992.0,
12115
+ "step": 13450
12116
+ },
12117
+ {
12118
+ "epoch": 2.7120693129155753,
12119
+ "grad_norm": 11.1875,
12120
+ "learning_rate": 1.920881187453825e-06,
12121
+ "loss": 0.7921,
12122
+ "mean_token_accuracy": 0.8102820634841919,
12123
+ "num_tokens": 14909769.0,
12124
+ "step": 13460
12125
+ },
12126
+ {
12127
+ "epoch": 2.7140842232520654,
12128
+ "grad_norm": 14.4375,
12129
+ "learning_rate": 1.907448451877225e-06,
12130
+ "loss": 0.7959,
12131
+ "mean_token_accuracy": 0.8004967868328094,
12132
+ "num_tokens": 14920552.0,
12133
+ "step": 13470
12134
+ },
12135
+ {
12136
+ "epoch": 2.716099133588555,
12137
+ "grad_norm": 14.5625,
12138
+ "learning_rate": 1.8940157163006247e-06,
12139
+ "loss": 0.8219,
12140
+ "mean_token_accuracy": 0.796020919084549,
12141
+ "num_tokens": 14932205.0,
12142
+ "step": 13480
12143
+ },
12144
+ {
12145
+ "epoch": 2.7181140439250453,
12146
+ "grad_norm": 11.0,
12147
+ "learning_rate": 1.8805829807240246e-06,
12148
+ "loss": 0.7236,
12149
+ "mean_token_accuracy": 0.8162269771099091,
12150
+ "num_tokens": 14944469.0,
12151
+ "step": 13490
12152
+ },
12153
+ {
12154
+ "epoch": 2.7201289542615354,
12155
+ "grad_norm": 13.8125,
12156
+ "learning_rate": 1.8671502451474243e-06,
12157
+ "loss": 0.8344,
12158
+ "mean_token_accuracy": 0.7941052973270416,
12159
+ "num_tokens": 14956201.0,
12160
+ "step": 13500
12161
  }
12162
  ],
12163
  "logging_steps": 10,
 
12177
  "attributes": {}
12178
  }
12179
  },
12180
+ "total_flos": 1.807875931971379e+16,
12181
  "train_batch_size": 8,
12182
  "trial_name": null,
12183
  "trial_params": null