Plofski commited on
Commit
96dc39d
·
verified ·
1 Parent(s): 6421f08

Training in progress, step 12500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f3e424925fa2b2770536f70d1899af46260c1bbb5c290c98396f2248352c7add
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18518c164df026440f068fac8233b3bff2d8d4502ff38a32a862597f23f6b7c0
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be8d0890e7228cd98f10766bc63bebe515a3fa05be0c7762618a01f87fa2799c
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c73b91ebf8be54d28c1c49c244582f7f70def8a8258d400992d104200bbf23d2
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bdff80ed8983588a862f2109bcc080c93759e076260079b20d08888071ee3452
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:113d12b5af2a861076397bdce257b8a1e5a1daabe8a5aaee5bfcbdb6024fca69
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.4178924037880316,
6
  "eval_steps": 500,
7
- "global_step": 12000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -10808,6 +10808,456 @@
10808
  "mean_token_accuracy": 0.8190572082996368,
10809
  "num_tokens": 13294166.0,
10810
  "step": 12000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10811
  }
10812
  ],
10813
  "logging_steps": 10,
@@ -10827,7 +11277,7 @@
10827
  "attributes": {}
10828
  }
10829
  },
10830
- "total_flos": 1.6084473958017024e+16,
10831
  "train_batch_size": 8,
10832
  "trial_name": null,
10833
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.518637920612533,
6
  "eval_steps": 500,
7
+ "global_step": 12500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
10808
  "mean_token_accuracy": 0.8190572082996368,
10809
  "num_tokens": 13294166.0,
10810
  "step": 12000
10811
+ },
10812
+ {
10813
+ "epoch": 2.4199073141245213,
10814
+ "grad_norm": 10.5625,
10815
+ "learning_rate": 3.868627846060851e-06,
10816
+ "loss": 0.7126,
10817
+ "mean_token_accuracy": 0.8174474656581878,
10818
+ "num_tokens": 13305628.0,
10819
+ "step": 12010
10820
+ },
10821
+ {
10822
+ "epoch": 2.4219222244610115,
10823
+ "grad_norm": 12.75,
10824
+ "learning_rate": 3.855195110484251e-06,
10825
+ "loss": 0.6862,
10826
+ "mean_token_accuracy": 0.8214840114116668,
10827
+ "num_tokens": 13316153.0,
10828
+ "step": 12020
10829
+ },
10830
+ {
10831
+ "epoch": 2.4239371347975016,
10832
+ "grad_norm": 10.125,
10833
+ "learning_rate": 3.84176237490765e-06,
10834
+ "loss": 0.8504,
10835
+ "mean_token_accuracy": 0.7878824770450592,
10836
+ "num_tokens": 13327124.0,
10837
+ "step": 12030
10838
+ },
10839
+ {
10840
+ "epoch": 2.4259520451339913,
10841
+ "grad_norm": 14.0,
10842
+ "learning_rate": 3.82832963933105e-06,
10843
+ "loss": 0.7197,
10844
+ "mean_token_accuracy": 0.8142663776874542,
10845
+ "num_tokens": 13337746.0,
10846
+ "step": 12040
10847
+ },
10848
+ {
10849
+ "epoch": 2.4279669554704815,
10850
+ "grad_norm": 13.125,
10851
+ "learning_rate": 3.81489690375445e-06,
10852
+ "loss": 0.7603,
10853
+ "mean_token_accuracy": 0.8075387954711915,
10854
+ "num_tokens": 13349108.0,
10855
+ "step": 12050
10856
+ },
10857
+ {
10858
+ "epoch": 2.4299818658069716,
10859
+ "grad_norm": 12.125,
10860
+ "learning_rate": 3.80146416817785e-06,
10861
+ "loss": 0.7589,
10862
+ "mean_token_accuracy": 0.8082424461841583,
10863
+ "num_tokens": 13359297.0,
10864
+ "step": 12060
10865
+ },
10866
+ {
10867
+ "epoch": 2.4319967761434618,
10868
+ "grad_norm": 11.5625,
10869
+ "learning_rate": 3.7880314326012495e-06,
10870
+ "loss": 0.8093,
10871
+ "mean_token_accuracy": 0.8029668807983399,
10872
+ "num_tokens": 13370587.0,
10873
+ "step": 12070
10874
+ },
10875
+ {
10876
+ "epoch": 2.4340116864799515,
10877
+ "grad_norm": 12.0,
10878
+ "learning_rate": 3.7745986970246496e-06,
10879
+ "loss": 0.7561,
10880
+ "mean_token_accuracy": 0.8101776361465454,
10881
+ "num_tokens": 13381606.0,
10882
+ "step": 12080
10883
+ },
10884
+ {
10885
+ "epoch": 2.4360265968164416,
10886
+ "grad_norm": 10.0625,
10887
+ "learning_rate": 3.7611659614480493e-06,
10888
+ "loss": 0.8599,
10889
+ "mean_token_accuracy": 0.7864530384540558,
10890
+ "num_tokens": 13394004.0,
10891
+ "step": 12090
10892
+ },
10893
+ {
10894
+ "epoch": 2.4380415071529318,
10895
+ "grad_norm": 11.4375,
10896
+ "learning_rate": 3.747733225871449e-06,
10897
+ "loss": 0.87,
10898
+ "mean_token_accuracy": 0.7880048811435699,
10899
+ "num_tokens": 13405785.0,
10900
+ "step": 12100
10901
+ },
10902
+ {
10903
+ "epoch": 2.440056417489422,
10904
+ "grad_norm": 11.375,
10905
+ "learning_rate": 3.7343004902948486e-06,
10906
+ "loss": 0.8041,
10907
+ "mean_token_accuracy": 0.7979571759700775,
10908
+ "num_tokens": 13418406.0,
10909
+ "step": 12110
10910
+ },
10911
+ {
10912
+ "epoch": 2.4420713278259116,
10913
+ "grad_norm": 13.5,
10914
+ "learning_rate": 3.7208677547182487e-06,
10915
+ "loss": 0.7067,
10916
+ "mean_token_accuracy": 0.8095525324344635,
10917
+ "num_tokens": 13428380.0,
10918
+ "step": 12120
10919
+ },
10920
+ {
10921
+ "epoch": 2.444086238162402,
10922
+ "grad_norm": 10.625,
10923
+ "learning_rate": 3.7074350191416484e-06,
10924
+ "loss": 0.8096,
10925
+ "mean_token_accuracy": 0.79591383934021,
10926
+ "num_tokens": 13439614.0,
10927
+ "step": 12130
10928
+ },
10929
+ {
10930
+ "epoch": 2.446101148498892,
10931
+ "grad_norm": 10.75,
10932
+ "learning_rate": 3.6940022835650485e-06,
10933
+ "loss": 0.8097,
10934
+ "mean_token_accuracy": 0.7982459485530853,
10935
+ "num_tokens": 13450951.0,
10936
+ "step": 12140
10937
+ },
10938
+ {
10939
+ "epoch": 2.4481160588353816,
10940
+ "grad_norm": 10.375,
10941
+ "learning_rate": 3.6805695479884478e-06,
10942
+ "loss": 0.82,
10943
+ "mean_token_accuracy": 0.7986723423004151,
10944
+ "num_tokens": 13461483.0,
10945
+ "step": 12150
10946
+ },
10947
+ {
10948
+ "epoch": 2.450130969171872,
10949
+ "grad_norm": 12.125,
10950
+ "learning_rate": 3.667136812411848e-06,
10951
+ "loss": 0.7788,
10952
+ "mean_token_accuracy": 0.80440074801445,
10953
+ "num_tokens": 13472237.0,
10954
+ "step": 12160
10955
+ },
10956
+ {
10957
+ "epoch": 2.452145879508362,
10958
+ "grad_norm": 9.625,
10959
+ "learning_rate": 3.6537040768352476e-06,
10960
+ "loss": 0.8397,
10961
+ "mean_token_accuracy": 0.7989638984203339,
10962
+ "num_tokens": 13483806.0,
10963
+ "step": 12170
10964
+ },
10965
+ {
10966
+ "epoch": 2.454160789844852,
10967
+ "grad_norm": 11.4375,
10968
+ "learning_rate": 3.6402713412586477e-06,
10969
+ "loss": 0.7816,
10970
+ "mean_token_accuracy": 0.8013097047805786,
10971
+ "num_tokens": 13493924.0,
10972
+ "step": 12180
10973
+ },
10974
+ {
10975
+ "epoch": 2.456175700181342,
10976
+ "grad_norm": 15.8125,
10977
+ "learning_rate": 3.6268386056820478e-06,
10978
+ "loss": 0.7321,
10979
+ "mean_token_accuracy": 0.815697294473648,
10980
+ "num_tokens": 13505064.0,
10981
+ "step": 12190
10982
+ },
10983
+ {
10984
+ "epoch": 2.458190610517832,
10985
+ "grad_norm": 10.3125,
10986
+ "learning_rate": 3.6134058701054475e-06,
10987
+ "loss": 0.766,
10988
+ "mean_token_accuracy": 0.8088764250278473,
10989
+ "num_tokens": 13515435.0,
10990
+ "step": 12200
10991
+ },
10992
+ {
10993
+ "epoch": 2.460205520854322,
10994
+ "grad_norm": 11.6875,
10995
+ "learning_rate": 3.5999731345288467e-06,
10996
+ "loss": 0.8167,
10997
+ "mean_token_accuracy": 0.7981218516826629,
10998
+ "num_tokens": 13526529.0,
10999
+ "step": 12210
11000
+ },
11001
+ {
11002
+ "epoch": 2.4622204311908122,
11003
+ "grad_norm": 11.5625,
11004
+ "learning_rate": 3.586540398952247e-06,
11005
+ "loss": 0.8728,
11006
+ "mean_token_accuracy": 0.7834985911846161,
11007
+ "num_tokens": 13537408.0,
11008
+ "step": 12220
11009
+ },
11010
+ {
11011
+ "epoch": 2.464235341527302,
11012
+ "grad_norm": 9.9375,
11013
+ "learning_rate": 3.573107663375647e-06,
11014
+ "loss": 0.8162,
11015
+ "mean_token_accuracy": 0.7954578995704651,
11016
+ "num_tokens": 13547488.0,
11017
+ "step": 12230
11018
+ },
11019
+ {
11020
+ "epoch": 2.466250251863792,
11021
+ "grad_norm": 12.0,
11022
+ "learning_rate": 3.5596749277990466e-06,
11023
+ "loss": 0.8507,
11024
+ "mean_token_accuracy": 0.7890534639358521,
11025
+ "num_tokens": 13558242.0,
11026
+ "step": 12240
11027
+ },
11028
+ {
11029
+ "epoch": 2.4682651622002822,
11030
+ "grad_norm": 11.1875,
11031
+ "learning_rate": 3.5462421922224467e-06,
11032
+ "loss": 0.7756,
11033
+ "mean_token_accuracy": 0.8034618675708771,
11034
+ "num_tokens": 13568217.0,
11035
+ "step": 12250
11036
+ },
11037
+ {
11038
+ "epoch": 2.470280072536772,
11039
+ "grad_norm": 13.3125,
11040
+ "learning_rate": 3.532809456645846e-06,
11041
+ "loss": 0.8328,
11042
+ "mean_token_accuracy": 0.7975371956825257,
11043
+ "num_tokens": 13579334.0,
11044
+ "step": 12260
11045
+ },
11046
+ {
11047
+ "epoch": 2.472294982873262,
11048
+ "grad_norm": 11.8125,
11049
+ "learning_rate": 3.519376721069246e-06,
11050
+ "loss": 0.7325,
11051
+ "mean_token_accuracy": 0.8158390104770661,
11052
+ "num_tokens": 13589924.0,
11053
+ "step": 12270
11054
+ },
11055
+ {
11056
+ "epoch": 2.4743098932097523,
11057
+ "grad_norm": 9.9375,
11058
+ "learning_rate": 3.5059439854926458e-06,
11059
+ "loss": 0.9189,
11060
+ "mean_token_accuracy": 0.7810778141021728,
11061
+ "num_tokens": 13601915.0,
11062
+ "step": 12280
11063
+ },
11064
+ {
11065
+ "epoch": 2.476324803546242,
11066
+ "grad_norm": 11.1875,
11067
+ "learning_rate": 3.492511249916046e-06,
11068
+ "loss": 0.7933,
11069
+ "mean_token_accuracy": 0.804823362827301,
11070
+ "num_tokens": 13613049.0,
11071
+ "step": 12290
11072
+ },
11073
+ {
11074
+ "epoch": 2.478339713882732,
11075
+ "grad_norm": 11.0625,
11076
+ "learning_rate": 3.4790785143394455e-06,
11077
+ "loss": 0.7509,
11078
+ "mean_token_accuracy": 0.8156402170658111,
11079
+ "num_tokens": 13624399.0,
11080
+ "step": 12300
11081
+ },
11082
+ {
11083
+ "epoch": 2.4803546242192223,
11084
+ "grad_norm": 15.0625,
11085
+ "learning_rate": 3.4656457787628457e-06,
11086
+ "loss": 0.7869,
11087
+ "mean_token_accuracy": 0.8047832548618317,
11088
+ "num_tokens": 13635186.0,
11089
+ "step": 12310
11090
+ },
11091
+ {
11092
+ "epoch": 2.4823695345557124,
11093
+ "grad_norm": 13.5,
11094
+ "learning_rate": 3.452213043186245e-06,
11095
+ "loss": 0.826,
11096
+ "mean_token_accuracy": 0.7985908687114716,
11097
+ "num_tokens": 13644792.0,
11098
+ "step": 12320
11099
+ },
11100
+ {
11101
+ "epoch": 2.484384444892202,
11102
+ "grad_norm": 10.0625,
11103
+ "learning_rate": 3.438780307609645e-06,
11104
+ "loss": 0.8709,
11105
+ "mean_token_accuracy": 0.7914902806282044,
11106
+ "num_tokens": 13656993.0,
11107
+ "step": 12330
11108
+ },
11109
+ {
11110
+ "epoch": 2.4863993552286923,
11111
+ "grad_norm": 10.1875,
11112
+ "learning_rate": 3.4253475720330447e-06,
11113
+ "loss": 0.8268,
11114
+ "mean_token_accuracy": 0.7995809733867645,
11115
+ "num_tokens": 13669719.0,
11116
+ "step": 12340
11117
+ },
11118
+ {
11119
+ "epoch": 2.4884142655651824,
11120
+ "grad_norm": 9.375,
11121
+ "learning_rate": 3.411914836456445e-06,
11122
+ "loss": 0.8012,
11123
+ "mean_token_accuracy": 0.7969933092594147,
11124
+ "num_tokens": 13679980.0,
11125
+ "step": 12350
11126
+ },
11127
+ {
11128
+ "epoch": 2.4904291759016726,
11129
+ "grad_norm": 10.5,
11130
+ "learning_rate": 3.3984821008798445e-06,
11131
+ "loss": 0.8088,
11132
+ "mean_token_accuracy": 0.8042493402957916,
11133
+ "num_tokens": 13691125.0,
11134
+ "step": 12360
11135
+ },
11136
+ {
11137
+ "epoch": 2.4924440862381623,
11138
+ "grad_norm": 11.0,
11139
+ "learning_rate": 3.385049365303244e-06,
11140
+ "loss": 0.8507,
11141
+ "mean_token_accuracy": 0.7906042397022247,
11142
+ "num_tokens": 13701602.0,
11143
+ "step": 12370
11144
+ },
11145
+ {
11146
+ "epoch": 2.4944589965746524,
11147
+ "grad_norm": 12.9375,
11148
+ "learning_rate": 3.371616629726644e-06,
11149
+ "loss": 0.7928,
11150
+ "mean_token_accuracy": 0.8045152962207794,
11151
+ "num_tokens": 13711693.0,
11152
+ "step": 12380
11153
+ },
11154
+ {
11155
+ "epoch": 2.4964739069111426,
11156
+ "grad_norm": 10.125,
11157
+ "learning_rate": 3.358183894150044e-06,
11158
+ "loss": 0.8049,
11159
+ "mean_token_accuracy": 0.7998487055301666,
11160
+ "num_tokens": 13723412.0,
11161
+ "step": 12390
11162
+ },
11163
+ {
11164
+ "epoch": 2.4984888172476323,
11165
+ "grad_norm": 10.625,
11166
+ "learning_rate": 3.3447511585734436e-06,
11167
+ "loss": 0.7884,
11168
+ "mean_token_accuracy": 0.7947759389877319,
11169
+ "num_tokens": 13733833.0,
11170
+ "step": 12400
11171
+ },
11172
+ {
11173
+ "epoch": 2.5005037275841224,
11174
+ "grad_norm": 10.5625,
11175
+ "learning_rate": 3.3313184229968437e-06,
11176
+ "loss": 0.7059,
11177
+ "mean_token_accuracy": 0.814959716796875,
11178
+ "num_tokens": 13744955.0,
11179
+ "step": 12410
11180
+ },
11181
+ {
11182
+ "epoch": 2.5025186379206126,
11183
+ "grad_norm": 12.125,
11184
+ "learning_rate": 3.317885687420244e-06,
11185
+ "loss": 0.8388,
11186
+ "mean_token_accuracy": 0.7952327311038971,
11187
+ "num_tokens": 13755393.0,
11188
+ "step": 12420
11189
+ },
11190
+ {
11191
+ "epoch": 2.5045335482571023,
11192
+ "grad_norm": 10.25,
11193
+ "learning_rate": 3.304452951843643e-06,
11194
+ "loss": 0.8153,
11195
+ "mean_token_accuracy": 0.8019894421100616,
11196
+ "num_tokens": 13766163.0,
11197
+ "step": 12430
11198
+ },
11199
+ {
11200
+ "epoch": 2.5065484585935924,
11201
+ "grad_norm": 10.0,
11202
+ "learning_rate": 3.2910202162670428e-06,
11203
+ "loss": 0.839,
11204
+ "mean_token_accuracy": 0.7980442643165588,
11205
+ "num_tokens": 13778064.0,
11206
+ "step": 12440
11207
+ },
11208
+ {
11209
+ "epoch": 2.5085633689300826,
11210
+ "grad_norm": 13.75,
11211
+ "learning_rate": 3.277587480690443e-06,
11212
+ "loss": 0.7933,
11213
+ "mean_token_accuracy": 0.8014937698841095,
11214
+ "num_tokens": 13788785.0,
11215
+ "step": 12450
11216
+ },
11217
+ {
11218
+ "epoch": 2.5105782792665727,
11219
+ "grad_norm": 13.0625,
11220
+ "learning_rate": 3.2641547451138426e-06,
11221
+ "loss": 0.8186,
11222
+ "mean_token_accuracy": 0.7962758064270019,
11223
+ "num_tokens": 13800349.0,
11224
+ "step": 12460
11225
+ },
11226
+ {
11227
+ "epoch": 2.512593189603063,
11228
+ "grad_norm": 10.5,
11229
+ "learning_rate": 3.2507220095372427e-06,
11230
+ "loss": 0.8279,
11231
+ "mean_token_accuracy": 0.7982799649238587,
11232
+ "num_tokens": 13810275.0,
11233
+ "step": 12470
11234
+ },
11235
+ {
11236
+ "epoch": 2.5146080999395526,
11237
+ "grad_norm": 12.875,
11238
+ "learning_rate": 3.237289273960642e-06,
11239
+ "loss": 0.7193,
11240
+ "mean_token_accuracy": 0.8151337385177613,
11241
+ "num_tokens": 13820122.0,
11242
+ "step": 12480
11243
+ },
11244
+ {
11245
+ "epoch": 2.5166230102760427,
11246
+ "grad_norm": 13.1875,
11247
+ "learning_rate": 3.223856538384042e-06,
11248
+ "loss": 0.8233,
11249
+ "mean_token_accuracy": 0.7997250974178314,
11250
+ "num_tokens": 13830787.0,
11251
+ "step": 12490
11252
+ },
11253
+ {
11254
+ "epoch": 2.518637920612533,
11255
+ "grad_norm": 12.75,
11256
+ "learning_rate": 3.2104238028074417e-06,
11257
+ "loss": 0.7905,
11258
+ "mean_token_accuracy": 0.8074711799621582,
11259
+ "num_tokens": 13840892.0,
11260
+ "step": 12500
11261
  }
11262
  ],
11263
  "logging_steps": 10,
 
11277
  "attributes": {}
11278
  }
11279
  },
11280
+ "total_flos": 1.6741415131650048e+16,
11281
  "train_batch_size": 8,
11282
  "trial_name": null,
11283
  "trial_params": null