Plofski commited on
Commit
94557ad
·
verified ·
1 Parent(s): 704448f

Training in progress, step 11500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:653a7bb4c0270ae2dd03d344965c51599b26df08817400d9611fe8bd0497aa7e
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c78f458d11eae9e4154eb728cce06719d74e09c423918147e47d15f28937e92f
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49ab3488ed04a08a6119dd62c223dc3bd691b1d8c04575c9d55a422631b4cec4
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:036c2c1f7cdbea44cbf7137c6b1c3cf16b5447a1c3d590934dbf649691bc4729
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a92df46ff7ec03358cd9241260e8a718523df24a66e616bac3dad8000c153e0c
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f9a4928c3c29f8d8ffe6d8c80c93af4c98237f714bf32b55ba4f3d5d67a23da
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.216401370139029,
6
  "eval_steps": 500,
7
- "global_step": 11000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9908,6 +9908,456 @@
9908
  "mean_token_accuracy": 0.7765659749507904,
9909
  "num_tokens": 12178091.0,
9910
  "step": 11000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9911
  }
9912
  ],
9913
  "logging_steps": 10,
@@ -9927,7 +10377,7 @@
9927
  "attributes": {}
9928
  }
9929
  },
9930
- "total_flos": 1.4726200960407552e+16,
9931
  "train_batch_size": 8,
9932
  "trial_name": null,
9933
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.3171468869635303,
6
  "eval_steps": 500,
7
+ "global_step": 11500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9908
  "mean_token_accuracy": 0.7765659749507904,
9909
  "num_tokens": 12178091.0,
9910
  "step": 11000
9911
+ },
9912
+ {
9913
+ "epoch": 2.2184162804755188,
9914
+ "grad_norm": 11.5625,
9915
+ "learning_rate": 5.211901403720868e-06,
9916
+ "loss": 0.8083,
9917
+ "mean_token_accuracy": 0.7975468277931214,
9918
+ "num_tokens": 12188970.0,
9919
+ "step": 11010
9920
+ },
9921
+ {
9922
+ "epoch": 2.220431190812009,
9923
+ "grad_norm": 10.0,
9924
+ "learning_rate": 5.198468668144267e-06,
9925
+ "loss": 0.7398,
9926
+ "mean_token_accuracy": 0.809011173248291,
9927
+ "num_tokens": 12201125.0,
9928
+ "step": 11020
9929
+ },
9930
+ {
9931
+ "epoch": 2.222446101148499,
9932
+ "grad_norm": 8.8125,
9933
+ "learning_rate": 5.185035932567668e-06,
9934
+ "loss": 0.8308,
9935
+ "mean_token_accuracy": 0.7948459804058075,
9936
+ "num_tokens": 12214789.0,
9937
+ "step": 11030
9938
+ },
9939
+ {
9940
+ "epoch": 2.2244610114849888,
9941
+ "grad_norm": 11.625,
9942
+ "learning_rate": 5.171603196991067e-06,
9943
+ "loss": 0.7774,
9944
+ "mean_token_accuracy": 0.8057547986507416,
9945
+ "num_tokens": 12224991.0,
9946
+ "step": 11040
9947
+ },
9948
+ {
9949
+ "epoch": 2.226475921821479,
9950
+ "grad_norm": 9.0,
9951
+ "learning_rate": 5.158170461414468e-06,
9952
+ "loss": 0.8747,
9953
+ "mean_token_accuracy": 0.7873781383037567,
9954
+ "num_tokens": 12235996.0,
9955
+ "step": 11050
9956
+ },
9957
+ {
9958
+ "epoch": 2.228490832157969,
9959
+ "grad_norm": 13.625,
9960
+ "learning_rate": 5.144737725837868e-06,
9961
+ "loss": 0.8058,
9962
+ "mean_token_accuracy": 0.800428307056427,
9963
+ "num_tokens": 12247704.0,
9964
+ "step": 11060
9965
+ },
9966
+ {
9967
+ "epoch": 2.2305057424944588,
9968
+ "grad_norm": 10.375,
9969
+ "learning_rate": 5.1313049902612665e-06,
9970
+ "loss": 0.8622,
9971
+ "mean_token_accuracy": 0.7865659236907959,
9972
+ "num_tokens": 12259569.0,
9973
+ "step": 11070
9974
+ },
9975
+ {
9976
+ "epoch": 2.232520652830949,
9977
+ "grad_norm": 14.0,
9978
+ "learning_rate": 5.117872254684667e-06,
9979
+ "loss": 0.7844,
9980
+ "mean_token_accuracy": 0.8082635223865509,
9981
+ "num_tokens": 12269981.0,
9982
+ "step": 11080
9983
+ },
9984
+ {
9985
+ "epoch": 2.234535563167439,
9986
+ "grad_norm": 12.1875,
9987
+ "learning_rate": 5.104439519108067e-06,
9988
+ "loss": 0.8389,
9989
+ "mean_token_accuracy": 0.7902609288692475,
9990
+ "num_tokens": 12281790.0,
9991
+ "step": 11090
9992
+ },
9993
+ {
9994
+ "epoch": 2.236550473503929,
9995
+ "grad_norm": 13.0625,
9996
+ "learning_rate": 5.091006783531467e-06,
9997
+ "loss": 0.7349,
9998
+ "mean_token_accuracy": 0.8109230279922486,
9999
+ "num_tokens": 12292218.0,
10000
+ "step": 11100
10001
+ },
10002
+ {
10003
+ "epoch": 2.238565383840419,
10004
+ "grad_norm": 11.8125,
10005
+ "learning_rate": 5.077574047954866e-06,
10006
+ "loss": 0.8892,
10007
+ "mean_token_accuracy": 0.7831051290035248,
10008
+ "num_tokens": 12303549.0,
10009
+ "step": 11110
10010
+ },
10011
+ {
10012
+ "epoch": 2.240580294176909,
10013
+ "grad_norm": 16.125,
10014
+ "learning_rate": 5.0641413123782666e-06,
10015
+ "loss": 0.8981,
10016
+ "mean_token_accuracy": 0.779134213924408,
10017
+ "num_tokens": 12314039.0,
10018
+ "step": 11120
10019
+ },
10020
+ {
10021
+ "epoch": 2.242595204513399,
10022
+ "grad_norm": 15.1875,
10023
+ "learning_rate": 5.050708576801666e-06,
10024
+ "loss": 0.8596,
10025
+ "mean_token_accuracy": 0.795196259021759,
10026
+ "num_tokens": 12323912.0,
10027
+ "step": 11130
10028
+ },
10029
+ {
10030
+ "epoch": 2.2446101148498894,
10031
+ "grad_norm": 12.75,
10032
+ "learning_rate": 5.037275841225066e-06,
10033
+ "loss": 0.8716,
10034
+ "mean_token_accuracy": 0.7830780863761901,
10035
+ "num_tokens": 12335963.0,
10036
+ "step": 11140
10037
+ },
10038
+ {
10039
+ "epoch": 2.246625025186379,
10040
+ "grad_norm": 9.6875,
10041
+ "learning_rate": 5.023843105648466e-06,
10042
+ "loss": 0.7997,
10043
+ "mean_token_accuracy": 0.7968696773052215,
10044
+ "num_tokens": 12347034.0,
10045
+ "step": 11150
10046
+ },
10047
+ {
10048
+ "epoch": 2.2486399355228692,
10049
+ "grad_norm": 12.6875,
10050
+ "learning_rate": 5.010410370071865e-06,
10051
+ "loss": 0.8812,
10052
+ "mean_token_accuracy": 0.7810611367225647,
10053
+ "num_tokens": 12359524.0,
10054
+ "step": 11160
10055
+ },
10056
+ {
10057
+ "epoch": 2.2506548458593594,
10058
+ "grad_norm": 11.3125,
10059
+ "learning_rate": 4.996977634495265e-06,
10060
+ "loss": 0.8117,
10061
+ "mean_token_accuracy": 0.8003436684608459,
10062
+ "num_tokens": 12369580.0,
10063
+ "step": 11170
10064
+ },
10065
+ {
10066
+ "epoch": 2.252669756195849,
10067
+ "grad_norm": 13.3125,
10068
+ "learning_rate": 4.9835448989186655e-06,
10069
+ "loss": 0.8,
10070
+ "mean_token_accuracy": 0.7997420608997345,
10071
+ "num_tokens": 12380449.0,
10072
+ "step": 11180
10073
+ },
10074
+ {
10075
+ "epoch": 2.2546846665323392,
10076
+ "grad_norm": 11.5625,
10077
+ "learning_rate": 4.970112163342065e-06,
10078
+ "loss": 0.7495,
10079
+ "mean_token_accuracy": 0.812464052438736,
10080
+ "num_tokens": 12391160.0,
10081
+ "step": 11190
10082
+ },
10083
+ {
10084
+ "epoch": 2.2566995768688294,
10085
+ "grad_norm": 10.75,
10086
+ "learning_rate": 4.956679427765465e-06,
10087
+ "loss": 0.8713,
10088
+ "mean_token_accuracy": 0.7861813962459564,
10089
+ "num_tokens": 12403496.0,
10090
+ "step": 11200
10091
+ },
10092
+ {
10093
+ "epoch": 2.2587144872053195,
10094
+ "grad_norm": 12.4375,
10095
+ "learning_rate": 4.9432466921888646e-06,
10096
+ "loss": 0.7124,
10097
+ "mean_token_accuracy": 0.8236021995544434,
10098
+ "num_tokens": 12414075.0,
10099
+ "step": 11210
10100
+ },
10101
+ {
10102
+ "epoch": 2.2607293975418092,
10103
+ "grad_norm": 12.1875,
10104
+ "learning_rate": 4.929813956612264e-06,
10105
+ "loss": 0.82,
10106
+ "mean_token_accuracy": 0.7931098341941833,
10107
+ "num_tokens": 12424499.0,
10108
+ "step": 11220
10109
+ },
10110
+ {
10111
+ "epoch": 2.2627443078782994,
10112
+ "grad_norm": 12.25,
10113
+ "learning_rate": 4.916381221035664e-06,
10114
+ "loss": 0.7704,
10115
+ "mean_token_accuracy": 0.8014878571033478,
10116
+ "num_tokens": 12435957.0,
10117
+ "step": 11230
10118
+ },
10119
+ {
10120
+ "epoch": 2.2647592182147895,
10121
+ "grad_norm": 12.3125,
10122
+ "learning_rate": 4.9029484854590644e-06,
10123
+ "loss": 0.8023,
10124
+ "mean_token_accuracy": 0.798302048444748,
10125
+ "num_tokens": 12447051.0,
10126
+ "step": 11240
10127
+ },
10128
+ {
10129
+ "epoch": 2.2667741285512797,
10130
+ "grad_norm": 11.0,
10131
+ "learning_rate": 4.889515749882464e-06,
10132
+ "loss": 0.8716,
10133
+ "mean_token_accuracy": 0.7819954872131347,
10134
+ "num_tokens": 12458100.0,
10135
+ "step": 11250
10136
+ },
10137
+ {
10138
+ "epoch": 2.2687890388877694,
10139
+ "grad_norm": 11.0,
10140
+ "learning_rate": 4.876083014305864e-06,
10141
+ "loss": 0.767,
10142
+ "mean_token_accuracy": 0.8059248864650727,
10143
+ "num_tokens": 12469697.0,
10144
+ "step": 11260
10145
+ },
10146
+ {
10147
+ "epoch": 2.2708039492242595,
10148
+ "grad_norm": 10.625,
10149
+ "learning_rate": 4.8626502787292635e-06,
10150
+ "loss": 0.7644,
10151
+ "mean_token_accuracy": 0.8037352323532104,
10152
+ "num_tokens": 12482462.0,
10153
+ "step": 11270
10154
+ },
10155
+ {
10156
+ "epoch": 2.2728188595607497,
10157
+ "grad_norm": 13.9375,
10158
+ "learning_rate": 4.849217543152663e-06,
10159
+ "loss": 0.8345,
10160
+ "mean_token_accuracy": 0.7897806167602539,
10161
+ "num_tokens": 12494660.0,
10162
+ "step": 11280
10163
+ },
10164
+ {
10165
+ "epoch": 2.2748337698972394,
10166
+ "grad_norm": 10.875,
10167
+ "learning_rate": 4.835784807576064e-06,
10168
+ "loss": 0.7907,
10169
+ "mean_token_accuracy": 0.8008930742740631,
10170
+ "num_tokens": 12505534.0,
10171
+ "step": 11290
10172
+ },
10173
+ {
10174
+ "epoch": 2.2768486802337295,
10175
+ "grad_norm": 12.3125,
10176
+ "learning_rate": 4.822352071999463e-06,
10177
+ "loss": 0.8689,
10178
+ "mean_token_accuracy": 0.789547073841095,
10179
+ "num_tokens": 12516343.0,
10180
+ "step": 11300
10181
+ },
10182
+ {
10183
+ "epoch": 2.2788635905702197,
10184
+ "grad_norm": 12.5625,
10185
+ "learning_rate": 4.808919336422863e-06,
10186
+ "loss": 0.8781,
10187
+ "mean_token_accuracy": 0.7866406381130219,
10188
+ "num_tokens": 12527325.0,
10189
+ "step": 11310
10190
+ },
10191
+ {
10192
+ "epoch": 2.2808785009067094,
10193
+ "grad_norm": 12.5,
10194
+ "learning_rate": 4.795486600846263e-06,
10195
+ "loss": 0.7903,
10196
+ "mean_token_accuracy": 0.8062444806098938,
10197
+ "num_tokens": 12538802.0,
10198
+ "step": 11320
10199
+ },
10200
+ {
10201
+ "epoch": 2.2828934112431996,
10202
+ "grad_norm": 13.3125,
10203
+ "learning_rate": 4.7820538652696624e-06,
10204
+ "loss": 0.7503,
10205
+ "mean_token_accuracy": 0.8119116723537445,
10206
+ "num_tokens": 12549546.0,
10207
+ "step": 11330
10208
+ },
10209
+ {
10210
+ "epoch": 2.2849083215796897,
10211
+ "grad_norm": 14.125,
10212
+ "learning_rate": 4.768621129693062e-06,
10213
+ "loss": 0.8099,
10214
+ "mean_token_accuracy": 0.8033313393592835,
10215
+ "num_tokens": 12560090.0,
10216
+ "step": 11340
10217
+ },
10218
+ {
10219
+ "epoch": 2.28692323191618,
10220
+ "grad_norm": 13.0625,
10221
+ "learning_rate": 4.755188394116463e-06,
10222
+ "loss": 0.9013,
10223
+ "mean_token_accuracy": 0.7799701750278473,
10224
+ "num_tokens": 12571882.0,
10225
+ "step": 11350
10226
+ },
10227
+ {
10228
+ "epoch": 2.28893814225267,
10229
+ "grad_norm": 11.5,
10230
+ "learning_rate": 4.741755658539862e-06,
10231
+ "loss": 0.8159,
10232
+ "mean_token_accuracy": 0.7954154729843139,
10233
+ "num_tokens": 12583570.0,
10234
+ "step": 11360
10235
+ },
10236
+ {
10237
+ "epoch": 2.2909530525891597,
10238
+ "grad_norm": 13.8125,
10239
+ "learning_rate": 4.728322922963262e-06,
10240
+ "loss": 0.7928,
10241
+ "mean_token_accuracy": 0.800947493314743,
10242
+ "num_tokens": 12594636.0,
10243
+ "step": 11370
10244
+ },
10245
+ {
10246
+ "epoch": 2.29296796292565,
10247
+ "grad_norm": 13.4375,
10248
+ "learning_rate": 4.714890187386662e-06,
10249
+ "loss": 0.7321,
10250
+ "mean_token_accuracy": 0.813873153924942,
10251
+ "num_tokens": 12605481.0,
10252
+ "step": 11380
10253
+ },
10254
+ {
10255
+ "epoch": 2.29498287326214,
10256
+ "grad_norm": 8.125,
10257
+ "learning_rate": 4.701457451810061e-06,
10258
+ "loss": 0.7956,
10259
+ "mean_token_accuracy": 0.8005879402160645,
10260
+ "num_tokens": 12616477.0,
10261
+ "step": 11390
10262
+ },
10263
+ {
10264
+ "epoch": 2.2969977835986297,
10265
+ "grad_norm": 12.5,
10266
+ "learning_rate": 4.688024716233461e-06,
10267
+ "loss": 0.8169,
10268
+ "mean_token_accuracy": 0.7956750094890594,
10269
+ "num_tokens": 12627070.0,
10270
+ "step": 11400
10271
+ },
10272
+ {
10273
+ "epoch": 2.29901269393512,
10274
+ "grad_norm": 12.9375,
10275
+ "learning_rate": 4.674591980656862e-06,
10276
+ "loss": 0.8507,
10277
+ "mean_token_accuracy": 0.7936709105968476,
10278
+ "num_tokens": 12639024.0,
10279
+ "step": 11410
10280
+ },
10281
+ {
10282
+ "epoch": 2.30102760427161,
10283
+ "grad_norm": 11.0625,
10284
+ "learning_rate": 4.661159245080261e-06,
10285
+ "loss": 0.7626,
10286
+ "mean_token_accuracy": 0.8066515803337098,
10287
+ "num_tokens": 12649628.0,
10288
+ "step": 11420
10289
+ },
10290
+ {
10291
+ "epoch": 2.3030425146080997,
10292
+ "grad_norm": 13.5625,
10293
+ "learning_rate": 4.647726509503661e-06,
10294
+ "loss": 0.829,
10295
+ "mean_token_accuracy": 0.7963403999805451,
10296
+ "num_tokens": 12660338.0,
10297
+ "step": 11430
10298
+ },
10299
+ {
10300
+ "epoch": 2.30505742494459,
10301
+ "grad_norm": 9.9375,
10302
+ "learning_rate": 4.634293773927061e-06,
10303
+ "loss": 0.8586,
10304
+ "mean_token_accuracy": 0.7866991460323334,
10305
+ "num_tokens": 12672143.0,
10306
+ "step": 11440
10307
+ },
10308
+ {
10309
+ "epoch": 2.30707233528108,
10310
+ "grad_norm": 9.9375,
10311
+ "learning_rate": 4.62086103835046e-06,
10312
+ "loss": 0.7363,
10313
+ "mean_token_accuracy": 0.8156927347183227,
10314
+ "num_tokens": 12681970.0,
10315
+ "step": 11450
10316
+ },
10317
+ {
10318
+ "epoch": 2.30908724561757,
10319
+ "grad_norm": 11.5,
10320
+ "learning_rate": 4.60742830277386e-06,
10321
+ "loss": 0.7685,
10322
+ "mean_token_accuracy": 0.8116656005382538,
10323
+ "num_tokens": 12692199.0,
10324
+ "step": 11460
10325
+ },
10326
+ {
10327
+ "epoch": 2.31110215595406,
10328
+ "grad_norm": 11.3125,
10329
+ "learning_rate": 4.5939955671972605e-06,
10330
+ "loss": 0.7485,
10331
+ "mean_token_accuracy": 0.8076441287994385,
10332
+ "num_tokens": 12702476.0,
10333
+ "step": 11470
10334
+ },
10335
+ {
10336
+ "epoch": 2.31311706629055,
10337
+ "grad_norm": 12.5625,
10338
+ "learning_rate": 4.58056283162066e-06,
10339
+ "loss": 0.7926,
10340
+ "mean_token_accuracy": 0.8072655260562897,
10341
+ "num_tokens": 12712782.0,
10342
+ "step": 11480
10343
+ },
10344
+ {
10345
+ "epoch": 2.31513197662704,
10346
+ "grad_norm": 15.9375,
10347
+ "learning_rate": 4.56713009604406e-06,
10348
+ "loss": 0.8342,
10349
+ "mean_token_accuracy": 0.7965205907821655,
10350
+ "num_tokens": 12723427.0,
10351
+ "step": 11490
10352
+ },
10353
+ {
10354
+ "epoch": 2.3171468869635303,
10355
+ "grad_norm": 11.5625,
10356
+ "learning_rate": 4.5536973604674596e-06,
10357
+ "loss": 0.7846,
10358
+ "mean_token_accuracy": 0.8073502600193023,
10359
+ "num_tokens": 12733862.0,
10360
+ "step": 11500
10361
  }
10362
  ],
10363
  "logging_steps": 10,
 
10377
  "attributes": {}
10378
  }
10379
  },
10380
+ "total_flos": 1.5401013006618624e+16,
10381
  "train_batch_size": 8,
10382
  "trial_name": null,
10383
  "trial_params": null