Plofski commited on
Commit
569f341
·
verified ·
1 Parent(s): da87a16

Training in progress, step 14889, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:87d564460f84baac9ace9dc44cd612f3da4c9738f97e9806a8457bb9462e95db
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abfb7980299d9a0833e40cfa75a4e071101b9b5dbcb4b7b8be67cc1f7a5b1358
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5ace8d39e9d75867a54c7c346772698f7c6e42165925320fb3b2367daa7c674e
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d02f233cb73ec902ca0b622f60572ba5696796aa69c3f044f06782367911a3f9
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e3f275449dfbc8efc7d2d2f06d134c7b39e55b8e539f36e09b007c731c81c65
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d3ef4695d48aabb51830d7d806ccbb8d1a7c1dd1163d43a57a82226f9575540
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.921619987910538,
6
  "eval_steps": 500,
7
- "global_step": 14500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -13058,6 +13058,348 @@
13058
  "mean_token_accuracy": 0.7894056618213654,
13059
  "num_tokens": 16065206.0,
13060
  "step": 14500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13061
  }
13062
  ],
13063
  "logging_steps": 10,
@@ -13072,12 +13414,12 @@
13072
  "should_evaluate": false,
13073
  "should_log": false,
13074
  "should_save": true,
13075
- "should_training_stop": false
13076
  },
13077
  "attributes": {}
13078
  }
13079
  },
13080
- "total_flos": 1.9417933454309376e+16,
13081
  "train_batch_size": 8,
13082
  "trial_name": null,
13083
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
  "eval_steps": 500,
7
+ "global_step": 14889,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
13058
  "mean_token_accuracy": 0.7894056618213654,
13059
  "num_tokens": 16065206.0,
13060
  "step": 14500
13061
+ },
13062
+ {
13063
+ "epoch": 2.923634898247028,
13064
+ "grad_norm": 10.6875,
13065
+ "learning_rate": 5.104439519108067e-07,
13066
+ "loss": 0.8542,
13067
+ "mean_token_accuracy": 0.7932229697704315,
13068
+ "num_tokens": 16076636.0,
13069
+ "step": 14510
13070
+ },
13071
+ {
13072
+ "epoch": 2.9256498085835183,
13073
+ "grad_norm": 10.6875,
13074
+ "learning_rate": 4.970112163342065e-07,
13075
+ "loss": 0.8027,
13076
+ "mean_token_accuracy": 0.7981291949748993,
13077
+ "num_tokens": 16088110.0,
13078
+ "step": 14520
13079
+ },
13080
+ {
13081
+ "epoch": 2.927664718920008,
13082
+ "grad_norm": 13.9375,
13083
+ "learning_rate": 4.835784807576064e-07,
13084
+ "loss": 0.8503,
13085
+ "mean_token_accuracy": 0.7925727784633636,
13086
+ "num_tokens": 16100038.0,
13087
+ "step": 14530
13088
+ },
13089
+ {
13090
+ "epoch": 2.929679629256498,
13091
+ "grad_norm": 12.25,
13092
+ "learning_rate": 4.7014574518100616e-07,
13093
+ "loss": 0.908,
13094
+ "mean_token_accuracy": 0.787571269273758,
13095
+ "num_tokens": 16110204.0,
13096
+ "step": 14540
13097
+ },
13098
+ {
13099
+ "epoch": 2.9316945395929883,
13100
+ "grad_norm": 10.4375,
13101
+ "learning_rate": 4.5671300960440595e-07,
13102
+ "loss": 0.8271,
13103
+ "mean_token_accuracy": 0.7935189664363861,
13104
+ "num_tokens": 16122082.0,
13105
+ "step": 14550
13106
+ },
13107
+ {
13108
+ "epoch": 2.933709449929478,
13109
+ "grad_norm": 10.875,
13110
+ "learning_rate": 4.4328027402780584e-07,
13111
+ "loss": 0.7881,
13112
+ "mean_token_accuracy": 0.8038492739200592,
13113
+ "num_tokens": 16133321.0,
13114
+ "step": 14560
13115
+ },
13116
+ {
13117
+ "epoch": 2.935724360265968,
13118
+ "grad_norm": 9.875,
13119
+ "learning_rate": 4.2984753845120563e-07,
13120
+ "loss": 0.8147,
13121
+ "mean_token_accuracy": 0.7948280692100524,
13122
+ "num_tokens": 16145276.0,
13123
+ "step": 14570
13124
+ },
13125
+ {
13126
+ "epoch": 2.9377392706024583,
13127
+ "grad_norm": 13.4375,
13128
+ "learning_rate": 4.1641480287460547e-07,
13129
+ "loss": 0.7914,
13130
+ "mean_token_accuracy": 0.8023073971271515,
13131
+ "num_tokens": 16156515.0,
13132
+ "step": 14580
13133
+ },
13134
+ {
13135
+ "epoch": 2.939754180938948,
13136
+ "grad_norm": 13.5,
13137
+ "learning_rate": 4.0298206729800526e-07,
13138
+ "loss": 0.7906,
13139
+ "mean_token_accuracy": 0.7989113509654999,
13140
+ "num_tokens": 16168451.0,
13141
+ "step": 14590
13142
+ },
13143
+ {
13144
+ "epoch": 2.941769091275438,
13145
+ "grad_norm": 9.4375,
13146
+ "learning_rate": 3.895493317214051e-07,
13147
+ "loss": 0.7716,
13148
+ "mean_token_accuracy": 0.8048623919486999,
13149
+ "num_tokens": 16180616.0,
13150
+ "step": 14600
13151
+ },
13152
+ {
13153
+ "epoch": 2.9437840016119283,
13154
+ "grad_norm": 13.1875,
13155
+ "learning_rate": 3.761165961448049e-07,
13156
+ "loss": 0.7059,
13157
+ "mean_token_accuracy": 0.8211403012275695,
13158
+ "num_tokens": 16190863.0,
13159
+ "step": 14610
13160
+ },
13161
+ {
13162
+ "epoch": 2.945798911948418,
13163
+ "grad_norm": 9.6875,
13164
+ "learning_rate": 3.626838605682047e-07,
13165
+ "loss": 0.7256,
13166
+ "mean_token_accuracy": 0.8117915868759156,
13167
+ "num_tokens": 16201924.0,
13168
+ "step": 14620
13169
+ },
13170
+ {
13171
+ "epoch": 2.947813822284908,
13172
+ "grad_norm": 12.5625,
13173
+ "learning_rate": 3.492511249916046e-07,
13174
+ "loss": 0.8323,
13175
+ "mean_token_accuracy": 0.7886347115039826,
13176
+ "num_tokens": 16212994.0,
13177
+ "step": 14630
13178
+ },
13179
+ {
13180
+ "epoch": 2.9498287326213983,
13181
+ "grad_norm": 12.6875,
13182
+ "learning_rate": 3.358183894150044e-07,
13183
+ "loss": 0.8058,
13184
+ "mean_token_accuracy": 0.7921592950820923,
13185
+ "num_tokens": 16223545.0,
13186
+ "step": 14640
13187
+ },
13188
+ {
13189
+ "epoch": 2.9518436429578885,
13190
+ "grad_norm": 10.625,
13191
+ "learning_rate": 3.2238565383840425e-07,
13192
+ "loss": 0.7639,
13193
+ "mean_token_accuracy": 0.8088575303554535,
13194
+ "num_tokens": 16234810.0,
13195
+ "step": 14650
13196
+ },
13197
+ {
13198
+ "epoch": 2.9538585532943786,
13199
+ "grad_norm": 12.9375,
13200
+ "learning_rate": 3.0895291826180403e-07,
13201
+ "loss": 0.9168,
13202
+ "mean_token_accuracy": 0.7773300051689148,
13203
+ "num_tokens": 16246077.0,
13204
+ "step": 14660
13205
+ },
13206
+ {
13207
+ "epoch": 2.9558734636308683,
13208
+ "grad_norm": 13.875,
13209
+ "learning_rate": 2.955201826852039e-07,
13210
+ "loss": 0.8155,
13211
+ "mean_token_accuracy": 0.7995950043201446,
13212
+ "num_tokens": 16256503.0,
13213
+ "step": 14670
13214
+ },
13215
+ {
13216
+ "epoch": 2.9578883739673585,
13217
+ "grad_norm": 13.6875,
13218
+ "learning_rate": 2.820874471086037e-07,
13219
+ "loss": 0.8045,
13220
+ "mean_token_accuracy": 0.8015713572502137,
13221
+ "num_tokens": 16266819.0,
13222
+ "step": 14680
13223
+ },
13224
+ {
13225
+ "epoch": 2.9599032843038486,
13226
+ "grad_norm": 12.0625,
13227
+ "learning_rate": 2.686547115320035e-07,
13228
+ "loss": 0.9132,
13229
+ "mean_token_accuracy": 0.7834485352039338,
13230
+ "num_tokens": 16278113.0,
13231
+ "step": 14690
13232
+ },
13233
+ {
13234
+ "epoch": 2.9619181946403383,
13235
+ "grad_norm": 13.3125,
13236
+ "learning_rate": 2.5522197595540334e-07,
13237
+ "loss": 0.7371,
13238
+ "mean_token_accuracy": 0.8118620038032531,
13239
+ "num_tokens": 16288705.0,
13240
+ "step": 14700
13241
+ },
13242
+ {
13243
+ "epoch": 2.9639331049768285,
13244
+ "grad_norm": 13.125,
13245
+ "learning_rate": 2.417892403788032e-07,
13246
+ "loss": 0.8454,
13247
+ "mean_token_accuracy": 0.7928309857845306,
13248
+ "num_tokens": 16299215.0,
13249
+ "step": 14710
13250
+ },
13251
+ {
13252
+ "epoch": 2.9659480153133186,
13253
+ "grad_norm": 11.125,
13254
+ "learning_rate": 2.2835650480220297e-07,
13255
+ "loss": 0.7582,
13256
+ "mean_token_accuracy": 0.814406418800354,
13257
+ "num_tokens": 16309978.0,
13258
+ "step": 14720
13259
+ },
13260
+ {
13261
+ "epoch": 2.9679629256498083,
13262
+ "grad_norm": 13.25,
13263
+ "learning_rate": 2.1492376922560281e-07,
13264
+ "loss": 0.7703,
13265
+ "mean_token_accuracy": 0.8121409773826599,
13266
+ "num_tokens": 16320485.0,
13267
+ "step": 14730
13268
+ },
13269
+ {
13270
+ "epoch": 2.9699778359862985,
13271
+ "grad_norm": 11.1875,
13272
+ "learning_rate": 2.0149103364900263e-07,
13273
+ "loss": 0.7339,
13274
+ "mean_token_accuracy": 0.8153697431087494,
13275
+ "num_tokens": 16332678.0,
13276
+ "step": 14740
13277
+ },
13278
+ {
13279
+ "epoch": 2.9719927463227886,
13280
+ "grad_norm": 11.0625,
13281
+ "learning_rate": 1.8805829807240244e-07,
13282
+ "loss": 0.8436,
13283
+ "mean_token_accuracy": 0.7889176428318023,
13284
+ "num_tokens": 16345494.0,
13285
+ "step": 14750
13286
+ },
13287
+ {
13288
+ "epoch": 2.974007656659279,
13289
+ "grad_norm": 10.4375,
13290
+ "learning_rate": 1.746255624958023e-07,
13291
+ "loss": 0.788,
13292
+ "mean_token_accuracy": 0.8068629801273346,
13293
+ "num_tokens": 16356280.0,
13294
+ "step": 14760
13295
+ },
13296
+ {
13297
+ "epoch": 2.976022566995769,
13298
+ "grad_norm": 10.6875,
13299
+ "learning_rate": 1.6119282691920212e-07,
13300
+ "loss": 0.9647,
13301
+ "mean_token_accuracy": 0.7653753876686096,
13302
+ "num_tokens": 16367963.0,
13303
+ "step": 14770
13304
+ },
13305
+ {
13306
+ "epoch": 2.9780374773322587,
13307
+ "grad_norm": 11.3125,
13308
+ "learning_rate": 1.4776009134260194e-07,
13309
+ "loss": 0.8052,
13310
+ "mean_token_accuracy": 0.7993070542812347,
13311
+ "num_tokens": 16378573.0,
13312
+ "step": 14780
13313
+ },
13314
+ {
13315
+ "epoch": 2.980052387668749,
13316
+ "grad_norm": 11.75,
13317
+ "learning_rate": 1.3432735576600175e-07,
13318
+ "loss": 0.7878,
13319
+ "mean_token_accuracy": 0.8004900455474854,
13320
+ "num_tokens": 16389458.0,
13321
+ "step": 14790
13322
+ },
13323
+ {
13324
+ "epoch": 2.982067298005239,
13325
+ "grad_norm": 11.375,
13326
+ "learning_rate": 1.208946201894016e-07,
13327
+ "loss": 0.8526,
13328
+ "mean_token_accuracy": 0.7890514850616455,
13329
+ "num_tokens": 16400816.0,
13330
+ "step": 14800
13331
+ },
13332
+ {
13333
+ "epoch": 2.9840822083417287,
13334
+ "grad_norm": 12.625,
13335
+ "learning_rate": 1.0746188461280141e-07,
13336
+ "loss": 0.7664,
13337
+ "mean_token_accuracy": 0.8109397828578949,
13338
+ "num_tokens": 16410740.0,
13339
+ "step": 14810
13340
+ },
13341
+ {
13342
+ "epoch": 2.986097118678219,
13343
+ "grad_norm": 12.6875,
13344
+ "learning_rate": 9.402914903620122e-08,
13345
+ "loss": 0.8027,
13346
+ "mean_token_accuracy": 0.7973058164119721,
13347
+ "num_tokens": 16422624.0,
13348
+ "step": 14820
13349
+ },
13350
+ {
13351
+ "epoch": 2.988112029014709,
13352
+ "grad_norm": 11.75,
13353
+ "learning_rate": 8.059641345960106e-08,
13354
+ "loss": 0.8327,
13355
+ "mean_token_accuracy": 0.7947525262832642,
13356
+ "num_tokens": 16432503.0,
13357
+ "step": 14830
13358
+ },
13359
+ {
13360
+ "epoch": 2.9901269393511987,
13361
+ "grad_norm": 12.25,
13362
+ "learning_rate": 6.716367788300088e-08,
13363
+ "loss": 0.8677,
13364
+ "mean_token_accuracy": 0.7895227074623108,
13365
+ "num_tokens": 16443714.0,
13366
+ "step": 14840
13367
+ },
13368
+ {
13369
+ "epoch": 2.992141849687689,
13370
+ "grad_norm": 12.625,
13371
+ "learning_rate": 5.3730942306400703e-08,
13372
+ "loss": 0.7835,
13373
+ "mean_token_accuracy": 0.8056479752063751,
13374
+ "num_tokens": 16455282.0,
13375
+ "step": 14850
13376
+ },
13377
+ {
13378
+ "epoch": 2.994156760024179,
13379
+ "grad_norm": 10.5625,
13380
+ "learning_rate": 4.029820672980053e-08,
13381
+ "loss": 0.8101,
13382
+ "mean_token_accuracy": 0.8069123327732086,
13383
+ "num_tokens": 16466521.0,
13384
+ "step": 14860
13385
+ },
13386
+ {
13387
+ "epoch": 2.996171670360669,
13388
+ "grad_norm": 12.5,
13389
+ "learning_rate": 2.6865471153200352e-08,
13390
+ "loss": 0.8407,
13391
+ "mean_token_accuracy": 0.7910451471805573,
13392
+ "num_tokens": 16477990.0,
13393
+ "step": 14870
13394
+ },
13395
+ {
13396
+ "epoch": 2.998186580697159,
13397
+ "grad_norm": 10.9375,
13398
+ "learning_rate": 1.3432735576600176e-08,
13399
+ "loss": 0.8122,
13400
+ "mean_token_accuracy": 0.7982128620147705,
13401
+ "num_tokens": 16488273.0,
13402
+ "step": 14880
13403
  }
13404
  ],
13405
  "logging_steps": 10,
 
13414
  "should_evaluate": false,
13415
  "should_log": false,
13416
  "should_save": true,
13417
+ "should_training_stop": true
13418
  },
13419
  "attributes": {}
13420
  }
13421
  },
13422
+ "total_flos": 1.9946788002011136e+16,
13423
  "train_batch_size": 8,
13424
  "trial_name": null,
13425
  "trial_params": null