mikhail-panzo committed on
Commit
a084b69
1 Parent(s): 6e11fad

Training in progress, step 8000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e8d21651e5db2ed67f31319a652131e535151d2fbf4e6b24b3edeb923bb9d99f
3
  size 577789320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f49478b3558087de3ef1a10558e5e95232e5021f5554b1a3c06b2ca1a4c1837
3
  size 577789320
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d9f742d32e28e8158fb337c29ed9b09546167bb83f74364f94ebdc185bff0685
3
  size 1155772233
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c3d2f41767d06de85b925c048cc0009bb7905dc81e830f65ec94b3a302a2b4a
3
  size 1155772233
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e7a2dc2fd34210e38a0dafa68768e7a8ac00e877b03b70d54a43bd027bc5930
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0adb4fd9f26d958c1229bd5f580059c7a1f34ab63e309cc84449bd74e51f61ac
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9cbb951fc4257a68dab12f51ae2258de85fc85dd8c8f4de0474b3e6fac987a51
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e73199a599db3ee4dbee079b2308a29999c25a67f748ab52b6cc64d7c9b3df0
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 0.39598318934440613,
3
  "best_model_checkpoint": "mikhail_panzo/ceb_b64_le4_s8000/checkpoint-3500",
4
- "epoch": 297.029702970297,
5
  "eval_steps": 500,
6
- "global_step": 7500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1177,6 +1177,84 @@
1177
  "eval_samples_per_second": 26.065,
1178
  "eval_steps_per_second": 3.33,
1179
  "step": 7500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1180
  }
1181
  ],
1182
  "logging_steps": 50,
@@ -1191,12 +1269,12 @@
1191
  "should_evaluate": false,
1192
  "should_log": false,
1193
  "should_save": true,
1194
- "should_training_stop": false
1195
  },
1196
  "attributes": {}
1197
  }
1198
  },
1199
- "total_flos": 8.103931252887646e+16,
1200
  "train_batch_size": 16,
1201
  "trial_name": null,
1202
  "trial_params": null
 
1
  {
2
  "best_metric": 0.39598318934440613,
3
  "best_model_checkpoint": "mikhail_panzo/ceb_b64_le4_s8000/checkpoint-3500",
4
+ "epoch": 316.83168316831683,
5
  "eval_steps": 500,
6
+ "global_step": 8000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1177
  "eval_samples_per_second": 26.065,
1178
  "eval_steps_per_second": 3.33,
1179
  "step": 7500
1180
+ },
1181
+ {
1182
+ "epoch": 299.009900990099,
1183
+ "grad_norm": 0.5674709677696228,
1184
+ "learning_rate": 7.533333333333334e-06,
1185
+ "loss": 0.3317,
1186
+ "step": 7550
1187
+ },
1188
+ {
1189
+ "epoch": 300.990099009901,
1190
+ "grad_norm": 0.5428618788719177,
1191
+ "learning_rate": 6.700000000000001e-06,
1192
+ "loss": 0.3322,
1193
+ "step": 7600
1194
+ },
1195
+ {
1196
+ "epoch": 302.970297029703,
1197
+ "grad_norm": 0.6271554827690125,
1198
+ "learning_rate": 5.866666666666667e-06,
1199
+ "loss": 0.3337,
1200
+ "step": 7650
1201
+ },
1202
+ {
1203
+ "epoch": 304.9504950495049,
1204
+ "grad_norm": 0.41911429166793823,
1205
+ "learning_rate": 5.033333333333334e-06,
1206
+ "loss": 0.329,
1207
+ "step": 7700
1208
+ },
1209
+ {
1210
+ "epoch": 306.9306930693069,
1211
+ "grad_norm": 0.4316006600856781,
1212
+ "learning_rate": 4.2000000000000004e-06,
1213
+ "loss": 0.3338,
1214
+ "step": 7750
1215
+ },
1216
+ {
1217
+ "epoch": 308.91089108910893,
1218
+ "grad_norm": 0.5471222400665283,
1219
+ "learning_rate": 3.3666666666666665e-06,
1220
+ "loss": 0.3316,
1221
+ "step": 7800
1222
+ },
1223
+ {
1224
+ "epoch": 310.8910891089109,
1225
+ "grad_norm": 0.5605342388153076,
1226
+ "learning_rate": 2.5333333333333334e-06,
1227
+ "loss": 0.3289,
1228
+ "step": 7850
1229
+ },
1230
+ {
1231
+ "epoch": 312.8712871287129,
1232
+ "grad_norm": 0.5504734516143799,
1233
+ "learning_rate": 1.7000000000000002e-06,
1234
+ "loss": 0.3303,
1235
+ "step": 7900
1236
+ },
1237
+ {
1238
+ "epoch": 314.8514851485148,
1239
+ "grad_norm": 0.5514795780181885,
1240
+ "learning_rate": 8.666666666666667e-07,
1241
+ "loss": 0.3282,
1242
+ "step": 7950
1243
+ },
1244
+ {
1245
+ "epoch": 316.83168316831683,
1246
+ "grad_norm": 0.5700021982192993,
1247
+ "learning_rate": 3.3333333333333334e-08,
1248
+ "loss": 0.3348,
1249
+ "step": 8000
1250
+ },
1251
+ {
1252
+ "epoch": 316.83168316831683,
1253
+ "eval_loss": 0.4050144553184509,
1254
+ "eval_runtime": 6.8387,
1255
+ "eval_samples_per_second": 26.321,
1256
+ "eval_steps_per_second": 3.363,
1257
+ "step": 8000
1258
  }
1259
  ],
1260
  "logging_steps": 50,
 
1269
  "should_evaluate": false,
1270
  "should_log": false,
1271
  "should_save": true,
1272
+ "should_training_stop": true
1273
  },
1274
  "attributes": {}
1275
  }
1276
  },
1277
+ "total_flos": 8.643923525044128e+16,
1278
  "train_batch_size": 16,
1279
  "trial_name": null,
1280
  "trial_params": null