Training in progress, step 137000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0b38436cae5381f691ba804b915e325932d55429d83532b1470e95efd579a29b
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0b3d9c01ac2fd401fd65707f0e1d6a24eefcca9fe471c863196aa9b97efe6f47
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c7354a4e3d8de85b55d51bbeb0dfcfc86efd5d09ac4e401efe6b4ee83bc0b66a
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1bf416de216a0fa7180c9c5b3632984e63b58047aa8bc6d944e50f798fb000d5
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -24216,11 +24216,189 @@
|
|
| 24216 |
"eval_steps_per_second": 15.073,
|
| 24217 |
"num_input_tokens_seen": 71291638272,
|
| 24218 |
"step": 136000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24219 |
}
|
| 24220 |
],
|
| 24221 |
"logging_steps": 50,
|
| 24222 |
"max_steps": 140000,
|
| 24223 |
-
"num_input_tokens_seen":
|
| 24224 |
"num_train_epochs": 2,
|
| 24225 |
"save_steps": 1000,
|
| 24226 |
"stateful_callbacks": {
|
|
@@ -24235,7 +24413,7 @@
|
|
| 24235 |
"attributes": {}
|
| 24236 |
}
|
| 24237 |
},
|
| 24238 |
-
"total_flos": 1.
|
| 24239 |
"train_batch_size": 32,
|
| 24240 |
"trial_name": null,
|
| 24241 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.3069892793684486,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 137000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 24216 |
"eval_steps_per_second": 15.073,
|
| 24217 |
"num_input_tokens_seen": 71291638272,
|
| 24218 |
"step": 136000
|
| 24219 |
+
},
|
| 24220 |
+
{
|
| 24221 |
+
"epoch": 1.2979262315608715,
|
| 24222 |
+
"grad_norm": 0.1190498098731041,
|
| 24223 |
+
"learning_rate": 4.8305620184135315e-05,
|
| 24224 |
+
"loss": 2.0321,
|
| 24225 |
+
"num_input_tokens_seen": 71317844512,
|
| 24226 |
+
"step": 136050
|
| 24227 |
+
},
|
| 24228 |
+
{
|
| 24229 |
+
"epoch": 1.2984032340770597,
|
| 24230 |
+
"grad_norm": 0.11770997196435928,
|
| 24231 |
+
"learning_rate": 4.7109889986402973e-05,
|
| 24232 |
+
"loss": 2.0341,
|
| 24233 |
+
"num_input_tokens_seen": 71344050560,
|
| 24234 |
+
"step": 136100
|
| 24235 |
+
},
|
| 24236 |
+
{
|
| 24237 |
+
"epoch": 1.2988802365932481,
|
| 24238 |
+
"grad_norm": 0.11683844774961472,
|
| 24239 |
+
"learning_rate": 4.592841308745932e-05,
|
| 24240 |
+
"loss": 2.0243,
|
| 24241 |
+
"num_input_tokens_seen": 71370258656,
|
| 24242 |
+
"step": 136150
|
| 24243 |
+
},
|
| 24244 |
+
{
|
| 24245 |
+
"epoch": 1.2993572391094363,
|
| 24246 |
+
"grad_norm": 0.12114414572715759,
|
| 24247 |
+
"learning_rate": 4.476122667059207e-05,
|
| 24248 |
+
"loss": 2.0379,
|
| 24249 |
+
"num_input_tokens_seen": 71396470656,
|
| 24250 |
+
"step": 136200
|
| 24251 |
+
},
|
| 24252 |
+
{
|
| 24253 |
+
"epoch": 1.2998342416256246,
|
| 24254 |
+
"grad_norm": 0.11975762993097305,
|
| 24255 |
+
"learning_rate": 4.3608367469340547e-05,
|
| 24256 |
+
"loss": 2.0359,
|
| 24257 |
+
"num_input_tokens_seen": 71422685056,
|
| 24258 |
+
"step": 136250
|
| 24259 |
+
},
|
| 24260 |
+
{
|
| 24261 |
+
"epoch": 1.3003112441418128,
|
| 24262 |
+
"grad_norm": 0.11278797686100006,
|
| 24263 |
+
"learning_rate": 4.2469871766340095e-05,
|
| 24264 |
+
"loss": 2.0219,
|
| 24265 |
+
"num_input_tokens_seen": 71448892928,
|
| 24266 |
+
"step": 136300
|
| 24267 |
+
},
|
| 24268 |
+
{
|
| 24269 |
+
"epoch": 1.3007882466580012,
|
| 24270 |
+
"grad_norm": 0.11854268610477448,
|
| 24271 |
+
"learning_rate": 4.1345775392179654e-05,
|
| 24272 |
+
"loss": 2.0404,
|
| 24273 |
+
"num_input_tokens_seen": 71475094528,
|
| 24274 |
+
"step": 136350
|
| 24275 |
+
},
|
| 24276 |
+
{
|
| 24277 |
+
"epoch": 1.3012652491741894,
|
| 24278 |
+
"grad_norm": 0.11631016433238983,
|
| 24279 |
+
"learning_rate": 4.0236113724274713e-05,
|
| 24280 |
+
"loss": 2.0301,
|
| 24281 |
+
"num_input_tokens_seen": 71501303968,
|
| 24282 |
+
"step": 136400
|
| 24283 |
+
},
|
| 24284 |
+
{
|
| 24285 |
+
"epoch": 1.3017422516903776,
|
| 24286 |
+
"grad_norm": 0.11170602589845657,
|
| 24287 |
+
"learning_rate": 3.9140921685753064e-05,
|
| 24288 |
+
"loss": 2.0431,
|
| 24289 |
+
"num_input_tokens_seen": 71527518368,
|
| 24290 |
+
"step": 136450
|
| 24291 |
+
},
|
| 24292 |
+
{
|
| 24293 |
+
"epoch": 1.302219254206566,
|
| 24294 |
+
"grad_norm": 0.11311063915491104,
|
| 24295 |
+
"learning_rate": 3.806023374435663e-05,
|
| 24296 |
+
"loss": 2.0173,
|
| 24297 |
+
"num_input_tokens_seen": 71553726688,
|
| 24298 |
+
"step": 136500
|
| 24299 |
+
},
|
| 24300 |
+
{
|
| 24301 |
+
"epoch": 1.302219254206566,
|
| 24302 |
+
"eval_loss": 1.9524949789047241,
|
| 24303 |
+
"eval_runtime": 83.0874,
|
| 24304 |
+
"eval_samples_per_second": 60.178,
|
| 24305 |
+
"eval_steps_per_second": 15.044,
|
| 24306 |
+
"num_input_tokens_seen": 71553726688,
|
| 24307 |
+
"step": 136500
|
| 24308 |
+
},
|
| 24309 |
+
{
|
| 24310 |
+
"epoch": 1.3026962567227542,
|
| 24311 |
+
"grad_norm": 0.728589653968811,
|
| 24312 |
+
"learning_rate": 3.699408391135611e-05,
|
| 24313 |
+
"loss": 2.0415,
|
| 24314 |
+
"num_input_tokens_seen": 71579934304,
|
| 24315 |
+
"step": 136550
|
| 24316 |
+
},
|
| 24317 |
+
{
|
| 24318 |
+
"epoch": 1.3031732592389424,
|
| 24319 |
+
"grad_norm": 0.11253057420253754,
|
| 24320 |
+
"learning_rate": 3.594250574048058e-05,
|
| 24321 |
+
"loss": 2.0334,
|
| 24322 |
+
"num_input_tokens_seen": 71606145184,
|
| 24323 |
+
"step": 136600
|
| 24324 |
+
},
|
| 24325 |
+
{
|
| 24326 |
+
"epoch": 1.3036502617551307,
|
| 24327 |
+
"grad_norm": 0.12201691418886185,
|
| 24328 |
+
"learning_rate": 3.4905532326861944e-05,
|
| 24329 |
+
"loss": 2.0403,
|
| 24330 |
+
"num_input_tokens_seen": 71632351648,
|
| 24331 |
+
"step": 136650
|
| 24332 |
+
},
|
| 24333 |
+
{
|
| 24334 |
+
"epoch": 1.304127264271319,
|
| 24335 |
+
"grad_norm": 0.11976749449968338,
|
| 24336 |
+
"learning_rate": 3.3883196305992905e-05,
|
| 24337 |
+
"loss": 2.0292,
|
| 24338 |
+
"num_input_tokens_seen": 71658566048,
|
| 24339 |
+
"step": 136700
|
| 24340 |
+
},
|
| 24341 |
+
{
|
| 24342 |
+
"epoch": 1.3046042667875073,
|
| 24343 |
+
"grad_norm": 0.12131944298744202,
|
| 24344 |
+
"learning_rate": 3.2875529852700146e-05,
|
| 24345 |
+
"loss": 2.0405,
|
| 24346 |
+
"num_input_tokens_seen": 71684775808,
|
| 24347 |
+
"step": 136750
|
| 24348 |
+
},
|
| 24349 |
+
{
|
| 24350 |
+
"epoch": 1.3050812693036955,
|
| 24351 |
+
"grad_norm": 0.11625051498413086,
|
| 24352 |
+
"learning_rate": 3.18825646801314e-05,
|
| 24353 |
+
"loss": 2.0392,
|
| 24354 |
+
"num_input_tokens_seen": 71710990048,
|
| 24355 |
+
"step": 136800
|
| 24356 |
+
},
|
| 24357 |
+
{
|
| 24358 |
+
"epoch": 1.305558271819884,
|
| 24359 |
+
"grad_norm": 0.11870067566633224,
|
| 24360 |
+
"learning_rate": 3.0904332038757974e-05,
|
| 24361 |
+
"loss": 2.0388,
|
| 24362 |
+
"num_input_tokens_seen": 71737198176,
|
| 24363 |
+
"step": 136850
|
| 24364 |
+
},
|
| 24365 |
+
{
|
| 24366 |
+
"epoch": 1.3060352743360721,
|
| 24367 |
+
"grad_norm": 0.11490604281425476,
|
| 24368 |
+
"learning_rate": 2.994086271539048e-05,
|
| 24369 |
+
"loss": 2.0261,
|
| 24370 |
+
"num_input_tokens_seen": 71763409248,
|
| 24371 |
+
"step": 136900
|
| 24372 |
+
},
|
| 24373 |
+
{
|
| 24374 |
+
"epoch": 1.3065122768522603,
|
| 24375 |
+
"grad_norm": 0.1218944787979126,
|
| 24376 |
+
"learning_rate": 2.8992187032210516e-05,
|
| 24377 |
+
"loss": 2.0421,
|
| 24378 |
+
"num_input_tokens_seen": 71789610880,
|
| 24379 |
+
"step": 136950
|
| 24380 |
+
},
|
| 24381 |
+
{
|
| 24382 |
+
"epoch": 1.3069892793684486,
|
| 24383 |
+
"grad_norm": 0.11681609600782394,
|
| 24384 |
+
"learning_rate": 2.8058334845816213e-05,
|
| 24385 |
+
"loss": 2.0287,
|
| 24386 |
+
"num_input_tokens_seen": 71815816608,
|
| 24387 |
+
"step": 137000
|
| 24388 |
+
},
|
| 24389 |
+
{
|
| 24390 |
+
"epoch": 1.3069892793684486,
|
| 24391 |
+
"eval_loss": 1.951898455619812,
|
| 24392 |
+
"eval_runtime": 82.7779,
|
| 24393 |
+
"eval_samples_per_second": 60.403,
|
| 24394 |
+
"eval_steps_per_second": 15.101,
|
| 24395 |
+
"num_input_tokens_seen": 71815816608,
|
| 24396 |
+
"step": 137000
|
| 24397 |
}
|
| 24398 |
],
|
| 24399 |
"logging_steps": 50,
|
| 24400 |
"max_steps": 140000,
|
| 24401 |
+
"num_input_tokens_seen": 71815816608,
|
| 24402 |
"num_train_epochs": 2,
|
| 24403 |
"save_steps": 1000,
|
| 24404 |
"stateful_callbacks": {
|
|
|
|
| 24413 |
"attributes": {}
|
| 24414 |
}
|
| 24415 |
},
|
| 24416 |
+
"total_flos": 1.271008961912107e+20,
|
| 24417 |
"train_batch_size": 32,
|
| 24418 |
"trial_name": null,
|
| 24419 |
"trial_params": null
|