Training in progress, step 22800, checkpoint
Browse files
.gitattributes
CHANGED
|
@@ -34,4 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
-
checkpoint
|
|
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 3541119728
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:237f5c17c55df679a8e8f4a65ad9de09e2a99a2eaba9876aace075096abcfb63
|
| 3 |
size 3541119728
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 778374186
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9cc82f803cbf27fa5d02dc20006fbaf09405895a4d61a6169c832576c2db2940
|
| 3 |
size 778374186
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:959293b55cb243a7a3af582584a0698f2aeb95373b8b27dd72c03d8f0bdce376
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -20258,6 +20258,276 @@
|
|
| 20258 |
"mean_token_accuracy": 0.8939898759126663,
|
| 20259 |
"num_tokens": 37361844.0,
|
| 20260 |
"step": 22500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20261 |
}
|
| 20262 |
],
|
| 20263 |
"logging_steps": 10,
|
|
@@ -20277,7 +20547,7 @@
|
|
| 20277 |
"attributes": {}
|
| 20278 |
}
|
| 20279 |
},
|
| 20280 |
-
"total_flos": 8.
|
| 20281 |
"train_batch_size": 2,
|
| 20282 |
"trial_name": null,
|
| 20283 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.0880468628831172,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 22800,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 20258 |
"mean_token_accuracy": 0.8939898759126663,
|
| 20259 |
"num_tokens": 37361844.0,
|
| 20260 |
"step": 22500
|
| 20261 |
+
},
|
| 20262 |
+
{
|
| 20263 |
+
"epoch": 1.074207518581706,
|
| 20264 |
+
"grad_norm": 0.37823808193206787,
|
| 20265 |
+
"learning_rate": 9.258410880458124e-06,
|
| 20266 |
+
"loss": 0.6272,
|
| 20267 |
+
"mean_token_accuracy": 0.8770518571138382,
|
| 20268 |
+
"num_tokens": 37379000.0,
|
| 20269 |
+
"step": 22510
|
| 20270 |
+
},
|
| 20271 |
+
{
|
| 20272 |
+
"epoch": 1.07468473735072,
|
| 20273 |
+
"grad_norm": 0.4005347788333893,
|
| 20274 |
+
"learning_rate": 9.253638749701743e-06,
|
| 20275 |
+
"loss": 0.6015,
|
| 20276 |
+
"mean_token_accuracy": 0.8748599126935005,
|
| 20277 |
+
"num_tokens": 37395773.0,
|
| 20278 |
+
"step": 22520
|
| 20279 |
+
},
|
| 20280 |
+
{
|
| 20281 |
+
"epoch": 1.075161956119734,
|
| 20282 |
+
"grad_norm": 0.39781296253204346,
|
| 20283 |
+
"learning_rate": 9.24886661894536e-06,
|
| 20284 |
+
"loss": 0.6869,
|
| 20285 |
+
"mean_token_accuracy": 0.8716883912682534,
|
| 20286 |
+
"num_tokens": 37415210.0,
|
| 20287 |
+
"step": 22530
|
| 20288 |
+
},
|
| 20289 |
+
{
|
| 20290 |
+
"epoch": 1.0756391748887484,
|
| 20291 |
+
"grad_norm": 0.39247405529022217,
|
| 20292 |
+
"learning_rate": 9.244094488188978e-06,
|
| 20293 |
+
"loss": 0.7395,
|
| 20294 |
+
"mean_token_accuracy": 0.8584993034601212,
|
| 20295 |
+
"num_tokens": 37432118.0,
|
| 20296 |
+
"step": 22540
|
| 20297 |
+
},
|
| 20298 |
+
{
|
| 20299 |
+
"epoch": 1.0761163936577625,
|
| 20300 |
+
"grad_norm": 0.3580706715583801,
|
| 20301 |
+
"learning_rate": 9.239322357432594e-06,
|
| 20302 |
+
"loss": 0.79,
|
| 20303 |
+
"mean_token_accuracy": 0.8506992489099503,
|
| 20304 |
+
"num_tokens": 37452181.0,
|
| 20305 |
+
"step": 22550
|
| 20306 |
+
},
|
| 20307 |
+
{
|
| 20308 |
+
"epoch": 1.0765936124267768,
|
| 20309 |
+
"grad_norm": 0.3170486092567444,
|
| 20310 |
+
"learning_rate": 9.234550226676213e-06,
|
| 20311 |
+
"loss": 0.6072,
|
| 20312 |
+
"mean_token_accuracy": 0.8733228012919426,
|
| 20313 |
+
"num_tokens": 37468470.0,
|
| 20314 |
+
"step": 22560
|
| 20315 |
+
},
|
| 20316 |
+
{
|
| 20317 |
+
"epoch": 1.0770708311957908,
|
| 20318 |
+
"grad_norm": 0.44699838757514954,
|
| 20319 |
+
"learning_rate": 9.22977809591983e-06,
|
| 20320 |
+
"loss": 0.5566,
|
| 20321 |
+
"mean_token_accuracy": 0.8752368673682213,
|
| 20322 |
+
"num_tokens": 37485065.0,
|
| 20323 |
+
"step": 22570
|
| 20324 |
+
},
|
| 20325 |
+
{
|
| 20326 |
+
"epoch": 1.0775480499648051,
|
| 20327 |
+
"grad_norm": 0.5032857656478882,
|
| 20328 |
+
"learning_rate": 9.225005965163446e-06,
|
| 20329 |
+
"loss": 0.656,
|
| 20330 |
+
"mean_token_accuracy": 0.8768767550587654,
|
| 20331 |
+
"num_tokens": 37501662.0,
|
| 20332 |
+
"step": 22580
|
| 20333 |
+
},
|
| 20334 |
+
{
|
| 20335 |
+
"epoch": 1.0780252687338192,
|
| 20336 |
+
"grad_norm": 0.3128605782985687,
|
| 20337 |
+
"learning_rate": 9.220233834407063e-06,
|
| 20338 |
+
"loss": 0.6645,
|
| 20339 |
+
"mean_token_accuracy": 0.8662539958953858,
|
| 20340 |
+
"num_tokens": 37519154.0,
|
| 20341 |
+
"step": 22590
|
| 20342 |
+
},
|
| 20343 |
+
{
|
| 20344 |
+
"epoch": 1.0785024875028335,
|
| 20345 |
+
"grad_norm": 0.3933728039264679,
|
| 20346 |
+
"learning_rate": 9.215461703650681e-06,
|
| 20347 |
+
"loss": 0.5397,
|
| 20348 |
+
"mean_token_accuracy": 0.8856153175234794,
|
| 20349 |
+
"num_tokens": 37534651.0,
|
| 20350 |
+
"step": 22600
|
| 20351 |
+
},
|
| 20352 |
+
{
|
| 20353 |
+
"epoch": 1.0789797062718476,
|
| 20354 |
+
"grad_norm": 0.5340325832366943,
|
| 20355 |
+
"learning_rate": 9.210689572894298e-06,
|
| 20356 |
+
"loss": 0.6269,
|
| 20357 |
+
"mean_token_accuracy": 0.8772767931222916,
|
| 20358 |
+
"num_tokens": 37551794.0,
|
| 20359 |
+
"step": 22610
|
| 20360 |
+
},
|
| 20361 |
+
{
|
| 20362 |
+
"epoch": 1.0794569250408619,
|
| 20363 |
+
"grad_norm": 0.3841538429260254,
|
| 20364 |
+
"learning_rate": 9.205917442137915e-06,
|
| 20365 |
+
"loss": 0.6175,
|
| 20366 |
+
"mean_token_accuracy": 0.8755568400025368,
|
| 20367 |
+
"num_tokens": 37567508.0,
|
| 20368 |
+
"step": 22620
|
| 20369 |
+
},
|
| 20370 |
+
{
|
| 20371 |
+
"epoch": 1.079934143809876,
|
| 20372 |
+
"grad_norm": 0.37845683097839355,
|
| 20373 |
+
"learning_rate": 9.201145311381533e-06,
|
| 20374 |
+
"loss": 0.5757,
|
| 20375 |
+
"mean_token_accuracy": 0.879976649582386,
|
| 20376 |
+
"num_tokens": 37582508.0,
|
| 20377 |
+
"step": 22630
|
| 20378 |
+
},
|
| 20379 |
+
{
|
| 20380 |
+
"epoch": 1.0804113625788903,
|
| 20381 |
+
"grad_norm": 0.3559890687465668,
|
| 20382 |
+
"learning_rate": 9.19637318062515e-06,
|
| 20383 |
+
"loss": 0.8135,
|
| 20384 |
+
"mean_token_accuracy": 0.8407615974545479,
|
| 20385 |
+
"num_tokens": 37601326.0,
|
| 20386 |
+
"step": 22640
|
| 20387 |
+
},
|
| 20388 |
+
{
|
| 20389 |
+
"epoch": 1.0808885813479043,
|
| 20390 |
+
"grad_norm": 0.32038992643356323,
|
| 20391 |
+
"learning_rate": 9.191601049868766e-06,
|
| 20392 |
+
"loss": 0.5877,
|
| 20393 |
+
"mean_token_accuracy": 0.885163950920105,
|
| 20394 |
+
"num_tokens": 37616610.0,
|
| 20395 |
+
"step": 22650
|
| 20396 |
+
},
|
| 20397 |
+
{
|
| 20398 |
+
"epoch": 1.0813658001169186,
|
| 20399 |
+
"grad_norm": 0.366234689950943,
|
| 20400 |
+
"learning_rate": 9.186828919112385e-06,
|
| 20401 |
+
"loss": 0.6438,
|
| 20402 |
+
"mean_token_accuracy": 0.8744160294532776,
|
| 20403 |
+
"num_tokens": 37633602.0,
|
| 20404 |
+
"step": 22660
|
| 20405 |
+
},
|
| 20406 |
+
{
|
| 20407 |
+
"epoch": 1.0818430188859327,
|
| 20408 |
+
"grad_norm": 0.32627347111701965,
|
| 20409 |
+
"learning_rate": 9.182056788356001e-06,
|
| 20410 |
+
"loss": 0.6948,
|
| 20411 |
+
"mean_token_accuracy": 0.8610922127962113,
|
| 20412 |
+
"num_tokens": 37651592.0,
|
| 20413 |
+
"step": 22670
|
| 20414 |
+
},
|
| 20415 |
+
{
|
| 20416 |
+
"epoch": 1.082320237654947,
|
| 20417 |
+
"grad_norm": 0.3474673628807068,
|
| 20418 |
+
"learning_rate": 9.17728465759962e-06,
|
| 20419 |
+
"loss": 0.6262,
|
| 20420 |
+
"mean_token_accuracy": 0.8750404015183448,
|
| 20421 |
+
"num_tokens": 37668010.0,
|
| 20422 |
+
"step": 22680
|
| 20423 |
+
},
|
| 20424 |
+
{
|
| 20425 |
+
"epoch": 1.082797456423961,
|
| 20426 |
+
"grad_norm": 0.3955213129520416,
|
| 20427 |
+
"learning_rate": 9.172512526843236e-06,
|
| 20428 |
+
"loss": 0.5588,
|
| 20429 |
+
"mean_token_accuracy": 0.8861236184835434,
|
| 20430 |
+
"num_tokens": 37684538.0,
|
| 20431 |
+
"step": 22690
|
| 20432 |
+
},
|
| 20433 |
+
{
|
| 20434 |
+
"epoch": 1.0832746751929754,
|
| 20435 |
+
"grad_norm": 0.4451896846294403,
|
| 20436 |
+
"learning_rate": 9.167740396086855e-06,
|
| 20437 |
+
"loss": 0.5774,
|
| 20438 |
+
"mean_token_accuracy": 0.8859012797474861,
|
| 20439 |
+
"num_tokens": 37700694.0,
|
| 20440 |
+
"step": 22700
|
| 20441 |
+
},
|
| 20442 |
+
{
|
| 20443 |
+
"epoch": 1.0837518939619895,
|
| 20444 |
+
"grad_norm": 0.41938453912734985,
|
| 20445 |
+
"learning_rate": 9.162968265330471e-06,
|
| 20446 |
+
"loss": 0.6575,
|
| 20447 |
+
"mean_token_accuracy": 0.8762999802827836,
|
| 20448 |
+
"num_tokens": 37716717.0,
|
| 20449 |
+
"step": 22710
|
| 20450 |
+
},
|
| 20451 |
+
{
|
| 20452 |
+
"epoch": 1.0842291127310038,
|
| 20453 |
+
"grad_norm": 0.38627904653549194,
|
| 20454 |
+
"learning_rate": 9.158196134574088e-06,
|
| 20455 |
+
"loss": 0.6263,
|
| 20456 |
+
"mean_token_accuracy": 0.8728866443037987,
|
| 20457 |
+
"num_tokens": 37734196.0,
|
| 20458 |
+
"step": 22720
|
| 20459 |
+
},
|
| 20460 |
+
{
|
| 20461 |
+
"epoch": 1.0847063315000178,
|
| 20462 |
+
"grad_norm": 0.39531171321868896,
|
| 20463 |
+
"learning_rate": 9.153424003817706e-06,
|
| 20464 |
+
"loss": 0.5782,
|
| 20465 |
+
"mean_token_accuracy": 0.8879543572664261,
|
| 20466 |
+
"num_tokens": 37750684.0,
|
| 20467 |
+
"step": 22730
|
| 20468 |
+
},
|
| 20469 |
+
{
|
| 20470 |
+
"epoch": 1.0851835502690321,
|
| 20471 |
+
"grad_norm": 0.3783516585826874,
|
| 20472 |
+
"learning_rate": 9.148651873061323e-06,
|
| 20473 |
+
"loss": 0.599,
|
| 20474 |
+
"mean_token_accuracy": 0.8701232433319092,
|
| 20475 |
+
"num_tokens": 37767146.0,
|
| 20476 |
+
"step": 22740
|
| 20477 |
+
},
|
| 20478 |
+
{
|
| 20479 |
+
"epoch": 1.0856607690380462,
|
| 20480 |
+
"grad_norm": 0.39319974184036255,
|
| 20481 |
+
"learning_rate": 9.14387974230494e-06,
|
| 20482 |
+
"loss": 0.6413,
|
| 20483 |
+
"mean_token_accuracy": 0.8686925515532493,
|
| 20484 |
+
"num_tokens": 37787034.0,
|
| 20485 |
+
"step": 22750
|
| 20486 |
+
},
|
| 20487 |
+
{
|
| 20488 |
+
"epoch": 1.0861379878070605,
|
| 20489 |
+
"grad_norm": 0.41720524430274963,
|
| 20490 |
+
"learning_rate": 9.139107611548556e-06,
|
| 20491 |
+
"loss": 0.669,
|
| 20492 |
+
"mean_token_accuracy": 0.8737372472882271,
|
| 20493 |
+
"num_tokens": 37802820.0,
|
| 20494 |
+
"step": 22760
|
| 20495 |
+
},
|
| 20496 |
+
{
|
| 20497 |
+
"epoch": 1.0866152065760746,
|
| 20498 |
+
"grad_norm": 0.5915963053703308,
|
| 20499 |
+
"learning_rate": 9.134335480792175e-06,
|
| 20500 |
+
"loss": 0.7127,
|
| 20501 |
+
"mean_token_accuracy": 0.8542029947042465,
|
| 20502 |
+
"num_tokens": 37820083.0,
|
| 20503 |
+
"step": 22770
|
| 20504 |
+
},
|
| 20505 |
+
{
|
| 20506 |
+
"epoch": 1.0870924253450889,
|
| 20507 |
+
"grad_norm": 0.48407578468322754,
|
| 20508 |
+
"learning_rate": 9.129563350035791e-06,
|
| 20509 |
+
"loss": 0.6094,
|
| 20510 |
+
"mean_token_accuracy": 0.8686896711587906,
|
| 20511 |
+
"num_tokens": 37836877.0,
|
| 20512 |
+
"step": 22780
|
| 20513 |
+
},
|
| 20514 |
+
{
|
| 20515 |
+
"epoch": 1.087569644114103,
|
| 20516 |
+
"grad_norm": 0.411697119474411,
|
| 20517 |
+
"learning_rate": 9.124791219279408e-06,
|
| 20518 |
+
"loss": 0.5874,
|
| 20519 |
+
"mean_token_accuracy": 0.8753976777195931,
|
| 20520 |
+
"num_tokens": 37852853.0,
|
| 20521 |
+
"step": 22790
|
| 20522 |
+
},
|
| 20523 |
+
{
|
| 20524 |
+
"epoch": 1.0880468628831172,
|
| 20525 |
+
"grad_norm": 0.43069422245025635,
|
| 20526 |
+
"learning_rate": 9.120019088523026e-06,
|
| 20527 |
+
"loss": 0.6337,
|
| 20528 |
+
"mean_token_accuracy": 0.8802076116204262,
|
| 20529 |
+
"num_tokens": 37869204.0,
|
| 20530 |
+
"step": 22800
|
| 20531 |
}
|
| 20532 |
],
|
| 20533 |
"logging_steps": 10,
|
|
|
|
| 20547 |
"attributes": {}
|
| 20548 |
}
|
| 20549 |
},
|
| 20550 |
+
"total_flos": 8.528831995220091e+17,
|
| 20551 |
"train_batch_size": 2,
|
| 20552 |
"trial_name": null,
|
| 20553 |
"trial_params": null
|