Training in progress, step 12000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 536223056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f3e424925fa2b2770536f70d1899af46260c1bbb5c290c98396f2248352c7add
|
| 3 |
size 536223056
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1072594443
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be8d0890e7228cd98f10766bc63bebe515a3fa05be0c7762618a01f87fa2799c
|
| 3 |
size 1072594443
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bdff80ed8983588a862f2109bcc080c93759e076260079b20d08888071ee3452
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 2.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -10358,6 +10358,456 @@
|
|
| 10358 |
"mean_token_accuracy": 0.8073502600193023,
|
| 10359 |
"num_tokens": 12733862.0,
|
| 10360 |
"step": 11500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10361 |
}
|
| 10362 |
],
|
| 10363 |
"logging_steps": 10,
|
|
@@ -10377,7 +10827,7 @@
|
|
| 10377 |
"attributes": {}
|
| 10378 |
}
|
| 10379 |
},
|
| 10380 |
-
"total_flos": 1.
|
| 10381 |
"train_batch_size": 8,
|
| 10382 |
"trial_name": null,
|
| 10383 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.4178924037880316,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 12000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 10358 |
"mean_token_accuracy": 0.8073502600193023,
|
| 10359 |
"num_tokens": 12733862.0,
|
| 10360 |
"step": 11500
|
| 10361 |
+
},
|
| 10362 |
+
{
|
| 10363 |
+
"epoch": 2.31916179730002,
|
| 10364 |
+
"grad_norm": 11.9375,
|
| 10365 |
+
"learning_rate": 4.540264624890859e-06,
|
| 10366 |
+
"loss": 0.8758,
|
| 10367 |
+
"mean_token_accuracy": 0.7853485405445099,
|
| 10368 |
+
"num_tokens": 12745243.0,
|
| 10369 |
+
"step": 11510
|
| 10370 |
+
},
|
| 10371 |
+
{
|
| 10372 |
+
"epoch": 2.32117670763651,
|
| 10373 |
+
"grad_norm": 13.8125,
|
| 10374 |
+
"learning_rate": 4.526831889314259e-06,
|
| 10375 |
+
"loss": 0.8738,
|
| 10376 |
+
"mean_token_accuracy": 0.7857240378856659,
|
| 10377 |
+
"num_tokens": 12756282.0,
|
| 10378 |
+
"step": 11520
|
| 10379 |
+
},
|
| 10380 |
+
{
|
| 10381 |
+
"epoch": 2.3231916179730003,
|
| 10382 |
+
"grad_norm": 9.3125,
|
| 10383 |
+
"learning_rate": 4.5133991537376595e-06,
|
| 10384 |
+
"loss": 0.8261,
|
| 10385 |
+
"mean_token_accuracy": 0.7956898987293244,
|
| 10386 |
+
"num_tokens": 12766669.0,
|
| 10387 |
+
"step": 11530
|
| 10388 |
+
},
|
| 10389 |
+
{
|
| 10390 |
+
"epoch": 2.32520652830949,
|
| 10391 |
+
"grad_norm": 11.25,
|
| 10392 |
+
"learning_rate": 4.499966418161058e-06,
|
| 10393 |
+
"loss": 0.8651,
|
| 10394 |
+
"mean_token_accuracy": 0.7886692404747009,
|
| 10395 |
+
"num_tokens": 12777269.0,
|
| 10396 |
+
"step": 11540
|
| 10397 |
+
},
|
| 10398 |
+
{
|
| 10399 |
+
"epoch": 2.32722143864598,
|
| 10400 |
+
"grad_norm": 9.9375,
|
| 10401 |
+
"learning_rate": 4.486533682584459e-06,
|
| 10402 |
+
"loss": 0.7734,
|
| 10403 |
+
"mean_token_accuracy": 0.805381816625595,
|
| 10404 |
+
"num_tokens": 12788362.0,
|
| 10405 |
+
"step": 11550
|
| 10406 |
+
},
|
| 10407 |
+
{
|
| 10408 |
+
"epoch": 2.3292363489824703,
|
| 10409 |
+
"grad_norm": 11.125,
|
| 10410 |
+
"learning_rate": 4.4731009470078585e-06,
|
| 10411 |
+
"loss": 0.7621,
|
| 10412 |
+
"mean_token_accuracy": 0.8127647817134858,
|
| 10413 |
+
"num_tokens": 12798806.0,
|
| 10414 |
+
"step": 11560
|
| 10415 |
+
},
|
| 10416 |
+
{
|
| 10417 |
+
"epoch": 2.3312512593189605,
|
| 10418 |
+
"grad_norm": 11.375,
|
| 10419 |
+
"learning_rate": 4.459668211431258e-06,
|
| 10420 |
+
"loss": 0.8961,
|
| 10421 |
+
"mean_token_accuracy": 0.7812471866607666,
|
| 10422 |
+
"num_tokens": 12809177.0,
|
| 10423 |
+
"step": 11570
|
| 10424 |
+
},
|
| 10425 |
+
{
|
| 10426 |
+
"epoch": 2.33326616965545,
|
| 10427 |
+
"grad_norm": 11.5625,
|
| 10428 |
+
"learning_rate": 4.446235475854659e-06,
|
| 10429 |
+
"loss": 0.8318,
|
| 10430 |
+
"mean_token_accuracy": 0.791858333349228,
|
| 10431 |
+
"num_tokens": 12819801.0,
|
| 10432 |
+
"step": 11580
|
| 10433 |
+
},
|
| 10434 |
+
{
|
| 10435 |
+
"epoch": 2.3352810799919403,
|
| 10436 |
+
"grad_norm": 12.9375,
|
| 10437 |
+
"learning_rate": 4.432802740278058e-06,
|
| 10438 |
+
"loss": 0.8065,
|
| 10439 |
+
"mean_token_accuracy": 0.8027865469455719,
|
| 10440 |
+
"num_tokens": 12830608.0,
|
| 10441 |
+
"step": 11590
|
| 10442 |
+
},
|
| 10443 |
+
{
|
| 10444 |
+
"epoch": 2.3372959903284305,
|
| 10445 |
+
"grad_norm": 13.5,
|
| 10446 |
+
"learning_rate": 4.419370004701457e-06,
|
| 10447 |
+
"loss": 0.8616,
|
| 10448 |
+
"mean_token_accuracy": 0.7896072804927826,
|
| 10449 |
+
"num_tokens": 12841034.0,
|
| 10450 |
+
"step": 11600
|
| 10451 |
+
},
|
| 10452 |
+
{
|
| 10453 |
+
"epoch": 2.3393109006649206,
|
| 10454 |
+
"grad_norm": 12.0625,
|
| 10455 |
+
"learning_rate": 4.405937269124858e-06,
|
| 10456 |
+
"loss": 0.8395,
|
| 10457 |
+
"mean_token_accuracy": 0.7913760662078857,
|
| 10458 |
+
"num_tokens": 12852500.0,
|
| 10459 |
+
"step": 11610
|
| 10460 |
+
},
|
| 10461 |
+
{
|
| 10462 |
+
"epoch": 2.3413258110014104,
|
| 10463 |
+
"grad_norm": 12.9375,
|
| 10464 |
+
"learning_rate": 4.3925045335482574e-06,
|
| 10465 |
+
"loss": 0.7748,
|
| 10466 |
+
"mean_token_accuracy": 0.8028417646884918,
|
| 10467 |
+
"num_tokens": 12862957.0,
|
| 10468 |
+
"step": 11620
|
| 10469 |
+
},
|
| 10470 |
+
{
|
| 10471 |
+
"epoch": 2.3433407213379005,
|
| 10472 |
+
"grad_norm": 11.9375,
|
| 10473 |
+
"learning_rate": 4.379071797971657e-06,
|
| 10474 |
+
"loss": 0.726,
|
| 10475 |
+
"mean_token_accuracy": 0.8169633626937867,
|
| 10476 |
+
"num_tokens": 12873810.0,
|
| 10477 |
+
"step": 11630
|
| 10478 |
+
},
|
| 10479 |
+
{
|
| 10480 |
+
"epoch": 2.3453556316743907,
|
| 10481 |
+
"grad_norm": 14.25,
|
| 10482 |
+
"learning_rate": 4.365639062395058e-06,
|
| 10483 |
+
"loss": 0.7605,
|
| 10484 |
+
"mean_token_accuracy": 0.8124097108840942,
|
| 10485 |
+
"num_tokens": 12884782.0,
|
| 10486 |
+
"step": 11640
|
| 10487 |
+
},
|
| 10488 |
+
{
|
| 10489 |
+
"epoch": 2.3473705420108804,
|
| 10490 |
+
"grad_norm": 10.125,
|
| 10491 |
+
"learning_rate": 4.3522063268184565e-06,
|
| 10492 |
+
"loss": 0.8192,
|
| 10493 |
+
"mean_token_accuracy": 0.7956154048442841,
|
| 10494 |
+
"num_tokens": 12896291.0,
|
| 10495 |
+
"step": 11650
|
| 10496 |
+
},
|
| 10497 |
+
{
|
| 10498 |
+
"epoch": 2.3493854523473705,
|
| 10499 |
+
"grad_norm": 10.5625,
|
| 10500 |
+
"learning_rate": 4.338773591241857e-06,
|
| 10501 |
+
"loss": 0.7772,
|
| 10502 |
+
"mean_token_accuracy": 0.803158450126648,
|
| 10503 |
+
"num_tokens": 12908167.0,
|
| 10504 |
+
"step": 11660
|
| 10505 |
+
},
|
| 10506 |
+
{
|
| 10507 |
+
"epoch": 2.3514003626838607,
|
| 10508 |
+
"grad_norm": 10.4375,
|
| 10509 |
+
"learning_rate": 4.325340855665257e-06,
|
| 10510 |
+
"loss": 0.9174,
|
| 10511 |
+
"mean_token_accuracy": 0.7823165059089661,
|
| 10512 |
+
"num_tokens": 12920439.0,
|
| 10513 |
+
"step": 11670
|
| 10514 |
+
},
|
| 10515 |
+
{
|
| 10516 |
+
"epoch": 2.3534152730203504,
|
| 10517 |
+
"grad_norm": 12.375,
|
| 10518 |
+
"learning_rate": 4.311908120088656e-06,
|
| 10519 |
+
"loss": 0.9018,
|
| 10520 |
+
"mean_token_accuracy": 0.7804082155227661,
|
| 10521 |
+
"num_tokens": 12932185.0,
|
| 10522 |
+
"step": 11680
|
| 10523 |
+
},
|
| 10524 |
+
{
|
| 10525 |
+
"epoch": 2.3554301833568405,
|
| 10526 |
+
"grad_norm": 13.25,
|
| 10527 |
+
"learning_rate": 4.298475384512056e-06,
|
| 10528 |
+
"loss": 0.7592,
|
| 10529 |
+
"mean_token_accuracy": 0.8126965999603272,
|
| 10530 |
+
"num_tokens": 12942868.0,
|
| 10531 |
+
"step": 11690
|
| 10532 |
+
},
|
| 10533 |
+
{
|
| 10534 |
+
"epoch": 2.3574450936933307,
|
| 10535 |
+
"grad_norm": 12.0,
|
| 10536 |
+
"learning_rate": 4.285042648935457e-06,
|
| 10537 |
+
"loss": 0.78,
|
| 10538 |
+
"mean_token_accuracy": 0.7975714325904846,
|
| 10539 |
+
"num_tokens": 12954738.0,
|
| 10540 |
+
"step": 11700
|
| 10541 |
+
},
|
| 10542 |
+
{
|
| 10543 |
+
"epoch": 2.359460004029821,
|
| 10544 |
+
"grad_norm": 12.375,
|
| 10545 |
+
"learning_rate": 4.271609913358855e-06,
|
| 10546 |
+
"loss": 0.8378,
|
| 10547 |
+
"mean_token_accuracy": 0.7859510540962219,
|
| 10548 |
+
"num_tokens": 12967446.0,
|
| 10549 |
+
"step": 11710
|
| 10550 |
+
},
|
| 10551 |
+
{
|
| 10552 |
+
"epoch": 2.3614749143663105,
|
| 10553 |
+
"grad_norm": 11.9375,
|
| 10554 |
+
"learning_rate": 4.258177177782256e-06,
|
| 10555 |
+
"loss": 0.7837,
|
| 10556 |
+
"mean_token_accuracy": 0.8069138765335083,
|
| 10557 |
+
"num_tokens": 12978934.0,
|
| 10558 |
+
"step": 11720
|
| 10559 |
+
},
|
| 10560 |
+
{
|
| 10561 |
+
"epoch": 2.3634898247028007,
|
| 10562 |
+
"grad_norm": 11.9375,
|
| 10563 |
+
"learning_rate": 4.244744442205656e-06,
|
| 10564 |
+
"loss": 0.8833,
|
| 10565 |
+
"mean_token_accuracy": 0.786394476890564,
|
| 10566 |
+
"num_tokens": 12989739.0,
|
| 10567 |
+
"step": 11730
|
| 10568 |
+
},
|
| 10569 |
+
{
|
| 10570 |
+
"epoch": 2.365504735039291,
|
| 10571 |
+
"grad_norm": 11.75,
|
| 10572 |
+
"learning_rate": 4.231311706629055e-06,
|
| 10573 |
+
"loss": 0.8046,
|
| 10574 |
+
"mean_token_accuracy": 0.8040765285491943,
|
| 10575 |
+
"num_tokens": 13001096.0,
|
| 10576 |
+
"step": 11740
|
| 10577 |
+
},
|
| 10578 |
+
{
|
| 10579 |
+
"epoch": 2.367519645375781,
|
| 10580 |
+
"grad_norm": 10.75,
|
| 10581 |
+
"learning_rate": 4.217878971052455e-06,
|
| 10582 |
+
"loss": 0.7969,
|
| 10583 |
+
"mean_token_accuracy": 0.8002257823944092,
|
| 10584 |
+
"num_tokens": 13013888.0,
|
| 10585 |
+
"step": 11750
|
| 10586 |
+
},
|
| 10587 |
+
{
|
| 10588 |
+
"epoch": 2.3695345557122707,
|
| 10589 |
+
"grad_norm": 11.875,
|
| 10590 |
+
"learning_rate": 4.204446235475855e-06,
|
| 10591 |
+
"loss": 0.7773,
|
| 10592 |
+
"mean_token_accuracy": 0.8048622369766235,
|
| 10593 |
+
"num_tokens": 13024639.0,
|
| 10594 |
+
"step": 11760
|
| 10595 |
+
},
|
| 10596 |
+
{
|
| 10597 |
+
"epoch": 2.371549466048761,
|
| 10598 |
+
"grad_norm": 10.0,
|
| 10599 |
+
"learning_rate": 4.191013499899254e-06,
|
| 10600 |
+
"loss": 0.7608,
|
| 10601 |
+
"mean_token_accuracy": 0.8129175007343292,
|
| 10602 |
+
"num_tokens": 13035841.0,
|
| 10603 |
+
"step": 11770
|
| 10604 |
+
},
|
| 10605 |
+
{
|
| 10606 |
+
"epoch": 2.373564376385251,
|
| 10607 |
+
"grad_norm": 10.0,
|
| 10608 |
+
"learning_rate": 4.177580764322655e-06,
|
| 10609 |
+
"loss": 0.7931,
|
| 10610 |
+
"mean_token_accuracy": 0.8056257784366607,
|
| 10611 |
+
"num_tokens": 13046314.0,
|
| 10612 |
+
"step": 11780
|
| 10613 |
+
},
|
| 10614 |
+
{
|
| 10615 |
+
"epoch": 2.3755792867217407,
|
| 10616 |
+
"grad_norm": 11.625,
|
| 10617 |
+
"learning_rate": 4.1641480287460546e-06,
|
| 10618 |
+
"loss": 0.7639,
|
| 10619 |
+
"mean_token_accuracy": 0.8064506113529205,
|
| 10620 |
+
"num_tokens": 13057027.0,
|
| 10621 |
+
"step": 11790
|
| 10622 |
+
},
|
| 10623 |
+
{
|
| 10624 |
+
"epoch": 2.377594197058231,
|
| 10625 |
+
"grad_norm": 13.0,
|
| 10626 |
+
"learning_rate": 4.150715293169454e-06,
|
| 10627 |
+
"loss": 0.8375,
|
| 10628 |
+
"mean_token_accuracy": 0.7984327495098114,
|
| 10629 |
+
"num_tokens": 13068328.0,
|
| 10630 |
+
"step": 11800
|
| 10631 |
+
},
|
| 10632 |
+
{
|
| 10633 |
+
"epoch": 2.379609107394721,
|
| 10634 |
+
"grad_norm": 10.625,
|
| 10635 |
+
"learning_rate": 4.137282557592854e-06,
|
| 10636 |
+
"loss": 0.7661,
|
| 10637 |
+
"mean_token_accuracy": 0.8164677619934082,
|
| 10638 |
+
"num_tokens": 13079532.0,
|
| 10639 |
+
"step": 11810
|
| 10640 |
+
},
|
| 10641 |
+
{
|
| 10642 |
+
"epoch": 2.381624017731211,
|
| 10643 |
+
"grad_norm": 12.0,
|
| 10644 |
+
"learning_rate": 4.123849822016254e-06,
|
| 10645 |
+
"loss": 0.8284,
|
| 10646 |
+
"mean_token_accuracy": 0.7940251708030701,
|
| 10647 |
+
"num_tokens": 13091655.0,
|
| 10648 |
+
"step": 11820
|
| 10649 |
+
},
|
| 10650 |
+
{
|
| 10651 |
+
"epoch": 2.383638928067701,
|
| 10652 |
+
"grad_norm": 9.4375,
|
| 10653 |
+
"learning_rate": 4.110417086439653e-06,
|
| 10654 |
+
"loss": 0.778,
|
| 10655 |
+
"mean_token_accuracy": 0.8088996291160584,
|
| 10656 |
+
"num_tokens": 13103593.0,
|
| 10657 |
+
"step": 11830
|
| 10658 |
+
},
|
| 10659 |
+
{
|
| 10660 |
+
"epoch": 2.385653838404191,
|
| 10661 |
+
"grad_norm": 11.875,
|
| 10662 |
+
"learning_rate": 4.096984350863054e-06,
|
| 10663 |
+
"loss": 0.9015,
|
| 10664 |
+
"mean_token_accuracy": 0.7831744253635406,
|
| 10665 |
+
"num_tokens": 13115707.0,
|
| 10666 |
+
"step": 11840
|
| 10667 |
+
},
|
| 10668 |
+
{
|
| 10669 |
+
"epoch": 2.387668748740681,
|
| 10670 |
+
"grad_norm": 11.0625,
|
| 10671 |
+
"learning_rate": 4.0835516152864535e-06,
|
| 10672 |
+
"loss": 0.7765,
|
| 10673 |
+
"mean_token_accuracy": 0.8058106303215027,
|
| 10674 |
+
"num_tokens": 13126526.0,
|
| 10675 |
+
"step": 11850
|
| 10676 |
+
},
|
| 10677 |
+
{
|
| 10678 |
+
"epoch": 2.3896836590771713,
|
| 10679 |
+
"grad_norm": 11.4375,
|
| 10680 |
+
"learning_rate": 4.070118879709853e-06,
|
| 10681 |
+
"loss": 0.8312,
|
| 10682 |
+
"mean_token_accuracy": 0.7961088418960571,
|
| 10683 |
+
"num_tokens": 13138535.0,
|
| 10684 |
+
"step": 11860
|
| 10685 |
+
},
|
| 10686 |
+
{
|
| 10687 |
+
"epoch": 2.391698569413661,
|
| 10688 |
+
"grad_norm": 12.5625,
|
| 10689 |
+
"learning_rate": 4.056686144133254e-06,
|
| 10690 |
+
"loss": 0.7469,
|
| 10691 |
+
"mean_token_accuracy": 0.8120961427688599,
|
| 10692 |
+
"num_tokens": 13149421.0,
|
| 10693 |
+
"step": 11870
|
| 10694 |
+
},
|
| 10695 |
+
{
|
| 10696 |
+
"epoch": 2.393713479750151,
|
| 10697 |
+
"grad_norm": 12.0,
|
| 10698 |
+
"learning_rate": 4.0432534085566526e-06,
|
| 10699 |
+
"loss": 0.7109,
|
| 10700 |
+
"mean_token_accuracy": 0.8213139772415161,
|
| 10701 |
+
"num_tokens": 13159892.0,
|
| 10702 |
+
"step": 11880
|
| 10703 |
+
},
|
| 10704 |
+
{
|
| 10705 |
+
"epoch": 2.3957283900866413,
|
| 10706 |
+
"grad_norm": 11.75,
|
| 10707 |
+
"learning_rate": 4.029820672980052e-06,
|
| 10708 |
+
"loss": 0.7591,
|
| 10709 |
+
"mean_token_accuracy": 0.8050274133682251,
|
| 10710 |
+
"num_tokens": 13171470.0,
|
| 10711 |
+
"step": 11890
|
| 10712 |
+
},
|
| 10713 |
+
{
|
| 10714 |
+
"epoch": 2.397743300423131,
|
| 10715 |
+
"grad_norm": 10.8125,
|
| 10716 |
+
"learning_rate": 4.016387937403453e-06,
|
| 10717 |
+
"loss": 0.7032,
|
| 10718 |
+
"mean_token_accuracy": 0.8176105141639709,
|
| 10719 |
+
"num_tokens": 13182185.0,
|
| 10720 |
+
"step": 11900
|
| 10721 |
+
},
|
| 10722 |
+
{
|
| 10723 |
+
"epoch": 2.399758210759621,
|
| 10724 |
+
"grad_norm": 14.75,
|
| 10725 |
+
"learning_rate": 4.0029552018268524e-06,
|
| 10726 |
+
"loss": 0.8339,
|
| 10727 |
+
"mean_token_accuracy": 0.7909499406814575,
|
| 10728 |
+
"num_tokens": 13193599.0,
|
| 10729 |
+
"step": 11910
|
| 10730 |
+
},
|
| 10731 |
+
{
|
| 10732 |
+
"epoch": 2.4017731210961113,
|
| 10733 |
+
"grad_norm": 11.875,
|
| 10734 |
+
"learning_rate": 3.989522466250252e-06,
|
| 10735 |
+
"loss": 0.8541,
|
| 10736 |
+
"mean_token_accuracy": 0.7976760566234589,
|
| 10737 |
+
"num_tokens": 13204212.0,
|
| 10738 |
+
"step": 11920
|
| 10739 |
+
},
|
| 10740 |
+
{
|
| 10741 |
+
"epoch": 2.403788031432601,
|
| 10742 |
+
"grad_norm": 10.625,
|
| 10743 |
+
"learning_rate": 3.976089730673652e-06,
|
| 10744 |
+
"loss": 0.759,
|
| 10745 |
+
"mean_token_accuracy": 0.8070417881011963,
|
| 10746 |
+
"num_tokens": 13214466.0,
|
| 10747 |
+
"step": 11930
|
| 10748 |
+
},
|
| 10749 |
+
{
|
| 10750 |
+
"epoch": 2.405802941769091,
|
| 10751 |
+
"grad_norm": 11.5,
|
| 10752 |
+
"learning_rate": 3.9626569950970515e-06,
|
| 10753 |
+
"loss": 0.7853,
|
| 10754 |
+
"mean_token_accuracy": 0.8054651498794556,
|
| 10755 |
+
"num_tokens": 13226936.0,
|
| 10756 |
+
"step": 11940
|
| 10757 |
+
},
|
| 10758 |
+
{
|
| 10759 |
+
"epoch": 2.4078178521055813,
|
| 10760 |
+
"grad_norm": 11.625,
|
| 10761 |
+
"learning_rate": 3.949224259520452e-06,
|
| 10762 |
+
"loss": 1.0198,
|
| 10763 |
+
"mean_token_accuracy": 0.7604237377643586,
|
| 10764 |
+
"num_tokens": 13239461.0,
|
| 10765 |
+
"step": 11950
|
| 10766 |
+
},
|
| 10767 |
+
{
|
| 10768 |
+
"epoch": 2.4098327624420715,
|
| 10769 |
+
"grad_norm": 10.9375,
|
| 10770 |
+
"learning_rate": 3.935791523943852e-06,
|
| 10771 |
+
"loss": 0.7993,
|
| 10772 |
+
"mean_token_accuracy": 0.8001957833766937,
|
| 10773 |
+
"num_tokens": 13250850.0,
|
| 10774 |
+
"step": 11960
|
| 10775 |
+
},
|
| 10776 |
+
{
|
| 10777 |
+
"epoch": 2.4118476727785616,
|
| 10778 |
+
"grad_norm": 11.5,
|
| 10779 |
+
"learning_rate": 3.922358788367251e-06,
|
| 10780 |
+
"loss": 0.728,
|
| 10781 |
+
"mean_token_accuracy": 0.8160501599311829,
|
| 10782 |
+
"num_tokens": 13261351.0,
|
| 10783 |
+
"step": 11970
|
| 10784 |
+
},
|
| 10785 |
+
{
|
| 10786 |
+
"epoch": 2.4138625831150513,
|
| 10787 |
+
"grad_norm": 11.125,
|
| 10788 |
+
"learning_rate": 3.908926052790651e-06,
|
| 10789 |
+
"loss": 0.7994,
|
| 10790 |
+
"mean_token_accuracy": 0.8003806352615357,
|
| 10791 |
+
"num_tokens": 13272623.0,
|
| 10792 |
+
"step": 11980
|
| 10793 |
+
},
|
| 10794 |
+
{
|
| 10795 |
+
"epoch": 2.4158774934515415,
|
| 10796 |
+
"grad_norm": 10.5625,
|
| 10797 |
+
"learning_rate": 3.895493317214051e-06,
|
| 10798 |
+
"loss": 0.8576,
|
| 10799 |
+
"mean_token_accuracy": 0.7920307397842408,
|
| 10800 |
+
"num_tokens": 13284043.0,
|
| 10801 |
+
"step": 11990
|
| 10802 |
+
},
|
| 10803 |
+
{
|
| 10804 |
+
"epoch": 2.4178924037880316,
|
| 10805 |
+
"grad_norm": 11.5,
|
| 10806 |
+
"learning_rate": 3.8820605816374504e-06,
|
| 10807 |
+
"loss": 0.7156,
|
| 10808 |
+
"mean_token_accuracy": 0.8190572082996368,
|
| 10809 |
+
"num_tokens": 13294166.0,
|
| 10810 |
+
"step": 12000
|
| 10811 |
}
|
| 10812 |
],
|
| 10813 |
"logging_steps": 10,
|
|
|
|
| 10827 |
"attributes": {}
|
| 10828 |
}
|
| 10829 |
},
|
| 10830 |
+
"total_flos": 1.6084473958017024e+16,
|
| 10831 |
"train_batch_size": 8,
|
| 10832 |
"trial_name": null,
|
| 10833 |
"trial_params": null
|