Training in progress, step 11000, checkpoint
last-checkpoint/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:653a7bb4c0270ae2dd03d344965c51599b26df08817400d9611fe8bd0497aa7e
 size 536223056
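The checkpoint binaries in this commit are stored as Git LFS pointers, so each push only rewrites the pointer's oid sha256 and size lines. A minimal sketch (not part of this repo; the helper name is illustrative) for verifying a downloaded file against its pointer:

    import hashlib

    # Minimal sketch: check a downloaded checkpoint file against its
    # Git LFS pointer fields (oid sha256 + size). Paths/values are
    # taken from this commit's pointer for model.safetensors.
    def verify_lfs_pointer(file_path: str, expected_oid: str, expected_size: int) -> bool:
        sha = hashlib.sha256()
        size = 0
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):  # 1 MiB chunks
                sha.update(chunk)
                size += len(chunk)
        return sha.hexdigest() == expected_oid and size == expected_size

    print(verify_lfs_pointer(
        "last-checkpoint/model.safetensors",
        "653a7bb4c0270ae2dd03d344965c51599b26df08817400d9611fe8bd0497aa7e",
        536223056,
    ))

The same check applies to optimizer.pt and scheduler.pt below, with their respective hashes and sizes.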
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:49ab3488ed04a08a6119dd62c223dc3bd691b1d8c04575c9d55a422631b4cec4
 size 1072594443
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:a92df46ff7ec03358cd9241260e8a718523df24a66e616bac3dad8000c153e0c
 size 1465
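Together, model.safetensors (weights), optimizer.pt (optimizer state), and scheduler.pt (learning-rate scheduler state) are the pieces transformers' Trainer reads when resuming a run mid-training. A minimal resume sketch, assuming the original model and dataset are rebuilt identically; `model` and `train_dataset` below are placeholders, not names from this repo:

    from transformers import Trainer, TrainingArguments

    # Minimal resume sketch. `model` and `train_dataset` stand in for
    # whatever the original run used; they must match the original setup
    # so the restored optimizer/scheduler state lines up with the weights.
    args = TrainingArguments(
        output_dir="output",
        per_device_train_batch_size=8,  # "train_batch_size": 8 in trainer_state.json
        logging_steps=10,               # "logging_steps": 10 in trainer_state.json
    )
    trainer = Trainer(model=model, args=args, train_dataset=train_dataset)

    # Picks up at global_step 11000 recorded in this checkpoint.
    trainer.train(resume_from_checkpoint="last-checkpoint")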
last-checkpoint/trainer_state.json
CHANGED
@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 2.
+  "epoch": 2.216401370139029,
   "eval_steps": 500,
-  "global_step":
+  "global_step": 11000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -9458,6 +9458,456 @@
       "mean_token_accuracy": 0.807522964477539,
       "num_tokens": 11623915.0,
       "step": 10500
-    }
+    },
+    {
+      "epoch": 2.1176707636510175,
+      "grad_norm": 10.1875,
+      "learning_rate": 5.8835381825508775e-06,
+      "loss": 0.8048,
+      "mean_token_accuracy": 0.8046258687973022,
+      "num_tokens": 11634260.0,
+      "step": 10510
+    },
+    {
+      "epoch": 2.1196856739875076,
+      "grad_norm": 8.875,
+      "learning_rate": 5.870105446974277e-06,
+      "loss": 0.8054,
+      "mean_token_accuracy": 0.7993561148643493,
+      "num_tokens": 11644984.0,
+      "step": 10520
+    },
+    {
+      "epoch": 2.1217005843239978,
+      "grad_norm": 11.1875,
+      "learning_rate": 5.856672711397676e-06,
+      "loss": 0.7494,
+      "mean_token_accuracy": 0.8139807939529419,
+      "num_tokens": 11656539.0,
+      "step": 10530
+    },
+    {
+      "epoch": 2.1237154946604875,
+      "grad_norm": 8.5,
+      "learning_rate": 5.8432399758210765e-06,
+      "loss": 0.7947,
+      "mean_token_accuracy": 0.8058351814746857,
+      "num_tokens": 11667616.0,
+      "step": 10540
+    },
+    {
+      "epoch": 2.1257304049969776,
+      "grad_norm": 13.625,
+      "learning_rate": 5.829807240244476e-06,
+      "loss": 0.7885,
+      "mean_token_accuracy": 0.8079341351985931,
+      "num_tokens": 11678470.0,
+      "step": 10550
+    },
+    {
+      "epoch": 2.127745315333468,
+      "grad_norm": 10.25,
+      "learning_rate": 5.816374504667877e-06,
+      "loss": 0.733,
+      "mean_token_accuracy": 0.8124136865139008,
+      "num_tokens": 11690615.0,
+      "step": 10560
+    },
+    {
+      "epoch": 2.1297602256699575,
+      "grad_norm": 12.9375,
+      "learning_rate": 5.802941769091276e-06,
+      "loss": 0.8069,
+      "mean_token_accuracy": 0.7959172546863555,
+      "num_tokens": 11700806.0,
+      "step": 10570
+    },
+    {
+      "epoch": 2.1317751360064476,
+      "grad_norm": 12.125,
+      "learning_rate": 5.789509033514675e-06,
+      "loss": 0.7666,
+      "mean_token_accuracy": 0.8081447660923005,
+      "num_tokens": 11712546.0,
+      "step": 10580
+    },
+    {
+      "epoch": 2.133790046342938,
+      "grad_norm": 14.8125,
+      "learning_rate": 5.776076297938076e-06,
+      "loss": 0.7609,
+      "mean_token_accuracy": 0.8059535026550293,
+      "num_tokens": 11722798.0,
+      "step": 10590
+    },
+    {
+      "epoch": 2.135804956679428,
+      "grad_norm": 12.0,
+      "learning_rate": 5.7626435623614755e-06,
+      "loss": 0.8261,
+      "mean_token_accuracy": 0.7918490886688232,
+      "num_tokens": 11733979.0,
+      "step": 10600
+    },
+    {
+      "epoch": 2.1378198670159176,
+      "grad_norm": 10.6875,
+      "learning_rate": 5.749210826784876e-06,
+      "loss": 0.8653,
+      "mean_token_accuracy": 0.7918577075004578,
+      "num_tokens": 11746313.0,
+      "step": 10610
+    },
+    {
+      "epoch": 2.139834777352408,
+      "grad_norm": 13.8125,
+      "learning_rate": 5.735778091208275e-06,
+      "loss": 0.7597,
+      "mean_token_accuracy": 0.8129185199737549,
+      "num_tokens": 11756847.0,
+      "step": 10620
+    },
+    {
+      "epoch": 2.141849687688898,
+      "grad_norm": 11.3125,
+      "learning_rate": 5.7223453556316745e-06,
+      "loss": 0.8895,
+      "mean_token_accuracy": 0.7831692516803741,
+      "num_tokens": 11768092.0,
+      "step": 10630
+    },
+    {
+      "epoch": 2.143864598025388,
+      "grad_norm": 9.75,
+      "learning_rate": 5.708912620055075e-06,
+      "loss": 0.8293,
+      "mean_token_accuracy": 0.7959823906421661,
+      "num_tokens": 11779092.0,
+      "step": 10640
+    },
+    {
+      "epoch": 2.145879508361878,
+      "grad_norm": 13.625,
+      "learning_rate": 5.695479884478474e-06,
+      "loss": 0.7806,
+      "mean_token_accuracy": 0.8032085597515106,
+      "num_tokens": 11789301.0,
+      "step": 10650
+    },
+    {
+      "epoch": 2.147894418698368,
+      "grad_norm": 10.25,
+      "learning_rate": 5.6820471489018744e-06,
+      "loss": 0.7285,
+      "mean_token_accuracy": 0.8135238766670227,
+      "num_tokens": 11799827.0,
+      "step": 10660
+    },
+    {
+      "epoch": 2.149909329034858,
+      "grad_norm": 11.75,
+      "learning_rate": 5.668614413325274e-06,
+      "loss": 0.8109,
+      "mean_token_accuracy": 0.8031542479991913,
+      "num_tokens": 11810095.0,
+      "step": 10670
+    },
+    {
+      "epoch": 2.151924239371348,
+      "grad_norm": 10.9375,
+      "learning_rate": 5.655181677748674e-06,
+      "loss": 0.8062,
+      "mean_token_accuracy": 0.7998530924320221,
+      "num_tokens": 11821701.0,
+      "step": 10680
+    },
+    {
+      "epoch": 2.153939149707838,
+      "grad_norm": 16.25,
+      "learning_rate": 5.641748942172074e-06,
+      "loss": 0.7909,
+      "mean_token_accuracy": 0.8020996809005737,
+      "num_tokens": 11833622.0,
+      "step": 10690
+    },
+    {
+      "epoch": 2.155954060044328,
+      "grad_norm": 11.6875,
+      "learning_rate": 5.628316206595473e-06,
+      "loss": 0.8753,
+      "mean_token_accuracy": 0.7875764667987823,
+      "num_tokens": 11844025.0,
+      "step": 10700
+    },
+    {
+      "epoch": 2.1579689703808183,
+      "grad_norm": 15.1875,
+      "learning_rate": 5.614883471018874e-06,
+      "loss": 0.8975,
+      "mean_token_accuracy": 0.7894319653511047,
+      "num_tokens": 11855329.0,
+      "step": 10710
+    },
+    {
+      "epoch": 2.159983880717308,
+      "grad_norm": 12.1875,
+      "learning_rate": 5.601450735442273e-06,
+      "loss": 0.847,
+      "mean_token_accuracy": 0.7901701211929322,
+      "num_tokens": 11866697.0,
+      "step": 10720
+    },
+    {
+      "epoch": 2.161998791053798,
+      "grad_norm": 12.3125,
+      "learning_rate": 5.588017999865674e-06,
+      "loss": 0.8007,
+      "mean_token_accuracy": 0.805288553237915,
+      "num_tokens": 11877358.0,
+      "step": 10730
+    },
+    {
+      "epoch": 2.1640137013902883,
+      "grad_norm": 11.375,
+      "learning_rate": 5.574585264289073e-06,
+      "loss": 0.8334,
+      "mean_token_accuracy": 0.8021558821201324,
+      "num_tokens": 11888098.0,
+      "step": 10740
+    },
+    {
+      "epoch": 2.166028611726778,
+      "grad_norm": 10.1875,
+      "learning_rate": 5.561152528712472e-06,
+      "loss": 0.7298,
+      "mean_token_accuracy": 0.8173341572284698,
+      "num_tokens": 11900343.0,
+      "step": 10750
+    },
+    {
+      "epoch": 2.168043522063268,
+      "grad_norm": 11.875,
+      "learning_rate": 5.547719793135873e-06,
+      "loss": 0.7146,
+      "mean_token_accuracy": 0.8224671244621277,
+      "num_tokens": 11911403.0,
+      "step": 10760
+    },
+    {
+      "epoch": 2.1700584323997583,
+      "grad_norm": 12.125,
+      "learning_rate": 5.534287057559273e-06,
+      "loss": 0.8245,
+      "mean_token_accuracy": 0.7936823606491089,
+      "num_tokens": 11922991.0,
+      "step": 10770
+    },
+    {
+      "epoch": 2.1720733427362484,
+      "grad_norm": 10.9375,
+      "learning_rate": 5.520854321982672e-06,
+      "loss": 0.8443,
+      "mean_token_accuracy": 0.788495534658432,
+      "num_tokens": 11934105.0,
+      "step": 10780
+    },
+    {
+      "epoch": 2.174088253072738,
+      "grad_norm": 14.3125,
+      "learning_rate": 5.507421586406072e-06,
+      "loss": 0.8389,
+      "mean_token_accuracy": 0.7919258952140809,
+      "num_tokens": 11944878.0,
+      "step": 10790
+    },
+    {
+      "epoch": 2.1761031634092283,
+      "grad_norm": 10.8125,
+      "learning_rate": 5.493988850829472e-06,
+      "loss": 0.8987,
+      "mean_token_accuracy": 0.7812518179416656,
+      "num_tokens": 11956600.0,
+      "step": 10800
+    },
+    {
+      "epoch": 2.1781180737457184,
+      "grad_norm": 11.625,
+      "learning_rate": 5.480556115252872e-06,
+      "loss": 0.8744,
+      "mean_token_accuracy": 0.7875288486480713,
+      "num_tokens": 11966645.0,
+      "step": 10810
+    },
+    {
+      "epoch": 2.180132984082208,
+      "grad_norm": 11.1875,
+      "learning_rate": 5.467123379676271e-06,
+      "loss": 0.7598,
+      "mean_token_accuracy": 0.8071795523166656,
+      "num_tokens": 11977516.0,
+      "step": 10820
+    },
+    {
+      "epoch": 2.1821478944186983,
+      "grad_norm": 11.125,
+      "learning_rate": 5.4536906440996716e-06,
+      "loss": 0.7946,
+      "mean_token_accuracy": 0.7999853491783142,
+      "num_tokens": 11987823.0,
+      "step": 10830
+    },
+    {
+      "epoch": 2.1841628047551884,
+      "grad_norm": 9.9375,
+      "learning_rate": 5.440257908523071e-06,
+      "loss": 0.7951,
+      "mean_token_accuracy": 0.8064453899860382,
+      "num_tokens": 11999675.0,
+      "step": 10840
+    },
+    {
+      "epoch": 2.1861777150916786,
+      "grad_norm": 10.0,
+      "learning_rate": 5.42682517294647e-06,
+      "loss": 0.8071,
+      "mean_token_accuracy": 0.7993614792823791,
+      "num_tokens": 12010690.0,
+      "step": 10850
+    },
+    {
+      "epoch": 2.1881926254281683,
+      "grad_norm": 11.625,
+      "learning_rate": 5.413392437369871e-06,
+      "loss": 0.8318,
+      "mean_token_accuracy": 0.7873802423477173,
+      "num_tokens": 12021657.0,
+      "step": 10860
+    },
+    {
+      "epoch": 2.1902075357646584,
+      "grad_norm": 10.9375,
+      "learning_rate": 5.39995970179327e-06,
+      "loss": 0.8989,
+      "mean_token_accuracy": 0.7851345241069794,
+      "num_tokens": 12033302.0,
+      "step": 10870
+    },
+    {
+      "epoch": 2.1922224461011486,
+      "grad_norm": 10.9375,
+      "learning_rate": 5.386526966216671e-06,
+      "loss": 0.7589,
+      "mean_token_accuracy": 0.805691534280777,
+      "num_tokens": 12043229.0,
+      "step": 10880
+    },
+    {
+      "epoch": 2.1942373564376387,
+      "grad_norm": 11.9375,
+      "learning_rate": 5.3730942306400705e-06,
+      "loss": 0.8026,
+      "mean_token_accuracy": 0.8073262214660645,
+      "num_tokens": 12052950.0,
+      "step": 10890
+    },
+    {
+      "epoch": 2.1962522667741284,
+      "grad_norm": 11.0,
+      "learning_rate": 5.359661495063471e-06,
+      "loss": 0.8301,
+      "mean_token_accuracy": 0.7973912358283997,
+      "num_tokens": 12063314.0,
+      "step": 10900
+    },
+    {
+      "epoch": 2.1982671771106186,
+      "grad_norm": 11.5625,
+      "learning_rate": 5.34622875948687e-06,
+      "loss": 0.7227,
+      "mean_token_accuracy": 0.8158142805099488,
+      "num_tokens": 12074240.0,
+      "step": 10910
+    },
+    {
+      "epoch": 2.2002820874471087,
+      "grad_norm": 10.9375,
+      "learning_rate": 5.3327960239102695e-06,
+      "loss": 0.7952,
+      "mean_token_accuracy": 0.8007366359233856,
+      "num_tokens": 12086925.0,
+      "step": 10920
+    },
+    {
+      "epoch": 2.2022969977835984,
+      "grad_norm": 12.6875,
+      "learning_rate": 5.31936328833367e-06,
+      "loss": 0.9782,
+      "mean_token_accuracy": 0.7690042972564697,
+      "num_tokens": 12098664.0,
+      "step": 10930
+    },
+    {
+      "epoch": 2.2043119081200886,
+      "grad_norm": 10.375,
+      "learning_rate": 5.305930552757069e-06,
+      "loss": 0.8007,
+      "mean_token_accuracy": 0.8038599193096161,
+      "num_tokens": 12109170.0,
+      "step": 10940
+    },
+    {
+      "epoch": 2.2063268184565787,
+      "grad_norm": 11.5,
+      "learning_rate": 5.2924978171804694e-06,
+      "loss": 0.752,
+      "mean_token_accuracy": 0.8126667857170105,
+      "num_tokens": 12120379.0,
+      "step": 10950
+    },
+    {
+      "epoch": 2.208341728793069,
+      "grad_norm": 10.0625,
+      "learning_rate": 5.279065081603869e-06,
+      "loss": 0.8548,
+      "mean_token_accuracy": 0.793574595451355,
+      "num_tokens": 12132062.0,
+      "step": 10960
+    },
+    {
+      "epoch": 2.2103566391295586,
+      "grad_norm": 12.1875,
+      "learning_rate": 5.265632346027269e-06,
+      "loss": 0.7465,
+      "mean_token_accuracy": 0.8148365259170532,
+      "num_tokens": 12143111.0,
+      "step": 10970
+    },
+    {
+      "epoch": 2.2123715494660487,
+      "grad_norm": 11.3125,
+      "learning_rate": 5.252199610450669e-06,
+      "loss": 0.822,
+      "mean_token_accuracy": 0.7944930195808411,
+      "num_tokens": 12155572.0,
+      "step": 10980
+    },
+    {
+      "epoch": 2.214386459802539,
+      "grad_norm": 10.9375,
+      "learning_rate": 5.238766874874068e-06,
+      "loss": 0.9171,
+      "mean_token_accuracy": 0.7774474084377289,
+      "num_tokens": 12167871.0,
+      "step": 10990
+    },
+    {
+      "epoch": 2.216401370139029,
+      "grad_norm": 12.9375,
+      "learning_rate": 5.225334139297469e-06,
+      "loss": 0.9178,
+      "mean_token_accuracy": 0.7765659749507904,
+      "num_tokens": 12178091.0,
+      "step": 11000
+    }
   ],
   "logging_steps": 10,
@@ -9477,7 +9927,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.
+  "total_flos": 1.4726200960407552e+16,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null
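The diff above appends 50 new log_history entries (steps 10510 through 11000, logged every 10 steps). A minimal sketch, assuming the path from this commit, for summarizing that window directly from the checkpoint's trainer_state.json:

    import json

    # Minimal sketch: summarize the newly appended log window from this
    # checkpoint's trainer_state.json.
    with open("last-checkpoint/trainer_state.json") as f:
        state = json.load(f)

    window = [e for e in state["log_history"] if 10500 < e.get("step", 0) <= 11000]
    losses = [e["loss"] for e in window if "loss" in e]
    accs = [e["mean_token_accuracy"] for e in window if "mean_token_accuracy" in e]

    print(f"entries: {len(window)}")  # 50 for this commit
    print(f"mean loss: {sum(losses) / len(losses):.4f}")
    print(f"mean token accuracy: {sum(accs) / len(accs):.4f}")

    # Sanity check: global_step / epoch gives optimizer steps per epoch
    # (11000 / 2.216401370139029 comes out to ~4963 here).
    print(state["global_step"] / state["epoch"])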