Training in progress, step 14500, checkpoint
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:87d564460f84baac9ace9dc44cd612f3da4c9738f97e9806a8457bb9462e95db
 size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:5ace8d39e9d75867a54c7c346772698f7c6e42165925320fb3b2367daa7c674e
 size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:8e3f275449dfbc8efc7d2d2f06d134c7b39e55b8e539f36e09b007c731c81c65
 size 1465
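The three binary checkpoint files above are tracked with Git LFS, so the commit only rewrites their pointer files: a `version` line, the `oid sha256:` of the blob, and its `size` in bytes (the pre-commit hashes are truncated in this view). A minimal sketch, assuming the pointer files are checked out locally as shown, of parsing one:

```python
# Minimal sketch: parse a Git LFS pointer file (version / oid / size).
# The path is illustrative; any of the three pointer files above works.
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

ptr = parse_lfs_pointer("last-checkpoint/scheduler.pt")
print(ptr["oid"])   # "sha256:8e3f2754..."
print(ptr["size"])  # "1465"
```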
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 2.
+  "epoch": 2.921619987910538,
   "eval_steps": 500,
-  "global_step":
+  "global_step": 14500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -12608,6 +12608,456 @@
       "mean_token_accuracy": 0.7995685517787934,
       "num_tokens": 15509702.0,
       "step": 14000
+    },
+    {
+      "epoch": 2.8228893814225264,
+      "grad_norm": 12.5625,
+      "learning_rate": 1.1820807307408155e-06,
+      "loss": 0.7108,
+      "mean_token_accuracy": 0.8189300537109375,
+      "num_tokens": 15519975.0,
+      "step": 14010
+    },
+    {
+      "epoch": 2.8249042917590166,
+      "grad_norm": 12.75,
+      "learning_rate": 1.1686479951642154e-06,
+      "loss": 0.817,
+      "mean_token_accuracy": 0.8028945684432983,
+      "num_tokens": 15530042.0,
+      "step": 14020
+    },
+    {
+      "epoch": 2.8269192020955067,
+      "grad_norm": 11.0,
+      "learning_rate": 1.155215259587615e-06,
+      "loss": 0.8255,
+      "mean_token_accuracy": 0.7976077675819397,
+      "num_tokens": 15540964.0,
+      "step": 14030
+    },
+    {
+      "epoch": 2.828934112431997,
+      "grad_norm": 11.0625,
+      "learning_rate": 1.141782524011015e-06,
+      "loss": 0.8116,
+      "mean_token_accuracy": 0.7950972735881805,
+      "num_tokens": 15551879.0,
+      "step": 14040
+    },
+    {
+      "epoch": 2.830949022768487,
+      "grad_norm": 15.5625,
+      "learning_rate": 1.1283497884344149e-06,
+      "loss": 0.8752,
+      "mean_token_accuracy": 0.7869309186935425,
+      "num_tokens": 15563299.0,
+      "step": 14050
+    },
+    {
+      "epoch": 2.8329639331049767,
+      "grad_norm": 13.1875,
+      "learning_rate": 1.1149170528578145e-06,
+      "loss": 0.842,
+      "mean_token_accuracy": 0.7949115037918091,
+      "num_tokens": 15573972.0,
+      "step": 14060
+    },
+    {
+      "epoch": 2.834978843441467,
+      "grad_norm": 12.5,
+      "learning_rate": 1.1014843172812144e-06,
+      "loss": 0.9261,
+      "mean_token_accuracy": 0.778299605846405,
+      "num_tokens": 15587253.0,
+      "step": 14070
+    },
+    {
+      "epoch": 2.836993753777957,
+      "grad_norm": 11.5625,
+      "learning_rate": 1.0880515817046141e-06,
+      "loss": 0.8781,
+      "mean_token_accuracy": 0.7811039209365844,
+      "num_tokens": 15598147.0,
+      "step": 14080
+    },
+    {
+      "epoch": 2.8390086641144467,
+      "grad_norm": 14.375,
+      "learning_rate": 1.074618846128014e-06,
+      "loss": 0.887,
+      "mean_token_accuracy": 0.7856419622898102,
+      "num_tokens": 15609722.0,
+      "step": 14090
+    },
+    {
+      "epoch": 2.841023574450937,
+      "grad_norm": 14.1875,
+      "learning_rate": 1.061186110551414e-06,
+      "loss": 0.8371,
+      "mean_token_accuracy": 0.7899275839328765,
+      "num_tokens": 15620950.0,
+      "step": 14100
+    },
+    {
+      "epoch": 2.843038484787427,
+      "grad_norm": 13.625,
+      "learning_rate": 1.0477533749748136e-06,
+      "loss": 0.8213,
+      "mean_token_accuracy": 0.8016888916492462,
+      "num_tokens": 15632384.0,
+      "step": 14110
+    },
+    {
+      "epoch": 2.8450533951239168,
+      "grad_norm": 10.1875,
+      "learning_rate": 1.0343206393982135e-06,
+      "loss": 0.8916,
+      "mean_token_accuracy": 0.781292325258255,
+      "num_tokens": 15643502.0,
+      "step": 14120
+    },
+    {
+      "epoch": 2.847068305460407,
+      "grad_norm": 10.75,
+      "learning_rate": 1.0208879038216134e-06,
+      "loss": 0.8747,
+      "mean_token_accuracy": 0.7830813884735107,
+      "num_tokens": 15655219.0,
+      "step": 14130
+    },
+    {
+      "epoch": 2.849083215796897,
+      "grad_norm": 11.875,
+      "learning_rate": 1.007455168245013e-06,
+      "loss": 0.8008,
+      "mean_token_accuracy": 0.8025586724281311,
+      "num_tokens": 15665158.0,
+      "step": 14140
+    },
+    {
+      "epoch": 2.851098126133387,
+      "grad_norm": 10.625,
+      "learning_rate": 9.94022432668413e-07,
+      "loss": 0.8352,
+      "mean_token_accuracy": 0.7977402985095978,
+      "num_tokens": 15677733.0,
+      "step": 14150
+    },
+    {
+      "epoch": 2.8531130364698774,
+      "grad_norm": 10.5625,
+      "learning_rate": 9.805896970918128e-07,
+      "loss": 0.8434,
+      "mean_token_accuracy": 0.7951288640499115,
+      "num_tokens": 15688430.0,
+      "step": 14160
+    },
+    {
+      "epoch": 2.855127946806367,
+      "grad_norm": 11.75,
+      "learning_rate": 9.671569615152127e-07,
+      "loss": 0.8963,
+      "mean_token_accuracy": 0.7789243698120117,
+      "num_tokens": 15700376.0,
+      "step": 14170
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 13.0,
+      "learning_rate": 9.537242259386124e-07,
+      "loss": 0.8915,
+      "mean_token_accuracy": 0.7802317202091217,
+      "num_tokens": 15710761.0,
+      "step": 14180
+    },
+    {
+      "epoch": 2.8591577674793474,
+      "grad_norm": 12.125,
+      "learning_rate": 9.402914903620123e-07,
+      "loss": 0.6912,
+      "mean_token_accuracy": 0.8160697996616364,
+      "num_tokens": 15721644.0,
+      "step": 14190
+    },
+    {
+      "epoch": 2.861172677815837,
+      "grad_norm": 12.0,
+      "learning_rate": 9.268587547854121e-07,
+      "loss": 0.7238,
+      "mean_token_accuracy": 0.8155933260917664,
+      "num_tokens": 15732607.0,
+      "step": 14200
+    },
+    {
+      "epoch": 2.863187588152327,
+      "grad_norm": 9.125,
+      "learning_rate": 9.134260192088119e-07,
+      "loss": 0.8317,
+      "mean_token_accuracy": 0.7980758368968963,
+      "num_tokens": 15745252.0,
+      "step": 14210
+    },
+    {
+      "epoch": 2.8652024984888174,
+      "grad_norm": 11.0625,
+      "learning_rate": 8.999932836322117e-07,
+      "loss": 0.7692,
+      "mean_token_accuracy": 0.812389326095581,
+      "num_tokens": 15756570.0,
+      "step": 14220
+    },
+    {
+      "epoch": 2.867217408825307,
+      "grad_norm": 12.0,
+      "learning_rate": 8.865605480556117e-07,
+      "loss": 0.807,
+      "mean_token_accuracy": 0.8013573944568634,
+      "num_tokens": 15768196.0,
+      "step": 14230
+    },
+    {
+      "epoch": 2.869232319161797,
+      "grad_norm": 10.1875,
+      "learning_rate": 8.731278124790115e-07,
+      "loss": 0.8102,
+      "mean_token_accuracy": 0.7977238118648529,
+      "num_tokens": 15780108.0,
+      "step": 14240
+    },
+    {
+      "epoch": 2.8712472294982874,
+      "grad_norm": 10.75,
+      "learning_rate": 8.596950769024113e-07,
+      "loss": 0.7232,
+      "mean_token_accuracy": 0.8186571359634399,
+      "num_tokens": 15790323.0,
+      "step": 14250
+    },
+    {
+      "epoch": 2.8732621398347775,
+      "grad_norm": 10.75,
+      "learning_rate": 8.46262341325811e-07,
+      "loss": 0.7311,
+      "mean_token_accuracy": 0.8196884751319885,
+      "num_tokens": 15801035.0,
+      "step": 14260
+    },
+    {
+      "epoch": 2.8752770501712672,
+      "grad_norm": 12.5625,
+      "learning_rate": 8.328296057492109e-07,
+      "loss": 0.9671,
+      "mean_token_accuracy": 0.7726804137229919,
+      "num_tokens": 15812082.0,
+      "step": 14270
+    },
+    {
+      "epoch": 2.8772919605077574,
+      "grad_norm": 13.75,
+      "learning_rate": 8.193968701726107e-07,
+      "loss": 0.7606,
+      "mean_token_accuracy": 0.8072145521640778,
+      "num_tokens": 15822853.0,
+      "step": 14280
+    },
+    {
+      "epoch": 2.8793068708442475,
+      "grad_norm": 14.75,
+      "learning_rate": 8.059641345960105e-07,
+      "loss": 0.8093,
+      "mean_token_accuracy": 0.8010785162448884,
+      "num_tokens": 15832947.0,
+      "step": 14290
+    },
+    {
+      "epoch": 2.8813217811807377,
+      "grad_norm": 11.5,
+      "learning_rate": 7.925313990194104e-07,
+      "loss": 0.8572,
+      "mean_token_accuracy": 0.7934750914573669,
+      "num_tokens": 15843708.0,
+      "step": 14300
+    },
+    {
+      "epoch": 2.8833366915172274,
+      "grad_norm": 10.625,
+      "learning_rate": 7.790986634428102e-07,
+      "loss": 0.7406,
+      "mean_token_accuracy": 0.813157856464386,
+      "num_tokens": 15855097.0,
+      "step": 14310
+    },
+    {
+      "epoch": 2.8853516018537175,
+      "grad_norm": 13.875,
+      "learning_rate": 7.6566592786621e-07,
+      "loss": 0.8571,
+      "mean_token_accuracy": 0.7906988859176636,
+      "num_tokens": 15866641.0,
+      "step": 14320
+    },
+    {
+      "epoch": 2.8873665121902077,
+      "grad_norm": 12.0625,
+      "learning_rate": 7.522331922896098e-07,
+      "loss": 0.7257,
+      "mean_token_accuracy": 0.815925520658493,
+      "num_tokens": 15877191.0,
+      "step": 14330
+    },
+    {
+      "epoch": 2.8893814225266974,
+      "grad_norm": 10.6875,
+      "learning_rate": 7.388004567130097e-07,
+      "loss": 0.8654,
+      "mean_token_accuracy": 0.7846165299415588,
+      "num_tokens": 15888129.0,
+      "step": 14340
+    },
+    {
+      "epoch": 2.8913963328631875,
+      "grad_norm": 11.625,
+      "learning_rate": 7.253677211364094e-07,
+      "loss": 0.7777,
+      "mean_token_accuracy": 0.807235324382782,
+      "num_tokens": 15899237.0,
+      "step": 14350
+    },
+    {
+      "epoch": 2.8934112431996777,
+      "grad_norm": 14.625,
+      "learning_rate": 7.119349855598092e-07,
+      "loss": 0.769,
+      "mean_token_accuracy": 0.8052566349506378,
+      "num_tokens": 15910090.0,
+      "step": 14360
+    },
+    {
+      "epoch": 2.8954261535361674,
+      "grad_norm": 9.5625,
+      "learning_rate": 6.985022499832092e-07,
+      "loss": 0.7232,
+      "mean_token_accuracy": 0.821067851781845,
+      "num_tokens": 15920709.0,
+      "step": 14370
+    },
+    {
+      "epoch": 2.8974410638726575,
+      "grad_norm": 11.9375,
+      "learning_rate": 6.85069514406609e-07,
+      "loss": 0.7402,
+      "mean_token_accuracy": 0.8163648307323456,
+      "num_tokens": 15933274.0,
+      "step": 14380
+    },
+    {
+      "epoch": 2.8994559742091477,
+      "grad_norm": 13.75,
+      "learning_rate": 6.716367788300088e-07,
+      "loss": 0.8013,
+      "mean_token_accuracy": 0.8017265141010285,
+      "num_tokens": 15943313.0,
+      "step": 14390
+    },
+    {
+      "epoch": 2.901470884545638,
+      "grad_norm": 13.25,
+      "learning_rate": 6.582040432534086e-07,
+      "loss": 0.8565,
+      "mean_token_accuracy": 0.786570030450821,
+      "num_tokens": 15952883.0,
+      "step": 14400
+    },
+    {
+      "epoch": 2.903485794882128,
+      "grad_norm": 14.5,
+      "learning_rate": 6.447713076768085e-07,
+      "loss": 0.7816,
+      "mean_token_accuracy": 0.8096172749996186,
+      "num_tokens": 15964351.0,
+      "step": 14410
+    },
+    {
+      "epoch": 2.9055007052186177,
+      "grad_norm": 11.8125,
+      "learning_rate": 6.313385721002083e-07,
+      "loss": 0.8196,
+      "mean_token_accuracy": 0.7991693377494812,
+      "num_tokens": 15975245.0,
+      "step": 14420
+    },
+    {
+      "epoch": 2.907515615555108,
+      "grad_norm": 11.875,
+      "learning_rate": 6.179058365236081e-07,
+      "loss": 0.7624,
+      "mean_token_accuracy": 0.8095822989940643,
+      "num_tokens": 15986457.0,
+      "step": 14430
+    },
+    {
+      "epoch": 2.909530525891598,
+      "grad_norm": 11.125,
+      "learning_rate": 6.04473100947008e-07,
+      "loss": 0.7871,
+      "mean_token_accuracy": 0.8019815146923065,
+      "num_tokens": 15997870.0,
+      "step": 14440
+    },
+    {
+      "epoch": 2.9115454362280877,
+      "grad_norm": 12.6875,
+      "learning_rate": 5.910403653704078e-07,
+      "loss": 0.7562,
+      "mean_token_accuracy": 0.8092824459075928,
+      "num_tokens": 16008778.0,
+      "step": 14450
+    },
+    {
+      "epoch": 2.913560346564578,
+      "grad_norm": 10.4375,
+      "learning_rate": 5.776076297938075e-07,
+      "loss": 0.7719,
+      "mean_token_accuracy": 0.8073769569396972,
+      "num_tokens": 16020048.0,
+      "step": 14460
+    },
+    {
+      "epoch": 2.915575256901068,
+      "grad_norm": 11.5,
+      "learning_rate": 5.641748942172074e-07,
+      "loss": 0.8207,
+      "mean_token_accuracy": 0.7947039902210236,
+      "num_tokens": 16032290.0,
+      "step": 14470
+    },
+    {
+      "epoch": 2.9175901672375577,
+      "grad_norm": 13.75,
+      "learning_rate": 5.507421586406072e-07,
+      "loss": 0.7469,
+      "mean_token_accuracy": 0.8104640543460846,
+      "num_tokens": 16043678.0,
+      "step": 14480
+    },
+    {
+      "epoch": 2.919605077574048,
+      "grad_norm": 11.375,
+      "learning_rate": 5.37309423064007e-07,
+      "loss": 0.9164,
+      "mean_token_accuracy": 0.7859593093395233,
+      "num_tokens": 16055184.0,
+      "step": 14490
+    },
+    {
+      "epoch": 2.921619987910538,
+      "grad_norm": 15.125,
+      "learning_rate": 5.238766874874068e-07,
+      "loss": 0.8604,
+      "mean_token_accuracy": 0.7894056618213654,
+      "num_tokens": 16065206.0,
+      "step": 14500
+    }
   ],
   "logging_steps": 10,
@@ -12627,7 +13077,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.
+  "total_flos": 1.9417933454309376e+16,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null
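For quick inspection, the updated trainer_state.json can be loaded directly. A minimal sketch, assuming the checkpoint directory is available locally, that summarizes the fifty log entries this commit appends (steps 14010 through 14500):

```python
import json

# Minimal sketch: summarize the log_history window added in this commit.
with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

print(state["global_step"], state["epoch"])  # 14500 2.921619987910538

window = [e for e in state["log_history"] if 14000 < e.get("step", 0) <= 14500]
mean_loss = sum(e["loss"] for e in window) / len(window)
print(f"{len(window)} entries, mean loss {mean_loss:.4f}, "
      f"final lr {window[-1]['learning_rate']:.3e}")
```

Training can then be resumed from this state with a `transformers.Trainer`, e.g. `trainer.train(resume_from_checkpoint="last-checkpoint")`.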