Training in progress, step 13500, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 536223056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d5a9bd42305a39ea10e14897e10ee483294601df6c8b6bb20eb9acc7de3a5b74
|
| 3 |
size 536223056
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1072594443
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1fd3300583dc98302b4bc1805b201303b140f489f169bc005adefa8fde0fce38
|
| 3 |
size 1072594443
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5ce5bfd25fb939a324385a4adfd5b1d29fedc6793352a13b276f53eccc661d15
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 2.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -11708,6 +11708,456 @@
|
|
| 11708 |
"mean_token_accuracy": 0.7685989677906037,
|
| 11709 |
"num_tokens": 14393395.0,
|
| 11710 |
"step": 13000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11711 |
}
|
| 11712 |
],
|
| 11713 |
"logging_steps": 10,
|
|
@@ -11727,7 +12177,7 @@
|
|
| 11727 |
"attributes": {}
|
| 11728 |
}
|
| 11729 |
},
|
| 11730 |
-
"total_flos": 1.
|
| 11731 |
"train_batch_size": 8,
|
| 11732 |
"trial_name": null,
|
| 11733 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.7201289542615354,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 13500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 11708 |
"mean_token_accuracy": 0.7685989677906037,
|
| 11709 |
"num_tokens": 14393395.0,
|
| 11710 |
"step": 13000
|
| 11711 |
+
},
|
| 11712 |
+
{
|
| 11713 |
+
"epoch": 2.621398347773524,
|
| 11714 |
+
"grad_norm": 10.5,
|
| 11715 |
+
"learning_rate": 2.525354288400833e-06,
|
| 11716 |
+
"loss": 0.7726,
|
| 11717 |
+
"mean_token_accuracy": 0.8112367451190948,
|
| 11718 |
+
"num_tokens": 14405357.0,
|
| 11719 |
+
"step": 13010
|
| 11720 |
+
},
|
| 11721 |
+
{
|
| 11722 |
+
"epoch": 2.623413258110014,
|
| 11723 |
+
"grad_norm": 10.5,
|
| 11724 |
+
"learning_rate": 2.511921552824233e-06,
|
| 11725 |
+
"loss": 0.8883,
|
| 11726 |
+
"mean_token_accuracy": 0.7785914719104767,
|
| 11727 |
+
"num_tokens": 14416733.0,
|
| 11728 |
+
"step": 13020
|
| 11729 |
+
},
|
| 11730 |
+
{
|
| 11731 |
+
"epoch": 2.625428168446504,
|
| 11732 |
+
"grad_norm": 11.0625,
|
| 11733 |
+
"learning_rate": 2.4984888172476325e-06,
|
| 11734 |
+
"loss": 0.8215,
|
| 11735 |
+
"mean_token_accuracy": 0.7950002431869507,
|
| 11736 |
+
"num_tokens": 14428394.0,
|
| 11737 |
+
"step": 13030
|
| 11738 |
+
},
|
| 11739 |
+
{
|
| 11740 |
+
"epoch": 2.627443078782994,
|
| 11741 |
+
"grad_norm": 11.75,
|
| 11742 |
+
"learning_rate": 2.4850560816710326e-06,
|
| 11743 |
+
"loss": 0.7974,
|
| 11744 |
+
"mean_token_accuracy": 0.8010411977767944,
|
| 11745 |
+
"num_tokens": 14439149.0,
|
| 11746 |
+
"step": 13040
|
| 11747 |
+
},
|
| 11748 |
+
{
|
| 11749 |
+
"epoch": 2.629457989119484,
|
| 11750 |
+
"grad_norm": 11.75,
|
| 11751 |
+
"learning_rate": 2.4716233460944323e-06,
|
| 11752 |
+
"loss": 0.7182,
|
| 11753 |
+
"mean_token_accuracy": 0.8175202190876008,
|
| 11754 |
+
"num_tokens": 14449655.0,
|
| 11755 |
+
"step": 13050
|
| 11756 |
+
},
|
| 11757 |
+
{
|
| 11758 |
+
"epoch": 2.631472899455974,
|
| 11759 |
+
"grad_norm": 10.625,
|
| 11760 |
+
"learning_rate": 2.458190610517832e-06,
|
| 11761 |
+
"loss": 0.7572,
|
| 11762 |
+
"mean_token_accuracy": 0.8045202255249023,
|
| 11763 |
+
"num_tokens": 14459975.0,
|
| 11764 |
+
"step": 13060
|
| 11765 |
+
},
|
| 11766 |
+
{
|
| 11767 |
+
"epoch": 2.6334878097924643,
|
| 11768 |
+
"grad_norm": 13.0,
|
| 11769 |
+
"learning_rate": 2.444757874941232e-06,
|
| 11770 |
+
"loss": 0.7044,
|
| 11771 |
+
"mean_token_accuracy": 0.81912060379982,
|
| 11772 |
+
"num_tokens": 14470362.0,
|
| 11773 |
+
"step": 13070
|
| 11774 |
+
},
|
| 11775 |
+
{
|
| 11776 |
+
"epoch": 2.6355027201289545,
|
| 11777 |
+
"grad_norm": 10.25,
|
| 11778 |
+
"learning_rate": 2.4313251393646317e-06,
|
| 11779 |
+
"loss": 0.8873,
|
| 11780 |
+
"mean_token_accuracy": 0.7796810269355774,
|
| 11781 |
+
"num_tokens": 14482051.0,
|
| 11782 |
+
"step": 13080
|
| 11783 |
+
},
|
| 11784 |
+
{
|
| 11785 |
+
"epoch": 2.637517630465444,
|
| 11786 |
+
"grad_norm": 12.4375,
|
| 11787 |
+
"learning_rate": 2.417892403788032e-06,
|
| 11788 |
+
"loss": 0.8495,
|
| 11789 |
+
"mean_token_accuracy": 0.7927891492843628,
|
| 11790 |
+
"num_tokens": 14493266.0,
|
| 11791 |
+
"step": 13090
|
| 11792 |
+
},
|
| 11793 |
+
{
|
| 11794 |
+
"epoch": 2.6395325408019343,
|
| 11795 |
+
"grad_norm": 9.125,
|
| 11796 |
+
"learning_rate": 2.4044596682114315e-06,
|
| 11797 |
+
"loss": 0.7857,
|
| 11798 |
+
"mean_token_accuracy": 0.7974917531013489,
|
| 11799 |
+
"num_tokens": 14505920.0,
|
| 11800 |
+
"step": 13100
|
| 11801 |
+
},
|
| 11802 |
+
{
|
| 11803 |
+
"epoch": 2.6415474511384245,
|
| 11804 |
+
"grad_norm": 11.0,
|
| 11805 |
+
"learning_rate": 2.3910269326348312e-06,
|
| 11806 |
+
"loss": 0.7904,
|
| 11807 |
+
"mean_token_accuracy": 0.7985158562660217,
|
| 11808 |
+
"num_tokens": 14517589.0,
|
| 11809 |
+
"step": 13110
|
| 11810 |
+
},
|
| 11811 |
+
{
|
| 11812 |
+
"epoch": 2.643562361474914,
|
| 11813 |
+
"grad_norm": 12.4375,
|
| 11814 |
+
"learning_rate": 2.3775941970582313e-06,
|
| 11815 |
+
"loss": 0.7822,
|
| 11816 |
+
"mean_token_accuracy": 0.7983147978782654,
|
| 11817 |
+
"num_tokens": 14528137.0,
|
| 11818 |
+
"step": 13120
|
| 11819 |
+
},
|
| 11820 |
+
{
|
| 11821 |
+
"epoch": 2.6455772718114043,
|
| 11822 |
+
"grad_norm": 11.5,
|
| 11823 |
+
"learning_rate": 2.364161461481631e-06,
|
| 11824 |
+
"loss": 0.7523,
|
| 11825 |
+
"mean_token_accuracy": 0.8122865617275238,
|
| 11826 |
+
"num_tokens": 14539021.0,
|
| 11827 |
+
"step": 13130
|
| 11828 |
+
},
|
| 11829 |
+
{
|
| 11830 |
+
"epoch": 2.6475921821478945,
|
| 11831 |
+
"grad_norm": 11.875,
|
| 11832 |
+
"learning_rate": 2.3507287259050307e-06,
|
| 11833 |
+
"loss": 0.8472,
|
| 11834 |
+
"mean_token_accuracy": 0.7906874716281891,
|
| 11835 |
+
"num_tokens": 14551105.0,
|
| 11836 |
+
"step": 13140
|
| 11837 |
+
},
|
| 11838 |
+
{
|
| 11839 |
+
"epoch": 2.649607092484384,
|
| 11840 |
+
"grad_norm": 11.875,
|
| 11841 |
+
"learning_rate": 2.337295990328431e-06,
|
| 11842 |
+
"loss": 0.8266,
|
| 11843 |
+
"mean_token_accuracy": 0.7899503231048584,
|
| 11844 |
+
"num_tokens": 14561956.0,
|
| 11845 |
+
"step": 13150
|
| 11846 |
+
},
|
| 11847 |
+
{
|
| 11848 |
+
"epoch": 2.6516220028208743,
|
| 11849 |
+
"grad_norm": 13.75,
|
| 11850 |
+
"learning_rate": 2.3238632547518305e-06,
|
| 11851 |
+
"loss": 0.6823,
|
| 11852 |
+
"mean_token_accuracy": 0.8246838212013244,
|
| 11853 |
+
"num_tokens": 14572323.0,
|
| 11854 |
+
"step": 13160
|
| 11855 |
+
},
|
| 11856 |
+
{
|
| 11857 |
+
"epoch": 2.6536369131573645,
|
| 11858 |
+
"grad_norm": 12.75,
|
| 11859 |
+
"learning_rate": 2.31043051917523e-06,
|
| 11860 |
+
"loss": 0.8422,
|
| 11861 |
+
"mean_token_accuracy": 0.7883449614048004,
|
| 11862 |
+
"num_tokens": 14583428.0,
|
| 11863 |
+
"step": 13170
|
| 11864 |
+
},
|
| 11865 |
+
{
|
| 11866 |
+
"epoch": 2.6556518234938546,
|
| 11867 |
+
"grad_norm": 12.125,
|
| 11868 |
+
"learning_rate": 2.2969977835986303e-06,
|
| 11869 |
+
"loss": 0.7448,
|
| 11870 |
+
"mean_token_accuracy": 0.8116320252418519,
|
| 11871 |
+
"num_tokens": 14593189.0,
|
| 11872 |
+
"step": 13180
|
| 11873 |
+
},
|
| 11874 |
+
{
|
| 11875 |
+
"epoch": 2.657666733830345,
|
| 11876 |
+
"grad_norm": 12.875,
|
| 11877 |
+
"learning_rate": 2.28356504802203e-06,
|
| 11878 |
+
"loss": 0.7905,
|
| 11879 |
+
"mean_token_accuracy": 0.8049618184566498,
|
| 11880 |
+
"num_tokens": 14604815.0,
|
| 11881 |
+
"step": 13190
|
| 11882 |
+
},
|
| 11883 |
+
{
|
| 11884 |
+
"epoch": 2.6596816441668345,
|
| 11885 |
+
"grad_norm": 10.625,
|
| 11886 |
+
"learning_rate": 2.2701323124454296e-06,
|
| 11887 |
+
"loss": 0.8403,
|
| 11888 |
+
"mean_token_accuracy": 0.7927229404449463,
|
| 11889 |
+
"num_tokens": 14616104.0,
|
| 11890 |
+
"step": 13200
|
| 11891 |
+
},
|
| 11892 |
+
{
|
| 11893 |
+
"epoch": 2.6616965545033247,
|
| 11894 |
+
"grad_norm": 15.75,
|
| 11895 |
+
"learning_rate": 2.2566995768688297e-06,
|
| 11896 |
+
"loss": 0.7988,
|
| 11897 |
+
"mean_token_accuracy": 0.8040601491928101,
|
| 11898 |
+
"num_tokens": 14626600.0,
|
| 11899 |
+
"step": 13210
|
| 11900 |
+
},
|
| 11901 |
+
{
|
| 11902 |
+
"epoch": 2.663711464839815,
|
| 11903 |
+
"grad_norm": 11.6875,
|
| 11904 |
+
"learning_rate": 2.2432668412922294e-06,
|
| 11905 |
+
"loss": 0.7137,
|
| 11906 |
+
"mean_token_accuracy": 0.8165888667106629,
|
| 11907 |
+
"num_tokens": 14637101.0,
|
| 11908 |
+
"step": 13220
|
| 11909 |
+
},
|
| 11910 |
+
{
|
| 11911 |
+
"epoch": 2.6657263751763045,
|
| 11912 |
+
"grad_norm": 12.5,
|
| 11913 |
+
"learning_rate": 2.229834105715629e-06,
|
| 11914 |
+
"loss": 0.7265,
|
| 11915 |
+
"mean_token_accuracy": 0.8147784769535065,
|
| 11916 |
+
"num_tokens": 14647761.0,
|
| 11917 |
+
"step": 13230
|
| 11918 |
+
},
|
| 11919 |
+
{
|
| 11920 |
+
"epoch": 2.6677412855127947,
|
| 11921 |
+
"grad_norm": 10.4375,
|
| 11922 |
+
"learning_rate": 2.216401370139029e-06,
|
| 11923 |
+
"loss": 0.7454,
|
| 11924 |
+
"mean_token_accuracy": 0.8097371995449066,
|
| 11925 |
+
"num_tokens": 14658218.0,
|
| 11926 |
+
"step": 13240
|
| 11927 |
+
},
|
| 11928 |
+
{
|
| 11929 |
+
"epoch": 2.669756195849285,
|
| 11930 |
+
"grad_norm": 10.375,
|
| 11931 |
+
"learning_rate": 2.202968634562429e-06,
|
| 11932 |
+
"loss": 0.7277,
|
| 11933 |
+
"mean_token_accuracy": 0.8119274914264679,
|
| 11934 |
+
"num_tokens": 14669077.0,
|
| 11935 |
+
"step": 13250
|
| 11936 |
+
},
|
| 11937 |
+
{
|
| 11938 |
+
"epoch": 2.6717711061857745,
|
| 11939 |
+
"grad_norm": 11.0625,
|
| 11940 |
+
"learning_rate": 2.1895358989858286e-06,
|
| 11941 |
+
"loss": 0.8152,
|
| 11942 |
+
"mean_token_accuracy": 0.7921769440174102,
|
| 11943 |
+
"num_tokens": 14680555.0,
|
| 11944 |
+
"step": 13260
|
| 11945 |
+
},
|
| 11946 |
+
{
|
| 11947 |
+
"epoch": 2.6737860165222647,
|
| 11948 |
+
"grad_norm": 15.3125,
|
| 11949 |
+
"learning_rate": 2.1761031634092282e-06,
|
| 11950 |
+
"loss": 0.776,
|
| 11951 |
+
"mean_token_accuracy": 0.8004971742630005,
|
| 11952 |
+
"num_tokens": 14691228.0,
|
| 11953 |
+
"step": 13270
|
| 11954 |
+
},
|
| 11955 |
+
{
|
| 11956 |
+
"epoch": 2.675800926858755,
|
| 11957 |
+
"grad_norm": 10.5625,
|
| 11958 |
+
"learning_rate": 2.1626704278326283e-06,
|
| 11959 |
+
"loss": 0.7987,
|
| 11960 |
+
"mean_token_accuracy": 0.7978484213352204,
|
| 11961 |
+
"num_tokens": 14702555.0,
|
| 11962 |
+
"step": 13280
|
| 11963 |
+
},
|
| 11964 |
+
{
|
| 11965 |
+
"epoch": 2.677815837195245,
|
| 11966 |
+
"grad_norm": 10.6875,
|
| 11967 |
+
"learning_rate": 2.149237692256028e-06,
|
| 11968 |
+
"loss": 0.9027,
|
| 11969 |
+
"mean_token_accuracy": 0.7809522151947021,
|
| 11970 |
+
"num_tokens": 14713437.0,
|
| 11971 |
+
"step": 13290
|
| 11972 |
+
},
|
| 11973 |
+
{
|
| 11974 |
+
"epoch": 2.679830747531735,
|
| 11975 |
+
"grad_norm": 16.125,
|
| 11976 |
+
"learning_rate": 2.1358049566794277e-06,
|
| 11977 |
+
"loss": 0.7883,
|
| 11978 |
+
"mean_token_accuracy": 0.805361670255661,
|
| 11979 |
+
"num_tokens": 14725318.0,
|
| 11980 |
+
"step": 13300
|
| 11981 |
+
},
|
| 11982 |
+
{
|
| 11983 |
+
"epoch": 2.681845657868225,
|
| 11984 |
+
"grad_norm": 12.0625,
|
| 11985 |
+
"learning_rate": 2.122372221102828e-06,
|
| 11986 |
+
"loss": 0.8495,
|
| 11987 |
+
"mean_token_accuracy": 0.7887078404426575,
|
| 11988 |
+
"num_tokens": 14736729.0,
|
| 11989 |
+
"step": 13310
|
| 11990 |
+
},
|
| 11991 |
+
{
|
| 11992 |
+
"epoch": 2.683860568204715,
|
| 11993 |
+
"grad_norm": 10.3125,
|
| 11994 |
+
"learning_rate": 2.1089394855262275e-06,
|
| 11995 |
+
"loss": 0.7301,
|
| 11996 |
+
"mean_token_accuracy": 0.8114965260028839,
|
| 11997 |
+
"num_tokens": 14747781.0,
|
| 11998 |
+
"step": 13320
|
| 11999 |
+
},
|
| 12000 |
+
{
|
| 12001 |
+
"epoch": 2.685875478541205,
|
| 12002 |
+
"grad_norm": 9.8125,
|
| 12003 |
+
"learning_rate": 2.095506749949627e-06,
|
| 12004 |
+
"loss": 0.7964,
|
| 12005 |
+
"mean_token_accuracy": 0.7989574909210205,
|
| 12006 |
+
"num_tokens": 14758609.0,
|
| 12007 |
+
"step": 13330
|
| 12008 |
+
},
|
| 12009 |
+
{
|
| 12010 |
+
"epoch": 2.687890388877695,
|
| 12011 |
+
"grad_norm": 11.1875,
|
| 12012 |
+
"learning_rate": 2.0820740143730273e-06,
|
| 12013 |
+
"loss": 0.9205,
|
| 12014 |
+
"mean_token_accuracy": 0.779743617773056,
|
| 12015 |
+
"num_tokens": 14770464.0,
|
| 12016 |
+
"step": 13340
|
| 12017 |
+
},
|
| 12018 |
+
{
|
| 12019 |
+
"epoch": 2.689905299214185,
|
| 12020 |
+
"grad_norm": 12.0625,
|
| 12021 |
+
"learning_rate": 2.068641278796427e-06,
|
| 12022 |
+
"loss": 0.8432,
|
| 12023 |
+
"mean_token_accuracy": 0.788221025466919,
|
| 12024 |
+
"num_tokens": 14783187.0,
|
| 12025 |
+
"step": 13350
|
| 12026 |
+
},
|
| 12027 |
+
{
|
| 12028 |
+
"epoch": 2.691920209550675,
|
| 12029 |
+
"grad_norm": 11.25,
|
| 12030 |
+
"learning_rate": 2.0552085432198266e-06,
|
| 12031 |
+
"loss": 0.7166,
|
| 12032 |
+
"mean_token_accuracy": 0.8211326837539673,
|
| 12033 |
+
"num_tokens": 14794959.0,
|
| 12034 |
+
"step": 13360
|
| 12035 |
+
},
|
| 12036 |
+
{
|
| 12037 |
+
"epoch": 2.693935119887165,
|
| 12038 |
+
"grad_norm": 12.5625,
|
| 12039 |
+
"learning_rate": 2.0417758076432268e-06,
|
| 12040 |
+
"loss": 0.7359,
|
| 12041 |
+
"mean_token_accuracy": 0.8126482903957367,
|
| 12042 |
+
"num_tokens": 14805848.0,
|
| 12043 |
+
"step": 13370
|
| 12044 |
+
},
|
| 12045 |
+
{
|
| 12046 |
+
"epoch": 2.695950030223655,
|
| 12047 |
+
"grad_norm": 10.5625,
|
| 12048 |
+
"learning_rate": 2.028343072066627e-06,
|
| 12049 |
+
"loss": 0.7513,
|
| 12050 |
+
"mean_token_accuracy": 0.8137486338615417,
|
| 12051 |
+
"num_tokens": 14817111.0,
|
| 12052 |
+
"step": 13380
|
| 12053 |
+
},
|
| 12054 |
+
{
|
| 12055 |
+
"epoch": 2.697964940560145,
|
| 12056 |
+
"grad_norm": 13.8125,
|
| 12057 |
+
"learning_rate": 2.014910336490026e-06,
|
| 12058 |
+
"loss": 0.827,
|
| 12059 |
+
"mean_token_accuracy": 0.7981631934642792,
|
| 12060 |
+
"num_tokens": 14827755.0,
|
| 12061 |
+
"step": 13390
|
| 12062 |
+
},
|
| 12063 |
+
{
|
| 12064 |
+
"epoch": 2.699979850896635,
|
| 12065 |
+
"grad_norm": 10.5,
|
| 12066 |
+
"learning_rate": 2.0014776009134262e-06,
|
| 12067 |
+
"loss": 0.7427,
|
| 12068 |
+
"mean_token_accuracy": 0.8144878685474396,
|
| 12069 |
+
"num_tokens": 14840080.0,
|
| 12070 |
+
"step": 13400
|
| 12071 |
+
},
|
| 12072 |
+
{
|
| 12073 |
+
"epoch": 2.701994761233125,
|
| 12074 |
+
"grad_norm": 11.5625,
|
| 12075 |
+
"learning_rate": 1.988044865336826e-06,
|
| 12076 |
+
"loss": 0.8199,
|
| 12077 |
+
"mean_token_accuracy": 0.7990254759788513,
|
| 12078 |
+
"num_tokens": 14852571.0,
|
| 12079 |
+
"step": 13410
|
| 12080 |
+
},
|
| 12081 |
+
{
|
| 12082 |
+
"epoch": 2.704009671569615,
|
| 12083 |
+
"grad_norm": 10.1875,
|
| 12084 |
+
"learning_rate": 1.974612129760226e-06,
|
| 12085 |
+
"loss": 0.8714,
|
| 12086 |
+
"mean_token_accuracy": 0.788819420337677,
|
| 12087 |
+
"num_tokens": 14865165.0,
|
| 12088 |
+
"step": 13420
|
| 12089 |
+
},
|
| 12090 |
+
{
|
| 12091 |
+
"epoch": 2.7060245819061053,
|
| 12092 |
+
"grad_norm": 10.75,
|
| 12093 |
+
"learning_rate": 1.9611793941836257e-06,
|
| 12094 |
+
"loss": 0.8413,
|
| 12095 |
+
"mean_token_accuracy": 0.7880795717239379,
|
| 12096 |
+
"num_tokens": 14875987.0,
|
| 12097 |
+
"step": 13430
|
| 12098 |
+
},
|
| 12099 |
+
{
|
| 12100 |
+
"epoch": 2.7080394922425954,
|
| 12101 |
+
"grad_norm": 13.4375,
|
| 12102 |
+
"learning_rate": 1.9477466586070254e-06,
|
| 12103 |
+
"loss": 0.8133,
|
| 12104 |
+
"mean_token_accuracy": 0.7958886742591857,
|
| 12105 |
+
"num_tokens": 14887222.0,
|
| 12106 |
+
"step": 13440
|
| 12107 |
+
},
|
| 12108 |
+
{
|
| 12109 |
+
"epoch": 2.710054402579085,
|
| 12110 |
+
"grad_norm": 11.625,
|
| 12111 |
+
"learning_rate": 1.9343139230304255e-06,
|
| 12112 |
+
"loss": 0.8375,
|
| 12113 |
+
"mean_token_accuracy": 0.7916811347007752,
|
| 12114 |
+
"num_tokens": 14897992.0,
|
| 12115 |
+
"step": 13450
|
| 12116 |
+
},
|
| 12117 |
+
{
|
| 12118 |
+
"epoch": 2.7120693129155753,
|
| 12119 |
+
"grad_norm": 11.1875,
|
| 12120 |
+
"learning_rate": 1.920881187453825e-06,
|
| 12121 |
+
"loss": 0.7921,
|
| 12122 |
+
"mean_token_accuracy": 0.8102820634841919,
|
| 12123 |
+
"num_tokens": 14909769.0,
|
| 12124 |
+
"step": 13460
|
| 12125 |
+
},
|
| 12126 |
+
{
|
| 12127 |
+
"epoch": 2.7140842232520654,
|
| 12128 |
+
"grad_norm": 14.4375,
|
| 12129 |
+
"learning_rate": 1.907448451877225e-06,
|
| 12130 |
+
"loss": 0.7959,
|
| 12131 |
+
"mean_token_accuracy": 0.8004967868328094,
|
| 12132 |
+
"num_tokens": 14920552.0,
|
| 12133 |
+
"step": 13470
|
| 12134 |
+
},
|
| 12135 |
+
{
|
| 12136 |
+
"epoch": 2.716099133588555,
|
| 12137 |
+
"grad_norm": 14.5625,
|
| 12138 |
+
"learning_rate": 1.8940157163006247e-06,
|
| 12139 |
+
"loss": 0.8219,
|
| 12140 |
+
"mean_token_accuracy": 0.796020919084549,
|
| 12141 |
+
"num_tokens": 14932205.0,
|
| 12142 |
+
"step": 13480
|
| 12143 |
+
},
|
| 12144 |
+
{
|
| 12145 |
+
"epoch": 2.7181140439250453,
|
| 12146 |
+
"grad_norm": 11.0,
|
| 12147 |
+
"learning_rate": 1.8805829807240246e-06,
|
| 12148 |
+
"loss": 0.7236,
|
| 12149 |
+
"mean_token_accuracy": 0.8162269771099091,
|
| 12150 |
+
"num_tokens": 14944469.0,
|
| 12151 |
+
"step": 13490
|
| 12152 |
+
},
|
| 12153 |
+
{
|
| 12154 |
+
"epoch": 2.7201289542615354,
|
| 12155 |
+
"grad_norm": 13.8125,
|
| 12156 |
+
"learning_rate": 1.8671502451474243e-06,
|
| 12157 |
+
"loss": 0.8344,
|
| 12158 |
+
"mean_token_accuracy": 0.7941052973270416,
|
| 12159 |
+
"num_tokens": 14956201.0,
|
| 12160 |
+
"step": 13500
|
| 12161 |
}
|
| 12162 |
],
|
| 12163 |
"logging_steps": 10,
|
|
|
|
| 12177 |
"attributes": {}
|
| 12178 |
}
|
| 12179 |
},
|
| 12180 |
+
"total_flos": 1.807875931971379e+16,
|
| 12181 |
"train_batch_size": 8,
|
| 12182 |
"trial_name": null,
|
| 12183 |
"trial_params": null
|