Training in progress, step 12500, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 536223056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:18518c164df026440f068fac8233b3bff2d8d4502ff38a32a862597f23f6b7c0
|
| 3 |
size 536223056
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1072594443
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c73b91ebf8be54d28c1c49c244582f7f70def8a8258d400992d104200bbf23d2
|
| 3 |
size 1072594443
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:113d12b5af2a861076397bdce257b8a1e5a1daabe8a5aaee5bfcbdb6024fca69
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 2.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -10808,6 +10808,456 @@
|
|
| 10808 |
"mean_token_accuracy": 0.8190572082996368,
|
| 10809 |
"num_tokens": 13294166.0,
|
| 10810 |
"step": 12000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10811 |
}
|
| 10812 |
],
|
| 10813 |
"logging_steps": 10,
|
|
@@ -10827,7 +11277,7 @@
|
|
| 10827 |
"attributes": {}
|
| 10828 |
}
|
| 10829 |
},
|
| 10830 |
-
"total_flos": 1.
|
| 10831 |
"train_batch_size": 8,
|
| 10832 |
"trial_name": null,
|
| 10833 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.518637920612533,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 12500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 10808 |
"mean_token_accuracy": 0.8190572082996368,
|
| 10809 |
"num_tokens": 13294166.0,
|
| 10810 |
"step": 12000
|
| 10811 |
+
},
|
| 10812 |
+
{
|
| 10813 |
+
"epoch": 2.4199073141245213,
|
| 10814 |
+
"grad_norm": 10.5625,
|
| 10815 |
+
"learning_rate": 3.868627846060851e-06,
|
| 10816 |
+
"loss": 0.7126,
|
| 10817 |
+
"mean_token_accuracy": 0.8174474656581878,
|
| 10818 |
+
"num_tokens": 13305628.0,
|
| 10819 |
+
"step": 12010
|
| 10820 |
+
},
|
| 10821 |
+
{
|
| 10822 |
+
"epoch": 2.4219222244610115,
|
| 10823 |
+
"grad_norm": 12.75,
|
| 10824 |
+
"learning_rate": 3.855195110484251e-06,
|
| 10825 |
+
"loss": 0.6862,
|
| 10826 |
+
"mean_token_accuracy": 0.8214840114116668,
|
| 10827 |
+
"num_tokens": 13316153.0,
|
| 10828 |
+
"step": 12020
|
| 10829 |
+
},
|
| 10830 |
+
{
|
| 10831 |
+
"epoch": 2.4239371347975016,
|
| 10832 |
+
"grad_norm": 10.125,
|
| 10833 |
+
"learning_rate": 3.84176237490765e-06,
|
| 10834 |
+
"loss": 0.8504,
|
| 10835 |
+
"mean_token_accuracy": 0.7878824770450592,
|
| 10836 |
+
"num_tokens": 13327124.0,
|
| 10837 |
+
"step": 12030
|
| 10838 |
+
},
|
| 10839 |
+
{
|
| 10840 |
+
"epoch": 2.4259520451339913,
|
| 10841 |
+
"grad_norm": 14.0,
|
| 10842 |
+
"learning_rate": 3.82832963933105e-06,
|
| 10843 |
+
"loss": 0.7197,
|
| 10844 |
+
"mean_token_accuracy": 0.8142663776874542,
|
| 10845 |
+
"num_tokens": 13337746.0,
|
| 10846 |
+
"step": 12040
|
| 10847 |
+
},
|
| 10848 |
+
{
|
| 10849 |
+
"epoch": 2.4279669554704815,
|
| 10850 |
+
"grad_norm": 13.125,
|
| 10851 |
+
"learning_rate": 3.81489690375445e-06,
|
| 10852 |
+
"loss": 0.7603,
|
| 10853 |
+
"mean_token_accuracy": 0.8075387954711915,
|
| 10854 |
+
"num_tokens": 13349108.0,
|
| 10855 |
+
"step": 12050
|
| 10856 |
+
},
|
| 10857 |
+
{
|
| 10858 |
+
"epoch": 2.4299818658069716,
|
| 10859 |
+
"grad_norm": 12.125,
|
| 10860 |
+
"learning_rate": 3.80146416817785e-06,
|
| 10861 |
+
"loss": 0.7589,
|
| 10862 |
+
"mean_token_accuracy": 0.8082424461841583,
|
| 10863 |
+
"num_tokens": 13359297.0,
|
| 10864 |
+
"step": 12060
|
| 10865 |
+
},
|
| 10866 |
+
{
|
| 10867 |
+
"epoch": 2.4319967761434618,
|
| 10868 |
+
"grad_norm": 11.5625,
|
| 10869 |
+
"learning_rate": 3.7880314326012495e-06,
|
| 10870 |
+
"loss": 0.8093,
|
| 10871 |
+
"mean_token_accuracy": 0.8029668807983399,
|
| 10872 |
+
"num_tokens": 13370587.0,
|
| 10873 |
+
"step": 12070
|
| 10874 |
+
},
|
| 10875 |
+
{
|
| 10876 |
+
"epoch": 2.4340116864799515,
|
| 10877 |
+
"grad_norm": 12.0,
|
| 10878 |
+
"learning_rate": 3.7745986970246496e-06,
|
| 10879 |
+
"loss": 0.7561,
|
| 10880 |
+
"mean_token_accuracy": 0.8101776361465454,
|
| 10881 |
+
"num_tokens": 13381606.0,
|
| 10882 |
+
"step": 12080
|
| 10883 |
+
},
|
| 10884 |
+
{
|
| 10885 |
+
"epoch": 2.4360265968164416,
|
| 10886 |
+
"grad_norm": 10.0625,
|
| 10887 |
+
"learning_rate": 3.7611659614480493e-06,
|
| 10888 |
+
"loss": 0.8599,
|
| 10889 |
+
"mean_token_accuracy": 0.7864530384540558,
|
| 10890 |
+
"num_tokens": 13394004.0,
|
| 10891 |
+
"step": 12090
|
| 10892 |
+
},
|
| 10893 |
+
{
|
| 10894 |
+
"epoch": 2.4380415071529318,
|
| 10895 |
+
"grad_norm": 11.4375,
|
| 10896 |
+
"learning_rate": 3.747733225871449e-06,
|
| 10897 |
+
"loss": 0.87,
|
| 10898 |
+
"mean_token_accuracy": 0.7880048811435699,
|
| 10899 |
+
"num_tokens": 13405785.0,
|
| 10900 |
+
"step": 12100
|
| 10901 |
+
},
|
| 10902 |
+
{
|
| 10903 |
+
"epoch": 2.440056417489422,
|
| 10904 |
+
"grad_norm": 11.375,
|
| 10905 |
+
"learning_rate": 3.7343004902948486e-06,
|
| 10906 |
+
"loss": 0.8041,
|
| 10907 |
+
"mean_token_accuracy": 0.7979571759700775,
|
| 10908 |
+
"num_tokens": 13418406.0,
|
| 10909 |
+
"step": 12110
|
| 10910 |
+
},
|
| 10911 |
+
{
|
| 10912 |
+
"epoch": 2.4420713278259116,
|
| 10913 |
+
"grad_norm": 13.5,
|
| 10914 |
+
"learning_rate": 3.7208677547182487e-06,
|
| 10915 |
+
"loss": 0.7067,
|
| 10916 |
+
"mean_token_accuracy": 0.8095525324344635,
|
| 10917 |
+
"num_tokens": 13428380.0,
|
| 10918 |
+
"step": 12120
|
| 10919 |
+
},
|
| 10920 |
+
{
|
| 10921 |
+
"epoch": 2.444086238162402,
|
| 10922 |
+
"grad_norm": 10.625,
|
| 10923 |
+
"learning_rate": 3.7074350191416484e-06,
|
| 10924 |
+
"loss": 0.8096,
|
| 10925 |
+
"mean_token_accuracy": 0.79591383934021,
|
| 10926 |
+
"num_tokens": 13439614.0,
|
| 10927 |
+
"step": 12130
|
| 10928 |
+
},
|
| 10929 |
+
{
|
| 10930 |
+
"epoch": 2.446101148498892,
|
| 10931 |
+
"grad_norm": 10.75,
|
| 10932 |
+
"learning_rate": 3.6940022835650485e-06,
|
| 10933 |
+
"loss": 0.8097,
|
| 10934 |
+
"mean_token_accuracy": 0.7982459485530853,
|
| 10935 |
+
"num_tokens": 13450951.0,
|
| 10936 |
+
"step": 12140
|
| 10937 |
+
},
|
| 10938 |
+
{
|
| 10939 |
+
"epoch": 2.4481160588353816,
|
| 10940 |
+
"grad_norm": 10.375,
|
| 10941 |
+
"learning_rate": 3.6805695479884478e-06,
|
| 10942 |
+
"loss": 0.82,
|
| 10943 |
+
"mean_token_accuracy": 0.7986723423004151,
|
| 10944 |
+
"num_tokens": 13461483.0,
|
| 10945 |
+
"step": 12150
|
| 10946 |
+
},
|
| 10947 |
+
{
|
| 10948 |
+
"epoch": 2.450130969171872,
|
| 10949 |
+
"grad_norm": 12.125,
|
| 10950 |
+
"learning_rate": 3.667136812411848e-06,
|
| 10951 |
+
"loss": 0.7788,
|
| 10952 |
+
"mean_token_accuracy": 0.80440074801445,
|
| 10953 |
+
"num_tokens": 13472237.0,
|
| 10954 |
+
"step": 12160
|
| 10955 |
+
},
|
| 10956 |
+
{
|
| 10957 |
+
"epoch": 2.452145879508362,
|
| 10958 |
+
"grad_norm": 9.625,
|
| 10959 |
+
"learning_rate": 3.6537040768352476e-06,
|
| 10960 |
+
"loss": 0.8397,
|
| 10961 |
+
"mean_token_accuracy": 0.7989638984203339,
|
| 10962 |
+
"num_tokens": 13483806.0,
|
| 10963 |
+
"step": 12170
|
| 10964 |
+
},
|
| 10965 |
+
{
|
| 10966 |
+
"epoch": 2.454160789844852,
|
| 10967 |
+
"grad_norm": 11.4375,
|
| 10968 |
+
"learning_rate": 3.6402713412586477e-06,
|
| 10969 |
+
"loss": 0.7816,
|
| 10970 |
+
"mean_token_accuracy": 0.8013097047805786,
|
| 10971 |
+
"num_tokens": 13493924.0,
|
| 10972 |
+
"step": 12180
|
| 10973 |
+
},
|
| 10974 |
+
{
|
| 10975 |
+
"epoch": 2.456175700181342,
|
| 10976 |
+
"grad_norm": 15.8125,
|
| 10977 |
+
"learning_rate": 3.6268386056820478e-06,
|
| 10978 |
+
"loss": 0.7321,
|
| 10979 |
+
"mean_token_accuracy": 0.815697294473648,
|
| 10980 |
+
"num_tokens": 13505064.0,
|
| 10981 |
+
"step": 12190
|
| 10982 |
+
},
|
| 10983 |
+
{
|
| 10984 |
+
"epoch": 2.458190610517832,
|
| 10985 |
+
"grad_norm": 10.3125,
|
| 10986 |
+
"learning_rate": 3.6134058701054475e-06,
|
| 10987 |
+
"loss": 0.766,
|
| 10988 |
+
"mean_token_accuracy": 0.8088764250278473,
|
| 10989 |
+
"num_tokens": 13515435.0,
|
| 10990 |
+
"step": 12200
|
| 10991 |
+
},
|
| 10992 |
+
{
|
| 10993 |
+
"epoch": 2.460205520854322,
|
| 10994 |
+
"grad_norm": 11.6875,
|
| 10995 |
+
"learning_rate": 3.5999731345288467e-06,
|
| 10996 |
+
"loss": 0.8167,
|
| 10997 |
+
"mean_token_accuracy": 0.7981218516826629,
|
| 10998 |
+
"num_tokens": 13526529.0,
|
| 10999 |
+
"step": 12210
|
| 11000 |
+
},
|
| 11001 |
+
{
|
| 11002 |
+
"epoch": 2.4622204311908122,
|
| 11003 |
+
"grad_norm": 11.5625,
|
| 11004 |
+
"learning_rate": 3.586540398952247e-06,
|
| 11005 |
+
"loss": 0.8728,
|
| 11006 |
+
"mean_token_accuracy": 0.7834985911846161,
|
| 11007 |
+
"num_tokens": 13537408.0,
|
| 11008 |
+
"step": 12220
|
| 11009 |
+
},
|
| 11010 |
+
{
|
| 11011 |
+
"epoch": 2.464235341527302,
|
| 11012 |
+
"grad_norm": 9.9375,
|
| 11013 |
+
"learning_rate": 3.573107663375647e-06,
|
| 11014 |
+
"loss": 0.8162,
|
| 11015 |
+
"mean_token_accuracy": 0.7954578995704651,
|
| 11016 |
+
"num_tokens": 13547488.0,
|
| 11017 |
+
"step": 12230
|
| 11018 |
+
},
|
| 11019 |
+
{
|
| 11020 |
+
"epoch": 2.466250251863792,
|
| 11021 |
+
"grad_norm": 12.0,
|
| 11022 |
+
"learning_rate": 3.5596749277990466e-06,
|
| 11023 |
+
"loss": 0.8507,
|
| 11024 |
+
"mean_token_accuracy": 0.7890534639358521,
|
| 11025 |
+
"num_tokens": 13558242.0,
|
| 11026 |
+
"step": 12240
|
| 11027 |
+
},
|
| 11028 |
+
{
|
| 11029 |
+
"epoch": 2.4682651622002822,
|
| 11030 |
+
"grad_norm": 11.1875,
|
| 11031 |
+
"learning_rate": 3.5462421922224467e-06,
|
| 11032 |
+
"loss": 0.7756,
|
| 11033 |
+
"mean_token_accuracy": 0.8034618675708771,
|
| 11034 |
+
"num_tokens": 13568217.0,
|
| 11035 |
+
"step": 12250
|
| 11036 |
+
},
|
| 11037 |
+
{
|
| 11038 |
+
"epoch": 2.470280072536772,
|
| 11039 |
+
"grad_norm": 13.3125,
|
| 11040 |
+
"learning_rate": 3.532809456645846e-06,
|
| 11041 |
+
"loss": 0.8328,
|
| 11042 |
+
"mean_token_accuracy": 0.7975371956825257,
|
| 11043 |
+
"num_tokens": 13579334.0,
|
| 11044 |
+
"step": 12260
|
| 11045 |
+
},
|
| 11046 |
+
{
|
| 11047 |
+
"epoch": 2.472294982873262,
|
| 11048 |
+
"grad_norm": 11.8125,
|
| 11049 |
+
"learning_rate": 3.519376721069246e-06,
|
| 11050 |
+
"loss": 0.7325,
|
| 11051 |
+
"mean_token_accuracy": 0.8158390104770661,
|
| 11052 |
+
"num_tokens": 13589924.0,
|
| 11053 |
+
"step": 12270
|
| 11054 |
+
},
|
| 11055 |
+
{
|
| 11056 |
+
"epoch": 2.4743098932097523,
|
| 11057 |
+
"grad_norm": 9.9375,
|
| 11058 |
+
"learning_rate": 3.5059439854926458e-06,
|
| 11059 |
+
"loss": 0.9189,
|
| 11060 |
+
"mean_token_accuracy": 0.7810778141021728,
|
| 11061 |
+
"num_tokens": 13601915.0,
|
| 11062 |
+
"step": 12280
|
| 11063 |
+
},
|
| 11064 |
+
{
|
| 11065 |
+
"epoch": 2.476324803546242,
|
| 11066 |
+
"grad_norm": 11.1875,
|
| 11067 |
+
"learning_rate": 3.492511249916046e-06,
|
| 11068 |
+
"loss": 0.7933,
|
| 11069 |
+
"mean_token_accuracy": 0.804823362827301,
|
| 11070 |
+
"num_tokens": 13613049.0,
|
| 11071 |
+
"step": 12290
|
| 11072 |
+
},
|
| 11073 |
+
{
|
| 11074 |
+
"epoch": 2.478339713882732,
|
| 11075 |
+
"grad_norm": 11.0625,
|
| 11076 |
+
"learning_rate": 3.4790785143394455e-06,
|
| 11077 |
+
"loss": 0.7509,
|
| 11078 |
+
"mean_token_accuracy": 0.8156402170658111,
|
| 11079 |
+
"num_tokens": 13624399.0,
|
| 11080 |
+
"step": 12300
|
| 11081 |
+
},
|
| 11082 |
+
{
|
| 11083 |
+
"epoch": 2.4803546242192223,
|
| 11084 |
+
"grad_norm": 15.0625,
|
| 11085 |
+
"learning_rate": 3.4656457787628457e-06,
|
| 11086 |
+
"loss": 0.7869,
|
| 11087 |
+
"mean_token_accuracy": 0.8047832548618317,
|
| 11088 |
+
"num_tokens": 13635186.0,
|
| 11089 |
+
"step": 12310
|
| 11090 |
+
},
|
| 11091 |
+
{
|
| 11092 |
+
"epoch": 2.4823695345557124,
|
| 11093 |
+
"grad_norm": 13.5,
|
| 11094 |
+
"learning_rate": 3.452213043186245e-06,
|
| 11095 |
+
"loss": 0.826,
|
| 11096 |
+
"mean_token_accuracy": 0.7985908687114716,
|
| 11097 |
+
"num_tokens": 13644792.0,
|
| 11098 |
+
"step": 12320
|
| 11099 |
+
},
|
| 11100 |
+
{
|
| 11101 |
+
"epoch": 2.484384444892202,
|
| 11102 |
+
"grad_norm": 10.0625,
|
| 11103 |
+
"learning_rate": 3.438780307609645e-06,
|
| 11104 |
+
"loss": 0.8709,
|
| 11105 |
+
"mean_token_accuracy": 0.7914902806282044,
|
| 11106 |
+
"num_tokens": 13656993.0,
|
| 11107 |
+
"step": 12330
|
| 11108 |
+
},
|
| 11109 |
+
{
|
| 11110 |
+
"epoch": 2.4863993552286923,
|
| 11111 |
+
"grad_norm": 10.1875,
|
| 11112 |
+
"learning_rate": 3.4253475720330447e-06,
|
| 11113 |
+
"loss": 0.8268,
|
| 11114 |
+
"mean_token_accuracy": 0.7995809733867645,
|
| 11115 |
+
"num_tokens": 13669719.0,
|
| 11116 |
+
"step": 12340
|
| 11117 |
+
},
|
| 11118 |
+
{
|
| 11119 |
+
"epoch": 2.4884142655651824,
|
| 11120 |
+
"grad_norm": 9.375,
|
| 11121 |
+
"learning_rate": 3.411914836456445e-06,
|
| 11122 |
+
"loss": 0.8012,
|
| 11123 |
+
"mean_token_accuracy": 0.7969933092594147,
|
| 11124 |
+
"num_tokens": 13679980.0,
|
| 11125 |
+
"step": 12350
|
| 11126 |
+
},
|
| 11127 |
+
{
|
| 11128 |
+
"epoch": 2.4904291759016726,
|
| 11129 |
+
"grad_norm": 10.5,
|
| 11130 |
+
"learning_rate": 3.3984821008798445e-06,
|
| 11131 |
+
"loss": 0.8088,
|
| 11132 |
+
"mean_token_accuracy": 0.8042493402957916,
|
| 11133 |
+
"num_tokens": 13691125.0,
|
| 11134 |
+
"step": 12360
|
| 11135 |
+
},
|
| 11136 |
+
{
|
| 11137 |
+
"epoch": 2.4924440862381623,
|
| 11138 |
+
"grad_norm": 11.0,
|
| 11139 |
+
"learning_rate": 3.385049365303244e-06,
|
| 11140 |
+
"loss": 0.8507,
|
| 11141 |
+
"mean_token_accuracy": 0.7906042397022247,
|
| 11142 |
+
"num_tokens": 13701602.0,
|
| 11143 |
+
"step": 12370
|
| 11144 |
+
},
|
| 11145 |
+
{
|
| 11146 |
+
"epoch": 2.4944589965746524,
|
| 11147 |
+
"grad_norm": 12.9375,
|
| 11148 |
+
"learning_rate": 3.371616629726644e-06,
|
| 11149 |
+
"loss": 0.7928,
|
| 11150 |
+
"mean_token_accuracy": 0.8045152962207794,
|
| 11151 |
+
"num_tokens": 13711693.0,
|
| 11152 |
+
"step": 12380
|
| 11153 |
+
},
|
| 11154 |
+
{
|
| 11155 |
+
"epoch": 2.4964739069111426,
|
| 11156 |
+
"grad_norm": 10.125,
|
| 11157 |
+
"learning_rate": 3.358183894150044e-06,
|
| 11158 |
+
"loss": 0.8049,
|
| 11159 |
+
"mean_token_accuracy": 0.7998487055301666,
|
| 11160 |
+
"num_tokens": 13723412.0,
|
| 11161 |
+
"step": 12390
|
| 11162 |
+
},
|
| 11163 |
+
{
|
| 11164 |
+
"epoch": 2.4984888172476323,
|
| 11165 |
+
"grad_norm": 10.625,
|
| 11166 |
+
"learning_rate": 3.3447511585734436e-06,
|
| 11167 |
+
"loss": 0.7884,
|
| 11168 |
+
"mean_token_accuracy": 0.7947759389877319,
|
| 11169 |
+
"num_tokens": 13733833.0,
|
| 11170 |
+
"step": 12400
|
| 11171 |
+
},
|
| 11172 |
+
{
|
| 11173 |
+
"epoch": 2.5005037275841224,
|
| 11174 |
+
"grad_norm": 10.5625,
|
| 11175 |
+
"learning_rate": 3.3313184229968437e-06,
|
| 11176 |
+
"loss": 0.7059,
|
| 11177 |
+
"mean_token_accuracy": 0.814959716796875,
|
| 11178 |
+
"num_tokens": 13744955.0,
|
| 11179 |
+
"step": 12410
|
| 11180 |
+
},
|
| 11181 |
+
{
|
| 11182 |
+
"epoch": 2.5025186379206126,
|
| 11183 |
+
"grad_norm": 12.125,
|
| 11184 |
+
"learning_rate": 3.317885687420244e-06,
|
| 11185 |
+
"loss": 0.8388,
|
| 11186 |
+
"mean_token_accuracy": 0.7952327311038971,
|
| 11187 |
+
"num_tokens": 13755393.0,
|
| 11188 |
+
"step": 12420
|
| 11189 |
+
},
|
| 11190 |
+
{
|
| 11191 |
+
"epoch": 2.5045335482571023,
|
| 11192 |
+
"grad_norm": 10.25,
|
| 11193 |
+
"learning_rate": 3.304452951843643e-06,
|
| 11194 |
+
"loss": 0.8153,
|
| 11195 |
+
"mean_token_accuracy": 0.8019894421100616,
|
| 11196 |
+
"num_tokens": 13766163.0,
|
| 11197 |
+
"step": 12430
|
| 11198 |
+
},
|
| 11199 |
+
{
|
| 11200 |
+
"epoch": 2.5065484585935924,
|
| 11201 |
+
"grad_norm": 10.0,
|
| 11202 |
+
"learning_rate": 3.2910202162670428e-06,
|
| 11203 |
+
"loss": 0.839,
|
| 11204 |
+
"mean_token_accuracy": 0.7980442643165588,
|
| 11205 |
+
"num_tokens": 13778064.0,
|
| 11206 |
+
"step": 12440
|
| 11207 |
+
},
|
| 11208 |
+
{
|
| 11209 |
+
"epoch": 2.5085633689300826,
|
| 11210 |
+
"grad_norm": 13.75,
|
| 11211 |
+
"learning_rate": 3.277587480690443e-06,
|
| 11212 |
+
"loss": 0.7933,
|
| 11213 |
+
"mean_token_accuracy": 0.8014937698841095,
|
| 11214 |
+
"num_tokens": 13788785.0,
|
| 11215 |
+
"step": 12450
|
| 11216 |
+
},
|
| 11217 |
+
{
|
| 11218 |
+
"epoch": 2.5105782792665727,
|
| 11219 |
+
"grad_norm": 13.0625,
|
| 11220 |
+
"learning_rate": 3.2641547451138426e-06,
|
| 11221 |
+
"loss": 0.8186,
|
| 11222 |
+
"mean_token_accuracy": 0.7962758064270019,
|
| 11223 |
+
"num_tokens": 13800349.0,
|
| 11224 |
+
"step": 12460
|
| 11225 |
+
},
|
| 11226 |
+
{
|
| 11227 |
+
"epoch": 2.512593189603063,
|
| 11228 |
+
"grad_norm": 10.5,
|
| 11229 |
+
"learning_rate": 3.2507220095372427e-06,
|
| 11230 |
+
"loss": 0.8279,
|
| 11231 |
+
"mean_token_accuracy": 0.7982799649238587,
|
| 11232 |
+
"num_tokens": 13810275.0,
|
| 11233 |
+
"step": 12470
|
| 11234 |
+
},
|
| 11235 |
+
{
|
| 11236 |
+
"epoch": 2.5146080999395526,
|
| 11237 |
+
"grad_norm": 12.875,
|
| 11238 |
+
"learning_rate": 3.237289273960642e-06,
|
| 11239 |
+
"loss": 0.7193,
|
| 11240 |
+
"mean_token_accuracy": 0.8151337385177613,
|
| 11241 |
+
"num_tokens": 13820122.0,
|
| 11242 |
+
"step": 12480
|
| 11243 |
+
},
|
| 11244 |
+
{
|
| 11245 |
+
"epoch": 2.5166230102760427,
|
| 11246 |
+
"grad_norm": 13.1875,
|
| 11247 |
+
"learning_rate": 3.223856538384042e-06,
|
| 11248 |
+
"loss": 0.8233,
|
| 11249 |
+
"mean_token_accuracy": 0.7997250974178314,
|
| 11250 |
+
"num_tokens": 13830787.0,
|
| 11251 |
+
"step": 12490
|
| 11252 |
+
},
|
| 11253 |
+
{
|
| 11254 |
+
"epoch": 2.518637920612533,
|
| 11255 |
+
"grad_norm": 12.75,
|
| 11256 |
+
"learning_rate": 3.2104238028074417e-06,
|
| 11257 |
+
"loss": 0.7905,
|
| 11258 |
+
"mean_token_accuracy": 0.8074711799621582,
|
| 11259 |
+
"num_tokens": 13840892.0,
|
| 11260 |
+
"step": 12500
|
| 11261 |
}
|
| 11262 |
],
|
| 11263 |
"logging_steps": 10,
|
|
|
|
| 11277 |
"attributes": {}
|
| 11278 |
}
|
| 11279 |
},
|
| 11280 |
+
"total_flos": 1.6741415131650048e+16,
|
| 11281 |
"train_batch_size": 8,
|
| 11282 |
"trial_name": null,
|
| 11283 |
"trial_params": null
|