Training in progress, step 11500, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 536223056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c78f458d11eae9e4154eb728cce06719d74e09c423918147e47d15f28937e92f
|
| 3 |
size 536223056
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1072594443
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:036c2c1f7cdbea44cbf7137c6b1c3cf16b5447a1c3d590934dbf649691bc4729
|
| 3 |
size 1072594443
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4f9a4928c3c29f8d8ffe6d8c80c93af4c98237f714bf32b55ba4f3d5d67a23da
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 2.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -9908,6 +9908,456 @@
|
|
| 9908 |
"mean_token_accuracy": 0.7765659749507904,
|
| 9909 |
"num_tokens": 12178091.0,
|
| 9910 |
"step": 11000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9911 |
}
|
| 9912 |
],
|
| 9913 |
"logging_steps": 10,
|
|
@@ -9927,7 +10377,7 @@
|
|
| 9927 |
"attributes": {}
|
| 9928 |
}
|
| 9929 |
},
|
| 9930 |
-
"total_flos": 1.
|
| 9931 |
"train_batch_size": 8,
|
| 9932 |
"trial_name": null,
|
| 9933 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.3171468869635303,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 11500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 9908 |
"mean_token_accuracy": 0.7765659749507904,
|
| 9909 |
"num_tokens": 12178091.0,
|
| 9910 |
"step": 11000
|
| 9911 |
+
},
|
| 9912 |
+
{
|
| 9913 |
+
"epoch": 2.2184162804755188,
|
| 9914 |
+
"grad_norm": 11.5625,
|
| 9915 |
+
"learning_rate": 5.211901403720868e-06,
|
| 9916 |
+
"loss": 0.8083,
|
| 9917 |
+
"mean_token_accuracy": 0.7975468277931214,
|
| 9918 |
+
"num_tokens": 12188970.0,
|
| 9919 |
+
"step": 11010
|
| 9920 |
+
},
|
| 9921 |
+
{
|
| 9922 |
+
"epoch": 2.220431190812009,
|
| 9923 |
+
"grad_norm": 10.0,
|
| 9924 |
+
"learning_rate": 5.198468668144267e-06,
|
| 9925 |
+
"loss": 0.7398,
|
| 9926 |
+
"mean_token_accuracy": 0.809011173248291,
|
| 9927 |
+
"num_tokens": 12201125.0,
|
| 9928 |
+
"step": 11020
|
| 9929 |
+
},
|
| 9930 |
+
{
|
| 9931 |
+
"epoch": 2.222446101148499,
|
| 9932 |
+
"grad_norm": 8.8125,
|
| 9933 |
+
"learning_rate": 5.185035932567668e-06,
|
| 9934 |
+
"loss": 0.8308,
|
| 9935 |
+
"mean_token_accuracy": 0.7948459804058075,
|
| 9936 |
+
"num_tokens": 12214789.0,
|
| 9937 |
+
"step": 11030
|
| 9938 |
+
},
|
| 9939 |
+
{
|
| 9940 |
+
"epoch": 2.2244610114849888,
|
| 9941 |
+
"grad_norm": 11.625,
|
| 9942 |
+
"learning_rate": 5.171603196991067e-06,
|
| 9943 |
+
"loss": 0.7774,
|
| 9944 |
+
"mean_token_accuracy": 0.8057547986507416,
|
| 9945 |
+
"num_tokens": 12224991.0,
|
| 9946 |
+
"step": 11040
|
| 9947 |
+
},
|
| 9948 |
+
{
|
| 9949 |
+
"epoch": 2.226475921821479,
|
| 9950 |
+
"grad_norm": 9.0,
|
| 9951 |
+
"learning_rate": 5.158170461414468e-06,
|
| 9952 |
+
"loss": 0.8747,
|
| 9953 |
+
"mean_token_accuracy": 0.7873781383037567,
|
| 9954 |
+
"num_tokens": 12235996.0,
|
| 9955 |
+
"step": 11050
|
| 9956 |
+
},
|
| 9957 |
+
{
|
| 9958 |
+
"epoch": 2.228490832157969,
|
| 9959 |
+
"grad_norm": 13.625,
|
| 9960 |
+
"learning_rate": 5.144737725837868e-06,
|
| 9961 |
+
"loss": 0.8058,
|
| 9962 |
+
"mean_token_accuracy": 0.800428307056427,
|
| 9963 |
+
"num_tokens": 12247704.0,
|
| 9964 |
+
"step": 11060
|
| 9965 |
+
},
|
| 9966 |
+
{
|
| 9967 |
+
"epoch": 2.2305057424944588,
|
| 9968 |
+
"grad_norm": 10.375,
|
| 9969 |
+
"learning_rate": 5.1313049902612665e-06,
|
| 9970 |
+
"loss": 0.8622,
|
| 9971 |
+
"mean_token_accuracy": 0.7865659236907959,
|
| 9972 |
+
"num_tokens": 12259569.0,
|
| 9973 |
+
"step": 11070
|
| 9974 |
+
},
|
| 9975 |
+
{
|
| 9976 |
+
"epoch": 2.232520652830949,
|
| 9977 |
+
"grad_norm": 14.0,
|
| 9978 |
+
"learning_rate": 5.117872254684667e-06,
|
| 9979 |
+
"loss": 0.7844,
|
| 9980 |
+
"mean_token_accuracy": 0.8082635223865509,
|
| 9981 |
+
"num_tokens": 12269981.0,
|
| 9982 |
+
"step": 11080
|
| 9983 |
+
},
|
| 9984 |
+
{
|
| 9985 |
+
"epoch": 2.234535563167439,
|
| 9986 |
+
"grad_norm": 12.1875,
|
| 9987 |
+
"learning_rate": 5.104439519108067e-06,
|
| 9988 |
+
"loss": 0.8389,
|
| 9989 |
+
"mean_token_accuracy": 0.7902609288692475,
|
| 9990 |
+
"num_tokens": 12281790.0,
|
| 9991 |
+
"step": 11090
|
| 9992 |
+
},
|
| 9993 |
+
{
|
| 9994 |
+
"epoch": 2.236550473503929,
|
| 9995 |
+
"grad_norm": 13.0625,
|
| 9996 |
+
"learning_rate": 5.091006783531467e-06,
|
| 9997 |
+
"loss": 0.7349,
|
| 9998 |
+
"mean_token_accuracy": 0.8109230279922486,
|
| 9999 |
+
"num_tokens": 12292218.0,
|
| 10000 |
+
"step": 11100
|
| 10001 |
+
},
|
| 10002 |
+
{
|
| 10003 |
+
"epoch": 2.238565383840419,
|
| 10004 |
+
"grad_norm": 11.8125,
|
| 10005 |
+
"learning_rate": 5.077574047954866e-06,
|
| 10006 |
+
"loss": 0.8892,
|
| 10007 |
+
"mean_token_accuracy": 0.7831051290035248,
|
| 10008 |
+
"num_tokens": 12303549.0,
|
| 10009 |
+
"step": 11110
|
| 10010 |
+
},
|
| 10011 |
+
{
|
| 10012 |
+
"epoch": 2.240580294176909,
|
| 10013 |
+
"grad_norm": 16.125,
|
| 10014 |
+
"learning_rate": 5.0641413123782666e-06,
|
| 10015 |
+
"loss": 0.8981,
|
| 10016 |
+
"mean_token_accuracy": 0.779134213924408,
|
| 10017 |
+
"num_tokens": 12314039.0,
|
| 10018 |
+
"step": 11120
|
| 10019 |
+
},
|
| 10020 |
+
{
|
| 10021 |
+
"epoch": 2.242595204513399,
|
| 10022 |
+
"grad_norm": 15.1875,
|
| 10023 |
+
"learning_rate": 5.050708576801666e-06,
|
| 10024 |
+
"loss": 0.8596,
|
| 10025 |
+
"mean_token_accuracy": 0.795196259021759,
|
| 10026 |
+
"num_tokens": 12323912.0,
|
| 10027 |
+
"step": 11130
|
| 10028 |
+
},
|
| 10029 |
+
{
|
| 10030 |
+
"epoch": 2.2446101148498894,
|
| 10031 |
+
"grad_norm": 12.75,
|
| 10032 |
+
"learning_rate": 5.037275841225066e-06,
|
| 10033 |
+
"loss": 0.8716,
|
| 10034 |
+
"mean_token_accuracy": 0.7830780863761901,
|
| 10035 |
+
"num_tokens": 12335963.0,
|
| 10036 |
+
"step": 11140
|
| 10037 |
+
},
|
| 10038 |
+
{
|
| 10039 |
+
"epoch": 2.246625025186379,
|
| 10040 |
+
"grad_norm": 9.6875,
|
| 10041 |
+
"learning_rate": 5.023843105648466e-06,
|
| 10042 |
+
"loss": 0.7997,
|
| 10043 |
+
"mean_token_accuracy": 0.7968696773052215,
|
| 10044 |
+
"num_tokens": 12347034.0,
|
| 10045 |
+
"step": 11150
|
| 10046 |
+
},
|
| 10047 |
+
{
|
| 10048 |
+
"epoch": 2.2486399355228692,
|
| 10049 |
+
"grad_norm": 12.6875,
|
| 10050 |
+
"learning_rate": 5.010410370071865e-06,
|
| 10051 |
+
"loss": 0.8812,
|
| 10052 |
+
"mean_token_accuracy": 0.7810611367225647,
|
| 10053 |
+
"num_tokens": 12359524.0,
|
| 10054 |
+
"step": 11160
|
| 10055 |
+
},
|
| 10056 |
+
{
|
| 10057 |
+
"epoch": 2.2506548458593594,
|
| 10058 |
+
"grad_norm": 11.3125,
|
| 10059 |
+
"learning_rate": 4.996977634495265e-06,
|
| 10060 |
+
"loss": 0.8117,
|
| 10061 |
+
"mean_token_accuracy": 0.8003436684608459,
|
| 10062 |
+
"num_tokens": 12369580.0,
|
| 10063 |
+
"step": 11170
|
| 10064 |
+
},
|
| 10065 |
+
{
|
| 10066 |
+
"epoch": 2.252669756195849,
|
| 10067 |
+
"grad_norm": 13.3125,
|
| 10068 |
+
"learning_rate": 4.9835448989186655e-06,
|
| 10069 |
+
"loss": 0.8,
|
| 10070 |
+
"mean_token_accuracy": 0.7997420608997345,
|
| 10071 |
+
"num_tokens": 12380449.0,
|
| 10072 |
+
"step": 11180
|
| 10073 |
+
},
|
| 10074 |
+
{
|
| 10075 |
+
"epoch": 2.2546846665323392,
|
| 10076 |
+
"grad_norm": 11.5625,
|
| 10077 |
+
"learning_rate": 4.970112163342065e-06,
|
| 10078 |
+
"loss": 0.7495,
|
| 10079 |
+
"mean_token_accuracy": 0.812464052438736,
|
| 10080 |
+
"num_tokens": 12391160.0,
|
| 10081 |
+
"step": 11190
|
| 10082 |
+
},
|
| 10083 |
+
{
|
| 10084 |
+
"epoch": 2.2566995768688294,
|
| 10085 |
+
"grad_norm": 10.75,
|
| 10086 |
+
"learning_rate": 4.956679427765465e-06,
|
| 10087 |
+
"loss": 0.8713,
|
| 10088 |
+
"mean_token_accuracy": 0.7861813962459564,
|
| 10089 |
+
"num_tokens": 12403496.0,
|
| 10090 |
+
"step": 11200
|
| 10091 |
+
},
|
| 10092 |
+
{
|
| 10093 |
+
"epoch": 2.2587144872053195,
|
| 10094 |
+
"grad_norm": 12.4375,
|
| 10095 |
+
"learning_rate": 4.9432466921888646e-06,
|
| 10096 |
+
"loss": 0.7124,
|
| 10097 |
+
"mean_token_accuracy": 0.8236021995544434,
|
| 10098 |
+
"num_tokens": 12414075.0,
|
| 10099 |
+
"step": 11210
|
| 10100 |
+
},
|
| 10101 |
+
{
|
| 10102 |
+
"epoch": 2.2607293975418092,
|
| 10103 |
+
"grad_norm": 12.1875,
|
| 10104 |
+
"learning_rate": 4.929813956612264e-06,
|
| 10105 |
+
"loss": 0.82,
|
| 10106 |
+
"mean_token_accuracy": 0.7931098341941833,
|
| 10107 |
+
"num_tokens": 12424499.0,
|
| 10108 |
+
"step": 11220
|
| 10109 |
+
},
|
| 10110 |
+
{
|
| 10111 |
+
"epoch": 2.2627443078782994,
|
| 10112 |
+
"grad_norm": 12.25,
|
| 10113 |
+
"learning_rate": 4.916381221035664e-06,
|
| 10114 |
+
"loss": 0.7704,
|
| 10115 |
+
"mean_token_accuracy": 0.8014878571033478,
|
| 10116 |
+
"num_tokens": 12435957.0,
|
| 10117 |
+
"step": 11230
|
| 10118 |
+
},
|
| 10119 |
+
{
|
| 10120 |
+
"epoch": 2.2647592182147895,
|
| 10121 |
+
"grad_norm": 12.3125,
|
| 10122 |
+
"learning_rate": 4.9029484854590644e-06,
|
| 10123 |
+
"loss": 0.8023,
|
| 10124 |
+
"mean_token_accuracy": 0.798302048444748,
|
| 10125 |
+
"num_tokens": 12447051.0,
|
| 10126 |
+
"step": 11240
|
| 10127 |
+
},
|
| 10128 |
+
{
|
| 10129 |
+
"epoch": 2.2667741285512797,
|
| 10130 |
+
"grad_norm": 11.0,
|
| 10131 |
+
"learning_rate": 4.889515749882464e-06,
|
| 10132 |
+
"loss": 0.8716,
|
| 10133 |
+
"mean_token_accuracy": 0.7819954872131347,
|
| 10134 |
+
"num_tokens": 12458100.0,
|
| 10135 |
+
"step": 11250
|
| 10136 |
+
},
|
| 10137 |
+
{
|
| 10138 |
+
"epoch": 2.2687890388877694,
|
| 10139 |
+
"grad_norm": 11.0,
|
| 10140 |
+
"learning_rate": 4.876083014305864e-06,
|
| 10141 |
+
"loss": 0.767,
|
| 10142 |
+
"mean_token_accuracy": 0.8059248864650727,
|
| 10143 |
+
"num_tokens": 12469697.0,
|
| 10144 |
+
"step": 11260
|
| 10145 |
+
},
|
| 10146 |
+
{
|
| 10147 |
+
"epoch": 2.2708039492242595,
|
| 10148 |
+
"grad_norm": 10.625,
|
| 10149 |
+
"learning_rate": 4.8626502787292635e-06,
|
| 10150 |
+
"loss": 0.7644,
|
| 10151 |
+
"mean_token_accuracy": 0.8037352323532104,
|
| 10152 |
+
"num_tokens": 12482462.0,
|
| 10153 |
+
"step": 11270
|
| 10154 |
+
},
|
| 10155 |
+
{
|
| 10156 |
+
"epoch": 2.2728188595607497,
|
| 10157 |
+
"grad_norm": 13.9375,
|
| 10158 |
+
"learning_rate": 4.849217543152663e-06,
|
| 10159 |
+
"loss": 0.8345,
|
| 10160 |
+
"mean_token_accuracy": 0.7897806167602539,
|
| 10161 |
+
"num_tokens": 12494660.0,
|
| 10162 |
+
"step": 11280
|
| 10163 |
+
},
|
| 10164 |
+
{
|
| 10165 |
+
"epoch": 2.2748337698972394,
|
| 10166 |
+
"grad_norm": 10.875,
|
| 10167 |
+
"learning_rate": 4.835784807576064e-06,
|
| 10168 |
+
"loss": 0.7907,
|
| 10169 |
+
"mean_token_accuracy": 0.8008930742740631,
|
| 10170 |
+
"num_tokens": 12505534.0,
|
| 10171 |
+
"step": 11290
|
| 10172 |
+
},
|
| 10173 |
+
{
|
| 10174 |
+
"epoch": 2.2768486802337295,
|
| 10175 |
+
"grad_norm": 12.3125,
|
| 10176 |
+
"learning_rate": 4.822352071999463e-06,
|
| 10177 |
+
"loss": 0.8689,
|
| 10178 |
+
"mean_token_accuracy": 0.789547073841095,
|
| 10179 |
+
"num_tokens": 12516343.0,
|
| 10180 |
+
"step": 11300
|
| 10181 |
+
},
|
| 10182 |
+
{
|
| 10183 |
+
"epoch": 2.2788635905702197,
|
| 10184 |
+
"grad_norm": 12.5625,
|
| 10185 |
+
"learning_rate": 4.808919336422863e-06,
|
| 10186 |
+
"loss": 0.8781,
|
| 10187 |
+
"mean_token_accuracy": 0.7866406381130219,
|
| 10188 |
+
"num_tokens": 12527325.0,
|
| 10189 |
+
"step": 11310
|
| 10190 |
+
},
|
| 10191 |
+
{
|
| 10192 |
+
"epoch": 2.2808785009067094,
|
| 10193 |
+
"grad_norm": 12.5,
|
| 10194 |
+
"learning_rate": 4.795486600846263e-06,
|
| 10195 |
+
"loss": 0.7903,
|
| 10196 |
+
"mean_token_accuracy": 0.8062444806098938,
|
| 10197 |
+
"num_tokens": 12538802.0,
|
| 10198 |
+
"step": 11320
|
| 10199 |
+
},
|
| 10200 |
+
{
|
| 10201 |
+
"epoch": 2.2828934112431996,
|
| 10202 |
+
"grad_norm": 13.3125,
|
| 10203 |
+
"learning_rate": 4.7820538652696624e-06,
|
| 10204 |
+
"loss": 0.7503,
|
| 10205 |
+
"mean_token_accuracy": 0.8119116723537445,
|
| 10206 |
+
"num_tokens": 12549546.0,
|
| 10207 |
+
"step": 11330
|
| 10208 |
+
},
|
| 10209 |
+
{
|
| 10210 |
+
"epoch": 2.2849083215796897,
|
| 10211 |
+
"grad_norm": 14.125,
|
| 10212 |
+
"learning_rate": 4.768621129693062e-06,
|
| 10213 |
+
"loss": 0.8099,
|
| 10214 |
+
"mean_token_accuracy": 0.8033313393592835,
|
| 10215 |
+
"num_tokens": 12560090.0,
|
| 10216 |
+
"step": 11340
|
| 10217 |
+
},
|
| 10218 |
+
{
|
| 10219 |
+
"epoch": 2.28692323191618,
|
| 10220 |
+
"grad_norm": 13.0625,
|
| 10221 |
+
"learning_rate": 4.755188394116463e-06,
|
| 10222 |
+
"loss": 0.9013,
|
| 10223 |
+
"mean_token_accuracy": 0.7799701750278473,
|
| 10224 |
+
"num_tokens": 12571882.0,
|
| 10225 |
+
"step": 11350
|
| 10226 |
+
},
|
| 10227 |
+
{
|
| 10228 |
+
"epoch": 2.28893814225267,
|
| 10229 |
+
"grad_norm": 11.5,
|
| 10230 |
+
"learning_rate": 4.741755658539862e-06,
|
| 10231 |
+
"loss": 0.8159,
|
| 10232 |
+
"mean_token_accuracy": 0.7954154729843139,
|
| 10233 |
+
"num_tokens": 12583570.0,
|
| 10234 |
+
"step": 11360
|
| 10235 |
+
},
|
| 10236 |
+
{
|
| 10237 |
+
"epoch": 2.2909530525891597,
|
| 10238 |
+
"grad_norm": 13.8125,
|
| 10239 |
+
"learning_rate": 4.728322922963262e-06,
|
| 10240 |
+
"loss": 0.7928,
|
| 10241 |
+
"mean_token_accuracy": 0.800947493314743,
|
| 10242 |
+
"num_tokens": 12594636.0,
|
| 10243 |
+
"step": 11370
|
| 10244 |
+
},
|
| 10245 |
+
{
|
| 10246 |
+
"epoch": 2.29296796292565,
|
| 10247 |
+
"grad_norm": 13.4375,
|
| 10248 |
+
"learning_rate": 4.714890187386662e-06,
|
| 10249 |
+
"loss": 0.7321,
|
| 10250 |
+
"mean_token_accuracy": 0.813873153924942,
|
| 10251 |
+
"num_tokens": 12605481.0,
|
| 10252 |
+
"step": 11380
|
| 10253 |
+
},
|
| 10254 |
+
{
|
| 10255 |
+
"epoch": 2.29498287326214,
|
| 10256 |
+
"grad_norm": 8.125,
|
| 10257 |
+
"learning_rate": 4.701457451810061e-06,
|
| 10258 |
+
"loss": 0.7956,
|
| 10259 |
+
"mean_token_accuracy": 0.8005879402160645,
|
| 10260 |
+
"num_tokens": 12616477.0,
|
| 10261 |
+
"step": 11390
|
| 10262 |
+
},
|
| 10263 |
+
{
|
| 10264 |
+
"epoch": 2.2969977835986297,
|
| 10265 |
+
"grad_norm": 12.5,
|
| 10266 |
+
"learning_rate": 4.688024716233461e-06,
|
| 10267 |
+
"loss": 0.8169,
|
| 10268 |
+
"mean_token_accuracy": 0.7956750094890594,
|
| 10269 |
+
"num_tokens": 12627070.0,
|
| 10270 |
+
"step": 11400
|
| 10271 |
+
},
|
| 10272 |
+
{
|
| 10273 |
+
"epoch": 2.29901269393512,
|
| 10274 |
+
"grad_norm": 12.9375,
|
| 10275 |
+
"learning_rate": 4.674591980656862e-06,
|
| 10276 |
+
"loss": 0.8507,
|
| 10277 |
+
"mean_token_accuracy": 0.7936709105968476,
|
| 10278 |
+
"num_tokens": 12639024.0,
|
| 10279 |
+
"step": 11410
|
| 10280 |
+
},
|
| 10281 |
+
{
|
| 10282 |
+
"epoch": 2.30102760427161,
|
| 10283 |
+
"grad_norm": 11.0625,
|
| 10284 |
+
"learning_rate": 4.661159245080261e-06,
|
| 10285 |
+
"loss": 0.7626,
|
| 10286 |
+
"mean_token_accuracy": 0.8066515803337098,
|
| 10287 |
+
"num_tokens": 12649628.0,
|
| 10288 |
+
"step": 11420
|
| 10289 |
+
},
|
| 10290 |
+
{
|
| 10291 |
+
"epoch": 2.3030425146080997,
|
| 10292 |
+
"grad_norm": 13.5625,
|
| 10293 |
+
"learning_rate": 4.647726509503661e-06,
|
| 10294 |
+
"loss": 0.829,
|
| 10295 |
+
"mean_token_accuracy": 0.7963403999805451,
|
| 10296 |
+
"num_tokens": 12660338.0,
|
| 10297 |
+
"step": 11430
|
| 10298 |
+
},
|
| 10299 |
+
{
|
| 10300 |
+
"epoch": 2.30505742494459,
|
| 10301 |
+
"grad_norm": 9.9375,
|
| 10302 |
+
"learning_rate": 4.634293773927061e-06,
|
| 10303 |
+
"loss": 0.8586,
|
| 10304 |
+
"mean_token_accuracy": 0.7866991460323334,
|
| 10305 |
+
"num_tokens": 12672143.0,
|
| 10306 |
+
"step": 11440
|
| 10307 |
+
},
|
| 10308 |
+
{
|
| 10309 |
+
"epoch": 2.30707233528108,
|
| 10310 |
+
"grad_norm": 9.9375,
|
| 10311 |
+
"learning_rate": 4.62086103835046e-06,
|
| 10312 |
+
"loss": 0.7363,
|
| 10313 |
+
"mean_token_accuracy": 0.8156927347183227,
|
| 10314 |
+
"num_tokens": 12681970.0,
|
| 10315 |
+
"step": 11450
|
| 10316 |
+
},
|
| 10317 |
+
{
|
| 10318 |
+
"epoch": 2.30908724561757,
|
| 10319 |
+
"grad_norm": 11.5,
|
| 10320 |
+
"learning_rate": 4.60742830277386e-06,
|
| 10321 |
+
"loss": 0.7685,
|
| 10322 |
+
"mean_token_accuracy": 0.8116656005382538,
|
| 10323 |
+
"num_tokens": 12692199.0,
|
| 10324 |
+
"step": 11460
|
| 10325 |
+
},
|
| 10326 |
+
{
|
| 10327 |
+
"epoch": 2.31110215595406,
|
| 10328 |
+
"grad_norm": 11.3125,
|
| 10329 |
+
"learning_rate": 4.5939955671972605e-06,
|
| 10330 |
+
"loss": 0.7485,
|
| 10331 |
+
"mean_token_accuracy": 0.8076441287994385,
|
| 10332 |
+
"num_tokens": 12702476.0,
|
| 10333 |
+
"step": 11470
|
| 10334 |
+
},
|
| 10335 |
+
{
|
| 10336 |
+
"epoch": 2.31311706629055,
|
| 10337 |
+
"grad_norm": 12.5625,
|
| 10338 |
+
"learning_rate": 4.58056283162066e-06,
|
| 10339 |
+
"loss": 0.7926,
|
| 10340 |
+
"mean_token_accuracy": 0.8072655260562897,
|
| 10341 |
+
"num_tokens": 12712782.0,
|
| 10342 |
+
"step": 11480
|
| 10343 |
+
},
|
| 10344 |
+
{
|
| 10345 |
+
"epoch": 2.31513197662704,
|
| 10346 |
+
"grad_norm": 15.9375,
|
| 10347 |
+
"learning_rate": 4.56713009604406e-06,
|
| 10348 |
+
"loss": 0.8342,
|
| 10349 |
+
"mean_token_accuracy": 0.7965205907821655,
|
| 10350 |
+
"num_tokens": 12723427.0,
|
| 10351 |
+
"step": 11490
|
| 10352 |
+
},
|
| 10353 |
+
{
|
| 10354 |
+
"epoch": 2.3171468869635303,
|
| 10355 |
+
"grad_norm": 11.5625,
|
| 10356 |
+
"learning_rate": 4.5536973604674596e-06,
|
| 10357 |
+
"loss": 0.7846,
|
| 10358 |
+
"mean_token_accuracy": 0.8073502600193023,
|
| 10359 |
+
"num_tokens": 12733862.0,
|
| 10360 |
+
"step": 11500
|
| 10361 |
}
|
| 10362 |
],
|
| 10363 |
"logging_steps": 10,
|
|
|
|
| 10377 |
"attributes": {}
|
| 10378 |
}
|
| 10379 |
},
|
| 10380 |
+
"total_flos": 1.5401013006618624e+16,
|
| 10381 |
"train_batch_size": 8,
|
| 10382 |
"trial_name": null,
|
| 10383 |
"trial_params": null
|