Training in progress, step 14889, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 536223056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:abfb7980299d9a0833e40cfa75a4e071101b9b5dbcb4b7b8be67cc1f7a5b1358
|
| 3 |
size 536223056
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1072594443
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d02f233cb73ec902ca0b622f60572ba5696796aa69c3f044f06782367911a3f9
|
| 3 |
size 1072594443
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0d3ef4695d48aabb51830d7d806ccbb8d1a7c1dd1163d43a57a82226f9575540
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -13058,6 +13058,348 @@
|
|
| 13058 |
"mean_token_accuracy": 0.7894056618213654,
|
| 13059 |
"num_tokens": 16065206.0,
|
| 13060 |
"step": 14500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13061 |
}
|
| 13062 |
],
|
| 13063 |
"logging_steps": 10,
|
|
@@ -13072,12 +13414,12 @@
|
|
| 13072 |
"should_evaluate": false,
|
| 13073 |
"should_log": false,
|
| 13074 |
"should_save": true,
|
| 13075 |
-
"should_training_stop":
|
| 13076 |
},
|
| 13077 |
"attributes": {}
|
| 13078 |
}
|
| 13079 |
},
|
| 13080 |
-
"total_flos": 1.
|
| 13081 |
"train_batch_size": 8,
|
| 13082 |
"trial_name": null,
|
| 13083 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.0,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 14889,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 13058 |
"mean_token_accuracy": 0.7894056618213654,
|
| 13059 |
"num_tokens": 16065206.0,
|
| 13060 |
"step": 14500
|
| 13061 |
+
},
|
| 13062 |
+
{
|
| 13063 |
+
"epoch": 2.923634898247028,
|
| 13064 |
+
"grad_norm": 10.6875,
|
| 13065 |
+
"learning_rate": 5.104439519108067e-07,
|
| 13066 |
+
"loss": 0.8542,
|
| 13067 |
+
"mean_token_accuracy": 0.7932229697704315,
|
| 13068 |
+
"num_tokens": 16076636.0,
|
| 13069 |
+
"step": 14510
|
| 13070 |
+
},
|
| 13071 |
+
{
|
| 13072 |
+
"epoch": 2.9256498085835183,
|
| 13073 |
+
"grad_norm": 10.6875,
|
| 13074 |
+
"learning_rate": 4.970112163342065e-07,
|
| 13075 |
+
"loss": 0.8027,
|
| 13076 |
+
"mean_token_accuracy": 0.7981291949748993,
|
| 13077 |
+
"num_tokens": 16088110.0,
|
| 13078 |
+
"step": 14520
|
| 13079 |
+
},
|
| 13080 |
+
{
|
| 13081 |
+
"epoch": 2.927664718920008,
|
| 13082 |
+
"grad_norm": 13.9375,
|
| 13083 |
+
"learning_rate": 4.835784807576064e-07,
|
| 13084 |
+
"loss": 0.8503,
|
| 13085 |
+
"mean_token_accuracy": 0.7925727784633636,
|
| 13086 |
+
"num_tokens": 16100038.0,
|
| 13087 |
+
"step": 14530
|
| 13088 |
+
},
|
| 13089 |
+
{
|
| 13090 |
+
"epoch": 2.929679629256498,
|
| 13091 |
+
"grad_norm": 12.25,
|
| 13092 |
+
"learning_rate": 4.7014574518100616e-07,
|
| 13093 |
+
"loss": 0.908,
|
| 13094 |
+
"mean_token_accuracy": 0.787571269273758,
|
| 13095 |
+
"num_tokens": 16110204.0,
|
| 13096 |
+
"step": 14540
|
| 13097 |
+
},
|
| 13098 |
+
{
|
| 13099 |
+
"epoch": 2.9316945395929883,
|
| 13100 |
+
"grad_norm": 10.4375,
|
| 13101 |
+
"learning_rate": 4.5671300960440595e-07,
|
| 13102 |
+
"loss": 0.8271,
|
| 13103 |
+
"mean_token_accuracy": 0.7935189664363861,
|
| 13104 |
+
"num_tokens": 16122082.0,
|
| 13105 |
+
"step": 14550
|
| 13106 |
+
},
|
| 13107 |
+
{
|
| 13108 |
+
"epoch": 2.933709449929478,
|
| 13109 |
+
"grad_norm": 10.875,
|
| 13110 |
+
"learning_rate": 4.4328027402780584e-07,
|
| 13111 |
+
"loss": 0.7881,
|
| 13112 |
+
"mean_token_accuracy": 0.8038492739200592,
|
| 13113 |
+
"num_tokens": 16133321.0,
|
| 13114 |
+
"step": 14560
|
| 13115 |
+
},
|
| 13116 |
+
{
|
| 13117 |
+
"epoch": 2.935724360265968,
|
| 13118 |
+
"grad_norm": 9.875,
|
| 13119 |
+
"learning_rate": 4.2984753845120563e-07,
|
| 13120 |
+
"loss": 0.8147,
|
| 13121 |
+
"mean_token_accuracy": 0.7948280692100524,
|
| 13122 |
+
"num_tokens": 16145276.0,
|
| 13123 |
+
"step": 14570
|
| 13124 |
+
},
|
| 13125 |
+
{
|
| 13126 |
+
"epoch": 2.9377392706024583,
|
| 13127 |
+
"grad_norm": 13.4375,
|
| 13128 |
+
"learning_rate": 4.1641480287460547e-07,
|
| 13129 |
+
"loss": 0.7914,
|
| 13130 |
+
"mean_token_accuracy": 0.8023073971271515,
|
| 13131 |
+
"num_tokens": 16156515.0,
|
| 13132 |
+
"step": 14580
|
| 13133 |
+
},
|
| 13134 |
+
{
|
| 13135 |
+
"epoch": 2.939754180938948,
|
| 13136 |
+
"grad_norm": 13.5,
|
| 13137 |
+
"learning_rate": 4.0298206729800526e-07,
|
| 13138 |
+
"loss": 0.7906,
|
| 13139 |
+
"mean_token_accuracy": 0.7989113509654999,
|
| 13140 |
+
"num_tokens": 16168451.0,
|
| 13141 |
+
"step": 14590
|
| 13142 |
+
},
|
| 13143 |
+
{
|
| 13144 |
+
"epoch": 2.941769091275438,
|
| 13145 |
+
"grad_norm": 9.4375,
|
| 13146 |
+
"learning_rate": 3.895493317214051e-07,
|
| 13147 |
+
"loss": 0.7716,
|
| 13148 |
+
"mean_token_accuracy": 0.8048623919486999,
|
| 13149 |
+
"num_tokens": 16180616.0,
|
| 13150 |
+
"step": 14600
|
| 13151 |
+
},
|
| 13152 |
+
{
|
| 13153 |
+
"epoch": 2.9437840016119283,
|
| 13154 |
+
"grad_norm": 13.1875,
|
| 13155 |
+
"learning_rate": 3.761165961448049e-07,
|
| 13156 |
+
"loss": 0.7059,
|
| 13157 |
+
"mean_token_accuracy": 0.8211403012275695,
|
| 13158 |
+
"num_tokens": 16190863.0,
|
| 13159 |
+
"step": 14610
|
| 13160 |
+
},
|
| 13161 |
+
{
|
| 13162 |
+
"epoch": 2.945798911948418,
|
| 13163 |
+
"grad_norm": 9.6875,
|
| 13164 |
+
"learning_rate": 3.626838605682047e-07,
|
| 13165 |
+
"loss": 0.7256,
|
| 13166 |
+
"mean_token_accuracy": 0.8117915868759156,
|
| 13167 |
+
"num_tokens": 16201924.0,
|
| 13168 |
+
"step": 14620
|
| 13169 |
+
},
|
| 13170 |
+
{
|
| 13171 |
+
"epoch": 2.947813822284908,
|
| 13172 |
+
"grad_norm": 12.5625,
|
| 13173 |
+
"learning_rate": 3.492511249916046e-07,
|
| 13174 |
+
"loss": 0.8323,
|
| 13175 |
+
"mean_token_accuracy": 0.7886347115039826,
|
| 13176 |
+
"num_tokens": 16212994.0,
|
| 13177 |
+
"step": 14630
|
| 13178 |
+
},
|
| 13179 |
+
{
|
| 13180 |
+
"epoch": 2.9498287326213983,
|
| 13181 |
+
"grad_norm": 12.6875,
|
| 13182 |
+
"learning_rate": 3.358183894150044e-07,
|
| 13183 |
+
"loss": 0.8058,
|
| 13184 |
+
"mean_token_accuracy": 0.7921592950820923,
|
| 13185 |
+
"num_tokens": 16223545.0,
|
| 13186 |
+
"step": 14640
|
| 13187 |
+
},
|
| 13188 |
+
{
|
| 13189 |
+
"epoch": 2.9518436429578885,
|
| 13190 |
+
"grad_norm": 10.625,
|
| 13191 |
+
"learning_rate": 3.2238565383840425e-07,
|
| 13192 |
+
"loss": 0.7639,
|
| 13193 |
+
"mean_token_accuracy": 0.8088575303554535,
|
| 13194 |
+
"num_tokens": 16234810.0,
|
| 13195 |
+
"step": 14650
|
| 13196 |
+
},
|
| 13197 |
+
{
|
| 13198 |
+
"epoch": 2.9538585532943786,
|
| 13199 |
+
"grad_norm": 12.9375,
|
| 13200 |
+
"learning_rate": 3.0895291826180403e-07,
|
| 13201 |
+
"loss": 0.9168,
|
| 13202 |
+
"mean_token_accuracy": 0.7773300051689148,
|
| 13203 |
+
"num_tokens": 16246077.0,
|
| 13204 |
+
"step": 14660
|
| 13205 |
+
},
|
| 13206 |
+
{
|
| 13207 |
+
"epoch": 2.9558734636308683,
|
| 13208 |
+
"grad_norm": 13.875,
|
| 13209 |
+
"learning_rate": 2.955201826852039e-07,
|
| 13210 |
+
"loss": 0.8155,
|
| 13211 |
+
"mean_token_accuracy": 0.7995950043201446,
|
| 13212 |
+
"num_tokens": 16256503.0,
|
| 13213 |
+
"step": 14670
|
| 13214 |
+
},
|
| 13215 |
+
{
|
| 13216 |
+
"epoch": 2.9578883739673585,
|
| 13217 |
+
"grad_norm": 13.6875,
|
| 13218 |
+
"learning_rate": 2.820874471086037e-07,
|
| 13219 |
+
"loss": 0.8045,
|
| 13220 |
+
"mean_token_accuracy": 0.8015713572502137,
|
| 13221 |
+
"num_tokens": 16266819.0,
|
| 13222 |
+
"step": 14680
|
| 13223 |
+
},
|
| 13224 |
+
{
|
| 13225 |
+
"epoch": 2.9599032843038486,
|
| 13226 |
+
"grad_norm": 12.0625,
|
| 13227 |
+
"learning_rate": 2.686547115320035e-07,
|
| 13228 |
+
"loss": 0.9132,
|
| 13229 |
+
"mean_token_accuracy": 0.7834485352039338,
|
| 13230 |
+
"num_tokens": 16278113.0,
|
| 13231 |
+
"step": 14690
|
| 13232 |
+
},
|
| 13233 |
+
{
|
| 13234 |
+
"epoch": 2.9619181946403383,
|
| 13235 |
+
"grad_norm": 13.3125,
|
| 13236 |
+
"learning_rate": 2.5522197595540334e-07,
|
| 13237 |
+
"loss": 0.7371,
|
| 13238 |
+
"mean_token_accuracy": 0.8118620038032531,
|
| 13239 |
+
"num_tokens": 16288705.0,
|
| 13240 |
+
"step": 14700
|
| 13241 |
+
},
|
| 13242 |
+
{
|
| 13243 |
+
"epoch": 2.9639331049768285,
|
| 13244 |
+
"grad_norm": 13.125,
|
| 13245 |
+
"learning_rate": 2.417892403788032e-07,
|
| 13246 |
+
"loss": 0.8454,
|
| 13247 |
+
"mean_token_accuracy": 0.7928309857845306,
|
| 13248 |
+
"num_tokens": 16299215.0,
|
| 13249 |
+
"step": 14710
|
| 13250 |
+
},
|
| 13251 |
+
{
|
| 13252 |
+
"epoch": 2.9659480153133186,
|
| 13253 |
+
"grad_norm": 11.125,
|
| 13254 |
+
"learning_rate": 2.2835650480220297e-07,
|
| 13255 |
+
"loss": 0.7582,
|
| 13256 |
+
"mean_token_accuracy": 0.814406418800354,
|
| 13257 |
+
"num_tokens": 16309978.0,
|
| 13258 |
+
"step": 14720
|
| 13259 |
+
},
|
| 13260 |
+
{
|
| 13261 |
+
"epoch": 2.9679629256498083,
|
| 13262 |
+
"grad_norm": 13.25,
|
| 13263 |
+
"learning_rate": 2.1492376922560281e-07,
|
| 13264 |
+
"loss": 0.7703,
|
| 13265 |
+
"mean_token_accuracy": 0.8121409773826599,
|
| 13266 |
+
"num_tokens": 16320485.0,
|
| 13267 |
+
"step": 14730
|
| 13268 |
+
},
|
| 13269 |
+
{
|
| 13270 |
+
"epoch": 2.9699778359862985,
|
| 13271 |
+
"grad_norm": 11.1875,
|
| 13272 |
+
"learning_rate": 2.0149103364900263e-07,
|
| 13273 |
+
"loss": 0.7339,
|
| 13274 |
+
"mean_token_accuracy": 0.8153697431087494,
|
| 13275 |
+
"num_tokens": 16332678.0,
|
| 13276 |
+
"step": 14740
|
| 13277 |
+
},
|
| 13278 |
+
{
|
| 13279 |
+
"epoch": 2.9719927463227886,
|
| 13280 |
+
"grad_norm": 11.0625,
|
| 13281 |
+
"learning_rate": 1.8805829807240244e-07,
|
| 13282 |
+
"loss": 0.8436,
|
| 13283 |
+
"mean_token_accuracy": 0.7889176428318023,
|
| 13284 |
+
"num_tokens": 16345494.0,
|
| 13285 |
+
"step": 14750
|
| 13286 |
+
},
|
| 13287 |
+
{
|
| 13288 |
+
"epoch": 2.974007656659279,
|
| 13289 |
+
"grad_norm": 10.4375,
|
| 13290 |
+
"learning_rate": 1.746255624958023e-07,
|
| 13291 |
+
"loss": 0.788,
|
| 13292 |
+
"mean_token_accuracy": 0.8068629801273346,
|
| 13293 |
+
"num_tokens": 16356280.0,
|
| 13294 |
+
"step": 14760
|
| 13295 |
+
},
|
| 13296 |
+
{
|
| 13297 |
+
"epoch": 2.976022566995769,
|
| 13298 |
+
"grad_norm": 10.6875,
|
| 13299 |
+
"learning_rate": 1.6119282691920212e-07,
|
| 13300 |
+
"loss": 0.9647,
|
| 13301 |
+
"mean_token_accuracy": 0.7653753876686096,
|
| 13302 |
+
"num_tokens": 16367963.0,
|
| 13303 |
+
"step": 14770
|
| 13304 |
+
},
|
| 13305 |
+
{
|
| 13306 |
+
"epoch": 2.9780374773322587,
|
| 13307 |
+
"grad_norm": 11.3125,
|
| 13308 |
+
"learning_rate": 1.4776009134260194e-07,
|
| 13309 |
+
"loss": 0.8052,
|
| 13310 |
+
"mean_token_accuracy": 0.7993070542812347,
|
| 13311 |
+
"num_tokens": 16378573.0,
|
| 13312 |
+
"step": 14780
|
| 13313 |
+
},
|
| 13314 |
+
{
|
| 13315 |
+
"epoch": 2.980052387668749,
|
| 13316 |
+
"grad_norm": 11.75,
|
| 13317 |
+
"learning_rate": 1.3432735576600175e-07,
|
| 13318 |
+
"loss": 0.7878,
|
| 13319 |
+
"mean_token_accuracy": 0.8004900455474854,
|
| 13320 |
+
"num_tokens": 16389458.0,
|
| 13321 |
+
"step": 14790
|
| 13322 |
+
},
|
| 13323 |
+
{
|
| 13324 |
+
"epoch": 2.982067298005239,
|
| 13325 |
+
"grad_norm": 11.375,
|
| 13326 |
+
"learning_rate": 1.208946201894016e-07,
|
| 13327 |
+
"loss": 0.8526,
|
| 13328 |
+
"mean_token_accuracy": 0.7890514850616455,
|
| 13329 |
+
"num_tokens": 16400816.0,
|
| 13330 |
+
"step": 14800
|
| 13331 |
+
},
|
| 13332 |
+
{
|
| 13333 |
+
"epoch": 2.9840822083417287,
|
| 13334 |
+
"grad_norm": 12.625,
|
| 13335 |
+
"learning_rate": 1.0746188461280141e-07,
|
| 13336 |
+
"loss": 0.7664,
|
| 13337 |
+
"mean_token_accuracy": 0.8109397828578949,
|
| 13338 |
+
"num_tokens": 16410740.0,
|
| 13339 |
+
"step": 14810
|
| 13340 |
+
},
|
| 13341 |
+
{
|
| 13342 |
+
"epoch": 2.986097118678219,
|
| 13343 |
+
"grad_norm": 12.6875,
|
| 13344 |
+
"learning_rate": 9.402914903620122e-08,
|
| 13345 |
+
"loss": 0.8027,
|
| 13346 |
+
"mean_token_accuracy": 0.7973058164119721,
|
| 13347 |
+
"num_tokens": 16422624.0,
|
| 13348 |
+
"step": 14820
|
| 13349 |
+
},
|
| 13350 |
+
{
|
| 13351 |
+
"epoch": 2.988112029014709,
|
| 13352 |
+
"grad_norm": 11.75,
|
| 13353 |
+
"learning_rate": 8.059641345960106e-08,
|
| 13354 |
+
"loss": 0.8327,
|
| 13355 |
+
"mean_token_accuracy": 0.7947525262832642,
|
| 13356 |
+
"num_tokens": 16432503.0,
|
| 13357 |
+
"step": 14830
|
| 13358 |
+
},
|
| 13359 |
+
{
|
| 13360 |
+
"epoch": 2.9901269393511987,
|
| 13361 |
+
"grad_norm": 12.25,
|
| 13362 |
+
"learning_rate": 6.716367788300088e-08,
|
| 13363 |
+
"loss": 0.8677,
|
| 13364 |
+
"mean_token_accuracy": 0.7895227074623108,
|
| 13365 |
+
"num_tokens": 16443714.0,
|
| 13366 |
+
"step": 14840
|
| 13367 |
+
},
|
| 13368 |
+
{
|
| 13369 |
+
"epoch": 2.992141849687689,
|
| 13370 |
+
"grad_norm": 12.625,
|
| 13371 |
+
"learning_rate": 5.3730942306400703e-08,
|
| 13372 |
+
"loss": 0.7835,
|
| 13373 |
+
"mean_token_accuracy": 0.8056479752063751,
|
| 13374 |
+
"num_tokens": 16455282.0,
|
| 13375 |
+
"step": 14850
|
| 13376 |
+
},
|
| 13377 |
+
{
|
| 13378 |
+
"epoch": 2.994156760024179,
|
| 13379 |
+
"grad_norm": 10.5625,
|
| 13380 |
+
"learning_rate": 4.029820672980053e-08,
|
| 13381 |
+
"loss": 0.8101,
|
| 13382 |
+
"mean_token_accuracy": 0.8069123327732086,
|
| 13383 |
+
"num_tokens": 16466521.0,
|
| 13384 |
+
"step": 14860
|
| 13385 |
+
},
|
| 13386 |
+
{
|
| 13387 |
+
"epoch": 2.996171670360669,
|
| 13388 |
+
"grad_norm": 12.5,
|
| 13389 |
+
"learning_rate": 2.6865471153200352e-08,
|
| 13390 |
+
"loss": 0.8407,
|
| 13391 |
+
"mean_token_accuracy": 0.7910451471805573,
|
| 13392 |
+
"num_tokens": 16477990.0,
|
| 13393 |
+
"step": 14870
|
| 13394 |
+
},
|
| 13395 |
+
{
|
| 13396 |
+
"epoch": 2.998186580697159,
|
| 13397 |
+
"grad_norm": 10.9375,
|
| 13398 |
+
"learning_rate": 1.3432735576600176e-08,
|
| 13399 |
+
"loss": 0.8122,
|
| 13400 |
+
"mean_token_accuracy": 0.7982128620147705,
|
| 13401 |
+
"num_tokens": 16488273.0,
|
| 13402 |
+
"step": 14880
|
| 13403 |
}
|
| 13404 |
],
|
| 13405 |
"logging_steps": 10,
|
|
|
|
| 13414 |
"should_evaluate": false,
|
| 13415 |
"should_log": false,
|
| 13416 |
"should_save": true,
|
| 13417 |
+
"should_training_stop": true
|
| 13418 |
},
|
| 13419 |
"attributes": {}
|
| 13420 |
}
|
| 13421 |
},
|
| 13422 |
+
"total_flos": 1.9946788002011136e+16,
|
| 13423 |
"train_batch_size": 8,
|
| 13424 |
"trial_name": null,
|
| 13425 |
"trial_params": null
|