Training in progress, step 14000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 536223056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:071ff40e66008578cff6a11839a98b3bd55870fb4ecd78b520fd649a835f02e1
|
| 3 |
size 536223056
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1072594443
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f0402536afc76b268263c8a44f7565c5d35ba54094497cf95e3c11e92a054cd5
|
| 3 |
size 1072594443
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7200e211c4af21388df4ea9729221c37205d2f4defca496f0d1b43ecbe09b628
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 2.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -12158,6 +12158,456 @@
|
|
| 12158 |
"mean_token_accuracy": 0.7941052973270416,
|
| 12159 |
"num_tokens": 14956201.0,
|
| 12160 |
"step": 13500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12161 |
}
|
| 12162 |
],
|
| 12163 |
"logging_steps": 10,
|
|
@@ -12177,7 +12627,7 @@
|
|
| 12177 |
"attributes": {}
|
| 12178 |
}
|
| 12179 |
},
|
| 12180 |
-
"total_flos": 1.
|
| 12181 |
"train_batch_size": 8,
|
| 12182 |
"trial_name": null,
|
| 12183 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.8208744710860367,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 14000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 12158 |
"mean_token_accuracy": 0.7941052973270416,
|
| 12159 |
"num_tokens": 14956201.0,
|
| 12160 |
"step": 13500
|
| 12161 |
+
},
|
| 12162 |
+
{
|
| 12163 |
+
"epoch": 2.722143864598025,
|
| 12164 |
+
"grad_norm": 13.3125,
|
| 12165 |
+
"learning_rate": 1.8537175095708242e-06,
|
| 12166 |
+
"loss": 0.8932,
|
| 12167 |
+
"mean_token_accuracy": 0.7903563916683197,
|
| 12168 |
+
"num_tokens": 14968110.0,
|
| 12169 |
+
"step": 13510
|
| 12170 |
+
},
|
| 12171 |
+
{
|
| 12172 |
+
"epoch": 2.7241587749345153,
|
| 12173 |
+
"grad_norm": 12.4375,
|
| 12174 |
+
"learning_rate": 1.8402847739942239e-06,
|
| 12175 |
+
"loss": 0.8239,
|
| 12176 |
+
"mean_token_accuracy": 0.7959101080894471,
|
| 12177 |
+
"num_tokens": 14979544.0,
|
| 12178 |
+
"step": 13520
|
| 12179 |
+
},
|
| 12180 |
+
{
|
| 12181 |
+
"epoch": 2.7261736852710055,
|
| 12182 |
+
"grad_norm": 14.25,
|
| 12183 |
+
"learning_rate": 1.8268520384176238e-06,
|
| 12184 |
+
"loss": 0.8986,
|
| 12185 |
+
"mean_token_accuracy": 0.7841361403465271,
|
| 12186 |
+
"num_tokens": 14990853.0,
|
| 12187 |
+
"step": 13530
|
| 12188 |
+
},
|
| 12189 |
+
{
|
| 12190 |
+
"epoch": 2.7281885956074956,
|
| 12191 |
+
"grad_norm": 9.5,
|
| 12192 |
+
"learning_rate": 1.8134193028410239e-06,
|
| 12193 |
+
"loss": 0.7908,
|
| 12194 |
+
"mean_token_accuracy": 0.7991403341293335,
|
| 12195 |
+
"num_tokens": 15002448.0,
|
| 12196 |
+
"step": 13540
|
| 12197 |
+
},
|
| 12198 |
+
{
|
| 12199 |
+
"epoch": 2.7302035059439858,
|
| 12200 |
+
"grad_norm": 11.8125,
|
| 12201 |
+
"learning_rate": 1.7999865672644234e-06,
|
| 12202 |
+
"loss": 0.8348,
|
| 12203 |
+
"mean_token_accuracy": 0.797630226612091,
|
| 12204 |
+
"num_tokens": 15012426.0,
|
| 12205 |
+
"step": 13550
|
| 12206 |
+
},
|
| 12207 |
+
{
|
| 12208 |
+
"epoch": 2.7322184162804755,
|
| 12209 |
+
"grad_norm": 12.375,
|
| 12210 |
+
"learning_rate": 1.7865538316878235e-06,
|
| 12211 |
+
"loss": 0.829,
|
| 12212 |
+
"mean_token_accuracy": 0.8008688688278198,
|
| 12213 |
+
"num_tokens": 15023568.0,
|
| 12214 |
+
"step": 13560
|
| 12215 |
+
},
|
| 12216 |
+
{
|
| 12217 |
+
"epoch": 2.7342333266169656,
|
| 12218 |
+
"grad_norm": 11.875,
|
| 12219 |
+
"learning_rate": 1.7731210961112234e-06,
|
| 12220 |
+
"loss": 0.8301,
|
| 12221 |
+
"mean_token_accuracy": 0.7909713625907898,
|
| 12222 |
+
"num_tokens": 15033450.0,
|
| 12223 |
+
"step": 13570
|
| 12224 |
+
},
|
| 12225 |
+
{
|
| 12226 |
+
"epoch": 2.7362482369534558,
|
| 12227 |
+
"grad_norm": 10.875,
|
| 12228 |
+
"learning_rate": 1.759688360534623e-06,
|
| 12229 |
+
"loss": 0.797,
|
| 12230 |
+
"mean_token_accuracy": 0.8022173583507538,
|
| 12231 |
+
"num_tokens": 15045140.0,
|
| 12232 |
+
"step": 13580
|
| 12233 |
+
},
|
| 12234 |
+
{
|
| 12235 |
+
"epoch": 2.7382631472899455,
|
| 12236 |
+
"grad_norm": 10.4375,
|
| 12237 |
+
"learning_rate": 1.746255624958023e-06,
|
| 12238 |
+
"loss": 0.8381,
|
| 12239 |
+
"mean_token_accuracy": 0.7926445186138154,
|
| 12240 |
+
"num_tokens": 15056256.0,
|
| 12241 |
+
"step": 13590
|
| 12242 |
+
},
|
| 12243 |
+
{
|
| 12244 |
+
"epoch": 2.7402780576264356,
|
| 12245 |
+
"grad_norm": 11.1875,
|
| 12246 |
+
"learning_rate": 1.7328228893814228e-06,
|
| 12247 |
+
"loss": 0.8104,
|
| 12248 |
+
"mean_token_accuracy": 0.7977364182472229,
|
| 12249 |
+
"num_tokens": 15068134.0,
|
| 12250 |
+
"step": 13600
|
| 12251 |
+
},
|
| 12252 |
+
{
|
| 12253 |
+
"epoch": 2.7422929679629258,
|
| 12254 |
+
"grad_norm": 10.125,
|
| 12255 |
+
"learning_rate": 1.7193901538048225e-06,
|
| 12256 |
+
"loss": 0.8133,
|
| 12257 |
+
"mean_token_accuracy": 0.8040676951408386,
|
| 12258 |
+
"num_tokens": 15079578.0,
|
| 12259 |
+
"step": 13610
|
| 12260 |
+
},
|
| 12261 |
+
{
|
| 12262 |
+
"epoch": 2.7443078782994155,
|
| 12263 |
+
"grad_norm": 11.0,
|
| 12264 |
+
"learning_rate": 1.7059574182282224e-06,
|
| 12265 |
+
"loss": 0.9289,
|
| 12266 |
+
"mean_token_accuracy": 0.7738179624080658,
|
| 12267 |
+
"num_tokens": 15090034.0,
|
| 12268 |
+
"step": 13620
|
| 12269 |
+
},
|
| 12270 |
+
{
|
| 12271 |
+
"epoch": 2.7463227886359056,
|
| 12272 |
+
"grad_norm": 11.375,
|
| 12273 |
+
"learning_rate": 1.692524682651622e-06,
|
| 12274 |
+
"loss": 0.8635,
|
| 12275 |
+
"mean_token_accuracy": 0.7958697319030762,
|
| 12276 |
+
"num_tokens": 15101919.0,
|
| 12277 |
+
"step": 13630
|
| 12278 |
+
},
|
| 12279 |
+
{
|
| 12280 |
+
"epoch": 2.7483376989723958,
|
| 12281 |
+
"grad_norm": 13.0625,
|
| 12282 |
+
"learning_rate": 1.679091947075022e-06,
|
| 12283 |
+
"loss": 0.8911,
|
| 12284 |
+
"mean_token_accuracy": 0.7814191520214081,
|
| 12285 |
+
"num_tokens": 15114084.0,
|
| 12286 |
+
"step": 13640
|
| 12287 |
+
},
|
| 12288 |
+
{
|
| 12289 |
+
"epoch": 2.750352609308886,
|
| 12290 |
+
"grad_norm": 12.75,
|
| 12291 |
+
"learning_rate": 1.6656592114984219e-06,
|
| 12292 |
+
"loss": 0.7362,
|
| 12293 |
+
"mean_token_accuracy": 0.8138824105262756,
|
| 12294 |
+
"num_tokens": 15124878.0,
|
| 12295 |
+
"step": 13650
|
| 12296 |
+
},
|
| 12297 |
+
{
|
| 12298 |
+
"epoch": 2.7523675196453756,
|
| 12299 |
+
"grad_norm": 11.75,
|
| 12300 |
+
"learning_rate": 1.6522264759218216e-06,
|
| 12301 |
+
"loss": 0.8195,
|
| 12302 |
+
"mean_token_accuracy": 0.793831080198288,
|
| 12303 |
+
"num_tokens": 15135525.0,
|
| 12304 |
+
"step": 13660
|
| 12305 |
+
},
|
| 12306 |
+
{
|
| 12307 |
+
"epoch": 2.7543824299818658,
|
| 12308 |
+
"grad_norm": 9.8125,
|
| 12309 |
+
"learning_rate": 1.6387937403452214e-06,
|
| 12310 |
+
"loss": 0.7857,
|
| 12311 |
+
"mean_token_accuracy": 0.8074389100074768,
|
| 12312 |
+
"num_tokens": 15147692.0,
|
| 12313 |
+
"step": 13670
|
| 12314 |
+
},
|
| 12315 |
+
{
|
| 12316 |
+
"epoch": 2.756397340318356,
|
| 12317 |
+
"grad_norm": 10.125,
|
| 12318 |
+
"learning_rate": 1.6253610047686213e-06,
|
| 12319 |
+
"loss": 0.9199,
|
| 12320 |
+
"mean_token_accuracy": 0.7814192116260529,
|
| 12321 |
+
"num_tokens": 15159592.0,
|
| 12322 |
+
"step": 13680
|
| 12323 |
+
},
|
| 12324 |
+
{
|
| 12325 |
+
"epoch": 2.758412250654846,
|
| 12326 |
+
"grad_norm": 10.5625,
|
| 12327 |
+
"learning_rate": 1.611928269192021e-06,
|
| 12328 |
+
"loss": 0.7825,
|
| 12329 |
+
"mean_token_accuracy": 0.7981011807918549,
|
| 12330 |
+
"num_tokens": 15171601.0,
|
| 12331 |
+
"step": 13690
|
| 12332 |
+
},
|
| 12333 |
+
{
|
| 12334 |
+
"epoch": 2.760427160991336,
|
| 12335 |
+
"grad_norm": 14.9375,
|
| 12336 |
+
"learning_rate": 1.598495533615421e-06,
|
| 12337 |
+
"loss": 0.9254,
|
| 12338 |
+
"mean_token_accuracy": 0.777032095193863,
|
| 12339 |
+
"num_tokens": 15182890.0,
|
| 12340 |
+
"step": 13700
|
| 12341 |
+
},
|
| 12342 |
+
{
|
| 12343 |
+
"epoch": 2.762442071327826,
|
| 12344 |
+
"grad_norm": 12.125,
|
| 12345 |
+
"learning_rate": 1.5850627980388208e-06,
|
| 12346 |
+
"loss": 0.7658,
|
| 12347 |
+
"mean_token_accuracy": 0.8108864903450013,
|
| 12348 |
+
"num_tokens": 15193434.0,
|
| 12349 |
+
"step": 13710
|
| 12350 |
+
},
|
| 12351 |
+
{
|
| 12352 |
+
"epoch": 2.764456981664316,
|
| 12353 |
+
"grad_norm": 12.6875,
|
| 12354 |
+
"learning_rate": 1.5716300624622205e-06,
|
| 12355 |
+
"loss": 0.7604,
|
| 12356 |
+
"mean_token_accuracy": 0.8065372705459595,
|
| 12357 |
+
"num_tokens": 15204253.0,
|
| 12358 |
+
"step": 13720
|
| 12359 |
+
},
|
| 12360 |
+
{
|
| 12361 |
+
"epoch": 2.766471892000806,
|
| 12362 |
+
"grad_norm": 12.75,
|
| 12363 |
+
"learning_rate": 1.5581973268856204e-06,
|
| 12364 |
+
"loss": 0.7993,
|
| 12365 |
+
"mean_token_accuracy": 0.8044365346431732,
|
| 12366 |
+
"num_tokens": 15214089.0,
|
| 12367 |
+
"step": 13730
|
| 12368 |
+
},
|
| 12369 |
+
{
|
| 12370 |
+
"epoch": 2.768486802337296,
|
| 12371 |
+
"grad_norm": 11.4375,
|
| 12372 |
+
"learning_rate": 1.5447645913090203e-06,
|
| 12373 |
+
"loss": 0.8261,
|
| 12374 |
+
"mean_token_accuracy": 0.7986261487007141,
|
| 12375 |
+
"num_tokens": 15224909.0,
|
| 12376 |
+
"step": 13740
|
| 12377 |
+
},
|
| 12378 |
+
{
|
| 12379 |
+
"epoch": 2.770501712673786,
|
| 12380 |
+
"grad_norm": 16.125,
|
| 12381 |
+
"learning_rate": 1.53133185573242e-06,
|
| 12382 |
+
"loss": 0.9516,
|
| 12383 |
+
"mean_token_accuracy": 0.7706651806831359,
|
| 12384 |
+
"num_tokens": 15237744.0,
|
| 12385 |
+
"step": 13750
|
| 12386 |
+
},
|
| 12387 |
+
{
|
| 12388 |
+
"epoch": 2.772516623010276,
|
| 12389 |
+
"grad_norm": 18.25,
|
| 12390 |
+
"learning_rate": 1.5178991201558199e-06,
|
| 12391 |
+
"loss": 0.8024,
|
| 12392 |
+
"mean_token_accuracy": 0.8005965650081635,
|
| 12393 |
+
"num_tokens": 15248510.0,
|
| 12394 |
+
"step": 13760
|
| 12395 |
+
},
|
| 12396 |
+
{
|
| 12397 |
+
"epoch": 2.774531533346766,
|
| 12398 |
+
"grad_norm": 11.8125,
|
| 12399 |
+
"learning_rate": 1.5044663845792195e-06,
|
| 12400 |
+
"loss": 0.7795,
|
| 12401 |
+
"mean_token_accuracy": 0.8093705713748932,
|
| 12402 |
+
"num_tokens": 15258924.0,
|
| 12403 |
+
"step": 13770
|
| 12404 |
+
},
|
| 12405 |
+
{
|
| 12406 |
+
"epoch": 2.776546443683256,
|
| 12407 |
+
"grad_norm": 8.9375,
|
| 12408 |
+
"learning_rate": 1.4910336490026194e-06,
|
| 12409 |
+
"loss": 0.822,
|
| 12410 |
+
"mean_token_accuracy": 0.8004359900951385,
|
| 12411 |
+
"num_tokens": 15270012.0,
|
| 12412 |
+
"step": 13780
|
| 12413 |
+
},
|
| 12414 |
+
{
|
| 12415 |
+
"epoch": 2.7785613540197462,
|
| 12416 |
+
"grad_norm": 13.6875,
|
| 12417 |
+
"learning_rate": 1.4776009134260193e-06,
|
| 12418 |
+
"loss": 0.7118,
|
| 12419 |
+
"mean_token_accuracy": 0.8202294111251831,
|
| 12420 |
+
"num_tokens": 15280170.0,
|
| 12421 |
+
"step": 13790
|
| 12422 |
+
},
|
| 12423 |
+
{
|
| 12424 |
+
"epoch": 2.7805762643562364,
|
| 12425 |
+
"grad_norm": 10.4375,
|
| 12426 |
+
"learning_rate": 1.464168177849419e-06,
|
| 12427 |
+
"loss": 0.7994,
|
| 12428 |
+
"mean_token_accuracy": 0.8046676278114319,
|
| 12429 |
+
"num_tokens": 15291487.0,
|
| 12430 |
+
"step": 13800
|
| 12431 |
+
},
|
| 12432 |
+
{
|
| 12433 |
+
"epoch": 2.782591174692726,
|
| 12434 |
+
"grad_norm": 11.375,
|
| 12435 |
+
"learning_rate": 1.450735442272819e-06,
|
| 12436 |
+
"loss": 0.7917,
|
| 12437 |
+
"mean_token_accuracy": 0.7997995793819428,
|
| 12438 |
+
"num_tokens": 15302931.0,
|
| 12439 |
+
"step": 13810
|
| 12440 |
+
},
|
| 12441 |
+
{
|
| 12442 |
+
"epoch": 2.7846060850292162,
|
| 12443 |
+
"grad_norm": 11.75,
|
| 12444 |
+
"learning_rate": 1.437302706696219e-06,
|
| 12445 |
+
"loss": 0.8199,
|
| 12446 |
+
"mean_token_accuracy": 0.7960925221443176,
|
| 12447 |
+
"num_tokens": 15313041.0,
|
| 12448 |
+
"step": 13820
|
| 12449 |
+
},
|
| 12450 |
+
{
|
| 12451 |
+
"epoch": 2.7866209953657064,
|
| 12452 |
+
"grad_norm": 11.6875,
|
| 12453 |
+
"learning_rate": 1.4238699711196185e-06,
|
| 12454 |
+
"loss": 0.7861,
|
| 12455 |
+
"mean_token_accuracy": 0.8075309932231903,
|
| 12456 |
+
"num_tokens": 15324592.0,
|
| 12457 |
+
"step": 13830
|
| 12458 |
+
},
|
| 12459 |
+
{
|
| 12460 |
+
"epoch": 2.788635905702196,
|
| 12461 |
+
"grad_norm": 10.75,
|
| 12462 |
+
"learning_rate": 1.4104372355430186e-06,
|
| 12463 |
+
"loss": 0.9482,
|
| 12464 |
+
"mean_token_accuracy": 0.7787281274795532,
|
| 12465 |
+
"num_tokens": 15336954.0,
|
| 12466 |
+
"step": 13840
|
| 12467 |
+
},
|
| 12468 |
+
{
|
| 12469 |
+
"epoch": 2.7906508160386863,
|
| 12470 |
+
"grad_norm": 10.8125,
|
| 12471 |
+
"learning_rate": 1.3970044999664185e-06,
|
| 12472 |
+
"loss": 0.7294,
|
| 12473 |
+
"mean_token_accuracy": 0.81562819480896,
|
| 12474 |
+
"num_tokens": 15346970.0,
|
| 12475 |
+
"step": 13850
|
| 12476 |
+
},
|
| 12477 |
+
{
|
| 12478 |
+
"epoch": 2.7926657263751764,
|
| 12479 |
+
"grad_norm": 12.5,
|
| 12480 |
+
"learning_rate": 1.3835717643898182e-06,
|
| 12481 |
+
"loss": 0.8401,
|
| 12482 |
+
"mean_token_accuracy": 0.7911224365234375,
|
| 12483 |
+
"num_tokens": 15357988.0,
|
| 12484 |
+
"step": 13860
|
| 12485 |
+
},
|
| 12486 |
+
{
|
| 12487 |
+
"epoch": 2.794680636711666,
|
| 12488 |
+
"grad_norm": 11.6875,
|
| 12489 |
+
"learning_rate": 1.370139028813218e-06,
|
| 12490 |
+
"loss": 0.8417,
|
| 12491 |
+
"mean_token_accuracy": 0.7968161761760711,
|
| 12492 |
+
"num_tokens": 15368806.0,
|
| 12493 |
+
"step": 13870
|
| 12494 |
+
},
|
| 12495 |
+
{
|
| 12496 |
+
"epoch": 2.7966955470481563,
|
| 12497 |
+
"grad_norm": 12.875,
|
| 12498 |
+
"learning_rate": 1.3567062932366175e-06,
|
| 12499 |
+
"loss": 0.8506,
|
| 12500 |
+
"mean_token_accuracy": 0.7901014566421509,
|
| 12501 |
+
"num_tokens": 15378192.0,
|
| 12502 |
+
"step": 13880
|
| 12503 |
+
},
|
| 12504 |
+
{
|
| 12505 |
+
"epoch": 2.7987104573846464,
|
| 12506 |
+
"grad_norm": 12.1875,
|
| 12507 |
+
"learning_rate": 1.3432735576600176e-06,
|
| 12508 |
+
"loss": 0.7323,
|
| 12509 |
+
"mean_token_accuracy": 0.812350469827652,
|
| 12510 |
+
"num_tokens": 15388640.0,
|
| 12511 |
+
"step": 13890
|
| 12512 |
+
},
|
| 12513 |
+
{
|
| 12514 |
+
"epoch": 2.8007253677211366,
|
| 12515 |
+
"grad_norm": 13.0625,
|
| 12516 |
+
"learning_rate": 1.3298408220834175e-06,
|
| 12517 |
+
"loss": 0.7982,
|
| 12518 |
+
"mean_token_accuracy": 0.8071064949035645,
|
| 12519 |
+
"num_tokens": 15398587.0,
|
| 12520 |
+
"step": 13900
|
| 12521 |
+
},
|
| 12522 |
+
{
|
| 12523 |
+
"epoch": 2.8027402780576267,
|
| 12524 |
+
"grad_norm": 10.25,
|
| 12525 |
+
"learning_rate": 1.3164080865068172e-06,
|
| 12526 |
+
"loss": 0.9217,
|
| 12527 |
+
"mean_token_accuracy": 0.7764141440391541,
|
| 12528 |
+
"num_tokens": 15408343.0,
|
| 12529 |
+
"step": 13910
|
| 12530 |
+
},
|
| 12531 |
+
{
|
| 12532 |
+
"epoch": 2.8047551883941164,
|
| 12533 |
+
"grad_norm": 12.25,
|
| 12534 |
+
"learning_rate": 1.302975350930217e-06,
|
| 12535 |
+
"loss": 0.7961,
|
| 12536 |
+
"mean_token_accuracy": 0.799578857421875,
|
| 12537 |
+
"num_tokens": 15419485.0,
|
| 12538 |
+
"step": 13920
|
| 12539 |
+
},
|
| 12540 |
+
{
|
| 12541 |
+
"epoch": 2.8067700987306066,
|
| 12542 |
+
"grad_norm": 11.3125,
|
| 12543 |
+
"learning_rate": 1.289542615353617e-06,
|
| 12544 |
+
"loss": 0.8453,
|
| 12545 |
+
"mean_token_accuracy": 0.794361412525177,
|
| 12546 |
+
"num_tokens": 15431130.0,
|
| 12547 |
+
"step": 13930
|
| 12548 |
+
},
|
| 12549 |
+
{
|
| 12550 |
+
"epoch": 2.8087850090670967,
|
| 12551 |
+
"grad_norm": 14.75,
|
| 12552 |
+
"learning_rate": 1.2761098797770167e-06,
|
| 12553 |
+
"loss": 0.8325,
|
| 12554 |
+
"mean_token_accuracy": 0.7931640625,
|
| 12555 |
+
"num_tokens": 15442377.0,
|
| 12556 |
+
"step": 13940
|
| 12557 |
+
},
|
| 12558 |
+
{
|
| 12559 |
+
"epoch": 2.8107999194035864,
|
| 12560 |
+
"grad_norm": 11.5625,
|
| 12561 |
+
"learning_rate": 1.2626771442004166e-06,
|
| 12562 |
+
"loss": 0.7471,
|
| 12563 |
+
"mean_token_accuracy": 0.8172047972679138,
|
| 12564 |
+
"num_tokens": 15453500.0,
|
| 12565 |
+
"step": 13950
|
| 12566 |
+
},
|
| 12567 |
+
{
|
| 12568 |
+
"epoch": 2.8128148297400766,
|
| 12569 |
+
"grad_norm": 11.6875,
|
| 12570 |
+
"learning_rate": 1.2492444086238162e-06,
|
| 12571 |
+
"loss": 0.8275,
|
| 12572 |
+
"mean_token_accuracy": 0.7963262915611267,
|
| 12573 |
+
"num_tokens": 15465388.0,
|
| 12574 |
+
"step": 13960
|
| 12575 |
+
},
|
| 12576 |
+
{
|
| 12577 |
+
"epoch": 2.8148297400765667,
|
| 12578 |
+
"grad_norm": 12.875,
|
| 12579 |
+
"learning_rate": 1.2358116730472161e-06,
|
| 12580 |
+
"loss": 0.7675,
|
| 12581 |
+
"mean_token_accuracy": 0.8086236357688904,
|
| 12582 |
+
"num_tokens": 15475972.0,
|
| 12583 |
+
"step": 13970
|
| 12584 |
+
},
|
| 12585 |
+
{
|
| 12586 |
+
"epoch": 2.8168446504130564,
|
| 12587 |
+
"grad_norm": 10.0,
|
| 12588 |
+
"learning_rate": 1.222378937470616e-06,
|
| 12589 |
+
"loss": 0.7866,
|
| 12590 |
+
"mean_token_accuracy": 0.8052810370922089,
|
| 12591 |
+
"num_tokens": 15489313.0,
|
| 12592 |
+
"step": 13980
|
| 12593 |
+
},
|
| 12594 |
+
{
|
| 12595 |
+
"epoch": 2.8188595607495466,
|
| 12596 |
+
"grad_norm": 11.8125,
|
| 12597 |
+
"learning_rate": 1.208946201894016e-06,
|
| 12598 |
+
"loss": 0.7903,
|
| 12599 |
+
"mean_token_accuracy": 0.805169427394867,
|
| 12600 |
+
"num_tokens": 15500110.0,
|
| 12601 |
+
"step": 13990
|
| 12602 |
+
},
|
| 12603 |
+
{
|
| 12604 |
+
"epoch": 2.8208744710860367,
|
| 12605 |
+
"grad_norm": 13.5,
|
| 12606 |
+
"learning_rate": 1.1955134663174156e-06,
|
| 12607 |
+
"loss": 0.8045,
|
| 12608 |
+
"mean_token_accuracy": 0.7995685517787934,
|
| 12609 |
+
"num_tokens": 15509702.0,
|
| 12610 |
+
"step": 14000
|
| 12611 |
}
|
| 12612 |
],
|
| 12613 |
"logging_steps": 10,
|
|
|
|
| 12627 |
"attributes": {}
|
| 12628 |
}
|
| 12629 |
},
|
| 12630 |
+
"total_flos": 1.874850530342093e+16,
|
| 12631 |
"train_batch_size": 8,
|
| 12632 |
"trial_name": null,
|
| 12633 |
"trial_params": null
|