Training in progress, step 13000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 536223056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c4e31724b0cf74835ae0b9aaeff5c05e7e852cb9e158de0e35d8a673c930d429
|
| 3 |
size 536223056
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1072594443
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:98033794262f4774a192ebe69b4dfddba3edee43a3cce40cedfd5c1785391e67
|
| 3 |
size 1072594443
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3526295826c2a8db767925a5ee2fce15661c2f21ba999bd2bc96732400f36f2d
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 2.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -11258,6 +11258,456 @@
|
|
| 11258 |
"mean_token_accuracy": 0.8074711799621582,
|
| 11259 |
"num_tokens": 13840892.0,
|
| 11260 |
"step": 12500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11261 |
}
|
| 11262 |
],
|
| 11263 |
"logging_steps": 10,
|
|
@@ -11277,7 +11727,7 @@
|
|
| 11277 |
"attributes": {}
|
| 11278 |
}
|
| 11279 |
},
|
| 11280 |
-
"total_flos": 1.
|
| 11281 |
"train_batch_size": 8,
|
| 11282 |
"trial_name": null,
|
| 11283 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.619383437437034,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 13000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 11258 |
"mean_token_accuracy": 0.8074711799621582,
|
| 11259 |
"num_tokens": 13840892.0,
|
| 11260 |
"step": 12500
|
| 11261 |
+
},
|
| 11262 |
+
{
|
| 11263 |
+
"epoch": 2.5206528309490226,
|
| 11264 |
+
"grad_norm": 14.0625,
|
| 11265 |
+
"learning_rate": 3.196991067230842e-06,
|
| 11266 |
+
"loss": 0.8395,
|
| 11267 |
+
"mean_token_accuracy": 0.797085040807724,
|
| 11268 |
+
"num_tokens": 13852682.0,
|
| 11269 |
+
"step": 12510
|
| 11270 |
+
},
|
| 11271 |
+
{
|
| 11272 |
+
"epoch": 2.5226677412855127,
|
| 11273 |
+
"grad_norm": 10.4375,
|
| 11274 |
+
"learning_rate": 3.183558331654242e-06,
|
| 11275 |
+
"loss": 0.7477,
|
| 11276 |
+
"mean_token_accuracy": 0.8105276763439179,
|
| 11277 |
+
"num_tokens": 13864663.0,
|
| 11278 |
+
"step": 12520
|
| 11279 |
+
},
|
| 11280 |
+
{
|
| 11281 |
+
"epoch": 2.524682651622003,
|
| 11282 |
+
"grad_norm": 10.5625,
|
| 11283 |
+
"learning_rate": 3.1701255960776416e-06,
|
| 11284 |
+
"loss": 0.6768,
|
| 11285 |
+
"mean_token_accuracy": 0.8293303847312927,
|
| 11286 |
+
"num_tokens": 13876171.0,
|
| 11287 |
+
"step": 12530
|
| 11288 |
+
},
|
| 11289 |
+
{
|
| 11290 |
+
"epoch": 2.5266975619584926,
|
| 11291 |
+
"grad_norm": 9.625,
|
| 11292 |
+
"learning_rate": 3.156692860501041e-06,
|
| 11293 |
+
"loss": 0.72,
|
| 11294 |
+
"mean_token_accuracy": 0.8201279520988465,
|
| 11295 |
+
"num_tokens": 13887876.0,
|
| 11296 |
+
"step": 12540
|
| 11297 |
+
},
|
| 11298 |
+
{
|
| 11299 |
+
"epoch": 2.5287124722949827,
|
| 11300 |
+
"grad_norm": 15.6875,
|
| 11301 |
+
"learning_rate": 3.143260124924441e-06,
|
| 11302 |
+
"loss": 0.8625,
|
| 11303 |
+
"mean_token_accuracy": 0.7869448184967041,
|
| 11304 |
+
"num_tokens": 13898022.0,
|
| 11305 |
+
"step": 12550
|
| 11306 |
+
},
|
| 11307 |
+
{
|
| 11308 |
+
"epoch": 2.530727382631473,
|
| 11309 |
+
"grad_norm": 12.625,
|
| 11310 |
+
"learning_rate": 3.129827389347841e-06,
|
| 11311 |
+
"loss": 0.8983,
|
| 11312 |
+
"mean_token_accuracy": 0.7845638215541839,
|
| 11313 |
+
"num_tokens": 13909748.0,
|
| 11314 |
+
"step": 12560
|
| 11315 |
+
},
|
| 11316 |
+
{
|
| 11317 |
+
"epoch": 2.532742292967963,
|
| 11318 |
+
"grad_norm": 10.25,
|
| 11319 |
+
"learning_rate": 3.1163946537712408e-06,
|
| 11320 |
+
"loss": 0.8229,
|
| 11321 |
+
"mean_token_accuracy": 0.7928247213363647,
|
| 11322 |
+
"num_tokens": 13922277.0,
|
| 11323 |
+
"step": 12570
|
| 11324 |
+
},
|
| 11325 |
+
{
|
| 11326 |
+
"epoch": 2.534757203304453,
|
| 11327 |
+
"grad_norm": 11.125,
|
| 11328 |
+
"learning_rate": 3.102961918194641e-06,
|
| 11329 |
+
"loss": 0.7876,
|
| 11330 |
+
"mean_token_accuracy": 0.7989083111286164,
|
| 11331 |
+
"num_tokens": 13934598.0,
|
| 11332 |
+
"step": 12580
|
| 11333 |
+
},
|
| 11334 |
+
{
|
| 11335 |
+
"epoch": 2.536772113640943,
|
| 11336 |
+
"grad_norm": 12.9375,
|
| 11337 |
+
"learning_rate": 3.0895291826180406e-06,
|
| 11338 |
+
"loss": 0.7705,
|
| 11339 |
+
"mean_token_accuracy": 0.8085067272186279,
|
| 11340 |
+
"num_tokens": 13944920.0,
|
| 11341 |
+
"step": 12590
|
| 11342 |
+
},
|
| 11343 |
+
{
|
| 11344 |
+
"epoch": 2.538787023977433,
|
| 11345 |
+
"grad_norm": 12.1875,
|
| 11346 |
+
"learning_rate": 3.0760964470414402e-06,
|
| 11347 |
+
"loss": 0.8116,
|
| 11348 |
+
"mean_token_accuracy": 0.7995778679847717,
|
| 11349 |
+
"num_tokens": 13956061.0,
|
| 11350 |
+
"step": 12600
|
| 11351 |
+
},
|
| 11352 |
+
{
|
| 11353 |
+
"epoch": 2.540801934313923,
|
| 11354 |
+
"grad_norm": 9.9375,
|
| 11355 |
+
"learning_rate": 3.06266371146484e-06,
|
| 11356 |
+
"loss": 0.767,
|
| 11357 |
+
"mean_token_accuracy": 0.8065134942531585,
|
| 11358 |
+
"num_tokens": 13967029.0,
|
| 11359 |
+
"step": 12610
|
| 11360 |
+
},
|
| 11361 |
+
{
|
| 11362 |
+
"epoch": 2.542816844650413,
|
| 11363 |
+
"grad_norm": 11.25,
|
| 11364 |
+
"learning_rate": 3.04923097588824e-06,
|
| 11365 |
+
"loss": 0.8058,
|
| 11366 |
+
"mean_token_accuracy": 0.7920451164245605,
|
| 11367 |
+
"num_tokens": 13978641.0,
|
| 11368 |
+
"step": 12620
|
| 11369 |
+
},
|
| 11370 |
+
{
|
| 11371 |
+
"epoch": 2.544831754986903,
|
| 11372 |
+
"grad_norm": 11.1875,
|
| 11373 |
+
"learning_rate": 3.0357982403116397e-06,
|
| 11374 |
+
"loss": 0.7992,
|
| 11375 |
+
"mean_token_accuracy": 0.7988592565059662,
|
| 11376 |
+
"num_tokens": 13989687.0,
|
| 11377 |
+
"step": 12630
|
| 11378 |
+
},
|
| 11379 |
+
{
|
| 11380 |
+
"epoch": 2.546846665323393,
|
| 11381 |
+
"grad_norm": 14.125,
|
| 11382 |
+
"learning_rate": 3.02236550473504e-06,
|
| 11383 |
+
"loss": 0.6931,
|
| 11384 |
+
"mean_token_accuracy": 0.8216106593608856,
|
| 11385 |
+
"num_tokens": 14001448.0,
|
| 11386 |
+
"step": 12640
|
| 11387 |
+
},
|
| 11388 |
+
{
|
| 11389 |
+
"epoch": 2.548861575659883,
|
| 11390 |
+
"grad_norm": 12.6875,
|
| 11391 |
+
"learning_rate": 3.008932769158439e-06,
|
| 11392 |
+
"loss": 0.8851,
|
| 11393 |
+
"mean_token_accuracy": 0.7797039806842804,
|
| 11394 |
+
"num_tokens": 14012660.0,
|
| 11395 |
+
"step": 12650
|
| 11396 |
+
},
|
| 11397 |
+
{
|
| 11398 |
+
"epoch": 2.550876485996373,
|
| 11399 |
+
"grad_norm": 11.0625,
|
| 11400 |
+
"learning_rate": 2.995500033581839e-06,
|
| 11401 |
+
"loss": 0.8223,
|
| 11402 |
+
"mean_token_accuracy": 0.7970936000347137,
|
| 11403 |
+
"num_tokens": 14023087.0,
|
| 11404 |
+
"step": 12660
|
| 11405 |
+
},
|
| 11406 |
+
{
|
| 11407 |
+
"epoch": 2.552891396332863,
|
| 11408 |
+
"grad_norm": 10.25,
|
| 11409 |
+
"learning_rate": 2.982067298005239e-06,
|
| 11410 |
+
"loss": 0.7795,
|
| 11411 |
+
"mean_token_accuracy": 0.8052306652069092,
|
| 11412 |
+
"num_tokens": 14035227.0,
|
| 11413 |
+
"step": 12670
|
| 11414 |
+
},
|
| 11415 |
+
{
|
| 11416 |
+
"epoch": 2.5549063066693534,
|
| 11417 |
+
"grad_norm": 10.6875,
|
| 11418 |
+
"learning_rate": 2.968634562428639e-06,
|
| 11419 |
+
"loss": 0.8418,
|
| 11420 |
+
"mean_token_accuracy": 0.7952195703983307,
|
| 11421 |
+
"num_tokens": 14046084.0,
|
| 11422 |
+
"step": 12680
|
| 11423 |
+
},
|
| 11424 |
+
{
|
| 11425 |
+
"epoch": 2.5569212170058435,
|
| 11426 |
+
"grad_norm": 11.625,
|
| 11427 |
+
"learning_rate": 2.9552018268520386e-06,
|
| 11428 |
+
"loss": 0.7606,
|
| 11429 |
+
"mean_token_accuracy": 0.8044079065322876,
|
| 11430 |
+
"num_tokens": 14055858.0,
|
| 11431 |
+
"step": 12690
|
| 11432 |
+
},
|
| 11433 |
+
{
|
| 11434 |
+
"epoch": 2.5589361273423332,
|
| 11435 |
+
"grad_norm": 11.375,
|
| 11436 |
+
"learning_rate": 2.9417690912754388e-06,
|
| 11437 |
+
"loss": 0.7828,
|
| 11438 |
+
"mean_token_accuracy": 0.8071886241436005,
|
| 11439 |
+
"num_tokens": 14067097.0,
|
| 11440 |
+
"step": 12700
|
| 11441 |
+
},
|
| 11442 |
+
{
|
| 11443 |
+
"epoch": 2.5609510376788234,
|
| 11444 |
+
"grad_norm": 14.1875,
|
| 11445 |
+
"learning_rate": 2.928336355698838e-06,
|
| 11446 |
+
"loss": 0.8814,
|
| 11447 |
+
"mean_token_accuracy": 0.7862110197544098,
|
| 11448 |
+
"num_tokens": 14077884.0,
|
| 11449 |
+
"step": 12710
|
| 11450 |
+
},
|
| 11451 |
+
{
|
| 11452 |
+
"epoch": 2.5629659480153135,
|
| 11453 |
+
"grad_norm": 11.8125,
|
| 11454 |
+
"learning_rate": 2.914903620122238e-06,
|
| 11455 |
+
"loss": 0.8627,
|
| 11456 |
+
"mean_token_accuracy": 0.7814090967178344,
|
| 11457 |
+
"num_tokens": 14088843.0,
|
| 11458 |
+
"step": 12720
|
| 11459 |
+
},
|
| 11460 |
+
{
|
| 11461 |
+
"epoch": 2.5649808583518032,
|
| 11462 |
+
"grad_norm": 14.6875,
|
| 11463 |
+
"learning_rate": 2.901470884545638e-06,
|
| 11464 |
+
"loss": 0.8469,
|
| 11465 |
+
"mean_token_accuracy": 0.7937661349773407,
|
| 11466 |
+
"num_tokens": 14100309.0,
|
| 11467 |
+
"step": 12730
|
| 11468 |
+
},
|
| 11469 |
+
{
|
| 11470 |
+
"epoch": 2.5669957686882934,
|
| 11471 |
+
"grad_norm": 11.1875,
|
| 11472 |
+
"learning_rate": 2.888038148969038e-06,
|
| 11473 |
+
"loss": 0.8517,
|
| 11474 |
+
"mean_token_accuracy": 0.790552693605423,
|
| 11475 |
+
"num_tokens": 14110323.0,
|
| 11476 |
+
"step": 12740
|
| 11477 |
+
},
|
| 11478 |
+
{
|
| 11479 |
+
"epoch": 2.5690106790247835,
|
| 11480 |
+
"grad_norm": 12.625,
|
| 11481 |
+
"learning_rate": 2.874605413392438e-06,
|
| 11482 |
+
"loss": 0.8309,
|
| 11483 |
+
"mean_token_accuracy": 0.795288497209549,
|
| 11484 |
+
"num_tokens": 14121179.0,
|
| 11485 |
+
"step": 12750
|
| 11486 |
+
},
|
| 11487 |
+
{
|
| 11488 |
+
"epoch": 2.5710255893612732,
|
| 11489 |
+
"grad_norm": 11.9375,
|
| 11490 |
+
"learning_rate": 2.8611726778158373e-06,
|
| 11491 |
+
"loss": 0.8162,
|
| 11492 |
+
"mean_token_accuracy": 0.7960000455379486,
|
| 11493 |
+
"num_tokens": 14132215.0,
|
| 11494 |
+
"step": 12760
|
| 11495 |
+
},
|
| 11496 |
+
{
|
| 11497 |
+
"epoch": 2.5730404996977634,
|
| 11498 |
+
"grad_norm": 11.875,
|
| 11499 |
+
"learning_rate": 2.847739942239237e-06,
|
| 11500 |
+
"loss": 0.794,
|
| 11501 |
+
"mean_token_accuracy": 0.8040944337844849,
|
| 11502 |
+
"num_tokens": 14142028.0,
|
| 11503 |
+
"step": 12770
|
| 11504 |
+
},
|
| 11505 |
+
{
|
| 11506 |
+
"epoch": 2.5750554100342535,
|
| 11507 |
+
"grad_norm": 9.6875,
|
| 11508 |
+
"learning_rate": 2.834307206662637e-06,
|
| 11509 |
+
"loss": 0.9474,
|
| 11510 |
+
"mean_token_accuracy": 0.7681374192237854,
|
| 11511 |
+
"num_tokens": 14153369.0,
|
| 11512 |
+
"step": 12780
|
| 11513 |
+
},
|
| 11514 |
+
{
|
| 11515 |
+
"epoch": 2.5770703203707432,
|
| 11516 |
+
"grad_norm": 11.5625,
|
| 11517 |
+
"learning_rate": 2.820874471086037e-06,
|
| 11518 |
+
"loss": 0.8301,
|
| 11519 |
+
"mean_token_accuracy": 0.7957022428512573,
|
| 11520 |
+
"num_tokens": 14165045.0,
|
| 11521 |
+
"step": 12790
|
| 11522 |
+
},
|
| 11523 |
+
{
|
| 11524 |
+
"epoch": 2.5790852307072334,
|
| 11525 |
+
"grad_norm": 13.9375,
|
| 11526 |
+
"learning_rate": 2.807441735509437e-06,
|
| 11527 |
+
"loss": 0.7298,
|
| 11528 |
+
"mean_token_accuracy": 0.812953507900238,
|
| 11529 |
+
"num_tokens": 14175171.0,
|
| 11530 |
+
"step": 12800
|
| 11531 |
+
},
|
| 11532 |
+
{
|
| 11533 |
+
"epoch": 2.5811001410437235,
|
| 11534 |
+
"grad_norm": 10.0625,
|
| 11535 |
+
"learning_rate": 2.794008999932837e-06,
|
| 11536 |
+
"loss": 0.8874,
|
| 11537 |
+
"mean_token_accuracy": 0.7833206593990326,
|
| 11538 |
+
"num_tokens": 14186567.0,
|
| 11539 |
+
"step": 12810
|
| 11540 |
+
},
|
| 11541 |
+
{
|
| 11542 |
+
"epoch": 2.5831150513802137,
|
| 11543 |
+
"grad_norm": 14.4375,
|
| 11544 |
+
"learning_rate": 2.780576264356236e-06,
|
| 11545 |
+
"loss": 0.7494,
|
| 11546 |
+
"mean_token_accuracy": 0.8073345363140106,
|
| 11547 |
+
"num_tokens": 14196603.0,
|
| 11548 |
+
"step": 12820
|
| 11549 |
+
},
|
| 11550 |
+
{
|
| 11551 |
+
"epoch": 2.585129961716704,
|
| 11552 |
+
"grad_norm": 12.6875,
|
| 11553 |
+
"learning_rate": 2.7671435287796363e-06,
|
| 11554 |
+
"loss": 0.7582,
|
| 11555 |
+
"mean_token_accuracy": 0.8067417740821838,
|
| 11556 |
+
"num_tokens": 14207335.0,
|
| 11557 |
+
"step": 12830
|
| 11558 |
+
},
|
| 11559 |
+
{
|
| 11560 |
+
"epoch": 2.5871448720531935,
|
| 11561 |
+
"grad_norm": 12.375,
|
| 11562 |
+
"learning_rate": 2.753710793203036e-06,
|
| 11563 |
+
"loss": 0.7523,
|
| 11564 |
+
"mean_token_accuracy": 0.8145627319812775,
|
| 11565 |
+
"num_tokens": 14218474.0,
|
| 11566 |
+
"step": 12840
|
| 11567 |
+
},
|
| 11568 |
+
{
|
| 11569 |
+
"epoch": 2.5891597823896837,
|
| 11570 |
+
"grad_norm": 10.375,
|
| 11571 |
+
"learning_rate": 2.740278057626436e-06,
|
| 11572 |
+
"loss": 0.8045,
|
| 11573 |
+
"mean_token_accuracy": 0.8010720014572144,
|
| 11574 |
+
"num_tokens": 14229469.0,
|
| 11575 |
+
"step": 12850
|
| 11576 |
+
},
|
| 11577 |
+
{
|
| 11578 |
+
"epoch": 2.591174692726174,
|
| 11579 |
+
"grad_norm": 12.625,
|
| 11580 |
+
"learning_rate": 2.7268453220498358e-06,
|
| 11581 |
+
"loss": 0.8266,
|
| 11582 |
+
"mean_token_accuracy": 0.7978542387485504,
|
| 11583 |
+
"num_tokens": 14240757.0,
|
| 11584 |
+
"step": 12860
|
| 11585 |
+
},
|
| 11586 |
+
{
|
| 11587 |
+
"epoch": 2.5931896030626636,
|
| 11588 |
+
"grad_norm": 11.3125,
|
| 11589 |
+
"learning_rate": 2.713412586473235e-06,
|
| 11590 |
+
"loss": 0.8082,
|
| 11591 |
+
"mean_token_accuracy": 0.7974193513393402,
|
| 11592 |
+
"num_tokens": 14251148.0,
|
| 11593 |
+
"step": 12870
|
| 11594 |
+
},
|
| 11595 |
+
{
|
| 11596 |
+
"epoch": 2.5952045133991537,
|
| 11597 |
+
"grad_norm": 11.3125,
|
| 11598 |
+
"learning_rate": 2.699979850896635e-06,
|
| 11599 |
+
"loss": 0.8217,
|
| 11600 |
+
"mean_token_accuracy": 0.7950396835803986,
|
| 11601 |
+
"num_tokens": 14263499.0,
|
| 11602 |
+
"step": 12880
|
| 11603 |
+
},
|
| 11604 |
+
{
|
| 11605 |
+
"epoch": 2.597219423735644,
|
| 11606 |
+
"grad_norm": 12.4375,
|
| 11607 |
+
"learning_rate": 2.6865471153200352e-06,
|
| 11608 |
+
"loss": 0.7426,
|
| 11609 |
+
"mean_token_accuracy": 0.8107175350189209,
|
| 11610 |
+
"num_tokens": 14273600.0,
|
| 11611 |
+
"step": 12890
|
| 11612 |
+
},
|
| 11613 |
+
{
|
| 11614 |
+
"epoch": 2.5992343340721336,
|
| 11615 |
+
"grad_norm": 12.1875,
|
| 11616 |
+
"learning_rate": 2.673114379743435e-06,
|
| 11617 |
+
"loss": 0.7092,
|
| 11618 |
+
"mean_token_accuracy": 0.8177358627319335,
|
| 11619 |
+
"num_tokens": 14284136.0,
|
| 11620 |
+
"step": 12900
|
| 11621 |
+
},
|
| 11622 |
+
{
|
| 11623 |
+
"epoch": 2.6012492444086237,
|
| 11624 |
+
"grad_norm": 12.625,
|
| 11625 |
+
"learning_rate": 2.659681644166835e-06,
|
| 11626 |
+
"loss": 0.7701,
|
| 11627 |
+
"mean_token_accuracy": 0.8068889915943146,
|
| 11628 |
+
"num_tokens": 14294590.0,
|
| 11629 |
+
"step": 12910
|
| 11630 |
+
},
|
| 11631 |
+
{
|
| 11632 |
+
"epoch": 2.603264154745114,
|
| 11633 |
+
"grad_norm": 11.375,
|
| 11634 |
+
"learning_rate": 2.6462489085902347e-06,
|
| 11635 |
+
"loss": 0.8433,
|
| 11636 |
+
"mean_token_accuracy": 0.7921142339706421,
|
| 11637 |
+
"num_tokens": 14305206.0,
|
| 11638 |
+
"step": 12920
|
| 11639 |
+
},
|
| 11640 |
+
{
|
| 11641 |
+
"epoch": 2.605279065081604,
|
| 11642 |
+
"grad_norm": 9.75,
|
| 11643 |
+
"learning_rate": 2.6328161730136344e-06,
|
| 11644 |
+
"loss": 0.7931,
|
| 11645 |
+
"mean_token_accuracy": 0.7983499586582183,
|
| 11646 |
+
"num_tokens": 14315998.0,
|
| 11647 |
+
"step": 12930
|
| 11648 |
+
},
|
| 11649 |
+
{
|
| 11650 |
+
"epoch": 2.607293975418094,
|
| 11651 |
+
"grad_norm": 14.4375,
|
| 11652 |
+
"learning_rate": 2.619383437437034e-06,
|
| 11653 |
+
"loss": 0.8605,
|
| 11654 |
+
"mean_token_accuracy": 0.7901061117649079,
|
| 11655 |
+
"num_tokens": 14326408.0,
|
| 11656 |
+
"step": 12940
|
| 11657 |
+
},
|
| 11658 |
+
{
|
| 11659 |
+
"epoch": 2.609308885754584,
|
| 11660 |
+
"grad_norm": 8.6875,
|
| 11661 |
+
"learning_rate": 2.605950701860434e-06,
|
| 11662 |
+
"loss": 0.8868,
|
| 11663 |
+
"mean_token_accuracy": 0.7825572431087494,
|
| 11664 |
+
"num_tokens": 14337843.0,
|
| 11665 |
+
"step": 12950
|
| 11666 |
+
},
|
| 11667 |
+
{
|
| 11668 |
+
"epoch": 2.611323796091074,
|
| 11669 |
+
"grad_norm": 11.0,
|
| 11670 |
+
"learning_rate": 2.592517966283834e-06,
|
| 11671 |
+
"loss": 0.7792,
|
| 11672 |
+
"mean_token_accuracy": 0.7979351162910462,
|
| 11673 |
+
"num_tokens": 14348649.0,
|
| 11674 |
+
"step": 12960
|
| 11675 |
+
},
|
| 11676 |
+
{
|
| 11677 |
+
"epoch": 2.613338706427564,
|
| 11678 |
+
"grad_norm": 10.5,
|
| 11679 |
+
"learning_rate": 2.579085230707234e-06,
|
| 11680 |
+
"loss": 0.832,
|
| 11681 |
+
"mean_token_accuracy": 0.7962626338005065,
|
| 11682 |
+
"num_tokens": 14360145.0,
|
| 11683 |
+
"step": 12970
|
| 11684 |
+
},
|
| 11685 |
+
{
|
| 11686 |
+
"epoch": 2.615353616764054,
|
| 11687 |
+
"grad_norm": 9.5,
|
| 11688 |
+
"learning_rate": 2.5656524951306332e-06,
|
| 11689 |
+
"loss": 0.8188,
|
| 11690 |
+
"mean_token_accuracy": 0.7971078157424927,
|
| 11691 |
+
"num_tokens": 14371903.0,
|
| 11692 |
+
"step": 12980
|
| 11693 |
+
},
|
| 11694 |
+
{
|
| 11695 |
+
"epoch": 2.617368527100544,
|
| 11696 |
+
"grad_norm": 10.1875,
|
| 11697 |
+
"learning_rate": 2.5522197595540333e-06,
|
| 11698 |
+
"loss": 0.837,
|
| 11699 |
+
"mean_token_accuracy": 0.7964988470077514,
|
| 11700 |
+
"num_tokens": 14381687.0,
|
| 11701 |
+
"step": 12990
|
| 11702 |
+
},
|
| 11703 |
+
{
|
| 11704 |
+
"epoch": 2.619383437437034,
|
| 11705 |
+
"grad_norm": 12.8125,
|
| 11706 |
+
"learning_rate": 2.538787023977433e-06,
|
| 11707 |
+
"loss": 0.9483,
|
| 11708 |
+
"mean_token_accuracy": 0.7685989677906037,
|
| 11709 |
+
"num_tokens": 14393395.0,
|
| 11710 |
+
"step": 13000
|
| 11711 |
}
|
| 11712 |
],
|
| 11713 |
"logging_steps": 10,
|
|
|
|
| 11727 |
"attributes": {}
|
| 11728 |
}
|
| 11729 |
},
|
| 11730 |
+
"total_flos": 1.7403253820080128e+16,
|
| 11731 |
"train_batch_size": 8,
|
| 11732 |
"trial_name": null,
|
| 11733 |
"trial_params": null
|