|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.99695843190267, |
|
"eval_steps": 500, |
|
"global_step": 1107, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.027036160865157147, |
|
"grad_norm": 3.7416573454919604, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8414, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.054072321730314295, |
|
"grad_norm": 2.420213587424198, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6857, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08110848259547145, |
|
"grad_norm": 2.026364466482845, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6481, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.10814464346062859, |
|
"grad_norm": 1.7544085093823143, |
|
"learning_rate": 5e-06, |
|
"loss": 0.636, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.13518080432578575, |
|
"grad_norm": 2.5689946075394876, |
|
"learning_rate": 5e-06, |
|
"loss": 0.621, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1622169651909429, |
|
"grad_norm": 2.90035063993728, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6159, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.18925312605610004, |
|
"grad_norm": 1.9704021628275166, |
|
"learning_rate": 5e-06, |
|
"loss": 0.613, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.21628928692125718, |
|
"grad_norm": 2.0006565102546277, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6038, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.24332544778641432, |
|
"grad_norm": 1.8244581409196392, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6014, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2703616086515715, |
|
"grad_norm": 3.5170158663707562, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6058, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.29739776951672864, |
|
"grad_norm": 1.9160505890443071, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6012, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3244339303818858, |
|
"grad_norm": 1.5411483704174398, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5956, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3514700912470429, |
|
"grad_norm": 2.649167678139615, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5974, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.37850625211220007, |
|
"grad_norm": 1.4247865709072371, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5965, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4055424129773572, |
|
"grad_norm": 1.4405880085697187, |
|
"learning_rate": 5e-06, |
|
"loss": 0.589, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.43257857384251436, |
|
"grad_norm": 1.6781578168272708, |
|
"learning_rate": 5e-06, |
|
"loss": 0.592, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4596147347076715, |
|
"grad_norm": 1.9394919639875512, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5928, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.48665089557282865, |
|
"grad_norm": 1.7808260489612084, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5922, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5136870564379858, |
|
"grad_norm": 1.6418704591917206, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5887, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.540723217303143, |
|
"grad_norm": 1.463644316677313, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5863, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5677593781683001, |
|
"grad_norm": 1.7587180647301535, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5887, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5947955390334573, |
|
"grad_norm": 1.6536202559796944, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5843, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6218316998986144, |
|
"grad_norm": 1.4196511331884876, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5857, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6488678607637716, |
|
"grad_norm": 1.36566737151604, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5824, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6759040216289287, |
|
"grad_norm": 2.9952670177096987, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5761, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7029401824940859, |
|
"grad_norm": 1.4939335531208207, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5792, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.729976343359243, |
|
"grad_norm": 1.3045505143888763, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5809, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7570125042244001, |
|
"grad_norm": 1.2454141875159743, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5761, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7840486650895573, |
|
"grad_norm": 1.6672422177821689, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5762, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.8110848259547144, |
|
"grad_norm": 1.3806810495733297, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5761, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8381209868198716, |
|
"grad_norm": 1.4982083898695995, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5734, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8651571476850287, |
|
"grad_norm": 1.6985418707956137, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5721, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8921933085501859, |
|
"grad_norm": 2.04658180538426, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5735, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.919229469415343, |
|
"grad_norm": 2.2541915966390924, |
|
"learning_rate": 5e-06, |
|
"loss": 0.581, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9462656302805001, |
|
"grad_norm": 1.9396838879359772, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5715, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9733017911456573, |
|
"grad_norm": 1.981100443956288, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5691, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9976343359242987, |
|
"eval_loss": 0.07115323096513748, |
|
"eval_runtime": 246.0901, |
|
"eval_samples_per_second": 40.493, |
|
"eval_steps_per_second": 0.634, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.0023656640757013, |
|
"grad_norm": 3.6621685211123927, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5653, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.0294018249408583, |
|
"grad_norm": 2.4178358681298926, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4812, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.0564379858060156, |
|
"grad_norm": 1.573769542810224, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4752, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.0834741466711728, |
|
"grad_norm": 1.6174732701515342, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4784, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.1105103075363298, |
|
"grad_norm": 1.3203029544672327, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4767, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.1375464684014869, |
|
"grad_norm": 1.659088202560807, |
|
"learning_rate": 5e-06, |
|
"loss": 0.48, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.1645826292666441, |
|
"grad_norm": 1.5293709624512395, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4886, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.1916187901318014, |
|
"grad_norm": 1.7162950969288626, |
|
"learning_rate": 5e-06, |
|
"loss": 0.48, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.2186549509969584, |
|
"grad_norm": 1.789642107765462, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4846, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.2456911118621157, |
|
"grad_norm": 2.3719269899175464, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4868, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.2727272727272727, |
|
"grad_norm": 2.0649999632624234, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4867, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.29976343359243, |
|
"grad_norm": 1.7868192883503795, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4808, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.326799594457587, |
|
"grad_norm": 1.580468512053965, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4802, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.3538357553227442, |
|
"grad_norm": 1.4368050523026477, |
|
"learning_rate": 5e-06, |
|
"loss": 0.484, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.3808719161879013, |
|
"grad_norm": 1.8494974343766688, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4852, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.4079080770530585, |
|
"grad_norm": 1.4845281910598, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4873, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.4349442379182156, |
|
"grad_norm": 1.7382803616315246, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4805, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.4619803987833728, |
|
"grad_norm": 1.9374881637819812, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4852, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.4890165596485299, |
|
"grad_norm": 2.0166526872457995, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4837, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.5160527205136871, |
|
"grad_norm": 1.9298737331649736, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4854, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.5430888813788441, |
|
"grad_norm": 1.656041584162736, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4805, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.5701250422440014, |
|
"grad_norm": 1.322425683347454, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4824, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.5971612031091587, |
|
"grad_norm": 1.5996884459070573, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4865, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.6241973639743157, |
|
"grad_norm": 1.7912731135269302, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4855, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.6512335248394727, |
|
"grad_norm": 1.6299247983634995, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4839, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.67826968570463, |
|
"grad_norm": 1.6195975059981367, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4879, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.7053058465697872, |
|
"grad_norm": 1.5581289945884544, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4934, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.7323420074349443, |
|
"grad_norm": 1.7245628020214243, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4921, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.7593781683001013, |
|
"grad_norm": 1.4558529164875116, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4899, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.7864143291652586, |
|
"grad_norm": 1.3838084244152689, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4867, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.8134504900304158, |
|
"grad_norm": 1.2978952486050377, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4898, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.8404866508955728, |
|
"grad_norm": 1.3510779632006422, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4933, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.8675228117607299, |
|
"grad_norm": 1.2650328544272216, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4922, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.8945589726258871, |
|
"grad_norm": 1.2364699638447674, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4949, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.9215951334910444, |
|
"grad_norm": 1.2724457587924303, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4995, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.9486312943562014, |
|
"grad_norm": 1.3500253788955112, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4927, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.9756674552213584, |
|
"grad_norm": 1.211531877719849, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4889, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.9972963839134843, |
|
"eval_loss": 0.07184100151062012, |
|
"eval_runtime": 245.9279, |
|
"eval_samples_per_second": 40.52, |
|
"eval_steps_per_second": 0.634, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 2.0047313281514025, |
|
"grad_norm": 3.245553628462997, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4769, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.0317674890165596, |
|
"grad_norm": 1.8685703471097137, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3856, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.0588036498817166, |
|
"grad_norm": 1.9250688037531085, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3858, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.085839810746874, |
|
"grad_norm": 1.7824110630026255, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3817, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.112875971612031, |
|
"grad_norm": 1.6606355551312104, |
|
"learning_rate": 5e-06, |
|
"loss": 0.382, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.139912132477188, |
|
"grad_norm": 1.638700904915986, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3811, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.1669482933423456, |
|
"grad_norm": 1.615315899395297, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3877, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.1939844542075027, |
|
"grad_norm": 1.708187020441321, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3869, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.2210206150726597, |
|
"grad_norm": 1.7454063972273366, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3874, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.2480567759378167, |
|
"grad_norm": 1.8331936510642228, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3899, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.2750929368029738, |
|
"grad_norm": 1.7020028217377983, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3922, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.3021290976681312, |
|
"grad_norm": 1.6721649170340955, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3866, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.3291652585332883, |
|
"grad_norm": 1.6841976597663628, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3902, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.3562014193984453, |
|
"grad_norm": 1.5088788343187851, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3953, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.3832375802636028, |
|
"grad_norm": 1.7096850640735373, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3938, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.41027374112876, |
|
"grad_norm": 1.5945133369441267, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3936, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.437309901993917, |
|
"grad_norm": 1.7236799110962686, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3944, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.464346062859074, |
|
"grad_norm": 1.6116484052046463, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3925, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.4913822237242313, |
|
"grad_norm": 1.575428566577217, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3939, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.5184183845893884, |
|
"grad_norm": 1.5852588401636647, |
|
"learning_rate": 5e-06, |
|
"loss": 0.393, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.5454545454545454, |
|
"grad_norm": 1.5725956084643744, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4006, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.5724907063197024, |
|
"grad_norm": 1.5697632754260353, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3958, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.59952686718486, |
|
"grad_norm": 1.6015038862083164, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4015, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.626563028050017, |
|
"grad_norm": 1.5339909095907225, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3997, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.653599188915174, |
|
"grad_norm": 1.5352960562989082, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4039, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.6806353497803315, |
|
"grad_norm": 1.5748878900674392, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4019, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.7076715106454885, |
|
"grad_norm": 1.8108417713608, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3956, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.7347076715106455, |
|
"grad_norm": 1.9888167435166277, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4028, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.7617438323758026, |
|
"grad_norm": 1.6982996347109014, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4052, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.7887799932409596, |
|
"grad_norm": 1.4591697770426337, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4029, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.815816154106117, |
|
"grad_norm": 1.4793538277944271, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4024, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.842852314971274, |
|
"grad_norm": 1.5146429069615557, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4037, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.869888475836431, |
|
"grad_norm": 1.4264075457576446, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3995, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.8969246367015886, |
|
"grad_norm": 1.6106041516243625, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4078, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.9239607975667457, |
|
"grad_norm": 1.4810218451874833, |
|
"learning_rate": 5e-06, |
|
"loss": 0.403, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.9509969584319027, |
|
"grad_norm": 1.4927021349271592, |
|
"learning_rate": 5e-06, |
|
"loss": 0.406, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.9780331192970597, |
|
"grad_norm": 1.4791467599867494, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4055, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.99695843190267, |
|
"eval_loss": 0.07671036571264267, |
|
"eval_runtime": 251.6292, |
|
"eval_samples_per_second": 39.602, |
|
"eval_steps_per_second": 0.62, |
|
"step": 1107 |
|
}, |
|
{ |
|
"epoch": 2.99695843190267, |
|
"step": 1107, |
|
"total_flos": 1854056851046400.0, |
|
"train_loss": 0.49416024618140186, |
|
"train_runtime": 35644.3817, |
|
"train_samples_per_second": 15.935, |
|
"train_steps_per_second": 0.031 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1107, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1854056851046400.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|