{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.999882826231301,
  "eval_steps": 500,
  "global_step": 1600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006249267663945631,
      "grad_norm": 6.042294502258301,
      "learning_rate": 4.166666666666667e-06,
      "loss": 0.7433,
      "step": 10
    },
    {
      "epoch": 0.012498535327891263,
      "grad_norm": 3.4981777667999268,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.6848,
      "step": 20
    },
    {
      "epoch": 0.018747802991836895,
      "grad_norm": 7.156867980957031,
      "learning_rate": 1.25e-05,
      "loss": 0.6722,
      "step": 30
    },
    {
      "epoch": 0.024997070655782525,
      "grad_norm": 3.301668167114258,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 0.6924,
      "step": 40
    },
    {
      "epoch": 0.031246338319728156,
      "grad_norm": 2.7308597564697266,
      "learning_rate": 1.999991805061211e-05,
      "loss": 0.6759,
      "step": 50
    },
    {
      "epoch": 0.03749560598367379,
      "grad_norm": 5.730428695678711,
      "learning_rate": 1.999704996306308e-05,
      "loss": 0.7484,
      "step": 60
    },
    {
      "epoch": 0.04374487364761942,
      "grad_norm": 2.7434427738189697,
      "learning_rate": 1.999008574916082e-05,
      "loss": 0.7298,
      "step": 70
    },
    {
      "epoch": 0.04999414131156505,
      "grad_norm": 2.67563533782959,
      "learning_rate": 1.997902826237712e-05,
      "loss": 0.7381,
      "step": 80
    },
    {
      "epoch": 0.05624340897551068,
      "grad_norm": 4.653203010559082,
      "learning_rate": 1.9963882033334827e-05,
      "loss": 0.716,
      "step": 90
    },
    {
      "epoch": 0.06249267663945631,
      "grad_norm": 3.6553049087524414,
      "learning_rate": 1.9944653267951507e-05,
      "loss": 0.713,
      "step": 100
    },
    {
      "epoch": 0.06874194430340194,
      "grad_norm": 3.856987953186035,
      "learning_rate": 1.9921349844896655e-05,
      "loss": 0.6886,
      "step": 110
    },
    {
      "epoch": 0.07499121196734758,
      "grad_norm": 2.80169677734375,
      "learning_rate": 1.9893981312363563e-05,
      "loss": 0.7459,
      "step": 120
    },
    {
      "epoch": 0.0812404796312932,
      "grad_norm": 9.439187049865723,
      "learning_rate": 1.9862558884157067e-05,
      "loss": 0.7131,
      "step": 130
    },
    {
      "epoch": 0.08748974729523884,
      "grad_norm": 3.0188660621643066,
      "learning_rate": 1.9827095435098926e-05,
      "loss": 0.7419,
      "step": 140
    },
    {
      "epoch": 0.09373901495918448,
      "grad_norm": 6.821515083312988,
      "learning_rate": 1.9787605495752528e-05,
      "loss": 0.7093,
      "step": 150
    },
    {
      "epoch": 0.0999882826231301,
      "grad_norm": 3.1190407276153564,
      "learning_rate": 1.9744105246469264e-05,
      "loss": 0.7162,
      "step": 160
    },
    {
      "epoch": 0.10623755028707574,
      "grad_norm": 3.4266233444213867,
      "learning_rate": 1.9696612510758878e-05,
      "loss": 0.7501,
      "step": 170
    },
    {
      "epoch": 0.11248681795102136,
      "grad_norm": 6.012339115142822,
      "learning_rate": 1.964514674798659e-05,
      "loss": 0.7228,
      "step": 180
    },
    {
      "epoch": 0.118736085614967,
      "grad_norm": 2.518491268157959,
      "learning_rate": 1.9589729045399935e-05,
      "loss": 0.7283,
      "step": 190
    },
    {
      "epoch": 0.12498535327891262,
      "grad_norm": 2.739582061767578,
      "learning_rate": 1.953038210948861e-05,
      "loss": 0.7464,
      "step": 200
    },
    {
      "epoch": 0.13123462094285826,
      "grad_norm": 2.70062518119812,
      "learning_rate": 1.9467130256680867e-05,
      "loss": 0.721,
      "step": 210
    },
    {
      "epoch": 0.13748388860680388,
      "grad_norm": 2.4166440963745117,
      "learning_rate": 1.9399999403380266e-05,
      "loss": 0.711,
      "step": 220
    },
    {
      "epoch": 0.14373315627074953,
      "grad_norm": 3.875697374343872,
      "learning_rate": 1.932901705534683e-05,
      "loss": 0.7296,
      "step": 230
    },
    {
      "epoch": 0.14998242393469516,
      "grad_norm": 3.1276426315307617,
      "learning_rate": 1.9254212296427043e-05,
      "loss": 0.7079,
      "step": 240
    },
    {
      "epoch": 0.15623169159864078,
      "grad_norm": 3.067948341369629,
      "learning_rate": 1.9175615776637212e-05,
      "loss": 0.7231,
      "step": 250
    },
    {
      "epoch": 0.1624809592625864,
      "grad_norm": 2.604984760284424,
      "learning_rate": 1.9093259699605125e-05,
      "loss": 0.7202,
      "step": 260
    },
    {
      "epoch": 0.16873022692653206,
      "grad_norm": 2.3994998931884766,
      "learning_rate": 1.900717780937514e-05,
      "loss": 0.7129,
      "step": 270
    },
    {
      "epoch": 0.17497949459047768,
      "grad_norm": 2.617325782775879,
      "learning_rate": 1.8917405376582144e-05,
      "loss": 0.7142,
      "step": 280
    },
    {
      "epoch": 0.1812287622544233,
      "grad_norm": 2.6472878456115723,
      "learning_rate": 1.8823979183999965e-05,
      "loss": 0.7168,
      "step": 290
    },
    {
      "epoch": 0.18747802991836895,
      "grad_norm": 2.1472179889678955,
      "learning_rate": 1.8726937511470247e-05,
      "loss": 0.7209,
      "step": 300
    },
    {
      "epoch": 0.19372729758231458,
      "grad_norm": 2.5548176765441895,
      "learning_rate": 1.8626320120217922e-05,
      "loss": 0.7278,
      "step": 310
    },
    {
      "epoch": 0.1999765652462602,
      "grad_norm": 2.2397587299346924,
      "learning_rate": 1.8522168236559693e-05,
      "loss": 0.7229,
      "step": 320
    },
    {
      "epoch": 0.20622583291020583,
      "grad_norm": 2.72623872756958,
      "learning_rate": 1.8414524535012244e-05,
      "loss": 0.7003,
      "step": 330
    },
    {
      "epoch": 0.21247510057415148,
      "grad_norm": 2.6225929260253906,
      "learning_rate": 1.8303433120807043e-05,
      "loss": 0.729,
      "step": 340
    },
    {
      "epoch": 0.2187243682380971,
      "grad_norm": 4.628861427307129,
      "learning_rate": 1.8188939511818965e-05,
      "loss": 0.7437,
      "step": 350
    },
    {
      "epoch": 0.22497363590204272,
      "grad_norm": 2.2117223739624023,
      "learning_rate": 1.8071090619916095e-05,
      "loss": 0.7101,
      "step": 360
    },
    {
      "epoch": 0.23122290356598835,
      "grad_norm": 2.2710976600646973,
      "learning_rate": 1.7949934731738348e-05,
      "loss": 0.7297,
      "step": 370
    },
    {
      "epoch": 0.237472171229934,
      "grad_norm": 2.2278854846954346,
      "learning_rate": 1.7825521488912833e-05,
      "loss": 0.7275,
      "step": 380
    },
    {
      "epoch": 0.24372143889387962,
      "grad_norm": 2.332300901412964,
      "learning_rate": 1.7697901867713997e-05,
      "loss": 0.7103,
      "step": 390
    },
    {
      "epoch": 0.24997070655782525,
      "grad_norm": 1.9451990127563477,
      "learning_rate": 1.7567128158176955e-05,
      "loss": 0.6981,
      "step": 400
    },
    {
      "epoch": 0.2562199742217709,
      "grad_norm": 2.7532973289489746,
      "learning_rate": 1.7433253942672497e-05,
      "loss": 0.7015,
      "step": 410
    },
    {
      "epoch": 0.2624692418857165,
      "grad_norm": 2.6847236156463623,
      "learning_rate": 1.7296334073952606e-05,
      "loss": 0.7242,
      "step": 420
    },
    {
      "epoch": 0.26871850954966214,
      "grad_norm": 2.6689813137054443,
      "learning_rate": 1.7156424652675433e-05,
      "loss": 0.7115,
      "step": 430
    },
    {
      "epoch": 0.27496777721360777,
      "grad_norm": 2.2913591861724854,
      "learning_rate": 1.7013583004418994e-05,
      "loss": 0.6912,
      "step": 440
    },
    {
      "epoch": 0.2812170448775534,
      "grad_norm": 2.5969536304473877,
      "learning_rate": 1.6867867656192946e-05,
      "loss": 0.7074,
      "step": 450
    },
    {
      "epoch": 0.28746631254149907,
      "grad_norm": 2.246922731399536,
      "learning_rate": 1.6719338312458123e-05,
      "loss": 0.7048,
      "step": 460
    },
    {
      "epoch": 0.2937155802054447,
      "grad_norm": 2.251768112182617,
      "learning_rate": 1.656805583066361e-05,
      "loss": 0.7085,
      "step": 470
    },
    {
      "epoch": 0.2999648478693903,
      "grad_norm": 2.249643325805664,
      "learning_rate": 1.6414082196311402e-05,
      "loss": 0.6897,
      "step": 480
    },
    {
      "epoch": 0.30621411553333594,
      "grad_norm": 1.974482536315918,
      "learning_rate": 1.6257480497558873e-05,
      "loss": 0.6941,
      "step": 490
    },
    {
      "epoch": 0.31246338319728156,
      "grad_norm": 2.1665918827056885,
      "learning_rate": 1.6098314899369446e-05,
      "loss": 0.7061,
      "step": 500
    },
    {
      "epoch": 0.3187126508612272,
      "grad_norm": 2.391334295272827,
      "learning_rate": 1.5936650617222063e-05,
      "loss": 0.6958,
      "step": 510
    },
    {
      "epoch": 0.3249619185251728,
      "grad_norm": 2.047011375427246,
      "learning_rate": 1.5772553890390196e-05,
      "loss": 0.7177,
      "step": 520
    },
    {
      "epoch": 0.3312111861891185,
      "grad_norm": 1.8378033638000488,
      "learning_rate": 1.560609195480142e-05,
      "loss": 0.6803,
      "step": 530
    },
    {
      "epoch": 0.3374604538530641,
      "grad_norm": 2.126997232437134,
      "learning_rate": 1.5437333015488586e-05,
      "loss": 0.6694,
      "step": 540
    },
    {
      "epoch": 0.34370972151700974,
      "grad_norm": 2.4227328300476074,
      "learning_rate": 1.526634621864395e-05,
      "loss": 0.6773,
      "step": 550
    },
    {
      "epoch": 0.34995898918095536,
      "grad_norm": 2.368255615234375,
      "learning_rate": 1.5093201623287631e-05,
      "loss": 0.7048,
      "step": 560
    },
    {
      "epoch": 0.356208256844901,
      "grad_norm": 1.9544981718063354,
      "learning_rate": 1.4917970172562122e-05,
      "loss": 0.686,
      "step": 570
    },
    {
      "epoch": 0.3624575245088466,
      "grad_norm": 2.019627809524536,
      "learning_rate": 1.4740723664664483e-05,
      "loss": 0.6911,
      "step": 580
    },
    {
      "epoch": 0.36870679217279223,
      "grad_norm": 1.8915493488311768,
      "learning_rate": 1.4561534723428205e-05,
      "loss": 0.6828,
      "step": 590
    },
    {
      "epoch": 0.3749560598367379,
      "grad_norm": 2.211825370788574,
      "learning_rate": 1.4380476768566825e-05,
      "loss": 0.6636,
      "step": 600
    },
    {
      "epoch": 0.38120532750068353,
      "grad_norm": 1.9755256175994873,
      "learning_rate": 1.4197623985591373e-05,
      "loss": 0.6902,
      "step": 610
    },
    {
      "epoch": 0.38745459516462916,
      "grad_norm": 1.9239295721054077,
      "learning_rate": 1.4013051295414108e-05,
      "loss": 0.6795,
      "step": 620
    },
    {
      "epoch": 0.3937038628285748,
      "grad_norm": 2.1901464462280273,
      "learning_rate": 1.3826834323650899e-05,
      "loss": 0.7005,
      "step": 630
    },
    {
      "epoch": 0.3999531304925204,
      "grad_norm": 1.9579427242279053,
      "learning_rate": 1.3639049369634878e-05,
      "loss": 0.6789,
      "step": 640
    },
    {
      "epoch": 0.406202398156466,
      "grad_norm": 1.819069266319275,
      "learning_rate": 1.344977337515404e-05,
      "loss": 0.6808,
      "step": 650
    },
    {
      "epoch": 0.41245166582041165,
      "grad_norm": 2.11354923248291,
      "learning_rate": 1.3259083892925633e-05,
      "loss": 0.6798,
      "step": 660
    },
    {
      "epoch": 0.41870093348435733,
      "grad_norm": 2.0537407398223877,
      "learning_rate": 1.3067059054820184e-05,
      "loss": 0.6587,
      "step": 670
    },
    {
      "epoch": 0.42495020114830295,
      "grad_norm": 2.4250621795654297,
      "learning_rate": 1.2873777539848284e-05,
      "loss": 0.6699,
      "step": 680
    },
    {
      "epoch": 0.4311994688122486,
      "grad_norm": 2.0022125244140625,
      "learning_rate": 1.2679318541923131e-05,
      "loss": 0.6644,
      "step": 690
    },
    {
      "epoch": 0.4374487364761942,
      "grad_norm": 1.9328835010528564,
      "learning_rate": 1.248376173741215e-05,
      "loss": 0.68,
      "step": 700
    },
    {
      "epoch": 0.4436980041401398,
      "grad_norm": 2.0376391410827637,
      "learning_rate": 1.2287187252490914e-05,
      "loss": 0.684,
      "step": 710
    },
    {
      "epoch": 0.44994727180408545,
      "grad_norm": 1.9673105478286743,
      "learning_rate": 1.2089675630312755e-05,
      "loss": 0.6515,
      "step": 720
    },
    {
      "epoch": 0.45619653946803107,
      "grad_norm": 1.7611652612686157,
      "learning_rate": 1.1891307798007536e-05,
      "loss": 0.6609,
      "step": 730
    },
    {
      "epoch": 0.4624458071319767,
      "grad_norm": 1.9665021896362305,
      "learning_rate": 1.1692165033523117e-05,
      "loss": 0.6801,
      "step": 740
    },
    {
      "epoch": 0.4686950747959224,
      "grad_norm": 2.349421977996826,
      "learning_rate": 1.1492328932323022e-05,
      "loss": 0.6831,
      "step": 750
    },
    {
      "epoch": 0.474944342459868,
      "grad_norm": 2.5945515632629395,
      "learning_rate": 1.1291881373954066e-05,
      "loss": 0.6725,
      "step": 760
    },
    {
      "epoch": 0.4811936101238136,
      "grad_norm": 1.7327567338943481,
      "learning_rate": 1.109090448849755e-05,
      "loss": 0.6394,
      "step": 770
    },
    {
      "epoch": 0.48744287778775924,
      "grad_norm": 1.980264663696289,
      "learning_rate": 1.088948062291783e-05,
      "loss": 0.6682,
      "step": 780
    },
    {
      "epoch": 0.49369214545170487,
      "grad_norm": 2.313495635986328,
      "learning_rate": 1.0687692307321984e-05,
      "loss": 0.6353,
      "step": 790
    },
    {
      "epoch": 0.4999414131156505,
      "grad_norm": 2.176020860671997,
      "learning_rate": 1.0485622221144485e-05,
      "loss": 0.6433,
      "step": 800
    },
    {
      "epoch": 0.5061906807795962,
      "grad_norm": 2.5199544429779053,
      "learning_rate": 1.0283353159270644e-05,
      "loss": 0.6719,
      "step": 810
    },
    {
      "epoch": 0.5124399484435418,
      "grad_norm": 2.6333529949188232,
      "learning_rate": 1.0080967998112787e-05,
      "loss": 0.6576,
      "step": 820
    },
    {
      "epoch": 0.5186892161074874,
      "grad_norm": 2.367847204208374,
      "learning_rate": 9.878549661653013e-06,
      "loss": 0.6547,
      "step": 830
    },
    {
      "epoch": 0.524938483771433,
      "grad_norm": 1.736178994178772,
      "learning_rate": 9.676181087466444e-06,
      "loss": 0.6361,
      "step": 840
    },
    {
      "epoch": 0.5311877514353787,
      "grad_norm": 1.8639039993286133,
      "learning_rate": 9.473945192738933e-06,
      "loss": 0.6497,
      "step": 850
    },
    {
      "epoch": 0.5374370190993243,
      "grad_norm": 1.7635918855667114,
      "learning_rate": 9.27192484029312e-06,
      "loss": 0.6654,
      "step": 860
    },
    {
      "epoch": 0.5436862867632699,
      "grad_norm": 2.0125083923339844,
      "learning_rate": 9.070202804636745e-06,
      "loss": 0.6397,
      "step": 870
    },
    {
      "epoch": 0.5499355544272155,
      "grad_norm": 2.078883171081543,
      "learning_rate": 8.868861738047158e-06,
      "loss": 0.6603,
      "step": 880
    },
    {
      "epoch": 0.5561848220911612,
      "grad_norm": 1.781632423400879,
      "learning_rate": 8.667984136705927e-06,
      "loss": 0.6556,
      "step": 890
    },
    {
      "epoch": 0.5624340897551068,
      "grad_norm": 2.0591318607330322,
      "learning_rate": 8.46765230689737e-06,
      "loss": 0.6258,
      "step": 900
    },
    {
      "epoch": 0.5686833574190524,
      "grad_norm": 1.6111246347427368,
      "learning_rate": 8.267948331284923e-06,
      "loss": 0.6472,
      "step": 910
    },
    {
      "epoch": 0.5749326250829981,
      "grad_norm": 2.417381763458252,
      "learning_rate": 8.068954035279121e-06,
      "loss": 0.6226,
      "step": 920
    },
    {
      "epoch": 0.5811818927469438,
      "grad_norm": 2.185147762298584,
      "learning_rate": 7.870750953510983e-06,
      "loss": 0.617,
      "step": 930
    },
    {
      "epoch": 0.5874311604108894,
      "grad_norm": 2.3496596813201904,
      "learning_rate": 7.673420296424541e-06,
      "loss": 0.633,
      "step": 940
    },
    {
      "epoch": 0.593680428074835,
      "grad_norm": 2.4552910327911377,
      "learning_rate": 7.4770429170022e-06,
      "loss": 0.6312,
      "step": 950
    },
    {
      "epoch": 0.5999296957387806,
      "grad_norm": 2.7454309463500977,
      "learning_rate": 7.2816992776365714e-06,
      "loss": 0.6252,
      "step": 960
    },
    {
      "epoch": 0.6061789634027263,
      "grad_norm": 2.967708110809326,
      "learning_rate": 7.08746941716232e-06,
      "loss": 0.634,
      "step": 970
    },
    {
      "epoch": 0.6124282310666719,
      "grad_norm": 2.755345582962036,
      "learning_rate": 6.894432918061579e-06,
      "loss": 0.6216,
      "step": 980
    },
    {
      "epoch": 0.6186774987306175,
      "grad_norm": 1.987317681312561,
      "learning_rate": 6.702668873856339e-06,
      "loss": 0.643,
      "step": 990
    },
    {
      "epoch": 0.6249267663945631,
      "grad_norm": 2.4610655307769775,
      "learning_rate": 6.5122558567011775e-06,
      "loss": 0.6409,
      "step": 1000
    },
    {
      "epoch": 0.6311760340585088,
      "grad_norm": 1.9931960105895996,
      "learning_rate": 6.323271885189636e-06,
      "loss": 0.6418,
      "step": 1010
    },
    {
      "epoch": 0.6374253017224544,
      "grad_norm": 3.0215256214141846,
      "learning_rate": 6.135794392387353e-06,
      "loss": 0.635,
      "step": 1020
    },
    {
      "epoch": 0.6436745693864,
      "grad_norm": 1.752156376838684,
      "learning_rate": 5.949900194105167e-06,
      "loss": 0.6326,
      "step": 1030
    },
    {
      "epoch": 0.6499238370503456,
      "grad_norm": 1.8890056610107422,
      "learning_rate": 5.765665457425102e-06,
      "loss": 0.6338,
      "step": 1040
    },
    {
      "epoch": 0.6561731047142912,
      "grad_norm": 2.2345621585845947,
      "learning_rate": 5.5831656694921465e-06,
      "loss": 0.6262,
      "step": 1050
    },
    {
      "epoch": 0.662422372378237,
      "grad_norm": 1.983519196510315,
      "learning_rate": 5.40247560658467e-06,
      "loss": 0.6421,
      "step": 1060
    },
    {
      "epoch": 0.6686716400421826,
      "grad_norm": 2.091252326965332,
      "learning_rate": 5.223669303476041e-06,
      "loss": 0.6166,
      "step": 1070
    },
    {
      "epoch": 0.6749209077061282,
      "grad_norm": 1.9888851642608643,
      "learning_rate": 5.046820023100129e-06,
      "loss": 0.6033,
      "step": 1080
    },
    {
      "epoch": 0.6811701753700738,
      "grad_norm": 3.788809061050415,
      "learning_rate": 4.872000226533001e-06,
      "loss": 0.6292,
      "step": 1090
    },
    {
      "epoch": 0.6874194430340195,
      "grad_norm": 2.0238468647003174,
      "learning_rate": 4.699281543303222e-06,
      "loss": 0.6238,
      "step": 1100
    },
    {
      "epoch": 0.6936687106979651,
      "grad_norm": 1.7076054811477661,
      "learning_rate": 4.528734742042803e-06,
      "loss": 0.609,
      "step": 1110
    },
    {
      "epoch": 0.6999179783619107,
      "grad_norm": 2.1124353408813477,
      "learning_rate": 4.360429701490935e-06,
      "loss": 0.6116,
      "step": 1120
    },
    {
      "epoch": 0.7061672460258563,
      "grad_norm": 1.8266746997833252,
      "learning_rate": 4.194435381862343e-06,
      "loss": 0.6217,
      "step": 1130
    },
    {
      "epoch": 0.712416513689802,
      "grad_norm": 1.7669917345046997,
      "learning_rate": 4.03081979659195e-06,
      "loss": 0.6247,
      "step": 1140
    },
    {
      "epoch": 0.7186657813537476,
      "grad_norm": 2.847994565963745,
      "learning_rate": 3.869649984467504e-06,
      "loss": 0.6092,
      "step": 1150
    },
    {
      "epoch": 0.7249150490176932,
      "grad_norm": 5.524293899536133,
      "learning_rate": 3.7109919821615546e-06,
      "loss": 0.6063,
      "step": 1160
    },
    {
      "epoch": 0.7311643166816388,
      "grad_norm": 2.3931970596313477,
      "learning_rate": 3.5549107971739905e-06,
      "loss": 0.6154,
      "step": 1170
    },
    {
      "epoch": 0.7374135843455845,
      "grad_norm": 1.9852912425994873,
      "learning_rate": 3.4014703811963024e-06,
      "loss": 0.6119,
      "step": 1180
    },
    {
      "epoch": 0.7436628520095301,
      "grad_norm": 1.8052198886871338,
      "learning_rate": 3.2507336039084315e-06,
      "loss": 0.6043,
      "step": 1190
    },
    {
      "epoch": 0.7499121196734758,
      "grad_norm": 1.8438955545425415,
      "learning_rate": 3.1027622272189572e-06,
      "loss": 0.6308,
      "step": 1200
    },
    {
      "epoch": 0.7561613873374214,
      "grad_norm": 2.11885929107666,
      "learning_rate": 2.9576168799591663e-06,
      "loss": 0.5984,
      "step": 1210
    },
    {
      "epoch": 0.7624106550013671,
      "grad_norm": 2.6443002223968506,
      "learning_rate": 2.8153570330413925e-06,
      "loss": 0.6139,
      "step": 1220
    },
    {
      "epoch": 0.7686599226653127,
      "grad_norm": 2.413456678390503,
      "learning_rate": 2.6760409750917925e-06,
      "loss": 0.5974,
      "step": 1230
    },
    {
      "epoch": 0.7749091903292583,
      "grad_norm": 2.166473627090454,
      "learning_rate": 2.5397257885675396e-06,
      "loss": 0.6034,
      "step": 1240
    },
    {
      "epoch": 0.7811584579932039,
      "grad_norm": 1.9241559505462646,
      "learning_rate": 2.406467326368237e-06,
      "loss": 0.6026,
      "step": 1250
    },
    {
      "epoch": 0.7874077256571496,
      "grad_norm": 2.043766736984253,
      "learning_rate": 2.2763201889510987e-06,
      "loss": 0.6216,
      "step": 1260
    },
    {
      "epoch": 0.7936569933210952,
      "grad_norm": 1.6110656261444092,
      "learning_rate": 2.149337701959325e-06,
      "loss": 0.6198,
      "step": 1270
    },
    {
      "epoch": 0.7999062609850408,
      "grad_norm": 1.7661818265914917,
      "learning_rate": 2.025571894372794e-06,
      "loss": 0.5838,
      "step": 1280
    },
    {
      "epoch": 0.8061555286489864,
      "grad_norm": 1.8362523317337036,
      "learning_rate": 1.9050734771900414e-06,
      "loss": 0.5999,
      "step": 1290
    },
    {
      "epoch": 0.812404796312932,
      "grad_norm": 1.9224945306777954,
      "learning_rate": 1.7878918226502816e-06,
      "loss": 0.5956,
      "step": 1300
    },
    {
      "epoch": 0.8186540639768777,
      "grad_norm": 2.024622678756714,
      "learning_rate": 1.6740749440039262e-06,
      "loss": 0.6083,
      "step": 1310
    },
    {
      "epoch": 0.8249033316408233,
      "grad_norm": 2.0744571685791016,
      "learning_rate": 1.5636694758399563e-06,
      "loss": 0.5917,
      "step": 1320
    },
    {
      "epoch": 0.8311525993047689,
      "grad_norm": 2.3073995113372803,
      "learning_rate": 1.4567206549781699e-06,
      "loss": 0.5929,
      "step": 1330
    },
    {
      "epoch": 0.8374018669687147,
      "grad_norm": 1.8136156797409058,
      "learning_rate": 1.3532723019341376e-06,
      "loss": 0.5936,
      "step": 1340
    },
    {
      "epoch": 0.8436511346326603,
      "grad_norm": 1.8207391500473022,
      "learning_rate": 1.2533668029644751e-06,
      "loss": 0.5933,
      "step": 1350
    },
    {
      "epoch": 0.8499004022966059,
      "grad_norm": 2.001443386077881,
      "learning_rate": 1.1570450926997657e-06,
      "loss": 0.5891,
      "step": 1360
    },
    {
      "epoch": 0.8561496699605515,
      "grad_norm": 1.8668729066848755,
      "learning_rate": 1.064346637372271e-06,
      "loss": 0.5799,
      "step": 1370
    },
    {
      "epoch": 0.8623989376244972,
      "grad_norm": 2.3950822353363037,
      "learning_rate": 9.753094186453028e-07,
      "loss": 0.5994,
      "step": 1380
    },
    {
      "epoch": 0.8686482052884428,
      "grad_norm": 1.9551249742507935,
      "learning_rate": 8.89969918050847e-07,
      "loss": 0.6119,
      "step": 1390
    },
    {
      "epoch": 0.8748974729523884,
      "grad_norm": 1.774511456489563,
      "learning_rate": 8.083631020418792e-07,
      "loss": 0.5958,
      "step": 1400
    },
    {
      "epoch": 0.881146740616334,
      "grad_norm": 1.6152182817459106,
      "learning_rate": 7.305224076654127e-07,
      "loss": 0.6025,
      "step": 1410
    },
    {
      "epoch": 0.8873960082802796,
      "grad_norm": 3.579341173171997,
      "learning_rate": 6.564797288622371e-07,
      "loss": 0.5985,
      "step": 1420
    },
    {
      "epoch": 0.8936452759442253,
      "grad_norm": 1.9974656105041504,
      "learning_rate": 5.86265403398899e-07,
      "loss": 0.5992,
      "step": 1430
    },
    {
      "epoch": 0.8998945436081709,
      "grad_norm": 1.8749598264694214,
      "learning_rate": 5.199082004372958e-07,
      "loss": 0.5714,
      "step": 1440
    },
    {
      "epoch": 0.9061438112721165,
      "grad_norm": 2.20654296875,
      "learning_rate": 4.5743530874699293e-07,
      "loss": 0.5967,
      "step": 1450
    },
    {
      "epoch": 0.9123930789360621,
      "grad_norm": 1.7783575057983398,
      "learning_rate": 3.988723255650728e-07,
      "loss": 0.5883,
      "step": 1460
    },
    {
      "epoch": 0.9186423466000078,
      "grad_norm": 1.8559459447860718,
      "learning_rate": 3.442432461080858e-07,
      "loss": 0.6041,
      "step": 1470
    },
    {
      "epoch": 0.9248916142639534,
      "grad_norm": 1.6267383098602295,
      "learning_rate": 2.935704537404083e-07,
      "loss": 0.6076,
      "step": 1480
    },
    {
      "epoch": 0.9311408819278991,
      "grad_norm": 1.5899640321731567,
      "learning_rate": 2.468747108030289e-07,
      "loss": 0.593,
      "step": 1490
    },
    {
      "epoch": 0.9373901495918447,
      "grad_norm": 1.630462884902954,
      "learning_rate": 2.0417515010652032e-07,
      "loss": 0.5928,
      "step": 1500
    },
    {
      "epoch": 0.9436394172557904,
      "grad_norm": 1.6635667085647583,
      "learning_rate": 1.6548926709168634e-07,
      "loss": 0.6025,
      "step": 1510
    },
    {
      "epoch": 0.949888684919736,
      "grad_norm": 1.7267752885818481,
      "learning_rate": 1.30832912661093e-07,
      "loss": 0.5937,
      "step": 1520
    },
    {
      "epoch": 0.9561379525836816,
      "grad_norm": 1.8789523839950562,
      "learning_rate": 1.0022028668442374e-07,
      "loss": 0.5874,
      "step": 1530
    },
    {
      "epoch": 0.9623872202476272,
      "grad_norm": 1.6144094467163086,
      "learning_rate": 7.366393218031564e-08,
      "loss": 0.6099,
      "step": 1540
    },
    {
      "epoch": 0.9686364879115729,
      "grad_norm": 1.5856353044509888,
      "learning_rate": 5.1174730177064866e-08,
      "loss": 0.5902,
      "step": 1550
    },
    {
      "epoch": 0.9748857555755185,
      "grad_norm": 1.5565356016159058,
      "learning_rate": 3.2761895254306285e-08,
      "loss": 0.5852,
      "step": 1560
    },
    {
      "epoch": 0.9811350232394641,
      "grad_norm": 1.7339156866073608,
      "learning_rate": 1.8432971767488038e-08,
      "loss": 0.5962,
      "step": 1570
    },
    {
      "epoch": 0.9873842909034097,
      "grad_norm": 1.5991542339324951,
      "learning_rate": 8.193830756699773e-09,
      "loss": 0.6002,
      "step": 1580
    },
    {
      "epoch": 0.9936335585673554,
      "grad_norm": 1.7472448348999023,
      "learning_rate": 2.0486675411102165e-09,
      "loss": 0.5943,
      "step": 1590
    },
    {
      "epoch": 0.999882826231301,
      "grad_norm": 1.9169718027114868,
      "learning_rate": 0.0,
      "loss": 0.5911,
      "step": 1600
    },
    {
      "epoch": 0.999882826231301,
      "step": 1600,
      "total_flos": 1.7902479199415828e+19,
      "train_loss": 0.6576289132237434,
      "train_runtime": 37752.4225,
      "train_samples_per_second": 2.713,
      "train_steps_per_second": 0.042
    }
  ],
  "logging_steps": 10,
  "max_steps": 1600,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 256,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.7902479199415828e+19,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}