{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999882826231301, "eval_steps": 500, "global_step": 1600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006249267663945631, "grad_norm": 6.042294502258301, "learning_rate": 4.166666666666667e-06, "loss": 0.7433, "step": 10 }, { "epoch": 0.012498535327891263, "grad_norm": 3.4981777667999268, "learning_rate": 8.333333333333334e-06, "loss": 0.6848, "step": 20 }, { "epoch": 0.018747802991836895, "grad_norm": 7.156867980957031, "learning_rate": 1.25e-05, "loss": 0.6722, "step": 30 }, { "epoch": 0.024997070655782525, "grad_norm": 3.301668167114258, "learning_rate": 1.6666666666666667e-05, "loss": 0.6924, "step": 40 }, { "epoch": 0.031246338319728156, "grad_norm": 2.7308597564697266, "learning_rate": 1.999991805061211e-05, "loss": 0.6759, "step": 50 }, { "epoch": 0.03749560598367379, "grad_norm": 5.730428695678711, "learning_rate": 1.999704996306308e-05, "loss": 0.7484, "step": 60 }, { "epoch": 0.04374487364761942, "grad_norm": 2.7434427738189697, "learning_rate": 1.999008574916082e-05, "loss": 0.7298, "step": 70 }, { "epoch": 0.04999414131156505, "grad_norm": 2.67563533782959, "learning_rate": 1.997902826237712e-05, "loss": 0.7381, "step": 80 }, { "epoch": 0.05624340897551068, "grad_norm": 4.653203010559082, "learning_rate": 1.9963882033334827e-05, "loss": 0.716, "step": 90 }, { "epoch": 0.06249267663945631, "grad_norm": 3.6553049087524414, "learning_rate": 1.9944653267951507e-05, "loss": 0.713, "step": 100 }, { "epoch": 0.06874194430340194, "grad_norm": 3.856987953186035, "learning_rate": 1.9921349844896655e-05, "loss": 0.6886, "step": 110 }, { "epoch": 0.07499121196734758, "grad_norm": 2.80169677734375, "learning_rate": 1.9893981312363563e-05, "loss": 0.7459, "step": 120 }, { "epoch": 0.0812404796312932, "grad_norm": 9.439187049865723, "learning_rate": 1.9862558884157067e-05, "loss": 0.7131, "step": 130 }, { "epoch": 0.08748974729523884, "grad_norm": 3.0188660621643066, "learning_rate": 1.9827095435098926e-05, "loss": 0.7419, "step": 140 }, { "epoch": 0.09373901495918448, "grad_norm": 6.821515083312988, "learning_rate": 1.9787605495752528e-05, "loss": 0.7093, "step": 150 }, { "epoch": 0.0999882826231301, "grad_norm": 3.1190407276153564, "learning_rate": 1.9744105246469264e-05, "loss": 0.7162, "step": 160 }, { "epoch": 0.10623755028707574, "grad_norm": 3.4266233444213867, "learning_rate": 1.9696612510758878e-05, "loss": 0.7501, "step": 170 }, { "epoch": 0.11248681795102136, "grad_norm": 6.012339115142822, "learning_rate": 1.964514674798659e-05, "loss": 0.7228, "step": 180 }, { "epoch": 0.118736085614967, "grad_norm": 2.518491268157959, "learning_rate": 1.9589729045399935e-05, "loss": 0.7283, "step": 190 }, { "epoch": 0.12498535327891262, "grad_norm": 2.739582061767578, "learning_rate": 1.953038210948861e-05, "loss": 0.7464, "step": 200 }, { "epoch": 0.13123462094285826, "grad_norm": 2.70062518119812, "learning_rate": 1.9467130256680867e-05, "loss": 0.721, "step": 210 }, { "epoch": 0.13748388860680388, "grad_norm": 2.4166440963745117, "learning_rate": 1.9399999403380266e-05, "loss": 0.711, "step": 220 }, { "epoch": 0.14373315627074953, "grad_norm": 3.875697374343872, "learning_rate": 1.932901705534683e-05, "loss": 0.7296, "step": 230 }, { "epoch": 0.14998242393469516, "grad_norm": 3.1276426315307617, "learning_rate": 1.9254212296427043e-05, "loss": 0.7079, "step": 240 }, { "epoch": 0.15623169159864078, "grad_norm": 3.067948341369629, "learning_rate": 1.9175615776637212e-05, "loss": 0.7231, "step": 250 }, { "epoch": 0.1624809592625864, "grad_norm": 2.604984760284424, "learning_rate": 1.9093259699605125e-05, "loss": 0.7202, "step": 260 }, { "epoch": 0.16873022692653206, "grad_norm": 2.3994998931884766, "learning_rate": 1.900717780937514e-05, "loss": 0.7129, "step": 270 }, { "epoch": 0.17497949459047768, "grad_norm": 2.617325782775879, "learning_rate": 1.8917405376582144e-05, "loss": 0.7142, "step": 280 }, { "epoch": 0.1812287622544233, "grad_norm": 2.6472878456115723, "learning_rate": 1.8823979183999965e-05, "loss": 0.7168, "step": 290 }, { "epoch": 0.18747802991836895, "grad_norm": 2.1472179889678955, "learning_rate": 1.8726937511470247e-05, "loss": 0.7209, "step": 300 }, { "epoch": 0.19372729758231458, "grad_norm": 2.5548176765441895, "learning_rate": 1.8626320120217922e-05, "loss": 0.7278, "step": 310 }, { "epoch": 0.1999765652462602, "grad_norm": 2.2397587299346924, "learning_rate": 1.8522168236559693e-05, "loss": 0.7229, "step": 320 }, { "epoch": 0.20622583291020583, "grad_norm": 2.72623872756958, "learning_rate": 1.8414524535012244e-05, "loss": 0.7003, "step": 330 }, { "epoch": 0.21247510057415148, "grad_norm": 2.6225929260253906, "learning_rate": 1.8303433120807043e-05, "loss": 0.729, "step": 340 }, { "epoch": 0.2187243682380971, "grad_norm": 4.628861427307129, "learning_rate": 1.8188939511818965e-05, "loss": 0.7437, "step": 350 }, { "epoch": 0.22497363590204272, "grad_norm": 2.2117223739624023, "learning_rate": 1.8071090619916095e-05, "loss": 0.7101, "step": 360 }, { "epoch": 0.23122290356598835, "grad_norm": 2.2710976600646973, "learning_rate": 1.7949934731738348e-05, "loss": 0.7297, "step": 370 }, { "epoch": 0.237472171229934, "grad_norm": 2.2278854846954346, "learning_rate": 1.7825521488912833e-05, "loss": 0.7275, "step": 380 }, { "epoch": 0.24372143889387962, "grad_norm": 2.332300901412964, "learning_rate": 1.7697901867713997e-05, "loss": 0.7103, "step": 390 }, { "epoch": 0.24997070655782525, "grad_norm": 1.9451990127563477, "learning_rate": 1.7567128158176955e-05, "loss": 0.6981, "step": 400 }, { "epoch": 0.2562199742217709, "grad_norm": 2.7532973289489746, "learning_rate": 1.7433253942672497e-05, "loss": 0.7015, "step": 410 }, { "epoch": 0.2624692418857165, "grad_norm": 2.6847236156463623, "learning_rate": 1.7296334073952606e-05, "loss": 0.7242, "step": 420 }, { "epoch": 0.26871850954966214, "grad_norm": 2.6689813137054443, "learning_rate": 1.7156424652675433e-05, "loss": 0.7115, "step": 430 }, { "epoch": 0.27496777721360777, "grad_norm": 2.2913591861724854, "learning_rate": 1.7013583004418994e-05, "loss": 0.6912, "step": 440 }, { "epoch": 0.2812170448775534, "grad_norm": 2.5969536304473877, "learning_rate": 1.6867867656192946e-05, "loss": 0.7074, "step": 450 }, { "epoch": 0.28746631254149907, "grad_norm": 2.246922731399536, "learning_rate": 1.6719338312458123e-05, "loss": 0.7048, "step": 460 }, { "epoch": 0.2937155802054447, "grad_norm": 2.251768112182617, "learning_rate": 1.656805583066361e-05, "loss": 0.7085, "step": 470 }, { "epoch": 0.2999648478693903, "grad_norm": 2.249643325805664, "learning_rate": 1.6414082196311402e-05, "loss": 0.6897, "step": 480 }, { "epoch": 0.30621411553333594, "grad_norm": 1.974482536315918, "learning_rate": 1.6257480497558873e-05, "loss": 0.6941, "step": 490 }, { "epoch": 0.31246338319728156, "grad_norm": 2.1665918827056885, "learning_rate": 1.6098314899369446e-05, "loss": 0.7061, "step": 500 }, { "epoch": 0.3187126508612272, "grad_norm": 2.391334295272827, "learning_rate": 1.5936650617222063e-05, "loss": 0.6958, "step": 510 }, { "epoch": 0.3249619185251728, "grad_norm": 2.047011375427246, "learning_rate": 1.5772553890390196e-05, "loss": 0.7177, "step": 520 }, { "epoch": 0.3312111861891185, "grad_norm": 1.8378033638000488, "learning_rate": 1.560609195480142e-05, "loss": 0.6803, "step": 530 }, { "epoch": 0.3374604538530641, "grad_norm": 2.126997232437134, "learning_rate": 1.5437333015488586e-05, "loss": 0.6694, "step": 540 }, { "epoch": 0.34370972151700974, "grad_norm": 2.4227328300476074, "learning_rate": 1.526634621864395e-05, "loss": 0.6773, "step": 550 }, { "epoch": 0.34995898918095536, "grad_norm": 2.368255615234375, "learning_rate": 1.5093201623287631e-05, "loss": 0.7048, "step": 560 }, { "epoch": 0.356208256844901, "grad_norm": 1.9544981718063354, "learning_rate": 1.4917970172562122e-05, "loss": 0.686, "step": 570 }, { "epoch": 0.3624575245088466, "grad_norm": 2.019627809524536, "learning_rate": 1.4740723664664483e-05, "loss": 0.6911, "step": 580 }, { "epoch": 0.36870679217279223, "grad_norm": 1.8915493488311768, "learning_rate": 1.4561534723428205e-05, "loss": 0.6828, "step": 590 }, { "epoch": 0.3749560598367379, "grad_norm": 2.211825370788574, "learning_rate": 1.4380476768566825e-05, "loss": 0.6636, "step": 600 }, { "epoch": 0.38120532750068353, "grad_norm": 1.9755256175994873, "learning_rate": 1.4197623985591373e-05, "loss": 0.6902, "step": 610 }, { "epoch": 0.38745459516462916, "grad_norm": 1.9239295721054077, "learning_rate": 1.4013051295414108e-05, "loss": 0.6795, "step": 620 }, { "epoch": 0.3937038628285748, "grad_norm": 2.1901464462280273, "learning_rate": 1.3826834323650899e-05, "loss": 0.7005, "step": 630 }, { "epoch": 0.3999531304925204, "grad_norm": 1.9579427242279053, "learning_rate": 1.3639049369634878e-05, "loss": 0.6789, "step": 640 }, { "epoch": 0.406202398156466, "grad_norm": 1.819069266319275, "learning_rate": 1.344977337515404e-05, "loss": 0.6808, "step": 650 }, { "epoch": 0.41245166582041165, "grad_norm": 2.11354923248291, "learning_rate": 1.3259083892925633e-05, "loss": 0.6798, "step": 660 }, { "epoch": 0.41870093348435733, "grad_norm": 2.0537407398223877, "learning_rate": 1.3067059054820184e-05, "loss": 0.6587, "step": 670 }, { "epoch": 0.42495020114830295, "grad_norm": 2.4250621795654297, "learning_rate": 1.2873777539848284e-05, "loss": 0.6699, "step": 680 }, { "epoch": 0.4311994688122486, "grad_norm": 2.0022125244140625, "learning_rate": 1.2679318541923131e-05, "loss": 0.6644, "step": 690 }, { "epoch": 0.4374487364761942, "grad_norm": 1.9328835010528564, "learning_rate": 1.248376173741215e-05, "loss": 0.68, "step": 700 }, { "epoch": 0.4436980041401398, "grad_norm": 2.0376391410827637, "learning_rate": 1.2287187252490914e-05, "loss": 0.684, "step": 710 }, { "epoch": 0.44994727180408545, "grad_norm": 1.9673105478286743, "learning_rate": 1.2089675630312755e-05, "loss": 0.6515, "step": 720 }, { "epoch": 0.45619653946803107, "grad_norm": 1.7611652612686157, "learning_rate": 1.1891307798007536e-05, "loss": 0.6609, "step": 730 }, { "epoch": 0.4624458071319767, "grad_norm": 1.9665021896362305, "learning_rate": 1.1692165033523117e-05, "loss": 0.6801, "step": 740 }, { "epoch": 0.4686950747959224, "grad_norm": 2.349421977996826, "learning_rate": 1.1492328932323022e-05, "loss": 0.6831, "step": 750 }, { "epoch": 0.474944342459868, "grad_norm": 2.5945515632629395, "learning_rate": 1.1291881373954066e-05, "loss": 0.6725, "step": 760 }, { "epoch": 0.4811936101238136, "grad_norm": 1.7327567338943481, "learning_rate": 1.109090448849755e-05, "loss": 0.6394, "step": 770 }, { "epoch": 0.48744287778775924, "grad_norm": 1.980264663696289, "learning_rate": 1.088948062291783e-05, "loss": 0.6682, "step": 780 }, { "epoch": 0.49369214545170487, "grad_norm": 2.313495635986328, "learning_rate": 1.0687692307321984e-05, "loss": 0.6353, "step": 790 }, { "epoch": 0.4999414131156505, "grad_norm": 2.176020860671997, "learning_rate": 1.0485622221144485e-05, "loss": 0.6433, "step": 800 }, { "epoch": 0.5061906807795962, "grad_norm": 2.5199544429779053, "learning_rate": 1.0283353159270644e-05, "loss": 0.6719, "step": 810 }, { "epoch": 0.5124399484435418, "grad_norm": 2.6333529949188232, "learning_rate": 1.0080967998112787e-05, "loss": 0.6576, "step": 820 }, { "epoch": 0.5186892161074874, "grad_norm": 2.367847204208374, "learning_rate": 9.878549661653013e-06, "loss": 0.6547, "step": 830 }, { "epoch": 0.524938483771433, "grad_norm": 1.736178994178772, "learning_rate": 9.676181087466444e-06, "loss": 0.6361, "step": 840 }, { "epoch": 0.5311877514353787, "grad_norm": 1.8639039993286133, "learning_rate": 9.473945192738933e-06, "loss": 0.6497, "step": 850 }, { "epoch": 0.5374370190993243, "grad_norm": 1.7635918855667114, "learning_rate": 9.27192484029312e-06, "loss": 0.6654, "step": 860 }, { "epoch": 0.5436862867632699, "grad_norm": 2.0125083923339844, "learning_rate": 9.070202804636745e-06, "loss": 0.6397, "step": 870 }, { "epoch": 0.5499355544272155, "grad_norm": 2.078883171081543, "learning_rate": 8.868861738047158e-06, "loss": 0.6603, "step": 880 }, { "epoch": 0.5561848220911612, "grad_norm": 1.781632423400879, "learning_rate": 8.667984136705927e-06, "loss": 0.6556, "step": 890 }, { "epoch": 0.5624340897551068, "grad_norm": 2.0591318607330322, "learning_rate": 8.46765230689737e-06, "loss": 0.6258, "step": 900 }, { "epoch": 0.5686833574190524, "grad_norm": 1.6111246347427368, "learning_rate": 8.267948331284923e-06, "loss": 0.6472, "step": 910 }, { "epoch": 0.5749326250829981, "grad_norm": 2.417381763458252, "learning_rate": 8.068954035279121e-06, "loss": 0.6226, "step": 920 }, { "epoch": 0.5811818927469438, "grad_norm": 2.185147762298584, "learning_rate": 7.870750953510983e-06, "loss": 0.617, "step": 930 }, { "epoch": 0.5874311604108894, "grad_norm": 2.3496596813201904, "learning_rate": 7.673420296424541e-06, "loss": 0.633, "step": 940 }, { "epoch": 0.593680428074835, "grad_norm": 2.4552910327911377, "learning_rate": 7.4770429170022e-06, "loss": 0.6312, "step": 950 }, { "epoch": 0.5999296957387806, "grad_norm": 2.7454309463500977, "learning_rate": 7.2816992776365714e-06, "loss": 0.6252, "step": 960 }, { "epoch": 0.6061789634027263, "grad_norm": 2.967708110809326, "learning_rate": 7.08746941716232e-06, "loss": 0.634, "step": 970 }, { "epoch": 0.6124282310666719, "grad_norm": 2.755345582962036, "learning_rate": 6.894432918061579e-06, "loss": 0.6216, "step": 980 }, { "epoch": 0.6186774987306175, "grad_norm": 1.987317681312561, "learning_rate": 6.702668873856339e-06, "loss": 0.643, "step": 990 }, { "epoch": 0.6249267663945631, "grad_norm": 2.4610655307769775, "learning_rate": 6.5122558567011775e-06, "loss": 0.6409, "step": 1000 }, { "epoch": 0.6311760340585088, "grad_norm": 1.9931960105895996, "learning_rate": 6.323271885189636e-06, "loss": 0.6418, "step": 1010 }, { "epoch": 0.6374253017224544, "grad_norm": 3.0215256214141846, "learning_rate": 6.135794392387353e-06, "loss": 0.635, "step": 1020 }, { "epoch": 0.6436745693864, "grad_norm": 1.752156376838684, "learning_rate": 5.949900194105167e-06, "loss": 0.6326, "step": 1030 }, { "epoch": 0.6499238370503456, "grad_norm": 1.8890056610107422, "learning_rate": 5.765665457425102e-06, "loss": 0.6338, "step": 1040 }, { "epoch": 0.6561731047142912, "grad_norm": 2.2345621585845947, "learning_rate": 5.5831656694921465e-06, "loss": 0.6262, "step": 1050 }, { "epoch": 0.662422372378237, "grad_norm": 1.983519196510315, "learning_rate": 5.40247560658467e-06, "loss": 0.6421, "step": 1060 }, { "epoch": 0.6686716400421826, "grad_norm": 2.091252326965332, "learning_rate": 5.223669303476041e-06, "loss": 0.6166, "step": 1070 }, { "epoch": 0.6749209077061282, "grad_norm": 1.9888851642608643, "learning_rate": 5.046820023100129e-06, "loss": 0.6033, "step": 1080 }, { "epoch": 0.6811701753700738, "grad_norm": 3.788809061050415, "learning_rate": 4.872000226533001e-06, "loss": 0.6292, "step": 1090 }, { "epoch": 0.6874194430340195, "grad_norm": 2.0238468647003174, "learning_rate": 4.699281543303222e-06, "loss": 0.6238, "step": 1100 }, { "epoch": 0.6936687106979651, "grad_norm": 1.7076054811477661, "learning_rate": 4.528734742042803e-06, "loss": 0.609, "step": 1110 }, { "epoch": 0.6999179783619107, "grad_norm": 2.1124353408813477, "learning_rate": 4.360429701490935e-06, "loss": 0.6116, "step": 1120 }, { "epoch": 0.7061672460258563, "grad_norm": 1.8266746997833252, "learning_rate": 4.194435381862343e-06, "loss": 0.6217, "step": 1130 }, { "epoch": 0.712416513689802, "grad_norm": 1.7669917345046997, "learning_rate": 4.03081979659195e-06, "loss": 0.6247, "step": 1140 }, { "epoch": 0.7186657813537476, "grad_norm": 2.847994565963745, "learning_rate": 3.869649984467504e-06, "loss": 0.6092, "step": 1150 }, { "epoch": 0.7249150490176932, "grad_norm": 5.524293899536133, "learning_rate": 3.7109919821615546e-06, "loss": 0.6063, "step": 1160 }, { "epoch": 0.7311643166816388, "grad_norm": 2.3931970596313477, "learning_rate": 3.5549107971739905e-06, "loss": 0.6154, "step": 1170 }, { "epoch": 0.7374135843455845, "grad_norm": 1.9852912425994873, "learning_rate": 3.4014703811963024e-06, "loss": 0.6119, "step": 1180 }, { "epoch": 0.7436628520095301, "grad_norm": 1.8052198886871338, "learning_rate": 3.2507336039084315e-06, "loss": 0.6043, "step": 1190 }, { "epoch": 0.7499121196734758, "grad_norm": 1.8438955545425415, "learning_rate": 3.1027622272189572e-06, "loss": 0.6308, "step": 1200 }, { "epoch": 0.7561613873374214, "grad_norm": 2.11885929107666, "learning_rate": 2.9576168799591663e-06, "loss": 0.5984, "step": 1210 }, { "epoch": 0.7624106550013671, "grad_norm": 2.6443002223968506, "learning_rate": 2.8153570330413925e-06, "loss": 0.6139, "step": 1220 }, { "epoch": 0.7686599226653127, "grad_norm": 2.413456678390503, "learning_rate": 2.6760409750917925e-06, "loss": 0.5974, "step": 1230 }, { "epoch": 0.7749091903292583, "grad_norm": 2.166473627090454, "learning_rate": 2.5397257885675396e-06, "loss": 0.6034, "step": 1240 }, { "epoch": 0.7811584579932039, "grad_norm": 1.9241559505462646, "learning_rate": 2.406467326368237e-06, "loss": 0.6026, "step": 1250 }, { "epoch": 0.7874077256571496, "grad_norm": 2.043766736984253, "learning_rate": 2.2763201889510987e-06, "loss": 0.6216, "step": 1260 }, { "epoch": 0.7936569933210952, "grad_norm": 1.6110656261444092, "learning_rate": 2.149337701959325e-06, "loss": 0.6198, "step": 1270 }, { "epoch": 0.7999062609850408, "grad_norm": 1.7661818265914917, "learning_rate": 2.025571894372794e-06, "loss": 0.5838, "step": 1280 }, { "epoch": 0.8061555286489864, "grad_norm": 1.8362523317337036, "learning_rate": 1.9050734771900414e-06, "loss": 0.5999, "step": 1290 }, { "epoch": 0.812404796312932, "grad_norm": 1.9224945306777954, "learning_rate": 1.7878918226502816e-06, "loss": 0.5956, "step": 1300 }, { "epoch": 0.8186540639768777, "grad_norm": 2.024622678756714, "learning_rate": 1.6740749440039262e-06, "loss": 0.6083, "step": 1310 }, { "epoch": 0.8249033316408233, "grad_norm": 2.0744571685791016, "learning_rate": 1.5636694758399563e-06, "loss": 0.5917, "step": 1320 }, { "epoch": 0.8311525993047689, "grad_norm": 2.3073995113372803, "learning_rate": 1.4567206549781699e-06, "loss": 0.5929, "step": 1330 }, { "epoch": 0.8374018669687147, "grad_norm": 1.8136156797409058, "learning_rate": 1.3532723019341376e-06, "loss": 0.5936, "step": 1340 }, { "epoch": 0.8436511346326603, "grad_norm": 1.8207391500473022, "learning_rate": 1.2533668029644751e-06, "loss": 0.5933, "step": 1350 }, { "epoch": 0.8499004022966059, "grad_norm": 2.001443386077881, "learning_rate": 1.1570450926997657e-06, "loss": 0.5891, "step": 1360 }, { "epoch": 0.8561496699605515, "grad_norm": 1.8668729066848755, "learning_rate": 1.064346637372271e-06, "loss": 0.5799, "step": 1370 }, { "epoch": 0.8623989376244972, "grad_norm": 2.3950822353363037, "learning_rate": 9.753094186453028e-07, "loss": 0.5994, "step": 1380 }, { "epoch": 0.8686482052884428, "grad_norm": 1.9551249742507935, "learning_rate": 8.89969918050847e-07, "loss": 0.6119, "step": 1390 }, { "epoch": 0.8748974729523884, "grad_norm": 1.774511456489563, "learning_rate": 8.083631020418792e-07, "loss": 0.5958, "step": 1400 }, { "epoch": 0.881146740616334, "grad_norm": 1.6152182817459106, "learning_rate": 7.305224076654127e-07, "loss": 0.6025, "step": 1410 }, { "epoch": 0.8873960082802796, "grad_norm": 3.579341173171997, "learning_rate": 6.564797288622371e-07, "loss": 0.5985, "step": 1420 }, { "epoch": 0.8936452759442253, "grad_norm": 1.9974656105041504, "learning_rate": 5.86265403398899e-07, "loss": 0.5992, "step": 1430 }, { "epoch": 0.8998945436081709, "grad_norm": 1.8749598264694214, "learning_rate": 5.199082004372958e-07, "loss": 0.5714, "step": 1440 }, { "epoch": 0.9061438112721165, "grad_norm": 2.20654296875, "learning_rate": 4.5743530874699293e-07, "loss": 0.5967, "step": 1450 }, { "epoch": 0.9123930789360621, "grad_norm": 1.7783575057983398, "learning_rate": 3.988723255650728e-07, "loss": 0.5883, "step": 1460 }, { "epoch": 0.9186423466000078, "grad_norm": 1.8559459447860718, "learning_rate": 3.442432461080858e-07, "loss": 0.6041, "step": 1470 }, { "epoch": 0.9248916142639534, "grad_norm": 1.6267383098602295, "learning_rate": 2.935704537404083e-07, "loss": 0.6076, "step": 1480 }, { "epoch": 0.9311408819278991, "grad_norm": 1.5899640321731567, "learning_rate": 2.468747108030289e-07, "loss": 0.593, "step": 1490 }, { "epoch": 0.9373901495918447, "grad_norm": 1.630462884902954, "learning_rate": 2.0417515010652032e-07, "loss": 0.5928, "step": 1500 }, { "epoch": 0.9436394172557904, "grad_norm": 1.6635667085647583, "learning_rate": 1.6548926709168634e-07, "loss": 0.6025, "step": 1510 }, { "epoch": 0.949888684919736, "grad_norm": 1.7267752885818481, "learning_rate": 1.30832912661093e-07, "loss": 0.5937, "step": 1520 }, { "epoch": 0.9561379525836816, "grad_norm": 1.8789523839950562, "learning_rate": 1.0022028668442374e-07, "loss": 0.5874, "step": 1530 }, { "epoch": 0.9623872202476272, "grad_norm": 1.6144094467163086, "learning_rate": 7.366393218031564e-08, "loss": 0.6099, "step": 1540 }, { "epoch": 0.9686364879115729, "grad_norm": 1.5856353044509888, "learning_rate": 5.1174730177064866e-08, "loss": 0.5902, "step": 1550 }, { "epoch": 0.9748857555755185, "grad_norm": 1.5565356016159058, "learning_rate": 3.2761895254306285e-08, "loss": 0.5852, "step": 1560 }, { "epoch": 0.9811350232394641, "grad_norm": 1.7339156866073608, "learning_rate": 1.8432971767488038e-08, "loss": 0.5962, "step": 1570 }, { "epoch": 0.9873842909034097, "grad_norm": 1.5991542339324951, "learning_rate": 8.193830756699773e-09, "loss": 0.6002, "step": 1580 }, { "epoch": 0.9936335585673554, "grad_norm": 1.7472448348999023, "learning_rate": 2.0486675411102165e-09, "loss": 0.5943, "step": 1590 }, { "epoch": 0.999882826231301, "grad_norm": 1.9169718027114868, "learning_rate": 0.0, "loss": 0.5911, "step": 1600 }, { "epoch": 0.999882826231301, "step": 1600, "total_flos": 1.7902479199415828e+19, "train_loss": 0.6576289132237434, "train_runtime": 37752.4225, "train_samples_per_second": 2.713, "train_steps_per_second": 0.042 } ], "logging_steps": 10, "max_steps": 1600, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 256, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7902479199415828e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }