{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6623832549513148, "eval_steps": 500, "global_step": 80000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008279790686891435, "grad_norm": 0.2026122659444809, "learning_rate": 0.00019994481795043316, "loss": 1.9308, "step": 100 }, { "epoch": 0.001655958137378287, "grad_norm": 0.189473494887352, "learning_rate": 0.00019988961934128355, "loss": 1.796, "step": 200 }, { "epoch": 0.0024839372060674308, "grad_norm": 0.21440361440181732, "learning_rate": 0.00019983442073213394, "loss": 1.7292, "step": 300 }, { "epoch": 0.003311916274756574, "grad_norm": 0.4124305844306946, "learning_rate": 0.00019977922212298434, "loss": 1.7479, "step": 400 }, { "epoch": 0.004139895343445717, "grad_norm": 0.326767235994339, "learning_rate": 0.00019972402351383473, "loss": 1.7445, "step": 500 }, { "epoch": 0.0049678744121348616, "grad_norm": 0.23368559777736664, "learning_rate": 0.00019966882490468513, "loss": 1.7146, "step": 600 }, { "epoch": 0.005795853480824005, "grad_norm": 0.2080787867307663, "learning_rate": 0.00019961362629553552, "loss": 1.7009, "step": 700 }, { "epoch": 0.006623832549513148, "grad_norm": 0.23906764388084412, "learning_rate": 0.00019955842768638592, "loss": 1.7346, "step": 800 }, { "epoch": 0.0074518116182022915, "grad_norm": 0.21370458602905273, "learning_rate": 0.00019950322907723634, "loss": 1.7161, "step": 900 }, { "epoch": 0.008279790686891435, "grad_norm": 0.2232082337141037, "learning_rate": 0.0001994480304680867, "loss": 1.6845, "step": 1000 }, { "epoch": 0.009107769755580579, "grad_norm": 0.28137433528900146, "learning_rate": 0.00019939283185893713, "loss": 1.7064, "step": 1100 }, { "epoch": 0.009935748824269723, "grad_norm": 0.27577510476112366, "learning_rate": 0.00019933763324978752, "loss": 1.7216, "step": 1200 }, { "epoch": 0.010763727892958866, "grad_norm": 0.19732429087162018, "learning_rate": 0.0001992824346406379, "loss": 1.6785, "step": 1300 }, { "epoch": 0.01159170696164801, "grad_norm": 0.2802606523036957, "learning_rate": 0.0001992272360314883, "loss": 1.6987, "step": 1400 }, { "epoch": 0.012419686030337154, "grad_norm": 0.2615717053413391, "learning_rate": 0.0001991720374223387, "loss": 1.7157, "step": 1500 }, { "epoch": 0.013247665099026296, "grad_norm": 0.21463246643543243, "learning_rate": 0.00019911683881318907, "loss": 1.709, "step": 1600 }, { "epoch": 0.01407564416771544, "grad_norm": 0.23596777021884918, "learning_rate": 0.0001990616402040395, "loss": 1.7288, "step": 1700 }, { "epoch": 0.014903623236404583, "grad_norm": 0.26049914956092834, "learning_rate": 0.00019900644159488989, "loss": 1.7007, "step": 1800 }, { "epoch": 0.01573160230509373, "grad_norm": 0.1828412264585495, "learning_rate": 0.00019895124298574025, "loss": 1.6872, "step": 1900 }, { "epoch": 0.01655958137378287, "grad_norm": 0.30434760451316833, "learning_rate": 0.00019889604437659067, "loss": 1.7084, "step": 2000 }, { "epoch": 0.017387560442472014, "grad_norm": 0.19470958411693573, "learning_rate": 0.00019884084576744107, "loss": 1.6804, "step": 2100 }, { "epoch": 0.018215539511161158, "grad_norm": 0.23792260885238647, "learning_rate": 0.00019878564715829144, "loss": 1.6873, "step": 2200 }, { "epoch": 0.019043518579850302, "grad_norm": 0.2714627683162689, "learning_rate": 0.00019873044854914186, "loss": 1.6803, "step": 2300 }, { "epoch": 0.019871497648539446, "grad_norm": 0.2310653179883957, "learning_rate": 0.00019867524993999225, "loss": 1.7056, "step": 2400 }, { "epoch": 0.020699476717228587, "grad_norm": 0.21665313839912415, "learning_rate": 0.00019862005133084262, "loss": 1.6636, "step": 2500 }, { "epoch": 0.02152745578591773, "grad_norm": 0.26358917355537415, "learning_rate": 0.00019856485272169304, "loss": 1.6567, "step": 2600 }, { "epoch": 0.022355434854606875, "grad_norm": 0.2334413081407547, "learning_rate": 0.00019850965411254343, "loss": 1.6666, "step": 2700 }, { "epoch": 0.02318341392329602, "grad_norm": 0.20429347455501556, "learning_rate": 0.0001984544555033938, "loss": 1.7123, "step": 2800 }, { "epoch": 0.024011392991985164, "grad_norm": 0.22018109261989594, "learning_rate": 0.00019839925689424422, "loss": 1.6405, "step": 2900 }, { "epoch": 0.024839372060674308, "grad_norm": 0.2289331555366516, "learning_rate": 0.00019834405828509462, "loss": 1.6769, "step": 3000 }, { "epoch": 0.02566735112936345, "grad_norm": 0.21042221784591675, "learning_rate": 0.00019828885967594498, "loss": 1.6559, "step": 3100 }, { "epoch": 0.026495330198052593, "grad_norm": 0.24456587433815002, "learning_rate": 0.0001982336610667954, "loss": 1.6889, "step": 3200 }, { "epoch": 0.027323309266741737, "grad_norm": 0.19650936126708984, "learning_rate": 0.0001981784624576458, "loss": 1.6577, "step": 3300 }, { "epoch": 0.02815128833543088, "grad_norm": 0.2078315168619156, "learning_rate": 0.00019812326384849617, "loss": 1.6968, "step": 3400 }, { "epoch": 0.028979267404120025, "grad_norm": 0.27083057165145874, "learning_rate": 0.0001980680652393466, "loss": 1.6851, "step": 3500 }, { "epoch": 0.029807246472809166, "grad_norm": 0.19873012602329254, "learning_rate": 0.00019801286663019698, "loss": 1.6642, "step": 3600 }, { "epoch": 0.03063522554149831, "grad_norm": 0.25127413868904114, "learning_rate": 0.00019795766802104735, "loss": 1.6925, "step": 3700 }, { "epoch": 0.03146320461018746, "grad_norm": 0.28270578384399414, "learning_rate": 0.00019790246941189777, "loss": 1.6823, "step": 3800 }, { "epoch": 0.032291183678876595, "grad_norm": 0.22762413322925568, "learning_rate": 0.00019784727080274816, "loss": 1.6802, "step": 3900 }, { "epoch": 0.03311916274756574, "grad_norm": 0.24451415240764618, "learning_rate": 0.00019779207219359853, "loss": 1.6855, "step": 4000 }, { "epoch": 0.03394714181625488, "grad_norm": 0.18539206683635712, "learning_rate": 0.00019773687358444895, "loss": 1.6753, "step": 4100 }, { "epoch": 0.03477512088494403, "grad_norm": 0.20416654646396637, "learning_rate": 0.00019768167497529935, "loss": 1.6892, "step": 4200 }, { "epoch": 0.03560309995363317, "grad_norm": 0.22939777374267578, "learning_rate": 0.00019762647636614971, "loss": 1.6953, "step": 4300 }, { "epoch": 0.036431079022322316, "grad_norm": 0.21451617777347565, "learning_rate": 0.00019757127775700014, "loss": 1.6609, "step": 4400 }, { "epoch": 0.03725905809101146, "grad_norm": 0.20271216332912445, "learning_rate": 0.00019751607914785053, "loss": 1.6868, "step": 4500 }, { "epoch": 0.038087037159700604, "grad_norm": 0.23506902158260345, "learning_rate": 0.0001974608805387009, "loss": 1.7012, "step": 4600 }, { "epoch": 0.03891501622838975, "grad_norm": 0.19889087975025177, "learning_rate": 0.00019740568192955132, "loss": 1.6867, "step": 4700 }, { "epoch": 0.03974299529707889, "grad_norm": 0.21165409684181213, "learning_rate": 0.0001973504833204017, "loss": 1.6666, "step": 4800 }, { "epoch": 0.04057097436576804, "grad_norm": 0.25541654229164124, "learning_rate": 0.0001972952847112521, "loss": 1.7012, "step": 4900 }, { "epoch": 0.041398953434457174, "grad_norm": 0.2289835512638092, "learning_rate": 0.0001972400861021025, "loss": 1.6351, "step": 5000 }, { "epoch": 0.04222693250314632, "grad_norm": 0.20699161291122437, "learning_rate": 0.0001971848874929529, "loss": 1.6834, "step": 5100 }, { "epoch": 0.04305491157183546, "grad_norm": 0.21547134220600128, "learning_rate": 0.0001971296888838033, "loss": 1.6605, "step": 5200 }, { "epoch": 0.043882890640524606, "grad_norm": 0.21996253728866577, "learning_rate": 0.00019707449027465368, "loss": 1.6747, "step": 5300 }, { "epoch": 0.04471086970921375, "grad_norm": 0.19425031542778015, "learning_rate": 0.00019701929166550408, "loss": 1.6677, "step": 5400 }, { "epoch": 0.045538848777902895, "grad_norm": 0.21223612129688263, "learning_rate": 0.00019696409305635447, "loss": 1.6401, "step": 5500 }, { "epoch": 0.04636682784659204, "grad_norm": 0.21771861612796783, "learning_rate": 0.00019690889444720487, "loss": 1.6655, "step": 5600 }, { "epoch": 0.04719480691528118, "grad_norm": 0.17102624475955963, "learning_rate": 0.00019685369583805526, "loss": 1.6373, "step": 5700 }, { "epoch": 0.04802278598397033, "grad_norm": 1.002648115158081, "learning_rate": 0.00019679849722890565, "loss": 1.6578, "step": 5800 }, { "epoch": 0.04885076505265947, "grad_norm": 0.209450364112854, "learning_rate": 0.00019674329861975605, "loss": 1.7114, "step": 5900 }, { "epoch": 0.049678744121348616, "grad_norm": 0.21328583359718323, "learning_rate": 0.00019668810001060644, "loss": 1.6816, "step": 6000 }, { "epoch": 0.05050672319003775, "grad_norm": 0.2184838354587555, "learning_rate": 0.00019663290140145684, "loss": 1.6596, "step": 6100 }, { "epoch": 0.0513347022587269, "grad_norm": 0.23963460326194763, "learning_rate": 0.00019657825477839872, "loss": 1.6627, "step": 6200 }, { "epoch": 0.05216268132741604, "grad_norm": 0.2165103405714035, "learning_rate": 0.00019652305616924912, "loss": 1.6808, "step": 6300 }, { "epoch": 0.052990660396105185, "grad_norm": 0.19727864861488342, "learning_rate": 0.0001964678575600995, "loss": 1.7164, "step": 6400 }, { "epoch": 0.05381863946479433, "grad_norm": 0.23533885180950165, "learning_rate": 0.0001964126589509499, "loss": 1.6575, "step": 6500 }, { "epoch": 0.054646618533483474, "grad_norm": 0.23083338141441345, "learning_rate": 0.0001963574603418003, "loss": 1.6481, "step": 6600 }, { "epoch": 0.05547459760217262, "grad_norm": 0.18879123032093048, "learning_rate": 0.0001963022617326507, "loss": 1.6798, "step": 6700 }, { "epoch": 0.05630257667086176, "grad_norm": 0.2472197264432907, "learning_rate": 0.0001962470631235011, "loss": 1.6739, "step": 6800 }, { "epoch": 0.057130555739550906, "grad_norm": 0.19961301982402802, "learning_rate": 0.00019619186451435148, "loss": 1.6307, "step": 6900 }, { "epoch": 0.05795853480824005, "grad_norm": 0.1921355426311493, "learning_rate": 0.00019613666590520188, "loss": 1.6432, "step": 7000 }, { "epoch": 0.058786513876929194, "grad_norm": 0.29221248626708984, "learning_rate": 0.00019608146729605227, "loss": 1.6591, "step": 7100 }, { "epoch": 0.05961449294561833, "grad_norm": 0.2565169632434845, "learning_rate": 0.00019602626868690267, "loss": 1.6314, "step": 7200 }, { "epoch": 0.060442472014307476, "grad_norm": 0.19964313507080078, "learning_rate": 0.00019597107007775306, "loss": 1.6571, "step": 7300 }, { "epoch": 0.06127045108299662, "grad_norm": 0.19322757422924042, "learning_rate": 0.00019591587146860345, "loss": 1.6553, "step": 7400 }, { "epoch": 0.062098430151685764, "grad_norm": 0.2548861801624298, "learning_rate": 0.00019586067285945385, "loss": 1.6791, "step": 7500 }, { "epoch": 0.06292640922037492, "grad_norm": 0.2616177797317505, "learning_rate": 0.00019580547425030424, "loss": 1.6208, "step": 7600 }, { "epoch": 0.06375438828906405, "grad_norm": 0.2758786976337433, "learning_rate": 0.00019575027564115464, "loss": 1.6838, "step": 7700 }, { "epoch": 0.06458236735775319, "grad_norm": 0.1881067007780075, "learning_rate": 0.00019569507703200503, "loss": 1.6577, "step": 7800 }, { "epoch": 0.06541034642644233, "grad_norm": 0.21222031116485596, "learning_rate": 0.00019563987842285543, "loss": 1.6952, "step": 7900 }, { "epoch": 0.06623832549513148, "grad_norm": 0.23496469855308533, "learning_rate": 0.00019558467981370582, "loss": 1.6604, "step": 8000 }, { "epoch": 0.06706630456382062, "grad_norm": 0.21739377081394196, "learning_rate": 0.00019552948120455621, "loss": 1.6493, "step": 8100 }, { "epoch": 0.06789428363250977, "grad_norm": 0.2083568572998047, "learning_rate": 0.0001954742825954066, "loss": 1.6507, "step": 8200 }, { "epoch": 0.06872226270119891, "grad_norm": 0.2494797557592392, "learning_rate": 0.000195419083986257, "loss": 1.6718, "step": 8300 }, { "epoch": 0.06955024176988805, "grad_norm": 0.22432386875152588, "learning_rate": 0.0001953638853771074, "loss": 1.6758, "step": 8400 }, { "epoch": 0.0703782208385772, "grad_norm": 0.21674951910972595, "learning_rate": 0.00019530923875404928, "loss": 1.6437, "step": 8500 }, { "epoch": 0.07120619990726634, "grad_norm": 0.188558429479599, "learning_rate": 0.00019525404014489968, "loss": 1.6345, "step": 8600 }, { "epoch": 0.07203417897595549, "grad_norm": 0.22968587279319763, "learning_rate": 0.00019519884153575007, "loss": 1.6609, "step": 8700 }, { "epoch": 0.07286215804464463, "grad_norm": 0.2717007100582123, "learning_rate": 0.00019514364292660047, "loss": 1.6514, "step": 8800 }, { "epoch": 0.07369013711333378, "grad_norm": 0.2857931852340698, "learning_rate": 0.00019508844431745086, "loss": 1.6669, "step": 8900 }, { "epoch": 0.07451811618202292, "grad_norm": 0.23388004302978516, "learning_rate": 0.00019503324570830125, "loss": 1.6785, "step": 9000 }, { "epoch": 0.07534609525071206, "grad_norm": 0.20342163741588593, "learning_rate": 0.00019497804709915165, "loss": 1.6777, "step": 9100 }, { "epoch": 0.07617407431940121, "grad_norm": 0.2345508486032486, "learning_rate": 0.00019492340047609353, "loss": 1.6672, "step": 9200 }, { "epoch": 0.07700205338809035, "grad_norm": 0.20348191261291504, "learning_rate": 0.00019486820186694396, "loss": 1.6851, "step": 9300 }, { "epoch": 0.0778300324567795, "grad_norm": 0.2147115170955658, "learning_rate": 0.00019481300325779435, "loss": 1.6768, "step": 9400 }, { "epoch": 0.07865801152546864, "grad_norm": 0.24411408603191376, "learning_rate": 0.00019475780464864472, "loss": 1.6488, "step": 9500 }, { "epoch": 0.07948599059415778, "grad_norm": 0.2360156625509262, "learning_rate": 0.00019470260603949514, "loss": 1.6248, "step": 9600 }, { "epoch": 0.08031396966284693, "grad_norm": 0.24534212052822113, "learning_rate": 0.00019464740743034553, "loss": 1.6425, "step": 9700 }, { "epoch": 0.08114194873153607, "grad_norm": 0.26849544048309326, "learning_rate": 0.0001945922088211959, "loss": 1.668, "step": 9800 }, { "epoch": 0.0819699278002252, "grad_norm": 0.20883604884147644, "learning_rate": 0.00019453701021204632, "loss": 1.6492, "step": 9900 }, { "epoch": 0.08279790686891435, "grad_norm": 0.22967711091041565, "learning_rate": 0.00019448181160289672, "loss": 1.6667, "step": 10000 }, { "epoch": 0.08362588593760349, "grad_norm": 0.24589480459690094, "learning_rate": 0.00019442661299374708, "loss": 1.62, "step": 10100 }, { "epoch": 0.08445386500629264, "grad_norm": 0.26812034845352173, "learning_rate": 0.0001943714143845975, "loss": 1.6351, "step": 10200 }, { "epoch": 0.08528184407498178, "grad_norm": 0.24378615617752075, "learning_rate": 0.0001943162157754479, "loss": 1.6097, "step": 10300 }, { "epoch": 0.08610982314367092, "grad_norm": 0.31285640597343445, "learning_rate": 0.00019426101716629827, "loss": 1.7014, "step": 10400 }, { "epoch": 0.08693780221236007, "grad_norm": 0.277204692363739, "learning_rate": 0.0001942058185571487, "loss": 1.6482, "step": 10500 }, { "epoch": 0.08776578128104921, "grad_norm": 0.22598931193351746, "learning_rate": 0.00019415061994799908, "loss": 1.6459, "step": 10600 }, { "epoch": 0.08859376034973836, "grad_norm": 0.3453767001628876, "learning_rate": 0.00019409542133884945, "loss": 1.6549, "step": 10700 }, { "epoch": 0.0894217394184275, "grad_norm": 0.2226220667362213, "learning_rate": 0.00019404022272969987, "loss": 1.6481, "step": 10800 }, { "epoch": 0.09024971848711665, "grad_norm": 0.2280457317829132, "learning_rate": 0.00019398502412055026, "loss": 1.6587, "step": 10900 }, { "epoch": 0.09107769755580579, "grad_norm": 0.2163500040769577, "learning_rate": 0.00019392982551140063, "loss": 1.6693, "step": 11000 }, { "epoch": 0.09190567662449493, "grad_norm": 0.19597041606903076, "learning_rate": 0.00019387462690225105, "loss": 1.6313, "step": 11100 }, { "epoch": 0.09273365569318408, "grad_norm": 0.2143152803182602, "learning_rate": 0.00019381942829310145, "loss": 1.6543, "step": 11200 }, { "epoch": 0.09356163476187322, "grad_norm": 0.2231709212064743, "learning_rate": 0.0001937642296839518, "loss": 1.6534, "step": 11300 }, { "epoch": 0.09438961383056237, "grad_norm": 0.21003256738185883, "learning_rate": 0.00019370903107480223, "loss": 1.644, "step": 11400 }, { "epoch": 0.09521759289925151, "grad_norm": 0.23484201729297638, "learning_rate": 0.00019365383246565263, "loss": 1.67, "step": 11500 }, { "epoch": 0.09604557196794065, "grad_norm": 0.23727314174175262, "learning_rate": 0.000193598633856503, "loss": 1.6078, "step": 11600 }, { "epoch": 0.0968735510366298, "grad_norm": 0.2734909653663635, "learning_rate": 0.00019354343524735342, "loss": 1.6389, "step": 11700 }, { "epoch": 0.09770153010531894, "grad_norm": 0.266926646232605, "learning_rate": 0.0001934882366382038, "loss": 1.6522, "step": 11800 }, { "epoch": 0.09852950917400809, "grad_norm": 0.22209756076335907, "learning_rate": 0.00019343303802905418, "loss": 1.679, "step": 11900 }, { "epoch": 0.09935748824269723, "grad_norm": 0.20740656554698944, "learning_rate": 0.0001933778394199046, "loss": 1.6494, "step": 12000 }, { "epoch": 0.10018546731138636, "grad_norm": 0.19304044544696808, "learning_rate": 0.000193322640810755, "loss": 1.644, "step": 12100 }, { "epoch": 0.1010134463800755, "grad_norm": 0.2168605774641037, "learning_rate": 0.0001932674422016054, "loss": 1.673, "step": 12200 }, { "epoch": 0.10184142544876465, "grad_norm": 0.2477482557296753, "learning_rate": 0.00019321224359245578, "loss": 1.6912, "step": 12300 }, { "epoch": 0.1026694045174538, "grad_norm": 0.220897376537323, "learning_rate": 0.00019315704498330618, "loss": 1.6553, "step": 12400 }, { "epoch": 0.10349738358614294, "grad_norm": 0.22782349586486816, "learning_rate": 0.00019310184637415657, "loss": 1.6585, "step": 12500 }, { "epoch": 0.10432536265483208, "grad_norm": 0.22199232876300812, "learning_rate": 0.00019304664776500697, "loss": 1.6617, "step": 12600 }, { "epoch": 0.10515334172352123, "grad_norm": 0.19177618622779846, "learning_rate": 0.00019299144915585736, "loss": 1.6788, "step": 12700 }, { "epoch": 0.10598132079221037, "grad_norm": 0.20250776410102844, "learning_rate": 0.00019293625054670775, "loss": 1.6329, "step": 12800 }, { "epoch": 0.10680929986089951, "grad_norm": 0.23451390862464905, "learning_rate": 0.00019288105193755815, "loss": 1.6336, "step": 12900 }, { "epoch": 0.10763727892958866, "grad_norm": 0.20917341113090515, "learning_rate": 0.00019282585332840854, "loss": 1.6436, "step": 13000 }, { "epoch": 0.1084652579982778, "grad_norm": 0.24702110886573792, "learning_rate": 0.00019277065471925894, "loss": 1.6569, "step": 13100 }, { "epoch": 0.10929323706696695, "grad_norm": 0.2046515792608261, "learning_rate": 0.00019271545611010933, "loss": 1.6728, "step": 13200 }, { "epoch": 0.11012121613565609, "grad_norm": 0.23590846359729767, "learning_rate": 0.00019266025750095973, "loss": 1.6808, "step": 13300 }, { "epoch": 0.11094919520434524, "grad_norm": 0.2132890671491623, "learning_rate": 0.00019260505889181012, "loss": 1.6428, "step": 13400 }, { "epoch": 0.11177717427303438, "grad_norm": 0.22132565081119537, "learning_rate": 0.000192550412268752, "loss": 1.688, "step": 13500 }, { "epoch": 0.11260515334172352, "grad_norm": 0.2686227262020111, "learning_rate": 0.0001924952136596024, "loss": 1.6523, "step": 13600 }, { "epoch": 0.11343313241041267, "grad_norm": 0.2114957571029663, "learning_rate": 0.0001924400150504528, "loss": 1.643, "step": 13700 }, { "epoch": 0.11426111147910181, "grad_norm": 0.22338928282260895, "learning_rate": 0.0001923848164413032, "loss": 1.6371, "step": 13800 }, { "epoch": 0.11508909054779096, "grad_norm": 0.27738282084465027, "learning_rate": 0.00019232961783215358, "loss": 1.664, "step": 13900 }, { "epoch": 0.1159170696164801, "grad_norm": 0.211566761136055, "learning_rate": 0.00019227441922300398, "loss": 1.7005, "step": 14000 }, { "epoch": 0.11674504868516924, "grad_norm": 0.23775742948055267, "learning_rate": 0.00019221922061385437, "loss": 1.6538, "step": 14100 }, { "epoch": 0.11757302775385839, "grad_norm": 0.24485789239406586, "learning_rate": 0.00019216402200470477, "loss": 1.6969, "step": 14200 }, { "epoch": 0.11840100682254752, "grad_norm": 0.2573772966861725, "learning_rate": 0.00019210882339555516, "loss": 1.6268, "step": 14300 }, { "epoch": 0.11922898589123666, "grad_norm": 0.24128000438213348, "learning_rate": 0.00019205362478640555, "loss": 1.6765, "step": 14400 }, { "epoch": 0.12005696495992581, "grad_norm": 0.2213265597820282, "learning_rate": 0.00019199842617725595, "loss": 1.6483, "step": 14500 }, { "epoch": 0.12088494402861495, "grad_norm": 0.3523562252521515, "learning_rate": 0.00019194322756810634, "loss": 1.6394, "step": 14600 }, { "epoch": 0.1217129230973041, "grad_norm": 0.26578399538993835, "learning_rate": 0.00019188802895895674, "loss": 1.7057, "step": 14700 }, { "epoch": 0.12254090216599324, "grad_norm": 0.2317681610584259, "learning_rate": 0.00019183283034980713, "loss": 1.6244, "step": 14800 }, { "epoch": 0.12336888123468238, "grad_norm": 0.2221396416425705, "learning_rate": 0.00019177763174065752, "loss": 1.6442, "step": 14900 }, { "epoch": 0.12419686030337153, "grad_norm": 0.2175658941268921, "learning_rate": 0.00019172243313150792, "loss": 1.6643, "step": 15000 }, { "epoch": 0.1250248393720607, "grad_norm": 0.21838437020778656, "learning_rate": 0.0001916672345223583, "loss": 1.6629, "step": 15100 }, { "epoch": 0.12585281844074983, "grad_norm": 0.19501908123493195, "learning_rate": 0.0001916120359132087, "loss": 1.6466, "step": 15200 }, { "epoch": 0.12668079750943897, "grad_norm": 0.24163535237312317, "learning_rate": 0.0001915568373040591, "loss": 1.6448, "step": 15300 }, { "epoch": 0.1275087765781281, "grad_norm": 0.2042425274848938, "learning_rate": 0.0001915016386949095, "loss": 1.6639, "step": 15400 }, { "epoch": 0.12833675564681724, "grad_norm": 0.22937864065170288, "learning_rate": 0.0001914464400857599, "loss": 1.635, "step": 15500 }, { "epoch": 0.12916473471550638, "grad_norm": 0.2340078055858612, "learning_rate": 0.00019139124147661028, "loss": 1.6335, "step": 15600 }, { "epoch": 0.12999271378419552, "grad_norm": 0.24161306023597717, "learning_rate": 0.00019133604286746068, "loss": 1.634, "step": 15700 }, { "epoch": 0.13082069285288467, "grad_norm": 0.25775766372680664, "learning_rate": 0.00019128084425831107, "loss": 1.6331, "step": 15800 }, { "epoch": 0.1316486719215738, "grad_norm": 0.23609529435634613, "learning_rate": 0.00019122564564916147, "loss": 1.6431, "step": 15900 }, { "epoch": 0.13247665099026296, "grad_norm": 0.24818001687526703, "learning_rate": 0.00019117044704001186, "loss": 1.6355, "step": 16000 }, { "epoch": 0.1333046300589521, "grad_norm": 0.2805030941963196, "learning_rate": 0.00019111580041695375, "loss": 1.6459, "step": 16100 }, { "epoch": 0.13413260912764124, "grad_norm": 0.2721966505050659, "learning_rate": 0.00019106060180780417, "loss": 1.6384, "step": 16200 }, { "epoch": 0.1349605881963304, "grad_norm": 0.23994600772857666, "learning_rate": 0.00019100540319865454, "loss": 1.627, "step": 16300 }, { "epoch": 0.13578856726501953, "grad_norm": 0.20296916365623474, "learning_rate": 0.00019095020458950496, "loss": 1.6528, "step": 16400 }, { "epoch": 0.13661654633370868, "grad_norm": 0.21805709600448608, "learning_rate": 0.00019089500598035535, "loss": 1.64, "step": 16500 }, { "epoch": 0.13744452540239782, "grad_norm": 0.25123125314712524, "learning_rate": 0.00019083980737120572, "loss": 1.6889, "step": 16600 }, { "epoch": 0.13827250447108697, "grad_norm": 0.2638598680496216, "learning_rate": 0.00019078460876205614, "loss": 1.6507, "step": 16700 }, { "epoch": 0.1391004835397761, "grad_norm": 0.21334075927734375, "learning_rate": 0.00019072941015290653, "loss": 1.6428, "step": 16800 }, { "epoch": 0.13992846260846525, "grad_norm": 0.2150932401418686, "learning_rate": 0.0001906742115437569, "loss": 1.6277, "step": 16900 }, { "epoch": 0.1407564416771544, "grad_norm": 0.28504666686058044, "learning_rate": 0.00019061901293460732, "loss": 1.6609, "step": 17000 }, { "epoch": 0.14158442074584354, "grad_norm": 0.22296425700187683, "learning_rate": 0.00019056381432545772, "loss": 1.6359, "step": 17100 }, { "epoch": 0.1424123998145327, "grad_norm": 0.23885318636894226, "learning_rate": 0.00019050861571630808, "loss": 1.6262, "step": 17200 }, { "epoch": 0.14324037888322183, "grad_norm": 0.22231832146644592, "learning_rate": 0.0001904534171071585, "loss": 1.6369, "step": 17300 }, { "epoch": 0.14406835795191097, "grad_norm": 0.25279131531715393, "learning_rate": 0.0001903982184980089, "loss": 1.6376, "step": 17400 }, { "epoch": 0.14489633702060012, "grad_norm": 0.2754824459552765, "learning_rate": 0.00019034301988885927, "loss": 1.6579, "step": 17500 }, { "epoch": 0.14572431608928926, "grad_norm": 0.22775475680828094, "learning_rate": 0.0001902878212797097, "loss": 1.674, "step": 17600 }, { "epoch": 0.1465522951579784, "grad_norm": 0.21519581973552704, "learning_rate": 0.00019023262267056008, "loss": 1.654, "step": 17700 }, { "epoch": 0.14738027422666755, "grad_norm": 0.2484564632177353, "learning_rate": 0.00019017742406141045, "loss": 1.6046, "step": 17800 }, { "epoch": 0.1482082532953567, "grad_norm": 0.21091294288635254, "learning_rate": 0.00019012222545226087, "loss": 1.616, "step": 17900 }, { "epoch": 0.14903623236404584, "grad_norm": 0.27948471903800964, "learning_rate": 0.00019006702684311127, "loss": 1.6866, "step": 18000 }, { "epoch": 0.14986421143273498, "grad_norm": 0.28303253650665283, "learning_rate": 0.00019001182823396163, "loss": 1.6569, "step": 18100 }, { "epoch": 0.15069219050142413, "grad_norm": 0.2028988003730774, "learning_rate": 0.00018995662962481205, "loss": 1.6752, "step": 18200 }, { "epoch": 0.15152016957011327, "grad_norm": 0.237883523106575, "learning_rate": 0.0001899019830017539, "loss": 1.6618, "step": 18300 }, { "epoch": 0.15234814863880242, "grad_norm": 0.21337077021598816, "learning_rate": 0.00018984678439260433, "loss": 1.6397, "step": 18400 }, { "epoch": 0.15317612770749156, "grad_norm": 0.20675146579742432, "learning_rate": 0.00018979158578345473, "loss": 1.6382, "step": 18500 }, { "epoch": 0.1540041067761807, "grad_norm": 0.23405767977237701, "learning_rate": 0.0001897363871743051, "loss": 1.6112, "step": 18600 }, { "epoch": 0.15483208584486985, "grad_norm": 0.22801779210567474, "learning_rate": 0.00018968118856515552, "loss": 1.6332, "step": 18700 }, { "epoch": 0.155660064913559, "grad_norm": 0.28877681493759155, "learning_rate": 0.0001896259899560059, "loss": 1.618, "step": 18800 }, { "epoch": 0.15648804398224814, "grad_norm": 0.2644197940826416, "learning_rate": 0.00018957079134685628, "loss": 1.6617, "step": 18900 }, { "epoch": 0.15731602305093728, "grad_norm": 0.20182272791862488, "learning_rate": 0.0001895155927377067, "loss": 1.6273, "step": 19000 }, { "epoch": 0.15814400211962643, "grad_norm": 0.2702679932117462, "learning_rate": 0.0001894603941285571, "loss": 1.6573, "step": 19100 }, { "epoch": 0.15897198118831557, "grad_norm": 0.24677981436252594, "learning_rate": 0.00018940519551940746, "loss": 1.645, "step": 19200 }, { "epoch": 0.15979996025700471, "grad_norm": 0.22584903240203857, "learning_rate": 0.00018934999691025788, "loss": 1.6496, "step": 19300 }, { "epoch": 0.16062793932569386, "grad_norm": 0.25998055934906006, "learning_rate": 0.00018929479830110828, "loss": 1.6406, "step": 19400 }, { "epoch": 0.161455918394383, "grad_norm": 0.24679061770439148, "learning_rate": 0.00018923959969195867, "loss": 1.664, "step": 19500 }, { "epoch": 0.16228389746307215, "grad_norm": 0.21543079614639282, "learning_rate": 0.00018918440108280907, "loss": 1.6316, "step": 19600 }, { "epoch": 0.1631118765317613, "grad_norm": 0.20581622421741486, "learning_rate": 0.00018912920247365946, "loss": 1.6417, "step": 19700 }, { "epoch": 0.1639398556004504, "grad_norm": 0.24385277926921844, "learning_rate": 0.00018907400386450985, "loss": 1.6462, "step": 19800 }, { "epoch": 0.16476783466913955, "grad_norm": 0.21104101836681366, "learning_rate": 0.00018901880525536025, "loss": 1.6255, "step": 19900 }, { "epoch": 0.1655958137378287, "grad_norm": 0.2160460352897644, "learning_rate": 0.00018896360664621064, "loss": 1.6602, "step": 20000 }, { "epoch": 0.16642379280651784, "grad_norm": 0.27930399775505066, "learning_rate": 0.00018890840803706104, "loss": 1.676, "step": 20100 }, { "epoch": 0.16725177187520698, "grad_norm": 0.21541720628738403, "learning_rate": 0.00018885320942791143, "loss": 1.627, "step": 20200 }, { "epoch": 0.16807975094389613, "grad_norm": 0.21303394436836243, "learning_rate": 0.00018879801081876182, "loss": 1.6332, "step": 20300 }, { "epoch": 0.16890773001258527, "grad_norm": 0.22445669770240784, "learning_rate": 0.0001887433641957037, "loss": 1.6466, "step": 20400 }, { "epoch": 0.16973570908127442, "grad_norm": 0.21410098671913147, "learning_rate": 0.0001886881655865541, "loss": 1.6392, "step": 20500 }, { "epoch": 0.17056368814996356, "grad_norm": 0.19233547151088715, "learning_rate": 0.0001886329669774045, "loss": 1.6396, "step": 20600 }, { "epoch": 0.1713916672186527, "grad_norm": 0.20589858293533325, "learning_rate": 0.0001885777683682549, "loss": 1.6295, "step": 20700 }, { "epoch": 0.17221964628734185, "grad_norm": 0.25907692313194275, "learning_rate": 0.0001885225697591053, "loss": 1.6329, "step": 20800 }, { "epoch": 0.173047625356031, "grad_norm": 0.2765398919582367, "learning_rate": 0.00018846737114995568, "loss": 1.6868, "step": 20900 }, { "epoch": 0.17387560442472014, "grad_norm": 0.23948393762111664, "learning_rate": 0.00018841217254080608, "loss": 1.6664, "step": 21000 }, { "epoch": 0.17470358349340928, "grad_norm": 0.2371809184551239, "learning_rate": 0.00018835697393165647, "loss": 1.6843, "step": 21100 }, { "epoch": 0.17553156256209843, "grad_norm": 0.19910277426242828, "learning_rate": 0.00018830177532250686, "loss": 1.619, "step": 21200 }, { "epoch": 0.17635954163078757, "grad_norm": 0.24008843302726746, "learning_rate": 0.00018824657671335726, "loss": 1.6254, "step": 21300 }, { "epoch": 0.1771875206994767, "grad_norm": 0.22527045011520386, "learning_rate": 0.00018819137810420765, "loss": 1.656, "step": 21400 }, { "epoch": 0.17801549976816586, "grad_norm": 0.21581338346004486, "learning_rate": 0.00018813617949505805, "loss": 1.6297, "step": 21500 }, { "epoch": 0.178843478836855, "grad_norm": 0.23672404885292053, "learning_rate": 0.00018808098088590844, "loss": 1.6493, "step": 21600 }, { "epoch": 0.17967145790554415, "grad_norm": 0.22824163734912872, "learning_rate": 0.00018802578227675884, "loss": 1.6428, "step": 21700 }, { "epoch": 0.1804994369742333, "grad_norm": 0.21679380536079407, "learning_rate": 0.00018797058366760923, "loss": 1.6373, "step": 21800 }, { "epoch": 0.18132741604292243, "grad_norm": 0.23232513666152954, "learning_rate": 0.00018791538505845962, "loss": 1.6408, "step": 21900 }, { "epoch": 0.18215539511161158, "grad_norm": 0.21634866297245026, "learning_rate": 0.00018786018644931002, "loss": 1.6428, "step": 22000 }, { "epoch": 0.18298337418030072, "grad_norm": 0.24199073016643524, "learning_rate": 0.0001878049878401604, "loss": 1.6912, "step": 22100 }, { "epoch": 0.18381135324898987, "grad_norm": 0.24073319137096405, "learning_rate": 0.0001877497892310108, "loss": 1.6697, "step": 22200 }, { "epoch": 0.184639332317679, "grad_norm": 0.20666244626045227, "learning_rate": 0.0001876945906218612, "loss": 1.6372, "step": 22300 }, { "epoch": 0.18546731138636816, "grad_norm": 0.24439455568790436, "learning_rate": 0.0001876393920127116, "loss": 1.6562, "step": 22400 }, { "epoch": 0.1862952904550573, "grad_norm": 0.20771746337413788, "learning_rate": 0.000187584193403562, "loss": 1.6342, "step": 22500 }, { "epoch": 0.18712326952374644, "grad_norm": 0.23821604251861572, "learning_rate": 0.00018752899479441238, "loss": 1.6266, "step": 22600 }, { "epoch": 0.1879512485924356, "grad_norm": 0.20013689994812012, "learning_rate": 0.00018747379618526278, "loss": 1.6394, "step": 22700 }, { "epoch": 0.18877922766112473, "grad_norm": 0.24145907163619995, "learning_rate": 0.00018741859757611317, "loss": 1.639, "step": 22800 }, { "epoch": 0.18960720672981388, "grad_norm": 0.227446511387825, "learning_rate": 0.00018736395095305509, "loss": 1.6245, "step": 22900 }, { "epoch": 0.19043518579850302, "grad_norm": 0.23569269478321075, "learning_rate": 0.00018730875234390545, "loss": 1.6536, "step": 23000 }, { "epoch": 0.19126316486719216, "grad_norm": 0.26718559861183167, "learning_rate": 0.00018725355373475585, "loss": 1.6152, "step": 23100 }, { "epoch": 0.1920911439358813, "grad_norm": 0.2448846399784088, "learning_rate": 0.00018719835512560627, "loss": 1.6462, "step": 23200 }, { "epoch": 0.19291912300457045, "grad_norm": 0.22957266867160797, "learning_rate": 0.00018714315651645664, "loss": 1.6493, "step": 23300 }, { "epoch": 0.1937471020732596, "grad_norm": 0.24881067872047424, "learning_rate": 0.00018708795790730703, "loss": 1.6509, "step": 23400 }, { "epoch": 0.19457508114194874, "grad_norm": 0.26392731070518494, "learning_rate": 0.00018703275929815745, "loss": 1.6436, "step": 23500 }, { "epoch": 0.19540306021063789, "grad_norm": 0.23469410836696625, "learning_rate": 0.00018697756068900782, "loss": 1.6494, "step": 23600 }, { "epoch": 0.19623103927932703, "grad_norm": 0.191993847489357, "learning_rate": 0.00018692236207985824, "loss": 1.6118, "step": 23700 }, { "epoch": 0.19705901834801617, "grad_norm": 0.24076974391937256, "learning_rate": 0.00018686716347070863, "loss": 1.6414, "step": 23800 }, { "epoch": 0.19788699741670532, "grad_norm": 0.22676746547222137, "learning_rate": 0.000186811964861559, "loss": 1.6392, "step": 23900 }, { "epoch": 0.19871497648539446, "grad_norm": 0.2693268060684204, "learning_rate": 0.00018675676625240942, "loss": 1.6254, "step": 24000 }, { "epoch": 0.1995429555540836, "grad_norm": 0.23930659890174866, "learning_rate": 0.00018670156764325982, "loss": 1.6451, "step": 24100 }, { "epoch": 0.20037093462277272, "grad_norm": 0.23256762325763702, "learning_rate": 0.00018664636903411018, "loss": 1.6424, "step": 24200 }, { "epoch": 0.20119891369146187, "grad_norm": 0.2577391564846039, "learning_rate": 0.0001865911704249606, "loss": 1.6212, "step": 24300 }, { "epoch": 0.202026892760151, "grad_norm": 0.26071691513061523, "learning_rate": 0.000186535971815811, "loss": 1.6314, "step": 24400 }, { "epoch": 0.20285487182884016, "grad_norm": 0.22462689876556396, "learning_rate": 0.00018648077320666137, "loss": 1.6237, "step": 24500 }, { "epoch": 0.2036828508975293, "grad_norm": 0.24064601957798004, "learning_rate": 0.0001864255745975118, "loss": 1.6391, "step": 24600 }, { "epoch": 0.20451082996621844, "grad_norm": 0.22928814589977264, "learning_rate": 0.00018637037598836218, "loss": 1.6147, "step": 24700 }, { "epoch": 0.2053388090349076, "grad_norm": 0.22983402013778687, "learning_rate": 0.00018631517737921255, "loss": 1.6427, "step": 24800 }, { "epoch": 0.20616678810359673, "grad_norm": 0.22635537385940552, "learning_rate": 0.00018625997877006297, "loss": 1.6421, "step": 24900 }, { "epoch": 0.20699476717228588, "grad_norm": 0.23368728160858154, "learning_rate": 0.00018620478016091336, "loss": 1.6472, "step": 25000 }, { "epoch": 0.20782274624097502, "grad_norm": 0.24677674472332, "learning_rate": 0.00018614958155176373, "loss": 1.6697, "step": 25100 }, { "epoch": 0.20865072530966416, "grad_norm": 0.2160155028104782, "learning_rate": 0.00018609438294261415, "loss": 1.6327, "step": 25200 }, { "epoch": 0.2094787043783533, "grad_norm": 0.20933164656162262, "learning_rate": 0.00018603918433346455, "loss": 1.6464, "step": 25300 }, { "epoch": 0.21030668344704245, "grad_norm": 0.2508947253227234, "learning_rate": 0.00018598398572431491, "loss": 1.6531, "step": 25400 }, { "epoch": 0.2111346625157316, "grad_norm": 0.264946311712265, "learning_rate": 0.00018592878711516534, "loss": 1.6285, "step": 25500 }, { "epoch": 0.21196264158442074, "grad_norm": 0.23839199542999268, "learning_rate": 0.00018587358850601573, "loss": 1.6601, "step": 25600 }, { "epoch": 0.21279062065310989, "grad_norm": 0.20937936007976532, "learning_rate": 0.0001858183898968661, "loss": 1.6199, "step": 25700 }, { "epoch": 0.21361859972179903, "grad_norm": 0.2978517413139343, "learning_rate": 0.00018576319128771652, "loss": 1.6233, "step": 25800 }, { "epoch": 0.21444657879048817, "grad_norm": 0.22715617716312408, "learning_rate": 0.0001857079926785669, "loss": 1.6616, "step": 25900 }, { "epoch": 0.21527455785917732, "grad_norm": 0.21679271757602692, "learning_rate": 0.00018565279406941728, "loss": 1.6562, "step": 26000 }, { "epoch": 0.21610253692786646, "grad_norm": 0.2540909945964813, "learning_rate": 0.0001855975954602677, "loss": 1.6292, "step": 26100 }, { "epoch": 0.2169305159965556, "grad_norm": 0.23456817865371704, "learning_rate": 0.0001855423968511181, "loss": 1.6117, "step": 26200 }, { "epoch": 0.21775849506524475, "grad_norm": 0.2706565856933594, "learning_rate": 0.00018548719824196846, "loss": 1.6119, "step": 26300 }, { "epoch": 0.2185864741339339, "grad_norm": 0.24881504476070404, "learning_rate": 0.00018543199963281888, "loss": 1.6263, "step": 26400 }, { "epoch": 0.21941445320262304, "grad_norm": 0.3882363736629486, "learning_rate": 0.00018537680102366928, "loss": 1.6537, "step": 26500 }, { "epoch": 0.22024243227131218, "grad_norm": 0.19953063130378723, "learning_rate": 0.00018532160241451965, "loss": 1.6786, "step": 26600 }, { "epoch": 0.22107041134000133, "grad_norm": 0.21415996551513672, "learning_rate": 0.00018526640380537007, "loss": 1.6582, "step": 26700 }, { "epoch": 0.22189839040869047, "grad_norm": 0.22917871177196503, "learning_rate": 0.00018521120519622046, "loss": 1.5913, "step": 26800 }, { "epoch": 0.22272636947737962, "grad_norm": 0.23988696932792664, "learning_rate": 0.00018515600658707083, "loss": 1.6284, "step": 26900 }, { "epoch": 0.22355434854606876, "grad_norm": 0.21336591243743896, "learning_rate": 0.00018510080797792125, "loss": 1.6594, "step": 27000 }, { "epoch": 0.2243823276147579, "grad_norm": 0.22978056967258453, "learning_rate": 0.00018504560936877164, "loss": 1.653, "step": 27100 }, { "epoch": 0.22521030668344705, "grad_norm": 0.27317842841148376, "learning_rate": 0.00018499096274571353, "loss": 1.6267, "step": 27200 }, { "epoch": 0.2260382857521362, "grad_norm": 0.2396378219127655, "learning_rate": 0.00018493576413656392, "loss": 1.6452, "step": 27300 }, { "epoch": 0.22686626482082534, "grad_norm": 0.2800294756889343, "learning_rate": 0.00018488056552741432, "loss": 1.6722, "step": 27400 }, { "epoch": 0.22769424388951448, "grad_norm": 0.24665579199790955, "learning_rate": 0.0001848253669182647, "loss": 1.6473, "step": 27500 }, { "epoch": 0.22852222295820362, "grad_norm": 0.23755255341529846, "learning_rate": 0.0001847701683091151, "loss": 1.6072, "step": 27600 }, { "epoch": 0.22935020202689277, "grad_norm": 0.23274143040180206, "learning_rate": 0.0001847149696999655, "loss": 1.6547, "step": 27700 }, { "epoch": 0.2301781810955819, "grad_norm": 0.20112739503383636, "learning_rate": 0.0001846597710908159, "loss": 1.6307, "step": 27800 }, { "epoch": 0.23100616016427106, "grad_norm": 0.29942500591278076, "learning_rate": 0.0001846045724816663, "loss": 1.6384, "step": 27900 }, { "epoch": 0.2318341392329602, "grad_norm": 0.2471247911453247, "learning_rate": 0.00018454937387251668, "loss": 1.6568, "step": 28000 }, { "epoch": 0.23266211830164935, "grad_norm": 0.2586652934551239, "learning_rate": 0.00018449417526336708, "loss": 1.6167, "step": 28100 }, { "epoch": 0.2334900973703385, "grad_norm": 0.23116792738437653, "learning_rate": 0.00018443897665421747, "loss": 1.6222, "step": 28200 }, { "epoch": 0.23431807643902763, "grad_norm": 0.22467024624347687, "learning_rate": 0.00018438433003115936, "loss": 1.6343, "step": 28300 }, { "epoch": 0.23514605550771678, "grad_norm": 0.28815844655036926, "learning_rate": 0.00018432913142200975, "loss": 1.6388, "step": 28400 }, { "epoch": 0.23597403457640592, "grad_norm": 0.22312502562999725, "learning_rate": 0.00018427393281286017, "loss": 1.639, "step": 28500 }, { "epoch": 0.23680201364509504, "grad_norm": 0.2542140781879425, "learning_rate": 0.00018421873420371054, "loss": 1.6019, "step": 28600 }, { "epoch": 0.23762999271378418, "grad_norm": 0.2540210783481598, "learning_rate": 0.00018416353559456094, "loss": 1.6217, "step": 28700 }, { "epoch": 0.23845797178247333, "grad_norm": 0.2396947294473648, "learning_rate": 0.00018410833698541136, "loss": 1.6186, "step": 28800 }, { "epoch": 0.23928595085116247, "grad_norm": 0.2223241776227951, "learning_rate": 0.00018405313837626172, "loss": 1.6385, "step": 28900 }, { "epoch": 0.24011392991985162, "grad_norm": 0.21971747279167175, "learning_rate": 0.00018399793976711212, "loss": 1.6357, "step": 29000 }, { "epoch": 0.24094190898854076, "grad_norm": 0.24293990433216095, "learning_rate": 0.00018394274115796254, "loss": 1.6677, "step": 29100 }, { "epoch": 0.2417698880572299, "grad_norm": 0.2490108758211136, "learning_rate": 0.0001838875425488129, "loss": 1.6423, "step": 29200 }, { "epoch": 0.24259786712591905, "grad_norm": 0.2689921259880066, "learning_rate": 0.0001838323439396633, "loss": 1.6273, "step": 29300 }, { "epoch": 0.2434258461946082, "grad_norm": 0.21055381000041962, "learning_rate": 0.0001837776973166052, "loss": 1.5876, "step": 29400 }, { "epoch": 0.24425382526329734, "grad_norm": 0.24339735507965088, "learning_rate": 0.00018372249870745558, "loss": 1.5942, "step": 29500 }, { "epoch": 0.24508180433198648, "grad_norm": 0.22557999193668365, "learning_rate": 0.000183667300098306, "loss": 1.6407, "step": 29600 }, { "epoch": 0.24590978340067562, "grad_norm": 0.27617794275283813, "learning_rate": 0.00018361210148915637, "loss": 1.626, "step": 29700 }, { "epoch": 0.24673776246936477, "grad_norm": 0.2763831615447998, "learning_rate": 0.00018355690288000676, "loss": 1.6035, "step": 29800 }, { "epoch": 0.2475657415380539, "grad_norm": 0.23689210414886475, "learning_rate": 0.00018350170427085719, "loss": 1.6772, "step": 29900 }, { "epoch": 0.24839372060674306, "grad_norm": 0.22767847776412964, "learning_rate": 0.00018344650566170755, "loss": 1.6315, "step": 30000 }, { "epoch": 0.2492216996754322, "grad_norm": 0.23645687103271484, "learning_rate": 0.00018339130705255795, "loss": 1.6424, "step": 30100 }, { "epoch": 0.2500496787441214, "grad_norm": 0.29076236486434937, "learning_rate": 0.00018333610844340837, "loss": 1.6175, "step": 30200 }, { "epoch": 0.2508776578128105, "grad_norm": 0.22849859297275543, "learning_rate": 0.00018328090983425874, "loss": 1.6264, "step": 30300 }, { "epoch": 0.25170563688149966, "grad_norm": 0.19252796471118927, "learning_rate": 0.00018322571122510913, "loss": 1.6386, "step": 30400 }, { "epoch": 0.2525336159501888, "grad_norm": 0.22710925340652466, "learning_rate": 0.00018317051261595955, "loss": 1.6098, "step": 30500 }, { "epoch": 0.25336159501887795, "grad_norm": 0.22989875078201294, "learning_rate": 0.00018311531400680992, "loss": 1.6339, "step": 30600 }, { "epoch": 0.2541895740875671, "grad_norm": 0.25290095806121826, "learning_rate": 0.0001830601153976603, "loss": 1.634, "step": 30700 }, { "epoch": 0.2550175531562562, "grad_norm": 0.24599549174308777, "learning_rate": 0.00018300491678851073, "loss": 1.6577, "step": 30800 }, { "epoch": 0.2558455322249453, "grad_norm": 0.37027794122695923, "learning_rate": 0.0001829497181793611, "loss": 1.644, "step": 30900 }, { "epoch": 0.25667351129363447, "grad_norm": 0.2629932463169098, "learning_rate": 0.00018289451957021152, "loss": 1.622, "step": 31000 }, { "epoch": 0.2575014903623236, "grad_norm": 0.33217158913612366, "learning_rate": 0.00018283932096106192, "loss": 1.6334, "step": 31100 }, { "epoch": 0.25832946943101276, "grad_norm": 0.23134025931358337, "learning_rate": 0.00018278412235191228, "loss": 1.6411, "step": 31200 }, { "epoch": 0.2591574484997019, "grad_norm": 0.2514587342739105, "learning_rate": 0.0001827289237427627, "loss": 1.6125, "step": 31300 }, { "epoch": 0.25998542756839105, "grad_norm": 0.2340904325246811, "learning_rate": 0.0001826737251336131, "loss": 1.6567, "step": 31400 }, { "epoch": 0.2608134066370802, "grad_norm": 0.27328845858573914, "learning_rate": 0.00018261852652446347, "loss": 1.6479, "step": 31500 }, { "epoch": 0.26164138570576934, "grad_norm": 0.2700946033000946, "learning_rate": 0.0001825633279153139, "loss": 1.6262, "step": 31600 }, { "epoch": 0.2624693647744585, "grad_norm": 0.25395748019218445, "learning_rate": 0.00018250812930616428, "loss": 1.6166, "step": 31700 }, { "epoch": 0.2632973438431476, "grad_norm": 0.23837615549564362, "learning_rate": 0.00018245293069701465, "loss": 1.6378, "step": 31800 }, { "epoch": 0.26412532291183677, "grad_norm": 0.2575034201145172, "learning_rate": 0.00018239773208786507, "loss": 1.6448, "step": 31900 }, { "epoch": 0.2649533019805259, "grad_norm": 0.25627613067626953, "learning_rate": 0.00018234253347871546, "loss": 1.6674, "step": 32000 }, { "epoch": 0.26578128104921506, "grad_norm": 0.24171142280101776, "learning_rate": 0.00018228733486956583, "loss": 1.6584, "step": 32100 }, { "epoch": 0.2666092601179042, "grad_norm": 0.2696898877620697, "learning_rate": 0.00018223213626041625, "loss": 1.6346, "step": 32200 }, { "epoch": 0.26743723918659335, "grad_norm": 0.2296200692653656, "learning_rate": 0.00018217693765126665, "loss": 1.6376, "step": 32300 }, { "epoch": 0.2682652182552825, "grad_norm": 0.22668029367923737, "learning_rate": 0.00018212173904211701, "loss": 1.6568, "step": 32400 }, { "epoch": 0.26909319732397163, "grad_norm": 0.23633837699890137, "learning_rate": 0.00018206654043296744, "loss": 1.6355, "step": 32500 }, { "epoch": 0.2699211763926608, "grad_norm": 0.3272385597229004, "learning_rate": 0.00018201134182381783, "loss": 1.6333, "step": 32600 }, { "epoch": 0.2707491554613499, "grad_norm": 0.22284284234046936, "learning_rate": 0.0001819561432146682, "loss": 1.6482, "step": 32700 }, { "epoch": 0.27157713453003907, "grad_norm": 0.22121630609035492, "learning_rate": 0.00018190094460551862, "loss": 1.6496, "step": 32800 }, { "epoch": 0.2724051135987282, "grad_norm": 0.2997874319553375, "learning_rate": 0.000181845745996369, "loss": 1.6416, "step": 32900 }, { "epoch": 0.27323309266741735, "grad_norm": 0.26545023918151855, "learning_rate": 0.00018179054738721938, "loss": 1.6452, "step": 33000 }, { "epoch": 0.2740610717361065, "grad_norm": 0.2760174870491028, "learning_rate": 0.0001817353487780698, "loss": 1.6617, "step": 33100 }, { "epoch": 0.27488905080479564, "grad_norm": 0.2267056554555893, "learning_rate": 0.0001816801501689202, "loss": 1.6201, "step": 33200 }, { "epoch": 0.2757170298734848, "grad_norm": 0.2418147623538971, "learning_rate": 0.00018162495155977056, "loss": 1.6369, "step": 33300 }, { "epoch": 0.27654500894217393, "grad_norm": 0.2712232172489166, "learning_rate": 0.00018156975295062098, "loss": 1.61, "step": 33400 }, { "epoch": 0.2773729880108631, "grad_norm": 0.21989206969738007, "learning_rate": 0.00018151455434147138, "loss": 1.6385, "step": 33500 }, { "epoch": 0.2782009670795522, "grad_norm": 0.23285841941833496, "learning_rate": 0.00018145935573232174, "loss": 1.6234, "step": 33600 }, { "epoch": 0.27902894614824136, "grad_norm": 0.2723333537578583, "learning_rate": 0.00018140415712317217, "loss": 1.6265, "step": 33700 }, { "epoch": 0.2798569252169305, "grad_norm": 0.29798418283462524, "learning_rate": 0.00018134895851402256, "loss": 1.6278, "step": 33800 }, { "epoch": 0.28068490428561965, "grad_norm": 0.22149847447872162, "learning_rate": 0.00018129375990487293, "loss": 1.6347, "step": 33900 }, { "epoch": 0.2815128833543088, "grad_norm": 0.2652026116847992, "learning_rate": 0.00018123856129572335, "loss": 1.6388, "step": 34000 }, { "epoch": 0.28234086242299794, "grad_norm": 0.21799111366271973, "learning_rate": 0.00018118336268657374, "loss": 1.6031, "step": 34100 }, { "epoch": 0.2831688414916871, "grad_norm": 0.23749162256717682, "learning_rate": 0.0001811281640774241, "loss": 1.6268, "step": 34200 }, { "epoch": 0.28399682056037623, "grad_norm": 0.20693838596343994, "learning_rate": 0.00018107351745436602, "loss": 1.6406, "step": 34300 }, { "epoch": 0.2848247996290654, "grad_norm": 0.27509090304374695, "learning_rate": 0.00018101831884521642, "loss": 1.6291, "step": 34400 }, { "epoch": 0.2856527786977545, "grad_norm": 0.24260659515857697, "learning_rate": 0.0001809631202360668, "loss": 1.6418, "step": 34500 }, { "epoch": 0.28648075776644366, "grad_norm": 0.22240006923675537, "learning_rate": 0.0001809079216269172, "loss": 1.589, "step": 34600 }, { "epoch": 0.2873087368351328, "grad_norm": 0.21402128040790558, "learning_rate": 0.0001808527230177676, "loss": 1.6004, "step": 34700 }, { "epoch": 0.28813671590382195, "grad_norm": 0.23821817338466644, "learning_rate": 0.000180797524408618, "loss": 1.6203, "step": 34800 }, { "epoch": 0.2889646949725111, "grad_norm": 0.22152476012706757, "learning_rate": 0.0001807423257994684, "loss": 1.6351, "step": 34900 }, { "epoch": 0.28979267404120024, "grad_norm": 0.269619882106781, "learning_rate": 0.00018068712719031878, "loss": 1.6269, "step": 35000 }, { "epoch": 0.2906206531098894, "grad_norm": 0.2366069257259369, "learning_rate": 0.00018063192858116918, "loss": 1.6363, "step": 35100 }, { "epoch": 0.2914486321785785, "grad_norm": 0.2468317300081253, "learning_rate": 0.00018057672997201957, "loss": 1.6265, "step": 35200 }, { "epoch": 0.29227661124726767, "grad_norm": 0.2998073697090149, "learning_rate": 0.00018052153136286997, "loss": 1.6213, "step": 35300 }, { "epoch": 0.2931045903159568, "grad_norm": 0.2570279836654663, "learning_rate": 0.00018046633275372036, "loss": 1.6229, "step": 35400 }, { "epoch": 0.29393256938464596, "grad_norm": 0.29977649450302124, "learning_rate": 0.00018041113414457075, "loss": 1.5778, "step": 35500 }, { "epoch": 0.2947605484533351, "grad_norm": 0.23830650746822357, "learning_rate": 0.00018035593553542115, "loss": 1.643, "step": 35600 }, { "epoch": 0.29558852752202425, "grad_norm": 0.2635740339756012, "learning_rate": 0.00018030073692627154, "loss": 1.6369, "step": 35700 }, { "epoch": 0.2964165065907134, "grad_norm": 0.24356907606124878, "learning_rate": 0.00018024553831712194, "loss": 1.6231, "step": 35800 }, { "epoch": 0.29724448565940254, "grad_norm": 0.19405247271060944, "learning_rate": 0.00018019033970797233, "loss": 1.6504, "step": 35900 }, { "epoch": 0.2980724647280917, "grad_norm": 0.22447918355464935, "learning_rate": 0.00018013514109882273, "loss": 1.6385, "step": 36000 }, { "epoch": 0.2989004437967808, "grad_norm": 0.2469175159931183, "learning_rate": 0.00018007994248967312, "loss": 1.6285, "step": 36100 }, { "epoch": 0.29972842286546997, "grad_norm": 0.2642788290977478, "learning_rate": 0.00018002474388052351, "loss": 1.6119, "step": 36200 }, { "epoch": 0.3005564019341591, "grad_norm": 0.2765555679798126, "learning_rate": 0.0001799695452713739, "loss": 1.6178, "step": 36300 }, { "epoch": 0.30138438100284826, "grad_norm": 0.3768673837184906, "learning_rate": 0.0001799143466622243, "loss": 1.6424, "step": 36400 }, { "epoch": 0.3022123600715374, "grad_norm": 0.26609617471694946, "learning_rate": 0.0001798591480530747, "loss": 1.6599, "step": 36500 }, { "epoch": 0.30304033914022654, "grad_norm": 0.24030715227127075, "learning_rate": 0.0001798039494439251, "loss": 1.64, "step": 36600 }, { "epoch": 0.3038683182089157, "grad_norm": 0.24733193218708038, "learning_rate": 0.00017974875083477549, "loss": 1.6097, "step": 36700 }, { "epoch": 0.30469629727760483, "grad_norm": 0.24855640530586243, "learning_rate": 0.00017969355222562588, "loss": 1.6355, "step": 36800 }, { "epoch": 0.305524276346294, "grad_norm": 0.2558852434158325, "learning_rate": 0.00017963835361647627, "loss": 1.6395, "step": 36900 }, { "epoch": 0.3063522554149831, "grad_norm": 0.252812922000885, "learning_rate": 0.00017958315500732667, "loss": 1.6019, "step": 37000 }, { "epoch": 0.30718023448367227, "grad_norm": 0.2745685279369354, "learning_rate": 0.0001795279563981771, "loss": 1.6438, "step": 37100 }, { "epoch": 0.3080082135523614, "grad_norm": 0.24301989376544952, "learning_rate": 0.00017947330977511895, "loss": 1.5943, "step": 37200 }, { "epoch": 0.30883619262105055, "grad_norm": 0.25345346331596375, "learning_rate": 0.00017941811116596937, "loss": 1.6255, "step": 37300 }, { "epoch": 0.3096641716897397, "grad_norm": 0.2757332921028137, "learning_rate": 0.00017936291255681974, "loss": 1.622, "step": 37400 }, { "epoch": 0.31049215075842884, "grad_norm": 0.2548620402812958, "learning_rate": 0.00017930771394767013, "loss": 1.6315, "step": 37500 }, { "epoch": 0.311320129827118, "grad_norm": 0.2474357783794403, "learning_rate": 0.00017925251533852055, "loss": 1.6391, "step": 37600 }, { "epoch": 0.31214810889580713, "grad_norm": 0.23792117834091187, "learning_rate": 0.00017919731672937092, "loss": 1.6349, "step": 37700 }, { "epoch": 0.3129760879644963, "grad_norm": 0.22419889271259308, "learning_rate": 0.00017914211812022131, "loss": 1.6439, "step": 37800 }, { "epoch": 0.3138040670331854, "grad_norm": 0.2642965316772461, "learning_rate": 0.00017908691951107174, "loss": 1.6347, "step": 37900 }, { "epoch": 0.31463204610187456, "grad_norm": 0.2979467511177063, "learning_rate": 0.0001790317209019221, "loss": 1.6478, "step": 38000 }, { "epoch": 0.3154600251705637, "grad_norm": 0.33403944969177246, "learning_rate": 0.0001789765222927725, "loss": 1.5852, "step": 38100 }, { "epoch": 0.31628800423925285, "grad_norm": 0.2425394505262375, "learning_rate": 0.00017892132368362292, "loss": 1.596, "step": 38200 }, { "epoch": 0.317115983307942, "grad_norm": 0.24871741235256195, "learning_rate": 0.00017886612507447328, "loss": 1.6358, "step": 38300 }, { "epoch": 0.31794396237663114, "grad_norm": 0.2535829246044159, "learning_rate": 0.00017881092646532368, "loss": 1.6175, "step": 38400 }, { "epoch": 0.3187719414453203, "grad_norm": 0.3197729289531708, "learning_rate": 0.0001787557278561741, "loss": 1.6211, "step": 38500 }, { "epoch": 0.31959992051400943, "grad_norm": 0.23783159255981445, "learning_rate": 0.00017870052924702447, "loss": 1.6677, "step": 38600 }, { "epoch": 0.32042789958269857, "grad_norm": 0.2013082504272461, "learning_rate": 0.00017864533063787486, "loss": 1.5916, "step": 38700 }, { "epoch": 0.3212558786513877, "grad_norm": 0.2517942190170288, "learning_rate": 0.00017859013202872528, "loss": 1.61, "step": 38800 }, { "epoch": 0.32208385772007686, "grad_norm": 0.22963842749595642, "learning_rate": 0.00017853493341957565, "loss": 1.5888, "step": 38900 }, { "epoch": 0.322911836788766, "grad_norm": 0.27445054054260254, "learning_rate": 0.00017847973481042604, "loss": 1.6021, "step": 39000 }, { "epoch": 0.32373981585745515, "grad_norm": 0.21975164115428925, "learning_rate": 0.00017842453620127647, "loss": 1.6261, "step": 39100 }, { "epoch": 0.3245677949261443, "grad_norm": 0.27244439721107483, "learning_rate": 0.00017836988957821835, "loss": 1.6478, "step": 39200 }, { "epoch": 0.32539577399483344, "grad_norm": 0.2596853971481323, "learning_rate": 0.00017831469096906875, "loss": 1.5833, "step": 39300 }, { "epoch": 0.3262237530635226, "grad_norm": 0.22705727815628052, "learning_rate": 0.0001782594923599191, "loss": 1.64, "step": 39400 }, { "epoch": 0.32705173213221167, "grad_norm": 0.24524278938770294, "learning_rate": 0.00017820429375076953, "loss": 1.6177, "step": 39500 }, { "epoch": 0.3278797112009008, "grad_norm": 0.2465396374464035, "learning_rate": 0.00017814909514161993, "loss": 1.6241, "step": 39600 }, { "epoch": 0.32870769026958996, "grad_norm": 0.24681350588798523, "learning_rate": 0.0001780938965324703, "loss": 1.6366, "step": 39700 }, { "epoch": 0.3295356693382791, "grad_norm": 0.2725008726119995, "learning_rate": 0.00017803869792332072, "loss": 1.6168, "step": 39800 }, { "epoch": 0.33036364840696825, "grad_norm": 0.24282367527484894, "learning_rate": 0.0001779834993141711, "loss": 1.6427, "step": 39900 }, { "epoch": 0.3311916274756574, "grad_norm": 0.22297324240207672, "learning_rate": 0.00017792830070502148, "loss": 1.6094, "step": 40000 }, { "epoch": 0.33201960654434653, "grad_norm": 0.23385342955589294, "learning_rate": 0.0001778736540819634, "loss": 1.6268, "step": 40100 }, { "epoch": 0.3328475856130357, "grad_norm": 0.24977736175060272, "learning_rate": 0.00017781845547281376, "loss": 1.6237, "step": 40200 }, { "epoch": 0.3336755646817248, "grad_norm": 0.24268653988838196, "learning_rate": 0.00017776325686366418, "loss": 1.6471, "step": 40300 }, { "epoch": 0.33450354375041397, "grad_norm": 0.1996404081583023, "learning_rate": 0.00017770805825451457, "loss": 1.6132, "step": 40400 }, { "epoch": 0.3353315228191031, "grad_norm": 0.26735562086105347, "learning_rate": 0.00017765285964536494, "loss": 1.6451, "step": 40500 }, { "epoch": 0.33615950188779226, "grad_norm": 0.24058261513710022, "learning_rate": 0.00017759766103621536, "loss": 1.6199, "step": 40600 }, { "epoch": 0.3369874809564814, "grad_norm": 0.23054581880569458, "learning_rate": 0.00017754246242706576, "loss": 1.6227, "step": 40700 }, { "epoch": 0.33781546002517054, "grad_norm": 0.245001882314682, "learning_rate": 0.00017748726381791615, "loss": 1.6422, "step": 40800 }, { "epoch": 0.3386434390938597, "grad_norm": 0.40328505635261536, "learning_rate": 0.00017743206520876655, "loss": 1.6102, "step": 40900 }, { "epoch": 0.33947141816254883, "grad_norm": 0.21955619752407074, "learning_rate": 0.00017737686659961694, "loss": 1.6439, "step": 41000 }, { "epoch": 0.340299397231238, "grad_norm": 0.2642820477485657, "learning_rate": 0.00017732166799046733, "loss": 1.6576, "step": 41100 }, { "epoch": 0.3411273762999271, "grad_norm": 0.22586101293563843, "learning_rate": 0.00017726646938131773, "loss": 1.6331, "step": 41200 }, { "epoch": 0.34195535536861627, "grad_norm": 0.22351595759391785, "learning_rate": 0.00017721127077216812, "loss": 1.6218, "step": 41300 }, { "epoch": 0.3427833344373054, "grad_norm": 0.5699514150619507, "learning_rate": 0.00017715607216301852, "loss": 1.6398, "step": 41400 }, { "epoch": 0.34361131350599455, "grad_norm": 0.3038058280944824, "learning_rate": 0.0001771008735538689, "loss": 1.6364, "step": 41500 }, { "epoch": 0.3444392925746837, "grad_norm": 0.25284209847450256, "learning_rate": 0.0001770456749447193, "loss": 1.6345, "step": 41600 }, { "epoch": 0.34526727164337284, "grad_norm": 0.5244471430778503, "learning_rate": 0.0001769904763355697, "loss": 1.6348, "step": 41700 }, { "epoch": 0.346095250712062, "grad_norm": 0.240703284740448, "learning_rate": 0.0001769352777264201, "loss": 1.6312, "step": 41800 }, { "epoch": 0.34692322978075113, "grad_norm": 0.23771056532859802, "learning_rate": 0.0001768800791172705, "loss": 1.6526, "step": 41900 }, { "epoch": 0.3477512088494403, "grad_norm": 0.25365206599235535, "learning_rate": 0.00017682488050812088, "loss": 1.6376, "step": 42000 }, { "epoch": 0.3485791879181294, "grad_norm": 0.27795979380607605, "learning_rate": 0.00017676968189897128, "loss": 1.6621, "step": 42100 }, { "epoch": 0.34940716698681856, "grad_norm": 0.24921217560768127, "learning_rate": 0.00017671448328982167, "loss": 1.622, "step": 42200 }, { "epoch": 0.3502351460555077, "grad_norm": 0.24285615980625153, "learning_rate": 0.00017665928468067207, "loss": 1.6524, "step": 42300 }, { "epoch": 0.35106312512419685, "grad_norm": 0.23597076535224915, "learning_rate": 0.00017660408607152246, "loss": 1.6257, "step": 42400 }, { "epoch": 0.351891104192886, "grad_norm": 0.2752334177494049, "learning_rate": 0.00017654888746237285, "loss": 1.6223, "step": 42500 }, { "epoch": 0.35271908326157514, "grad_norm": 0.24339468777179718, "learning_rate": 0.00017649424083931474, "loss": 1.6307, "step": 42600 }, { "epoch": 0.3535470623302643, "grad_norm": 0.22084777057170868, "learning_rate": 0.00017643904223016513, "loss": 1.6245, "step": 42700 }, { "epoch": 0.3543750413989534, "grad_norm": 0.2476748824119568, "learning_rate": 0.00017638384362101556, "loss": 1.6262, "step": 42800 }, { "epoch": 0.35520302046764257, "grad_norm": 0.2630603611469269, "learning_rate": 0.00017632864501186592, "loss": 1.6396, "step": 42900 }, { "epoch": 0.3560309995363317, "grad_norm": 0.27675458788871765, "learning_rate": 0.00017627344640271632, "loss": 1.6497, "step": 43000 }, { "epoch": 0.35685897860502086, "grad_norm": 0.23261144757270813, "learning_rate": 0.00017621824779356674, "loss": 1.6097, "step": 43100 }, { "epoch": 0.35768695767371, "grad_norm": 0.24998947978019714, "learning_rate": 0.0001761630491844171, "loss": 1.6306, "step": 43200 }, { "epoch": 0.35851493674239915, "grad_norm": 0.2858506441116333, "learning_rate": 0.0001761078505752675, "loss": 1.6167, "step": 43300 }, { "epoch": 0.3593429158110883, "grad_norm": 0.24741750955581665, "learning_rate": 0.00017605265196611792, "loss": 1.6434, "step": 43400 }, { "epoch": 0.36017089487977744, "grad_norm": 0.2413889467716217, "learning_rate": 0.0001759974533569683, "loss": 1.6175, "step": 43500 }, { "epoch": 0.3609988739484666, "grad_norm": 0.23276059329509735, "learning_rate": 0.00017594225474781868, "loss": 1.6506, "step": 43600 }, { "epoch": 0.3618268530171557, "grad_norm": 0.7301183938980103, "learning_rate": 0.0001758870561386691, "loss": 1.6602, "step": 43700 }, { "epoch": 0.36265483208584487, "grad_norm": 0.26085367798805237, "learning_rate": 0.00017583185752951947, "loss": 1.6361, "step": 43800 }, { "epoch": 0.363482811154534, "grad_norm": 0.2507553994655609, "learning_rate": 0.00017577665892036987, "loss": 1.6207, "step": 43900 }, { "epoch": 0.36431079022322316, "grad_norm": 0.2663898766040802, "learning_rate": 0.00017572146031122029, "loss": 1.6443, "step": 44000 }, { "epoch": 0.3651387692919123, "grad_norm": 0.34235909581184387, "learning_rate": 0.00017566626170207065, "loss": 1.668, "step": 44100 }, { "epoch": 0.36596674836060145, "grad_norm": 0.26196587085723877, "learning_rate": 0.00017561106309292105, "loss": 1.6544, "step": 44200 }, { "epoch": 0.3667947274292906, "grad_norm": 0.23575757443904877, "learning_rate": 0.00017555586448377147, "loss": 1.6252, "step": 44300 }, { "epoch": 0.36762270649797973, "grad_norm": 0.2926133871078491, "learning_rate": 0.00017550066587462184, "loss": 1.6282, "step": 44400 }, { "epoch": 0.3684506855666689, "grad_norm": 0.21387304365634918, "learning_rate": 0.00017544546726547223, "loss": 1.6542, "step": 44500 }, { "epoch": 0.369278664635358, "grad_norm": 0.24483337998390198, "learning_rate": 0.00017539026865632265, "loss": 1.626, "step": 44600 }, { "epoch": 0.37010664370404717, "grad_norm": 0.3214597702026367, "learning_rate": 0.00017533507004717302, "loss": 1.5933, "step": 44700 }, { "epoch": 0.3709346227727363, "grad_norm": 0.23716263473033905, "learning_rate": 0.0001752798714380234, "loss": 1.6189, "step": 44800 }, { "epoch": 0.37176260184142546, "grad_norm": 0.22332042455673218, "learning_rate": 0.00017522467282887383, "loss": 1.6488, "step": 44900 }, { "epoch": 0.3725905809101146, "grad_norm": 0.2706310749053955, "learning_rate": 0.0001751694742197242, "loss": 1.6269, "step": 45000 }, { "epoch": 0.37341855997880374, "grad_norm": 0.2572775185108185, "learning_rate": 0.0001751142756105746, "loss": 1.6432, "step": 45100 }, { "epoch": 0.3742465390474929, "grad_norm": 0.23288114368915558, "learning_rate": 0.00017505907700142502, "loss": 1.6531, "step": 45200 }, { "epoch": 0.37507451811618203, "grad_norm": 0.2673160433769226, "learning_rate": 0.00017500387839227538, "loss": 1.6047, "step": 45300 }, { "epoch": 0.3759024971848712, "grad_norm": 0.29170718789100647, "learning_rate": 0.00017494867978312578, "loss": 1.6119, "step": 45400 }, { "epoch": 0.3767304762535603, "grad_norm": 0.2626097500324249, "learning_rate": 0.0001748934811739762, "loss": 1.6329, "step": 45500 }, { "epoch": 0.37755845532224946, "grad_norm": 0.22249017655849457, "learning_rate": 0.00017483828256482657, "loss": 1.6284, "step": 45600 }, { "epoch": 0.3783864343909386, "grad_norm": 0.36833006143569946, "learning_rate": 0.00017478308395567696, "loss": 1.6079, "step": 45700 }, { "epoch": 0.37921441345962775, "grad_norm": 0.23582060635089874, "learning_rate": 0.00017472788534652738, "loss": 1.6243, "step": 45800 }, { "epoch": 0.3800423925283169, "grad_norm": 0.20202980935573578, "learning_rate": 0.00017467268673737775, "loss": 1.6312, "step": 45900 }, { "epoch": 0.38087037159700604, "grad_norm": 0.3692006468772888, "learning_rate": 0.00017461748812822814, "loss": 1.6048, "step": 46000 }, { "epoch": 0.3816983506656952, "grad_norm": 0.27726104855537415, "learning_rate": 0.00017456228951907857, "loss": 1.6396, "step": 46100 }, { "epoch": 0.38252632973438433, "grad_norm": 0.2585281431674957, "learning_rate": 0.00017450709090992893, "loss": 1.6185, "step": 46200 }, { "epoch": 0.3833543088030735, "grad_norm": 0.24427704513072968, "learning_rate": 0.00017445189230077935, "loss": 1.5996, "step": 46300 }, { "epoch": 0.3841822878717626, "grad_norm": 0.2784598469734192, "learning_rate": 0.00017439669369162975, "loss": 1.6012, "step": 46400 }, { "epoch": 0.38501026694045176, "grad_norm": 0.25094759464263916, "learning_rate": 0.00017434149508248012, "loss": 1.6043, "step": 46500 }, { "epoch": 0.3858382460091409, "grad_norm": 0.2598397731781006, "learning_rate": 0.00017428629647333054, "loss": 1.6278, "step": 46600 }, { "epoch": 0.38666622507783005, "grad_norm": 0.22646105289459229, "learning_rate": 0.00017423109786418093, "loss": 1.6297, "step": 46700 }, { "epoch": 0.3874942041465192, "grad_norm": 0.29078444838523865, "learning_rate": 0.0001741758992550313, "loss": 1.6509, "step": 46800 }, { "epoch": 0.38832218321520834, "grad_norm": 0.2686766982078552, "learning_rate": 0.00017412070064588172, "loss": 1.6145, "step": 46900 }, { "epoch": 0.3891501622838975, "grad_norm": 0.2642682194709778, "learning_rate": 0.0001740655020367321, "loss": 1.6421, "step": 47000 }, { "epoch": 0.3899781413525866, "grad_norm": 0.2528562545776367, "learning_rate": 0.000174010855413674, "loss": 1.6607, "step": 47100 }, { "epoch": 0.39080612042127577, "grad_norm": 0.2299865484237671, "learning_rate": 0.00017395620879061589, "loss": 1.6112, "step": 47200 }, { "epoch": 0.3916340994899649, "grad_norm": 0.24297955632209778, "learning_rate": 0.00017390101018146628, "loss": 1.6081, "step": 47300 }, { "epoch": 0.39246207855865406, "grad_norm": 0.243024542927742, "learning_rate": 0.00017384581157231667, "loss": 1.6703, "step": 47400 }, { "epoch": 0.3932900576273432, "grad_norm": 0.24055872857570648, "learning_rate": 0.00017379061296316707, "loss": 1.5895, "step": 47500 }, { "epoch": 0.39411803669603235, "grad_norm": 0.2664213478565216, "learning_rate": 0.00017373541435401746, "loss": 1.6173, "step": 47600 }, { "epoch": 0.3949460157647215, "grad_norm": 0.23474732041358948, "learning_rate": 0.00017368021574486786, "loss": 1.6347, "step": 47700 }, { "epoch": 0.39577399483341064, "grad_norm": 0.24745678901672363, "learning_rate": 0.00017362501713571825, "loss": 1.6112, "step": 47800 }, { "epoch": 0.3966019739020998, "grad_norm": 0.2576342523097992, "learning_rate": 0.00017356981852656865, "loss": 1.6181, "step": 47900 }, { "epoch": 0.3974299529707889, "grad_norm": 0.27308520674705505, "learning_rate": 0.00017351461991741904, "loss": 1.6037, "step": 48000 }, { "epoch": 0.39825793203947807, "grad_norm": 0.2533150315284729, "learning_rate": 0.00017345942130826943, "loss": 1.6561, "step": 48100 }, { "epoch": 0.3990859111081672, "grad_norm": 0.24679023027420044, "learning_rate": 0.00017340422269911983, "loss": 1.6129, "step": 48200 }, { "epoch": 0.3999138901768563, "grad_norm": 0.28216132521629333, "learning_rate": 0.00017334902408997022, "loss": 1.6374, "step": 48300 }, { "epoch": 0.40074186924554545, "grad_norm": 0.26568201184272766, "learning_rate": 0.00017329382548082062, "loss": 1.6395, "step": 48400 }, { "epoch": 0.4015698483142346, "grad_norm": 0.2306041121482849, "learning_rate": 0.000173238626871671, "loss": 1.6055, "step": 48500 }, { "epoch": 0.40239782738292373, "grad_norm": 0.38825201988220215, "learning_rate": 0.0001731834282625214, "loss": 1.6479, "step": 48600 }, { "epoch": 0.4032258064516129, "grad_norm": 0.2115429788827896, "learning_rate": 0.0001731282296533718, "loss": 1.616, "step": 48700 }, { "epoch": 0.404053785520302, "grad_norm": 0.2399633675813675, "learning_rate": 0.0001730730310442222, "loss": 1.644, "step": 48800 }, { "epoch": 0.40488176458899117, "grad_norm": 0.2738541066646576, "learning_rate": 0.0001730178324350726, "loss": 1.6346, "step": 48900 }, { "epoch": 0.4057097436576803, "grad_norm": 0.24475158751010895, "learning_rate": 0.00017296263382592298, "loss": 1.6143, "step": 49000 }, { "epoch": 0.40653772272636945, "grad_norm": 0.2846825122833252, "learning_rate": 0.00017290743521677338, "loss": 1.627, "step": 49100 }, { "epoch": 0.4073657017950586, "grad_norm": 0.24657665193080902, "learning_rate": 0.00017285223660762377, "loss": 1.6223, "step": 49200 }, { "epoch": 0.40819368086374774, "grad_norm": 0.24977359175682068, "learning_rate": 0.00017279703799847416, "loss": 1.6454, "step": 49300 }, { "epoch": 0.4090216599324369, "grad_norm": 0.24544388055801392, "learning_rate": 0.00017274183938932456, "loss": 1.6557, "step": 49400 }, { "epoch": 0.40984963900112603, "grad_norm": 0.23884369432926178, "learning_rate": 0.00017268664078017495, "loss": 1.6128, "step": 49500 }, { "epoch": 0.4106776180698152, "grad_norm": 0.36807382106781006, "learning_rate": 0.00017263144217102535, "loss": 1.6237, "step": 49600 }, { "epoch": 0.4115055971385043, "grad_norm": 0.291007936000824, "learning_rate": 0.00017257624356187574, "loss": 1.6439, "step": 49700 }, { "epoch": 0.41233357620719346, "grad_norm": 0.22296257317066193, "learning_rate": 0.00017252104495272614, "loss": 1.636, "step": 49800 }, { "epoch": 0.4131615552758826, "grad_norm": 0.281239777803421, "learning_rate": 0.00017246584634357653, "loss": 1.6008, "step": 49900 }, { "epoch": 0.41398953434457175, "grad_norm": 0.30486106872558594, "learning_rate": 0.00017241064773442692, "loss": 1.5891, "step": 50000 }, { "epoch": 0.4148175134132609, "grad_norm": 0.24123990535736084, "learning_rate": 0.00017235544912527732, "loss": 1.6299, "step": 50100 }, { "epoch": 0.41564549248195004, "grad_norm": 0.3727368116378784, "learning_rate": 0.0001723002505161277, "loss": 1.6166, "step": 50200 }, { "epoch": 0.4164734715506392, "grad_norm": 0.24565471708774567, "learning_rate": 0.0001722450519069781, "loss": 1.6129, "step": 50300 }, { "epoch": 0.41730145061932833, "grad_norm": 0.2552422285079956, "learning_rate": 0.00017219040528392002, "loss": 1.6117, "step": 50400 }, { "epoch": 0.4181294296880175, "grad_norm": 0.21545369923114777, "learning_rate": 0.0001721352066747704, "loss": 1.6433, "step": 50500 }, { "epoch": 0.4189574087567066, "grad_norm": 0.23316791653633118, "learning_rate": 0.00017208000806562078, "loss": 1.6806, "step": 50600 }, { "epoch": 0.41978538782539576, "grad_norm": 0.2567419409751892, "learning_rate": 0.0001720248094564712, "loss": 1.6253, "step": 50700 }, { "epoch": 0.4206133668940849, "grad_norm": 0.31419897079467773, "learning_rate": 0.00017196961084732157, "loss": 1.611, "step": 50800 }, { "epoch": 0.42144134596277405, "grad_norm": 0.26649272441864014, "learning_rate": 0.00017191441223817196, "loss": 1.6269, "step": 50900 }, { "epoch": 0.4222693250314632, "grad_norm": 0.24049483239650726, "learning_rate": 0.00017185921362902239, "loss": 1.6257, "step": 51000 }, { "epoch": 0.42309730410015234, "grad_norm": 0.22467993199825287, "learning_rate": 0.00017180401501987275, "loss": 1.5821, "step": 51100 }, { "epoch": 0.4239252831688415, "grad_norm": 0.27348917722702026, "learning_rate": 0.00017174881641072315, "loss": 1.6243, "step": 51200 }, { "epoch": 0.4247532622375306, "grad_norm": 0.23247596621513367, "learning_rate": 0.00017169361780157357, "loss": 1.6333, "step": 51300 }, { "epoch": 0.42558124130621977, "grad_norm": 0.25708910822868347, "learning_rate": 0.00017163841919242394, "loss": 1.6168, "step": 51400 }, { "epoch": 0.4264092203749089, "grad_norm": 0.2495090365409851, "learning_rate": 0.00017158322058327433, "loss": 1.6105, "step": 51500 }, { "epoch": 0.42723719944359806, "grad_norm": 0.2512340545654297, "learning_rate": 0.00017152802197412475, "loss": 1.631, "step": 51600 }, { "epoch": 0.4280651785122872, "grad_norm": 0.27026236057281494, "learning_rate": 0.00017147282336497512, "loss": 1.5874, "step": 51700 }, { "epoch": 0.42889315758097635, "grad_norm": 0.23251605033874512, "learning_rate": 0.0001714176247558255, "loss": 1.6383, "step": 51800 }, { "epoch": 0.4297211366496655, "grad_norm": 0.3068407475948334, "learning_rate": 0.00017136242614667593, "loss": 1.607, "step": 51900 }, { "epoch": 0.43054911571835464, "grad_norm": 0.22653043270111084, "learning_rate": 0.0001713072275375263, "loss": 1.5913, "step": 52000 }, { "epoch": 0.4313770947870438, "grad_norm": 0.22893564403057098, "learning_rate": 0.0001712520289283767, "loss": 1.6218, "step": 52100 }, { "epoch": 0.4322050738557329, "grad_norm": 0.23899468779563904, "learning_rate": 0.00017119683031922712, "loss": 1.6178, "step": 52200 }, { "epoch": 0.43303305292442207, "grad_norm": 0.26594260334968567, "learning_rate": 0.00017114163171007748, "loss": 1.6118, "step": 52300 }, { "epoch": 0.4338610319931112, "grad_norm": 0.29181089997291565, "learning_rate": 0.00017108643310092788, "loss": 1.6391, "step": 52400 }, { "epoch": 0.43468901106180036, "grad_norm": 0.35307735204696655, "learning_rate": 0.0001710312344917783, "loss": 1.601, "step": 52500 }, { "epoch": 0.4355169901304895, "grad_norm": 0.25828322768211365, "learning_rate": 0.00017097603588262867, "loss": 1.6421, "step": 52600 }, { "epoch": 0.43634496919917864, "grad_norm": 0.30803248286247253, "learning_rate": 0.00017092083727347906, "loss": 1.655, "step": 52700 }, { "epoch": 0.4371729482678678, "grad_norm": 0.23458120226860046, "learning_rate": 0.00017086563866432948, "loss": 1.6099, "step": 52800 }, { "epoch": 0.43800092733655693, "grad_norm": 0.23281985521316528, "learning_rate": 0.00017081044005517985, "loss": 1.6388, "step": 52900 }, { "epoch": 0.4388289064052461, "grad_norm": 0.23414187133312225, "learning_rate": 0.00017075524144603024, "loss": 1.5956, "step": 53000 }, { "epoch": 0.4396568854739352, "grad_norm": 0.25793951749801636, "learning_rate": 0.00017070004283688066, "loss": 1.6734, "step": 53100 }, { "epoch": 0.44048486454262437, "grad_norm": 0.23932301998138428, "learning_rate": 0.00017064484422773103, "loss": 1.6106, "step": 53200 }, { "epoch": 0.4413128436113135, "grad_norm": 0.24835747480392456, "learning_rate": 0.00017058964561858143, "loss": 1.6077, "step": 53300 }, { "epoch": 0.44214082268000265, "grad_norm": 0.26732298731803894, "learning_rate": 0.00017053444700943185, "loss": 1.5729, "step": 53400 }, { "epoch": 0.4429688017486918, "grad_norm": 0.2826359272003174, "learning_rate": 0.00017047924840028221, "loss": 1.6623, "step": 53500 }, { "epoch": 0.44379678081738094, "grad_norm": 0.24312160909175873, "learning_rate": 0.00017042404979113264, "loss": 1.6075, "step": 53600 }, { "epoch": 0.4446247598860701, "grad_norm": 0.293720006942749, "learning_rate": 0.00017036885118198303, "loss": 1.6293, "step": 53700 }, { "epoch": 0.44545273895475923, "grad_norm": 0.2534725069999695, "learning_rate": 0.0001703136525728334, "loss": 1.5869, "step": 53800 }, { "epoch": 0.4462807180234484, "grad_norm": 0.25833797454833984, "learning_rate": 0.00017025845396368382, "loss": 1.6359, "step": 53900 }, { "epoch": 0.4471086970921375, "grad_norm": 0.2595597207546234, "learning_rate": 0.0001702032553545342, "loss": 1.598, "step": 54000 }, { "epoch": 0.44793667616082666, "grad_norm": 0.2243652492761612, "learning_rate": 0.00017014805674538458, "loss": 1.6555, "step": 54100 }, { "epoch": 0.4487646552295158, "grad_norm": 0.2626785337924957, "learning_rate": 0.000170092858136235, "loss": 1.6327, "step": 54200 }, { "epoch": 0.44959263429820495, "grad_norm": 0.28202512860298157, "learning_rate": 0.0001700376595270854, "loss": 1.5877, "step": 54300 }, { "epoch": 0.4504206133668941, "grad_norm": 0.23521679639816284, "learning_rate": 0.00016998246091793576, "loss": 1.6357, "step": 54400 }, { "epoch": 0.45124859243558324, "grad_norm": 0.24466179311275482, "learning_rate": 0.00016992726230878618, "loss": 1.6089, "step": 54500 }, { "epoch": 0.4520765715042724, "grad_norm": 0.3313222825527191, "learning_rate": 0.00016987261568572804, "loss": 1.634, "step": 54600 }, { "epoch": 0.45290455057296153, "grad_norm": 0.21911990642547607, "learning_rate": 0.00016981741707657846, "loss": 1.6241, "step": 54700 }, { "epoch": 0.4537325296416507, "grad_norm": 0.24226447939872742, "learning_rate": 0.00016976221846742886, "loss": 1.5985, "step": 54800 }, { "epoch": 0.4545605087103398, "grad_norm": 0.25372591614723206, "learning_rate": 0.00016970701985827925, "loss": 1.5914, "step": 54900 }, { "epoch": 0.45538848777902896, "grad_norm": 0.27368178963661194, "learning_rate": 0.00016965182124912965, "loss": 1.652, "step": 55000 }, { "epoch": 0.4562164668477181, "grad_norm": 0.305602103471756, "learning_rate": 0.00016959662263998004, "loss": 1.651, "step": 55100 }, { "epoch": 0.45704444591640725, "grad_norm": 0.2739979326725006, "learning_rate": 0.00016954142403083044, "loss": 1.6583, "step": 55200 }, { "epoch": 0.4578724249850964, "grad_norm": 0.2888374924659729, "learning_rate": 0.00016948622542168083, "loss": 1.6283, "step": 55300 }, { "epoch": 0.45870040405378554, "grad_norm": 0.26104286313056946, "learning_rate": 0.00016943102681253122, "loss": 1.6222, "step": 55400 }, { "epoch": 0.4595283831224747, "grad_norm": 0.2540399134159088, "learning_rate": 0.00016937582820338162, "loss": 1.62, "step": 55500 }, { "epoch": 0.4603563621911638, "grad_norm": 0.2812783718109131, "learning_rate": 0.000169320629594232, "loss": 1.6215, "step": 55600 }, { "epoch": 0.46118434125985297, "grad_norm": 0.28794047236442566, "learning_rate": 0.0001692654309850824, "loss": 1.6361, "step": 55700 }, { "epoch": 0.4620123203285421, "grad_norm": 0.24678725004196167, "learning_rate": 0.0001692102323759328, "loss": 1.6053, "step": 55800 }, { "epoch": 0.46284029939723126, "grad_norm": 0.2598378360271454, "learning_rate": 0.0001691550337667832, "loss": 1.606, "step": 55900 }, { "epoch": 0.4636682784659204, "grad_norm": 0.3878665566444397, "learning_rate": 0.0001690998351576336, "loss": 1.6494, "step": 56000 }, { "epoch": 0.46449625753460955, "grad_norm": 0.24927765130996704, "learning_rate": 0.00016904463654848398, "loss": 1.6219, "step": 56100 }, { "epoch": 0.4653242366032987, "grad_norm": 0.2416529506444931, "learning_rate": 0.00016898943793933438, "loss": 1.6164, "step": 56200 }, { "epoch": 0.46615221567198784, "grad_norm": 0.3543456792831421, "learning_rate": 0.00016893423933018477, "loss": 1.604, "step": 56300 }, { "epoch": 0.466980194740677, "grad_norm": 0.3229213058948517, "learning_rate": 0.00016887904072103517, "loss": 1.6126, "step": 56400 }, { "epoch": 0.4678081738093661, "grad_norm": 0.24769534170627594, "learning_rate": 0.00016882384211188556, "loss": 1.6298, "step": 56500 }, { "epoch": 0.46863615287805527, "grad_norm": 0.25397947430610657, "learning_rate": 0.00016876864350273595, "loss": 1.6389, "step": 56600 }, { "epoch": 0.4694641319467444, "grad_norm": 0.2763853371143341, "learning_rate": 0.00016871399687967784, "loss": 1.602, "step": 56700 }, { "epoch": 0.47029211101543356, "grad_norm": 0.24106226861476898, "learning_rate": 0.00016865879827052824, "loss": 1.6317, "step": 56800 }, { "epoch": 0.4711200900841227, "grad_norm": 0.31650885939598083, "learning_rate": 0.00016860359966137863, "loss": 1.6287, "step": 56900 }, { "epoch": 0.47194806915281184, "grad_norm": 0.22699052095413208, "learning_rate": 0.00016854840105222902, "loss": 1.6059, "step": 57000 }, { "epoch": 0.47277604822150093, "grad_norm": 0.23599205911159515, "learning_rate": 0.00016849320244307942, "loss": 1.6359, "step": 57100 }, { "epoch": 0.4736040272901901, "grad_norm": 0.2733438014984131, "learning_rate": 0.0001684380038339298, "loss": 1.6221, "step": 57200 }, { "epoch": 0.4744320063588792, "grad_norm": 0.22067943215370178, "learning_rate": 0.0001683828052247802, "loss": 1.603, "step": 57300 }, { "epoch": 0.47525998542756837, "grad_norm": 0.25246864557266235, "learning_rate": 0.0001683276066156306, "loss": 1.5875, "step": 57400 }, { "epoch": 0.4760879644962575, "grad_norm": 0.2747368812561035, "learning_rate": 0.000168272408006481, "loss": 1.6522, "step": 57500 }, { "epoch": 0.47691594356494665, "grad_norm": 0.352892130613327, "learning_rate": 0.0001682172093973314, "loss": 1.609, "step": 57600 }, { "epoch": 0.4777439226336358, "grad_norm": 0.26428744196891785, "learning_rate": 0.00016816201078818178, "loss": 1.6191, "step": 57700 }, { "epoch": 0.47857190170232494, "grad_norm": 0.2937561571598053, "learning_rate": 0.0001681068121790322, "loss": 1.6204, "step": 57800 }, { "epoch": 0.4793998807710141, "grad_norm": 0.23604629933834076, "learning_rate": 0.00016805161356988257, "loss": 1.6416, "step": 57900 }, { "epoch": 0.48022785983970323, "grad_norm": 0.2349170446395874, "learning_rate": 0.00016799641496073297, "loss": 1.6116, "step": 58000 }, { "epoch": 0.4810558389083924, "grad_norm": 0.28929609060287476, "learning_rate": 0.0001679412163515834, "loss": 1.5956, "step": 58100 }, { "epoch": 0.4818838179770815, "grad_norm": 0.2797716557979584, "learning_rate": 0.00016788601774243375, "loss": 1.598, "step": 58200 }, { "epoch": 0.48271179704577066, "grad_norm": 0.30529841780662537, "learning_rate": 0.00016783081913328415, "loss": 1.6324, "step": 58300 }, { "epoch": 0.4835397761144598, "grad_norm": 0.35713014006614685, "learning_rate": 0.00016777562052413457, "loss": 1.624, "step": 58400 }, { "epoch": 0.48436775518314895, "grad_norm": 0.29055917263031006, "learning_rate": 0.00016772042191498494, "loss": 1.618, "step": 58500 }, { "epoch": 0.4851957342518381, "grad_norm": 0.24039526283740997, "learning_rate": 0.00016766522330583533, "loss": 1.6056, "step": 58600 }, { "epoch": 0.48602371332052724, "grad_norm": 0.26216012239456177, "learning_rate": 0.00016761002469668575, "loss": 1.6113, "step": 58700 }, { "epoch": 0.4868516923892164, "grad_norm": 0.2159920334815979, "learning_rate": 0.00016755482608753612, "loss": 1.6335, "step": 58800 }, { "epoch": 0.48767967145790553, "grad_norm": 0.26959407329559326, "learning_rate": 0.00016749962747838651, "loss": 1.6376, "step": 58900 }, { "epoch": 0.4885076505265947, "grad_norm": 0.2333938479423523, "learning_rate": 0.00016744442886923694, "loss": 1.6516, "step": 59000 }, { "epoch": 0.4893356295952838, "grad_norm": 0.3124794065952301, "learning_rate": 0.0001673892302600873, "loss": 1.6144, "step": 59100 }, { "epoch": 0.49016360866397296, "grad_norm": 0.26165392994880676, "learning_rate": 0.0001673340316509377, "loss": 1.6102, "step": 59200 }, { "epoch": 0.4909915877326621, "grad_norm": 0.24849289655685425, "learning_rate": 0.00016727883304178812, "loss": 1.6353, "step": 59300 }, { "epoch": 0.49181956680135125, "grad_norm": 0.29942548274993896, "learning_rate": 0.00016722418641872998, "loss": 1.5957, "step": 59400 }, { "epoch": 0.4926475458700404, "grad_norm": 0.29977184534072876, "learning_rate": 0.0001671689878095804, "loss": 1.6041, "step": 59500 }, { "epoch": 0.49347552493872954, "grad_norm": 0.30653277039527893, "learning_rate": 0.00016711378920043077, "loss": 1.6167, "step": 59600 }, { "epoch": 0.4943035040074187, "grad_norm": 0.26511892676353455, "learning_rate": 0.00016705859059128116, "loss": 1.6033, "step": 59700 }, { "epoch": 0.4951314830761078, "grad_norm": 0.2956562340259552, "learning_rate": 0.00016700339198213158, "loss": 1.6046, "step": 59800 }, { "epoch": 0.49595946214479697, "grad_norm": 0.2536456286907196, "learning_rate": 0.00016694819337298195, "loss": 1.5994, "step": 59900 }, { "epoch": 0.4967874412134861, "grad_norm": 0.31090065836906433, "learning_rate": 0.00016689299476383234, "loss": 1.6182, "step": 60000 }, { "epoch": 0.49761542028217526, "grad_norm": 0.34695854783058167, "learning_rate": 0.00016683779615468276, "loss": 1.6126, "step": 60100 }, { "epoch": 0.4984433993508644, "grad_norm": 0.2310916781425476, "learning_rate": 0.00016678259754553313, "loss": 1.6232, "step": 60200 }, { "epoch": 0.49927137841955355, "grad_norm": 0.26144957542419434, "learning_rate": 0.00016672739893638353, "loss": 1.6211, "step": 60300 }, { "epoch": 0.5000993574882427, "grad_norm": 0.24252454936504364, "learning_rate": 0.00016667220032723395, "loss": 1.6249, "step": 60400 }, { "epoch": 0.5009273365569319, "grad_norm": 0.3022114932537079, "learning_rate": 0.00016661700171808431, "loss": 1.5934, "step": 60500 }, { "epoch": 0.501755315625621, "grad_norm": 0.23993396759033203, "learning_rate": 0.00016656180310893474, "loss": 1.5778, "step": 60600 }, { "epoch": 0.5025832946943102, "grad_norm": 0.2709307074546814, "learning_rate": 0.00016650660449978513, "loss": 1.6199, "step": 60700 }, { "epoch": 0.5034112737629993, "grad_norm": 0.25770658254623413, "learning_rate": 0.0001664514058906355, "loss": 1.6205, "step": 60800 }, { "epoch": 0.5042392528316885, "grad_norm": 0.25296691060066223, "learning_rate": 0.00016639620728148592, "loss": 1.5798, "step": 60900 }, { "epoch": 0.5050672319003776, "grad_norm": 0.26305311918258667, "learning_rate": 0.0001663410086723363, "loss": 1.6263, "step": 61000 }, { "epoch": 0.5058952109690668, "grad_norm": 0.29788756370544434, "learning_rate": 0.00016628581006318668, "loss": 1.6235, "step": 61100 }, { "epoch": 0.5067231900377559, "grad_norm": 0.3243436813354492, "learning_rate": 0.0001662306114540371, "loss": 1.5963, "step": 61200 }, { "epoch": 0.507551169106445, "grad_norm": 0.2363291084766388, "learning_rate": 0.0001661754128448875, "loss": 1.6289, "step": 61300 }, { "epoch": 0.5083791481751342, "grad_norm": 0.2559576630592346, "learning_rate": 0.00016612021423573786, "loss": 1.6197, "step": 61400 }, { "epoch": 0.5092071272438232, "grad_norm": 0.25220799446105957, "learning_rate": 0.00016606501562658828, "loss": 1.5826, "step": 61500 }, { "epoch": 0.5100351063125124, "grad_norm": 0.27892744541168213, "learning_rate": 0.00016600981701743868, "loss": 1.649, "step": 61600 }, { "epoch": 0.5108630853812015, "grad_norm": 0.26306354999542236, "learning_rate": 0.00016595461840828904, "loss": 1.6075, "step": 61700 }, { "epoch": 0.5116910644498907, "grad_norm": 0.3013826906681061, "learning_rate": 0.00016589941979913947, "loss": 1.6187, "step": 61800 }, { "epoch": 0.5125190435185798, "grad_norm": 0.24135053157806396, "learning_rate": 0.00016584422118998986, "loss": 1.6182, "step": 61900 }, { "epoch": 0.5133470225872689, "grad_norm": 0.2604123651981354, "learning_rate": 0.00016578902258084023, "loss": 1.6128, "step": 62000 }, { "epoch": 0.5141750016559581, "grad_norm": 0.27147355675697327, "learning_rate": 0.00016573382397169065, "loss": 1.6312, "step": 62100 }, { "epoch": 0.5150029807246472, "grad_norm": 0.25358569622039795, "learning_rate": 0.00016567862536254104, "loss": 1.6305, "step": 62200 }, { "epoch": 0.5158309597933364, "grad_norm": 0.28588297963142395, "learning_rate": 0.0001656234267533914, "loss": 1.6501, "step": 62300 }, { "epoch": 0.5166589388620255, "grad_norm": 0.28298503160476685, "learning_rate": 0.00016556822814424183, "loss": 1.6245, "step": 62400 }, { "epoch": 0.5174869179307147, "grad_norm": 0.248241126537323, "learning_rate": 0.00016551302953509223, "loss": 1.623, "step": 62500 }, { "epoch": 0.5183148969994038, "grad_norm": 0.352561891078949, "learning_rate": 0.0001654578309259426, "loss": 1.6494, "step": 62600 }, { "epoch": 0.519142876068093, "grad_norm": 0.24471941590309143, "learning_rate": 0.00016540263231679301, "loss": 1.6013, "step": 62700 }, { "epoch": 0.5199708551367821, "grad_norm": 0.2803977429866791, "learning_rate": 0.0001653474337076434, "loss": 1.5904, "step": 62800 }, { "epoch": 0.5207988342054712, "grad_norm": 0.2543172240257263, "learning_rate": 0.00016529223509849378, "loss": 1.62, "step": 62900 }, { "epoch": 0.5216268132741604, "grad_norm": 0.27087855339050293, "learning_rate": 0.0001652370364893442, "loss": 1.6163, "step": 63000 }, { "epoch": 0.5224547923428495, "grad_norm": 0.2647192180156708, "learning_rate": 0.0001651818378801946, "loss": 1.5956, "step": 63100 }, { "epoch": 0.5232827714115387, "grad_norm": 0.2669653594493866, "learning_rate": 0.00016512663927104496, "loss": 1.5881, "step": 63200 }, { "epoch": 0.5241107504802278, "grad_norm": 0.21718978881835938, "learning_rate": 0.00016507144066189538, "loss": 1.5997, "step": 63300 }, { "epoch": 0.524938729548917, "grad_norm": 0.31373196840286255, "learning_rate": 0.00016501679403883727, "loss": 1.6362, "step": 63400 }, { "epoch": 0.5257667086176061, "grad_norm": 0.2445104718208313, "learning_rate": 0.00016496159542968766, "loss": 1.6085, "step": 63500 }, { "epoch": 0.5265946876862952, "grad_norm": 0.2483099400997162, "learning_rate": 0.00016490639682053805, "loss": 1.6412, "step": 63600 }, { "epoch": 0.5274226667549844, "grad_norm": 0.2993902564048767, "learning_rate": 0.00016485119821138845, "loss": 1.605, "step": 63700 }, { "epoch": 0.5282506458236735, "grad_norm": 0.28177618980407715, "learning_rate": 0.00016479655158833033, "loss": 1.6216, "step": 63800 }, { "epoch": 0.5290786248923627, "grad_norm": 0.2316046804189682, "learning_rate": 0.00016474135297918073, "loss": 1.593, "step": 63900 }, { "epoch": 0.5299066039610518, "grad_norm": 0.24782416224479675, "learning_rate": 0.00016468615437003112, "loss": 1.622, "step": 64000 }, { "epoch": 0.530734583029741, "grad_norm": 0.2592697739601135, "learning_rate": 0.00016463095576088152, "loss": 1.6015, "step": 64100 }, { "epoch": 0.5315625620984301, "grad_norm": 0.2624833881855011, "learning_rate": 0.0001645757571517319, "loss": 1.6343, "step": 64200 }, { "epoch": 0.5323905411671193, "grad_norm": 0.25795653462409973, "learning_rate": 0.0001645205585425823, "loss": 1.6222, "step": 64300 }, { "epoch": 0.5332185202358084, "grad_norm": 0.2875909209251404, "learning_rate": 0.0001644653599334327, "loss": 1.6077, "step": 64400 }, { "epoch": 0.5340464993044975, "grad_norm": 0.2648106515407562, "learning_rate": 0.0001644101613242831, "loss": 1.6233, "step": 64500 }, { "epoch": 0.5348744783731867, "grad_norm": 0.24412347376346588, "learning_rate": 0.0001643549627151335, "loss": 1.6316, "step": 64600 }, { "epoch": 0.5357024574418758, "grad_norm": 0.24017901718616486, "learning_rate": 0.00016429976410598388, "loss": 1.607, "step": 64700 }, { "epoch": 0.536530436510565, "grad_norm": 0.24622277915477753, "learning_rate": 0.00016424456549683428, "loss": 1.5945, "step": 64800 }, { "epoch": 0.5373584155792541, "grad_norm": 0.2481980174779892, "learning_rate": 0.00016418936688768467, "loss": 1.615, "step": 64900 }, { "epoch": 0.5381863946479433, "grad_norm": 0.24149680137634277, "learning_rate": 0.00016413416827853507, "loss": 1.6407, "step": 65000 }, { "epoch": 0.5390143737166324, "grad_norm": 0.25136297941207886, "learning_rate": 0.0001640789696693855, "loss": 1.6301, "step": 65100 }, { "epoch": 0.5398423527853216, "grad_norm": 0.3350437879562378, "learning_rate": 0.00016402377106023585, "loss": 1.6091, "step": 65200 }, { "epoch": 0.5406703318540107, "grad_norm": 0.23967650532722473, "learning_rate": 0.00016396857245108625, "loss": 1.6442, "step": 65300 }, { "epoch": 0.5414983109226998, "grad_norm": 0.22246354818344116, "learning_rate": 0.00016391337384193667, "loss": 1.6478, "step": 65400 }, { "epoch": 0.542326289991389, "grad_norm": 0.2450036257505417, "learning_rate": 0.00016385817523278704, "loss": 1.632, "step": 65500 }, { "epoch": 0.5431542690600781, "grad_norm": 0.2727454602718353, "learning_rate": 0.00016380297662363743, "loss": 1.6217, "step": 65600 }, { "epoch": 0.5439822481287673, "grad_norm": 0.21638920903205872, "learning_rate": 0.00016374777801448785, "loss": 1.62, "step": 65700 }, { "epoch": 0.5448102271974564, "grad_norm": 0.23474307358264923, "learning_rate": 0.00016369257940533822, "loss": 1.6286, "step": 65800 }, { "epoch": 0.5456382062661456, "grad_norm": 0.2379283905029297, "learning_rate": 0.00016363738079618861, "loss": 1.6096, "step": 65900 }, { "epoch": 0.5464661853348347, "grad_norm": 0.2463446408510208, "learning_rate": 0.00016358218218703904, "loss": 1.6315, "step": 66000 }, { "epoch": 0.5472941644035239, "grad_norm": 0.2681124210357666, "learning_rate": 0.0001635269835778894, "loss": 1.6258, "step": 66100 }, { "epoch": 0.548122143472213, "grad_norm": 0.3003305494785309, "learning_rate": 0.0001634717849687398, "loss": 1.6177, "step": 66200 }, { "epoch": 0.5489501225409021, "grad_norm": 0.24380895495414734, "learning_rate": 0.00016341658635959022, "loss": 1.6304, "step": 66300 }, { "epoch": 0.5497781016095913, "grad_norm": 0.22243930399417877, "learning_rate": 0.00016336138775044058, "loss": 1.6123, "step": 66400 }, { "epoch": 0.5506060806782804, "grad_norm": 0.33760640025138855, "learning_rate": 0.00016330618914129098, "loss": 1.6521, "step": 66500 }, { "epoch": 0.5514340597469696, "grad_norm": 0.23244598507881165, "learning_rate": 0.0001632509905321414, "loss": 1.6081, "step": 66600 }, { "epoch": 0.5522620388156587, "grad_norm": 0.36719369888305664, "learning_rate": 0.00016319579192299177, "loss": 1.6511, "step": 66700 }, { "epoch": 0.5530900178843479, "grad_norm": 0.27889949083328247, "learning_rate": 0.00016314059331384216, "loss": 1.6124, "step": 66800 }, { "epoch": 0.553917996953037, "grad_norm": 0.271400511264801, "learning_rate": 0.00016308539470469258, "loss": 1.6521, "step": 66900 }, { "epoch": 0.5547459760217262, "grad_norm": 0.2746060788631439, "learning_rate": 0.00016303019609554295, "loss": 1.624, "step": 67000 }, { "epoch": 0.5555739550904153, "grad_norm": 0.23455697298049927, "learning_rate": 0.00016297499748639334, "loss": 1.6593, "step": 67100 }, { "epoch": 0.5564019341591044, "grad_norm": 0.3208955228328705, "learning_rate": 0.00016291979887724377, "loss": 1.6015, "step": 67200 }, { "epoch": 0.5572299132277936, "grad_norm": 0.4708058834075928, "learning_rate": 0.00016286460026809413, "loss": 1.6145, "step": 67300 }, { "epoch": 0.5580578922964827, "grad_norm": 0.26985836029052734, "learning_rate": 0.00016280940165894453, "loss": 1.6278, "step": 67400 }, { "epoch": 0.5588858713651719, "grad_norm": 0.26395246386528015, "learning_rate": 0.00016275420304979495, "loss": 1.6238, "step": 67500 }, { "epoch": 0.559713850433861, "grad_norm": 0.24450361728668213, "learning_rate": 0.00016269900444064532, "loss": 1.6155, "step": 67600 }, { "epoch": 0.5605418295025502, "grad_norm": 0.2270006537437439, "learning_rate": 0.0001626438058314957, "loss": 1.5851, "step": 67700 }, { "epoch": 0.5613698085712393, "grad_norm": 0.23719048500061035, "learning_rate": 0.00016258860722234613, "loss": 1.6182, "step": 67800 }, { "epoch": 0.5621977876399284, "grad_norm": 0.2850781977176666, "learning_rate": 0.0001625334086131965, "loss": 1.6223, "step": 67900 }, { "epoch": 0.5630257667086176, "grad_norm": 0.2562520205974579, "learning_rate": 0.0001624782100040469, "loss": 1.6209, "step": 68000 }, { "epoch": 0.5638537457773067, "grad_norm": 0.2286711037158966, "learning_rate": 0.00016242301139489731, "loss": 1.6387, "step": 68100 }, { "epoch": 0.5646817248459959, "grad_norm": 0.27937260270118713, "learning_rate": 0.00016236781278574768, "loss": 1.6235, "step": 68200 }, { "epoch": 0.565509703914685, "grad_norm": 0.24311909079551697, "learning_rate": 0.00016231261417659808, "loss": 1.6306, "step": 68300 }, { "epoch": 0.5663376829833742, "grad_norm": 0.24035251140594482, "learning_rate": 0.0001622574155674485, "loss": 1.6538, "step": 68400 }, { "epoch": 0.5671656620520633, "grad_norm": 0.23354622721672058, "learning_rate": 0.00016220221695829886, "loss": 1.6199, "step": 68500 }, { "epoch": 0.5679936411207525, "grad_norm": 0.2286653369665146, "learning_rate": 0.00016214701834914926, "loss": 1.6107, "step": 68600 }, { "epoch": 0.5688216201894416, "grad_norm": 0.2563110291957855, "learning_rate": 0.00016209237172609117, "loss": 1.6103, "step": 68700 }, { "epoch": 0.5696495992581307, "grad_norm": 0.2600407600402832, "learning_rate": 0.00016203717311694157, "loss": 1.6191, "step": 68800 }, { "epoch": 0.5704775783268199, "grad_norm": 0.29776090383529663, "learning_rate": 0.00016198197450779196, "loss": 1.5936, "step": 68900 }, { "epoch": 0.571305557395509, "grad_norm": 0.24299946427345276, "learning_rate": 0.00016192677589864235, "loss": 1.6416, "step": 69000 }, { "epoch": 0.5721335364641982, "grad_norm": 0.2771673798561096, "learning_rate": 0.00016187157728949275, "loss": 1.6396, "step": 69100 }, { "epoch": 0.5729615155328873, "grad_norm": 0.24624699354171753, "learning_rate": 0.00016181637868034314, "loss": 1.604, "step": 69200 }, { "epoch": 0.5737894946015765, "grad_norm": 0.2401464879512787, "learning_rate": 0.00016176173205728503, "loss": 1.6037, "step": 69300 }, { "epoch": 0.5746174736702656, "grad_norm": 0.23874430358409882, "learning_rate": 0.00016170653344813542, "loss": 1.584, "step": 69400 }, { "epoch": 0.5754454527389548, "grad_norm": 0.255275696516037, "learning_rate": 0.00016165133483898582, "loss": 1.6203, "step": 69500 }, { "epoch": 0.5762734318076439, "grad_norm": 0.27830347418785095, "learning_rate": 0.0001615961362298362, "loss": 1.5646, "step": 69600 }, { "epoch": 0.577101410876333, "grad_norm": 0.24452444911003113, "learning_rate": 0.0001615409376206866, "loss": 1.6278, "step": 69700 }, { "epoch": 0.5779293899450222, "grad_norm": 0.2404039353132248, "learning_rate": 0.000161485739011537, "loss": 1.6502, "step": 69800 }, { "epoch": 0.5787573690137113, "grad_norm": 0.28415966033935547, "learning_rate": 0.0001614305404023874, "loss": 1.5849, "step": 69900 }, { "epoch": 0.5795853480824005, "grad_norm": 0.27514511346817017, "learning_rate": 0.0001613753417932378, "loss": 1.5964, "step": 70000 }, { "epoch": 0.5804133271510896, "grad_norm": 0.24850310385227203, "learning_rate": 0.00016132014318408818, "loss": 1.6137, "step": 70100 }, { "epoch": 0.5812413062197788, "grad_norm": 0.26514944434165955, "learning_rate": 0.00016126494457493858, "loss": 1.6413, "step": 70200 }, { "epoch": 0.5820692852884679, "grad_norm": 0.23754677176475525, "learning_rate": 0.00016120974596578897, "loss": 1.6314, "step": 70300 }, { "epoch": 0.582897264357157, "grad_norm": 0.24664048850536346, "learning_rate": 0.00016115454735663937, "loss": 1.5932, "step": 70400 }, { "epoch": 0.5837252434258462, "grad_norm": 0.2942032814025879, "learning_rate": 0.00016109934874748976, "loss": 1.6285, "step": 70500 }, { "epoch": 0.5845532224945353, "grad_norm": 0.28962376713752747, "learning_rate": 0.00016104415013834015, "loss": 1.6427, "step": 70600 }, { "epoch": 0.5853812015632245, "grad_norm": 0.2608729302883148, "learning_rate": 0.00016098895152919055, "loss": 1.6261, "step": 70700 }, { "epoch": 0.5862091806319136, "grad_norm": 0.2974378764629364, "learning_rate": 0.00016093375292004094, "loss": 1.617, "step": 70800 }, { "epoch": 0.5870371597006028, "grad_norm": 0.2549142837524414, "learning_rate": 0.00016087855431089134, "loss": 1.5969, "step": 70900 }, { "epoch": 0.5878651387692919, "grad_norm": 0.22917169332504272, "learning_rate": 0.00016082335570174173, "loss": 1.6223, "step": 71000 }, { "epoch": 0.5886931178379811, "grad_norm": 0.2683030068874359, "learning_rate": 0.00016076815709259213, "loss": 1.6115, "step": 71100 }, { "epoch": 0.5895210969066702, "grad_norm": 0.24944636225700378, "learning_rate": 0.00016071295848344252, "loss": 1.5796, "step": 71200 }, { "epoch": 0.5903490759753593, "grad_norm": 0.2558874487876892, "learning_rate": 0.0001606577598742929, "loss": 1.5923, "step": 71300 }, { "epoch": 0.5911770550440485, "grad_norm": 0.23976579308509827, "learning_rate": 0.0001606025612651433, "loss": 1.6389, "step": 71400 }, { "epoch": 0.5920050341127376, "grad_norm": 0.2433394193649292, "learning_rate": 0.0001605479146420852, "loss": 1.6068, "step": 71500 }, { "epoch": 0.5928330131814268, "grad_norm": 0.2515048384666443, "learning_rate": 0.0001604927160329356, "loss": 1.6122, "step": 71600 }, { "epoch": 0.5936609922501159, "grad_norm": 0.271219938993454, "learning_rate": 0.00016043751742378598, "loss": 1.5872, "step": 71700 }, { "epoch": 0.5944889713188051, "grad_norm": 0.2834341824054718, "learning_rate": 0.00016038231881463638, "loss": 1.6537, "step": 71800 }, { "epoch": 0.5953169503874942, "grad_norm": 0.2426416575908661, "learning_rate": 0.00016032712020548677, "loss": 1.6489, "step": 71900 }, { "epoch": 0.5961449294561834, "grad_norm": 0.2656005322933197, "learning_rate": 0.00016027192159633717, "loss": 1.5983, "step": 72000 }, { "epoch": 0.5969729085248725, "grad_norm": 0.2662166655063629, "learning_rate": 0.0001602167229871876, "loss": 1.5954, "step": 72100 }, { "epoch": 0.5978008875935616, "grad_norm": 0.1967765837907791, "learning_rate": 0.00016016152437803795, "loss": 1.6181, "step": 72200 }, { "epoch": 0.5986288666622508, "grad_norm": 0.28618454933166504, "learning_rate": 0.00016010632576888835, "loss": 1.5902, "step": 72300 }, { "epoch": 0.5994568457309399, "grad_norm": 0.24525755643844604, "learning_rate": 0.00016005112715973877, "loss": 1.608, "step": 72400 }, { "epoch": 0.6002848247996291, "grad_norm": 0.32975664734840393, "learning_rate": 0.00015999592855058914, "loss": 1.6127, "step": 72500 }, { "epoch": 0.6011128038683182, "grad_norm": 0.3004739582538605, "learning_rate": 0.00015994072994143953, "loss": 1.6255, "step": 72600 }, { "epoch": 0.6019407829370074, "grad_norm": 0.28511524200439453, "learning_rate": 0.00015988553133228995, "loss": 1.61, "step": 72700 }, { "epoch": 0.6027687620056965, "grad_norm": 0.28003406524658203, "learning_rate": 0.00015983033272314032, "loss": 1.622, "step": 72800 }, { "epoch": 0.6035967410743857, "grad_norm": 0.25821688771247864, "learning_rate": 0.0001597751341139907, "loss": 1.6041, "step": 72900 }, { "epoch": 0.6044247201430748, "grad_norm": 0.2734212577342987, "learning_rate": 0.00015971993550484113, "loss": 1.6453, "step": 73000 }, { "epoch": 0.605252699211764, "grad_norm": 0.25921088457107544, "learning_rate": 0.0001596647368956915, "loss": 1.6162, "step": 73100 }, { "epoch": 0.6060806782804531, "grad_norm": 0.26158612966537476, "learning_rate": 0.0001596095382865419, "loss": 1.6102, "step": 73200 }, { "epoch": 0.6069086573491422, "grad_norm": 0.28052300214767456, "learning_rate": 0.00015955433967739232, "loss": 1.6353, "step": 73300 }, { "epoch": 0.6077366364178314, "grad_norm": 0.25399696826934814, "learning_rate": 0.00015949914106824268, "loss": 1.6477, "step": 73400 }, { "epoch": 0.6085646154865205, "grad_norm": 0.30021095275878906, "learning_rate": 0.00015944394245909308, "loss": 1.6177, "step": 73500 }, { "epoch": 0.6093925945552097, "grad_norm": 0.2222752422094345, "learning_rate": 0.0001593887438499435, "loss": 1.6026, "step": 73600 }, { "epoch": 0.6102205736238988, "grad_norm": 0.3250563442707062, "learning_rate": 0.00015933354524079387, "loss": 1.65, "step": 73700 }, { "epoch": 0.611048552692588, "grad_norm": 0.27014991641044617, "learning_rate": 0.00015927834663164426, "loss": 1.6051, "step": 73800 }, { "epoch": 0.6118765317612771, "grad_norm": 0.3009546101093292, "learning_rate": 0.00015922314802249468, "loss": 1.5846, "step": 73900 }, { "epoch": 0.6127045108299662, "grad_norm": 0.3216160833835602, "learning_rate": 0.00015916794941334505, "loss": 1.6195, "step": 74000 }, { "epoch": 0.6135324898986554, "grad_norm": 0.3125784397125244, "learning_rate": 0.00015911275080419544, "loss": 1.6443, "step": 74100 }, { "epoch": 0.6143604689673445, "grad_norm": 0.2232864946126938, "learning_rate": 0.00015905755219504587, "loss": 1.6312, "step": 74200 }, { "epoch": 0.6151884480360337, "grad_norm": 0.22715523838996887, "learning_rate": 0.00015900235358589623, "loss": 1.5811, "step": 74300 }, { "epoch": 0.6160164271047228, "grad_norm": 0.2426147162914276, "learning_rate": 0.00015894715497674663, "loss": 1.6292, "step": 74400 }, { "epoch": 0.616844406173412, "grad_norm": 0.23924706876277924, "learning_rate": 0.00015889195636759705, "loss": 1.6369, "step": 74500 }, { "epoch": 0.6176723852421011, "grad_norm": 0.2847142219543457, "learning_rate": 0.00015883675775844742, "loss": 1.6098, "step": 74600 }, { "epoch": 0.6185003643107903, "grad_norm": 0.25139033794403076, "learning_rate": 0.0001587815591492978, "loss": 1.6251, "step": 74700 }, { "epoch": 0.6193283433794794, "grad_norm": 0.23232166469097137, "learning_rate": 0.00015872636054014823, "loss": 1.6285, "step": 74800 }, { "epoch": 0.6201563224481685, "grad_norm": 0.2942914366722107, "learning_rate": 0.0001586711619309986, "loss": 1.64, "step": 74900 }, { "epoch": 0.6209843015168577, "grad_norm": 0.2671305239200592, "learning_rate": 0.000158615963321849, "loss": 1.6065, "step": 75000 }, { "epoch": 0.6218122805855468, "grad_norm": 0.25377675890922546, "learning_rate": 0.0001585607647126994, "loss": 1.6549, "step": 75100 }, { "epoch": 0.622640259654236, "grad_norm": 0.20990608632564545, "learning_rate": 0.00015850556610354978, "loss": 1.6042, "step": 75200 }, { "epoch": 0.6234682387229251, "grad_norm": 0.2613056004047394, "learning_rate": 0.00015845036749440017, "loss": 1.6056, "step": 75300 }, { "epoch": 0.6242962177916143, "grad_norm": 0.27469968795776367, "learning_rate": 0.0001583951688852506, "loss": 1.602, "step": 75400 }, { "epoch": 0.6251241968603034, "grad_norm": 0.22713856399059296, "learning_rate": 0.00015833997027610096, "loss": 1.6062, "step": 75500 }, { "epoch": 0.6259521759289925, "grad_norm": 0.23872609436511993, "learning_rate": 0.00015828477166695136, "loss": 1.6026, "step": 75600 }, { "epoch": 0.6267801549976817, "grad_norm": 0.22532032430171967, "learning_rate": 0.00015822957305780178, "loss": 1.5947, "step": 75700 }, { "epoch": 0.6276081340663708, "grad_norm": 0.22259363532066345, "learning_rate": 0.00015817437444865215, "loss": 1.5985, "step": 75800 }, { "epoch": 0.62843611313506, "grad_norm": 0.2619154453277588, "learning_rate": 0.00015811917583950254, "loss": 1.5799, "step": 75900 }, { "epoch": 0.6292640922037491, "grad_norm": 0.2426246553659439, "learning_rate": 0.00015806397723035296, "loss": 1.6203, "step": 76000 }, { "epoch": 0.6300920712724383, "grad_norm": 0.2494238018989563, "learning_rate": 0.00015800877862120333, "loss": 1.5967, "step": 76100 }, { "epoch": 0.6309200503411274, "grad_norm": 0.2652859091758728, "learning_rate": 0.00015795358001205375, "loss": 1.6053, "step": 76200 }, { "epoch": 0.6317480294098166, "grad_norm": 0.2460058480501175, "learning_rate": 0.00015789838140290414, "loss": 1.6345, "step": 76300 }, { "epoch": 0.6325760084785057, "grad_norm": 0.24077683687210083, "learning_rate": 0.0001578431827937545, "loss": 1.6425, "step": 76400 }, { "epoch": 0.6334039875471948, "grad_norm": 0.3125411570072174, "learning_rate": 0.00015778798418460493, "loss": 1.6162, "step": 76500 }, { "epoch": 0.634231966615884, "grad_norm": 0.2719837427139282, "learning_rate": 0.00015773278557545533, "loss": 1.6403, "step": 76600 }, { "epoch": 0.6350599456845731, "grad_norm": 0.2815828323364258, "learning_rate": 0.0001576781389523972, "loss": 1.5833, "step": 76700 }, { "epoch": 0.6358879247532623, "grad_norm": 0.2407360076904297, "learning_rate": 0.0001576229403432476, "loss": 1.6145, "step": 76800 }, { "epoch": 0.6367159038219514, "grad_norm": 0.23104368150234222, "learning_rate": 0.000157567741734098, "loss": 1.5925, "step": 76900 }, { "epoch": 0.6375438828906406, "grad_norm": 0.24608178436756134, "learning_rate": 0.0001575125431249484, "loss": 1.5812, "step": 77000 }, { "epoch": 0.6383718619593297, "grad_norm": 0.2393268495798111, "learning_rate": 0.0001574573445157988, "loss": 1.6301, "step": 77100 }, { "epoch": 0.6391998410280189, "grad_norm": 0.24687877297401428, "learning_rate": 0.00015740214590664918, "loss": 1.6086, "step": 77200 }, { "epoch": 0.640027820096708, "grad_norm": 0.2319117784500122, "learning_rate": 0.00015734694729749958, "loss": 1.6288, "step": 77300 }, { "epoch": 0.6408557991653971, "grad_norm": 0.27230727672576904, "learning_rate": 0.00015729174868834997, "loss": 1.603, "step": 77400 }, { "epoch": 0.6416837782340863, "grad_norm": 0.2549174427986145, "learning_rate": 0.00015723655007920037, "loss": 1.5991, "step": 77500 }, { "epoch": 0.6425117573027754, "grad_norm": 0.2670983374118805, "learning_rate": 0.00015718135147005076, "loss": 1.6369, "step": 77600 }, { "epoch": 0.6433397363714646, "grad_norm": 0.24699991941452026, "learning_rate": 0.00015712615286090116, "loss": 1.6493, "step": 77700 }, { "epoch": 0.6441677154401537, "grad_norm": 0.28687578439712524, "learning_rate": 0.00015707095425175155, "loss": 1.5885, "step": 77800 }, { "epoch": 0.6449956945088429, "grad_norm": 0.22470325231552124, "learning_rate": 0.00015701575564260194, "loss": 1.5958, "step": 77900 }, { "epoch": 0.645823673577532, "grad_norm": 0.46417704224586487, "learning_rate": 0.00015696055703345234, "loss": 1.6098, "step": 78000 }, { "epoch": 0.6466516526462212, "grad_norm": 0.2403065264225006, "learning_rate": 0.00015690535842430273, "loss": 1.5947, "step": 78100 }, { "epoch": 0.6474796317149103, "grad_norm": 0.23706583678722382, "learning_rate": 0.00015685015981515313, "loss": 1.5998, "step": 78200 }, { "epoch": 0.6483076107835994, "grad_norm": 0.28114768862724304, "learning_rate": 0.00015679496120600352, "loss": 1.6447, "step": 78300 }, { "epoch": 0.6491355898522886, "grad_norm": 0.2388133406639099, "learning_rate": 0.00015673976259685392, "loss": 1.6021, "step": 78400 }, { "epoch": 0.6499635689209777, "grad_norm": 0.25675222277641296, "learning_rate": 0.0001566845639877043, "loss": 1.6216, "step": 78500 }, { "epoch": 0.6507915479896669, "grad_norm": 0.28352123498916626, "learning_rate": 0.0001566293653785547, "loss": 1.6039, "step": 78600 }, { "epoch": 0.651619527058356, "grad_norm": 0.24184921383857727, "learning_rate": 0.0001565741667694051, "loss": 1.6249, "step": 78700 }, { "epoch": 0.6524475061270452, "grad_norm": 0.2542860507965088, "learning_rate": 0.0001565189681602555, "loss": 1.6246, "step": 78800 }, { "epoch": 0.6532754851957343, "grad_norm": 0.31147143244743347, "learning_rate": 0.00015646376955110589, "loss": 1.5907, "step": 78900 }, { "epoch": 0.6541034642644233, "grad_norm": 0.26504310965538025, "learning_rate": 0.00015640857094195628, "loss": 1.6349, "step": 79000 }, { "epoch": 0.6549314433331125, "grad_norm": 0.2635844051837921, "learning_rate": 0.00015635337233280667, "loss": 1.644, "step": 79100 }, { "epoch": 0.6557594224018016, "grad_norm": 0.26536962389945984, "learning_rate": 0.00015629817372365707, "loss": 1.685, "step": 79200 }, { "epoch": 0.6565874014704908, "grad_norm": 0.22780868411064148, "learning_rate": 0.00015624352710059896, "loss": 1.6502, "step": 79300 }, { "epoch": 0.6574153805391799, "grad_norm": 0.21863988041877747, "learning_rate": 0.00015618832849144935, "loss": 1.5892, "step": 79400 }, { "epoch": 0.6582433596078691, "grad_norm": 0.2570461928844452, "learning_rate": 0.00015613312988229974, "loss": 1.6128, "step": 79500 }, { "epoch": 0.6590713386765582, "grad_norm": 0.3156609833240509, "learning_rate": 0.00015607793127315014, "loss": 1.6254, "step": 79600 }, { "epoch": 0.6598993177452473, "grad_norm": 0.2669581472873688, "learning_rate": 0.00015602328465009205, "loss": 1.5889, "step": 79700 }, { "epoch": 0.6607272968139365, "grad_norm": 0.25296109914779663, "learning_rate": 0.00015596808604094242, "loss": 1.613, "step": 79800 }, { "epoch": 0.6615552758826256, "grad_norm": 0.2798563838005066, "learning_rate": 0.0001559128874317928, "loss": 1.6267, "step": 79900 }, { "epoch": 0.6623832549513148, "grad_norm": 0.24555878341197968, "learning_rate": 0.00015585768882264323, "loss": 1.6158, "step": 80000 } ], "logging_steps": 100, "max_steps": 362328, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1600, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.7660211147080745e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }