{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9998846198223146, "eval_steps": 500, "global_step": 9750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015384023691396484, "grad_norm": 0.797955334186554, "learning_rate": 4.999996755554811e-05, "loss": 1.901, "step": 5 }, { "epoch": 0.003076804738279297, "grad_norm": 1.7458454370498657, "learning_rate": 4.999987022227664e-05, "loss": 1.7519, "step": 10 }, { "epoch": 0.004615207107418946, "grad_norm": 1.4588245153427124, "learning_rate": 4.999970800043822e-05, "loss": 1.8846, "step": 15 }, { "epoch": 0.006153609476558594, "grad_norm": 1.4333337545394897, "learning_rate": 4.9999480890453916e-05, "loss": 1.7849, "step": 20 }, { "epoch": 0.007692011845698243, "grad_norm": 1.8351398706436157, "learning_rate": 4.99991888929132e-05, "loss": 1.5929, "step": 25 }, { "epoch": 0.009230414214837892, "grad_norm": 1.145352840423584, "learning_rate": 4.999883200857397e-05, "loss": 1.4525, "step": 30 }, { "epoch": 0.010768816583977539, "grad_norm": 2.642392873764038, "learning_rate": 4.999841023836254e-05, "loss": 1.676, "step": 35 }, { "epoch": 0.012307218953117188, "grad_norm": 1.1834251880645752, "learning_rate": 4.999792358337363e-05, "loss": 1.4053, "step": 40 }, { "epoch": 0.013845621322256836, "grad_norm": 1.87557053565979, "learning_rate": 4.999737204487039e-05, "loss": 1.395, "step": 45 }, { "epoch": 0.015384023691396485, "grad_norm": 1.6970587968826294, "learning_rate": 4.999675562428437e-05, "loss": 1.5014, "step": 50 }, { "epoch": 0.016922426060536132, "grad_norm": 1.1248029470443726, "learning_rate": 4.999607432321551e-05, "loss": 1.2834, "step": 55 }, { "epoch": 0.018460828429675783, "grad_norm": 5.078829765319824, "learning_rate": 4.999532814343219e-05, "loss": 1.4268, "step": 60 }, { "epoch": 0.01999923079881543, "grad_norm": 1.0455766916275024, "learning_rate": 4.999451708687114e-05, "loss": 1.33, "step": 65 }, { "epoch": 0.021537633167955077, "grad_norm": 1.837597370147705, "learning_rate": 4.999364115563751e-05, "loss": 1.3419, "step": 70 }, { "epoch": 0.023076035537094728, "grad_norm": 1.0537432432174683, "learning_rate": 4.999270035200483e-05, "loss": 1.3378, "step": 75 }, { "epoch": 0.024614437906234375, "grad_norm": 2.875886917114258, "learning_rate": 4.9991694678415e-05, "loss": 1.263, "step": 80 }, { "epoch": 0.026152840275374026, "grad_norm": 0.9554703831672668, "learning_rate": 4.9990624137478314e-05, "loss": 1.2777, "step": 85 }, { "epoch": 0.027691242644513673, "grad_norm": 1.6160002946853638, "learning_rate": 4.998948873197342e-05, "loss": 1.3095, "step": 90 }, { "epoch": 0.02922964501365332, "grad_norm": 0.982302188873291, "learning_rate": 4.998828846484732e-05, "loss": 1.3788, "step": 95 }, { "epoch": 0.03076804738279297, "grad_norm": 1.1846606731414795, "learning_rate": 4.9987023339215374e-05, "loss": 1.3818, "step": 100 }, { "epoch": 0.03230644975193262, "grad_norm": 1.7230656147003174, "learning_rate": 4.9985693358361296e-05, "loss": 1.2638, "step": 105 }, { "epoch": 0.033844852121072265, "grad_norm": 1.3569287061691284, "learning_rate": 4.998429852573712e-05, "loss": 1.1706, "step": 110 }, { "epoch": 0.035383254490211916, "grad_norm": 1.555022120475769, "learning_rate": 4.998283884496321e-05, "loss": 1.287, "step": 115 }, { "epoch": 0.036921656859351566, "grad_norm": 1.9749863147735596, "learning_rate": 4.998131431982826e-05, "loss": 1.1886, "step": 120 }, { "epoch": 0.03846005922849121, "grad_norm": 0.8346091508865356, "learning_rate": 4.9979724954289244e-05, "loss": 1.2522, "step": 125 }, { "epoch": 0.03999846159763086, "grad_norm": 4.200445175170898, "learning_rate": 4.997807075247146e-05, "loss": 1.2879, "step": 130 }, { "epoch": 0.04153686396677051, "grad_norm": 1.5512927770614624, "learning_rate": 4.9976351718668476e-05, "loss": 1.3098, "step": 135 }, { "epoch": 0.043075266335910155, "grad_norm": 1.1664855480194092, "learning_rate": 4.9974567857342155e-05, "loss": 1.2395, "step": 140 }, { "epoch": 0.044613668705049805, "grad_norm": 1.1823747158050537, "learning_rate": 4.997271917312259e-05, "loss": 1.2825, "step": 145 }, { "epoch": 0.046152071074189456, "grad_norm": 1.0937031507492065, "learning_rate": 4.997080567080817e-05, "loss": 1.1966, "step": 150 }, { "epoch": 0.0476904734433291, "grad_norm": 0.899167001247406, "learning_rate": 4.9968827355365465e-05, "loss": 1.2136, "step": 155 }, { "epoch": 0.04922887581246875, "grad_norm": 1.9292325973510742, "learning_rate": 4.996678423192933e-05, "loss": 1.2735, "step": 160 }, { "epoch": 0.0507672781816084, "grad_norm": 1.0430412292480469, "learning_rate": 4.9964676305802794e-05, "loss": 1.3386, "step": 165 }, { "epoch": 0.05230568055074805, "grad_norm": 1.879791498184204, "learning_rate": 4.99625035824571e-05, "loss": 1.2693, "step": 170 }, { "epoch": 0.053844082919887695, "grad_norm": 1.6582942008972168, "learning_rate": 4.996026606753167e-05, "loss": 1.1652, "step": 175 }, { "epoch": 0.055382485289027346, "grad_norm": 3.5379745960235596, "learning_rate": 4.99579637668341e-05, "loss": 1.3028, "step": 180 }, { "epoch": 0.056920887658166996, "grad_norm": 3.7615771293640137, "learning_rate": 4.9955596686340154e-05, "loss": 1.1635, "step": 185 }, { "epoch": 0.05845929002730664, "grad_norm": 0.928205668926239, "learning_rate": 4.995316483219372e-05, "loss": 1.4032, "step": 190 }, { "epoch": 0.05999769239644629, "grad_norm": 1.0329424142837524, "learning_rate": 4.995066821070679e-05, "loss": 1.2417, "step": 195 }, { "epoch": 0.06153609476558594, "grad_norm": 1.8608773946762085, "learning_rate": 4.994810682835951e-05, "loss": 1.2683, "step": 200 }, { "epoch": 0.06307449713472559, "grad_norm": 2.3579938411712646, "learning_rate": 4.9945480691800075e-05, "loss": 1.252, "step": 205 }, { "epoch": 0.06461289950386524, "grad_norm": 0.7121773958206177, "learning_rate": 4.994278980784478e-05, "loss": 1.2303, "step": 210 }, { "epoch": 0.06615130187300489, "grad_norm": 0.783092200756073, "learning_rate": 4.9940034183477954e-05, "loss": 1.3052, "step": 215 }, { "epoch": 0.06768970424214453, "grad_norm": 0.9442629814147949, "learning_rate": 4.993721382585199e-05, "loss": 1.3012, "step": 220 }, { "epoch": 0.06922810661128419, "grad_norm": 1.0986599922180176, "learning_rate": 4.9934328742287285e-05, "loss": 1.1937, "step": 225 }, { "epoch": 0.07076650898042383, "grad_norm": 1.1720629930496216, "learning_rate": 4.9931378940272214e-05, "loss": 1.2892, "step": 230 }, { "epoch": 0.07230491134956347, "grad_norm": 1.6844689846038818, "learning_rate": 4.992836442746317e-05, "loss": 1.1496, "step": 235 }, { "epoch": 0.07384331371870313, "grad_norm": 1.4238617420196533, "learning_rate": 4.992528521168449e-05, "loss": 1.2769, "step": 240 }, { "epoch": 0.07538171608784278, "grad_norm": 1.1840909719467163, "learning_rate": 4.992214130092845e-05, "loss": 1.2355, "step": 245 }, { "epoch": 0.07692011845698242, "grad_norm": 1.3611423969268799, "learning_rate": 4.9918932703355256e-05, "loss": 1.3105, "step": 250 }, { "epoch": 0.07845852082612208, "grad_norm": 1.2186192274093628, "learning_rate": 4.991565942729298e-05, "loss": 1.3476, "step": 255 }, { "epoch": 0.07999692319526172, "grad_norm": 1.8337148427963257, "learning_rate": 4.991232148123761e-05, "loss": 1.1474, "step": 260 }, { "epoch": 0.08153532556440136, "grad_norm": 1.6369004249572754, "learning_rate": 4.990891887385297e-05, "loss": 1.1932, "step": 265 }, { "epoch": 0.08307372793354102, "grad_norm": 1.483383059501648, "learning_rate": 4.9905451613970725e-05, "loss": 1.2814, "step": 270 }, { "epoch": 0.08461213030268067, "grad_norm": 0.9386169910430908, "learning_rate": 4.990191971059033e-05, "loss": 1.337, "step": 275 }, { "epoch": 0.08615053267182031, "grad_norm": 0.8377572894096375, "learning_rate": 4.989832317287904e-05, "loss": 1.1444, "step": 280 }, { "epoch": 0.08768893504095997, "grad_norm": 2.5681023597717285, "learning_rate": 4.9894662010171874e-05, "loss": 1.3671, "step": 285 }, { "epoch": 0.08922733741009961, "grad_norm": 0.8709166049957275, "learning_rate": 4.98909362319716e-05, "loss": 1.1663, "step": 290 }, { "epoch": 0.09076573977923925, "grad_norm": 0.9449151158332825, "learning_rate": 4.988714584794866e-05, "loss": 1.2205, "step": 295 }, { "epoch": 0.09230414214837891, "grad_norm": 1.1699813604354858, "learning_rate": 4.988329086794122e-05, "loss": 1.3335, "step": 300 }, { "epoch": 0.09384254451751856, "grad_norm": 1.5694646835327148, "learning_rate": 4.98793713019551e-05, "loss": 1.3269, "step": 305 }, { "epoch": 0.0953809468866582, "grad_norm": 1.073695421218872, "learning_rate": 4.9875387160163744e-05, "loss": 1.2983, "step": 310 }, { "epoch": 0.09691934925579786, "grad_norm": 0.9977647066116333, "learning_rate": 4.987133845290822e-05, "loss": 1.2401, "step": 315 }, { "epoch": 0.0984577516249375, "grad_norm": 1.1257829666137695, "learning_rate": 4.986722519069719e-05, "loss": 1.1723, "step": 320 }, { "epoch": 0.09999615399407714, "grad_norm": 2.4303648471832275, "learning_rate": 4.9863047384206835e-05, "loss": 1.2318, "step": 325 }, { "epoch": 0.1015345563632168, "grad_norm": 2.8859667778015137, "learning_rate": 4.9858805044280895e-05, "loss": 1.2859, "step": 330 }, { "epoch": 0.10307295873235645, "grad_norm": 1.5526022911071777, "learning_rate": 4.985449818193061e-05, "loss": 1.26, "step": 335 }, { "epoch": 0.1046113611014961, "grad_norm": 1.387646198272705, "learning_rate": 4.9850126808334665e-05, "loss": 1.1915, "step": 340 }, { "epoch": 0.10614976347063575, "grad_norm": 1.4605071544647217, "learning_rate": 4.984569093483922e-05, "loss": 1.2653, "step": 345 }, { "epoch": 0.10768816583977539, "grad_norm": 0.9224326610565186, "learning_rate": 4.984119057295783e-05, "loss": 1.1685, "step": 350 }, { "epoch": 0.10922656820891505, "grad_norm": 0.7252030372619629, "learning_rate": 4.983662573437143e-05, "loss": 1.1732, "step": 355 }, { "epoch": 0.11076497057805469, "grad_norm": 1.599922776222229, "learning_rate": 4.9831996430928326e-05, "loss": 1.144, "step": 360 }, { "epoch": 0.11230337294719434, "grad_norm": 1.3098397254943848, "learning_rate": 4.9827302674644126e-05, "loss": 1.1434, "step": 365 }, { "epoch": 0.11384177531633399, "grad_norm": 1.1649527549743652, "learning_rate": 4.982254447770175e-05, "loss": 1.2149, "step": 370 }, { "epoch": 0.11538017768547364, "grad_norm": 1.794103741645813, "learning_rate": 4.981772185245135e-05, "loss": 1.1748, "step": 375 }, { "epoch": 0.11691858005461328, "grad_norm": 0.9207557439804077, "learning_rate": 4.981283481141034e-05, "loss": 1.2767, "step": 380 }, { "epoch": 0.11845698242375294, "grad_norm": 1.2299586534500122, "learning_rate": 4.980788336726328e-05, "loss": 1.2229, "step": 385 }, { "epoch": 0.11999538479289258, "grad_norm": 1.6368671655654907, "learning_rate": 4.980286753286195e-05, "loss": 1.1892, "step": 390 }, { "epoch": 0.12153378716203223, "grad_norm": 0.9977037906646729, "learning_rate": 4.9797787321225215e-05, "loss": 1.2199, "step": 395 }, { "epoch": 0.12307218953117188, "grad_norm": 1.6430251598358154, "learning_rate": 4.979264274553905e-05, "loss": 1.167, "step": 400 }, { "epoch": 0.12461059190031153, "grad_norm": 0.9530766606330872, "learning_rate": 4.97874338191565e-05, "loss": 1.1379, "step": 405 }, { "epoch": 0.12614899426945117, "grad_norm": 1.4140937328338623, "learning_rate": 4.978216055559761e-05, "loss": 1.2004, "step": 410 }, { "epoch": 0.12768739663859083, "grad_norm": 2.96806001663208, "learning_rate": 4.9776822968549454e-05, "loss": 1.1749, "step": 415 }, { "epoch": 0.12922579900773049, "grad_norm": 1.0619330406188965, "learning_rate": 4.977142107186602e-05, "loss": 1.1946, "step": 420 }, { "epoch": 0.13076420137687012, "grad_norm": 3.656414031982422, "learning_rate": 4.976595487956823e-05, "loss": 1.1884, "step": 425 }, { "epoch": 0.13230260374600977, "grad_norm": 2.3072898387908936, "learning_rate": 4.976042440584392e-05, "loss": 1.1723, "step": 430 }, { "epoch": 0.13384100611514943, "grad_norm": 1.3326804637908936, "learning_rate": 4.975482966504772e-05, "loss": 1.2658, "step": 435 }, { "epoch": 0.13537940848428906, "grad_norm": 1.7775696516036987, "learning_rate": 4.97491706717011e-05, "loss": 1.2043, "step": 440 }, { "epoch": 0.13691781085342872, "grad_norm": 2.0168044567108154, "learning_rate": 4.97434474404923e-05, "loss": 1.2228, "step": 445 }, { "epoch": 0.13845621322256838, "grad_norm": 1.757367730140686, "learning_rate": 4.973765998627628e-05, "loss": 1.0853, "step": 450 }, { "epoch": 0.139994615591708, "grad_norm": 1.5851964950561523, "learning_rate": 4.9731808324074717e-05, "loss": 1.1649, "step": 455 }, { "epoch": 0.14153301796084766, "grad_norm": 0.7482730746269226, "learning_rate": 4.9725892469075905e-05, "loss": 1.1394, "step": 460 }, { "epoch": 0.14307142032998732, "grad_norm": 1.041838526725769, "learning_rate": 4.9719912436634796e-05, "loss": 1.183, "step": 465 }, { "epoch": 0.14460982269912695, "grad_norm": 1.5557304620742798, "learning_rate": 4.97138682422729e-05, "loss": 1.0956, "step": 470 }, { "epoch": 0.1461482250682666, "grad_norm": 2.4157028198242188, "learning_rate": 4.970775990167826e-05, "loss": 1.2898, "step": 475 }, { "epoch": 0.14768662743740626, "grad_norm": 1.1285796165466309, "learning_rate": 4.9701587430705415e-05, "loss": 1.2317, "step": 480 }, { "epoch": 0.1492250298065459, "grad_norm": 1.0138782262802124, "learning_rate": 4.969535084537536e-05, "loss": 1.2337, "step": 485 }, { "epoch": 0.15076343217568555, "grad_norm": 2.229642629623413, "learning_rate": 4.9689050161875506e-05, "loss": 1.196, "step": 490 }, { "epoch": 0.1523018345448252, "grad_norm": 1.7063117027282715, "learning_rate": 4.9682685396559627e-05, "loss": 1.1881, "step": 495 }, { "epoch": 0.15384023691396484, "grad_norm": 1.2955132722854614, "learning_rate": 4.967625656594782e-05, "loss": 1.1329, "step": 500 }, { "epoch": 0.1553786392831045, "grad_norm": 1.562524437904358, "learning_rate": 4.96697636867265e-05, "loss": 1.1875, "step": 505 }, { "epoch": 0.15691704165224415, "grad_norm": 1.5448559522628784, "learning_rate": 4.966320677574827e-05, "loss": 1.2321, "step": 510 }, { "epoch": 0.15845544402138378, "grad_norm": 1.5567346811294556, "learning_rate": 4.9656585850031987e-05, "loss": 1.3752, "step": 515 }, { "epoch": 0.15999384639052344, "grad_norm": 2.398003339767456, "learning_rate": 4.964990092676263e-05, "loss": 1.308, "step": 520 }, { "epoch": 0.1615322487596631, "grad_norm": 1.0853981971740723, "learning_rate": 4.964315202329127e-05, "loss": 1.1672, "step": 525 }, { "epoch": 0.16307065112880273, "grad_norm": 1.2827069759368896, "learning_rate": 4.963633915713509e-05, "loss": 1.1813, "step": 530 }, { "epoch": 0.1646090534979424, "grad_norm": 1.0297755002975464, "learning_rate": 4.962946234597726e-05, "loss": 1.224, "step": 535 }, { "epoch": 0.16614745586708204, "grad_norm": 1.2967716455459595, "learning_rate": 4.962252160766693e-05, "loss": 1.216, "step": 540 }, { "epoch": 0.16768585823622167, "grad_norm": 1.1088382005691528, "learning_rate": 4.961551696021918e-05, "loss": 1.2665, "step": 545 }, { "epoch": 0.16922426060536133, "grad_norm": 1.604500412940979, "learning_rate": 4.960844842181494e-05, "loss": 1.2242, "step": 550 }, { "epoch": 0.170762662974501, "grad_norm": 0.8669219017028809, "learning_rate": 4.960131601080104e-05, "loss": 1.0902, "step": 555 }, { "epoch": 0.17230106534364062, "grad_norm": 2.331838369369507, "learning_rate": 4.9594119745690014e-05, "loss": 1.2584, "step": 560 }, { "epoch": 0.17383946771278028, "grad_norm": 1.547214388847351, "learning_rate": 4.95868596451602e-05, "loss": 1.2759, "step": 565 }, { "epoch": 0.17537787008191993, "grad_norm": 0.90031498670578, "learning_rate": 4.957953572805558e-05, "loss": 1.1871, "step": 570 }, { "epoch": 0.17691627245105956, "grad_norm": 1.2103489637374878, "learning_rate": 4.957214801338581e-05, "loss": 1.2548, "step": 575 }, { "epoch": 0.17845467482019922, "grad_norm": 1.005483865737915, "learning_rate": 4.956469652032609e-05, "loss": 1.1955, "step": 580 }, { "epoch": 0.17999307718933888, "grad_norm": 0.9381604790687561, "learning_rate": 4.9557181268217227e-05, "loss": 1.1725, "step": 585 }, { "epoch": 0.1815314795584785, "grad_norm": 1.0934962034225464, "learning_rate": 4.9549602276565435e-05, "loss": 1.1608, "step": 590 }, { "epoch": 0.18306988192761817, "grad_norm": 1.2921926975250244, "learning_rate": 4.954195956504245e-05, "loss": 1.2638, "step": 595 }, { "epoch": 0.18460828429675782, "grad_norm": 1.0068799257278442, "learning_rate": 4.953425315348534e-05, "loss": 1.1962, "step": 600 }, { "epoch": 0.18614668666589745, "grad_norm": 1.205253005027771, "learning_rate": 4.9526483061896534e-05, "loss": 1.2266, "step": 605 }, { "epoch": 0.1876850890350371, "grad_norm": 2.4699807167053223, "learning_rate": 4.951864931044374e-05, "loss": 1.2421, "step": 610 }, { "epoch": 0.18922349140417677, "grad_norm": 1.494227409362793, "learning_rate": 4.9510751919459895e-05, "loss": 1.0911, "step": 615 }, { "epoch": 0.1907618937733164, "grad_norm": 1.6315932273864746, "learning_rate": 4.950279090944313e-05, "loss": 1.2937, "step": 620 }, { "epoch": 0.19230029614245606, "grad_norm": 1.1606189012527466, "learning_rate": 4.949476630105669e-05, "loss": 1.2341, "step": 625 }, { "epoch": 0.19383869851159571, "grad_norm": 0.7757031321525574, "learning_rate": 4.94866781151289e-05, "loss": 1.1867, "step": 630 }, { "epoch": 0.19537710088073534, "grad_norm": 1.1521573066711426, "learning_rate": 4.9478526372653096e-05, "loss": 1.2039, "step": 635 }, { "epoch": 0.196915503249875, "grad_norm": 0.8454951643943787, "learning_rate": 4.947031109478758e-05, "loss": 1.1598, "step": 640 }, { "epoch": 0.19845390561901466, "grad_norm": 1.3610525131225586, "learning_rate": 4.9462032302855576e-05, "loss": 1.2351, "step": 645 }, { "epoch": 0.1999923079881543, "grad_norm": 1.006761908531189, "learning_rate": 4.9453690018345144e-05, "loss": 1.3258, "step": 650 }, { "epoch": 0.20153071035729395, "grad_norm": 1.7177060842514038, "learning_rate": 4.9445284262909156e-05, "loss": 1.2972, "step": 655 }, { "epoch": 0.2030691127264336, "grad_norm": 1.063753604888916, "learning_rate": 4.943681505836523e-05, "loss": 1.1798, "step": 660 }, { "epoch": 0.20460751509557323, "grad_norm": 1.1765042543411255, "learning_rate": 4.9428282426695646e-05, "loss": 1.2289, "step": 665 }, { "epoch": 0.2061459174647129, "grad_norm": 1.552025556564331, "learning_rate": 4.9419686390047334e-05, "loss": 1.2983, "step": 670 }, { "epoch": 0.20768431983385255, "grad_norm": 2.3030500411987305, "learning_rate": 4.9411026970731805e-05, "loss": 1.1431, "step": 675 }, { "epoch": 0.2092227222029922, "grad_norm": 1.2778570652008057, "learning_rate": 4.9402304191225044e-05, "loss": 1.1139, "step": 680 }, { "epoch": 0.21076112457213184, "grad_norm": 1.5465021133422852, "learning_rate": 4.9393518074167536e-05, "loss": 1.1983, "step": 685 }, { "epoch": 0.2122995269412715, "grad_norm": 1.079221248626709, "learning_rate": 4.9384668642364126e-05, "loss": 1.1707, "step": 690 }, { "epoch": 0.21383792931041115, "grad_norm": 0.9178045988082886, "learning_rate": 4.937575591878403e-05, "loss": 1.2633, "step": 695 }, { "epoch": 0.21537633167955078, "grad_norm": 1.1767947673797607, "learning_rate": 4.93667799265607e-05, "loss": 1.1424, "step": 700 }, { "epoch": 0.21691473404869044, "grad_norm": 1.7399604320526123, "learning_rate": 4.935774068899184e-05, "loss": 1.2006, "step": 705 }, { "epoch": 0.2184531364178301, "grad_norm": 1.282569169998169, "learning_rate": 4.934863822953929e-05, "loss": 1.31, "step": 710 }, { "epoch": 0.21999153878696973, "grad_norm": 1.0059244632720947, "learning_rate": 4.933947257182901e-05, "loss": 1.2209, "step": 715 }, { "epoch": 0.22152994115610938, "grad_norm": 1.1119279861450195, "learning_rate": 4.9330243739650964e-05, "loss": 1.2628, "step": 720 }, { "epoch": 0.22306834352524904, "grad_norm": 1.2987867593765259, "learning_rate": 4.932095175695911e-05, "loss": 1.2771, "step": 725 }, { "epoch": 0.22460674589438867, "grad_norm": 2.659963846206665, "learning_rate": 4.9311596647871317e-05, "loss": 1.2315, "step": 730 }, { "epoch": 0.22614514826352833, "grad_norm": 1.846644639968872, "learning_rate": 4.9302178436669286e-05, "loss": 1.1513, "step": 735 }, { "epoch": 0.22768355063266799, "grad_norm": 1.6089102029800415, "learning_rate": 4.929269714779852e-05, "loss": 1.2085, "step": 740 }, { "epoch": 0.22922195300180762, "grad_norm": 1.953465223312378, "learning_rate": 4.9283152805868235e-05, "loss": 1.2173, "step": 745 }, { "epoch": 0.23076035537094727, "grad_norm": 1.127911925315857, "learning_rate": 4.92735454356513e-05, "loss": 1.2868, "step": 750 }, { "epoch": 0.23229875774008693, "grad_norm": 1.9900950193405151, "learning_rate": 4.9263875062084194e-05, "loss": 1.2541, "step": 755 }, { "epoch": 0.23383716010922656, "grad_norm": 1.0306469202041626, "learning_rate": 4.925414171026691e-05, "loss": 1.2301, "step": 760 }, { "epoch": 0.23537556247836622, "grad_norm": 2.1911792755126953, "learning_rate": 4.9244345405462903e-05, "loss": 1.2235, "step": 765 }, { "epoch": 0.23691396484750588, "grad_norm": 1.6600091457366943, "learning_rate": 4.923448617309905e-05, "loss": 1.2173, "step": 770 }, { "epoch": 0.2384523672166455, "grad_norm": 1.517933964729309, "learning_rate": 4.922456403876552e-05, "loss": 1.2161, "step": 775 }, { "epoch": 0.23999076958578516, "grad_norm": 2.4637537002563477, "learning_rate": 4.9214579028215776e-05, "loss": 1.1367, "step": 780 }, { "epoch": 0.24152917195492482, "grad_norm": 1.1146929264068604, "learning_rate": 4.9204531167366485e-05, "loss": 1.2063, "step": 785 }, { "epoch": 0.24306757432406445, "grad_norm": 1.044700026512146, "learning_rate": 4.919442048229743e-05, "loss": 1.2129, "step": 790 }, { "epoch": 0.2446059766932041, "grad_norm": 1.2291778326034546, "learning_rate": 4.918424699925145e-05, "loss": 1.1697, "step": 795 }, { "epoch": 0.24614437906234377, "grad_norm": 2.0124595165252686, "learning_rate": 4.917401074463441e-05, "loss": 1.1384, "step": 800 }, { "epoch": 0.2476827814314834, "grad_norm": 1.1294913291931152, "learning_rate": 4.916371174501507e-05, "loss": 1.1691, "step": 805 }, { "epoch": 0.24922118380062305, "grad_norm": 2.2073304653167725, "learning_rate": 4.9153350027125064e-05, "loss": 1.1847, "step": 810 }, { "epoch": 0.2507595861697627, "grad_norm": 0.9351872205734253, "learning_rate": 4.9142925617858814e-05, "loss": 1.1932, "step": 815 }, { "epoch": 0.25229798853890234, "grad_norm": 1.3290927410125732, "learning_rate": 4.913243854427346e-05, "loss": 1.2509, "step": 820 }, { "epoch": 0.253836390908042, "grad_norm": 1.7626986503601074, "learning_rate": 4.9121888833588795e-05, "loss": 1.2265, "step": 825 }, { "epoch": 0.25537479327718166, "grad_norm": 1.9079293012619019, "learning_rate": 4.911127651318717e-05, "loss": 1.2083, "step": 830 }, { "epoch": 0.2569131956463213, "grad_norm": 1.2099313735961914, "learning_rate": 4.910060161061347e-05, "loss": 1.0772, "step": 835 }, { "epoch": 0.25845159801546097, "grad_norm": 2.2780282497406006, "learning_rate": 4.9089864153575016e-05, "loss": 1.1247, "step": 840 }, { "epoch": 0.2599900003846006, "grad_norm": 1.4330651760101318, "learning_rate": 4.907906416994146e-05, "loss": 1.2317, "step": 845 }, { "epoch": 0.26152840275374023, "grad_norm": 1.3475781679153442, "learning_rate": 4.906820168774477e-05, "loss": 1.2299, "step": 850 }, { "epoch": 0.2630668051228799, "grad_norm": 1.4789565801620483, "learning_rate": 4.905727673517914e-05, "loss": 1.1717, "step": 855 }, { "epoch": 0.26460520749201955, "grad_norm": 1.7257953882217407, "learning_rate": 4.904628934060088e-05, "loss": 1.1482, "step": 860 }, { "epoch": 0.2661436098611592, "grad_norm": 1.5117738246917725, "learning_rate": 4.903523953252841e-05, "loss": 1.1459, "step": 865 }, { "epoch": 0.26768201223029886, "grad_norm": 1.9107036590576172, "learning_rate": 4.902412733964211e-05, "loss": 1.1814, "step": 870 }, { "epoch": 0.26922041459943846, "grad_norm": 1.9499353170394897, "learning_rate": 4.901295279078431e-05, "loss": 1.1091, "step": 875 }, { "epoch": 0.2707588169685781, "grad_norm": 0.968377947807312, "learning_rate": 4.900171591495918e-05, "loss": 1.1259, "step": 880 }, { "epoch": 0.2722972193377178, "grad_norm": 1.4124119281768799, "learning_rate": 4.899041674133265e-05, "loss": 1.227, "step": 885 }, { "epoch": 0.27383562170685743, "grad_norm": 2.3951656818389893, "learning_rate": 4.8979055299232376e-05, "loss": 1.2486, "step": 890 }, { "epoch": 0.2753740240759971, "grad_norm": 1.0433672666549683, "learning_rate": 4.896763161814761e-05, "loss": 1.2767, "step": 895 }, { "epoch": 0.27691242644513675, "grad_norm": 3.254819393157959, "learning_rate": 4.8956145727729156e-05, "loss": 1.1728, "step": 900 }, { "epoch": 0.27845082881427635, "grad_norm": 1.5969161987304688, "learning_rate": 4.894459765778929e-05, "loss": 1.0947, "step": 905 }, { "epoch": 0.279989231183416, "grad_norm": 1.0399632453918457, "learning_rate": 4.893298743830168e-05, "loss": 1.1474, "step": 910 }, { "epoch": 0.28152763355255567, "grad_norm": 0.8495728373527527, "learning_rate": 4.89213150994013e-05, "loss": 1.1966, "step": 915 }, { "epoch": 0.2830660359216953, "grad_norm": 1.0868245363235474, "learning_rate": 4.890958067138436e-05, "loss": 1.194, "step": 920 }, { "epoch": 0.284604438290835, "grad_norm": 1.8804415464401245, "learning_rate": 4.889778418470823e-05, "loss": 1.1109, "step": 925 }, { "epoch": 0.28614284065997464, "grad_norm": 2.711575508117676, "learning_rate": 4.8885925669991346e-05, "loss": 1.1817, "step": 930 }, { "epoch": 0.28768124302911424, "grad_norm": 1.0217710733413696, "learning_rate": 4.887400515801315e-05, "loss": 1.1885, "step": 935 }, { "epoch": 0.2892196453982539, "grad_norm": 2.7488129138946533, "learning_rate": 4.886202267971401e-05, "loss": 1.1094, "step": 940 }, { "epoch": 0.29075804776739356, "grad_norm": 1.5893644094467163, "learning_rate": 4.8849978266195114e-05, "loss": 1.1363, "step": 945 }, { "epoch": 0.2922964501365332, "grad_norm": 1.7660548686981201, "learning_rate": 4.883787194871841e-05, "loss": 1.1993, "step": 950 }, { "epoch": 0.29383485250567287, "grad_norm": 1.6953208446502686, "learning_rate": 4.882570375870653e-05, "loss": 1.2974, "step": 955 }, { "epoch": 0.29537325487481253, "grad_norm": 1.191604733467102, "learning_rate": 4.88134737277427e-05, "loss": 1.0904, "step": 960 }, { "epoch": 0.29691165724395213, "grad_norm": 1.317108392715454, "learning_rate": 4.880118188757064e-05, "loss": 1.1945, "step": 965 }, { "epoch": 0.2984500596130918, "grad_norm": 1.4262393712997437, "learning_rate": 4.878882827009452e-05, "loss": 1.1332, "step": 970 }, { "epoch": 0.29998846198223145, "grad_norm": 1.2089824676513672, "learning_rate": 4.877641290737884e-05, "loss": 1.1619, "step": 975 }, { "epoch": 0.3015268643513711, "grad_norm": 1.008278727531433, "learning_rate": 4.8763935831648374e-05, "loss": 1.2385, "step": 980 }, { "epoch": 0.30306526672051076, "grad_norm": 1.3324832916259766, "learning_rate": 4.8751397075288084e-05, "loss": 1.1853, "step": 985 }, { "epoch": 0.3046036690896504, "grad_norm": 1.293948769569397, "learning_rate": 4.8738796670843004e-05, "loss": 1.17, "step": 990 }, { "epoch": 0.30614207145879, "grad_norm": 1.5684713125228882, "learning_rate": 4.8726134651018194e-05, "loss": 1.1836, "step": 995 }, { "epoch": 0.3076804738279297, "grad_norm": 1.8155791759490967, "learning_rate": 4.8713411048678635e-05, "loss": 1.0884, "step": 1000 }, { "epoch": 0.30921887619706934, "grad_norm": 1.2308273315429688, "learning_rate": 4.870062589684916e-05, "loss": 1.2415, "step": 1005 }, { "epoch": 0.310757278566209, "grad_norm": 1.091968297958374, "learning_rate": 4.868777922871434e-05, "loss": 1.1733, "step": 1010 }, { "epoch": 0.31229568093534865, "grad_norm": 1.1831902265548706, "learning_rate": 4.8674871077618424e-05, "loss": 1.2495, "step": 1015 }, { "epoch": 0.3138340833044883, "grad_norm": 1.0130064487457275, "learning_rate": 4.8661901477065244e-05, "loss": 1.2221, "step": 1020 }, { "epoch": 0.3153724856736279, "grad_norm": 1.518776535987854, "learning_rate": 4.864887046071813e-05, "loss": 1.2491, "step": 1025 }, { "epoch": 0.31691088804276757, "grad_norm": 2.234370708465576, "learning_rate": 4.863577806239982e-05, "loss": 1.2308, "step": 1030 }, { "epoch": 0.3184492904119072, "grad_norm": 1.2248209714889526, "learning_rate": 4.862262431609235e-05, "loss": 1.1884, "step": 1035 }, { "epoch": 0.3199876927810469, "grad_norm": 1.0601451396942139, "learning_rate": 4.860940925593703e-05, "loss": 1.0993, "step": 1040 }, { "epoch": 0.32152609515018654, "grad_norm": 1.8158398866653442, "learning_rate": 4.859613291623428e-05, "loss": 1.267, "step": 1045 }, { "epoch": 0.3230644975193262, "grad_norm": 1.0854904651641846, "learning_rate": 4.858279533144358e-05, "loss": 1.185, "step": 1050 }, { "epoch": 0.3246028998884658, "grad_norm": 1.392730474472046, "learning_rate": 4.856939653618339e-05, "loss": 1.1568, "step": 1055 }, { "epoch": 0.32614130225760546, "grad_norm": 1.2848827838897705, "learning_rate": 4.855593656523103e-05, "loss": 0.9792, "step": 1060 }, { "epoch": 0.3276797046267451, "grad_norm": 1.0641793012619019, "learning_rate": 4.8542415453522615e-05, "loss": 1.0783, "step": 1065 }, { "epoch": 0.3292181069958848, "grad_norm": 1.3434879779815674, "learning_rate": 4.852883323615295e-05, "loss": 1.1971, "step": 1070 }, { "epoch": 0.33075650936502443, "grad_norm": 1.8678319454193115, "learning_rate": 4.8515189948375434e-05, "loss": 1.1928, "step": 1075 }, { "epoch": 0.3322949117341641, "grad_norm": 0.9058634638786316, "learning_rate": 4.8501485625601996e-05, "loss": 1.2081, "step": 1080 }, { "epoch": 0.33383331410330375, "grad_norm": 1.7286523580551147, "learning_rate": 4.848772030340297e-05, "loss": 1.1119, "step": 1085 }, { "epoch": 0.33537171647244335, "grad_norm": 0.9546146392822266, "learning_rate": 4.847389401750701e-05, "loss": 1.2201, "step": 1090 }, { "epoch": 0.336910118841583, "grad_norm": 1.6596899032592773, "learning_rate": 4.846000680380105e-05, "loss": 1.2032, "step": 1095 }, { "epoch": 0.33844852121072266, "grad_norm": 1.3017821311950684, "learning_rate": 4.8446058698330115e-05, "loss": 1.0737, "step": 1100 }, { "epoch": 0.3399869235798623, "grad_norm": 1.556097149848938, "learning_rate": 4.843204973729729e-05, "loss": 1.2315, "step": 1105 }, { "epoch": 0.341525325949002, "grad_norm": 1.3806447982788086, "learning_rate": 4.8417979957063624e-05, "loss": 1.1993, "step": 1110 }, { "epoch": 0.34306372831814164, "grad_norm": 1.5958844423294067, "learning_rate": 4.8403849394148025e-05, "loss": 1.1318, "step": 1115 }, { "epoch": 0.34460213068728124, "grad_norm": 1.1803624629974365, "learning_rate": 4.838965808522716e-05, "loss": 1.1404, "step": 1120 }, { "epoch": 0.3461405330564209, "grad_norm": 1.1995015144348145, "learning_rate": 4.837540606713538e-05, "loss": 1.0803, "step": 1125 }, { "epoch": 0.34767893542556055, "grad_norm": 1.4723166227340698, "learning_rate": 4.836109337686457e-05, "loss": 1.2123, "step": 1130 }, { "epoch": 0.3492173377947002, "grad_norm": 1.0283982753753662, "learning_rate": 4.8346720051564144e-05, "loss": 1.1247, "step": 1135 }, { "epoch": 0.35075574016383987, "grad_norm": 1.0858715772628784, "learning_rate": 4.833228612854087e-05, "loss": 1.1716, "step": 1140 }, { "epoch": 0.3522941425329795, "grad_norm": 0.8430686593055725, "learning_rate": 4.831779164525881e-05, "loss": 1.3617, "step": 1145 }, { "epoch": 0.35383254490211913, "grad_norm": 3.4253408908843994, "learning_rate": 4.83032366393392e-05, "loss": 1.1845, "step": 1150 }, { "epoch": 0.3553709472712588, "grad_norm": 1.2169939279556274, "learning_rate": 4.828862114856038e-05, "loss": 1.1945, "step": 1155 }, { "epoch": 0.35690934964039844, "grad_norm": 0.8925756216049194, "learning_rate": 4.827394521085767e-05, "loss": 1.2021, "step": 1160 }, { "epoch": 0.3584477520095381, "grad_norm": 1.1597083806991577, "learning_rate": 4.8259208864323304e-05, "loss": 1.1994, "step": 1165 }, { "epoch": 0.35998615437867776, "grad_norm": 1.1406500339508057, "learning_rate": 4.8244412147206284e-05, "loss": 1.1749, "step": 1170 }, { "epoch": 0.3615245567478174, "grad_norm": 0.8532546162605286, "learning_rate": 4.822955509791233e-05, "loss": 1.0968, "step": 1175 }, { "epoch": 0.363062959116957, "grad_norm": 1.6647778749465942, "learning_rate": 4.8214637755003745e-05, "loss": 1.1396, "step": 1180 }, { "epoch": 0.3646013614860967, "grad_norm": 4.057651042938232, "learning_rate": 4.819966015719933e-05, "loss": 1.2997, "step": 1185 }, { "epoch": 0.36613976385523633, "grad_norm": 0.9485501050949097, "learning_rate": 4.8184622343374275e-05, "loss": 1.2487, "step": 1190 }, { "epoch": 0.367678166224376, "grad_norm": 1.2841589450836182, "learning_rate": 4.8169524352560076e-05, "loss": 1.1749, "step": 1195 }, { "epoch": 0.36921656859351565, "grad_norm": 1.6274491548538208, "learning_rate": 4.815436622394441e-05, "loss": 1.2342, "step": 1200 }, { "epoch": 0.3707549709626553, "grad_norm": 1.1453361511230469, "learning_rate": 4.813914799687107e-05, "loss": 1.1538, "step": 1205 }, { "epoch": 0.3722933733317949, "grad_norm": 1.7069777250289917, "learning_rate": 4.812386971083979e-05, "loss": 1.0581, "step": 1210 }, { "epoch": 0.37383177570093457, "grad_norm": 1.478285789489746, "learning_rate": 4.810853140550624e-05, "loss": 1.1604, "step": 1215 }, { "epoch": 0.3753701780700742, "grad_norm": 2.3933002948760986, "learning_rate": 4.809313312068185e-05, "loss": 1.2667, "step": 1220 }, { "epoch": 0.3769085804392139, "grad_norm": 1.7244596481323242, "learning_rate": 4.8077674896333725e-05, "loss": 1.2328, "step": 1225 }, { "epoch": 0.37844698280835354, "grad_norm": 0.8237632513046265, "learning_rate": 4.806215677258456e-05, "loss": 1.1748, "step": 1230 }, { "epoch": 0.3799853851774932, "grad_norm": 1.9397246837615967, "learning_rate": 4.8046578789712515e-05, "loss": 1.1827, "step": 1235 }, { "epoch": 0.3815237875466328, "grad_norm": 1.060905933380127, "learning_rate": 4.803094098815112e-05, "loss": 1.1902, "step": 1240 }, { "epoch": 0.38306218991577246, "grad_norm": 1.2361557483673096, "learning_rate": 4.801524340848917e-05, "loss": 1.1679, "step": 1245 }, { "epoch": 0.3846005922849121, "grad_norm": 1.1130070686340332, "learning_rate": 4.799948609147061e-05, "loss": 1.2108, "step": 1250 }, { "epoch": 0.38613899465405177, "grad_norm": 1.0235941410064697, "learning_rate": 4.798366907799444e-05, "loss": 1.2093, "step": 1255 }, { "epoch": 0.38767739702319143, "grad_norm": 1.1517040729522705, "learning_rate": 4.7967792409114606e-05, "loss": 1.1166, "step": 1260 }, { "epoch": 0.3892157993923311, "grad_norm": 1.4021333456039429, "learning_rate": 4.795185612603991e-05, "loss": 1.1535, "step": 1265 }, { "epoch": 0.3907542017614707, "grad_norm": 1.059432864189148, "learning_rate": 4.7935860270133844e-05, "loss": 1.266, "step": 1270 }, { "epoch": 0.39229260413061035, "grad_norm": 1.1297060251235962, "learning_rate": 4.791980488291456e-05, "loss": 1.2552, "step": 1275 }, { "epoch": 0.39383100649975, "grad_norm": 2.4244351387023926, "learning_rate": 4.7903690006054726e-05, "loss": 1.2457, "step": 1280 }, { "epoch": 0.39536940886888966, "grad_norm": 1.4860352277755737, "learning_rate": 4.7887515681381403e-05, "loss": 1.0594, "step": 1285 }, { "epoch": 0.3969078112380293, "grad_norm": 1.5482337474822998, "learning_rate": 4.787128195087596e-05, "loss": 1.1454, "step": 1290 }, { "epoch": 0.398446213607169, "grad_norm": 1.3812161684036255, "learning_rate": 4.785498885667395e-05, "loss": 1.212, "step": 1295 }, { "epoch": 0.3999846159763086, "grad_norm": 1.555119276046753, "learning_rate": 4.783863644106502e-05, "loss": 1.2743, "step": 1300 }, { "epoch": 0.40152301834544823, "grad_norm": 0.8972929120063782, "learning_rate": 4.782222474649279e-05, "loss": 1.1012, "step": 1305 }, { "epoch": 0.4030614207145879, "grad_norm": 1.5616463422775269, "learning_rate": 4.780575381555472e-05, "loss": 1.0773, "step": 1310 }, { "epoch": 0.40459982308372755, "grad_norm": 1.2159663438796997, "learning_rate": 4.778922369100204e-05, "loss": 1.3538, "step": 1315 }, { "epoch": 0.4061382254528672, "grad_norm": 1.9486379623413086, "learning_rate": 4.7772634415739624e-05, "loss": 1.1075, "step": 1320 }, { "epoch": 0.40767662782200687, "grad_norm": 0.9763344526290894, "learning_rate": 4.7755986032825864e-05, "loss": 1.2541, "step": 1325 }, { "epoch": 0.40921503019114647, "grad_norm": 1.8571903705596924, "learning_rate": 4.7739278585472573e-05, "loss": 1.1718, "step": 1330 }, { "epoch": 0.4107534325602861, "grad_norm": 1.4408457279205322, "learning_rate": 4.7722512117044865e-05, "loss": 1.3156, "step": 1335 }, { "epoch": 0.4122918349294258, "grad_norm": 0.9971082806587219, "learning_rate": 4.7705686671061054e-05, "loss": 1.2319, "step": 1340 }, { "epoch": 0.41383023729856544, "grad_norm": 1.3525968790054321, "learning_rate": 4.768880229119253e-05, "loss": 1.1831, "step": 1345 }, { "epoch": 0.4153686396677051, "grad_norm": 1.2504761219024658, "learning_rate": 4.767185902126364e-05, "loss": 1.1658, "step": 1350 }, { "epoch": 0.41690704203684475, "grad_norm": 1.413370132446289, "learning_rate": 4.76548569052516e-05, "loss": 1.1391, "step": 1355 }, { "epoch": 0.4184454444059844, "grad_norm": 1.1849104166030884, "learning_rate": 4.763779598728636e-05, "loss": 1.1059, "step": 1360 }, { "epoch": 0.419983846775124, "grad_norm": 1.1771825551986694, "learning_rate": 4.762067631165049e-05, "loss": 1.1448, "step": 1365 }, { "epoch": 0.42152224914426367, "grad_norm": 1.4707238674163818, "learning_rate": 4.760349792277906e-05, "loss": 1.1634, "step": 1370 }, { "epoch": 0.42306065151340333, "grad_norm": 1.577568769454956, "learning_rate": 4.758626086525956e-05, "loss": 1.1911, "step": 1375 }, { "epoch": 0.424599053882543, "grad_norm": NaN, "learning_rate": 4.757242900787734e-05, "loss": 1.1134, "step": 1380 }, { "epoch": 0.42613745625168264, "grad_norm": 1.0331732034683228, "learning_rate": 4.755508645963771e-05, "loss": 1.2352, "step": 1385 }, { "epoch": 0.4276758586208223, "grad_norm": 1.8902751207351685, "learning_rate": 4.7537685368404684e-05, "loss": 1.1271, "step": 1390 }, { "epoch": 0.4292142609899619, "grad_norm": 1.1057363748550415, "learning_rate": 4.7520225779343754e-05, "loss": 1.1633, "step": 1395 }, { "epoch": 0.43075266335910156, "grad_norm": 1.183310627937317, "learning_rate": 4.7502707737772264e-05, "loss": 1.0869, "step": 1400 }, { "epoch": 0.4322910657282412, "grad_norm": 1.2959816455841064, "learning_rate": 4.7485131289159276e-05, "loss": 1.3126, "step": 1405 }, { "epoch": 0.4338294680973809, "grad_norm": 1.5542187690734863, "learning_rate": 4.746749647912546e-05, "loss": 1.2092, "step": 1410 }, { "epoch": 0.43536787046652053, "grad_norm": 0.8856958746910095, "learning_rate": 4.7449803353442945e-05, "loss": 1.2647, "step": 1415 }, { "epoch": 0.4369062728356602, "grad_norm": 1.3472665548324585, "learning_rate": 4.743205195803523e-05, "loss": 1.2187, "step": 1420 }, { "epoch": 0.4384446752047998, "grad_norm": 1.7011195421218872, "learning_rate": 4.741424233897707e-05, "loss": 1.2245, "step": 1425 }, { "epoch": 0.43998307757393945, "grad_norm": 2.0465426445007324, "learning_rate": 4.7396374542494314e-05, "loss": 1.2666, "step": 1430 }, { "epoch": 0.4415214799430791, "grad_norm": 1.2540628910064697, "learning_rate": 4.737844861496385e-05, "loss": 1.0815, "step": 1435 }, { "epoch": 0.44305988231221877, "grad_norm": 1.3363851308822632, "learning_rate": 4.736046460291342e-05, "loss": 1.2059, "step": 1440 }, { "epoch": 0.4445982846813584, "grad_norm": 1.0009596347808838, "learning_rate": 4.734242255302154e-05, "loss": 1.3611, "step": 1445 }, { "epoch": 0.4461366870504981, "grad_norm": 0.9863151907920837, "learning_rate": 4.732432251211735e-05, "loss": 1.1654, "step": 1450 }, { "epoch": 0.4476750894196377, "grad_norm": 0.8969554901123047, "learning_rate": 4.7306164527180546e-05, "loss": 1.1962, "step": 1455 }, { "epoch": 0.44921349178877734, "grad_norm": 1.1396467685699463, "learning_rate": 4.7287948645341185e-05, "loss": 1.2541, "step": 1460 }, { "epoch": 0.450751894157917, "grad_norm": 1.4073244333267212, "learning_rate": 4.72696749138796e-05, "loss": 1.1695, "step": 1465 }, { "epoch": 0.45229029652705666, "grad_norm": 2.4806954860687256, "learning_rate": 4.725134338022631e-05, "loss": 1.2109, "step": 1470 }, { "epoch": 0.4538286988961963, "grad_norm": 1.7406140565872192, "learning_rate": 4.723295409196183e-05, "loss": 1.1345, "step": 1475 }, { "epoch": 0.45536710126533597, "grad_norm": 0.8337318897247314, "learning_rate": 4.721450709681658e-05, "loss": 1.1668, "step": 1480 }, { "epoch": 0.4569055036344756, "grad_norm": 2.3858370780944824, "learning_rate": 4.7196002442670794e-05, "loss": 1.2392, "step": 1485 }, { "epoch": 0.45844390600361523, "grad_norm": 1.21785306930542, "learning_rate": 4.7177440177554324e-05, "loss": 1.2615, "step": 1490 }, { "epoch": 0.4599823083727549, "grad_norm": 1.8882992267608643, "learning_rate": 4.715882034964657e-05, "loss": 1.3384, "step": 1495 }, { "epoch": 0.46152071074189455, "grad_norm": 1.1433368921279907, "learning_rate": 4.714014300727634e-05, "loss": 1.2788, "step": 1500 }, { "epoch": 0.4630591131110342, "grad_norm": 1.1393554210662842, "learning_rate": 4.7121408198921745e-05, "loss": 1.1502, "step": 1505 }, { "epoch": 0.46459751548017386, "grad_norm": 1.1349945068359375, "learning_rate": 4.7102615973210004e-05, "loss": 1.1128, "step": 1510 }, { "epoch": 0.46613591784931346, "grad_norm": 1.2287825345993042, "learning_rate": 4.708376637891742e-05, "loss": 1.1704, "step": 1515 }, { "epoch": 0.4676743202184531, "grad_norm": 1.257469892501831, "learning_rate": 4.706485946496916e-05, "loss": 1.1092, "step": 1520 }, { "epoch": 0.4692127225875928, "grad_norm": 1.4764573574066162, "learning_rate": 4.704589528043918e-05, "loss": 1.1372, "step": 1525 }, { "epoch": 0.47075112495673244, "grad_norm": 1.0339967012405396, "learning_rate": 4.702687387455008e-05, "loss": 1.0919, "step": 1530 }, { "epoch": 0.4722895273258721, "grad_norm": 1.5123881101608276, "learning_rate": 4.7007795296673006e-05, "loss": 1.2167, "step": 1535 }, { "epoch": 0.47382792969501175, "grad_norm": 1.475289225578308, "learning_rate": 4.6988659596327465e-05, "loss": 1.2096, "step": 1540 }, { "epoch": 0.47536633206415135, "grad_norm": 1.229851245880127, "learning_rate": 4.696946682318124e-05, "loss": 1.1183, "step": 1545 }, { "epoch": 0.476904734433291, "grad_norm": 0.7345393896102905, "learning_rate": 4.695021702705026e-05, "loss": 1.2528, "step": 1550 }, { "epoch": 0.47844313680243067, "grad_norm": 1.239741563796997, "learning_rate": 4.693091025789845e-05, "loss": 1.1355, "step": 1555 }, { "epoch": 0.4799815391715703, "grad_norm": 1.3871427774429321, "learning_rate": 4.6911546565837605e-05, "loss": 1.1942, "step": 1560 }, { "epoch": 0.48151994154071, "grad_norm": 1.9207137823104858, "learning_rate": 4.689212600112728e-05, "loss": 1.1756, "step": 1565 }, { "epoch": 0.48305834390984964, "grad_norm": 1.4990715980529785, "learning_rate": 4.687264861417464e-05, "loss": 1.1363, "step": 1570 }, { "epoch": 0.48459674627898924, "grad_norm": 1.9793407917022705, "learning_rate": 4.6853114455534345e-05, "loss": 1.1298, "step": 1575 }, { "epoch": 0.4861351486481289, "grad_norm": 1.1142261028289795, "learning_rate": 4.683352357590839e-05, "loss": 1.0793, "step": 1580 }, { "epoch": 0.48767355101726856, "grad_norm": 1.4216675758361816, "learning_rate": 4.6813876026146007e-05, "loss": 1.1616, "step": 1585 }, { "epoch": 0.4892119533864082, "grad_norm": 1.3200958967208862, "learning_rate": 4.679417185724352e-05, "loss": 1.2206, "step": 1590 }, { "epoch": 0.4907503557555479, "grad_norm": 2.0705955028533936, "learning_rate": 4.67744111203442e-05, "loss": 1.1871, "step": 1595 }, { "epoch": 0.49228875812468753, "grad_norm": 1.3912644386291504, "learning_rate": 4.675459386673815e-05, "loss": 1.2779, "step": 1600 }, { "epoch": 0.49382716049382713, "grad_norm": 1.1627599000930786, "learning_rate": 4.673472014786216e-05, "loss": 1.0949, "step": 1605 }, { "epoch": 0.4953655628629668, "grad_norm": 1.0313643217086792, "learning_rate": 4.671479001529958e-05, "loss": 1.2541, "step": 1610 }, { "epoch": 0.49690396523210645, "grad_norm": 0.8410005569458008, "learning_rate": 4.6694803520780204e-05, "loss": 1.1915, "step": 1615 }, { "epoch": 0.4984423676012461, "grad_norm": 1.9390909671783447, "learning_rate": 4.66747607161801e-05, "loss": 1.1954, "step": 1620 }, { "epoch": 0.49998076997038576, "grad_norm": 1.324788212776184, "learning_rate": 4.665466165352147e-05, "loss": 1.163, "step": 1625 }, { "epoch": 0.5015191723395254, "grad_norm": 1.9445561170578003, "learning_rate": 4.663450638497259e-05, "loss": 1.2891, "step": 1630 }, { "epoch": 0.5030575747086651, "grad_norm": 1.2497957944869995, "learning_rate": 4.661429496284757e-05, "loss": 1.2295, "step": 1635 }, { "epoch": 0.5045959770778047, "grad_norm": 1.6469943523406982, "learning_rate": 4.65940274396063e-05, "loss": 1.0742, "step": 1640 }, { "epoch": 0.5061343794469444, "grad_norm": 1.3002201318740845, "learning_rate": 4.657370386785427e-05, "loss": 1.1575, "step": 1645 }, { "epoch": 0.507672781816084, "grad_norm": 1.3192851543426514, "learning_rate": 4.6553324300342446e-05, "loss": 1.2364, "step": 1650 }, { "epoch": 0.5092111841852236, "grad_norm": 2.825892925262451, "learning_rate": 4.653288878996716e-05, "loss": 1.3216, "step": 1655 }, { "epoch": 0.5107495865543633, "grad_norm": 1.0338250398635864, "learning_rate": 4.651239738976991e-05, "loss": 1.1948, "step": 1660 }, { "epoch": 0.5122879889235029, "grad_norm": 1.3987075090408325, "learning_rate": 4.649185015293728e-05, "loss": 1.1198, "step": 1665 }, { "epoch": 0.5138263912926426, "grad_norm": 1.920446515083313, "learning_rate": 4.647124713280078e-05, "loss": 1.1055, "step": 1670 }, { "epoch": 0.5153647936617822, "grad_norm": 2.621272325515747, "learning_rate": 4.645058838283669e-05, "loss": 1.1013, "step": 1675 }, { "epoch": 0.5169031960309219, "grad_norm": 1.3560134172439575, "learning_rate": 4.642987395666598e-05, "loss": 1.2337, "step": 1680 }, { "epoch": 0.5184415984000615, "grad_norm": 1.2603050470352173, "learning_rate": 4.64091039080541e-05, "loss": 1.1338, "step": 1685 }, { "epoch": 0.5199800007692011, "grad_norm": 1.3172502517700195, "learning_rate": 4.638827829091086e-05, "loss": 1.2974, "step": 1690 }, { "epoch": 0.5215184031383409, "grad_norm": 1.0840466022491455, "learning_rate": 4.636739715929033e-05, "loss": 1.0235, "step": 1695 }, { "epoch": 0.5230568055074805, "grad_norm": 0.9899886250495911, "learning_rate": 4.634646056739066e-05, "loss": 1.3254, "step": 1700 }, { "epoch": 0.5245952078766202, "grad_norm": 1.1562014818191528, "learning_rate": 4.632546856955396e-05, "loss": 1.2105, "step": 1705 }, { "epoch": 0.5261336102457598, "grad_norm": 0.9220596551895142, "learning_rate": 4.630442122026613e-05, "loss": 1.2339, "step": 1710 }, { "epoch": 0.5276720126148994, "grad_norm": 2.6827752590179443, "learning_rate": 4.628331857415673e-05, "loss": 1.1655, "step": 1715 }, { "epoch": 0.5292104149840391, "grad_norm": 1.4331889152526855, "learning_rate": 4.626216068599889e-05, "loss": 1.1398, "step": 1720 }, { "epoch": 0.5307488173531787, "grad_norm": 1.2920069694519043, "learning_rate": 4.624094761070909e-05, "loss": 1.155, "step": 1725 }, { "epoch": 0.5322872197223184, "grad_norm": 0.9748442769050598, "learning_rate": 4.621967940334705e-05, "loss": 1.162, "step": 1730 }, { "epoch": 0.533825622091458, "grad_norm": 1.2728887796401978, "learning_rate": 4.6198356119115595e-05, "loss": 1.2505, "step": 1735 }, { "epoch": 0.5353640244605977, "grad_norm": 1.119135856628418, "learning_rate": 4.617697781336052e-05, "loss": 1.2112, "step": 1740 }, { "epoch": 0.5369024268297373, "grad_norm": 1.0900635719299316, "learning_rate": 4.6155544541570406e-05, "loss": 1.1836, "step": 1745 }, { "epoch": 0.5384408291988769, "grad_norm": 1.4258182048797607, "learning_rate": 4.613405635937651e-05, "loss": 1.1892, "step": 1750 }, { "epoch": 0.5399792315680166, "grad_norm": 1.1550543308258057, "learning_rate": 4.611251332255264e-05, "loss": 1.1996, "step": 1755 }, { "epoch": 0.5415176339371562, "grad_norm": 1.2096160650253296, "learning_rate": 4.609091548701493e-05, "loss": 1.1016, "step": 1760 }, { "epoch": 0.543056036306296, "grad_norm": 0.9595054984092712, "learning_rate": 4.6069262908821784e-05, "loss": 1.233, "step": 1765 }, { "epoch": 0.5445944386754356, "grad_norm": 1.478603482246399, "learning_rate": 4.604755564417369e-05, "loss": 1.2789, "step": 1770 }, { "epoch": 0.5461328410445752, "grad_norm": 1.2694469690322876, "learning_rate": 4.602579374941307e-05, "loss": 1.3355, "step": 1775 }, { "epoch": 0.5476712434137149, "grad_norm": 1.3089395761489868, "learning_rate": 4.600397728102414e-05, "loss": 1.0967, "step": 1780 }, { "epoch": 0.5492096457828545, "grad_norm": 2.684046506881714, "learning_rate": 4.5982106295632765e-05, "loss": 1.0866, "step": 1785 }, { "epoch": 0.5507480481519942, "grad_norm": 1.414984107017517, "learning_rate": 4.596018085000633e-05, "loss": 1.2533, "step": 1790 }, { "epoch": 0.5522864505211338, "grad_norm": 1.0765364170074463, "learning_rate": 4.593820100105355e-05, "loss": 1.2313, "step": 1795 }, { "epoch": 0.5538248528902735, "grad_norm": 0.9591373801231384, "learning_rate": 4.5916166805824353e-05, "loss": 1.1038, "step": 1800 }, { "epoch": 0.5553632552594131, "grad_norm": 1.9848989248275757, "learning_rate": 4.589407832150974e-05, "loss": 1.1897, "step": 1805 }, { "epoch": 0.5569016576285527, "grad_norm": 1.1997350454330444, "learning_rate": 4.5871935605441606e-05, "loss": 1.1637, "step": 1810 }, { "epoch": 0.5584400599976924, "grad_norm": 0.9119892716407776, "learning_rate": 4.5849738715092624e-05, "loss": 1.3695, "step": 1815 }, { "epoch": 0.559978462366832, "grad_norm": 1.0208933353424072, "learning_rate": 4.582748770807605e-05, "loss": 1.1085, "step": 1820 }, { "epoch": 0.5615168647359717, "grad_norm": 1.3944216966629028, "learning_rate": 4.580518264214564e-05, "loss": 1.2322, "step": 1825 }, { "epoch": 0.5630552671051113, "grad_norm": 0.9918113946914673, "learning_rate": 4.5782823575195444e-05, "loss": 1.0482, "step": 1830 }, { "epoch": 0.5645936694742509, "grad_norm": 0.9428990483283997, "learning_rate": 4.576041056525966e-05, "loss": 1.132, "step": 1835 }, { "epoch": 0.5661320718433906, "grad_norm": 1.1514095067977905, "learning_rate": 4.5737943670512534e-05, "loss": 1.2385, "step": 1840 }, { "epoch": 0.5676704742125303, "grad_norm": 2.3423311710357666, "learning_rate": 4.5715422949268136e-05, "loss": 1.2718, "step": 1845 }, { "epoch": 0.56920887658167, "grad_norm": 1.2614145278930664, "learning_rate": 4.5692848459980275e-05, "loss": 1.1639, "step": 1850 }, { "epoch": 0.5707472789508096, "grad_norm": 1.355564832687378, "learning_rate": 4.56702202612423e-05, "loss": 1.2753, "step": 1855 }, { "epoch": 0.5722856813199493, "grad_norm": 1.5028793811798096, "learning_rate": 4.564753841178697e-05, "loss": 1.2298, "step": 1860 }, { "epoch": 0.5738240836890889, "grad_norm": 2.331164598464966, "learning_rate": 4.5624802970486295e-05, "loss": 1.1079, "step": 1865 }, { "epoch": 0.5753624860582285, "grad_norm": 1.438877820968628, "learning_rate": 4.56020139963514e-05, "loss": 1.1423, "step": 1870 }, { "epoch": 0.5769008884273682, "grad_norm": 1.6183146238327026, "learning_rate": 4.557917154853234e-05, "loss": 1.3155, "step": 1875 }, { "epoch": 0.5784392907965078, "grad_norm": 1.206664800643921, "learning_rate": 4.555627568631798e-05, "loss": 1.1703, "step": 1880 }, { "epoch": 0.5799776931656475, "grad_norm": 0.9220191240310669, "learning_rate": 4.553332646913581e-05, "loss": 1.2033, "step": 1885 }, { "epoch": 0.5815160955347871, "grad_norm": 1.4525690078735352, "learning_rate": 4.551032395655181e-05, "loss": 1.128, "step": 1890 }, { "epoch": 0.5830544979039268, "grad_norm": 0.9040252566337585, "learning_rate": 4.5487268208270284e-05, "loss": 1.1954, "step": 1895 }, { "epoch": 0.5845929002730664, "grad_norm": 1.0403451919555664, "learning_rate": 4.546415928413373e-05, "loss": 1.1627, "step": 1900 }, { "epoch": 0.586131302642206, "grad_norm": 0.8466317057609558, "learning_rate": 4.544099724412267e-05, "loss": 1.2379, "step": 1905 }, { "epoch": 0.5876697050113457, "grad_norm": 1.0065761804580688, "learning_rate": 4.5417782148355464e-05, "loss": 1.2146, "step": 1910 }, { "epoch": 0.5892081073804853, "grad_norm": 1.1972275972366333, "learning_rate": 4.5394514057088197e-05, "loss": 1.1367, "step": 1915 }, { "epoch": 0.5907465097496251, "grad_norm": 1.0356810092926025, "learning_rate": 4.5371193030714524e-05, "loss": 1.123, "step": 1920 }, { "epoch": 0.5922849121187647, "grad_norm": 2.044755458831787, "learning_rate": 4.534781912976546e-05, "loss": 1.1501, "step": 1925 }, { "epoch": 0.5938233144879043, "grad_norm": 4.12185525894165, "learning_rate": 4.532439241490928e-05, "loss": 1.1789, "step": 1930 }, { "epoch": 0.595361716857044, "grad_norm": 1.5785492658615112, "learning_rate": 4.530091294695134e-05, "loss": 1.2626, "step": 1935 }, { "epoch": 0.5969001192261836, "grad_norm": 1.0973812341690063, "learning_rate": 4.527738078683391e-05, "loss": 1.1505, "step": 1940 }, { "epoch": 0.5984385215953233, "grad_norm": 0.8266382813453674, "learning_rate": 4.525379599563606e-05, "loss": 1.2476, "step": 1945 }, { "epoch": 0.5999769239644629, "grad_norm": 1.5638591051101685, "learning_rate": 4.5230158634573406e-05, "loss": 1.0609, "step": 1950 }, { "epoch": 0.6015153263336026, "grad_norm": 1.0881614685058594, "learning_rate": 4.5206468764998065e-05, "loss": 1.1339, "step": 1955 }, { "epoch": 0.6030537287027422, "grad_norm": 1.1539703607559204, "learning_rate": 4.518272644839843e-05, "loss": 1.0505, "step": 1960 }, { "epoch": 0.6045921310718818, "grad_norm": 1.331925630569458, "learning_rate": 4.515893174639899e-05, "loss": 1.1124, "step": 1965 }, { "epoch": 0.6061305334410215, "grad_norm": 1.0339001417160034, "learning_rate": 4.5135084720760254e-05, "loss": 1.1829, "step": 1970 }, { "epoch": 0.6076689358101611, "grad_norm": 0.8567355275154114, "learning_rate": 4.5111185433378514e-05, "loss": 1.1963, "step": 1975 }, { "epoch": 0.6092073381793008, "grad_norm": 1.3100231885910034, "learning_rate": 4.50872339462857e-05, "loss": 1.2722, "step": 1980 }, { "epoch": 0.6107457405484404, "grad_norm": 1.4597526788711548, "learning_rate": 4.506323032164925e-05, "loss": 1.3127, "step": 1985 }, { "epoch": 0.61228414291758, "grad_norm": 1.6040161848068237, "learning_rate": 4.503917462177192e-05, "loss": 1.2423, "step": 1990 }, { "epoch": 0.6138225452867198, "grad_norm": 1.1720243692398071, "learning_rate": 4.5015066909091625e-05, "loss": 1.1971, "step": 1995 }, { "epoch": 0.6153609476558594, "grad_norm": 0.9491767883300781, "learning_rate": 4.499090724618129e-05, "loss": 1.1892, "step": 2000 }, { "epoch": 0.6168993500249991, "grad_norm": 1.83636474609375, "learning_rate": 4.4966695695748686e-05, "loss": 1.0706, "step": 2005 }, { "epoch": 0.6184377523941387, "grad_norm": 1.8804785013198853, "learning_rate": 4.494243232063623e-05, "loss": 1.2081, "step": 2010 }, { "epoch": 0.6199761547632784, "grad_norm": 1.1638429164886475, "learning_rate": 4.4918117183820894e-05, "loss": 1.1153, "step": 2015 }, { "epoch": 0.621514557132418, "grad_norm": 1.120448112487793, "learning_rate": 4.489375034841397e-05, "loss": 1.26, "step": 2020 }, { "epoch": 0.6230529595015576, "grad_norm": 1.4325777292251587, "learning_rate": 4.486933187766095e-05, "loss": 1.1023, "step": 2025 }, { "epoch": 0.6245913618706973, "grad_norm": 1.2294832468032837, "learning_rate": 4.484486183494136e-05, "loss": 1.1005, "step": 2030 }, { "epoch": 0.6261297642398369, "grad_norm": 1.6747307777404785, "learning_rate": 4.4820340283768544e-05, "loss": 1.2555, "step": 2035 }, { "epoch": 0.6276681666089766, "grad_norm": 1.192826509475708, "learning_rate": 4.479576728778958e-05, "loss": 1.0987, "step": 2040 }, { "epoch": 0.6292065689781162, "grad_norm": 1.767940878868103, "learning_rate": 4.477114291078506e-05, "loss": 1.0992, "step": 2045 }, { "epoch": 0.6307449713472558, "grad_norm": 1.5168949365615845, "learning_rate": 4.474646721666893e-05, "loss": 1.1127, "step": 2050 }, { "epoch": 0.6322833737163955, "grad_norm": 2.4314072132110596, "learning_rate": 4.4721740269488355e-05, "loss": 1.141, "step": 2055 }, { "epoch": 0.6338217760855351, "grad_norm": 1.7688580751419067, "learning_rate": 4.46969621334235e-05, "loss": 1.1934, "step": 2060 }, { "epoch": 0.6353601784546749, "grad_norm": 1.0726224184036255, "learning_rate": 4.467213287278741e-05, "loss": 1.129, "step": 2065 }, { "epoch": 0.6368985808238145, "grad_norm": 1.0851497650146484, "learning_rate": 4.464725255202582e-05, "loss": 1.1553, "step": 2070 }, { "epoch": 0.6384369831929542, "grad_norm": 1.4708757400512695, "learning_rate": 4.462232123571702e-05, "loss": 1.1845, "step": 2075 }, { "epoch": 0.6399753855620938, "grad_norm": 1.3815021514892578, "learning_rate": 4.459733898857162e-05, "loss": 1.164, "step": 2080 }, { "epoch": 0.6415137879312334, "grad_norm": 0.9713008999824524, "learning_rate": 4.4572305875432465e-05, "loss": 1.2635, "step": 2085 }, { "epoch": 0.6430521903003731, "grad_norm": 1.7279797792434692, "learning_rate": 4.4547221961274385e-05, "loss": 1.2015, "step": 2090 }, { "epoch": 0.6445905926695127, "grad_norm": 3.183756113052368, "learning_rate": 4.4522087311204096e-05, "loss": 1.0928, "step": 2095 }, { "epoch": 0.6461289950386524, "grad_norm": 2.138756513595581, "learning_rate": 4.449690199046e-05, "loss": 1.2496, "step": 2100 }, { "epoch": 0.647667397407792, "grad_norm": 1.7964316606521606, "learning_rate": 4.447166606441201e-05, "loss": 1.123, "step": 2105 }, { "epoch": 0.6492057997769316, "grad_norm": 2.1838841438293457, "learning_rate": 4.444637959856137e-05, "loss": 1.0909, "step": 2110 }, { "epoch": 0.6507442021460713, "grad_norm": 1.1749359369277954, "learning_rate": 4.442104265854055e-05, "loss": 1.275, "step": 2115 }, { "epoch": 0.6522826045152109, "grad_norm": 1.2564892768859863, "learning_rate": 4.439565531011299e-05, "loss": 1.1814, "step": 2120 }, { "epoch": 0.6538210068843506, "grad_norm": 1.1078180074691772, "learning_rate": 4.4370217619172964e-05, "loss": 1.1717, "step": 2125 }, { "epoch": 0.6553594092534902, "grad_norm": 1.0620262622833252, "learning_rate": 4.434472965174545e-05, "loss": 1.2006, "step": 2130 }, { "epoch": 0.65689781162263, "grad_norm": 1.668600082397461, "learning_rate": 4.4319191473985884e-05, "loss": 1.16, "step": 2135 }, { "epoch": 0.6584362139917695, "grad_norm": 0.7764678597450256, "learning_rate": 4.429360315218005e-05, "loss": 1.3311, "step": 2140 }, { "epoch": 0.6599746163609091, "grad_norm": 0.9446391463279724, "learning_rate": 4.4267964752743854e-05, "loss": 1.2131, "step": 2145 }, { "epoch": 0.6615130187300489, "grad_norm": 0.8063297271728516, "learning_rate": 4.4242276342223235e-05, "loss": 1.15, "step": 2150 }, { "epoch": 0.6630514210991885, "grad_norm": 1.4049417972564697, "learning_rate": 4.421653798729387e-05, "loss": 1.2054, "step": 2155 }, { "epoch": 0.6645898234683282, "grad_norm": 1.716774344444275, "learning_rate": 4.4190749754761126e-05, "loss": 1.055, "step": 2160 }, { "epoch": 0.6661282258374678, "grad_norm": 2.2953073978424072, "learning_rate": 4.4164911711559803e-05, "loss": 1.2199, "step": 2165 }, { "epoch": 0.6676666282066075, "grad_norm": 0.9870630502700806, "learning_rate": 4.4139023924753995e-05, "loss": 1.1933, "step": 2170 }, { "epoch": 0.6692050305757471, "grad_norm": 1.69342839717865, "learning_rate": 4.41130864615369e-05, "loss": 1.1602, "step": 2175 }, { "epoch": 0.6707434329448867, "grad_norm": 1.0127511024475098, "learning_rate": 4.408709938923067e-05, "loss": 1.2509, "step": 2180 }, { "epoch": 0.6722818353140264, "grad_norm": 1.2484458684921265, "learning_rate": 4.40610627752862e-05, "loss": 1.1144, "step": 2185 }, { "epoch": 0.673820237683166, "grad_norm": 1.0753147602081299, "learning_rate": 4.403497668728299e-05, "loss": 1.2816, "step": 2190 }, { "epoch": 0.6753586400523057, "grad_norm": 1.2890567779541016, "learning_rate": 4.400884119292894e-05, "loss": 1.1233, "step": 2195 }, { "epoch": 0.6768970424214453, "grad_norm": 1.108391284942627, "learning_rate": 4.39826563600602e-05, "loss": 1.198, "step": 2200 }, { "epoch": 0.6784354447905849, "grad_norm": 0.9689030051231384, "learning_rate": 4.395642225664097e-05, "loss": 1.2354, "step": 2205 }, { "epoch": 0.6799738471597246, "grad_norm": 1.4813648462295532, "learning_rate": 4.393013895076335e-05, "loss": 1.1986, "step": 2210 }, { "epoch": 0.6815122495288642, "grad_norm": 1.4557727575302124, "learning_rate": 4.3903806510647115e-05, "loss": 1.2441, "step": 2215 }, { "epoch": 0.683050651898004, "grad_norm": 5.211197853088379, "learning_rate": 4.3877425004639616e-05, "loss": 1.2142, "step": 2220 }, { "epoch": 0.6845890542671436, "grad_norm": 1.0981544256210327, "learning_rate": 4.385099450121551e-05, "loss": 1.1192, "step": 2225 }, { "epoch": 0.6861274566362833, "grad_norm": 1.4392608404159546, "learning_rate": 4.3824515068976666e-05, "loss": 1.0806, "step": 2230 }, { "epoch": 0.6876658590054229, "grad_norm": 0.9992257952690125, "learning_rate": 4.3797986776651934e-05, "loss": 1.2906, "step": 2235 }, { "epoch": 0.6892042613745625, "grad_norm": 1.0846220254898071, "learning_rate": 4.3771409693096985e-05, "loss": 1.145, "step": 2240 }, { "epoch": 0.6907426637437022, "grad_norm": 1.7244092226028442, "learning_rate": 4.374478388729414e-05, "loss": 1.2479, "step": 2245 }, { "epoch": 0.6922810661128418, "grad_norm": 1.187263011932373, "learning_rate": 4.371810942835215e-05, "loss": 1.2843, "step": 2250 }, { "epoch": 0.6938194684819815, "grad_norm": 1.1405870914459229, "learning_rate": 4.369138638550611e-05, "loss": 1.2483, "step": 2255 }, { "epoch": 0.6953578708511211, "grad_norm": 1.399223804473877, "learning_rate": 4.3664614828117137e-05, "loss": 1.1166, "step": 2260 }, { "epoch": 0.6968962732202607, "grad_norm": 0.8091042637825012, "learning_rate": 4.363779482567234e-05, "loss": 1.2809, "step": 2265 }, { "epoch": 0.6984346755894004, "grad_norm": 1.0107675790786743, "learning_rate": 4.3610926447784534e-05, "loss": 1.1957, "step": 2270 }, { "epoch": 0.69997307795854, "grad_norm": 1.485467791557312, "learning_rate": 4.3584009764192094e-05, "loss": 1.1612, "step": 2275 }, { "epoch": 0.7015114803276797, "grad_norm": 1.0193709135055542, "learning_rate": 4.3557044844758796e-05, "loss": 1.1595, "step": 2280 }, { "epoch": 0.7030498826968193, "grad_norm": 1.1308696269989014, "learning_rate": 4.353003175947359e-05, "loss": 1.1202, "step": 2285 }, { "epoch": 0.704588285065959, "grad_norm": 1.233419418334961, "learning_rate": 4.3502970578450466e-05, "loss": 1.1749, "step": 2290 }, { "epoch": 0.7061266874350987, "grad_norm": 0.9433870315551758, "learning_rate": 4.3475861371928225e-05, "loss": 1.1464, "step": 2295 }, { "epoch": 0.7076650898042383, "grad_norm": 1.7299931049346924, "learning_rate": 4.344870421027036e-05, "loss": 1.1479, "step": 2300 }, { "epoch": 0.709203492173378, "grad_norm": 1.1724836826324463, "learning_rate": 4.3421499163964784e-05, "loss": 1.1548, "step": 2305 }, { "epoch": 0.7107418945425176, "grad_norm": 1.5910905599594116, "learning_rate": 4.339424630362373e-05, "loss": 1.1267, "step": 2310 }, { "epoch": 0.7122802969116573, "grad_norm": 1.4339747428894043, "learning_rate": 4.336694569998354e-05, "loss": 1.2297, "step": 2315 }, { "epoch": 0.7138186992807969, "grad_norm": 0.7568414211273193, "learning_rate": 4.333959742390444e-05, "loss": 1.228, "step": 2320 }, { "epoch": 0.7153571016499365, "grad_norm": 1.1691060066223145, "learning_rate": 4.331220154637044e-05, "loss": 1.0902, "step": 2325 }, { "epoch": 0.7168955040190762, "grad_norm": 1.296523094177246, "learning_rate": 4.328475813848906e-05, "loss": 1.2011, "step": 2330 }, { "epoch": 0.7184339063882158, "grad_norm": 1.0982245206832886, "learning_rate": 4.325726727149122e-05, "loss": 1.1082, "step": 2335 }, { "epoch": 0.7199723087573555, "grad_norm": 1.2875556945800781, "learning_rate": 4.3229729016731005e-05, "loss": 1.2202, "step": 2340 }, { "epoch": 0.7215107111264951, "grad_norm": 2.59907865524292, "learning_rate": 4.320214344568549e-05, "loss": 1.1947, "step": 2345 }, { "epoch": 0.7230491134956348, "grad_norm": 1.3555256128311157, "learning_rate": 4.317451062995458e-05, "loss": 1.1544, "step": 2350 }, { "epoch": 0.7245875158647744, "grad_norm": 2.351750135421753, "learning_rate": 4.3146830641260815e-05, "loss": 1.1858, "step": 2355 }, { "epoch": 0.726125918233914, "grad_norm": 1.5076963901519775, "learning_rate": 4.311910355144914e-05, "loss": 1.2452, "step": 2360 }, { "epoch": 0.7276643206030537, "grad_norm": 1.020558476448059, "learning_rate": 4.309132943248678e-05, "loss": 1.1731, "step": 2365 }, { "epoch": 0.7292027229721934, "grad_norm": 1.3791615962982178, "learning_rate": 4.306350835646303e-05, "loss": 1.1717, "step": 2370 }, { "epoch": 0.7307411253413331, "grad_norm": 1.8552170991897583, "learning_rate": 4.303564039558904e-05, "loss": 1.1315, "step": 2375 }, { "epoch": 0.7322795277104727, "grad_norm": 1.5803345441818237, "learning_rate": 4.3007725622197674e-05, "loss": 1.1341, "step": 2380 }, { "epoch": 0.7338179300796123, "grad_norm": 1.0923699140548706, "learning_rate": 4.2979764108743296e-05, "loss": 1.0577, "step": 2385 }, { "epoch": 0.735356332448752, "grad_norm": 0.9132150411605835, "learning_rate": 4.295175592780158e-05, "loss": 1.1672, "step": 2390 }, { "epoch": 0.7368947348178916, "grad_norm": 1.6605010032653809, "learning_rate": 4.2923701152069336e-05, "loss": 1.1087, "step": 2395 }, { "epoch": 0.7384331371870313, "grad_norm": 1.2207224369049072, "learning_rate": 4.28955998543643e-05, "loss": 1.1708, "step": 2400 }, { "epoch": 0.7399715395561709, "grad_norm": 2.0855581760406494, "learning_rate": 4.286745210762499e-05, "loss": 1.1594, "step": 2405 }, { "epoch": 0.7415099419253106, "grad_norm": 1.4070476293563843, "learning_rate": 4.283925798491044e-05, "loss": 1.2085, "step": 2410 }, { "epoch": 0.7430483442944502, "grad_norm": 1.0494474172592163, "learning_rate": 4.281101755940009e-05, "loss": 1.1877, "step": 2415 }, { "epoch": 0.7445867466635898, "grad_norm": 1.0690234899520874, "learning_rate": 4.2782730904393546e-05, "loss": 1.0301, "step": 2420 }, { "epoch": 0.7461251490327295, "grad_norm": 1.0572775602340698, "learning_rate": 4.275439809331041e-05, "loss": 1.1589, "step": 2425 }, { "epoch": 0.7476635514018691, "grad_norm": 1.0381057262420654, "learning_rate": 4.2726019199690093e-05, "loss": 1.1945, "step": 2430 }, { "epoch": 0.7492019537710088, "grad_norm": 2.4231226444244385, "learning_rate": 4.269759429719159e-05, "loss": 1.1851, "step": 2435 }, { "epoch": 0.7507403561401484, "grad_norm": 1.1113394498825073, "learning_rate": 4.266912345959335e-05, "loss": 1.1269, "step": 2440 }, { "epoch": 0.7522787585092882, "grad_norm": 1.7000584602355957, "learning_rate": 4.264060676079302e-05, "loss": 1.1546, "step": 2445 }, { "epoch": 0.7538171608784278, "grad_norm": 4.711215972900391, "learning_rate": 4.2612044274807295e-05, "loss": 1.1901, "step": 2450 }, { "epoch": 0.7553555632475674, "grad_norm": 1.1425622701644897, "learning_rate": 4.2583436075771706e-05, "loss": 1.2222, "step": 2455 }, { "epoch": 0.7568939656167071, "grad_norm": 1.1163311004638672, "learning_rate": 4.255478223794045e-05, "loss": 1.066, "step": 2460 }, { "epoch": 0.7584323679858467, "grad_norm": 0.741192102432251, "learning_rate": 4.252608283568616e-05, "loss": 1.3143, "step": 2465 }, { "epoch": 0.7599707703549864, "grad_norm": 0.9434729218482971, "learning_rate": 4.249733794349976e-05, "loss": 1.1981, "step": 2470 }, { "epoch": 0.761509172724126, "grad_norm": 1.200185775756836, "learning_rate": 4.246854763599022e-05, "loss": 1.1042, "step": 2475 }, { "epoch": 0.7630475750932656, "grad_norm": 1.1456506252288818, "learning_rate": 4.2439711987884406e-05, "loss": 1.1917, "step": 2480 }, { "epoch": 0.7645859774624053, "grad_norm": 1.4507484436035156, "learning_rate": 4.241083107402687e-05, "loss": 1.249, "step": 2485 }, { "epoch": 0.7661243798315449, "grad_norm": 1.5303988456726074, "learning_rate": 4.238190496937962e-05, "loss": 1.2334, "step": 2490 }, { "epoch": 0.7676627822006846, "grad_norm": 1.067226529121399, "learning_rate": 4.235293374902201e-05, "loss": 1.211, "step": 2495 }, { "epoch": 0.7692011845698242, "grad_norm": 1.0933020114898682, "learning_rate": 4.232391748815046e-05, "loss": 1.1696, "step": 2500 }, { "epoch": 0.7707395869389639, "grad_norm": 0.9041593670845032, "learning_rate": 4.2294856262078296e-05, "loss": 1.0995, "step": 2505 }, { "epoch": 0.7722779893081035, "grad_norm": 0.886325478553772, "learning_rate": 4.226575014623557e-05, "loss": 1.2787, "step": 2510 }, { "epoch": 0.7738163916772431, "grad_norm": 0.8392224311828613, "learning_rate": 4.223659921616885e-05, "loss": 1.1928, "step": 2515 }, { "epoch": 0.7753547940463829, "grad_norm": 2.0462377071380615, "learning_rate": 4.2207403547541e-05, "loss": 1.2198, "step": 2520 }, { "epoch": 0.7768931964155225, "grad_norm": 0.8376229405403137, "learning_rate": 4.2178163216131015e-05, "loss": 1.1459, "step": 2525 }, { "epoch": 0.7784315987846622, "grad_norm": 1.8360308408737183, "learning_rate": 4.214887829783383e-05, "loss": 1.2045, "step": 2530 }, { "epoch": 0.7799700011538018, "grad_norm": 1.962247610092163, "learning_rate": 4.2119548868660084e-05, "loss": 1.1892, "step": 2535 }, { "epoch": 0.7815084035229414, "grad_norm": 1.0114338397979736, "learning_rate": 4.209017500473596e-05, "loss": 1.2576, "step": 2540 }, { "epoch": 0.7830468058920811, "grad_norm": 2.040184259414673, "learning_rate": 4.206075678230297e-05, "loss": 1.0698, "step": 2545 }, { "epoch": 0.7845852082612207, "grad_norm": 1.3583984375, "learning_rate": 4.203129427771776e-05, "loss": 1.1858, "step": 2550 }, { "epoch": 0.7861236106303604, "grad_norm": 2.056542158126831, "learning_rate": 4.200178756745192e-05, "loss": 1.2202, "step": 2555 }, { "epoch": 0.7876620129995, "grad_norm": 1.1544386148452759, "learning_rate": 4.197223672809177e-05, "loss": 1.1182, "step": 2560 }, { "epoch": 0.7892004153686397, "grad_norm": 1.046268343925476, "learning_rate": 4.194264183633818e-05, "loss": 1.1568, "step": 2565 }, { "epoch": 0.7907388177377793, "grad_norm": 1.4613105058670044, "learning_rate": 4.1913002969006344e-05, "loss": 1.2269, "step": 2570 }, { "epoch": 0.7922772201069189, "grad_norm": 0.9770887494087219, "learning_rate": 4.188332020302561e-05, "loss": 1.0355, "step": 2575 }, { "epoch": 0.7938156224760586, "grad_norm": 1.2040941715240479, "learning_rate": 4.185359361543927e-05, "loss": 1.2607, "step": 2580 }, { "epoch": 0.7953540248451982, "grad_norm": 1.4072517156600952, "learning_rate": 4.182382328340434e-05, "loss": 1.2368, "step": 2585 }, { "epoch": 0.796892427214338, "grad_norm": 1.2777091264724731, "learning_rate": 4.17940092841914e-05, "loss": 1.3141, "step": 2590 }, { "epoch": 0.7984308295834776, "grad_norm": 0.8566577434539795, "learning_rate": 4.176415169518434e-05, "loss": 1.1347, "step": 2595 }, { "epoch": 0.7999692319526172, "grad_norm": 1.2150745391845703, "learning_rate": 4.173425059388023e-05, "loss": 0.99, "step": 2600 }, { "epoch": 0.8015076343217569, "grad_norm": 1.1194686889648438, "learning_rate": 4.1704306057889053e-05, "loss": 1.1714, "step": 2605 }, { "epoch": 0.8030460366908965, "grad_norm": 1.402685523033142, "learning_rate": 4.167431816493352e-05, "loss": 1.2142, "step": 2610 }, { "epoch": 0.8045844390600362, "grad_norm": 1.0395668745040894, "learning_rate": 4.1644286992848916e-05, "loss": 1.1401, "step": 2615 }, { "epoch": 0.8061228414291758, "grad_norm": 0.9008033871650696, "learning_rate": 4.161421261958281e-05, "loss": 1.2895, "step": 2620 }, { "epoch": 0.8076612437983155, "grad_norm": 1.628246545791626, "learning_rate": 4.158409512319493e-05, "loss": 1.0513, "step": 2625 }, { "epoch": 0.8091996461674551, "grad_norm": 1.2516933679580688, "learning_rate": 4.1553934581856945e-05, "loss": 1.1017, "step": 2630 }, { "epoch": 0.8107380485365947, "grad_norm": 1.1752599477767944, "learning_rate": 4.1523731073852215e-05, "loss": 1.2321, "step": 2635 }, { "epoch": 0.8122764509057344, "grad_norm": 1.0307414531707764, "learning_rate": 4.149348467757566e-05, "loss": 1.3027, "step": 2640 }, { "epoch": 0.813814853274874, "grad_norm": 1.7551286220550537, "learning_rate": 4.1463195471533476e-05, "loss": 1.0455, "step": 2645 }, { "epoch": 0.8153532556440137, "grad_norm": 1.2423598766326904, "learning_rate": 4.1432863534343016e-05, "loss": 1.0785, "step": 2650 }, { "epoch": 0.8168916580131533, "grad_norm": 2.572627305984497, "learning_rate": 4.140248894473253e-05, "loss": 1.2205, "step": 2655 }, { "epoch": 0.8184300603822929, "grad_norm": 1.2887595891952515, "learning_rate": 4.137207178154095e-05, "loss": 1.0928, "step": 2660 }, { "epoch": 0.8199684627514326, "grad_norm": 1.1292917728424072, "learning_rate": 4.134161212371776e-05, "loss": 1.0875, "step": 2665 }, { "epoch": 0.8215068651205722, "grad_norm": 1.0025163888931274, "learning_rate": 4.1311110050322674e-05, "loss": 1.3263, "step": 2670 }, { "epoch": 0.823045267489712, "grad_norm": 1.1535359621047974, "learning_rate": 4.128056564052558e-05, "loss": 1.0747, "step": 2675 }, { "epoch": 0.8245836698588516, "grad_norm": 1.1695550680160522, "learning_rate": 4.124997897360617e-05, "loss": 1.1265, "step": 2680 }, { "epoch": 0.8261220722279913, "grad_norm": 0.9830222725868225, "learning_rate": 4.1219350128953885e-05, "loss": 1.1475, "step": 2685 }, { "epoch": 0.8276604745971309, "grad_norm": 1.1742221117019653, "learning_rate": 4.118867918606759e-05, "loss": 1.2344, "step": 2690 }, { "epoch": 0.8291988769662705, "grad_norm": 0.9631347060203552, "learning_rate": 4.115796622455544e-05, "loss": 1.1101, "step": 2695 }, { "epoch": 0.8307372793354102, "grad_norm": 0.9511464238166809, "learning_rate": 4.112721132413467e-05, "loss": 1.2955, "step": 2700 }, { "epoch": 0.8322756817045498, "grad_norm": 1.5212576389312744, "learning_rate": 4.109641456463135e-05, "loss": 1.1907, "step": 2705 }, { "epoch": 0.8338140840736895, "grad_norm": 1.1152446269989014, "learning_rate": 4.106557602598017e-05, "loss": 1.143, "step": 2710 }, { "epoch": 0.8353524864428291, "grad_norm": 1.1330682039260864, "learning_rate": 4.103469578822432e-05, "loss": 1.1143, "step": 2715 }, { "epoch": 0.8368908888119688, "grad_norm": 1.170376181602478, "learning_rate": 4.1003773931515175e-05, "loss": 1.1621, "step": 2720 }, { "epoch": 0.8384292911811084, "grad_norm": 1.571735143661499, "learning_rate": 4.097281053611215e-05, "loss": 1.1911, "step": 2725 }, { "epoch": 0.839967693550248, "grad_norm": 0.9364776611328125, "learning_rate": 4.0941805682382484e-05, "loss": 1.171, "step": 2730 }, { "epoch": 0.8415060959193877, "grad_norm": 1.2696608304977417, "learning_rate": 4.091075945080101e-05, "loss": 1.2642, "step": 2735 }, { "epoch": 0.8430444982885273, "grad_norm": 4.6916584968566895, "learning_rate": 4.087967192194997e-05, "loss": 1.2212, "step": 2740 }, { "epoch": 0.8445829006576671, "grad_norm": 0.9691728353500366, "learning_rate": 4.0848543176518784e-05, "loss": 1.1457, "step": 2745 }, { "epoch": 0.8461213030268067, "grad_norm": 1.154585599899292, "learning_rate": 4.081737329530386e-05, "loss": 1.2012, "step": 2750 }, { "epoch": 0.8476597053959463, "grad_norm": 1.3288275003433228, "learning_rate": 4.0786162359208386e-05, "loss": 1.2624, "step": 2755 }, { "epoch": 0.849198107765086, "grad_norm": 4.425066947937012, "learning_rate": 4.075491044924209e-05, "loss": 1.0656, "step": 2760 }, { "epoch": 0.8507365101342256, "grad_norm": 0.917151927947998, "learning_rate": 4.072361764652105e-05, "loss": 1.0405, "step": 2765 }, { "epoch": 0.8522749125033653, "grad_norm": 0.9385243654251099, "learning_rate": 4.0692284032267516e-05, "loss": 1.2393, "step": 2770 }, { "epoch": 0.8538133148725049, "grad_norm": 1.1418960094451904, "learning_rate": 4.0660909687809625e-05, "loss": 1.2778, "step": 2775 }, { "epoch": 0.8553517172416446, "grad_norm": 0.9725258350372314, "learning_rate": 4.062949469458125e-05, "loss": 1.1223, "step": 2780 }, { "epoch": 0.8568901196107842, "grad_norm": 0.8571771383285522, "learning_rate": 4.059803913412178e-05, "loss": 1.1825, "step": 2785 }, { "epoch": 0.8584285219799238, "grad_norm": 1.6413207054138184, "learning_rate": 4.056654308807588e-05, "loss": 1.1504, "step": 2790 }, { "epoch": 0.8599669243490635, "grad_norm": 1.0783355236053467, "learning_rate": 4.053500663819331e-05, "loss": 1.2997, "step": 2795 }, { "epoch": 0.8615053267182031, "grad_norm": 1.5912343263626099, "learning_rate": 4.05034298663287e-05, "loss": 1.201, "step": 2800 }, { "epoch": 0.8630437290873428, "grad_norm": 1.5445067882537842, "learning_rate": 4.047181285444133e-05, "loss": 1.0921, "step": 2805 }, { "epoch": 0.8645821314564824, "grad_norm": 1.2195634841918945, "learning_rate": 4.0440155684594915e-05, "loss": 1.1598, "step": 2810 }, { "epoch": 0.866120533825622, "grad_norm": 1.3299264907836914, "learning_rate": 4.0408458438957454e-05, "loss": 1.2355, "step": 2815 }, { "epoch": 0.8676589361947618, "grad_norm": 1.7162269353866577, "learning_rate": 4.0376721199800896e-05, "loss": 1.1072, "step": 2820 }, { "epoch": 0.8691973385639014, "grad_norm": 1.240971326828003, "learning_rate": 4.034494404950103e-05, "loss": 1.184, "step": 2825 }, { "epoch": 0.8707357409330411, "grad_norm": 1.2077282667160034, "learning_rate": 4.0313127070537244e-05, "loss": 1.1602, "step": 2830 }, { "epoch": 0.8722741433021807, "grad_norm": 0.9052446484565735, "learning_rate": 4.028127034549229e-05, "loss": 1.1715, "step": 2835 }, { "epoch": 0.8738125456713204, "grad_norm": 1.6603598594665527, "learning_rate": 4.024937395705209e-05, "loss": 1.161, "step": 2840 }, { "epoch": 0.87535094804046, "grad_norm": 1.1748160123825073, "learning_rate": 4.0217437988005515e-05, "loss": 1.2401, "step": 2845 }, { "epoch": 0.8768893504095996, "grad_norm": 0.7816171646118164, "learning_rate": 4.0185462521244146e-05, "loss": 1.289, "step": 2850 }, { "epoch": 0.8784277527787393, "grad_norm": 2.378026247024536, "learning_rate": 4.015344763976212e-05, "loss": 1.1013, "step": 2855 }, { "epoch": 0.8799661551478789, "grad_norm": 0.9903531670570374, "learning_rate": 4.012139342665586e-05, "loss": 1.1572, "step": 2860 }, { "epoch": 0.8815045575170186, "grad_norm": 1.0575708150863647, "learning_rate": 4.0089299965123875e-05, "loss": 1.1393, "step": 2865 }, { "epoch": 0.8830429598861582, "grad_norm": 1.2809288501739502, "learning_rate": 4.005716733846653e-05, "loss": 1.1285, "step": 2870 }, { "epoch": 0.8845813622552978, "grad_norm": 2.3209915161132812, "learning_rate": 4.0024995630085885e-05, "loss": 1.1748, "step": 2875 }, { "epoch": 0.8861197646244375, "grad_norm": 1.7122613191604614, "learning_rate": 3.999278492348539e-05, "loss": 1.188, "step": 2880 }, { "epoch": 0.8876581669935771, "grad_norm": 1.0947169065475464, "learning_rate": 3.996053530226977e-05, "loss": 1.1547, "step": 2885 }, { "epoch": 0.8891965693627168, "grad_norm": 2.022311210632324, "learning_rate": 3.992824685014471e-05, "loss": 1.1289, "step": 2890 }, { "epoch": 0.8907349717318565, "grad_norm": 1.397825837135315, "learning_rate": 3.9895919650916716e-05, "loss": 1.2032, "step": 2895 }, { "epoch": 0.8922733741009962, "grad_norm": 0.9696956276893616, "learning_rate": 3.9863553788492834e-05, "loss": 1.0806, "step": 2900 }, { "epoch": 0.8938117764701358, "grad_norm": 1.9239165782928467, "learning_rate": 3.983114934688048e-05, "loss": 1.1205, "step": 2905 }, { "epoch": 0.8953501788392754, "grad_norm": 1.3101149797439575, "learning_rate": 3.97987064101872e-05, "loss": 1.1769, "step": 2910 }, { "epoch": 0.8968885812084151, "grad_norm": 0.9820067286491394, "learning_rate": 3.976622506262047e-05, "loss": 1.1518, "step": 2915 }, { "epoch": 0.8984269835775547, "grad_norm": 1.2778306007385254, "learning_rate": 3.973370538848744e-05, "loss": 1.156, "step": 2920 }, { "epoch": 0.8999653859466944, "grad_norm": 2.279306411743164, "learning_rate": 3.970114747219475e-05, "loss": 1.0314, "step": 2925 }, { "epoch": 0.901503788315834, "grad_norm": 1.2938816547393799, "learning_rate": 3.966855139824831e-05, "loss": 1.1778, "step": 2930 }, { "epoch": 0.9030421906849736, "grad_norm": 0.8507199883460999, "learning_rate": 3.963591725125305e-05, "loss": 1.1217, "step": 2935 }, { "epoch": 0.9045805930541133, "grad_norm": 1.3915897607803345, "learning_rate": 3.9603245115912736e-05, "loss": 1.2169, "step": 2940 }, { "epoch": 0.9061189954232529, "grad_norm": 1.3529155254364014, "learning_rate": 3.9570535077029724e-05, "loss": 1.1911, "step": 2945 }, { "epoch": 0.9076573977923926, "grad_norm": 1.5553487539291382, "learning_rate": 3.953778721950477e-05, "loss": 1.1837, "step": 2950 }, { "epoch": 0.9091958001615322, "grad_norm": 1.1085933446884155, "learning_rate": 3.9505001628336757e-05, "loss": 1.2342, "step": 2955 }, { "epoch": 0.9107342025306719, "grad_norm": 1.0837507247924805, "learning_rate": 3.947217838862255e-05, "loss": 1.2045, "step": 2960 }, { "epoch": 0.9122726048998115, "grad_norm": 0.9050890207290649, "learning_rate": 3.943931758555669e-05, "loss": 1.2006, "step": 2965 }, { "epoch": 0.9138110072689511, "grad_norm": 1.1101486682891846, "learning_rate": 3.940641930443125e-05, "loss": 1.1462, "step": 2970 }, { "epoch": 0.9153494096380909, "grad_norm": 2.048758029937744, "learning_rate": 3.9373483630635564e-05, "loss": 1.1595, "step": 2975 }, { "epoch": 0.9168878120072305, "grad_norm": 1.0824599266052246, "learning_rate": 3.934051064965602e-05, "loss": 1.1756, "step": 2980 }, { "epoch": 0.9184262143763702, "grad_norm": 2.319460153579712, "learning_rate": 3.9307500447075844e-05, "loss": 1.1852, "step": 2985 }, { "epoch": 0.9199646167455098, "grad_norm": 0.8897768259048462, "learning_rate": 3.927445310857487e-05, "loss": 1.0803, "step": 2990 }, { "epoch": 0.9215030191146495, "grad_norm": 0.7820705771446228, "learning_rate": 3.924136871992932e-05, "loss": 1.2017, "step": 2995 }, { "epoch": 0.9230414214837891, "grad_norm": 1.2413580417633057, "learning_rate": 3.9208247367011574e-05, "loss": 1.1179, "step": 3000 }, { "epoch": 0.9245798238529287, "grad_norm": 2.127316951751709, "learning_rate": 3.9175089135789987e-05, "loss": 1.059, "step": 3005 }, { "epoch": 0.9261182262220684, "grad_norm": 2.3183646202087402, "learning_rate": 3.914189411232858e-05, "loss": 1.1931, "step": 3010 }, { "epoch": 0.927656628591208, "grad_norm": 1.7735154628753662, "learning_rate": 3.9108662382786925e-05, "loss": 1.1452, "step": 3015 }, { "epoch": 0.9291950309603477, "grad_norm": 1.7767513990402222, "learning_rate": 3.9075394033419826e-05, "loss": 1.0446, "step": 3020 }, { "epoch": 0.9307334333294873, "grad_norm": 0.9649571776390076, "learning_rate": 3.904208915057716e-05, "loss": 1.1965, "step": 3025 }, { "epoch": 0.9322718356986269, "grad_norm": 1.123539686203003, "learning_rate": 3.900874782070362e-05, "loss": 1.1604, "step": 3030 }, { "epoch": 0.9338102380677666, "grad_norm": 1.48845374584198, "learning_rate": 3.897537013033849e-05, "loss": 1.1589, "step": 3035 }, { "epoch": 0.9353486404369062, "grad_norm": 0.8311707377433777, "learning_rate": 3.8941956166115454e-05, "loss": 1.2621, "step": 3040 }, { "epoch": 0.936887042806046, "grad_norm": 1.6711184978485107, "learning_rate": 3.890850601476233e-05, "loss": 1.2114, "step": 3045 }, { "epoch": 0.9384254451751856, "grad_norm": 1.2482969760894775, "learning_rate": 3.887501976310086e-05, "loss": 1.1918, "step": 3050 }, { "epoch": 0.9399638475443253, "grad_norm": 1.0476139783859253, "learning_rate": 3.884149749804648e-05, "loss": 1.1405, "step": 3055 }, { "epoch": 0.9415022499134649, "grad_norm": 0.864454984664917, "learning_rate": 3.880793930660813e-05, "loss": 1.1581, "step": 3060 }, { "epoch": 0.9430406522826045, "grad_norm": 1.3563313484191895, "learning_rate": 3.877434527588798e-05, "loss": 1.1005, "step": 3065 }, { "epoch": 0.9445790546517442, "grad_norm": 1.5107038021087646, "learning_rate": 3.8740715493081203e-05, "loss": 1.0558, "step": 3070 }, { "epoch": 0.9461174570208838, "grad_norm": 1.4398322105407715, "learning_rate": 3.87070500454758e-05, "loss": 1.2505, "step": 3075 }, { "epoch": 0.9476558593900235, "grad_norm": 1.0815845727920532, "learning_rate": 3.867334902045234e-05, "loss": 1.2257, "step": 3080 }, { "epoch": 0.9491942617591631, "grad_norm": 1.0675805807113647, "learning_rate": 3.863961250548371e-05, "loss": 1.0496, "step": 3085 }, { "epoch": 0.9507326641283027, "grad_norm": 1.3433740139007568, "learning_rate": 3.860584058813495e-05, "loss": 1.2241, "step": 3090 }, { "epoch": 0.9522710664974424, "grad_norm": 0.8536397218704224, "learning_rate": 3.8572033356062943e-05, "loss": 1.2073, "step": 3095 }, { "epoch": 0.953809468866582, "grad_norm": 1.1579813957214355, "learning_rate": 3.853819089701627e-05, "loss": 1.2149, "step": 3100 }, { "epoch": 0.9553478712357217, "grad_norm": 1.0714601278305054, "learning_rate": 3.850431329883493e-05, "loss": 1.2619, "step": 3105 }, { "epoch": 0.9568862736048613, "grad_norm": 1.3584688901901245, "learning_rate": 3.847040064945014e-05, "loss": 1.1022, "step": 3110 }, { "epoch": 0.958424675974001, "grad_norm": 3.527039051055908, "learning_rate": 3.843645303688408e-05, "loss": 1.3198, "step": 3115 }, { "epoch": 0.9599630783431407, "grad_norm": 0.9427330493927002, "learning_rate": 3.840247054924968e-05, "loss": 1.1009, "step": 3120 }, { "epoch": 0.9615014807122803, "grad_norm": 1.2566494941711426, "learning_rate": 3.836845327475041e-05, "loss": 1.1355, "step": 3125 }, { "epoch": 0.96303988308142, "grad_norm": 1.2846300601959229, "learning_rate": 3.833440130167999e-05, "loss": 1.1365, "step": 3130 }, { "epoch": 0.9645782854505596, "grad_norm": 0.7137424945831299, "learning_rate": 3.830031471842226e-05, "loss": 1.0481, "step": 3135 }, { "epoch": 0.9661166878196993, "grad_norm": 2.03141188621521, "learning_rate": 3.826619361345084e-05, "loss": 1.32, "step": 3140 }, { "epoch": 0.9676550901888389, "grad_norm": 2.1782736778259277, "learning_rate": 3.823203807532898e-05, "loss": 1.2055, "step": 3145 }, { "epoch": 0.9691934925579785, "grad_norm": 0.9288005232810974, "learning_rate": 3.8197848192709286e-05, "loss": 1.1541, "step": 3150 }, { "epoch": 0.9707318949271182, "grad_norm": 1.3783565759658813, "learning_rate": 3.816362405433353e-05, "loss": 1.213, "step": 3155 }, { "epoch": 0.9722702972962578, "grad_norm": 2.103428363800049, "learning_rate": 3.81293657490324e-05, "loss": 1.2212, "step": 3160 }, { "epoch": 0.9738086996653975, "grad_norm": 0.9342910051345825, "learning_rate": 3.809507336572522e-05, "loss": 1.2327, "step": 3165 }, { "epoch": 0.9753471020345371, "grad_norm": 1.7723548412322998, "learning_rate": 3.80607469934198e-05, "loss": 1.1732, "step": 3170 }, { "epoch": 0.9768855044036768, "grad_norm": 1.1316125392913818, "learning_rate": 3.80263867212122e-05, "loss": 1.1486, "step": 3175 }, { "epoch": 0.9784239067728164, "grad_norm": 0.8731313347816467, "learning_rate": 3.79919926382864e-05, "loss": 1.2067, "step": 3180 }, { "epoch": 0.979962309141956, "grad_norm": 0.8731054067611694, "learning_rate": 3.795756483391419e-05, "loss": 1.1672, "step": 3185 }, { "epoch": 0.9815007115110957, "grad_norm": 1.165216326713562, "learning_rate": 3.792310339745486e-05, "loss": 1.1711, "step": 3190 }, { "epoch": 0.9830391138802353, "grad_norm": 1.8140064477920532, "learning_rate": 3.788860841835502e-05, "loss": 1.1883, "step": 3195 }, { "epoch": 0.9845775162493751, "grad_norm": 1.9973480701446533, "learning_rate": 3.785407998614831e-05, "loss": 1.0442, "step": 3200 }, { "epoch": 0.9861159186185147, "grad_norm": 1.0596879720687866, "learning_rate": 3.781951819045521e-05, "loss": 1.2081, "step": 3205 }, { "epoch": 0.9876543209876543, "grad_norm": 0.9382377862930298, "learning_rate": 3.778492312098283e-05, "loss": 1.2106, "step": 3210 }, { "epoch": 0.989192723356794, "grad_norm": 1.565450668334961, "learning_rate": 3.7750294867524585e-05, "loss": 1.0911, "step": 3215 }, { "epoch": 0.9907311257259336, "grad_norm": 1.8136720657348633, "learning_rate": 3.771563351996006e-05, "loss": 1.1857, "step": 3220 }, { "epoch": 0.9922695280950733, "grad_norm": 1.7879712581634521, "learning_rate": 3.7680939168254733e-05, "loss": 1.2005, "step": 3225 }, { "epoch": 0.9938079304642129, "grad_norm": 0.8127261996269226, "learning_rate": 3.7646211902459736e-05, "loss": 1.1781, "step": 3230 }, { "epoch": 0.9953463328333526, "grad_norm": 0.8571304082870483, "learning_rate": 3.761145181271164e-05, "loss": 1.2937, "step": 3235 }, { "epoch": 0.9968847352024922, "grad_norm": 0.9852257966995239, "learning_rate": 3.757665898923223e-05, "loss": 1.0958, "step": 3240 }, { "epoch": 0.9984231375716318, "grad_norm": 1.0459402799606323, "learning_rate": 3.75418335223282e-05, "loss": 1.353, "step": 3245 }, { "epoch": 0.9999615399407715, "grad_norm": 0.9802791476249695, "learning_rate": 3.750697550239102e-05, "loss": 1.1591, "step": 3250 }, { "epoch": 1.0014999423099111, "grad_norm": 1.1630258560180664, "learning_rate": 3.747208501989663e-05, "loss": 1.1936, "step": 3255 }, { "epoch": 1.0030383446790507, "grad_norm": 1.5038493871688843, "learning_rate": 3.743716216540526e-05, "loss": 1.1801, "step": 3260 }, { "epoch": 1.0045767470481906, "grad_norm": 3.152625799179077, "learning_rate": 3.7402207029561116e-05, "loss": 1.0709, "step": 3265 }, { "epoch": 1.0061151494173302, "grad_norm": 0.881455659866333, "learning_rate": 3.736721970309222e-05, "loss": 1.2295, "step": 3270 }, { "epoch": 1.0076535517864698, "grad_norm": 1.6096093654632568, "learning_rate": 3.7332200276810145e-05, "loss": 1.1981, "step": 3275 }, { "epoch": 1.0091919541556094, "grad_norm": 0.9019604325294495, "learning_rate": 3.7297148841609785e-05, "loss": 1.1362, "step": 3280 }, { "epoch": 1.010730356524749, "grad_norm": 1.8702175617218018, "learning_rate": 3.72620654884691e-05, "loss": 1.0372, "step": 3285 }, { "epoch": 1.0122687588938888, "grad_norm": 0.9774813652038574, "learning_rate": 3.722695030844891e-05, "loss": 1.0715, "step": 3290 }, { "epoch": 1.0138071612630284, "grad_norm": 2.0321578979492188, "learning_rate": 3.7191803392692626e-05, "loss": 1.2701, "step": 3295 }, { "epoch": 1.015345563632168, "grad_norm": 0.8944913744926453, "learning_rate": 3.715662483242605e-05, "loss": 1.1084, "step": 3300 }, { "epoch": 1.0168839660013076, "grad_norm": 0.9244990944862366, "learning_rate": 3.712141471895711e-05, "loss": 1.1145, "step": 3305 }, { "epoch": 1.0184223683704472, "grad_norm": 1.136596441268921, "learning_rate": 3.708617314367562e-05, "loss": 1.0873, "step": 3310 }, { "epoch": 1.019960770739587, "grad_norm": 0.8394299745559692, "learning_rate": 3.7050900198053096e-05, "loss": 1.1808, "step": 3315 }, { "epoch": 1.0214991731087266, "grad_norm": 1.818331003189087, "learning_rate": 3.701559597364242e-05, "loss": 1.1542, "step": 3320 }, { "epoch": 1.0230375754778662, "grad_norm": 0.9814029932022095, "learning_rate": 3.6980260562077694e-05, "loss": 1.1148, "step": 3325 }, { "epoch": 1.0245759778470058, "grad_norm": 0.9357233643531799, "learning_rate": 3.6944894055073984e-05, "loss": 1.1327, "step": 3330 }, { "epoch": 1.0261143802161454, "grad_norm": 1.1413395404815674, "learning_rate": 3.690949654442701e-05, "loss": 1.053, "step": 3335 }, { "epoch": 1.0276527825852853, "grad_norm": 0.9855743050575256, "learning_rate": 3.6874068122013035e-05, "loss": 1.0974, "step": 3340 }, { "epoch": 1.0291911849544249, "grad_norm": 1.7641159296035767, "learning_rate": 3.6838608879788496e-05, "loss": 1.2385, "step": 3345 }, { "epoch": 1.0307295873235645, "grad_norm": 1.2974658012390137, "learning_rate": 3.680311890978985e-05, "loss": 1.2269, "step": 3350 }, { "epoch": 1.032267989692704, "grad_norm": 1.1772881746292114, "learning_rate": 3.6767598304133324e-05, "loss": 1.2326, "step": 3355 }, { "epoch": 1.0338063920618439, "grad_norm": 1.706217646598816, "learning_rate": 3.673204715501461e-05, "loss": 1.0957, "step": 3360 }, { "epoch": 1.0353447944309835, "grad_norm": 1.6567659378051758, "learning_rate": 3.669646555470874e-05, "loss": 1.1362, "step": 3365 }, { "epoch": 1.036883196800123, "grad_norm": 2.460679769515991, "learning_rate": 3.6660853595569754e-05, "loss": 1.1386, "step": 3370 }, { "epoch": 1.0384215991692627, "grad_norm": 1.0992755889892578, "learning_rate": 3.662521137003048e-05, "loss": 1.0517, "step": 3375 }, { "epoch": 1.0399600015384023, "grad_norm": 1.0326275825500488, "learning_rate": 3.6589538970602325e-05, "loss": 1.1088, "step": 3380 }, { "epoch": 1.0414984039075421, "grad_norm": 0.9226017594337463, "learning_rate": 3.6553836489875e-05, "loss": 1.1799, "step": 3385 }, { "epoch": 1.0430368062766817, "grad_norm": 2.348419189453125, "learning_rate": 3.65181040205163e-05, "loss": 1.1934, "step": 3390 }, { "epoch": 1.0445752086458213, "grad_norm": 0.982110857963562, "learning_rate": 3.648234165527185e-05, "loss": 1.1904, "step": 3395 }, { "epoch": 1.046113611014961, "grad_norm": 1.326486587524414, "learning_rate": 3.6446549486964884e-05, "loss": 1.1015, "step": 3400 }, { "epoch": 1.0476520133841005, "grad_norm": 1.5188394784927368, "learning_rate": 3.641072760849599e-05, "loss": 1.1911, "step": 3405 }, { "epoch": 1.0491904157532403, "grad_norm": 0.9235185384750366, "learning_rate": 3.637487611284285e-05, "loss": 1.1657, "step": 3410 }, { "epoch": 1.05072881812238, "grad_norm": 1.6619833707809448, "learning_rate": 3.633899509306004e-05, "loss": 1.108, "step": 3415 }, { "epoch": 1.0522672204915196, "grad_norm": 1.4106361865997314, "learning_rate": 3.630308464227877e-05, "loss": 1.2603, "step": 3420 }, { "epoch": 1.0538056228606592, "grad_norm": 1.5941157341003418, "learning_rate": 3.626714485370662e-05, "loss": 1.2641, "step": 3425 }, { "epoch": 1.0553440252297988, "grad_norm": 1.66989004611969, "learning_rate": 3.6231175820627344e-05, "loss": 1.2124, "step": 3430 }, { "epoch": 1.0568824275989386, "grad_norm": 1.6584402322769165, "learning_rate": 3.6195177636400565e-05, "loss": 1.1966, "step": 3435 }, { "epoch": 1.0584208299680782, "grad_norm": 1.228973150253296, "learning_rate": 3.615915039446162e-05, "loss": 0.9945, "step": 3440 }, { "epoch": 1.0599592323372178, "grad_norm": 2.353611946105957, "learning_rate": 3.6123094188321205e-05, "loss": 1.1743, "step": 3445 }, { "epoch": 1.0614976347063574, "grad_norm": 1.3476722240447998, "learning_rate": 3.608700911156525e-05, "loss": 1.0994, "step": 3450 }, { "epoch": 1.0630360370754972, "grad_norm": 2.2004075050354004, "learning_rate": 3.60508952578546e-05, "loss": 1.0864, "step": 3455 }, { "epoch": 1.0645744394446368, "grad_norm": 1.1636754274368286, "learning_rate": 3.601475272092478e-05, "loss": 1.1245, "step": 3460 }, { "epoch": 1.0661128418137764, "grad_norm": 1.2166703939437866, "learning_rate": 3.597858159458578e-05, "loss": 1.1763, "step": 3465 }, { "epoch": 1.067651244182916, "grad_norm": 1.0816514492034912, "learning_rate": 3.594238197272177e-05, "loss": 1.2429, "step": 3470 }, { "epoch": 1.0691896465520556, "grad_norm": 1.839756965637207, "learning_rate": 3.5906153949290935e-05, "loss": 1.1356, "step": 3475 }, { "epoch": 1.0707280489211954, "grad_norm": 1.1021186113357544, "learning_rate": 3.5869897618325126e-05, "loss": 1.2321, "step": 3480 }, { "epoch": 1.072266451290335, "grad_norm": 1.5904773473739624, "learning_rate": 3.5833613073929684e-05, "loss": 1.1777, "step": 3485 }, { "epoch": 1.0738048536594746, "grad_norm": 1.026505947113037, "learning_rate": 3.579730041028317e-05, "loss": 1.1767, "step": 3490 }, { "epoch": 1.0753432560286142, "grad_norm": 1.023456335067749, "learning_rate": 3.576095972163718e-05, "loss": 1.1832, "step": 3495 }, { "epoch": 1.0768816583977538, "grad_norm": 0.9071354269981384, "learning_rate": 3.572459110231598e-05, "loss": 1.0945, "step": 3500 }, { "epoch": 1.0784200607668937, "grad_norm": 1.8704088926315308, "learning_rate": 3.568819464671637e-05, "loss": 1.1804, "step": 3505 }, { "epoch": 1.0799584631360333, "grad_norm": 2.5177040100097656, "learning_rate": 3.565177044930739e-05, "loss": 1.1204, "step": 3510 }, { "epoch": 1.0814968655051729, "grad_norm": 2.31343674659729, "learning_rate": 3.56153186046301e-05, "loss": 1.0483, "step": 3515 }, { "epoch": 1.0830352678743125, "grad_norm": 1.1547645330429077, "learning_rate": 3.5578839207297306e-05, "loss": 1.1178, "step": 3520 }, { "epoch": 1.084573670243452, "grad_norm": 1.569669246673584, "learning_rate": 3.5542332351993324e-05, "loss": 1.1746, "step": 3525 }, { "epoch": 1.086112072612592, "grad_norm": 1.2639979124069214, "learning_rate": 3.550579813347376e-05, "loss": 1.0747, "step": 3530 }, { "epoch": 1.0876504749817315, "grad_norm": 0.9912775754928589, "learning_rate": 3.546923664656523e-05, "loss": 1.1877, "step": 3535 }, { "epoch": 1.089188877350871, "grad_norm": 1.9763455390930176, "learning_rate": 3.54326479861651e-05, "loss": 1.2445, "step": 3540 }, { "epoch": 1.0907272797200107, "grad_norm": 0.9376320838928223, "learning_rate": 3.539603224724133e-05, "loss": 1.2454, "step": 3545 }, { "epoch": 1.0922656820891503, "grad_norm": 1.8034776449203491, "learning_rate": 3.535938952483211e-05, "loss": 1.2255, "step": 3550 }, { "epoch": 1.0938040844582901, "grad_norm": 1.3023470640182495, "learning_rate": 3.532271991404568e-05, "loss": 1.1488, "step": 3555 }, { "epoch": 1.0953424868274297, "grad_norm": 1.3908381462097168, "learning_rate": 3.528602351006006e-05, "loss": 1.1498, "step": 3560 }, { "epoch": 1.0968808891965693, "grad_norm": 2.6319515705108643, "learning_rate": 3.524930040812286e-05, "loss": 1.2605, "step": 3565 }, { "epoch": 1.098419291565709, "grad_norm": 1.7958158254623413, "learning_rate": 3.521255070355093e-05, "loss": 1.1793, "step": 3570 }, { "epoch": 1.0999576939348485, "grad_norm": 1.8693649768829346, "learning_rate": 3.5175774491730204e-05, "loss": 1.2756, "step": 3575 }, { "epoch": 1.1014960963039884, "grad_norm": 4.634489059448242, "learning_rate": 3.513897186811539e-05, "loss": 1.1377, "step": 3580 }, { "epoch": 1.103034498673128, "grad_norm": 1.2640043497085571, "learning_rate": 3.510214292822978e-05, "loss": 1.0172, "step": 3585 }, { "epoch": 1.1045729010422676, "grad_norm": 0.9812289476394653, "learning_rate": 3.506528776766495e-05, "loss": 1.2052, "step": 3590 }, { "epoch": 1.1061113034114072, "grad_norm": 0.8409488797187805, "learning_rate": 3.5028406482080536e-05, "loss": 1.2435, "step": 3595 }, { "epoch": 1.107649705780547, "grad_norm": 0.9872679710388184, "learning_rate": 3.499149916720398e-05, "loss": 1.06, "step": 3600 }, { "epoch": 1.1091881081496866, "grad_norm": 2.1323914527893066, "learning_rate": 3.495456591883031e-05, "loss": 1.0506, "step": 3605 }, { "epoch": 1.1107265105188262, "grad_norm": 1.2901802062988281, "learning_rate": 3.4917606832821824e-05, "loss": 1.099, "step": 3610 }, { "epoch": 1.1122649128879658, "grad_norm": 1.414628505706787, "learning_rate": 3.488062200510791e-05, "loss": 1.1987, "step": 3615 }, { "epoch": 1.1138033152571054, "grad_norm": 1.1899245977401733, "learning_rate": 3.4843611531684786e-05, "loss": 1.1429, "step": 3620 }, { "epoch": 1.1153417176262452, "grad_norm": 0.9524982571601868, "learning_rate": 3.480657550861518e-05, "loss": 1.1281, "step": 3625 }, { "epoch": 1.1168801199953848, "grad_norm": 1.219228744506836, "learning_rate": 3.47695140320282e-05, "loss": 1.1641, "step": 3630 }, { "epoch": 1.1184185223645244, "grad_norm": 2.172743320465088, "learning_rate": 3.473242719811897e-05, "loss": 1.2067, "step": 3635 }, { "epoch": 1.119956924733664, "grad_norm": 1.288135051727295, "learning_rate": 3.4695315103148454e-05, "loss": 1.1588, "step": 3640 }, { "epoch": 1.1214953271028036, "grad_norm": 1.3018666505813599, "learning_rate": 3.465817784344318e-05, "loss": 1.1281, "step": 3645 }, { "epoch": 1.1230337294719435, "grad_norm": 1.5916048288345337, "learning_rate": 3.462101551539499e-05, "loss": 1.2701, "step": 3650 }, { "epoch": 1.124572131841083, "grad_norm": 1.3709497451782227, "learning_rate": 3.45838282154608e-05, "loss": 1.223, "step": 3655 }, { "epoch": 1.1261105342102227, "grad_norm": 1.505096673965454, "learning_rate": 3.4546616040162334e-05, "loss": 1.1068, "step": 3660 }, { "epoch": 1.1276489365793623, "grad_norm": 1.1039677858352661, "learning_rate": 3.450937908608587e-05, "loss": 1.216, "step": 3665 }, { "epoch": 1.129187338948502, "grad_norm": 1.4067137241363525, "learning_rate": 3.4472117449882025e-05, "loss": 1.1304, "step": 3670 }, { "epoch": 1.1307257413176417, "grad_norm": 0.9000976085662842, "learning_rate": 3.443483122826547e-05, "loss": 1.1709, "step": 3675 }, { "epoch": 1.1322641436867813, "grad_norm": 1.4269496202468872, "learning_rate": 3.439752051801467e-05, "loss": 1.2391, "step": 3680 }, { "epoch": 1.133802546055921, "grad_norm": 0.959568977355957, "learning_rate": 3.436018541597169e-05, "loss": 1.1445, "step": 3685 }, { "epoch": 1.1353409484250605, "grad_norm": 0.9494476318359375, "learning_rate": 3.4322826019041864e-05, "loss": 1.1943, "step": 3690 }, { "epoch": 1.1368793507942003, "grad_norm": 0.8512333631515503, "learning_rate": 3.428544242419362e-05, "loss": 1.2019, "step": 3695 }, { "epoch": 1.13841775316334, "grad_norm": 0.9752541780471802, "learning_rate": 3.4248034728458175e-05, "loss": 1.1219, "step": 3700 }, { "epoch": 1.1399561555324795, "grad_norm": 1.4414304494857788, "learning_rate": 3.4210603028929295e-05, "loss": 1.2591, "step": 3705 }, { "epoch": 1.1414945579016191, "grad_norm": 1.0669443607330322, "learning_rate": 3.417314742276308e-05, "loss": 1.1153, "step": 3710 }, { "epoch": 1.1430329602707587, "grad_norm": 1.064900279045105, "learning_rate": 3.413566800717762e-05, "loss": 1.2842, "step": 3715 }, { "epoch": 1.1445713626398986, "grad_norm": 1.052623987197876, "learning_rate": 3.409816487945286e-05, "loss": 1.1178, "step": 3720 }, { "epoch": 1.1461097650090382, "grad_norm": 1.8392632007598877, "learning_rate": 3.4060638136930304e-05, "loss": 1.1444, "step": 3725 }, { "epoch": 1.1476481673781778, "grad_norm": 1.3130204677581787, "learning_rate": 3.402308787701268e-05, "loss": 1.1948, "step": 3730 }, { "epoch": 1.1491865697473174, "grad_norm": 1.6042733192443848, "learning_rate": 3.398551419716382e-05, "loss": 1.096, "step": 3735 }, { "epoch": 1.150724972116457, "grad_norm": 1.7846676111221313, "learning_rate": 3.3947917194908306e-05, "loss": 1.2706, "step": 3740 }, { "epoch": 1.1522633744855968, "grad_norm": 1.7049797773361206, "learning_rate": 3.3910296967831266e-05, "loss": 1.1966, "step": 3745 }, { "epoch": 1.1538017768547364, "grad_norm": 0.9584757685661316, "learning_rate": 3.3872653613578134e-05, "loss": 1.1628, "step": 3750 }, { "epoch": 1.155340179223876, "grad_norm": 1.128525972366333, "learning_rate": 3.383498722985432e-05, "loss": 1.163, "step": 3755 }, { "epoch": 1.1568785815930156, "grad_norm": 1.1508246660232544, "learning_rate": 3.379729791442506e-05, "loss": 1.1373, "step": 3760 }, { "epoch": 1.1584169839621552, "grad_norm": 0.9009411334991455, "learning_rate": 3.375958576511508e-05, "loss": 1.1696, "step": 3765 }, { "epoch": 1.159955386331295, "grad_norm": 1.28462553024292, "learning_rate": 3.372185087980838e-05, "loss": 1.0845, "step": 3770 }, { "epoch": 1.1614937887004346, "grad_norm": 1.2223782539367676, "learning_rate": 3.368409335644798e-05, "loss": 1.1714, "step": 3775 }, { "epoch": 1.1630321910695742, "grad_norm": 1.155945062637329, "learning_rate": 3.364631329303564e-05, "loss": 1.1125, "step": 3780 }, { "epoch": 1.1645705934387138, "grad_norm": 2.37724232673645, "learning_rate": 3.3608510787631654e-05, "loss": 1.2847, "step": 3785 }, { "epoch": 1.1661089958078534, "grad_norm": 1.2505033016204834, "learning_rate": 3.357068593835453e-05, "loss": 1.1486, "step": 3790 }, { "epoch": 1.1676473981769933, "grad_norm": 1.1727055311203003, "learning_rate": 3.35328388433808e-05, "loss": 1.1909, "step": 3795 }, { "epoch": 1.1691858005461329, "grad_norm": 2.013213872909546, "learning_rate": 3.3494969600944715e-05, "loss": 1.1553, "step": 3800 }, { "epoch": 1.1707242029152725, "grad_norm": 1.5769529342651367, "learning_rate": 3.345707830933803e-05, "loss": 1.2083, "step": 3805 }, { "epoch": 1.172262605284412, "grad_norm": 1.1164283752441406, "learning_rate": 3.3419165066909705e-05, "loss": 1.1371, "step": 3810 }, { "epoch": 1.1738010076535517, "grad_norm": 1.1431792974472046, "learning_rate": 3.338122997206571e-05, "loss": 1.1377, "step": 3815 }, { "epoch": 1.1753394100226915, "grad_norm": 1.0517457723617554, "learning_rate": 3.33432731232687e-05, "loss": 1.1152, "step": 3820 }, { "epoch": 1.176877812391831, "grad_norm": 1.159220814704895, "learning_rate": 3.3305294619037805e-05, "loss": 1.1877, "step": 3825 }, { "epoch": 1.1784162147609707, "grad_norm": 1.6318359375, "learning_rate": 3.326729455794838e-05, "loss": 1.1014, "step": 3830 }, { "epoch": 1.1799546171301103, "grad_norm": 1.4239236116409302, "learning_rate": 3.322927303863171e-05, "loss": 1.2185, "step": 3835 }, { "epoch": 1.1814930194992501, "grad_norm": 2.194650650024414, "learning_rate": 3.319123015977478e-05, "loss": 1.1972, "step": 3840 }, { "epoch": 1.1830314218683897, "grad_norm": 0.9801701307296753, "learning_rate": 3.315316602012001e-05, "loss": 1.1984, "step": 3845 }, { "epoch": 1.1845698242375293, "grad_norm": 1.7991083860397339, "learning_rate": 3.311508071846504e-05, "loss": 1.1742, "step": 3850 }, { "epoch": 1.186108226606669, "grad_norm": 2.0683629512786865, "learning_rate": 3.307697435366237e-05, "loss": 1.1949, "step": 3855 }, { "epoch": 1.1876466289758085, "grad_norm": 1.1672157049179077, "learning_rate": 3.303884702461924e-05, "loss": 1.1302, "step": 3860 }, { "epoch": 1.1891850313449484, "grad_norm": 2.2780561447143555, "learning_rate": 3.300069883029727e-05, "loss": 1.1606, "step": 3865 }, { "epoch": 1.190723433714088, "grad_norm": 0.9155642986297607, "learning_rate": 3.296252986971222e-05, "loss": 1.2142, "step": 3870 }, { "epoch": 1.1922618360832276, "grad_norm": 0.7124369740486145, "learning_rate": 3.29243402419338e-05, "loss": 1.2005, "step": 3875 }, { "epoch": 1.1938002384523672, "grad_norm": 0.9560266137123108, "learning_rate": 3.2886130046085306e-05, "loss": 1.1488, "step": 3880 }, { "epoch": 1.195338640821507, "grad_norm": 2.2523956298828125, "learning_rate": 3.284789938134346e-05, "loss": 1.1306, "step": 3885 }, { "epoch": 1.1968770431906466, "grad_norm": 1.153905987739563, "learning_rate": 3.2809648346938105e-05, "loss": 1.2155, "step": 3890 }, { "epoch": 1.1984154455597862, "grad_norm": 1.6172610521316528, "learning_rate": 3.2771377042151944e-05, "loss": 1.1306, "step": 3895 }, { "epoch": 1.1999538479289258, "grad_norm": 1.023131012916565, "learning_rate": 3.2733085566320285e-05, "loss": 1.1581, "step": 3900 }, { "epoch": 1.2014922502980654, "grad_norm": 1.280826449394226, "learning_rate": 3.2694774018830816e-05, "loss": 1.238, "step": 3905 }, { "epoch": 1.2030306526672052, "grad_norm": 0.9869760870933533, "learning_rate": 3.265644249912331e-05, "loss": 1.2139, "step": 3910 }, { "epoch": 1.2045690550363448, "grad_norm": 2.917651414871216, "learning_rate": 3.261809110668937e-05, "loss": 0.9819, "step": 3915 }, { "epoch": 1.2061074574054844, "grad_norm": 0.8974500298500061, "learning_rate": 3.25797199410722e-05, "loss": 1.144, "step": 3920 }, { "epoch": 1.207645859774624, "grad_norm": 1.1364940404891968, "learning_rate": 3.254132910186631e-05, "loss": 1.2498, "step": 3925 }, { "epoch": 1.2091842621437636, "grad_norm": 1.5066908597946167, "learning_rate": 3.2502918688717285e-05, "loss": 1.1648, "step": 3930 }, { "epoch": 1.2107226645129034, "grad_norm": 1.5828046798706055, "learning_rate": 3.2464488801321494e-05, "loss": 1.2591, "step": 3935 }, { "epoch": 1.212261066882043, "grad_norm": 1.869602918624878, "learning_rate": 3.2426039539425876e-05, "loss": 1.2592, "step": 3940 }, { "epoch": 1.2137994692511826, "grad_norm": 0.9975070357322693, "learning_rate": 3.2387571002827656e-05, "loss": 1.2349, "step": 3945 }, { "epoch": 1.2153378716203223, "grad_norm": 2.0380489826202393, "learning_rate": 3.234908329137406e-05, "loss": 1.1056, "step": 3950 }, { "epoch": 1.2168762739894619, "grad_norm": 1.3352590799331665, "learning_rate": 3.231057650496214e-05, "loss": 1.1998, "step": 3955 }, { "epoch": 1.2184146763586017, "grad_norm": 1.1479742527008057, "learning_rate": 3.2272050743538385e-05, "loss": 1.1915, "step": 3960 }, { "epoch": 1.2199530787277413, "grad_norm": 1.3033512830734253, "learning_rate": 3.22335061070986e-05, "loss": 1.1046, "step": 3965 }, { "epoch": 1.2214914810968809, "grad_norm": 1.171144723892212, "learning_rate": 3.219494269568753e-05, "loss": 1.1206, "step": 3970 }, { "epoch": 1.2230298834660205, "grad_norm": 1.440745234489441, "learning_rate": 3.215636060939869e-05, "loss": 1.112, "step": 3975 }, { "epoch": 1.22456828583516, "grad_norm": 1.118881344795227, "learning_rate": 3.211775994837405e-05, "loss": 1.166, "step": 3980 }, { "epoch": 1.2261066882043, "grad_norm": 1.8601933717727661, "learning_rate": 3.207914081280379e-05, "loss": 1.2695, "step": 3985 }, { "epoch": 1.2276450905734395, "grad_norm": 1.5979291200637817, "learning_rate": 3.204050330292604e-05, "loss": 1.1158, "step": 3990 }, { "epoch": 1.2291834929425791, "grad_norm": 1.368026614189148, "learning_rate": 3.2001847519026646e-05, "loss": 1.1164, "step": 3995 }, { "epoch": 1.2307218953117187, "grad_norm": 1.9506460428237915, "learning_rate": 3.196317356143884e-05, "loss": 1.1607, "step": 4000 }, { "epoch": 1.2322602976808583, "grad_norm": 1.1299954652786255, "learning_rate": 3.192448153054306e-05, "loss": 1.1518, "step": 4005 }, { "epoch": 1.2337987000499981, "grad_norm": 1.389312982559204, "learning_rate": 3.188577152676666e-05, "loss": 1.0802, "step": 4010 }, { "epoch": 1.2353371024191377, "grad_norm": 1.1280195713043213, "learning_rate": 3.1847043650583604e-05, "loss": 1.1637, "step": 4015 }, { "epoch": 1.2368755047882773, "grad_norm": 0.959011971950531, "learning_rate": 3.180829800251428e-05, "loss": 1.1652, "step": 4020 }, { "epoch": 1.238413907157417, "grad_norm": 1.3471107482910156, "learning_rate": 3.1769534683125195e-05, "loss": 1.1713, "step": 4025 }, { "epoch": 1.2399523095265566, "grad_norm": 1.7551147937774658, "learning_rate": 3.1730753793028724e-05, "loss": 1.1362, "step": 4030 }, { "epoch": 1.2414907118956964, "grad_norm": 1.71892249584198, "learning_rate": 3.169195543288283e-05, "loss": 1.1525, "step": 4035 }, { "epoch": 1.243029114264836, "grad_norm": 1.1600077152252197, "learning_rate": 3.165313970339087e-05, "loss": 1.1085, "step": 4040 }, { "epoch": 1.2445675166339756, "grad_norm": 0.9502371549606323, "learning_rate": 3.1614306705301204e-05, "loss": 1.2047, "step": 4045 }, { "epoch": 1.2461059190031152, "grad_norm": 1.5982012748718262, "learning_rate": 3.15754565394071e-05, "loss": 1.167, "step": 4050 }, { "epoch": 1.247644321372255, "grad_norm": 1.2877519130706787, "learning_rate": 3.153658930654631e-05, "loss": 1.0551, "step": 4055 }, { "epoch": 1.2491827237413946, "grad_norm": 0.9973001480102539, "learning_rate": 3.149770510760093e-05, "loss": 1.0985, "step": 4060 }, { "epoch": 1.2507211261105342, "grad_norm": 2.4550881385803223, "learning_rate": 3.14588040434971e-05, "loss": 1.2302, "step": 4065 }, { "epoch": 1.2522595284796738, "grad_norm": 1.004104733467102, "learning_rate": 3.1419886215204694e-05, "loss": 1.1658, "step": 4070 }, { "epoch": 1.2537979308488136, "grad_norm": 2.3353803157806396, "learning_rate": 3.138095172373714e-05, "loss": 1.1095, "step": 4075 }, { "epoch": 1.2553363332179532, "grad_norm": 3.044813394546509, "learning_rate": 3.134200067015108e-05, "loss": 1.2342, "step": 4080 }, { "epoch": 1.2568747355870928, "grad_norm": 1.2953006029129028, "learning_rate": 3.130303315554617e-05, "loss": 1.0888, "step": 4085 }, { "epoch": 1.2584131379562324, "grad_norm": 1.468589425086975, "learning_rate": 3.1264049281064775e-05, "loss": 1.0256, "step": 4090 }, { "epoch": 1.259951540325372, "grad_norm": 1.058424949645996, "learning_rate": 3.1225049147891737e-05, "loss": 1.1953, "step": 4095 }, { "epoch": 1.2614899426945119, "grad_norm": 1.2426944971084595, "learning_rate": 3.118603285725409e-05, "loss": 1.2162, "step": 4100 }, { "epoch": 1.2630283450636515, "grad_norm": 1.318311095237732, "learning_rate": 3.114700051042081e-05, "loss": 1.0836, "step": 4105 }, { "epoch": 1.264566747432791, "grad_norm": 1.1298370361328125, "learning_rate": 3.110795220870253e-05, "loss": 1.1453, "step": 4110 }, { "epoch": 1.2661051498019307, "grad_norm": 1.2516565322875977, "learning_rate": 3.1068888053451334e-05, "loss": 1.1972, "step": 4115 }, { "epoch": 1.2676435521710703, "grad_norm": 1.364283800125122, "learning_rate": 3.102980814606042e-05, "loss": 1.0966, "step": 4120 }, { "epoch": 1.26918195454021, "grad_norm": 1.593657374382019, "learning_rate": 3.099071258796387e-05, "loss": 1.1734, "step": 4125 }, { "epoch": 1.2707203569093497, "grad_norm": 1.190921425819397, "learning_rate": 3.0951601480636404e-05, "loss": 1.2381, "step": 4130 }, { "epoch": 1.2722587592784893, "grad_norm": 1.0939090251922607, "learning_rate": 3.091247492559312e-05, "loss": 1.0485, "step": 4135 }, { "epoch": 1.273797161647629, "grad_norm": 1.5152145624160767, "learning_rate": 3.087333302438916e-05, "loss": 1.0804, "step": 4140 }, { "epoch": 1.2753355640167685, "grad_norm": 1.4711779356002808, "learning_rate": 3.0834175878619546e-05, "loss": 1.0904, "step": 4145 }, { "epoch": 1.2768739663859083, "grad_norm": 1.8078420162200928, "learning_rate": 3.0795003589918834e-05, "loss": 1.1033, "step": 4150 }, { "epoch": 1.278412368755048, "grad_norm": 1.389979600906372, "learning_rate": 3.0755816259960915e-05, "loss": 1.0441, "step": 4155 }, { "epoch": 1.2799507711241875, "grad_norm": 1.3937149047851562, "learning_rate": 3.071661399045869e-05, "loss": 1.235, "step": 4160 }, { "epoch": 1.2814891734933271, "grad_norm": 1.5922768115997314, "learning_rate": 3.067739688316385e-05, "loss": 1.1928, "step": 4165 }, { "epoch": 1.2830275758624667, "grad_norm": 1.0673027038574219, "learning_rate": 3.0638165039866614e-05, "loss": 1.0306, "step": 4170 }, { "epoch": 1.2845659782316066, "grad_norm": 1.3722416162490845, "learning_rate": 3.0598918562395425e-05, "loss": 1.0907, "step": 4175 }, { "epoch": 1.2861043806007462, "grad_norm": 0.9146732687950134, "learning_rate": 3.0559657552616716e-05, "loss": 1.1372, "step": 4180 }, { "epoch": 1.2876427829698858, "grad_norm": 1.4379206895828247, "learning_rate": 3.0520382112434636e-05, "loss": 1.1397, "step": 4185 }, { "epoch": 1.2891811853390254, "grad_norm": 1.0497307777404785, "learning_rate": 3.0481092343790822e-05, "loss": 1.1, "step": 4190 }, { "epoch": 1.290719587708165, "grad_norm": 0.9752093553543091, "learning_rate": 3.044178834866405e-05, "loss": 1.1479, "step": 4195 }, { "epoch": 1.2922579900773048, "grad_norm": 1.132630467414856, "learning_rate": 3.0402470229070056e-05, "loss": 1.1148, "step": 4200 }, { "epoch": 1.2937963924464444, "grad_norm": 1.2013700008392334, "learning_rate": 3.0363138087061222e-05, "loss": 1.0205, "step": 4205 }, { "epoch": 1.295334794815584, "grad_norm": 1.7345274686813354, "learning_rate": 3.032379202472634e-05, "loss": 1.1862, "step": 4210 }, { "epoch": 1.2968731971847236, "grad_norm": 1.4804950952529907, "learning_rate": 3.0284432144190315e-05, "loss": 1.0884, "step": 4215 }, { "epoch": 1.2984115995538632, "grad_norm": 1.7368816137313843, "learning_rate": 3.0245058547613924e-05, "loss": 1.2023, "step": 4220 }, { "epoch": 1.299950001923003, "grad_norm": 1.1152502298355103, "learning_rate": 3.0205671337193566e-05, "loss": 1.1609, "step": 4225 }, { "epoch": 1.3014884042921426, "grad_norm": 0.8946184515953064, "learning_rate": 3.016627061516094e-05, "loss": 1.0451, "step": 4230 }, { "epoch": 1.3030268066612822, "grad_norm": 0.9706270098686218, "learning_rate": 3.0126856483782838e-05, "loss": 1.1886, "step": 4235 }, { "epoch": 1.3045652090304218, "grad_norm": 2.113147020339966, "learning_rate": 3.008742904536085e-05, "loss": 1.1681, "step": 4240 }, { "epoch": 1.3061036113995614, "grad_norm": 1.2672151327133179, "learning_rate": 3.0047988402231113e-05, "loss": 1.1769, "step": 4245 }, { "epoch": 1.3076420137687013, "grad_norm": 0.8190154433250427, "learning_rate": 3.000853465676402e-05, "loss": 1.0251, "step": 4250 }, { "epoch": 1.3091804161378409, "grad_norm": 1.9547206163406372, "learning_rate": 2.9969067911363992e-05, "loss": 1.2783, "step": 4255 }, { "epoch": 1.3107188185069805, "grad_norm": 1.1879409551620483, "learning_rate": 2.992958826846918e-05, "loss": 1.0925, "step": 4260 }, { "epoch": 1.31225722087612, "grad_norm": 1.0043494701385498, "learning_rate": 2.9890095830551207e-05, "loss": 1.135, "step": 4265 }, { "epoch": 1.3137956232452597, "grad_norm": 1.3702094554901123, "learning_rate": 2.985059070011492e-05, "loss": 1.1819, "step": 4270 }, { "epoch": 1.3153340256143995, "grad_norm": 1.28737473487854, "learning_rate": 2.9811072979698107e-05, "loss": 1.146, "step": 4275 }, { "epoch": 1.316872427983539, "grad_norm": 1.6883684396743774, "learning_rate": 2.9771542771871224e-05, "loss": 1.0705, "step": 4280 }, { "epoch": 1.3184108303526787, "grad_norm": 0.9764033555984497, "learning_rate": 2.973200017923715e-05, "loss": 1.1656, "step": 4285 }, { "epoch": 1.3199492327218185, "grad_norm": 1.5175962448120117, "learning_rate": 2.96924453044309e-05, "loss": 1.1809, "step": 4290 }, { "epoch": 1.321487635090958, "grad_norm": 1.7922492027282715, "learning_rate": 2.9652878250119375e-05, "loss": 1.0016, "step": 4295 }, { "epoch": 1.3230260374600977, "grad_norm": 1.114981770515442, "learning_rate": 2.9613299119001082e-05, "loss": 1.2292, "step": 4300 }, { "epoch": 1.3245644398292373, "grad_norm": 1.1757748126983643, "learning_rate": 2.9573708013805885e-05, "loss": 1.1243, "step": 4305 }, { "epoch": 1.326102842198377, "grad_norm": 1.034613847732544, "learning_rate": 2.953410503729471e-05, "loss": 1.2089, "step": 4310 }, { "epoch": 1.3276412445675168, "grad_norm": 1.4121217727661133, "learning_rate": 2.9494490292259326e-05, "loss": 1.0332, "step": 4315 }, { "epoch": 1.3291796469366564, "grad_norm": 1.6308557987213135, "learning_rate": 2.945486388152201e-05, "loss": 1.1536, "step": 4320 }, { "epoch": 1.330718049305796, "grad_norm": 1.5427446365356445, "learning_rate": 2.941522590793534e-05, "loss": 1.2825, "step": 4325 }, { "epoch": 1.3322564516749356, "grad_norm": 2.0228891372680664, "learning_rate": 2.9375576474381905e-05, "loss": 1.0036, "step": 4330 }, { "epoch": 1.3337948540440752, "grad_norm": 1.04116952419281, "learning_rate": 2.9335915683774034e-05, "loss": 1.0644, "step": 4335 }, { "epoch": 1.335333256413215, "grad_norm": 2.5048625469207764, "learning_rate": 2.9296243639053545e-05, "loss": 1.0763, "step": 4340 }, { "epoch": 1.3368716587823546, "grad_norm": 1.2487713098526, "learning_rate": 2.9256560443191434e-05, "loss": 1.2327, "step": 4345 }, { "epoch": 1.3384100611514942, "grad_norm": 1.1335622072219849, "learning_rate": 2.9216866199187697e-05, "loss": 1.1097, "step": 4350 }, { "epoch": 1.3399484635206338, "grad_norm": 1.3442351818084717, "learning_rate": 2.9177161010070946e-05, "loss": 1.1278, "step": 4355 }, { "epoch": 1.3414868658897734, "grad_norm": 1.2338272333145142, "learning_rate": 2.9137444978898244e-05, "loss": 1.0971, "step": 4360 }, { "epoch": 1.3430252682589132, "grad_norm": 1.0198732614517212, "learning_rate": 2.9097718208754777e-05, "loss": 1.1223, "step": 4365 }, { "epoch": 1.3445636706280528, "grad_norm": 1.195879578590393, "learning_rate": 2.90579808027536e-05, "loss": 1.1469, "step": 4370 }, { "epoch": 1.3461020729971924, "grad_norm": 1.2800102233886719, "learning_rate": 2.901823286403539e-05, "loss": 1.1358, "step": 4375 }, { "epoch": 1.347640475366332, "grad_norm": 1.07575261592865, "learning_rate": 2.897847449576815e-05, "loss": 1.1122, "step": 4380 }, { "epoch": 1.3491788777354716, "grad_norm": 0.9419439435005188, "learning_rate": 2.8938705801146958e-05, "loss": 1.1214, "step": 4385 }, { "epoch": 1.3507172801046115, "grad_norm": 1.0005466938018799, "learning_rate": 2.88989268833937e-05, "loss": 1.2561, "step": 4390 }, { "epoch": 1.352255682473751, "grad_norm": 0.9994781613349915, "learning_rate": 2.8859137845756784e-05, "loss": 1.125, "step": 4395 }, { "epoch": 1.3537940848428907, "grad_norm": 1.0525974035263062, "learning_rate": 2.8819338791510887e-05, "loss": 1.1561, "step": 4400 }, { "epoch": 1.3553324872120303, "grad_norm": 1.1305088996887207, "learning_rate": 2.8779529823956704e-05, "loss": 1.129, "step": 4405 }, { "epoch": 1.3568708895811699, "grad_norm": 1.073445439338684, "learning_rate": 2.8739711046420626e-05, "loss": 1.1403, "step": 4410 }, { "epoch": 1.3584092919503097, "grad_norm": 1.792883038520813, "learning_rate": 2.8699882562254538e-05, "loss": 1.0386, "step": 4415 }, { "epoch": 1.3599476943194493, "grad_norm": 1.0318684577941895, "learning_rate": 2.8660044474835514e-05, "loss": 1.2134, "step": 4420 }, { "epoch": 1.3614860966885889, "grad_norm": 0.9824214577674866, "learning_rate": 2.862019688756553e-05, "loss": 1.1874, "step": 4425 }, { "epoch": 1.3630244990577285, "grad_norm": 3.1488852500915527, "learning_rate": 2.858033990387125e-05, "loss": 1.1967, "step": 4430 }, { "epoch": 1.364562901426868, "grad_norm": 2.4233949184417725, "learning_rate": 2.8540473627203708e-05, "loss": 1.0966, "step": 4435 }, { "epoch": 1.366101303796008, "grad_norm": 1.065972924232483, "learning_rate": 2.8500598161038057e-05, "loss": 1.0942, "step": 4440 }, { "epoch": 1.3676397061651475, "grad_norm": 1.105650782585144, "learning_rate": 2.8460713608873323e-05, "loss": 1.2002, "step": 4445 }, { "epoch": 1.3691781085342871, "grad_norm": 2.50470232963562, "learning_rate": 2.8420820074232086e-05, "loss": 1.0063, "step": 4450 }, { "epoch": 1.3707165109034267, "grad_norm": 1.2763255834579468, "learning_rate": 2.8380917660660262e-05, "loss": 1.1806, "step": 4455 }, { "epoch": 1.3722549132725663, "grad_norm": 1.4548883438110352, "learning_rate": 2.8341006471726816e-05, "loss": 1.091, "step": 4460 }, { "epoch": 1.3737933156417061, "grad_norm": 4.325429916381836, "learning_rate": 2.830108661102346e-05, "loss": 1.1485, "step": 4465 }, { "epoch": 1.3753317180108457, "grad_norm": 1.4059125185012817, "learning_rate": 2.826115818216444e-05, "loss": 1.109, "step": 4470 }, { "epoch": 1.3768701203799854, "grad_norm": 1.088011384010315, "learning_rate": 2.822122128878625e-05, "loss": 1.1762, "step": 4475 }, { "epoch": 1.378408522749125, "grad_norm": 0.8821271061897278, "learning_rate": 2.818127603454732e-05, "loss": 1.0974, "step": 4480 }, { "epoch": 1.3799469251182646, "grad_norm": 1.2587283849716187, "learning_rate": 2.8141322523127817e-05, "loss": 1.0408, "step": 4485 }, { "epoch": 1.3814853274874044, "grad_norm": 1.244844675064087, "learning_rate": 2.810136085822931e-05, "loss": 1.1508, "step": 4490 }, { "epoch": 1.383023729856544, "grad_norm": 1.0243417024612427, "learning_rate": 2.8061391143574545e-05, "loss": 1.14, "step": 4495 }, { "epoch": 1.3845621322256836, "grad_norm": 1.2699311971664429, "learning_rate": 2.8021413482907176e-05, "loss": 1.264, "step": 4500 }, { "epoch": 1.3861005345948234, "grad_norm": 1.102189064025879, "learning_rate": 2.798142797999144e-05, "loss": 1.0759, "step": 4505 }, { "epoch": 1.3876389369639628, "grad_norm": 1.9496086835861206, "learning_rate": 2.794143473861198e-05, "loss": 1.0819, "step": 4510 }, { "epoch": 1.3891773393331026, "grad_norm": 0.8148252367973328, "learning_rate": 2.7901433862573495e-05, "loss": 1.1767, "step": 4515 }, { "epoch": 1.3907157417022422, "grad_norm": 1.303236961364746, "learning_rate": 2.786142545570049e-05, "loss": 1.2153, "step": 4520 }, { "epoch": 1.3922541440713818, "grad_norm": 2.4543964862823486, "learning_rate": 2.782140962183704e-05, "loss": 0.9841, "step": 4525 }, { "epoch": 1.3937925464405216, "grad_norm": 1.1783264875411987, "learning_rate": 2.7781386464846497e-05, "loss": 1.1419, "step": 4530 }, { "epoch": 1.3953309488096612, "grad_norm": 1.238843321800232, "learning_rate": 2.7741356088611205e-05, "loss": 1.0705, "step": 4535 }, { "epoch": 1.3968693511788008, "grad_norm": 1.183280348777771, "learning_rate": 2.7701318597032248e-05, "loss": 1.2047, "step": 4540 }, { "epoch": 1.3984077535479404, "grad_norm": 1.5718815326690674, "learning_rate": 2.7661274094029193e-05, "loss": 1.2228, "step": 4545 }, { "epoch": 1.39994615591708, "grad_norm": 1.5735619068145752, "learning_rate": 2.7621222683539792e-05, "loss": 1.2255, "step": 4550 }, { "epoch": 1.4014845582862199, "grad_norm": 1.1554558277130127, "learning_rate": 2.7581164469519732e-05, "loss": 1.1794, "step": 4555 }, { "epoch": 1.4030229606553595, "grad_norm": 1.498023509979248, "learning_rate": 2.754109955594235e-05, "loss": 1.1865, "step": 4560 }, { "epoch": 1.404561363024499, "grad_norm": 1.2338696718215942, "learning_rate": 2.7501028046798387e-05, "loss": 1.0968, "step": 4565 }, { "epoch": 1.4060997653936387, "grad_norm": 1.8019814491271973, "learning_rate": 2.7460950046095696e-05, "loss": 1.1228, "step": 4570 }, { "epoch": 1.4076381677627783, "grad_norm": 1.3556607961654663, "learning_rate": 2.742086565785896e-05, "loss": 1.1827, "step": 4575 }, { "epoch": 1.409176570131918, "grad_norm": 1.5109410285949707, "learning_rate": 2.738077498612949e-05, "loss": 1.1437, "step": 4580 }, { "epoch": 1.4107149725010577, "grad_norm": 1.0716326236724854, "learning_rate": 2.7340678134964855e-05, "loss": 1.2372, "step": 4585 }, { "epoch": 1.4122533748701973, "grad_norm": 1.2416647672653198, "learning_rate": 2.7300575208438683e-05, "loss": 1.1694, "step": 4590 }, { "epoch": 1.413791777239337, "grad_norm": 1.2061866521835327, "learning_rate": 2.7260466310640377e-05, "loss": 1.2479, "step": 4595 }, { "epoch": 1.4153301796084765, "grad_norm": 1.7378590106964111, "learning_rate": 2.7220351545674834e-05, "loss": 1.1552, "step": 4600 }, { "epoch": 1.4168685819776163, "grad_norm": 1.3554580211639404, "learning_rate": 2.7180231017662178e-05, "loss": 1.1731, "step": 4605 }, { "epoch": 1.418406984346756, "grad_norm": 1.5967758893966675, "learning_rate": 2.7140104830737496e-05, "loss": 1.2213, "step": 4610 }, { "epoch": 1.4199453867158955, "grad_norm": 1.3568000793457031, "learning_rate": 2.709997308905055e-05, "loss": 1.1097, "step": 4615 }, { "epoch": 1.4214837890850351, "grad_norm": 1.3305199146270752, "learning_rate": 2.705983589676554e-05, "loss": 1.0566, "step": 4620 }, { "epoch": 1.4230221914541747, "grad_norm": 1.3310613632202148, "learning_rate": 2.7019693358060792e-05, "loss": 1.1266, "step": 4625 }, { "epoch": 1.4245605938233146, "grad_norm": 1.6512633562088013, "learning_rate": 2.6979545577128522e-05, "loss": 1.1205, "step": 4630 }, { "epoch": 1.4260989961924542, "grad_norm": 0.985725462436676, "learning_rate": 2.6939392658174568e-05, "loss": 1.1683, "step": 4635 }, { "epoch": 1.4276373985615938, "grad_norm": 1.284145712852478, "learning_rate": 2.6899234705418052e-05, "loss": 1.1016, "step": 4640 }, { "epoch": 1.4291758009307334, "grad_norm": 1.1945017576217651, "learning_rate": 2.685907182309122e-05, "loss": 1.1832, "step": 4645 }, { "epoch": 1.430714203299873, "grad_norm": 1.2088850736618042, "learning_rate": 2.681890411543908e-05, "loss": 1.1577, "step": 4650 }, { "epoch": 1.4322526056690128, "grad_norm": 2.208871841430664, "learning_rate": 2.6778731686719178e-05, "loss": 1.2002, "step": 4655 }, { "epoch": 1.4337910080381524, "grad_norm": 0.9800326824188232, "learning_rate": 2.6738554641201298e-05, "loss": 1.1888, "step": 4660 }, { "epoch": 1.435329410407292, "grad_norm": 5.183529853820801, "learning_rate": 2.669837308316723e-05, "loss": 1.0284, "step": 4665 }, { "epoch": 1.4368678127764316, "grad_norm": 1.1594517230987549, "learning_rate": 2.6658187116910455e-05, "loss": 1.1301, "step": 4670 }, { "epoch": 1.4384062151455712, "grad_norm": 1.1288197040557861, "learning_rate": 2.6617996846735904e-05, "loss": 1.1265, "step": 4675 }, { "epoch": 1.439944617514711, "grad_norm": 2.477808952331543, "learning_rate": 2.6577802376959698e-05, "loss": 1.1718, "step": 4680 }, { "epoch": 1.4414830198838506, "grad_norm": 1.1020774841308594, "learning_rate": 2.653760381190881e-05, "loss": 1.1919, "step": 4685 }, { "epoch": 1.4430214222529902, "grad_norm": 1.5409555435180664, "learning_rate": 2.64974012559209e-05, "loss": 1.1858, "step": 4690 }, { "epoch": 1.4445598246221298, "grad_norm": 1.6623104810714722, "learning_rate": 2.6457194813343948e-05, "loss": 1.2185, "step": 4695 }, { "epoch": 1.4460982269912694, "grad_norm": 1.2908927202224731, "learning_rate": 2.641698458853603e-05, "loss": 1.0986, "step": 4700 }, { "epoch": 1.4476366293604093, "grad_norm": 1.120782732963562, "learning_rate": 2.637677068586505e-05, "loss": 1.2856, "step": 4705 }, { "epoch": 1.4491750317295489, "grad_norm": 0.8865787386894226, "learning_rate": 2.6336553209708447e-05, "loss": 1.2691, "step": 4710 }, { "epoch": 1.4507134340986885, "grad_norm": 1.8774168491363525, "learning_rate": 2.6296332264452934e-05, "loss": 1.1015, "step": 4715 }, { "epoch": 1.4522518364678283, "grad_norm": 1.0902308225631714, "learning_rate": 2.6256107954494242e-05, "loss": 1.1832, "step": 4720 }, { "epoch": 1.4537902388369677, "grad_norm": 1.2977639436721802, "learning_rate": 2.6215880384236818e-05, "loss": 1.2232, "step": 4725 }, { "epoch": 1.4553286412061075, "grad_norm": 1.2228929996490479, "learning_rate": 2.6175649658093586e-05, "loss": 1.0803, "step": 4730 }, { "epoch": 1.456867043575247, "grad_norm": 1.099314570426941, "learning_rate": 2.6135415880485654e-05, "loss": 1.1727, "step": 4735 }, { "epoch": 1.4584054459443867, "grad_norm": 1.762929081916809, "learning_rate": 2.609517915584204e-05, "loss": 1.2126, "step": 4740 }, { "epoch": 1.4599438483135265, "grad_norm": 1.0587412118911743, "learning_rate": 2.6054939588599448e-05, "loss": 1.1546, "step": 4745 }, { "epoch": 1.4614822506826661, "grad_norm": 1.0290122032165527, "learning_rate": 2.6014697283201907e-05, "loss": 1.0129, "step": 4750 }, { "epoch": 1.4630206530518057, "grad_norm": 1.801530361175537, "learning_rate": 2.597445234410058e-05, "loss": 1.2372, "step": 4755 }, { "epoch": 1.4645590554209453, "grad_norm": 2.0911014080047607, "learning_rate": 2.5934204875753494e-05, "loss": 1.0657, "step": 4760 }, { "epoch": 1.466097457790085, "grad_norm": 1.0208872556686401, "learning_rate": 2.589395498262519e-05, "loss": 1.2039, "step": 4765 }, { "epoch": 1.4676358601592248, "grad_norm": 1.2312899827957153, "learning_rate": 2.5853702769186528e-05, "loss": 1.1466, "step": 4770 }, { "epoch": 1.4691742625283644, "grad_norm": 1.3418748378753662, "learning_rate": 2.5813448339914393e-05, "loss": 1.1439, "step": 4775 }, { "epoch": 1.470712664897504, "grad_norm": 1.0246156454086304, "learning_rate": 2.5773191799291417e-05, "loss": 1.1856, "step": 4780 }, { "epoch": 1.4722510672666436, "grad_norm": 1.1938049793243408, "learning_rate": 2.5732933251805713e-05, "loss": 1.1144, "step": 4785 }, { "epoch": 1.4737894696357832, "grad_norm": 2.593672037124634, "learning_rate": 2.569267280195059e-05, "loss": 1.1321, "step": 4790 }, { "epoch": 1.475327872004923, "grad_norm": 1.1651784181594849, "learning_rate": 2.5652410554224322e-05, "loss": 1.1729, "step": 4795 }, { "epoch": 1.4768662743740626, "grad_norm": 1.3578174114227295, "learning_rate": 2.5612146613129828e-05, "loss": 1.0952, "step": 4800 }, { "epoch": 1.4784046767432022, "grad_norm": 2.145015001296997, "learning_rate": 2.5571881083174427e-05, "loss": 1.0512, "step": 4805 }, { "epoch": 1.4799430791123418, "grad_norm": 1.4912582635879517, "learning_rate": 2.553161406886955e-05, "loss": 1.0594, "step": 4810 }, { "epoch": 1.4814814814814814, "grad_norm": 1.3633592128753662, "learning_rate": 2.5491345674730522e-05, "loss": 1.2463, "step": 4815 }, { "epoch": 1.4830198838506212, "grad_norm": 1.1500188112258911, "learning_rate": 2.5451076005276197e-05, "loss": 1.1274, "step": 4820 }, { "epoch": 1.4845582862197608, "grad_norm": 1.1924505233764648, "learning_rate": 2.5410805165028772e-05, "loss": 1.0678, "step": 4825 }, { "epoch": 1.4860966885889004, "grad_norm": 2.115426540374756, "learning_rate": 2.537053325851348e-05, "loss": 1.2476, "step": 4830 }, { "epoch": 1.48763509095804, "grad_norm": 1.191912055015564, "learning_rate": 2.5330260390258302e-05, "loss": 1.1183, "step": 4835 }, { "epoch": 1.4891734933271796, "grad_norm": 0.8928827047348022, "learning_rate": 2.5289986664793743e-05, "loss": 1.1317, "step": 4840 }, { "epoch": 1.4907118956963195, "grad_norm": 1.3792165517807007, "learning_rate": 2.52497121866525e-05, "loss": 1.2063, "step": 4845 }, { "epoch": 1.492250298065459, "grad_norm": 1.0137156248092651, "learning_rate": 2.520943706036927e-05, "loss": 1.1008, "step": 4850 }, { "epoch": 1.4937887004345987, "grad_norm": 1.1347684860229492, "learning_rate": 2.5169161390480382e-05, "loss": 1.1237, "step": 4855 }, { "epoch": 1.4953271028037383, "grad_norm": 1.1808063983917236, "learning_rate": 2.5128885281523606e-05, "loss": 1.242, "step": 4860 }, { "epoch": 1.4968655051728779, "grad_norm": 1.2332245111465454, "learning_rate": 2.508860883803784e-05, "loss": 1.1619, "step": 4865 }, { "epoch": 1.4984039075420177, "grad_norm": 1.0898313522338867, "learning_rate": 2.5048332164562872e-05, "loss": 1.2496, "step": 4870 }, { "epoch": 1.4999423099111573, "grad_norm": 1.0363882780075073, "learning_rate": 2.500805536563905e-05, "loss": 1.1321, "step": 4875 }, { "epoch": 1.501480712280297, "grad_norm": 1.2297104597091675, "learning_rate": 2.4967778545807074e-05, "loss": 1.1335, "step": 4880 }, { "epoch": 1.5030191146494367, "grad_norm": 1.6471554040908813, "learning_rate": 2.4927501809607692e-05, "loss": 1.1601, "step": 4885 }, { "epoch": 1.504557517018576, "grad_norm": 1.9324811697006226, "learning_rate": 2.4887225261581436e-05, "loss": 1.1363, "step": 4890 }, { "epoch": 1.506095919387716, "grad_norm": 1.3468040227890015, "learning_rate": 2.4846949006268344e-05, "loss": 1.2231, "step": 4895 }, { "epoch": 1.5076343217568555, "grad_norm": 1.3519035577774048, "learning_rate": 2.4806673148207693e-05, "loss": 1.2626, "step": 4900 }, { "epoch": 1.5091727241259951, "grad_norm": 0.9620422720909119, "learning_rate": 2.476639779193776e-05, "loss": 1.2636, "step": 4905 }, { "epoch": 1.510711126495135, "grad_norm": 1.1294174194335938, "learning_rate": 2.4726123041995463e-05, "loss": 1.234, "step": 4910 }, { "epoch": 1.5122495288642743, "grad_norm": 1.0007851123809814, "learning_rate": 2.4685849002916183e-05, "loss": 1.1924, "step": 4915 }, { "epoch": 1.5137879312334142, "grad_norm": 1.8807976245880127, "learning_rate": 2.4645575779233464e-05, "loss": 1.2864, "step": 4920 }, { "epoch": 1.5153263336025538, "grad_norm": 2.8548858165740967, "learning_rate": 2.460530347547871e-05, "loss": 1.1517, "step": 4925 }, { "epoch": 1.5168647359716934, "grad_norm": 1.5825186967849731, "learning_rate": 2.4565032196180952e-05, "loss": 1.2202, "step": 4930 }, { "epoch": 1.5184031383408332, "grad_norm": 1.0249121189117432, "learning_rate": 2.4524762045866555e-05, "loss": 1.2323, "step": 4935 }, { "epoch": 1.5199415407099726, "grad_norm": 0.9302557706832886, "learning_rate": 2.4484493129058944e-05, "loss": 1.2025, "step": 4940 }, { "epoch": 1.5214799430791124, "grad_norm": 2.6338999271392822, "learning_rate": 2.444422555027837e-05, "loss": 1.1265, "step": 4945 }, { "epoch": 1.523018345448252, "grad_norm": 1.4891122579574585, "learning_rate": 2.4403959414041583e-05, "loss": 1.0759, "step": 4950 }, { "epoch": 1.5245567478173916, "grad_norm": 1.1123028993606567, "learning_rate": 2.4363694824861615e-05, "loss": 1.0696, "step": 4955 }, { "epoch": 1.5260951501865314, "grad_norm": 2.104362964630127, "learning_rate": 2.4323431887247446e-05, "loss": 1.0459, "step": 4960 }, { "epoch": 1.5276335525556708, "grad_norm": 0.9691029787063599, "learning_rate": 2.4283170705703812e-05, "loss": 1.2534, "step": 4965 }, { "epoch": 1.5291719549248106, "grad_norm": 1.148000717163086, "learning_rate": 2.424291138473085e-05, "loss": 1.2542, "step": 4970 }, { "epoch": 1.5307103572939502, "grad_norm": 0.9757530093193054, "learning_rate": 2.4202654028823913e-05, "loss": 1.087, "step": 4975 }, { "epoch": 1.5322487596630898, "grad_norm": 1.5252931118011475, "learning_rate": 2.4162398742473214e-05, "loss": 1.2709, "step": 4980 }, { "epoch": 1.5337871620322296, "grad_norm": 1.3991789817810059, "learning_rate": 2.4122145630163616e-05, "loss": 1.0422, "step": 4985 }, { "epoch": 1.535325564401369, "grad_norm": 1.2912970781326294, "learning_rate": 2.408189479637432e-05, "loss": 1.1019, "step": 4990 }, { "epoch": 1.5368639667705088, "grad_norm": 1.0709604024887085, "learning_rate": 2.4041646345578637e-05, "loss": 1.1101, "step": 4995 }, { "epoch": 1.5384023691396485, "grad_norm": 1.062154769897461, "learning_rate": 2.4001400382243675e-05, "loss": 1.1262, "step": 5000 }, { "epoch": 1.539940771508788, "grad_norm": 1.4915518760681152, "learning_rate": 2.3961157010830095e-05, "loss": 1.2232, "step": 5005 }, { "epoch": 1.5414791738779279, "grad_norm": 1.2963896989822388, "learning_rate": 2.3920916335791833e-05, "loss": 1.1251, "step": 5010 }, { "epoch": 1.5430175762470673, "grad_norm": 1.115599274635315, "learning_rate": 2.3880678461575805e-05, "loss": 1.1748, "step": 5015 }, { "epoch": 1.544555978616207, "grad_norm": 1.3244112730026245, "learning_rate": 2.3840443492621674e-05, "loss": 1.1706, "step": 5020 }, { "epoch": 1.5460943809853467, "grad_norm": 1.903247356414795, "learning_rate": 2.380021153336158e-05, "loss": 1.1706, "step": 5025 }, { "epoch": 1.5476327833544863, "grad_norm": 2.5160722732543945, "learning_rate": 2.375998268821982e-05, "loss": 1.0949, "step": 5030 }, { "epoch": 1.549171185723626, "grad_norm": 1.065032958984375, "learning_rate": 2.371975706161262e-05, "loss": 1.1651, "step": 5035 }, { "epoch": 1.5507095880927657, "grad_norm": 1.1798903942108154, "learning_rate": 2.3679534757947862e-05, "loss": 1.0752, "step": 5040 }, { "epoch": 1.5522479904619053, "grad_norm": 1.417855143547058, "learning_rate": 2.3639315881624777e-05, "loss": 1.204, "step": 5045 }, { "epoch": 1.553786392831045, "grad_norm": 3.4984400272369385, "learning_rate": 2.3599100537033728e-05, "loss": 1.2201, "step": 5050 }, { "epoch": 1.5553247952001845, "grad_norm": 1.7004238367080688, "learning_rate": 2.35588888285559e-05, "loss": 1.2381, "step": 5055 }, { "epoch": 1.5568631975693243, "grad_norm": 1.1954479217529297, "learning_rate": 2.3518680860563026e-05, "loss": 1.291, "step": 5060 }, { "epoch": 1.558401599938464, "grad_norm": 1.0503448247909546, "learning_rate": 2.3478476737417177e-05, "loss": 1.114, "step": 5065 }, { "epoch": 1.5599400023076035, "grad_norm": 1.9155570268630981, "learning_rate": 2.3438276563470382e-05, "loss": 1.2755, "step": 5070 }, { "epoch": 1.5614784046767431, "grad_norm": 1.2648513317108154, "learning_rate": 2.3398080443064453e-05, "loss": 1.1355, "step": 5075 }, { "epoch": 1.5630168070458828, "grad_norm": 1.9838799238204956, "learning_rate": 2.335788848053069e-05, "loss": 1.1396, "step": 5080 }, { "epoch": 1.5645552094150226, "grad_norm": 0.9827559590339661, "learning_rate": 2.331770078018958e-05, "loss": 1.1287, "step": 5085 }, { "epoch": 1.5660936117841622, "grad_norm": 1.3725968599319458, "learning_rate": 2.3277517446350566e-05, "loss": 1.1653, "step": 5090 }, { "epoch": 1.5676320141533018, "grad_norm": 1.1363013982772827, "learning_rate": 2.3237338583311742e-05, "loss": 1.1403, "step": 5095 }, { "epoch": 1.5691704165224416, "grad_norm": 1.514285922050476, "learning_rate": 2.3197164295359593e-05, "loss": 1.1604, "step": 5100 }, { "epoch": 1.570708818891581, "grad_norm": 1.5780757665634155, "learning_rate": 2.3156994686768753e-05, "loss": 1.1634, "step": 5105 }, { "epoch": 1.5722472212607208, "grad_norm": 2.1901051998138428, "learning_rate": 2.3116829861801686e-05, "loss": 1.0262, "step": 5110 }, { "epoch": 1.5737856236298604, "grad_norm": 0.9369887113571167, "learning_rate": 2.307666992470845e-05, "loss": 1.1561, "step": 5115 }, { "epoch": 1.575324025999, "grad_norm": 1.2366549968719482, "learning_rate": 2.3036514979726442e-05, "loss": 1.1065, "step": 5120 }, { "epoch": 1.5768624283681398, "grad_norm": 1.257912516593933, "learning_rate": 2.2996365131080046e-05, "loss": 1.1071, "step": 5125 }, { "epoch": 1.5784008307372792, "grad_norm": 1.1271820068359375, "learning_rate": 2.295622048298045e-05, "loss": 1.1964, "step": 5130 }, { "epoch": 1.579939233106419, "grad_norm": 1.9496513605117798, "learning_rate": 2.2916081139625362e-05, "loss": 1.1061, "step": 5135 }, { "epoch": 1.5814776354755586, "grad_norm": 1.8635252714157104, "learning_rate": 2.287594720519869e-05, "loss": 1.0035, "step": 5140 }, { "epoch": 1.5830160378446982, "grad_norm": 0.9544889330863953, "learning_rate": 2.2835818783870312e-05, "loss": 1.2609, "step": 5145 }, { "epoch": 1.584554440213838, "grad_norm": 1.032304048538208, "learning_rate": 2.2795695979795813e-05, "loss": 1.2609, "step": 5150 }, { "epoch": 1.5860928425829774, "grad_norm": 1.1428989171981812, "learning_rate": 2.275557889711617e-05, "loss": 1.1503, "step": 5155 }, { "epoch": 1.5876312449521173, "grad_norm": 1.0252608060836792, "learning_rate": 2.271546763995752e-05, "loss": 1.0164, "step": 5160 }, { "epoch": 1.5891696473212569, "grad_norm": 0.9309441447257996, "learning_rate": 2.2675362312430894e-05, "loss": 1.118, "step": 5165 }, { "epoch": 1.5907080496903965, "grad_norm": 1.2354007959365845, "learning_rate": 2.2635263018631915e-05, "loss": 1.1657, "step": 5170 }, { "epoch": 1.5922464520595363, "grad_norm": 1.468483805656433, "learning_rate": 2.2595169862640568e-05, "loss": 1.1834, "step": 5175 }, { "epoch": 1.5937848544286757, "grad_norm": 1.2876496315002441, "learning_rate": 2.255508294852086e-05, "loss": 1.152, "step": 5180 }, { "epoch": 1.5953232567978155, "grad_norm": 1.2103363275527954, "learning_rate": 2.2515002380320655e-05, "loss": 1.3024, "step": 5185 }, { "epoch": 1.596861659166955, "grad_norm": 1.977842926979065, "learning_rate": 2.2474928262071307e-05, "loss": 1.1682, "step": 5190 }, { "epoch": 1.5984000615360947, "grad_norm": 2.3454501628875732, "learning_rate": 2.243486069778744e-05, "loss": 1.1026, "step": 5195 }, { "epoch": 1.5999384639052345, "grad_norm": 1.8084458112716675, "learning_rate": 2.239479979146667e-05, "loss": 1.1451, "step": 5200 }, { "epoch": 1.601476866274374, "grad_norm": 1.0935570001602173, "learning_rate": 2.235474564708933e-05, "loss": 0.9931, "step": 5205 }, { "epoch": 1.6030152686435137, "grad_norm": 1.21275794506073, "learning_rate": 2.2314698368618198e-05, "loss": 1.2305, "step": 5210 }, { "epoch": 1.6045536710126533, "grad_norm": 1.6089550256729126, "learning_rate": 2.227465805999823e-05, "loss": 1.1902, "step": 5215 }, { "epoch": 1.606092073381793, "grad_norm": 1.717024803161621, "learning_rate": 2.2234624825156293e-05, "loss": 1.204, "step": 5220 }, { "epoch": 1.6076304757509328, "grad_norm": 1.2060118913650513, "learning_rate": 2.219459876800091e-05, "loss": 1.1242, "step": 5225 }, { "epoch": 1.6091688781200721, "grad_norm": 0.8598418831825256, "learning_rate": 2.2154579992421964e-05, "loss": 1.0447, "step": 5230 }, { "epoch": 1.610707280489212, "grad_norm": 0.9737907648086548, "learning_rate": 2.2114568602290406e-05, "loss": 1.1645, "step": 5235 }, { "epoch": 1.6122456828583516, "grad_norm": 1.1507982015609741, "learning_rate": 2.2074564701458065e-05, "loss": 1.1268, "step": 5240 }, { "epoch": 1.6137840852274912, "grad_norm": 1.7262670993804932, "learning_rate": 2.2034568393757313e-05, "loss": 1.1256, "step": 5245 }, { "epoch": 1.615322487596631, "grad_norm": 1.7809712886810303, "learning_rate": 2.1994579783000804e-05, "loss": 1.2046, "step": 5250 }, { "epoch": 1.6168608899657706, "grad_norm": 2.4736218452453613, "learning_rate": 2.1954598972981237e-05, "loss": 1.1503, "step": 5255 }, { "epoch": 1.6183992923349102, "grad_norm": 1.1869739294052124, "learning_rate": 2.1914626067471032e-05, "loss": 1.0835, "step": 5260 }, { "epoch": 1.6199376947040498, "grad_norm": 1.188988447189331, "learning_rate": 2.187466117022212e-05, "loss": 1.0705, "step": 5265 }, { "epoch": 1.6214760970731894, "grad_norm": 1.4258966445922852, "learning_rate": 2.183470438496563e-05, "loss": 1.1984, "step": 5270 }, { "epoch": 1.6230144994423292, "grad_norm": 0.839988112449646, "learning_rate": 2.1794755815411642e-05, "loss": 1.1258, "step": 5275 }, { "epoch": 1.6245529018114688, "grad_norm": 1.4149938821792603, "learning_rate": 2.175481556524892e-05, "loss": 1.1313, "step": 5280 }, { "epoch": 1.6260913041806084, "grad_norm": 1.8420780897140503, "learning_rate": 2.1714883738144627e-05, "loss": 1.1325, "step": 5285 }, { "epoch": 1.627629706549748, "grad_norm": 1.2860522270202637, "learning_rate": 2.1674960437744044e-05, "loss": 1.2157, "step": 5290 }, { "epoch": 1.6291681089188876, "grad_norm": 1.9698175191879272, "learning_rate": 2.1635045767670356e-05, "loss": 1.1402, "step": 5295 }, { "epoch": 1.6307065112880275, "grad_norm": 1.716629147529602, "learning_rate": 2.1595139831524326e-05, "loss": 1.1031, "step": 5300 }, { "epoch": 1.632244913657167, "grad_norm": 1.4469795227050781, "learning_rate": 2.155524273288405e-05, "loss": 1.1826, "step": 5305 }, { "epoch": 1.6337833160263067, "grad_norm": 1.2318928241729736, "learning_rate": 2.1515354575304695e-05, "loss": 1.1256, "step": 5310 }, { "epoch": 1.6353217183954465, "grad_norm": 1.3652349710464478, "learning_rate": 2.1475475462318202e-05, "loss": 1.2348, "step": 5315 }, { "epoch": 1.6368601207645859, "grad_norm": 1.40614914894104, "learning_rate": 2.1435605497433057e-05, "loss": 1.1109, "step": 5320 }, { "epoch": 1.6383985231337257, "grad_norm": 1.7324262857437134, "learning_rate": 2.139574478413398e-05, "loss": 1.0686, "step": 5325 }, { "epoch": 1.6399369255028653, "grad_norm": 1.711517333984375, "learning_rate": 2.135589342588171e-05, "loss": 0.9758, "step": 5330 }, { "epoch": 1.641475327872005, "grad_norm": 1.4670315980911255, "learning_rate": 2.1316051526112672e-05, "loss": 1.0558, "step": 5335 }, { "epoch": 1.6430137302411447, "grad_norm": 1.625055193901062, "learning_rate": 2.1276219188238768e-05, "loss": 1.1067, "step": 5340 }, { "epoch": 1.644552132610284, "grad_norm": 2.5813450813293457, "learning_rate": 2.1236396515647046e-05, "loss": 1.0579, "step": 5345 }, { "epoch": 1.646090534979424, "grad_norm": 1.4112730026245117, "learning_rate": 2.1196583611699503e-05, "loss": 1.2685, "step": 5350 }, { "epoch": 1.6476289373485635, "grad_norm": 0.8786386251449585, "learning_rate": 2.1156780579732764e-05, "loss": 1.1172, "step": 5355 }, { "epoch": 1.6491673397177031, "grad_norm": 0.9437426328659058, "learning_rate": 2.111698752305783e-05, "loss": 1.0846, "step": 5360 }, { "epoch": 1.650705742086843, "grad_norm": 1.353835940361023, "learning_rate": 2.1077204544959825e-05, "loss": 1.1903, "step": 5365 }, { "epoch": 1.6522441444559823, "grad_norm": 1.4217884540557861, "learning_rate": 2.1037431748697688e-05, "loss": 1.0995, "step": 5370 }, { "epoch": 1.6537825468251222, "grad_norm": 1.296500325202942, "learning_rate": 2.099766923750395e-05, "loss": 1.1283, "step": 5375 }, { "epoch": 1.6553209491942618, "grad_norm": 1.42235267162323, "learning_rate": 2.095791711458444e-05, "loss": 1.1127, "step": 5380 }, { "epoch": 1.6568593515634014, "grad_norm": 1.1460708379745483, "learning_rate": 2.0918175483118036e-05, "loss": 1.0967, "step": 5385 }, { "epoch": 1.6583977539325412, "grad_norm": 1.1948250532150269, "learning_rate": 2.0878444446256364e-05, "loss": 1.2248, "step": 5390 }, { "epoch": 1.6599361563016806, "grad_norm": 0.8646104335784912, "learning_rate": 2.083872410712357e-05, "loss": 1.1827, "step": 5395 }, { "epoch": 1.6614745586708204, "grad_norm": 1.0145443677902222, "learning_rate": 2.079901456881601e-05, "loss": 1.0875, "step": 5400 }, { "epoch": 1.66301296103996, "grad_norm": 1.4740755558013916, "learning_rate": 2.075931593440203e-05, "loss": 1.1525, "step": 5405 }, { "epoch": 1.6645513634090996, "grad_norm": 1.4376249313354492, "learning_rate": 2.0719628306921664e-05, "loss": 1.1863, "step": 5410 }, { "epoch": 1.6660897657782394, "grad_norm": 1.4571655988693237, "learning_rate": 2.067995178938638e-05, "loss": 0.8931, "step": 5415 }, { "epoch": 1.6676281681473788, "grad_norm": 1.8079279661178589, "learning_rate": 2.0640286484778804e-05, "loss": 0.9931, "step": 5420 }, { "epoch": 1.6691665705165186, "grad_norm": 1.1777446269989014, "learning_rate": 2.0600632496052457e-05, "loss": 1.2278, "step": 5425 }, { "epoch": 1.6707049728856582, "grad_norm": 1.0672292709350586, "learning_rate": 2.05609899261315e-05, "loss": 1.2236, "step": 5430 }, { "epoch": 1.6722433752547978, "grad_norm": 1.5936694145202637, "learning_rate": 2.0521358877910444e-05, "loss": 1.2029, "step": 5435 }, { "epoch": 1.6737817776239376, "grad_norm": 1.6731094121932983, "learning_rate": 2.0481739454253904e-05, "loss": 1.1809, "step": 5440 }, { "epoch": 1.675320179993077, "grad_norm": 2.1021690368652344, "learning_rate": 2.044213175799632e-05, "loss": 1.1373, "step": 5445 }, { "epoch": 1.6768585823622169, "grad_norm": 2.2051169872283936, "learning_rate": 2.0402535891941695e-05, "loss": 1.1215, "step": 5450 }, { "epoch": 1.6783969847313565, "grad_norm": 2.0235435962677, "learning_rate": 2.0362951958863306e-05, "loss": 1.0841, "step": 5455 }, { "epoch": 1.679935387100496, "grad_norm": 1.1775131225585938, "learning_rate": 2.0323380061503494e-05, "loss": 1.0182, "step": 5460 }, { "epoch": 1.6814737894696359, "grad_norm": 1.5589635372161865, "learning_rate": 2.0283820302573327e-05, "loss": 1.1266, "step": 5465 }, { "epoch": 1.6830121918387755, "grad_norm": 1.2760752439498901, "learning_rate": 2.024427278475239e-05, "loss": 1.0757, "step": 5470 }, { "epoch": 1.684550594207915, "grad_norm": 1.4246011972427368, "learning_rate": 2.0204737610688482e-05, "loss": 1.0888, "step": 5475 }, { "epoch": 1.6860889965770547, "grad_norm": 1.7687841653823853, "learning_rate": 2.0165214882997363e-05, "loss": 1.1195, "step": 5480 }, { "epoch": 1.6876273989461943, "grad_norm": 1.873183012008667, "learning_rate": 2.012570470426249e-05, "loss": 1.2103, "step": 5485 }, { "epoch": 1.6891658013153341, "grad_norm": 1.3162018060684204, "learning_rate": 2.0086207177034765e-05, "loss": 1.1606, "step": 5490 }, { "epoch": 1.6907042036844737, "grad_norm": 1.9198535680770874, "learning_rate": 2.0046722403832227e-05, "loss": 1.1806, "step": 5495 }, { "epoch": 1.6922426060536133, "grad_norm": 1.908752202987671, "learning_rate": 2.000725048713983e-05, "loss": 1.2071, "step": 5500 }, { "epoch": 1.693781008422753, "grad_norm": 1.1114752292633057, "learning_rate": 1.996779152940914e-05, "loss": 1.1524, "step": 5505 }, { "epoch": 1.6953194107918925, "grad_norm": 1.055211067199707, "learning_rate": 1.99283456330581e-05, "loss": 1.1581, "step": 5510 }, { "epoch": 1.6968578131610323, "grad_norm": 3.748779773712158, "learning_rate": 1.988891290047075e-05, "loss": 1.1191, "step": 5515 }, { "epoch": 1.698396215530172, "grad_norm": 1.2784082889556885, "learning_rate": 1.9849493433996963e-05, "loss": 1.0007, "step": 5520 }, { "epoch": 1.6999346178993116, "grad_norm": 1.2205626964569092, "learning_rate": 1.9810087335952172e-05, "loss": 1.1451, "step": 5525 }, { "epoch": 1.7014730202684514, "grad_norm": 0.999777615070343, "learning_rate": 1.977069470861714e-05, "loss": 1.1493, "step": 5530 }, { "epoch": 1.7030114226375908, "grad_norm": 1.3446388244628906, "learning_rate": 1.9731315654237613e-05, "loss": 1.1549, "step": 5535 }, { "epoch": 1.7045498250067306, "grad_norm": 1.1934943199157715, "learning_rate": 1.9691950275024144e-05, "loss": 1.2842, "step": 5540 }, { "epoch": 1.7060882273758702, "grad_norm": 1.0282137393951416, "learning_rate": 1.9652598673151798e-05, "loss": 0.9822, "step": 5545 }, { "epoch": 1.7076266297450098, "grad_norm": 1.6977639198303223, "learning_rate": 1.961326095075986e-05, "loss": 1.1677, "step": 5550 }, { "epoch": 1.7091650321141496, "grad_norm": 1.1296579837799072, "learning_rate": 1.9573937209951604e-05, "loss": 1.1473, "step": 5555 }, { "epoch": 1.710703434483289, "grad_norm": 1.5597577095031738, "learning_rate": 1.9534627552793998e-05, "loss": 1.1344, "step": 5560 }, { "epoch": 1.7122418368524288, "grad_norm": 1.7576704025268555, "learning_rate": 1.9495332081317464e-05, "loss": 1.1523, "step": 5565 }, { "epoch": 1.7137802392215684, "grad_norm": 0.7898119688034058, "learning_rate": 1.945605089751561e-05, "loss": 1.2694, "step": 5570 }, { "epoch": 1.715318641590708, "grad_norm": 1.6534613370895386, "learning_rate": 1.9416784103344958e-05, "loss": 1.2023, "step": 5575 }, { "epoch": 1.7168570439598478, "grad_norm": 1.4977833032608032, "learning_rate": 1.937753180072466e-05, "loss": 1.134, "step": 5580 }, { "epoch": 1.7183954463289872, "grad_norm": 1.0111984014511108, "learning_rate": 1.93382940915363e-05, "loss": 1.1758, "step": 5585 }, { "epoch": 1.719933848698127, "grad_norm": 1.2132163047790527, "learning_rate": 1.9299071077623536e-05, "loss": 1.0628, "step": 5590 }, { "epoch": 1.7214722510672666, "grad_norm": 1.3343091011047363, "learning_rate": 1.9259862860791894e-05, "loss": 1.0998, "step": 5595 }, { "epoch": 1.7230106534364062, "grad_norm": 1.314704179763794, "learning_rate": 1.922066954280852e-05, "loss": 1.1529, "step": 5600 }, { "epoch": 1.724549055805546, "grad_norm": 1.7054861783981323, "learning_rate": 1.918149122540187e-05, "loss": 1.1795, "step": 5605 }, { "epoch": 1.7260874581746855, "grad_norm": 1.477265477180481, "learning_rate": 1.9142328010261463e-05, "loss": 1.1547, "step": 5610 }, { "epoch": 1.7276258605438253, "grad_norm": 3.371182918548584, "learning_rate": 1.910317999903762e-05, "loss": 1.13, "step": 5615 }, { "epoch": 1.7291642629129649, "grad_norm": 1.1603204011917114, "learning_rate": 1.9064047293341205e-05, "loss": 1.2422, "step": 5620 }, { "epoch": 1.7307026652821045, "grad_norm": 1.587916612625122, "learning_rate": 1.9024929994743354e-05, "loss": 1.2718, "step": 5625 }, { "epoch": 1.7322410676512443, "grad_norm": 1.9051862955093384, "learning_rate": 1.8985828204775206e-05, "loss": 1.0266, "step": 5630 }, { "epoch": 1.7337794700203837, "grad_norm": 0.9715479016304016, "learning_rate": 1.8946742024927662e-05, "loss": 1.0988, "step": 5635 }, { "epoch": 1.7353178723895235, "grad_norm": 1.471206545829773, "learning_rate": 1.8907671556651102e-05, "loss": 1.2839, "step": 5640 }, { "epoch": 1.736856274758663, "grad_norm": 1.127846598625183, "learning_rate": 1.8868616901355096e-05, "loss": 1.084, "step": 5645 }, { "epoch": 1.7383946771278027, "grad_norm": 1.6066924333572388, "learning_rate": 1.8829578160408216e-05, "loss": 1.1079, "step": 5650 }, { "epoch": 1.7399330794969425, "grad_norm": 1.393292784690857, "learning_rate": 1.8790555435137697e-05, "loss": 1.0559, "step": 5655 }, { "epoch": 1.741471481866082, "grad_norm": 1.9487007856369019, "learning_rate": 1.875154882682922e-05, "loss": 1.1889, "step": 5660 }, { "epoch": 1.7430098842352217, "grad_norm": 0.9660601019859314, "learning_rate": 1.8712558436726623e-05, "loss": 1.1461, "step": 5665 }, { "epoch": 1.7445482866043613, "grad_norm": 0.9336276054382324, "learning_rate": 1.8673584366031647e-05, "loss": 1.1877, "step": 5670 }, { "epoch": 1.746086688973501, "grad_norm": 1.3800424337387085, "learning_rate": 1.8634626715903693e-05, "loss": 1.2214, "step": 5675 }, { "epoch": 1.7476250913426408, "grad_norm": 0.946338415145874, "learning_rate": 1.8595685587459522e-05, "loss": 1.2095, "step": 5680 }, { "epoch": 1.7491634937117804, "grad_norm": 2.1087701320648193, "learning_rate": 1.8556761081773013e-05, "loss": 1.0836, "step": 5685 }, { "epoch": 1.75070189608092, "grad_norm": 0.8675004243850708, "learning_rate": 1.851785329987492e-05, "loss": 1.1866, "step": 5690 }, { "epoch": 1.7522402984500596, "grad_norm": 1.0243099927902222, "learning_rate": 1.8478962342752583e-05, "loss": 1.2131, "step": 5695 }, { "epoch": 1.7537787008191992, "grad_norm": 1.2281181812286377, "learning_rate": 1.8440088311349634e-05, "loss": 1.2623, "step": 5700 }, { "epoch": 1.755317103188339, "grad_norm": 1.4797272682189941, "learning_rate": 1.840123130656583e-05, "loss": 1.2044, "step": 5705 }, { "epoch": 1.7568555055574786, "grad_norm": 1.8646970987319946, "learning_rate": 1.8362391429256698e-05, "loss": 1.152, "step": 5710 }, { "epoch": 1.7583939079266182, "grad_norm": 1.331153154373169, "learning_rate": 1.8323568780233325e-05, "loss": 1.2001, "step": 5715 }, { "epoch": 1.7599323102957578, "grad_norm": 1.3193144798278809, "learning_rate": 1.8284763460262085e-05, "loss": 1.2019, "step": 5720 }, { "epoch": 1.7614707126648974, "grad_norm": 2.1348347663879395, "learning_rate": 1.824597557006434e-05, "loss": 1.1155, "step": 5725 }, { "epoch": 1.7630091150340372, "grad_norm": 0.8556899428367615, "learning_rate": 1.820720521031626e-05, "loss": 1.208, "step": 5730 }, { "epoch": 1.7645475174031768, "grad_norm": 1.1282209157943726, "learning_rate": 1.8168452481648476e-05, "loss": 1.2334, "step": 5735 }, { "epoch": 1.7660859197723164, "grad_norm": 1.082572102546692, "learning_rate": 1.8129717484645876e-05, "loss": 1.0867, "step": 5740 }, { "epoch": 1.7676243221414563, "grad_norm": 1.2311683893203735, "learning_rate": 1.809100031984734e-05, "loss": 1.2453, "step": 5745 }, { "epoch": 1.7691627245105956, "grad_norm": 1.6507987976074219, "learning_rate": 1.805230108774541e-05, "loss": 1.1704, "step": 5750 }, { "epoch": 1.7707011268797355, "grad_norm": 1.152384877204895, "learning_rate": 1.8013619888786127e-05, "loss": 1.2105, "step": 5755 }, { "epoch": 1.772239529248875, "grad_norm": 1.057258129119873, "learning_rate": 1.7974956823368727e-05, "loss": 1.2076, "step": 5760 }, { "epoch": 1.7737779316180147, "grad_norm": 0.8284866809844971, "learning_rate": 1.7936311991845355e-05, "loss": 1.1031, "step": 5765 }, { "epoch": 1.7753163339871545, "grad_norm": 0.9494788646697998, "learning_rate": 1.789768549452085e-05, "loss": 1.1328, "step": 5770 }, { "epoch": 1.7768547363562939, "grad_norm": 0.9356427192687988, "learning_rate": 1.785907743165245e-05, "loss": 1.1613, "step": 5775 }, { "epoch": 1.7783931387254337, "grad_norm": 1.4174987077713013, "learning_rate": 1.7820487903449544e-05, "loss": 1.1835, "step": 5780 }, { "epoch": 1.7799315410945733, "grad_norm": 0.9856932163238525, "learning_rate": 1.778191701007343e-05, "loss": 1.1883, "step": 5785 }, { "epoch": 1.781469943463713, "grad_norm": 1.5140010118484497, "learning_rate": 1.7743364851637017e-05, "loss": 1.0919, "step": 5790 }, { "epoch": 1.7830083458328527, "grad_norm": 1.0665122270584106, "learning_rate": 1.7704831528204608e-05, "loss": 1.2349, "step": 5795 }, { "epoch": 1.784546748201992, "grad_norm": 1.7453125715255737, "learning_rate": 1.7666317139791618e-05, "loss": 1.0932, "step": 5800 }, { "epoch": 1.786085150571132, "grad_norm": 1.5194388628005981, "learning_rate": 1.7627821786364265e-05, "loss": 1.1284, "step": 5805 }, { "epoch": 1.7876235529402715, "grad_norm": 1.0937379598617554, "learning_rate": 1.7589345567839433e-05, "loss": 1.1492, "step": 5810 }, { "epoch": 1.7891619553094111, "grad_norm": 1.1306276321411133, "learning_rate": 1.75508885840843e-05, "loss": 1.1348, "step": 5815 }, { "epoch": 1.790700357678551, "grad_norm": 1.172468900680542, "learning_rate": 1.7512450934916128e-05, "loss": 1.1805, "step": 5820 }, { "epoch": 1.7922387600476903, "grad_norm": 1.2849513292312622, "learning_rate": 1.747403272010199e-05, "loss": 1.2076, "step": 5825 }, { "epoch": 1.7937771624168302, "grad_norm": 1.0420297384262085, "learning_rate": 1.7435634039358527e-05, "loss": 1.1641, "step": 5830 }, { "epoch": 1.7953155647859698, "grad_norm": 0.999232828617096, "learning_rate": 1.7397254992351662e-05, "loss": 1.0551, "step": 5835 }, { "epoch": 1.7968539671551094, "grad_norm": 1.499186635017395, "learning_rate": 1.7358895678696368e-05, "loss": 1.2383, "step": 5840 }, { "epoch": 1.7983923695242492, "grad_norm": 1.5105886459350586, "learning_rate": 1.73205561979564e-05, "loss": 1.1292, "step": 5845 }, { "epoch": 1.7999307718933886, "grad_norm": 1.0507991313934326, "learning_rate": 1.7282236649644035e-05, "loss": 1.1165, "step": 5850 }, { "epoch": 1.8014691742625284, "grad_norm": 1.8862839937210083, "learning_rate": 1.7243937133219818e-05, "loss": 1.1018, "step": 5855 }, { "epoch": 1.803007576631668, "grad_norm": 1.0384774208068848, "learning_rate": 1.7205657748092275e-05, "loss": 1.2132, "step": 5860 }, { "epoch": 1.8045459790008076, "grad_norm": 1.6291218996047974, "learning_rate": 1.716739859361771e-05, "loss": 1.1987, "step": 5865 }, { "epoch": 1.8060843813699474, "grad_norm": 0.9344973564147949, "learning_rate": 1.712915976909992e-05, "loss": 1.1942, "step": 5870 }, { "epoch": 1.8076227837390868, "grad_norm": 1.1848701238632202, "learning_rate": 1.7090941373789898e-05, "loss": 1.2246, "step": 5875 }, { "epoch": 1.8091611861082266, "grad_norm": 2.381784439086914, "learning_rate": 1.7052743506885652e-05, "loss": 1.1942, "step": 5880 }, { "epoch": 1.8106995884773662, "grad_norm": 2.1836729049682617, "learning_rate": 1.701456626753189e-05, "loss": 1.1698, "step": 5885 }, { "epoch": 1.8122379908465058, "grad_norm": 1.0437754392623901, "learning_rate": 1.6976409754819767e-05, "loss": 1.195, "step": 5890 }, { "epoch": 1.8137763932156457, "grad_norm": 1.6375466585159302, "learning_rate": 1.6938274067786663e-05, "loss": 1.2909, "step": 5895 }, { "epoch": 1.815314795584785, "grad_norm": 1.204934000968933, "learning_rate": 1.6900159305415892e-05, "loss": 1.1238, "step": 5900 }, { "epoch": 1.8168531979539249, "grad_norm": 1.3872859477996826, "learning_rate": 1.6862065566636466e-05, "loss": 1.2359, "step": 5905 }, { "epoch": 1.8183916003230645, "grad_norm": 1.733851432800293, "learning_rate": 1.682399295032283e-05, "loss": 1.1205, "step": 5910 }, { "epoch": 1.819930002692204, "grad_norm": 1.224089503288269, "learning_rate": 1.6785941555294573e-05, "loss": 1.1741, "step": 5915 }, { "epoch": 1.8214684050613439, "grad_norm": 1.3531755208969116, "learning_rate": 1.675551578496907e-05, "loss": 1.1854, "step": 5920 }, { "epoch": 1.8230068074304835, "grad_norm": 1.2245914936065674, "learning_rate": 1.6717502837103975e-05, "loss": 1.1473, "step": 5925 }, { "epoch": 1.824545209799623, "grad_norm": 1.7382968664169312, "learning_rate": 1.6679511386925337e-05, "loss": 1.1192, "step": 5930 }, { "epoch": 1.8260836121687627, "grad_norm": 2.019582748413086, "learning_rate": 1.6641541533042098e-05, "loss": 1.1244, "step": 5935 }, { "epoch": 1.8276220145379023, "grad_norm": 1.7222234010696411, "learning_rate": 1.6603593374007153e-05, "loss": 1.0861, "step": 5940 }, { "epoch": 1.8291604169070421, "grad_norm": 1.1440834999084473, "learning_rate": 1.656566700831708e-05, "loss": 1.1462, "step": 5945 }, { "epoch": 1.8306988192761817, "grad_norm": 2.6508567333221436, "learning_rate": 1.6527762534411888e-05, "loss": 1.0422, "step": 5950 }, { "epoch": 1.8322372216453213, "grad_norm": 1.4697010517120361, "learning_rate": 1.6489880050674767e-05, "loss": 1.184, "step": 5955 }, { "epoch": 1.8337756240144611, "grad_norm": 1.2767016887664795, "learning_rate": 1.6452019655431828e-05, "loss": 1.1303, "step": 5960 }, { "epoch": 1.8353140263836005, "grad_norm": 1.345672607421875, "learning_rate": 1.641418144695185e-05, "loss": 1.0967, "step": 5965 }, { "epoch": 1.8368524287527404, "grad_norm": 0.7754538059234619, "learning_rate": 1.637636552344604e-05, "loss": 1.1405, "step": 5970 }, { "epoch": 1.83839083112188, "grad_norm": 1.5453096628189087, "learning_rate": 1.6338571983067754e-05, "loss": 1.0661, "step": 5975 }, { "epoch": 1.8399292334910196, "grad_norm": 1.0012327432632446, "learning_rate": 1.6300800923912224e-05, "loss": 1.1432, "step": 5980 }, { "epoch": 1.8414676358601594, "grad_norm": 0.9194159507751465, "learning_rate": 1.6263052444016374e-05, "loss": 1.1303, "step": 5985 }, { "epoch": 1.8430060382292988, "grad_norm": 0.9644936323165894, "learning_rate": 1.62253266413585e-05, "loss": 1.177, "step": 5990 }, { "epoch": 1.8445444405984386, "grad_norm": 1.5765095949172974, "learning_rate": 1.6187623613858038e-05, "loss": 1.1667, "step": 5995 }, { "epoch": 1.8460828429675782, "grad_norm": 0.9766362309455872, "learning_rate": 1.6149943459375312e-05, "loss": 1.0883, "step": 6000 }, { "epoch": 1.8476212453367178, "grad_norm": 1.214890480041504, "learning_rate": 1.6112286275711298e-05, "loss": 1.1184, "step": 6005 }, { "epoch": 1.8491596477058576, "grad_norm": 1.128957986831665, "learning_rate": 1.6074652160607302e-05, "loss": 1.2097, "step": 6010 }, { "epoch": 1.850698050074997, "grad_norm": 0.8823169469833374, "learning_rate": 1.603704121174479e-05, "loss": 1.1738, "step": 6015 }, { "epoch": 1.8522364524441368, "grad_norm": 1.1140327453613281, "learning_rate": 1.5999453526745104e-05, "loss": 1.116, "step": 6020 }, { "epoch": 1.8537748548132764, "grad_norm": 1.464568853378296, "learning_rate": 1.5961889203169184e-05, "loss": 1.1432, "step": 6025 }, { "epoch": 1.855313257182416, "grad_norm": 1.134320616722107, "learning_rate": 1.592434833851734e-05, "loss": 1.1429, "step": 6030 }, { "epoch": 1.8568516595515558, "grad_norm": 1.3063602447509766, "learning_rate": 1.5886831030229e-05, "loss": 1.1284, "step": 6035 }, { "epoch": 1.8583900619206952, "grad_norm": 2.385805130004883, "learning_rate": 1.5849337375682435e-05, "loss": 1.1124, "step": 6040 }, { "epoch": 1.859928464289835, "grad_norm": 0.9437244534492493, "learning_rate": 1.5811867472194535e-05, "loss": 1.173, "step": 6045 }, { "epoch": 1.8614668666589747, "grad_norm": 1.2181042432785034, "learning_rate": 1.577442141702054e-05, "loss": 1.3157, "step": 6050 }, { "epoch": 1.8630052690281143, "grad_norm": 1.382906198501587, "learning_rate": 1.5736999307353785e-05, "loss": 1.0986, "step": 6055 }, { "epoch": 1.864543671397254, "grad_norm": 1.4896674156188965, "learning_rate": 1.5699601240325474e-05, "loss": 1.2179, "step": 6060 }, { "epoch": 1.8660820737663935, "grad_norm": 1.4793058633804321, "learning_rate": 1.5662227313004364e-05, "loss": 1.0728, "step": 6065 }, { "epoch": 1.8676204761355333, "grad_norm": 1.6949820518493652, "learning_rate": 1.5624877622396588e-05, "loss": 1.1914, "step": 6070 }, { "epoch": 1.8691588785046729, "grad_norm": 1.629065990447998, "learning_rate": 1.5587552265445375e-05, "loss": 1.2429, "step": 6075 }, { "epoch": 1.8706972808738125, "grad_norm": 1.2364530563354492, "learning_rate": 1.5550251339030783e-05, "loss": 1.0742, "step": 6080 }, { "epoch": 1.8722356832429523, "grad_norm": 0.8806670904159546, "learning_rate": 1.5512974939969464e-05, "loss": 1.1999, "step": 6085 }, { "epoch": 1.8737740856120917, "grad_norm": 1.9689141511917114, "learning_rate": 1.5475723165014393e-05, "loss": 1.1117, "step": 6090 }, { "epoch": 1.8753124879812315, "grad_norm": 0.9261854887008667, "learning_rate": 1.543849611085465e-05, "loss": 1.0828, "step": 6095 }, { "epoch": 1.8768508903503711, "grad_norm": 1.5326859951019287, "learning_rate": 1.5401293874115147e-05, "loss": 1.0189, "step": 6100 }, { "epoch": 1.8783892927195107, "grad_norm": 1.155781865119934, "learning_rate": 1.5364116551356376e-05, "loss": 1.0945, "step": 6105 }, { "epoch": 1.8799276950886505, "grad_norm": 1.5384670495986938, "learning_rate": 1.532696423907416e-05, "loss": 1.205, "step": 6110 }, { "epoch": 1.88146609745779, "grad_norm": 1.558389663696289, "learning_rate": 1.528983703369943e-05, "loss": 1.1512, "step": 6115 }, { "epoch": 1.8830044998269297, "grad_norm": 1.9130889177322388, "learning_rate": 1.5252735031597915e-05, "loss": 1.1126, "step": 6120 }, { "epoch": 1.8845429021960693, "grad_norm": 1.6255749464035034, "learning_rate": 1.521565832906994e-05, "loss": 1.1858, "step": 6125 }, { "epoch": 1.886081304565209, "grad_norm": 1.8116388320922852, "learning_rate": 1.5178607022350186e-05, "loss": 1.0117, "step": 6130 }, { "epoch": 1.8876197069343488, "grad_norm": 1.0656421184539795, "learning_rate": 1.5141581207607391e-05, "loss": 1.1256, "step": 6135 }, { "epoch": 1.8891581093034884, "grad_norm": 1.6898198127746582, "learning_rate": 1.5104580980944141e-05, "loss": 1.0737, "step": 6140 }, { "epoch": 1.890696511672628, "grad_norm": 1.3169077634811401, "learning_rate": 1.5067606438396595e-05, "loss": 1.1341, "step": 6145 }, { "epoch": 1.8922349140417676, "grad_norm": 0.8919572234153748, "learning_rate": 1.5030657675934256e-05, "loss": 1.0732, "step": 6150 }, { "epoch": 1.8937733164109072, "grad_norm": 2.686988353729248, "learning_rate": 1.4993734789459718e-05, "loss": 1.229, "step": 6155 }, { "epoch": 1.895311718780047, "grad_norm": 1.9947311878204346, "learning_rate": 1.4956837874808391e-05, "loss": 1.116, "step": 6160 }, { "epoch": 1.8968501211491866, "grad_norm": 1.5105503797531128, "learning_rate": 1.4919967027748306e-05, "loss": 1.1457, "step": 6165 }, { "epoch": 1.8983885235183262, "grad_norm": 1.2878894805908203, "learning_rate": 1.4883122343979822e-05, "loss": 1.1197, "step": 6170 }, { "epoch": 1.8999269258874658, "grad_norm": 1.2365055084228516, "learning_rate": 1.4846303919135355e-05, "loss": 1.1429, "step": 6175 }, { "epoch": 1.9014653282566054, "grad_norm": 2.5517539978027344, "learning_rate": 1.4809511848779217e-05, "loss": 1.1013, "step": 6180 }, { "epoch": 1.9030037306257452, "grad_norm": 1.3758244514465332, "learning_rate": 1.4772746228407289e-05, "loss": 1.0995, "step": 6185 }, { "epoch": 1.9045421329948848, "grad_norm": 1.0151314735412598, "learning_rate": 1.4736007153446801e-05, "loss": 1.0569, "step": 6190 }, { "epoch": 1.9060805353640244, "grad_norm": 0.9583563208580017, "learning_rate": 1.4699294719256091e-05, "loss": 1.1816, "step": 6195 }, { "epoch": 1.9076189377331643, "grad_norm": 1.0372930765151978, "learning_rate": 1.466260902112433e-05, "loss": 1.1818, "step": 6200 }, { "epoch": 1.9091573401023036, "grad_norm": 1.4027732610702515, "learning_rate": 1.4625950154271317e-05, "loss": 1.3359, "step": 6205 }, { "epoch": 1.9106957424714435, "grad_norm": 1.8428670167922974, "learning_rate": 1.4589318213847197e-05, "loss": 1.1615, "step": 6210 }, { "epoch": 1.912234144840583, "grad_norm": 2.2269155979156494, "learning_rate": 1.4552713294932226e-05, "loss": 1.1065, "step": 6215 }, { "epoch": 1.9137725472097227, "grad_norm": 1.9540749788284302, "learning_rate": 1.4516135492536539e-05, "loss": 1.0822, "step": 6220 }, { "epoch": 1.9153109495788625, "grad_norm": 1.690127968788147, "learning_rate": 1.447958490159987e-05, "loss": 1.1871, "step": 6225 }, { "epoch": 1.9168493519480019, "grad_norm": 1.6063660383224487, "learning_rate": 1.444306161699131e-05, "loss": 1.2001, "step": 6230 }, { "epoch": 1.9183877543171417, "grad_norm": 1.3402923345565796, "learning_rate": 1.4406565733509126e-05, "loss": 1.0825, "step": 6235 }, { "epoch": 1.9199261566862813, "grad_norm": 1.2019435167312622, "learning_rate": 1.4370097345880407e-05, "loss": 1.2244, "step": 6240 }, { "epoch": 1.921464559055421, "grad_norm": 0.9290767312049866, "learning_rate": 1.433365654876091e-05, "loss": 1.2298, "step": 6245 }, { "epoch": 1.9230029614245607, "grad_norm": 1.7722569704055786, "learning_rate": 1.4297243436734797e-05, "loss": 1.2057, "step": 6250 }, { "epoch": 1.9245413637937, "grad_norm": 1.9733870029449463, "learning_rate": 1.4260858104314297e-05, "loss": 1.1947, "step": 6255 }, { "epoch": 1.92607976616284, "grad_norm": 1.5969133377075195, "learning_rate": 1.422450064593961e-05, "loss": 1.1204, "step": 6260 }, { "epoch": 1.9276181685319795, "grad_norm": 1.4854793548583984, "learning_rate": 1.4188171155978566e-05, "loss": 1.2412, "step": 6265 }, { "epoch": 1.9291565709011191, "grad_norm": 1.1663974523544312, "learning_rate": 1.4151869728726378e-05, "loss": 1.2189, "step": 6270 }, { "epoch": 1.930694973270259, "grad_norm": 1.081315279006958, "learning_rate": 1.4115596458405459e-05, "loss": 1.2044, "step": 6275 }, { "epoch": 1.9322333756393983, "grad_norm": 1.7983838319778442, "learning_rate": 1.4079351439165106e-05, "loss": 1.187, "step": 6280 }, { "epoch": 1.9337717780085382, "grad_norm": 1.329074740409851, "learning_rate": 1.4043134765081297e-05, "loss": 1.1532, "step": 6285 }, { "epoch": 1.9353101803776778, "grad_norm": 1.1615062952041626, "learning_rate": 1.4006946530156462e-05, "loss": 1.1056, "step": 6290 }, { "epoch": 1.9368485827468174, "grad_norm": 1.0058648586273193, "learning_rate": 1.397078682831917e-05, "loss": 1.1568, "step": 6295 }, { "epoch": 1.9383869851159572, "grad_norm": 1.2599167823791504, "learning_rate": 1.3934655753423976e-05, "loss": 1.1126, "step": 6300 }, { "epoch": 1.9399253874850966, "grad_norm": 1.3756744861602783, "learning_rate": 1.389855339925113e-05, "loss": 1.0468, "step": 6305 }, { "epoch": 1.9414637898542364, "grad_norm": 1.7275495529174805, "learning_rate": 1.386247985950628e-05, "loss": 1.1714, "step": 6310 }, { "epoch": 1.943002192223376, "grad_norm": 1.5819746255874634, "learning_rate": 1.3826435227820344e-05, "loss": 1.1858, "step": 6315 }, { "epoch": 1.9445405945925156, "grad_norm": 1.3510098457336426, "learning_rate": 1.3790419597749199e-05, "loss": 1.1779, "step": 6320 }, { "epoch": 1.9460789969616554, "grad_norm": 1.3834389448165894, "learning_rate": 1.3754433062773409e-05, "loss": 1.1468, "step": 6325 }, { "epoch": 1.9476173993307948, "grad_norm": 1.2175872325897217, "learning_rate": 1.3718475716298073e-05, "loss": 1.1402, "step": 6330 }, { "epoch": 1.9491558016999346, "grad_norm": 0.9370445609092712, "learning_rate": 1.368254765165249e-05, "loss": 1.0989, "step": 6335 }, { "epoch": 1.9506942040690742, "grad_norm": 2.2398204803466797, "learning_rate": 1.3646648962089965e-05, "loss": 1.17, "step": 6340 }, { "epoch": 1.9522326064382138, "grad_norm": 1.014602780342102, "learning_rate": 1.3610779740787571e-05, "loss": 1.1466, "step": 6345 }, { "epoch": 1.9537710088073537, "grad_norm": 1.669202446937561, "learning_rate": 1.3574940080845875e-05, "loss": 1.2066, "step": 6350 }, { "epoch": 1.9553094111764933, "grad_norm": 1.477262258529663, "learning_rate": 1.3539130075288731e-05, "loss": 1.0777, "step": 6355 }, { "epoch": 1.9568478135456329, "grad_norm": 2.8099093437194824, "learning_rate": 1.3503349817063047e-05, "loss": 1.2807, "step": 6360 }, { "epoch": 1.9583862159147725, "grad_norm": 1.3674906492233276, "learning_rate": 1.3467599399038445e-05, "loss": 1.1896, "step": 6365 }, { "epoch": 1.959924618283912, "grad_norm": 1.12281334400177, "learning_rate": 1.3431878914007167e-05, "loss": 1.1885, "step": 6370 }, { "epoch": 1.961463020653052, "grad_norm": 1.3317580223083496, "learning_rate": 1.3396188454683745e-05, "loss": 1.1905, "step": 6375 }, { "epoch": 1.9630014230221915, "grad_norm": 1.4334850311279297, "learning_rate": 1.3360528113704751e-05, "loss": 1.1287, "step": 6380 }, { "epoch": 1.964539825391331, "grad_norm": 1.2335008382797241, "learning_rate": 1.332489798362862e-05, "loss": 1.2626, "step": 6385 }, { "epoch": 1.9660782277604707, "grad_norm": 2.188669204711914, "learning_rate": 1.3289298156935348e-05, "loss": 1.0611, "step": 6390 }, { "epoch": 1.9676166301296103, "grad_norm": 2.0799977779388428, "learning_rate": 1.3253728726026276e-05, "loss": 1.1448, "step": 6395 }, { "epoch": 1.9691550324987501, "grad_norm": 1.1249916553497314, "learning_rate": 1.321818978322387e-05, "loss": 1.0762, "step": 6400 }, { "epoch": 1.9706934348678897, "grad_norm": 1.3204679489135742, "learning_rate": 1.3182681420771453e-05, "loss": 1.0773, "step": 6405 }, { "epoch": 1.9722318372370293, "grad_norm": 1.5374983549118042, "learning_rate": 1.3147203730832963e-05, "loss": 1.0408, "step": 6410 }, { "epoch": 1.9737702396061692, "grad_norm": 1.7096116542816162, "learning_rate": 1.3111756805492752e-05, "loss": 1.1046, "step": 6415 }, { "epoch": 1.9753086419753085, "grad_norm": 1.8964815139770508, "learning_rate": 1.3076340736755293e-05, "loss": 1.1332, "step": 6420 }, { "epoch": 1.9768470443444484, "grad_norm": 1.614093542098999, "learning_rate": 1.304095561654498e-05, "loss": 1.1041, "step": 6425 }, { "epoch": 1.978385446713588, "grad_norm": 1.7104381322860718, "learning_rate": 1.3005601536705889e-05, "loss": 1.3129, "step": 6430 }, { "epoch": 1.9799238490827276, "grad_norm": 1.825789451599121, "learning_rate": 1.2970278589001505e-05, "loss": 1.2037, "step": 6435 }, { "epoch": 1.9814622514518674, "grad_norm": 0.9708835482597351, "learning_rate": 1.293498686511454e-05, "loss": 1.1983, "step": 6440 }, { "epoch": 1.9830006538210068, "grad_norm": 1.194024920463562, "learning_rate": 1.2899726456646635e-05, "loss": 1.1271, "step": 6445 }, { "epoch": 1.9845390561901466, "grad_norm": 1.3245621919631958, "learning_rate": 1.2864497455118152e-05, "loss": 1.1675, "step": 6450 }, { "epoch": 1.9860774585592862, "grad_norm": 1.3149336576461792, "learning_rate": 1.2829299951967954e-05, "loss": 1.0849, "step": 6455 }, { "epoch": 1.9876158609284258, "grad_norm": 1.097424030303955, "learning_rate": 1.2794134038553141e-05, "loss": 1.0848, "step": 6460 }, { "epoch": 1.9891542632975656, "grad_norm": 1.0103280544281006, "learning_rate": 1.2758999806148813e-05, "loss": 1.1289, "step": 6465 }, { "epoch": 1.990692665666705, "grad_norm": 1.243175983428955, "learning_rate": 1.2723897345947828e-05, "loss": 1.1198, "step": 6470 }, { "epoch": 1.9922310680358448, "grad_norm": 2.570831298828125, "learning_rate": 1.2688826749060611e-05, "loss": 1.1318, "step": 6475 }, { "epoch": 1.9937694704049844, "grad_norm": 1.352293610572815, "learning_rate": 1.2653788106514852e-05, "loss": 1.2472, "step": 6480 }, { "epoch": 1.995307872774124, "grad_norm": 2.3871512413024902, "learning_rate": 1.2618781509255332e-05, "loss": 1.2089, "step": 6485 }, { "epoch": 1.9968462751432638, "grad_norm": 0.9660901427268982, "learning_rate": 1.2583807048143617e-05, "loss": 1.2677, "step": 6490 }, { "epoch": 1.9983846775124032, "grad_norm": 1.012502670288086, "learning_rate": 1.2548864813957909e-05, "loss": 1.214, "step": 6495 }, { "epoch": 1.999923079881543, "grad_norm": 2.1729748249053955, "learning_rate": 1.2513954897392727e-05, "loss": 1.1746, "step": 6500 }, { "epoch": 2.001461482250683, "grad_norm": 1.1099797487258911, "learning_rate": 1.2479077389058708e-05, "loss": 1.0032, "step": 6505 }, { "epoch": 2.0029998846198223, "grad_norm": 1.2987159490585327, "learning_rate": 1.2444232379482398e-05, "loss": 1.0929, "step": 6510 }, { "epoch": 2.004538286988962, "grad_norm": 1.4184848070144653, "learning_rate": 1.2409419959105981e-05, "loss": 1.1196, "step": 6515 }, { "epoch": 2.0060766893581015, "grad_norm": 1.006828784942627, "learning_rate": 1.237464021828704e-05, "loss": 1.1062, "step": 6520 }, { "epoch": 2.0076150917272413, "grad_norm": 1.3644285202026367, "learning_rate": 1.233989324729834e-05, "loss": 1.1301, "step": 6525 }, { "epoch": 2.009153494096381, "grad_norm": 0.9252627491950989, "learning_rate": 1.2305179136327608e-05, "loss": 1.2008, "step": 6530 }, { "epoch": 2.0106918964655205, "grad_norm": 2.844708204269409, "learning_rate": 1.2270497975477253e-05, "loss": 1.1736, "step": 6535 }, { "epoch": 2.0122302988346603, "grad_norm": 1.0435899496078491, "learning_rate": 1.2235849854764194e-05, "loss": 1.1997, "step": 6540 }, { "epoch": 2.0137687012037997, "grad_norm": 1.7181456089019775, "learning_rate": 1.2201234864119554e-05, "loss": 1.0245, "step": 6545 }, { "epoch": 2.0153071035729395, "grad_norm": 2.6238577365875244, "learning_rate": 1.2166653093388506e-05, "loss": 1.0753, "step": 6550 }, { "epoch": 2.0168455059420793, "grad_norm": 1.2975784540176392, "learning_rate": 1.2132104632329963e-05, "loss": 1.1651, "step": 6555 }, { "epoch": 2.0183839083112187, "grad_norm": 1.737640380859375, "learning_rate": 1.2097589570616394e-05, "loss": 1.1217, "step": 6560 }, { "epoch": 2.0199223106803585, "grad_norm": 1.2361128330230713, "learning_rate": 1.2063107997833581e-05, "loss": 1.1589, "step": 6565 }, { "epoch": 2.021460713049498, "grad_norm": 1.1839301586151123, "learning_rate": 1.2028660003480399e-05, "loss": 1.1789, "step": 6570 }, { "epoch": 2.0229991154186377, "grad_norm": 1.2682217359542847, "learning_rate": 1.1994245676968538e-05, "loss": 1.2168, "step": 6575 }, { "epoch": 2.0245375177877776, "grad_norm": 1.4984492063522339, "learning_rate": 1.1959865107622307e-05, "loss": 1.0566, "step": 6580 }, { "epoch": 2.026075920156917, "grad_norm": 1.7460529804229736, "learning_rate": 1.1925518384678421e-05, "loss": 1.0522, "step": 6585 }, { "epoch": 2.027614322526057, "grad_norm": 1.5842326879501343, "learning_rate": 1.1891205597285712e-05, "loss": 1.0596, "step": 6590 }, { "epoch": 2.029152724895196, "grad_norm": 1.3402278423309326, "learning_rate": 1.1856926834504963e-05, "loss": 1.2102, "step": 6595 }, { "epoch": 2.030691127264336, "grad_norm": 1.3944041728973389, "learning_rate": 1.1822682185308612e-05, "loss": 1.1225, "step": 6600 }, { "epoch": 2.032229529633476, "grad_norm": 1.0951045751571655, "learning_rate": 1.1788471738580581e-05, "loss": 1.0554, "step": 6605 }, { "epoch": 2.033767932002615, "grad_norm": 0.8644259572029114, "learning_rate": 1.1754295583116004e-05, "loss": 1.1563, "step": 6610 }, { "epoch": 2.035306334371755, "grad_norm": 0.9207926392555237, "learning_rate": 1.1720153807620999e-05, "loss": 1.2274, "step": 6615 }, { "epoch": 2.0368447367408944, "grad_norm": 0.9534634947776794, "learning_rate": 1.168604650071247e-05, "loss": 1.1378, "step": 6620 }, { "epoch": 2.038383139110034, "grad_norm": 1.1268069744110107, "learning_rate": 1.1651973750917854e-05, "loss": 1.1663, "step": 6625 }, { "epoch": 2.039921541479174, "grad_norm": 1.331639051437378, "learning_rate": 1.1617935646674885e-05, "loss": 1.1084, "step": 6630 }, { "epoch": 2.0414599438483134, "grad_norm": 1.2026249170303345, "learning_rate": 1.1583932276331358e-05, "loss": 1.2261, "step": 6635 }, { "epoch": 2.0429983462174532, "grad_norm": 0.9286217093467712, "learning_rate": 1.154996372814495e-05, "loss": 1.0844, "step": 6640 }, { "epoch": 2.0445367485865926, "grad_norm": 1.787848711013794, "learning_rate": 1.1516030090282914e-05, "loss": 1.2545, "step": 6645 }, { "epoch": 2.0460751509557324, "grad_norm": 1.1162513494491577, "learning_rate": 1.1482131450821937e-05, "loss": 1.1272, "step": 6650 }, { "epoch": 2.0476135533248723, "grad_norm": 1.1817599534988403, "learning_rate": 1.1448267897747818e-05, "loss": 1.256, "step": 6655 }, { "epoch": 2.0491519556940117, "grad_norm": 1.7543742656707764, "learning_rate": 1.1414439518955334e-05, "loss": 1.1881, "step": 6660 }, { "epoch": 2.0506903580631515, "grad_norm": 1.54066801071167, "learning_rate": 1.1380646402247927e-05, "loss": 1.1412, "step": 6665 }, { "epoch": 2.052228760432291, "grad_norm": 1.0868725776672363, "learning_rate": 1.1346888635337522e-05, "loss": 1.02, "step": 6670 }, { "epoch": 2.0537671628014307, "grad_norm": 1.416038155555725, "learning_rate": 1.1313166305844306e-05, "loss": 1.0248, "step": 6675 }, { "epoch": 2.0553055651705705, "grad_norm": 1.1514451503753662, "learning_rate": 1.1279479501296492e-05, "loss": 1.13, "step": 6680 }, { "epoch": 2.05684396753971, "grad_norm": 1.4153822660446167, "learning_rate": 1.1245828309130061e-05, "loss": 1.121, "step": 6685 }, { "epoch": 2.0583823699088497, "grad_norm": 1.6066848039627075, "learning_rate": 1.1212212816688558e-05, "loss": 1.0533, "step": 6690 }, { "epoch": 2.059920772277989, "grad_norm": 1.7452136278152466, "learning_rate": 1.1178633111222909e-05, "loss": 1.1413, "step": 6695 }, { "epoch": 2.061459174647129, "grad_norm": 0.9885658025741577, "learning_rate": 1.1145089279891102e-05, "loss": 1.2047, "step": 6700 }, { "epoch": 2.0629975770162687, "grad_norm": 4.310669898986816, "learning_rate": 1.1111581409758043e-05, "loss": 1.1422, "step": 6705 }, { "epoch": 2.064535979385408, "grad_norm": 1.114450454711914, "learning_rate": 1.107810958779531e-05, "loss": 1.1967, "step": 6710 }, { "epoch": 2.066074381754548, "grad_norm": 1.012830376625061, "learning_rate": 1.1044673900880858e-05, "loss": 1.1861, "step": 6715 }, { "epoch": 2.0676127841236878, "grad_norm": 1.9352067708969116, "learning_rate": 1.101127443579891e-05, "loss": 1.1204, "step": 6720 }, { "epoch": 2.069151186492827, "grad_norm": 1.8013137578964233, "learning_rate": 1.0977911279239663e-05, "loss": 1.1936, "step": 6725 }, { "epoch": 2.070689588861967, "grad_norm": 1.516996145248413, "learning_rate": 1.0944584517799045e-05, "loss": 1.1004, "step": 6730 }, { "epoch": 2.0722279912311063, "grad_norm": 1.0563892126083374, "learning_rate": 1.091129423797855e-05, "loss": 1.1286, "step": 6735 }, { "epoch": 2.073766393600246, "grad_norm": 1.5256750583648682, "learning_rate": 1.0878040526184965e-05, "loss": 1.1727, "step": 6740 }, { "epoch": 2.075304795969386, "grad_norm": 1.3976362943649292, "learning_rate": 1.0844823468730158e-05, "loss": 1.1221, "step": 6745 }, { "epoch": 2.0768431983385254, "grad_norm": 1.1188265085220337, "learning_rate": 1.081164315183088e-05, "loss": 1.1319, "step": 6750 }, { "epoch": 2.078381600707665, "grad_norm": 2.3145363330841064, "learning_rate": 1.0778499661608491e-05, "loss": 1.1622, "step": 6755 }, { "epoch": 2.0799200030768046, "grad_norm": 2.635695457458496, "learning_rate": 1.0745393084088789e-05, "loss": 1.2318, "step": 6760 }, { "epoch": 2.0814584054459444, "grad_norm": 1.1798350811004639, "learning_rate": 1.0712323505201773e-05, "loss": 0.9846, "step": 6765 }, { "epoch": 2.0829968078150842, "grad_norm": 1.4065446853637695, "learning_rate": 1.0679291010781362e-05, "loss": 1.1356, "step": 6770 }, { "epoch": 2.0845352101842236, "grad_norm": 1.7471407651901245, "learning_rate": 1.0646295686565259e-05, "loss": 1.08, "step": 6775 }, { "epoch": 2.0860736125533634, "grad_norm": 1.3534942865371704, "learning_rate": 1.0613337618194691e-05, "loss": 1.1235, "step": 6780 }, { "epoch": 2.087612014922503, "grad_norm": 0.9599131941795349, "learning_rate": 1.0580416891214162e-05, "loss": 1.1837, "step": 6785 }, { "epoch": 2.0891504172916426, "grad_norm": 1.544646143913269, "learning_rate": 1.0547533591071285e-05, "loss": 1.2486, "step": 6790 }, { "epoch": 2.0906888196607825, "grad_norm": 1.2420892715454102, "learning_rate": 1.0514687803116499e-05, "loss": 1.2542, "step": 6795 }, { "epoch": 2.092227222029922, "grad_norm": 1.1411772966384888, "learning_rate": 1.0481879612602882e-05, "loss": 1.1346, "step": 6800 }, { "epoch": 2.0937656243990617, "grad_norm": 1.7670328617095947, "learning_rate": 1.0449109104685958e-05, "loss": 1.0802, "step": 6805 }, { "epoch": 2.095304026768201, "grad_norm": 2.038987636566162, "learning_rate": 1.0416376364423396e-05, "loss": 1.1316, "step": 6810 }, { "epoch": 2.096842429137341, "grad_norm": 1.5804378986358643, "learning_rate": 1.0383681476774876e-05, "loss": 1.172, "step": 6815 }, { "epoch": 2.0983808315064807, "grad_norm": 1.1744779348373413, "learning_rate": 1.035102452660183e-05, "loss": 1.1978, "step": 6820 }, { "epoch": 2.09991923387562, "grad_norm": 1.132908582687378, "learning_rate": 1.031840559866717e-05, "loss": 1.1058, "step": 6825 }, { "epoch": 2.10145763624476, "grad_norm": 1.5223757028579712, "learning_rate": 1.0285824777635172e-05, "loss": 1.0153, "step": 6830 }, { "epoch": 2.1029960386138993, "grad_norm": 1.6681894063949585, "learning_rate": 1.0253282148071198e-05, "loss": 1.1409, "step": 6835 }, { "epoch": 2.104534440983039, "grad_norm": 1.0178622007369995, "learning_rate": 1.022077779444145e-05, "loss": 1.3659, "step": 6840 }, { "epoch": 2.106072843352179, "grad_norm": 1.5879106521606445, "learning_rate": 1.0188311801112823e-05, "loss": 1.0986, "step": 6845 }, { "epoch": 2.1076112457213183, "grad_norm": 1.4189927577972412, "learning_rate": 1.0155884252352616e-05, "loss": 1.1785, "step": 6850 }, { "epoch": 2.109149648090458, "grad_norm": 1.4003583192825317, "learning_rate": 1.0123495232328342e-05, "loss": 1.1677, "step": 6855 }, { "epoch": 2.1106880504595975, "grad_norm": 1.5137903690338135, "learning_rate": 1.009114482510754e-05, "loss": 1.0557, "step": 6860 }, { "epoch": 2.1122264528287373, "grad_norm": 1.2828454971313477, "learning_rate": 1.0058833114657493e-05, "loss": 0.9958, "step": 6865 }, { "epoch": 2.113764855197877, "grad_norm": 1.0032036304473877, "learning_rate": 1.0026560184845066e-05, "loss": 1.0089, "step": 6870 }, { "epoch": 2.1153032575670165, "grad_norm": 1.6723458766937256, "learning_rate": 9.994326119436478e-06, "loss": 1.0826, "step": 6875 }, { "epoch": 2.1168416599361564, "grad_norm": 1.6122030019760132, "learning_rate": 9.962131002097022e-06, "loss": 1.0821, "step": 6880 }, { "epoch": 2.1183800623052957, "grad_norm": 1.042712926864624, "learning_rate": 9.929974916390953e-06, "loss": 1.0976, "step": 6885 }, { "epoch": 2.1199184646744356, "grad_norm": 1.353058099746704, "learning_rate": 9.897857945781196e-06, "loss": 1.2036, "step": 6890 }, { "epoch": 2.1214568670435754, "grad_norm": 1.3776549100875854, "learning_rate": 9.865780173629147e-06, "loss": 1.1875, "step": 6895 }, { "epoch": 2.1229952694127148, "grad_norm": 1.416298747062683, "learning_rate": 9.833741683194475e-06, "loss": 1.1701, "step": 6900 }, { "epoch": 2.1245336717818546, "grad_norm": 3.549651861190796, "learning_rate": 9.801742557634872e-06, "loss": 1.1895, "step": 6905 }, { "epoch": 2.1260720741509944, "grad_norm": 1.4298040866851807, "learning_rate": 9.76978288000586e-06, "loss": 1.2195, "step": 6910 }, { "epoch": 2.127610476520134, "grad_norm": 1.3986989259719849, "learning_rate": 9.73786273326059e-06, "loss": 1.2749, "step": 6915 }, { "epoch": 2.1291488788892736, "grad_norm": 1.6647675037384033, "learning_rate": 9.70598220024958e-06, "loss": 1.1431, "step": 6920 }, { "epoch": 2.130687281258413, "grad_norm": 1.683068037033081, "learning_rate": 9.674141363720554e-06, "loss": 1.1656, "step": 6925 }, { "epoch": 2.132225683627553, "grad_norm": 1.4037578105926514, "learning_rate": 9.642340306318203e-06, "loss": 1.1445, "step": 6930 }, { "epoch": 2.1337640859966926, "grad_norm": 3.2454349994659424, "learning_rate": 9.61057911058393e-06, "loss": 1.1231, "step": 6935 }, { "epoch": 2.135302488365832, "grad_norm": 1.0862977504730225, "learning_rate": 9.578857858955715e-06, "loss": 1.0971, "step": 6940 }, { "epoch": 2.136840890734972, "grad_norm": 1.2978265285491943, "learning_rate": 9.547176633767857e-06, "loss": 1.2674, "step": 6945 }, { "epoch": 2.1383792931041112, "grad_norm": 1.3811675310134888, "learning_rate": 9.515535517250737e-06, "loss": 1.1261, "step": 6950 }, { "epoch": 2.139917695473251, "grad_norm": 1.3552972078323364, "learning_rate": 9.483934591530668e-06, "loss": 0.9968, "step": 6955 }, { "epoch": 2.141456097842391, "grad_norm": 1.5487886667251587, "learning_rate": 9.452373938629619e-06, "loss": 1.1008, "step": 6960 }, { "epoch": 2.1429945002115303, "grad_norm": 1.354445219039917, "learning_rate": 9.420853640465025e-06, "loss": 1.11, "step": 6965 }, { "epoch": 2.14453290258067, "grad_norm": 1.501561164855957, "learning_rate": 9.389373778849612e-06, "loss": 1.2466, "step": 6970 }, { "epoch": 2.1460713049498095, "grad_norm": 1.2629624605178833, "learning_rate": 9.357934435491106e-06, "loss": 1.2212, "step": 6975 }, { "epoch": 2.1476097073189493, "grad_norm": 2.1113383769989014, "learning_rate": 9.3265356919921e-06, "loss": 1.1319, "step": 6980 }, { "epoch": 2.149148109688089, "grad_norm": 1.405587077140808, "learning_rate": 9.295177629849802e-06, "loss": 1.2019, "step": 6985 }, { "epoch": 2.1506865120572285, "grad_norm": 1.0090694427490234, "learning_rate": 9.26386033045582e-06, "loss": 1.2154, "step": 6990 }, { "epoch": 2.1522249144263683, "grad_norm": 1.7210766077041626, "learning_rate": 9.232583875095949e-06, "loss": 1.1205, "step": 6995 }, { "epoch": 2.1537633167955077, "grad_norm": 3.97908616065979, "learning_rate": 9.201348344950001e-06, "loss": 1.0795, "step": 7000 }, { "epoch": 2.1553017191646475, "grad_norm": 1.465118646621704, "learning_rate": 9.170153821091537e-06, "loss": 1.1504, "step": 7005 }, { "epoch": 2.1568401215337873, "grad_norm": 1.2877440452575684, "learning_rate": 9.13900038448771e-06, "loss": 1.2082, "step": 7010 }, { "epoch": 2.1583785239029267, "grad_norm": 1.5617296695709229, "learning_rate": 9.107888115999002e-06, "loss": 1.1485, "step": 7015 }, { "epoch": 2.1599169262720666, "grad_norm": 1.6196231842041016, "learning_rate": 9.07681709637905e-06, "loss": 1.1636, "step": 7020 }, { "epoch": 2.161455328641206, "grad_norm": 1.7071741819381714, "learning_rate": 9.045787406274437e-06, "loss": 1.2166, "step": 7025 }, { "epoch": 2.1629937310103458, "grad_norm": 1.6276894807815552, "learning_rate": 9.014799126224471e-06, "loss": 1.0885, "step": 7030 }, { "epoch": 2.1645321333794856, "grad_norm": 1.0166163444519043, "learning_rate": 8.983852336660959e-06, "loss": 1.203, "step": 7035 }, { "epoch": 2.166070535748625, "grad_norm": 1.7075228691101074, "learning_rate": 8.952947117908047e-06, "loss": 1.0762, "step": 7040 }, { "epoch": 2.167608938117765, "grad_norm": 1.1821060180664062, "learning_rate": 8.922083550181959e-06, "loss": 1.0562, "step": 7045 }, { "epoch": 2.169147340486904, "grad_norm": 1.4123872518539429, "learning_rate": 8.891261713590807e-06, "loss": 1.1853, "step": 7050 }, { "epoch": 2.170685742856044, "grad_norm": 1.1683062314987183, "learning_rate": 8.860481688134417e-06, "loss": 1.1117, "step": 7055 }, { "epoch": 2.172224145225184, "grad_norm": 2.6672306060791016, "learning_rate": 8.829743553704056e-06, "loss": 1.1283, "step": 7060 }, { "epoch": 2.173762547594323, "grad_norm": 1.9148149490356445, "learning_rate": 8.799047390082296e-06, "loss": 1.1936, "step": 7065 }, { "epoch": 2.175300949963463, "grad_norm": 1.1535136699676514, "learning_rate": 8.768393276942743e-06, "loss": 1.1856, "step": 7070 }, { "epoch": 2.1768393523326024, "grad_norm": 2.045712471008301, "learning_rate": 8.737781293849864e-06, "loss": 1.2375, "step": 7075 }, { "epoch": 2.178377754701742, "grad_norm": 1.2342534065246582, "learning_rate": 8.707211520258782e-06, "loss": 1.186, "step": 7080 }, { "epoch": 2.179916157070882, "grad_norm": 1.2086020708084106, "learning_rate": 8.676684035515076e-06, "loss": 1.1079, "step": 7085 }, { "epoch": 2.1814545594400214, "grad_norm": 1.6078280210494995, "learning_rate": 8.646198918854526e-06, "loss": 1.0061, "step": 7090 }, { "epoch": 2.1829929618091612, "grad_norm": 1.091389775276184, "learning_rate": 8.61575624940298e-06, "loss": 1.0278, "step": 7095 }, { "epoch": 2.1845313641783006, "grad_norm": 1.2044703960418701, "learning_rate": 8.585356106176094e-06, "loss": 1.1275, "step": 7100 }, { "epoch": 2.1860697665474405, "grad_norm": 1.288192629814148, "learning_rate": 8.55499856807913e-06, "loss": 1.0621, "step": 7105 }, { "epoch": 2.1876081689165803, "grad_norm": 1.0985409021377563, "learning_rate": 8.524683713906805e-06, "loss": 1.0334, "step": 7110 }, { "epoch": 2.1891465712857197, "grad_norm": 1.8316186666488647, "learning_rate": 8.49441162234301e-06, "loss": 1.1259, "step": 7115 }, { "epoch": 2.1906849736548595, "grad_norm": 1.233383059501648, "learning_rate": 8.464182371960668e-06, "loss": 1.213, "step": 7120 }, { "epoch": 2.192223376023999, "grad_norm": 1.8326971530914307, "learning_rate": 8.433996041221492e-06, "loss": 1.165, "step": 7125 }, { "epoch": 2.1937617783931387, "grad_norm": 1.156266450881958, "learning_rate": 8.403852708475792e-06, "loss": 1.1034, "step": 7130 }, { "epoch": 2.1953001807622785, "grad_norm": 1.1190317869186401, "learning_rate": 8.373752451962286e-06, "loss": 1.0357, "step": 7135 }, { "epoch": 2.196838583131418, "grad_norm": 1.4066323041915894, "learning_rate": 8.34369534980789e-06, "loss": 1.0494, "step": 7140 }, { "epoch": 2.1983769855005577, "grad_norm": 1.2038296461105347, "learning_rate": 8.31368148002748e-06, "loss": 1.1883, "step": 7145 }, { "epoch": 2.199915387869697, "grad_norm": 1.0969994068145752, "learning_rate": 8.283710920523763e-06, "loss": 1.0538, "step": 7150 }, { "epoch": 2.201453790238837, "grad_norm": 1.6629050970077515, "learning_rate": 8.253783749086993e-06, "loss": 1.1396, "step": 7155 }, { "epoch": 2.2029921926079767, "grad_norm": 1.3252410888671875, "learning_rate": 8.223900043394825e-06, "loss": 1.2732, "step": 7160 }, { "epoch": 2.204530594977116, "grad_norm": 1.4837875366210938, "learning_rate": 8.194059881012105e-06, "loss": 1.0282, "step": 7165 }, { "epoch": 2.206068997346256, "grad_norm": 1.2127585411071777, "learning_rate": 8.164263339390635e-06, "loss": 1.067, "step": 7170 }, { "epoch": 2.2076073997153958, "grad_norm": 1.0039737224578857, "learning_rate": 8.13451049586903e-06, "loss": 1.1353, "step": 7175 }, { "epoch": 2.209145802084535, "grad_norm": 2.2187910079956055, "learning_rate": 8.104801427672456e-06, "loss": 1.0704, "step": 7180 }, { "epoch": 2.210684204453675, "grad_norm": 1.5790579319000244, "learning_rate": 8.07513621191246e-06, "loss": 1.1664, "step": 7185 }, { "epoch": 2.2122226068228144, "grad_norm": 0.9343917369842529, "learning_rate": 8.045514925586784e-06, "loss": 1.2461, "step": 7190 }, { "epoch": 2.213761009191954, "grad_norm": 1.3252272605895996, "learning_rate": 8.015937645579148e-06, "loss": 1.0664, "step": 7195 }, { "epoch": 2.215299411561094, "grad_norm": 1.1168420314788818, "learning_rate": 7.986404448659023e-06, "loss": 1.145, "step": 7200 }, { "epoch": 2.2168378139302334, "grad_norm": 3.980158567428589, "learning_rate": 7.956915411481505e-06, "loss": 1.099, "step": 7205 }, { "epoch": 2.218376216299373, "grad_norm": 1.854601502418518, "learning_rate": 7.927470610587028e-06, "loss": 1.2413, "step": 7210 }, { "epoch": 2.2199146186685126, "grad_norm": 1.069262981414795, "learning_rate": 7.898070122401224e-06, "loss": 1.2822, "step": 7215 }, { "epoch": 2.2214530210376524, "grad_norm": 1.6758251190185547, "learning_rate": 7.868714023234727e-06, "loss": 1.0832, "step": 7220 }, { "epoch": 2.2229914234067922, "grad_norm": 1.4776010513305664, "learning_rate": 7.839402389282924e-06, "loss": 1.1282, "step": 7225 }, { "epoch": 2.2245298257759316, "grad_norm": 1.4931747913360596, "learning_rate": 7.810135296625818e-06, "loss": 1.2089, "step": 7230 }, { "epoch": 2.2260682281450714, "grad_norm": 1.7104672193527222, "learning_rate": 7.78091282122779e-06, "loss": 1.0814, "step": 7235 }, { "epoch": 2.227606630514211, "grad_norm": 1.155410647392273, "learning_rate": 7.751735038937405e-06, "loss": 1.1818, "step": 7240 }, { "epoch": 2.2291450328833506, "grad_norm": 1.6542633771896362, "learning_rate": 7.722602025487243e-06, "loss": 1.2211, "step": 7245 }, { "epoch": 2.2306834352524905, "grad_norm": 1.7440063953399658, "learning_rate": 7.693513856493684e-06, "loss": 1.0336, "step": 7250 }, { "epoch": 2.23222183762163, "grad_norm": 1.1944167613983154, "learning_rate": 7.6644706074567e-06, "loss": 1.1502, "step": 7255 }, { "epoch": 2.2337602399907697, "grad_norm": 1.1156004667282104, "learning_rate": 7.63547235375966e-06, "loss": 1.1147, "step": 7260 }, { "epoch": 2.235298642359909, "grad_norm": 1.6988039016723633, "learning_rate": 7.6065191706691795e-06, "loss": 1.1423, "step": 7265 }, { "epoch": 2.236837044729049, "grad_norm": 1.2307072877883911, "learning_rate": 7.577611133334858e-06, "loss": 1.2132, "step": 7270 }, { "epoch": 2.2383754470981887, "grad_norm": 2.0237016677856445, "learning_rate": 7.54874831678914e-06, "loss": 0.992, "step": 7275 }, { "epoch": 2.239913849467328, "grad_norm": 1.5955995321273804, "learning_rate": 7.519930795947072e-06, "loss": 1.1199, "step": 7280 }, { "epoch": 2.241452251836468, "grad_norm": 2.980849504470825, "learning_rate": 7.491158645606167e-06, "loss": 1.2796, "step": 7285 }, { "epoch": 2.2429906542056073, "grad_norm": 1.1380635499954224, "learning_rate": 7.462431940446135e-06, "loss": 1.1254, "step": 7290 }, { "epoch": 2.244529056574747, "grad_norm": 1.3550338745117188, "learning_rate": 7.433750755028773e-06, "loss": 1.12, "step": 7295 }, { "epoch": 2.246067458943887, "grad_norm": 1.9419124126434326, "learning_rate": 7.40511516379769e-06, "loss": 1.1136, "step": 7300 }, { "epoch": 2.2476058613130263, "grad_norm": 2.2289304733276367, "learning_rate": 7.376525241078189e-06, "loss": 1.0981, "step": 7305 }, { "epoch": 2.249144263682166, "grad_norm": 1.1924974918365479, "learning_rate": 7.347981061077011e-06, "loss": 0.9997, "step": 7310 }, { "epoch": 2.250682666051306, "grad_norm": 2.636819362640381, "learning_rate": 7.319482697882168e-06, "loss": 1.1327, "step": 7315 }, { "epoch": 2.2522210684204453, "grad_norm": 1.150176763534546, "learning_rate": 7.291030225462781e-06, "loss": 1.1047, "step": 7320 }, { "epoch": 2.253759470789585, "grad_norm": 2.1936254501342773, "learning_rate": 7.262623717668821e-06, "loss": 1.0588, "step": 7325 }, { "epoch": 2.2552978731587245, "grad_norm": 1.0446133613586426, "learning_rate": 7.2342632482309825e-06, "loss": 1.0701, "step": 7330 }, { "epoch": 2.2568362755278644, "grad_norm": 1.2231892347335815, "learning_rate": 7.205948890760464e-06, "loss": 1.134, "step": 7335 }, { "epoch": 2.258374677897004, "grad_norm": 1.346924066543579, "learning_rate": 7.177680718748767e-06, "loss": 1.0544, "step": 7340 }, { "epoch": 2.2599130802661436, "grad_norm": 1.9672859907150269, "learning_rate": 7.149458805567505e-06, "loss": 1.1593, "step": 7345 }, { "epoch": 2.2614514826352834, "grad_norm": 2.7677721977233887, "learning_rate": 7.1212832244682585e-06, "loss": 1.183, "step": 7350 }, { "epoch": 2.2629898850044228, "grad_norm": 1.1894586086273193, "learning_rate": 7.093154048582313e-06, "loss": 1.0514, "step": 7355 }, { "epoch": 2.2645282873735626, "grad_norm": 1.667694091796875, "learning_rate": 7.065071350920538e-06, "loss": 1.0868, "step": 7360 }, { "epoch": 2.2660666897427024, "grad_norm": 1.0725141763687134, "learning_rate": 7.037035204373147e-06, "loss": 1.154, "step": 7365 }, { "epoch": 2.267605092111842, "grad_norm": 1.3035621643066406, "learning_rate": 7.009045681709522e-06, "loss": 1.135, "step": 7370 }, { "epoch": 2.2691434944809816, "grad_norm": 1.461717963218689, "learning_rate": 6.981102855578062e-06, "loss": 1.0843, "step": 7375 }, { "epoch": 2.270681896850121, "grad_norm": 1.022071361541748, "learning_rate": 6.953206798505918e-06, "loss": 1.0135, "step": 7380 }, { "epoch": 2.272220299219261, "grad_norm": 1.2068042755126953, "learning_rate": 6.925357582898886e-06, "loss": 1.1527, "step": 7385 }, { "epoch": 2.2737587015884007, "grad_norm": 1.2736159563064575, "learning_rate": 6.8975552810411765e-06, "loss": 1.1415, "step": 7390 }, { "epoch": 2.27529710395754, "grad_norm": 1.3243377208709717, "learning_rate": 6.869799965095214e-06, "loss": 1.1465, "step": 7395 }, { "epoch": 2.27683550632668, "grad_norm": 1.1751413345336914, "learning_rate": 6.842091707101473e-06, "loss": 1.157, "step": 7400 }, { "epoch": 2.2783739086958192, "grad_norm": 1.1726213693618774, "learning_rate": 6.814430578978309e-06, "loss": 1.1401, "step": 7405 }, { "epoch": 2.279912311064959, "grad_norm": 1.083336353302002, "learning_rate": 6.786816652521719e-06, "loss": 1.1267, "step": 7410 }, { "epoch": 2.281450713434099, "grad_norm": 3.8451592922210693, "learning_rate": 6.759249999405212e-06, "loss": 1.1394, "step": 7415 }, { "epoch": 2.2829891158032383, "grad_norm": 1.711990237236023, "learning_rate": 6.73173069117958e-06, "loss": 1.1841, "step": 7420 }, { "epoch": 2.284527518172378, "grad_norm": 1.154262900352478, "learning_rate": 6.704258799272722e-06, "loss": 1.0895, "step": 7425 }, { "epoch": 2.2860659205415175, "grad_norm": 1.3660565614700317, "learning_rate": 6.676834394989495e-06, "loss": 1.1304, "step": 7430 }, { "epoch": 2.2876043229106573, "grad_norm": 1.9204599857330322, "learning_rate": 6.649457549511459e-06, "loss": 1.033, "step": 7435 }, { "epoch": 2.289142725279797, "grad_norm": 1.165299654006958, "learning_rate": 6.622128333896768e-06, "loss": 1.0643, "step": 7440 }, { "epoch": 2.2906811276489365, "grad_norm": 1.4679332971572876, "learning_rate": 6.594846819079939e-06, "loss": 1.0537, "step": 7445 }, { "epoch": 2.2922195300180763, "grad_norm": 1.5946282148361206, "learning_rate": 6.56761307587167e-06, "loss": 1.146, "step": 7450 }, { "epoch": 2.2937579323872157, "grad_norm": 1.4443016052246094, "learning_rate": 6.540427174958661e-06, "loss": 1.1959, "step": 7455 }, { "epoch": 2.2952963347563555, "grad_norm": 0.886721670627594, "learning_rate": 6.513289186903463e-06, "loss": 1.1575, "step": 7460 }, { "epoch": 2.2968347371254954, "grad_norm": 1.1904209852218628, "learning_rate": 6.486199182144229e-06, "loss": 1.0784, "step": 7465 }, { "epoch": 2.2983731394946347, "grad_norm": 1.3473881483078003, "learning_rate": 6.459157230994603e-06, "loss": 1.1037, "step": 7470 }, { "epoch": 2.2999115418637746, "grad_norm": 1.1764485836029053, "learning_rate": 6.432163403643482e-06, "loss": 1.0831, "step": 7475 }, { "epoch": 2.301449944232914, "grad_norm": 1.0809344053268433, "learning_rate": 6.405217770154853e-06, "loss": 1.2457, "step": 7480 }, { "epoch": 2.3029883466020538, "grad_norm": 1.5625346899032593, "learning_rate": 6.378320400467636e-06, "loss": 1.2338, "step": 7485 }, { "epoch": 2.3045267489711936, "grad_norm": 0.9581547379493713, "learning_rate": 6.3514713643954475e-06, "loss": 1.1475, "step": 7490 }, { "epoch": 2.306065151340333, "grad_norm": 2.769169569015503, "learning_rate": 6.324670731626478e-06, "loss": 1.2225, "step": 7495 }, { "epoch": 2.307603553709473, "grad_norm": 1.2585043907165527, "learning_rate": 6.297918571723288e-06, "loss": 1.0701, "step": 7500 }, { "epoch": 2.309141956078612, "grad_norm": 1.8813979625701904, "learning_rate": 6.271214954122581e-06, "loss": 1.1267, "step": 7505 }, { "epoch": 2.310680358447752, "grad_norm": 1.1062171459197998, "learning_rate": 6.244559948135109e-06, "loss": 1.0784, "step": 7510 }, { "epoch": 2.312218760816892, "grad_norm": 1.0726068019866943, "learning_rate": 6.217953622945449e-06, "loss": 1.2017, "step": 7515 }, { "epoch": 2.313757163186031, "grad_norm": 1.0642775297164917, "learning_rate": 6.191396047611794e-06, "loss": 1.1995, "step": 7520 }, { "epoch": 2.315295565555171, "grad_norm": 1.322033405303955, "learning_rate": 6.164887291065838e-06, "loss": 1.0586, "step": 7525 }, { "epoch": 2.3168339679243104, "grad_norm": 1.146404504776001, "learning_rate": 6.138427422112539e-06, "loss": 1.1833, "step": 7530 }, { "epoch": 2.3183723702934502, "grad_norm": 1.8734060525894165, "learning_rate": 6.1120165094299655e-06, "loss": 1.0803, "step": 7535 }, { "epoch": 2.31991077266259, "grad_norm": 1.196945309638977, "learning_rate": 6.085654621569137e-06, "loss": 1.168, "step": 7540 }, { "epoch": 2.3214491750317294, "grad_norm": 0.9148879647254944, "learning_rate": 6.0593418269538045e-06, "loss": 1.0812, "step": 7545 }, { "epoch": 2.3229875774008693, "grad_norm": 1.2411441802978516, "learning_rate": 6.0330781938803034e-06, "loss": 1.0235, "step": 7550 }, { "epoch": 2.3245259797700086, "grad_norm": 1.3500548601150513, "learning_rate": 6.006863790517392e-06, "loss": 1.0834, "step": 7555 }, { "epoch": 2.3260643821391485, "grad_norm": 1.3045114278793335, "learning_rate": 5.980698684905989e-06, "loss": 1.0431, "step": 7560 }, { "epoch": 2.3276027845082883, "grad_norm": 1.2521401643753052, "learning_rate": 5.954582944959111e-06, "loss": 1.1774, "step": 7565 }, { "epoch": 2.3291411868774277, "grad_norm": 1.590803861618042, "learning_rate": 5.928516638461639e-06, "loss": 1.2343, "step": 7570 }, { "epoch": 2.3306795892465675, "grad_norm": 1.66427481174469, "learning_rate": 5.902499833070119e-06, "loss": 1.0343, "step": 7575 }, { "epoch": 2.332217991615707, "grad_norm": 1.8136968612670898, "learning_rate": 5.876532596312645e-06, "loss": 1.2398, "step": 7580 }, { "epoch": 2.3337563939848467, "grad_norm": 1.165752649307251, "learning_rate": 5.850614995588627e-06, "loss": 0.9754, "step": 7585 }, { "epoch": 2.3352947963539865, "grad_norm": 2.509021043777466, "learning_rate": 5.824747098168651e-06, "loss": 1.0585, "step": 7590 }, { "epoch": 2.336833198723126, "grad_norm": 1.7842833995819092, "learning_rate": 5.798928971194301e-06, "loss": 1.0558, "step": 7595 }, { "epoch": 2.3383716010922657, "grad_norm": 1.0677943229675293, "learning_rate": 5.773160681677983e-06, "loss": 1.2275, "step": 7600 }, { "epoch": 2.339910003461405, "grad_norm": 1.1144322156906128, "learning_rate": 5.747442296502725e-06, "loss": 1.3907, "step": 7605 }, { "epoch": 2.341448405830545, "grad_norm": 2.3197555541992188, "learning_rate": 5.721773882422057e-06, "loss": 1.2227, "step": 7610 }, { "epoch": 2.3429868081996847, "grad_norm": 1.0514782667160034, "learning_rate": 5.69615550605978e-06, "loss": 1.2214, "step": 7615 }, { "epoch": 2.344525210568824, "grad_norm": 1.129257082939148, "learning_rate": 5.6705872339098186e-06, "loss": 1.1048, "step": 7620 }, { "epoch": 2.346063612937964, "grad_norm": 2.5535309314727783, "learning_rate": 5.645069132336078e-06, "loss": 1.0361, "step": 7625 }, { "epoch": 2.3476020153071033, "grad_norm": 1.601874828338623, "learning_rate": 5.6196012675722055e-06, "loss": 1.1273, "step": 7630 }, { "epoch": 2.349140417676243, "grad_norm": 1.2392939329147339, "learning_rate": 5.594183705721484e-06, "loss": 0.9944, "step": 7635 }, { "epoch": 2.350678820045383, "grad_norm": 1.375126600265503, "learning_rate": 5.568816512756633e-06, "loss": 1.1266, "step": 7640 }, { "epoch": 2.3522172224145224, "grad_norm": 1.151104211807251, "learning_rate": 5.5434997545196015e-06, "loss": 1.2082, "step": 7645 }, { "epoch": 2.353755624783662, "grad_norm": 1.6542187929153442, "learning_rate": 5.5182334967214725e-06, "loss": 1.1022, "step": 7650 }, { "epoch": 2.355294027152802, "grad_norm": 3.7500414848327637, "learning_rate": 5.493017804942238e-06, "loss": 1.2594, "step": 7655 }, { "epoch": 2.3568324295219414, "grad_norm": 0.9335038661956787, "learning_rate": 5.467852744630633e-06, "loss": 1.0596, "step": 7660 }, { "epoch": 2.358370831891081, "grad_norm": 1.5884743928909302, "learning_rate": 5.4427383811039985e-06, "loss": 1.1742, "step": 7665 }, { "epoch": 2.3599092342602206, "grad_norm": 1.1601238250732422, "learning_rate": 5.417674779548062e-06, "loss": 1.2038, "step": 7670 }, { "epoch": 2.3614476366293604, "grad_norm": 1.1803358793258667, "learning_rate": 5.39266200501681e-06, "loss": 1.1756, "step": 7675 }, { "epoch": 2.3629860389985002, "grad_norm": 1.501497507095337, "learning_rate": 5.367700122432315e-06, "loss": 1.1091, "step": 7680 }, { "epoch": 2.3645244413676396, "grad_norm": 1.0719640254974365, "learning_rate": 5.342789196584527e-06, "loss": 1.1683, "step": 7685 }, { "epoch": 2.3660628437367794, "grad_norm": 1.4727404117584229, "learning_rate": 5.317929292131163e-06, "loss": 1.1414, "step": 7690 }, { "epoch": 2.367601246105919, "grad_norm": 1.195420742034912, "learning_rate": 5.293120473597515e-06, "loss": 1.1998, "step": 7695 }, { "epoch": 2.3691396484750586, "grad_norm": 1.4516286849975586, "learning_rate": 5.268362805376237e-06, "loss": 1.1199, "step": 7700 }, { "epoch": 2.3706780508441985, "grad_norm": 1.0601075887680054, "learning_rate": 5.243656351727258e-06, "loss": 1.1662, "step": 7705 }, { "epoch": 2.372216453213338, "grad_norm": 1.7174220085144043, "learning_rate": 5.219001176777574e-06, "loss": 1.1581, "step": 7710 }, { "epoch": 2.3737548555824777, "grad_norm": 1.3260307312011719, "learning_rate": 5.194397344521065e-06, "loss": 1.0319, "step": 7715 }, { "epoch": 2.375293257951617, "grad_norm": 1.4584814310073853, "learning_rate": 5.16984491881837e-06, "loss": 1.1525, "step": 7720 }, { "epoch": 2.376831660320757, "grad_norm": 1.018962025642395, "learning_rate": 5.145343963396682e-06, "loss": 1.2017, "step": 7725 }, { "epoch": 2.3783700626898967, "grad_norm": 1.7327367067337036, "learning_rate": 5.120894541849599e-06, "loss": 1.1709, "step": 7730 }, { "epoch": 2.379908465059036, "grad_norm": 1.5583730936050415, "learning_rate": 5.096496717636984e-06, "loss": 1.2094, "step": 7735 }, { "epoch": 2.381446867428176, "grad_norm": 1.2519557476043701, "learning_rate": 5.072150554084745e-06, "loss": 1.0785, "step": 7740 }, { "epoch": 2.3829852697973157, "grad_norm": 1.1944867372512817, "learning_rate": 5.04785611438473e-06, "loss": 1.2777, "step": 7745 }, { "epoch": 2.384523672166455, "grad_norm": 1.240370512008667, "learning_rate": 5.023613461594512e-06, "loss": 1.2303, "step": 7750 }, { "epoch": 2.386062074535595, "grad_norm": 1.117368459701538, "learning_rate": 4.999422658637254e-06, "loss": 1.0655, "step": 7755 }, { "epoch": 2.3876004769047343, "grad_norm": 1.7489937543869019, "learning_rate": 4.9752837683015505e-06, "loss": 1.203, "step": 7760 }, { "epoch": 2.389138879273874, "grad_norm": 1.187821388244629, "learning_rate": 4.95119685324125e-06, "loss": 1.1294, "step": 7765 }, { "epoch": 2.390677281643014, "grad_norm": 1.0779712200164795, "learning_rate": 4.927161975975284e-06, "loss": 1.1303, "step": 7770 }, { "epoch": 2.3922156840121533, "grad_norm": 1.066462516784668, "learning_rate": 4.903179198887536e-06, "loss": 1.2634, "step": 7775 }, { "epoch": 2.393754086381293, "grad_norm": 1.1906163692474365, "learning_rate": 4.879248584226645e-06, "loss": 1.0123, "step": 7780 }, { "epoch": 2.3952924887504325, "grad_norm": 1.7448662519454956, "learning_rate": 4.85537019410586e-06, "loss": 1.165, "step": 7785 }, { "epoch": 2.3968308911195724, "grad_norm": 1.0964466333389282, "learning_rate": 4.831544090502896e-06, "loss": 1.1348, "step": 7790 }, { "epoch": 2.398369293488712, "grad_norm": 1.2719080448150635, "learning_rate": 4.807770335259726e-06, "loss": 1.0279, "step": 7795 }, { "epoch": 2.3999076958578516, "grad_norm": 1.6911888122558594, "learning_rate": 4.784048990082484e-06, "loss": 1.2083, "step": 7800 }, { "epoch": 2.4014460982269914, "grad_norm": 1.3245162963867188, "learning_rate": 4.760380116541246e-06, "loss": 1.0125, "step": 7805 }, { "epoch": 2.402984500596131, "grad_norm": 1.16487455368042, "learning_rate": 4.736763776069897e-06, "loss": 1.1643, "step": 7810 }, { "epoch": 2.4045229029652706, "grad_norm": 1.4798593521118164, "learning_rate": 4.713200029965978e-06, "loss": 1.1108, "step": 7815 }, { "epoch": 2.4060613053344104, "grad_norm": 1.7683199644088745, "learning_rate": 4.689688939390521e-06, "loss": 1.1573, "step": 7820 }, { "epoch": 2.40759970770355, "grad_norm": 1.2997316122055054, "learning_rate": 4.666230565367874e-06, "loss": 1.132, "step": 7825 }, { "epoch": 2.4091381100726896, "grad_norm": 1.1322225332260132, "learning_rate": 4.642824968785572e-06, "loss": 1.1534, "step": 7830 }, { "epoch": 2.410676512441829, "grad_norm": 1.4464402198791504, "learning_rate": 4.619472210394154e-06, "loss": 1.1766, "step": 7835 }, { "epoch": 2.412214914810969, "grad_norm": 2.9878416061401367, "learning_rate": 4.596172350807004e-06, "loss": 1.1984, "step": 7840 }, { "epoch": 2.4137533171801087, "grad_norm": 1.0388379096984863, "learning_rate": 4.572925450500232e-06, "loss": 1.108, "step": 7845 }, { "epoch": 2.415291719549248, "grad_norm": 1.252841591835022, "learning_rate": 4.549731569812457e-06, "loss": 1.1951, "step": 7850 }, { "epoch": 2.416830121918388, "grad_norm": 1.5610461235046387, "learning_rate": 4.526590768944713e-06, "loss": 1.0645, "step": 7855 }, { "epoch": 2.4183685242875272, "grad_norm": 4.270869255065918, "learning_rate": 4.5035031079602445e-06, "loss": 1.1204, "step": 7860 }, { "epoch": 2.419906926656667, "grad_norm": 1.0736466646194458, "learning_rate": 4.480468646784364e-06, "loss": 1.0063, "step": 7865 }, { "epoch": 2.421445329025807, "grad_norm": 1.0907769203186035, "learning_rate": 4.457487445204311e-06, "loss": 1.1522, "step": 7870 }, { "epoch": 2.4229837313949463, "grad_norm": 2.1617796421051025, "learning_rate": 4.434559562869098e-06, "loss": 1.2111, "step": 7875 }, { "epoch": 2.424522133764086, "grad_norm": 1.529151201248169, "learning_rate": 4.411685059289314e-06, "loss": 1.1406, "step": 7880 }, { "epoch": 2.4260605361332255, "grad_norm": 2.3434321880340576, "learning_rate": 4.388863993837031e-06, "loss": 1.1284, "step": 7885 }, { "epoch": 2.4275989385023653, "grad_norm": 1.4387180805206299, "learning_rate": 4.366096425745597e-06, "loss": 1.2728, "step": 7890 }, { "epoch": 2.429137340871505, "grad_norm": 1.0508075952529907, "learning_rate": 4.343382414109512e-06, "loss": 1.2069, "step": 7895 }, { "epoch": 2.4306757432406445, "grad_norm": 1.2803845405578613, "learning_rate": 4.320722017884274e-06, "loss": 1.1109, "step": 7900 }, { "epoch": 2.4322141456097843, "grad_norm": 1.1172047853469849, "learning_rate": 4.2981152958862155e-06, "loss": 1.1645, "step": 7905 }, { "epoch": 2.4337525479789237, "grad_norm": 1.058237075805664, "learning_rate": 4.275562306792352e-06, "loss": 1.1356, "step": 7910 }, { "epoch": 2.4352909503480635, "grad_norm": 1.0330730676651, "learning_rate": 4.253063109140224e-06, "loss": 1.1087, "step": 7915 }, { "epoch": 2.4368293527172034, "grad_norm": 1.2216901779174805, "learning_rate": 4.2306177613277765e-06, "loss": 1.0055, "step": 7920 }, { "epoch": 2.4383677550863427, "grad_norm": 1.5149664878845215, "learning_rate": 4.208226321613154e-06, "loss": 1.1642, "step": 7925 }, { "epoch": 2.4399061574554826, "grad_norm": 1.3599375486373901, "learning_rate": 4.185888848114614e-06, "loss": 0.9859, "step": 7930 }, { "epoch": 2.441444559824622, "grad_norm": 1.1671425104141235, "learning_rate": 4.163605398810305e-06, "loss": 1.1014, "step": 7935 }, { "epoch": 2.4429829621937618, "grad_norm": 1.6030899286270142, "learning_rate": 4.141376031538186e-06, "loss": 1.1734, "step": 7940 }, { "epoch": 2.4445213645629016, "grad_norm": 1.209281086921692, "learning_rate": 4.1192008039958235e-06, "loss": 1.0406, "step": 7945 }, { "epoch": 2.446059766932041, "grad_norm": 2.271026372909546, "learning_rate": 4.097079773740256e-06, "loss": 1.1752, "step": 7950 }, { "epoch": 2.447598169301181, "grad_norm": 0.9315564632415771, "learning_rate": 4.075012998187866e-06, "loss": 1.1408, "step": 7955 }, { "epoch": 2.44913657167032, "grad_norm": 1.7139698266983032, "learning_rate": 4.053000534614218e-06, "loss": 1.1057, "step": 7960 }, { "epoch": 2.45067497403946, "grad_norm": 1.6565955877304077, "learning_rate": 4.03104244015389e-06, "loss": 1.1457, "step": 7965 }, { "epoch": 2.4522133764086, "grad_norm": 1.3629543781280518, "learning_rate": 4.0091387718003415e-06, "loss": 1.2844, "step": 7970 }, { "epoch": 2.453751778777739, "grad_norm": 2.2979674339294434, "learning_rate": 3.987289586405785e-06, "loss": 1.1268, "step": 7975 }, { "epoch": 2.455290181146879, "grad_norm": 1.8692574501037598, "learning_rate": 3.9654949406809995e-06, "loss": 1.1367, "step": 7980 }, { "epoch": 2.4568285835160184, "grad_norm": 1.706198811531067, "learning_rate": 3.94375489119522e-06, "loss": 1.1304, "step": 7985 }, { "epoch": 2.4583669858851582, "grad_norm": 1.2476152181625366, "learning_rate": 3.922069494375963e-06, "loss": 1.1083, "step": 7990 }, { "epoch": 2.459905388254298, "grad_norm": 1.4470798969268799, "learning_rate": 3.900438806508885e-06, "loss": 1.1529, "step": 7995 }, { "epoch": 2.4614437906234374, "grad_norm": 1.186686635017395, "learning_rate": 3.878862883737666e-06, "loss": 1.1808, "step": 8000 }, { "epoch": 2.4629821929925773, "grad_norm": 0.927186906337738, "learning_rate": 3.857341782063812e-06, "loss": 1.1583, "step": 8005 }, { "epoch": 2.4645205953617166, "grad_norm": 1.1961028575897217, "learning_rate": 3.835875557346552e-06, "loss": 1.0924, "step": 8010 }, { "epoch": 2.4660589977308565, "grad_norm": 1.1408125162124634, "learning_rate": 3.814464265302692e-06, "loss": 1.1786, "step": 8015 }, { "epoch": 2.4675974000999963, "grad_norm": 1.9700957536697388, "learning_rate": 3.7931079615064284e-06, "loss": 1.0253, "step": 8020 }, { "epoch": 2.4691358024691357, "grad_norm": 1.8638861179351807, "learning_rate": 3.7718067013892465e-06, "loss": 1.2051, "step": 8025 }, { "epoch": 2.4706742048382755, "grad_norm": 1.2263269424438477, "learning_rate": 3.7505605402397753e-06, "loss": 1.1933, "step": 8030 }, { "epoch": 2.472212607207415, "grad_norm": 1.856704831123352, "learning_rate": 3.7293695332036027e-06, "loss": 1.0577, "step": 8035 }, { "epoch": 2.4737510095765547, "grad_norm": 1.062137246131897, "learning_rate": 3.7082337352831923e-06, "loss": 1.1723, "step": 8040 }, { "epoch": 2.4752894119456945, "grad_norm": 2.7450308799743652, "learning_rate": 3.6871532013376896e-06, "loss": 1.3035, "step": 8045 }, { "epoch": 2.476827814314834, "grad_norm": 1.344294786453247, "learning_rate": 3.666127986082796e-06, "loss": 1.2075, "step": 8050 }, { "epoch": 2.4783662166839737, "grad_norm": 1.329000473022461, "learning_rate": 3.645158144090649e-06, "loss": 1.0978, "step": 8055 }, { "epoch": 2.479904619053113, "grad_norm": 1.112028956413269, "learning_rate": 3.624243729789642e-06, "loss": 1.1551, "step": 8060 }, { "epoch": 2.481443021422253, "grad_norm": 0.9248641133308411, "learning_rate": 3.603384797464318e-06, "loss": 1.0623, "step": 8065 }, { "epoch": 2.4829814237913927, "grad_norm": 1.3773822784423828, "learning_rate": 3.582581401255211e-06, "loss": 1.146, "step": 8070 }, { "epoch": 2.484519826160532, "grad_norm": 0.9391597509384155, "learning_rate": 3.561833595158698e-06, "loss": 1.1969, "step": 8075 }, { "epoch": 2.486058228529672, "grad_norm": 1.0027233362197876, "learning_rate": 3.5411414330268676e-06, "loss": 0.9617, "step": 8080 }, { "epoch": 2.487596630898812, "grad_norm": 1.8053239583969116, "learning_rate": 3.5205049685674035e-06, "loss": 1.0424, "step": 8085 }, { "epoch": 2.489135033267951, "grad_norm": 1.6183481216430664, "learning_rate": 3.4999242553433954e-06, "loss": 1.0828, "step": 8090 }, { "epoch": 2.490673435637091, "grad_norm": 1.1142714023590088, "learning_rate": 3.4793993467732518e-06, "loss": 1.2526, "step": 8095 }, { "epoch": 2.4922118380062304, "grad_norm": 2.0687549114227295, "learning_rate": 3.458930296130519e-06, "loss": 1.0448, "step": 8100 }, { "epoch": 2.49375024037537, "grad_norm": 1.1878471374511719, "learning_rate": 3.4385171565437606e-06, "loss": 1.1272, "step": 8105 }, { "epoch": 2.49528864274451, "grad_norm": 1.824986219406128, "learning_rate": 3.418159980996441e-06, "loss": 1.0827, "step": 8110 }, { "epoch": 2.4968270451136494, "grad_norm": 1.2641549110412598, "learning_rate": 3.3978588223267383e-06, "loss": 1.0358, "step": 8115 }, { "epoch": 2.498365447482789, "grad_norm": 1.385815143585205, "learning_rate": 3.3776137332274553e-06, "loss": 1.1586, "step": 8120 }, { "epoch": 2.4999038498519286, "grad_norm": 1.2434886693954468, "learning_rate": 3.3574247662458645e-06, "loss": 1.0912, "step": 8125 }, { "epoch": 2.5014422522210684, "grad_norm": 1.060655951499939, "learning_rate": 3.3372919737835574e-06, "loss": 1.2092, "step": 8130 }, { "epoch": 2.502980654590208, "grad_norm": 1.700639247894287, "learning_rate": 3.317215408096322e-06, "loss": 1.0629, "step": 8135 }, { "epoch": 2.5045190569593476, "grad_norm": 1.9962120056152344, "learning_rate": 3.297195121294022e-06, "loss": 1.1027, "step": 8140 }, { "epoch": 2.5060574593284874, "grad_norm": 0.9801268577575684, "learning_rate": 3.2772311653404276e-06, "loss": 1.2366, "step": 8145 }, { "epoch": 2.5075958616976273, "grad_norm": 1.4176025390625, "learning_rate": 3.257323592053116e-06, "loss": 1.1483, "step": 8150 }, { "epoch": 2.5091342640667667, "grad_norm": 0.9962260127067566, "learning_rate": 3.2374724531033044e-06, "loss": 1.0985, "step": 8155 }, { "epoch": 2.5106726664359065, "grad_norm": 1.4499822854995728, "learning_rate": 3.2176778000157367e-06, "loss": 1.0903, "step": 8160 }, { "epoch": 2.512211068805046, "grad_norm": 2.288475751876831, "learning_rate": 3.1979396841685577e-06, "loss": 1.1369, "step": 8165 }, { "epoch": 2.5137494711741857, "grad_norm": 1.5029224157333374, "learning_rate": 3.17825815679314e-06, "loss": 1.2038, "step": 8170 }, { "epoch": 2.5152878735433255, "grad_norm": 1.0565627813339233, "learning_rate": 3.1586332689740037e-06, "loss": 1.1077, "step": 8175 }, { "epoch": 2.516826275912465, "grad_norm": 2.1373684406280518, "learning_rate": 3.1390650716486474e-06, "loss": 1.0097, "step": 8180 }, { "epoch": 2.5183646782816047, "grad_norm": 1.037817358970642, "learning_rate": 3.119553615607426e-06, "loss": 1.0125, "step": 8185 }, { "epoch": 2.519903080650744, "grad_norm": 1.1087840795516968, "learning_rate": 3.1000989514934105e-06, "loss": 1.0558, "step": 8190 }, { "epoch": 2.521441483019884, "grad_norm": 1.8037227392196655, "learning_rate": 3.0807011298022852e-06, "loss": 1.1562, "step": 8195 }, { "epoch": 2.5229798853890237, "grad_norm": 1.823003888130188, "learning_rate": 3.061360200882174e-06, "loss": 1.1018, "step": 8200 }, { "epoch": 2.524518287758163, "grad_norm": 1.0306799411773682, "learning_rate": 3.0420762149335565e-06, "loss": 1.0944, "step": 8205 }, { "epoch": 2.526056690127303, "grad_norm": 1.979805827140808, "learning_rate": 3.022849222009097e-06, "loss": 1.1025, "step": 8210 }, { "epoch": 2.5275950924964423, "grad_norm": 1.0842509269714355, "learning_rate": 3.0036792720135266e-06, "loss": 1.1262, "step": 8215 }, { "epoch": 2.529133494865582, "grad_norm": 1.5210621356964111, "learning_rate": 2.9845664147035326e-06, "loss": 1.1311, "step": 8220 }, { "epoch": 2.530671897234722, "grad_norm": 1.3788422346115112, "learning_rate": 2.965510699687615e-06, "loss": 1.1261, "step": 8225 }, { "epoch": 2.5322102996038613, "grad_norm": 1.1447571516036987, "learning_rate": 2.9465121764259447e-06, "loss": 1.2294, "step": 8230 }, { "epoch": 2.533748701973001, "grad_norm": 1.024839162826538, "learning_rate": 2.927570894230261e-06, "loss": 1.1547, "step": 8235 }, { "epoch": 2.5352871043421406, "grad_norm": 1.3132535219192505, "learning_rate": 2.908686902263724e-06, "loss": 1.0528, "step": 8240 }, { "epoch": 2.5368255067112804, "grad_norm": 1.7579002380371094, "learning_rate": 2.889860249540788e-06, "loss": 1.2613, "step": 8245 }, { "epoch": 2.53836390908042, "grad_norm": 2.2410359382629395, "learning_rate": 2.8710909849270994e-06, "loss": 1.0307, "step": 8250 }, { "epoch": 2.5399023114495596, "grad_norm": 2.274693489074707, "learning_rate": 2.852379157139329e-06, "loss": 1.217, "step": 8255 }, { "epoch": 2.5414407138186994, "grad_norm": 1.411793828010559, "learning_rate": 2.8337248147450757e-06, "loss": 1.0502, "step": 8260 }, { "epoch": 2.542979116187839, "grad_norm": 1.5521130561828613, "learning_rate": 2.815128006162751e-06, "loss": 1.3229, "step": 8265 }, { "epoch": 2.5445175185569786, "grad_norm": 0.9623503684997559, "learning_rate": 2.7965887796613884e-06, "loss": 1.1239, "step": 8270 }, { "epoch": 2.5460559209261184, "grad_norm": 2.273772716522217, "learning_rate": 2.7781071833606065e-06, "loss": 1.063, "step": 8275 }, { "epoch": 2.547594323295258, "grad_norm": 2.1022188663482666, "learning_rate": 2.7596832652304283e-06, "loss": 1.1397, "step": 8280 }, { "epoch": 2.5491327256643976, "grad_norm": 1.1091395616531372, "learning_rate": 2.7413170730911597e-06, "loss": 1.1236, "step": 8285 }, { "epoch": 2.550671128033537, "grad_norm": 2.22455096244812, "learning_rate": 2.7230086546132907e-06, "loss": 1.0664, "step": 8290 }, { "epoch": 2.552209530402677, "grad_norm": 1.0935211181640625, "learning_rate": 2.70475805731735e-06, "loss": 1.1093, "step": 8295 }, { "epoch": 2.5537479327718167, "grad_norm": 1.054491639137268, "learning_rate": 2.6865653285737757e-06, "loss": 1.1725, "step": 8300 }, { "epoch": 2.555286335140956, "grad_norm": 1.5011531114578247, "learning_rate": 2.668430515602832e-06, "loss": 1.0883, "step": 8305 }, { "epoch": 2.556824737510096, "grad_norm": 1.2729263305664062, "learning_rate": 2.6503536654744338e-06, "loss": 1.1141, "step": 8310 }, { "epoch": 2.5583631398792352, "grad_norm": 1.7168303728103638, "learning_rate": 2.6323348251080626e-06, "loss": 1.1521, "step": 8315 }, { "epoch": 2.559901542248375, "grad_norm": 1.7469420433044434, "learning_rate": 2.6143740412726435e-06, "loss": 1.0427, "step": 8320 }, { "epoch": 2.561439944617515, "grad_norm": 1.154000997543335, "learning_rate": 2.596471360586378e-06, "loss": 1.1774, "step": 8325 }, { "epoch": 2.5629783469866543, "grad_norm": 1.1220345497131348, "learning_rate": 2.5786268295166892e-06, "loss": 1.1262, "step": 8330 }, { "epoch": 2.564516749355794, "grad_norm": 1.7687299251556396, "learning_rate": 2.5608404943800622e-06, "loss": 1.2676, "step": 8335 }, { "epoch": 2.5660551517249335, "grad_norm": 1.3014109134674072, "learning_rate": 2.5431124013419237e-06, "loss": 1.0543, "step": 8340 }, { "epoch": 2.5675935540940733, "grad_norm": 3.2140731811523438, "learning_rate": 2.525442596416541e-06, "loss": 1.1211, "step": 8345 }, { "epoch": 2.569131956463213, "grad_norm": 1.5560851097106934, "learning_rate": 2.5078311254668834e-06, "loss": 1.1792, "step": 8350 }, { "epoch": 2.5706703588323525, "grad_norm": 1.287881851196289, "learning_rate": 2.490278034204502e-06, "loss": 1.1749, "step": 8355 }, { "epoch": 2.5722087612014923, "grad_norm": 1.4491333961486816, "learning_rate": 2.4727833681894437e-06, "loss": 1.079, "step": 8360 }, { "epoch": 2.5737471635706317, "grad_norm": 1.628779411315918, "learning_rate": 2.4553471728300885e-06, "loss": 1.1926, "step": 8365 }, { "epoch": 2.5752855659397715, "grad_norm": 1.1830757856369019, "learning_rate": 2.4379694933830634e-06, "loss": 1.1329, "step": 8370 }, { "epoch": 2.5768239683089114, "grad_norm": 1.2473928928375244, "learning_rate": 2.4206503749531236e-06, "loss": 1.0945, "step": 8375 }, { "epoch": 2.5783623706780507, "grad_norm": 0.9560319781303406, "learning_rate": 2.4033898624929884e-06, "loss": 1.2291, "step": 8380 }, { "epoch": 2.5799007730471906, "grad_norm": 1.598518967628479, "learning_rate": 2.386188000803302e-06, "loss": 1.113, "step": 8385 }, { "epoch": 2.58143917541633, "grad_norm": 3.0550365447998047, "learning_rate": 2.3690448345324634e-06, "loss": 1.2072, "step": 8390 }, { "epoch": 2.5829775777854698, "grad_norm": 2.201508045196533, "learning_rate": 2.351960408176518e-06, "loss": 1.092, "step": 8395 }, { "epoch": 2.5845159801546096, "grad_norm": 1.3211005926132202, "learning_rate": 2.3349347660790582e-06, "loss": 1.2229, "step": 8400 }, { "epoch": 2.586054382523749, "grad_norm": 1.2909151315689087, "learning_rate": 2.317967952431094e-06, "loss": 1.0366, "step": 8405 }, { "epoch": 2.587592784892889, "grad_norm": 1.7140899896621704, "learning_rate": 2.3010600112709364e-06, "loss": 1.1082, "step": 8410 }, { "epoch": 2.589131187262028, "grad_norm": 1.139414668083191, "learning_rate": 2.2842109864841034e-06, "loss": 1.1882, "step": 8415 }, { "epoch": 2.590669589631168, "grad_norm": 1.4526281356811523, "learning_rate": 2.2674209218031787e-06, "loss": 1.1233, "step": 8420 }, { "epoch": 2.592207992000308, "grad_norm": 1.918116569519043, "learning_rate": 2.25068986080772e-06, "loss": 1.2028, "step": 8425 }, { "epoch": 2.593746394369447, "grad_norm": 1.6036014556884766, "learning_rate": 2.2340178469241467e-06, "loss": 1.2098, "step": 8430 }, { "epoch": 2.595284796738587, "grad_norm": 1.7327996492385864, "learning_rate": 2.2174049234255895e-06, "loss": 1.095, "step": 8435 }, { "epoch": 2.5968231991077264, "grad_norm": 2.0552759170532227, "learning_rate": 2.2008511334318306e-06, "loss": 1.2627, "step": 8440 }, { "epoch": 2.5983616014768662, "grad_norm": 1.393501877784729, "learning_rate": 2.184356519909167e-06, "loss": 1.0631, "step": 8445 }, { "epoch": 2.599900003846006, "grad_norm": 3.89595890045166, "learning_rate": 2.1679211256702884e-06, "loss": 1.0974, "step": 8450 }, { "epoch": 2.6014384062151454, "grad_norm": 1.317922592163086, "learning_rate": 2.1515449933741854e-06, "loss": 1.1896, "step": 8455 }, { "epoch": 2.6029768085842853, "grad_norm": 1.5205051898956299, "learning_rate": 2.135228165526032e-06, "loss": 1.2268, "step": 8460 }, { "epoch": 2.6045152109534246, "grad_norm": 1.2494088411331177, "learning_rate": 2.118970684477062e-06, "loss": 1.1769, "step": 8465 }, { "epoch": 2.6060536133225645, "grad_norm": 1.1973810195922852, "learning_rate": 2.1027725924244903e-06, "loss": 1.1078, "step": 8470 }, { "epoch": 2.6075920156917043, "grad_norm": 1.2799108028411865, "learning_rate": 2.0866339314113662e-06, "loss": 1.2161, "step": 8475 }, { "epoch": 2.6091304180608437, "grad_norm": 1.1334717273712158, "learning_rate": 2.0705547433264943e-06, "loss": 1.1708, "step": 8480 }, { "epoch": 2.6106688204299835, "grad_norm": 1.3745144605636597, "learning_rate": 2.0545350699043174e-06, "loss": 1.1895, "step": 8485 }, { "epoch": 2.612207222799123, "grad_norm": 1.3669915199279785, "learning_rate": 2.0385749527247837e-06, "loss": 1.2637, "step": 8490 }, { "epoch": 2.6137456251682627, "grad_norm": 1.3522690534591675, "learning_rate": 2.0226744332132812e-06, "loss": 1.1513, "step": 8495 }, { "epoch": 2.6152840275374025, "grad_norm": 1.3111488819122314, "learning_rate": 2.0068335526405023e-06, "loss": 1.0883, "step": 8500 }, { "epoch": 2.616822429906542, "grad_norm": 1.5023269653320312, "learning_rate": 1.9910523521223355e-06, "loss": 1.0835, "step": 8505 }, { "epoch": 2.6183608322756817, "grad_norm": 1.2174749374389648, "learning_rate": 1.975330872619782e-06, "loss": 1.2002, "step": 8510 }, { "epoch": 2.619899234644821, "grad_norm": 1.0910028219223022, "learning_rate": 1.95966915493882e-06, "loss": 1.2196, "step": 8515 }, { "epoch": 2.621437637013961, "grad_norm": 1.2541165351867676, "learning_rate": 1.9440672397303127e-06, "loss": 1.0791, "step": 8520 }, { "epoch": 2.6229760393831008, "grad_norm": 1.4798517227172852, "learning_rate": 1.928525167489914e-06, "loss": 1.2197, "step": 8525 }, { "epoch": 2.62451444175224, "grad_norm": 2.8591833114624023, "learning_rate": 1.913042978557944e-06, "loss": 1.0815, "step": 8530 }, { "epoch": 2.62605284412138, "grad_norm": 1.3371671438217163, "learning_rate": 1.8976207131192914e-06, "loss": 1.1042, "step": 8535 }, { "epoch": 2.6275912464905193, "grad_norm": 1.2439570426940918, "learning_rate": 1.8822584112033082e-06, "loss": 1.0295, "step": 8540 }, { "epoch": 2.629129648859659, "grad_norm": 1.8023666143417358, "learning_rate": 1.8669561126837236e-06, "loss": 1.1148, "step": 8545 }, { "epoch": 2.630668051228799, "grad_norm": 1.4807002544403076, "learning_rate": 1.8517138572784976e-06, "loss": 1.1129, "step": 8550 }, { "epoch": 2.632206453597939, "grad_norm": 1.9293781518936157, "learning_rate": 1.836531684549772e-06, "loss": 1.1038, "step": 8555 }, { "epoch": 2.633744855967078, "grad_norm": 0.8880111575126648, "learning_rate": 1.821409633903723e-06, "loss": 1.2579, "step": 8560 }, { "epoch": 2.6352832583362176, "grad_norm": 1.4220179319381714, "learning_rate": 1.8063477445904835e-06, "loss": 1.2608, "step": 8565 }, { "epoch": 2.6368216607053574, "grad_norm": 1.7923526763916016, "learning_rate": 1.7913460557040351e-06, "loss": 1.212, "step": 8570 }, { "epoch": 2.638360063074497, "grad_norm": 1.4313488006591797, "learning_rate": 1.776404606182097e-06, "loss": 1.1288, "step": 8575 }, { "epoch": 2.639898465443637, "grad_norm": 1.1481549739837646, "learning_rate": 1.7615234348060449e-06, "loss": 1.1998, "step": 8580 }, { "epoch": 2.6414368678127764, "grad_norm": 2.2744739055633545, "learning_rate": 1.7467025802007987e-06, "loss": 1.1055, "step": 8585 }, { "epoch": 2.642975270181916, "grad_norm": 1.0382848978042603, "learning_rate": 1.7319420808347142e-06, "loss": 1.1111, "step": 8590 }, { "epoch": 2.6445136725510556, "grad_norm": 1.4756346940994263, "learning_rate": 1.717241975019493e-06, "loss": 1.1661, "step": 8595 }, { "epoch": 2.6460520749201955, "grad_norm": 0.9779161214828491, "learning_rate": 1.7026023009100944e-06, "loss": 0.9952, "step": 8600 }, { "epoch": 2.6475904772893353, "grad_norm": 1.9879071712493896, "learning_rate": 1.688023096504604e-06, "loss": 1.2905, "step": 8605 }, { "epoch": 2.6491288796584747, "grad_norm": 1.308909296989441, "learning_rate": 1.673504399644174e-06, "loss": 1.0405, "step": 8610 }, { "epoch": 2.6506672820276145, "grad_norm": 1.6702406406402588, "learning_rate": 1.6590462480128882e-06, "loss": 1.101, "step": 8615 }, { "epoch": 2.652205684396754, "grad_norm": 1.1122729778289795, "learning_rate": 1.644648679137703e-06, "loss": 1.0497, "step": 8620 }, { "epoch": 2.6537440867658937, "grad_norm": 1.0254093408584595, "learning_rate": 1.630311730388312e-06, "loss": 1.2324, "step": 8625 }, { "epoch": 2.6552824891350335, "grad_norm": 2.15950608253479, "learning_rate": 1.6160354389770649e-06, "loss": 1.1059, "step": 8630 }, { "epoch": 2.656820891504173, "grad_norm": 1.1212074756622314, "learning_rate": 1.6018198419588793e-06, "loss": 1.1365, "step": 8635 }, { "epoch": 2.6583592938733127, "grad_norm": 0.9412396550178528, "learning_rate": 1.5876649762311458e-06, "loss": 1.1834, "step": 8640 }, { "epoch": 2.659897696242452, "grad_norm": 1.0666898488998413, "learning_rate": 1.5735708785336033e-06, "loss": 1.1222, "step": 8645 }, { "epoch": 2.661436098611592, "grad_norm": 0.9774393439292908, "learning_rate": 1.559537585448273e-06, "loss": 1.0845, "step": 8650 }, { "epoch": 2.6629745009807317, "grad_norm": 2.3956403732299805, "learning_rate": 1.5455651333993626e-06, "loss": 1.1634, "step": 8655 }, { "epoch": 2.664512903349871, "grad_norm": 1.4635967016220093, "learning_rate": 1.5316535586531483e-06, "loss": 1.2237, "step": 8660 }, { "epoch": 2.666051305719011, "grad_norm": 2.2025790214538574, "learning_rate": 1.5178028973179104e-06, "loss": 1.1956, "step": 8665 }, { "epoch": 2.6675897080881503, "grad_norm": 1.1397591829299927, "learning_rate": 1.504013185343811e-06, "loss": 1.183, "step": 8670 }, { "epoch": 2.66912811045729, "grad_norm": 1.7163399457931519, "learning_rate": 1.4902844585228282e-06, "loss": 1.0752, "step": 8675 }, { "epoch": 2.67066651282643, "grad_norm": 1.7945340871810913, "learning_rate": 1.476616752488641e-06, "loss": 1.1313, "step": 8680 }, { "epoch": 2.6722049151955694, "grad_norm": 1.072609543800354, "learning_rate": 1.4630101027165444e-06, "loss": 0.9894, "step": 8685 }, { "epoch": 2.673743317564709, "grad_norm": 1.3113356828689575, "learning_rate": 1.4494645445233658e-06, "loss": 1.1927, "step": 8690 }, { "epoch": 2.6752817199338486, "grad_norm": 2.284916639328003, "learning_rate": 1.4359801130673616e-06, "loss": 1.206, "step": 8695 }, { "epoch": 2.6768201223029884, "grad_norm": 1.0177814960479736, "learning_rate": 1.4225568433481329e-06, "loss": 1.1056, "step": 8700 }, { "epoch": 2.678358524672128, "grad_norm": 1.0384098291397095, "learning_rate": 1.4091947702065262e-06, "loss": 1.1469, "step": 8705 }, { "epoch": 2.6798969270412676, "grad_norm": 2.263399362564087, "learning_rate": 1.3958939283245543e-06, "loss": 1.038, "step": 8710 }, { "epoch": 2.6814353294104074, "grad_norm": 2.890347480773926, "learning_rate": 1.382654352225296e-06, "loss": 1.1278, "step": 8715 }, { "epoch": 2.682973731779547, "grad_norm": 1.9222828149795532, "learning_rate": 1.3694760762728215e-06, "loss": 1.1225, "step": 8720 }, { "epoch": 2.6845121341486866, "grad_norm": 1.1517970561981201, "learning_rate": 1.3563591346720804e-06, "loss": 1.1967, "step": 8725 }, { "epoch": 2.6860505365178264, "grad_norm": 1.7945805788040161, "learning_rate": 1.3433035614688338e-06, "loss": 1.1659, "step": 8730 }, { "epoch": 2.687588938886966, "grad_norm": 0.9292943477630615, "learning_rate": 1.3303093905495528e-06, "loss": 1.2293, "step": 8735 }, { "epoch": 2.6891273412561056, "grad_norm": 1.3209861516952515, "learning_rate": 1.3173766556413393e-06, "loss": 1.0543, "step": 8740 }, { "epoch": 2.690665743625245, "grad_norm": 1.1976464986801147, "learning_rate": 1.3045053903118303e-06, "loss": 1.0793, "step": 8745 }, { "epoch": 2.692204145994385, "grad_norm": 1.2206952571868896, "learning_rate": 1.2916956279691223e-06, "loss": 1.1523, "step": 8750 }, { "epoch": 2.6937425483635247, "grad_norm": 1.2255494594573975, "learning_rate": 1.2789474018616714e-06, "loss": 1.0986, "step": 8755 }, { "epoch": 2.695280950732664, "grad_norm": 2.243601083755493, "learning_rate": 1.26626074507821e-06, "loss": 1.0889, "step": 8760 }, { "epoch": 2.696819353101804, "grad_norm": 1.0789637565612793, "learning_rate": 1.2536356905476748e-06, "loss": 1.1933, "step": 8765 }, { "epoch": 2.6983577554709433, "grad_norm": 1.564278483390808, "learning_rate": 1.2410722710390954e-06, "loss": 1.1819, "step": 8770 }, { "epoch": 2.699896157840083, "grad_norm": 1.5282732248306274, "learning_rate": 1.2285705191615426e-06, "loss": 1.2118, "step": 8775 }, { "epoch": 2.701434560209223, "grad_norm": 1.438302993774414, "learning_rate": 1.21613046736401e-06, "loss": 1.2306, "step": 8780 }, { "epoch": 2.7029729625783623, "grad_norm": 1.248544454574585, "learning_rate": 1.203752147935347e-06, "loss": 1.2009, "step": 8785 }, { "epoch": 2.704511364947502, "grad_norm": 1.5993555784225464, "learning_rate": 1.1914355930041837e-06, "loss": 1.0794, "step": 8790 }, { "epoch": 2.7060497673166415, "grad_norm": 1.6044913530349731, "learning_rate": 1.179180834538826e-06, "loss": 1.1427, "step": 8795 }, { "epoch": 2.7075881696857813, "grad_norm": 1.4509234428405762, "learning_rate": 1.166987904347186e-06, "loss": 1.123, "step": 8800 }, { "epoch": 2.709126572054921, "grad_norm": 1.0035152435302734, "learning_rate": 1.1548568340767036e-06, "loss": 1.1457, "step": 8805 }, { "epoch": 2.7106649744240605, "grad_norm": 1.530730962753296, "learning_rate": 1.142787655214253e-06, "loss": 1.1163, "step": 8810 }, { "epoch": 2.7122033767932003, "grad_norm": 1.7669676542282104, "learning_rate": 1.1307803990860594e-06, "loss": 1.0995, "step": 8815 }, { "epoch": 2.7137417791623397, "grad_norm": 1.4253056049346924, "learning_rate": 1.1188350968576372e-06, "loss": 1.1652, "step": 8820 }, { "epoch": 2.7152801815314795, "grad_norm": 1.14907705783844, "learning_rate": 1.1069517795336825e-06, "loss": 1.1864, "step": 8825 }, { "epoch": 2.7168185839006194, "grad_norm": 1.3076614141464233, "learning_rate": 1.0951304779580146e-06, "loss": 1.0647, "step": 8830 }, { "epoch": 2.7183569862697587, "grad_norm": 1.654503583908081, "learning_rate": 1.0833712228134952e-06, "loss": 1.2105, "step": 8835 }, { "epoch": 2.7198953886388986, "grad_norm": 1.136584758758545, "learning_rate": 1.0716740446219175e-06, "loss": 1.167, "step": 8840 }, { "epoch": 2.721433791008038, "grad_norm": 1.5034282207489014, "learning_rate": 1.0600389737439681e-06, "loss": 1.1643, "step": 8845 }, { "epoch": 2.7229721933771778, "grad_norm": 1.1271815299987793, "learning_rate": 1.0484660403791314e-06, "loss": 1.2436, "step": 8850 }, { "epoch": 2.7245105957463176, "grad_norm": 1.1455885171890259, "learning_rate": 1.0369552745656013e-06, "loss": 1.0879, "step": 8855 }, { "epoch": 2.726048998115457, "grad_norm": 1.2253069877624512, "learning_rate": 1.025506706180221e-06, "loss": 1.206, "step": 8860 }, { "epoch": 2.727587400484597, "grad_norm": 1.2960996627807617, "learning_rate": 1.0141203649383924e-06, "loss": 1.2533, "step": 8865 }, { "epoch": 2.729125802853736, "grad_norm": 1.177007794380188, "learning_rate": 1.0027962803939944e-06, "loss": 1.1394, "step": 8870 }, { "epoch": 2.730664205222876, "grad_norm": 1.0711764097213745, "learning_rate": 9.91534481939338e-07, "loss": 1.1713, "step": 8875 }, { "epoch": 2.732202607592016, "grad_norm": 1.1851369142532349, "learning_rate": 9.80334998805041e-07, "loss": 1.1108, "step": 8880 }, { "epoch": 2.733741009961155, "grad_norm": 1.4155503511428833, "learning_rate": 9.691978600599977e-07, "loss": 1.191, "step": 8885 }, { "epoch": 2.735279412330295, "grad_norm": 1.3459392786026, "learning_rate": 9.581230946112824e-07, "loss": 0.9247, "step": 8890 }, { "epoch": 2.7368178146994344, "grad_norm": 1.4829347133636475, "learning_rate": 9.471107312040567e-07, "loss": 1.0897, "step": 8895 }, { "epoch": 2.7383562170685742, "grad_norm": 1.6139320135116577, "learning_rate": 9.361607984215342e-07, "loss": 1.1201, "step": 8900 }, { "epoch": 2.739894619437714, "grad_norm": 1.500117540359497, "learning_rate": 9.25273324684886e-07, "loss": 1.2296, "step": 8905 }, { "epoch": 2.7414330218068534, "grad_norm": 0.9885631203651428, "learning_rate": 9.144483382531571e-07, "loss": 1.1622, "step": 8910 }, { "epoch": 2.7429714241759933, "grad_norm": 1.3149123191833496, "learning_rate": 9.036858672232057e-07, "loss": 1.1528, "step": 8915 }, { "epoch": 2.7445098265451326, "grad_norm": 1.1396979093551636, "learning_rate": 8.929859395296364e-07, "loss": 1.1723, "step": 8920 }, { "epoch": 2.7460482289142725, "grad_norm": 1.0069197416305542, "learning_rate": 8.823485829447003e-07, "loss": 1.1311, "step": 8925 }, { "epoch": 2.7475866312834123, "grad_norm": 1.2922357320785522, "learning_rate": 8.717738250782675e-07, "loss": 1.2323, "step": 8930 }, { "epoch": 2.7491250336525517, "grad_norm": 1.268516182899475, "learning_rate": 8.612616933777046e-07, "loss": 1.1243, "step": 8935 }, { "epoch": 2.7506634360216915, "grad_norm": 1.552514910697937, "learning_rate": 8.508122151278442e-07, "loss": 1.0926, "step": 8940 }, { "epoch": 2.752201838390831, "grad_norm": 1.1461021900177002, "learning_rate": 8.404254174509019e-07, "loss": 1.1677, "step": 8945 }, { "epoch": 2.7537402407599707, "grad_norm": 1.04787278175354, "learning_rate": 8.301013273063791e-07, "loss": 1.1723, "step": 8950 }, { "epoch": 2.7552786431291105, "grad_norm": 2.725139617919922, "learning_rate": 8.198399714910404e-07, "loss": 1.183, "step": 8955 }, { "epoch": 2.75681704549825, "grad_norm": 3.641418218612671, "learning_rate": 8.096413766388117e-07, "loss": 1.0918, "step": 8960 }, { "epoch": 2.7583554478673897, "grad_norm": 1.5253046751022339, "learning_rate": 7.995055692207127e-07, "loss": 1.2304, "step": 8965 }, { "epoch": 2.759893850236529, "grad_norm": 1.2496076822280884, "learning_rate": 7.894325755448073e-07, "loss": 1.1795, "step": 8970 }, { "epoch": 2.761432252605669, "grad_norm": 1.7881182432174683, "learning_rate": 7.794224217561152e-07, "loss": 1.0852, "step": 8975 }, { "epoch": 2.7629706549748088, "grad_norm": 2.052133083343506, "learning_rate": 7.694751338365447e-07, "loss": 1.1222, "step": 8980 }, { "epoch": 2.7645090573439486, "grad_norm": 1.0607606172561646, "learning_rate": 7.595907376048512e-07, "loss": 1.0621, "step": 8985 }, { "epoch": 2.766047459713088, "grad_norm": 1.3713186979293823, "learning_rate": 7.497692587165345e-07, "loss": 1.2151, "step": 8990 }, { "epoch": 2.7675858620822273, "grad_norm": 1.383420467376709, "learning_rate": 7.40010722663792e-07, "loss": 1.0939, "step": 8995 }, { "epoch": 2.769124264451367, "grad_norm": 0.9419933557510376, "learning_rate": 7.303151547754627e-07, "loss": 1.1962, "step": 9000 }, { "epoch": 2.770662666820507, "grad_norm": 1.4444811344146729, "learning_rate": 7.20682580216922e-07, "loss": 1.1294, "step": 9005 }, { "epoch": 2.772201069189647, "grad_norm": 1.7534838914871216, "learning_rate": 7.111130239900677e-07, "loss": 1.1511, "step": 9010 }, { "epoch": 2.773739471558786, "grad_norm": 1.5162113904953003, "learning_rate": 7.016065109332226e-07, "loss": 1.0526, "step": 9015 }, { "epoch": 2.7752778739279256, "grad_norm": 1.8117800951004028, "learning_rate": 6.921630657210659e-07, "loss": 1.0708, "step": 9020 }, { "epoch": 2.7768162762970654, "grad_norm": 1.098140001296997, "learning_rate": 6.827827128645992e-07, "loss": 1.0874, "step": 9025 }, { "epoch": 2.7783546786662052, "grad_norm": 0.9571955800056458, "learning_rate": 6.734654767110521e-07, "loss": 1.1287, "step": 9030 }, { "epoch": 2.779893081035345, "grad_norm": 2.1803581714630127, "learning_rate": 6.6421138144383e-07, "loss": 1.2326, "step": 9035 }, { "epoch": 2.7814314834044844, "grad_norm": 1.0323264598846436, "learning_rate": 6.550204510824609e-07, "loss": 1.1212, "step": 9040 }, { "epoch": 2.7829698857736243, "grad_norm": 2.1114048957824707, "learning_rate": 6.458927094825179e-07, "loss": 1.2275, "step": 9045 }, { "epoch": 2.7845082881427636, "grad_norm": 1.30685293674469, "learning_rate": 6.368281803355691e-07, "loss": 1.1677, "step": 9050 }, { "epoch": 2.7860466905119035, "grad_norm": 1.824419379234314, "learning_rate": 6.27826887169114e-07, "loss": 1.057, "step": 9055 }, { "epoch": 2.7875850928810433, "grad_norm": 0.9725929498672485, "learning_rate": 6.188888533465053e-07, "loss": 1.166, "step": 9060 }, { "epoch": 2.7891234952501827, "grad_norm": 1.6305102109909058, "learning_rate": 6.100141020669137e-07, "loss": 1.166, "step": 9065 }, { "epoch": 2.7906618976193225, "grad_norm": 1.65754234790802, "learning_rate": 6.012026563652573e-07, "loss": 1.2142, "step": 9070 }, { "epoch": 2.792200299988462, "grad_norm": 1.303271770477295, "learning_rate": 5.924545391121361e-07, "loss": 1.2034, "step": 9075 }, { "epoch": 2.7937387023576017, "grad_norm": 2.9017386436462402, "learning_rate": 5.837697730137814e-07, "loss": 1.0679, "step": 9080 }, { "epoch": 2.7952771047267415, "grad_norm": 1.4329947233200073, "learning_rate": 5.751483806119923e-07, "loss": 1.2121, "step": 9085 }, { "epoch": 2.796815507095881, "grad_norm": 0.9450810551643372, "learning_rate": 5.665903842840714e-07, "loss": 1.0761, "step": 9090 }, { "epoch": 2.7983539094650207, "grad_norm": 2.1919000148773193, "learning_rate": 5.580958062427866e-07, "loss": 1.1133, "step": 9095 }, { "epoch": 2.79989231183416, "grad_norm": 1.355247139930725, "learning_rate": 5.496646685362844e-07, "loss": 1.0562, "step": 9100 }, { "epoch": 2.8014307142033, "grad_norm": 1.137313723564148, "learning_rate": 5.412969930480599e-07, "loss": 1.1331, "step": 9105 }, { "epoch": 2.8029691165724397, "grad_norm": 3.3919472694396973, "learning_rate": 5.329928014968843e-07, "loss": 1.1539, "step": 9110 }, { "epoch": 2.804507518941579, "grad_norm": 1.3040916919708252, "learning_rate": 5.247521154367552e-07, "loss": 1.1187, "step": 9115 }, { "epoch": 2.806045921310719, "grad_norm": 1.5795443058013916, "learning_rate": 5.165749562568323e-07, "loss": 1.1518, "step": 9120 }, { "epoch": 2.8075843236798583, "grad_norm": 1.486302137374878, "learning_rate": 5.084613451813935e-07, "loss": 1.0955, "step": 9125 }, { "epoch": 2.809122726048998, "grad_norm": 1.4122854471206665, "learning_rate": 5.00411303269771e-07, "loss": 1.1842, "step": 9130 }, { "epoch": 2.810661128418138, "grad_norm": 1.3084039688110352, "learning_rate": 4.924248514163038e-07, "loss": 1.1416, "step": 9135 }, { "epoch": 2.8121995307872774, "grad_norm": 1.42378830909729, "learning_rate": 4.845020103502712e-07, "loss": 1.1191, "step": 9140 }, { "epoch": 2.813737933156417, "grad_norm": 1.8624348640441895, "learning_rate": 4.766428006358542e-07, "loss": 1.021, "step": 9145 }, { "epoch": 2.8152763355255566, "grad_norm": 1.3880897760391235, "learning_rate": 4.688472426720714e-07, "loss": 1.0401, "step": 9150 }, { "epoch": 2.8168147378946964, "grad_norm": 1.7636202573776245, "learning_rate": 4.611153566927373e-07, "loss": 1.1469, "step": 9155 }, { "epoch": 2.818353140263836, "grad_norm": 1.6327768564224243, "learning_rate": 4.534471627663878e-07, "loss": 1.1372, "step": 9160 }, { "epoch": 2.8198915426329756, "grad_norm": 1.5780202150344849, "learning_rate": 4.4584268079625735e-07, "loss": 1.1779, "step": 9165 }, { "epoch": 2.8214299450021154, "grad_norm": 1.1375956535339355, "learning_rate": 4.3830193052020186e-07, "loss": 1.11, "step": 9170 }, { "epoch": 2.822968347371255, "grad_norm": 1.3757110834121704, "learning_rate": 4.308249315106649e-07, "loss": 1.0576, "step": 9175 }, { "epoch": 2.8245067497403946, "grad_norm": 1.2449181079864502, "learning_rate": 4.234117031746143e-07, "loss": 1.1202, "step": 9180 }, { "epoch": 2.8260451521095344, "grad_norm": 1.2886162996292114, "learning_rate": 4.1606226475350287e-07, "loss": 1.2508, "step": 9185 }, { "epoch": 2.827583554478674, "grad_norm": 2.6857354640960693, "learning_rate": 4.087766353232103e-07, "loss": 1.1784, "step": 9190 }, { "epoch": 2.8291219568478136, "grad_norm": 1.6899442672729492, "learning_rate": 4.015548337939962e-07, "loss": 1.2262, "step": 9195 }, { "epoch": 2.830660359216953, "grad_norm": 1.3128477334976196, "learning_rate": 3.943968789104496e-07, "loss": 1.186, "step": 9200 }, { "epoch": 2.832198761586093, "grad_norm": 1.329758882522583, "learning_rate": 3.87302789251448e-07, "loss": 1.1431, "step": 9205 }, { "epoch": 2.8337371639552327, "grad_norm": 1.1432725191116333, "learning_rate": 3.8027258323010127e-07, "loss": 1.0308, "step": 9210 }, { "epoch": 2.835275566324372, "grad_norm": 1.351478099822998, "learning_rate": 3.733062790936964e-07, "loss": 1.0939, "step": 9215 }, { "epoch": 2.836813968693512, "grad_norm": 1.2677690982818604, "learning_rate": 3.6640389492367534e-07, "loss": 1.1051, "step": 9220 }, { "epoch": 2.8383523710626513, "grad_norm": 2.572026014328003, "learning_rate": 3.5956544863555983e-07, "loss": 1.0794, "step": 9225 }, { "epoch": 2.839890773431791, "grad_norm": 1.2987449169158936, "learning_rate": 3.5279095797892127e-07, "loss": 1.2363, "step": 9230 }, { "epoch": 2.841429175800931, "grad_norm": 1.7580738067626953, "learning_rate": 3.460804405373302e-07, "loss": 1.0583, "step": 9235 }, { "epoch": 2.8429675781700703, "grad_norm": 1.4407984018325806, "learning_rate": 3.394339137283098e-07, "loss": 1.2122, "step": 9240 }, { "epoch": 2.84450598053921, "grad_norm": 1.2053810358047485, "learning_rate": 3.328513948032991e-07, "loss": 1.1871, "step": 9245 }, { "epoch": 2.8460443829083495, "grad_norm": 1.9730299711227417, "learning_rate": 3.263329008475924e-07, "loss": 1.0696, "step": 9250 }, { "epoch": 2.8475827852774893, "grad_norm": 0.805873453617096, "learning_rate": 3.1987844878030307e-07, "loss": 1.0279, "step": 9255 }, { "epoch": 2.849121187646629, "grad_norm": 1.1469902992248535, "learning_rate": 3.1348805535432735e-07, "loss": 1.1212, "step": 9260 }, { "epoch": 2.8506595900157685, "grad_norm": 1.4342927932739258, "learning_rate": 3.071617371562946e-07, "loss": 1.1438, "step": 9265 }, { "epoch": 2.8521979923849083, "grad_norm": 1.4344321489334106, "learning_rate": 3.0089951060651156e-07, "loss": 1.2395, "step": 9270 }, { "epoch": 2.8537363947540477, "grad_norm": 1.1882743835449219, "learning_rate": 2.947013919589431e-07, "loss": 1.0578, "step": 9275 }, { "epoch": 2.8552747971231875, "grad_norm": 1.8451581001281738, "learning_rate": 2.88567397301151e-07, "loss": 1.1106, "step": 9280 }, { "epoch": 2.8568131994923274, "grad_norm": 1.0893770456314087, "learning_rate": 2.824975425542664e-07, "loss": 1.0286, "step": 9285 }, { "epoch": 2.8583516018614668, "grad_norm": 1.2297017574310303, "learning_rate": 2.764918434729369e-07, "loss": 1.1635, "step": 9290 }, { "epoch": 2.8598900042306066, "grad_norm": 1.1600873470306396, "learning_rate": 2.7055031564529043e-07, "loss": 1.1557, "step": 9295 }, { "epoch": 2.861428406599746, "grad_norm": 1.1898655891418457, "learning_rate": 2.646729744928966e-07, "loss": 1.0724, "step": 9300 }, { "epoch": 2.862966808968886, "grad_norm": 1.7865312099456787, "learning_rate": 2.588598352707278e-07, "loss": 1.1252, "step": 9305 }, { "epoch": 2.8645052113380256, "grad_norm": 1.6626031398773193, "learning_rate": 2.531109130671061e-07, "loss": 1.1687, "step": 9310 }, { "epoch": 2.866043613707165, "grad_norm": 1.245299220085144, "learning_rate": 2.474262228036872e-07, "loss": 1.0814, "step": 9315 }, { "epoch": 2.867582016076305, "grad_norm": 1.5507864952087402, "learning_rate": 2.418057792354045e-07, "loss": 1.085, "step": 9320 }, { "epoch": 2.869120418445444, "grad_norm": 2.039008140563965, "learning_rate": 2.3624959695043302e-07, "loss": 1.0728, "step": 9325 }, { "epoch": 2.870658820814584, "grad_norm": 1.1104451417922974, "learning_rate": 2.3075769037015638e-07, "loss": 1.1025, "step": 9330 }, { "epoch": 2.872197223183724, "grad_norm": 1.1640777587890625, "learning_rate": 2.2533007374912485e-07, "loss": 1.2473, "step": 9335 }, { "epoch": 2.873735625552863, "grad_norm": 1.0820621252059937, "learning_rate": 2.1996676117502224e-07, "loss": 1.1593, "step": 9340 }, { "epoch": 2.875274027922003, "grad_norm": 1.981393575668335, "learning_rate": 2.146677665686325e-07, "loss": 1.2071, "step": 9345 }, { "epoch": 2.8768124302911424, "grad_norm": 1.2725703716278076, "learning_rate": 2.094331036837871e-07, "loss": 1.179, "step": 9350 }, { "epoch": 2.8783508326602822, "grad_norm": 1.1721535921096802, "learning_rate": 2.0426278610735094e-07, "loss": 1.1551, "step": 9355 }, { "epoch": 2.879889235029422, "grad_norm": 1.6702649593353271, "learning_rate": 1.9915682725917262e-07, "loss": 0.9862, "step": 9360 }, { "epoch": 2.8814276373985614, "grad_norm": 0.7977102994918823, "learning_rate": 1.9411524039205376e-07, "loss": 1.1702, "step": 9365 }, { "epoch": 2.8829660397677013, "grad_norm": 1.0486433506011963, "learning_rate": 1.891380385917213e-07, "loss": 1.1837, "step": 9370 }, { "epoch": 2.8845044421368407, "grad_norm": 1.5188802480697632, "learning_rate": 1.842252347767748e-07, "loss": 1.153, "step": 9375 }, { "epoch": 2.8860428445059805, "grad_norm": 1.3773759603500366, "learning_rate": 1.7937684169867797e-07, "loss": 1.1127, "step": 9380 }, { "epoch": 2.8875812468751203, "grad_norm": 1.7607942819595337, "learning_rate": 1.7459287194170615e-07, "loss": 1.1165, "step": 9385 }, { "epoch": 2.8891196492442597, "grad_norm": 1.1703269481658936, "learning_rate": 1.6987333792292115e-07, "loss": 1.2549, "step": 9390 }, { "epoch": 2.8906580516133995, "grad_norm": 1.1491749286651611, "learning_rate": 1.6521825189213526e-07, "loss": 1.1517, "step": 9395 }, { "epoch": 2.892196453982539, "grad_norm": 1.7843077182769775, "learning_rate": 1.6062762593188896e-07, "loss": 1.0008, "step": 9400 }, { "epoch": 2.8937348563516787, "grad_norm": 1.4365227222442627, "learning_rate": 1.5610147195740943e-07, "loss": 1.0936, "step": 9405 }, { "epoch": 2.8952732587208185, "grad_norm": 1.1041733026504517, "learning_rate": 1.5163980171658542e-07, "loss": 1.171, "step": 9410 }, { "epoch": 2.896811661089958, "grad_norm": 0.9337441921234131, "learning_rate": 1.472426267899285e-07, "loss": 1.1666, "step": 9415 }, { "epoch": 2.8983500634590977, "grad_norm": 1.0685206651687622, "learning_rate": 1.4290995859055633e-07, "loss": 1.164, "step": 9420 }, { "epoch": 2.899888465828237, "grad_norm": 1.5505434274673462, "learning_rate": 1.386418083641483e-07, "loss": 1.2611, "step": 9425 }, { "epoch": 2.901426868197377, "grad_norm": 1.558901071548462, "learning_rate": 1.3443818718893442e-07, "loss": 1.1891, "step": 9430 }, { "epoch": 2.9029652705665168, "grad_norm": 1.5380712747573853, "learning_rate": 1.3029910597564532e-07, "loss": 1.1009, "step": 9435 }, { "epoch": 2.9045036729356566, "grad_norm": 1.8693783283233643, "learning_rate": 1.2622457546749567e-07, "loss": 1.1941, "step": 9440 }, { "epoch": 2.906042075304796, "grad_norm": 1.0699331760406494, "learning_rate": 1.2221460624016466e-07, "loss": 1.1932, "step": 9445 }, { "epoch": 2.9075804776739353, "grad_norm": 1.01662278175354, "learning_rate": 1.1826920870174895e-07, "loss": 1.2584, "step": 9450 }, { "epoch": 2.909118880043075, "grad_norm": 1.0580191612243652, "learning_rate": 1.143883930927514e-07, "loss": 1.1569, "step": 9455 }, { "epoch": 2.910657282412215, "grad_norm": 1.4499502182006836, "learning_rate": 1.1057216948604509e-07, "loss": 1.1326, "step": 9460 }, { "epoch": 2.912195684781355, "grad_norm": 1.1739625930786133, "learning_rate": 1.068205477868539e-07, "loss": 1.011, "step": 9465 }, { "epoch": 2.913734087150494, "grad_norm": 1.2261017560958862, "learning_rate": 1.0313353773271917e-07, "loss": 1.1797, "step": 9470 }, { "epoch": 2.915272489519634, "grad_norm": 1.1943910121917725, "learning_rate": 9.951114889348855e-08, "loss": 1.1561, "step": 9475 }, { "epoch": 2.9168108918887734, "grad_norm": 1.6418399810791016, "learning_rate": 9.595339067127174e-08, "loss": 1.084, "step": 9480 }, { "epoch": 2.9183492942579132, "grad_norm": 1.0940558910369873, "learning_rate": 9.24602723004292e-08, "loss": 1.109, "step": 9485 }, { "epoch": 2.919887696627053, "grad_norm": 1.4907678365707397, "learning_rate": 8.903180284755008e-08, "loss": 1.0309, "step": 9490 }, { "epoch": 2.9214260989961924, "grad_norm": 1.6203380823135376, "learning_rate": 8.566799121141334e-08, "loss": 1.1081, "step": 9495 }, { "epoch": 2.9229645013653323, "grad_norm": 1.3921715021133423, "learning_rate": 8.23688461229849e-08, "loss": 1.1672, "step": 9500 }, { "epoch": 2.9245029037344716, "grad_norm": 1.6819571256637573, "learning_rate": 7.913437614538166e-08, "loss": 1.0484, "step": 9505 }, { "epoch": 2.9260413061036115, "grad_norm": 1.9692211151123047, "learning_rate": 7.596458967384922e-08, "loss": 1.2351, "step": 9510 }, { "epoch": 2.9275797084727513, "grad_norm": 1.2511972188949585, "learning_rate": 7.285949493574806e-08, "loss": 1.2732, "step": 9515 }, { "epoch": 2.9291181108418907, "grad_norm": 0.920283854007721, "learning_rate": 6.98190999905285e-08, "loss": 1.1274, "step": 9520 }, { "epoch": 2.9306565132110305, "grad_norm": 1.1254204511642456, "learning_rate": 6.684341272970018e-08, "loss": 1.023, "step": 9525 }, { "epoch": 2.93219491558017, "grad_norm": 0.9248682856559753, "learning_rate": 6.393244087683215e-08, "loss": 1.2624, "step": 9530 }, { "epoch": 2.9337333179493097, "grad_norm": 1.0681920051574707, "learning_rate": 6.108619198751109e-08, "loss": 1.1346, "step": 9535 }, { "epoch": 2.9352717203184495, "grad_norm": 1.8738933801651, "learning_rate": 5.8304673449338653e-08, "loss": 1.1768, "step": 9540 }, { "epoch": 2.936810122687589, "grad_norm": 1.5245164632797241, "learning_rate": 5.558789248190366e-08, "loss": 1.143, "step": 9545 }, { "epoch": 2.9383485250567287, "grad_norm": 1.1693507432937622, "learning_rate": 5.293585613675989e-08, "loss": 1.1751, "step": 9550 }, { "epoch": 2.939886927425868, "grad_norm": 1.6646199226379395, "learning_rate": 5.034857129741777e-08, "loss": 1.122, "step": 9555 }, { "epoch": 2.941425329795008, "grad_norm": 1.3896692991256714, "learning_rate": 4.782604467931939e-08, "loss": 1.1138, "step": 9560 }, { "epoch": 2.9429637321641477, "grad_norm": 1.3115090131759644, "learning_rate": 4.5368282829827415e-08, "loss": 1.1148, "step": 9565 }, { "epoch": 2.944502134533287, "grad_norm": 1.0006541013717651, "learning_rate": 4.2975292128200064e-08, "loss": 1.2622, "step": 9570 }, { "epoch": 2.946040536902427, "grad_norm": 1.6213256120681763, "learning_rate": 4.064707878557728e-08, "loss": 1.2263, "step": 9575 }, { "epoch": 2.9475789392715663, "grad_norm": 4.496711254119873, "learning_rate": 3.838364884496681e-08, "loss": 1.2043, "step": 9580 }, { "epoch": 2.949117341640706, "grad_norm": 1.6494616270065308, "learning_rate": 3.618500818123039e-08, "loss": 1.1806, "step": 9585 }, { "epoch": 2.950655744009846, "grad_norm": 1.9413509368896484, "learning_rate": 3.405116250106144e-08, "loss": 1.2, "step": 9590 }, { "epoch": 2.9521941463789854, "grad_norm": 2.1733412742614746, "learning_rate": 3.1982117342979624e-08, "loss": 1.0731, "step": 9595 }, { "epoch": 2.953732548748125, "grad_norm": 1.8069325685501099, "learning_rate": 2.9977878077305785e-08, "loss": 1.2088, "step": 9600 }, { "epoch": 2.9552709511172646, "grad_norm": 2.179551362991333, "learning_rate": 2.8038449906153673e-08, "loss": 1.082, "step": 9605 }, { "epoch": 2.9568093534864044, "grad_norm": 1.2581409215927124, "learning_rate": 2.6163837863418806e-08, "loss": 1.076, "step": 9610 }, { "epoch": 2.958347755855544, "grad_norm": 1.4859249591827393, "learning_rate": 2.4354046814764607e-08, "loss": 1.085, "step": 9615 }, { "epoch": 2.9598861582246836, "grad_norm": 1.2801562547683716, "learning_rate": 2.260908145760299e-08, "loss": 1.1067, "step": 9620 }, { "epoch": 2.9614245605938234, "grad_norm": 1.3596243858337402, "learning_rate": 2.0928946321091547e-08, "loss": 1.2352, "step": 9625 }, { "epoch": 2.962962962962963, "grad_norm": 1.4657814502716064, "learning_rate": 1.931364576611139e-08, "loss": 1.1577, "step": 9630 }, { "epoch": 2.9645013653321026, "grad_norm": 1.2036793231964111, "learning_rate": 1.7763183985269883e-08, "loss": 1.1403, "step": 9635 }, { "epoch": 2.9660397677012424, "grad_norm": 2.143611192703247, "learning_rate": 1.6277565002875696e-08, "loss": 1.0896, "step": 9640 }, { "epoch": 2.967578170070382, "grad_norm": 1.2435050010681152, "learning_rate": 1.4856792674936004e-08, "loss": 1.2366, "step": 9645 }, { "epoch": 2.9691165724395217, "grad_norm": 1.358459711074829, "learning_rate": 1.3500870689145407e-08, "loss": 1.2478, "step": 9650 }, { "epoch": 2.970654974808661, "grad_norm": 1.11142897605896, "learning_rate": 1.2209802564877582e-08, "loss": 1.2521, "step": 9655 }, { "epoch": 2.972193377177801, "grad_norm": 1.6148406267166138, "learning_rate": 1.0983591653168645e-08, "loss": 1.275, "step": 9660 }, { "epoch": 2.9737317795469407, "grad_norm": 0.9854974150657654, "learning_rate": 9.822241136722699e-09, "loss": 1.0939, "step": 9665 }, { "epoch": 2.97527018191608, "grad_norm": 1.0898098945617676, "learning_rate": 8.72575402988407e-09, "loss": 1.1264, "step": 9670 }, { "epoch": 2.97680858428522, "grad_norm": 1.4026813507080078, "learning_rate": 7.694133178653973e-09, "loss": 1.0411, "step": 9675 }, { "epoch": 2.9783469866543593, "grad_norm": 1.993453025817871, "learning_rate": 6.727381260657195e-09, "loss": 1.2356, "step": 9680 }, { "epoch": 2.979885389023499, "grad_norm": 1.4184306859970093, "learning_rate": 5.825500785150428e-09, "loss": 1.139, "step": 9685 }, { "epoch": 2.981423791392639, "grad_norm": 1.8053029775619507, "learning_rate": 4.988494093022267e-09, "loss": 1.1943, "step": 9690 }, { "epoch": 2.9829621937617783, "grad_norm": 1.1999495029449463, "learning_rate": 4.216363356765452e-09, "loss": 1.2553, "step": 9695 }, { "epoch": 2.984500596130918, "grad_norm": 1.530155062675476, "learning_rate": 3.5091105804907487e-09, "loss": 1.1807, "step": 9700 }, { "epoch": 2.9860389985000575, "grad_norm": 1.3728301525115967, "learning_rate": 2.8667375999102964e-09, "loss": 1.2475, "step": 9705 }, { "epoch": 2.9875774008691973, "grad_norm": 1.2360360622406006, "learning_rate": 2.2892460823403794e-09, "loss": 0.998, "step": 9710 }, { "epoch": 2.989115803238337, "grad_norm": 1.8913969993591309, "learning_rate": 1.7766375266931035e-09, "loss": 1.0908, "step": 9715 }, { "epoch": 2.9906542056074765, "grad_norm": 0.986847460269928, "learning_rate": 1.328913263473619e-09, "loss": 1.07, "step": 9720 }, { "epoch": 2.9921926079766163, "grad_norm": 1.6557801961898804, "learning_rate": 9.460744547745704e-10, "loss": 1.1596, "step": 9725 }, { "epoch": 2.9937310103457557, "grad_norm": 2.2507128715515137, "learning_rate": 6.281220942733201e-10, "loss": 1.0748, "step": 9730 }, { "epoch": 2.9952694127148956, "grad_norm": 2.129380702972412, "learning_rate": 3.750570072375004e-10, "loss": 1.3533, "step": 9735 }, { "epoch": 2.9968078150840354, "grad_norm": 1.2016565799713135, "learning_rate": 1.8687985050558355e-10, "loss": 1.0902, "step": 9740 }, { "epoch": 2.9983462174531748, "grad_norm": 1.1874008178710938, "learning_rate": 6.359111250908711e-11, "loss": 1.1668, "step": 9745 }, { "epoch": 2.9998846198223146, "grad_norm": 1.8799904584884644, "learning_rate": 5.191113247593471e-12, "loss": 1.1766, "step": 9750 }, { "epoch": 2.9998846198223146, "step": 9750, "total_flos": 7.742111986089984e+17, "train_loss": 1.1629726183475593, "train_runtime": 20329.8435, "train_samples_per_second": 7.674, "train_steps_per_second": 0.48 } ], "logging_steps": 5, "max_steps": 9750, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 7.742111986089984e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }