{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 47871, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006266842138246538, "grad_norm": 3.7908527851104736, "learning_rate": 1.0000000000000002e-06, "loss": 0.6843, "step": 10 }, { "epoch": 0.0012533684276493076, "grad_norm": 4.450835704803467, "learning_rate": 2.0000000000000003e-06, "loss": 0.6671, "step": 20 }, { "epoch": 0.0018800526414739613, "grad_norm": 4.033405780792236, "learning_rate": 3e-06, "loss": 0.6299, "step": 30 }, { "epoch": 0.002506736855298615, "grad_norm": 4.196229457855225, "learning_rate": 4.000000000000001e-06, "loss": 0.5655, "step": 40 }, { "epoch": 0.0031334210691232687, "grad_norm": 2.715080738067627, "learning_rate": 5e-06, "loss": 0.4726, "step": 50 }, { "epoch": 0.0037601052829479226, "grad_norm": 2.747175455093384, "learning_rate": 6e-06, "loss": 0.3862, "step": 60 }, { "epoch": 0.0043867894967725765, "grad_norm": 2.173097848892212, "learning_rate": 7.000000000000001e-06, "loss": 0.3775, "step": 70 }, { "epoch": 0.00501347371059723, "grad_norm": 1.9927845001220703, "learning_rate": 8.000000000000001e-06, "loss": 0.3042, "step": 80 }, { "epoch": 0.0056401579244218835, "grad_norm": 1.9776703119277954, "learning_rate": 9e-06, "loss": 0.2959, "step": 90 }, { "epoch": 0.006266842138246537, "grad_norm": 1.9182004928588867, "learning_rate": 1e-05, "loss": 0.2846, "step": 100 }, { "epoch": 0.006893526352071191, "grad_norm": 2.0619113445281982, "learning_rate": 1.1000000000000001e-05, "loss": 0.3035, "step": 110 }, { "epoch": 0.007520210565895845, "grad_norm": 0.834303617477417, "learning_rate": 1.2e-05, "loss": 0.2613, "step": 120 }, { "epoch": 0.008146894779720498, "grad_norm": 2.8216769695281982, "learning_rate": 1.3000000000000001e-05, "loss": 0.2193, "step": 130 }, { "epoch": 0.008773578993545153, "grad_norm": 0.9458842873573303, "learning_rate": 1.4000000000000001e-05, "loss": 0.2789, "step": 140 }, { "epoch": 0.009400263207369806, "grad_norm": 2.4860618114471436, "learning_rate": 1.5e-05, "loss": 0.2208, "step": 150 }, { "epoch": 0.01002694742119446, "grad_norm": 0.7528994083404541, "learning_rate": 1.6000000000000003e-05, "loss": 0.3034, "step": 160 }, { "epoch": 0.010653631635019114, "grad_norm": 0.22199566662311554, "learning_rate": 1.7000000000000003e-05, "loss": 0.1009, "step": 170 }, { "epoch": 0.011280315848843767, "grad_norm": 3.4047577381134033, "learning_rate": 1.8e-05, "loss": 0.2154, "step": 180 }, { "epoch": 0.011907000062668422, "grad_norm": 3.8273696899414062, "learning_rate": 1.9e-05, "loss": 0.1748, "step": 190 }, { "epoch": 0.012533684276493075, "grad_norm": 0.13623806834220886, "learning_rate": 2e-05, "loss": 0.0289, "step": 200 }, { "epoch": 0.01316036849031773, "grad_norm": 3.247593879699707, "learning_rate": 2.1e-05, "loss": 0.4862, "step": 210 }, { "epoch": 0.013787052704142383, "grad_norm": 11.889203071594238, "learning_rate": 2.2000000000000003e-05, "loss": 0.3237, "step": 220 }, { "epoch": 0.014413736917967036, "grad_norm": 12.991532325744629, "learning_rate": 2.3000000000000003e-05, "loss": 0.0925, "step": 230 }, { "epoch": 0.01504042113179169, "grad_norm": 0.5583555102348328, "learning_rate": 2.4e-05, "loss": 0.1658, "step": 240 }, { "epoch": 0.015667105345616345, "grad_norm": 1.1460843086242676, "learning_rate": 2.5e-05, "loss": 0.2338, "step": 250 }, { "epoch": 0.016293789559440996, "grad_norm": 0.3752973973751068, "learning_rate": 2.6000000000000002e-05, "loss": 0.0844, "step": 260 }, { "epoch": 0.01692047377326565, "grad_norm": 0.15886737406253815, "learning_rate": 2.7000000000000002e-05, "loss": 0.2237, "step": 270 }, { "epoch": 0.017547157987090306, "grad_norm": 17.93160629272461, "learning_rate": 2.8000000000000003e-05, "loss": 0.1505, "step": 280 }, { "epoch": 0.018173842200914957, "grad_norm": 5.02848482131958, "learning_rate": 2.9e-05, "loss": 0.2666, "step": 290 }, { "epoch": 0.018800526414739612, "grad_norm": 0.12570694088935852, "learning_rate": 3e-05, "loss": 0.1358, "step": 300 }, { "epoch": 0.019427210628564267, "grad_norm": 7.472323894500732, "learning_rate": 3.1e-05, "loss": 0.2556, "step": 310 }, { "epoch": 0.02005389484238892, "grad_norm": 0.15033140778541565, "learning_rate": 3.2000000000000005e-05, "loss": 0.1621, "step": 320 }, { "epoch": 0.020680579056213573, "grad_norm": 0.3948344588279724, "learning_rate": 3.3e-05, "loss": 0.021, "step": 330 }, { "epoch": 0.021307263270038228, "grad_norm": 0.1560896635055542, "learning_rate": 3.4000000000000007e-05, "loss": 0.1498, "step": 340 }, { "epoch": 0.021933947483862883, "grad_norm": 0.2152891755104065, "learning_rate": 3.5e-05, "loss": 0.1955, "step": 350 }, { "epoch": 0.022560631697687534, "grad_norm": 0.12051887810230255, "learning_rate": 3.6e-05, "loss": 0.1427, "step": 360 }, { "epoch": 0.02318731591151219, "grad_norm": 0.1034441739320755, "learning_rate": 3.7e-05, "loss": 0.2262, "step": 370 }, { "epoch": 0.023814000125336843, "grad_norm": 7.567387580871582, "learning_rate": 3.8e-05, "loss": 0.3546, "step": 380 }, { "epoch": 0.024440684339161498, "grad_norm": 12.388092994689941, "learning_rate": 3.9000000000000006e-05, "loss": 0.1448, "step": 390 }, { "epoch": 0.02506736855298615, "grad_norm": 4.503359317779541, "learning_rate": 4e-05, "loss": 0.0713, "step": 400 }, { "epoch": 0.025694052766810804, "grad_norm": 0.23637720942497253, "learning_rate": 4.1e-05, "loss": 0.2683, "step": 410 }, { "epoch": 0.02632073698063546, "grad_norm": 0.2273373007774353, "learning_rate": 4.2e-05, "loss": 0.0707, "step": 420 }, { "epoch": 0.02694742119446011, "grad_norm": 21.912202835083008, "learning_rate": 4.3e-05, "loss": 0.0993, "step": 430 }, { "epoch": 0.027574105408284765, "grad_norm": 6.012325763702393, "learning_rate": 4.4000000000000006e-05, "loss": 0.1063, "step": 440 }, { "epoch": 0.02820078962210942, "grad_norm": 1.8421870470046997, "learning_rate": 4.5e-05, "loss": 0.4188, "step": 450 }, { "epoch": 0.02882747383593407, "grad_norm": 0.906676173210144, "learning_rate": 4.600000000000001e-05, "loss": 0.1205, "step": 460 }, { "epoch": 0.029454158049758726, "grad_norm": 9.664088249206543, "learning_rate": 4.7e-05, "loss": 0.2815, "step": 470 }, { "epoch": 0.03008084226358338, "grad_norm": 0.122682586312294, "learning_rate": 4.8e-05, "loss": 0.1559, "step": 480 }, { "epoch": 0.030707526477408036, "grad_norm": 0.26290163397789, "learning_rate": 4.9e-05, "loss": 0.0847, "step": 490 }, { "epoch": 0.03133421069123269, "grad_norm": 0.10128908604383469, "learning_rate": 5e-05, "loss": 0.1371, "step": 500 }, { "epoch": 0.03196089490505734, "grad_norm": 12.029622077941895, "learning_rate": 4.998944501910452e-05, "loss": 0.2847, "step": 510 }, { "epoch": 0.03258757911888199, "grad_norm": 0.16734056174755096, "learning_rate": 4.997889003820903e-05, "loss": 0.1148, "step": 520 }, { "epoch": 0.03321426333270665, "grad_norm": 31.41100311279297, "learning_rate": 4.9968335057313546e-05, "loss": 0.195, "step": 530 }, { "epoch": 0.0338409475465313, "grad_norm": 3.025646209716797, "learning_rate": 4.995778007641806e-05, "loss": 0.2554, "step": 540 }, { "epoch": 0.03446763176035596, "grad_norm": 4.002821922302246, "learning_rate": 4.994722509552258e-05, "loss": 0.1961, "step": 550 }, { "epoch": 0.03509431597418061, "grad_norm": 0.12222342193126678, "learning_rate": 4.9936670114627096e-05, "loss": 0.3323, "step": 560 }, { "epoch": 0.03572100018800527, "grad_norm": 21.177127838134766, "learning_rate": 4.992611513373161e-05, "loss": 0.4417, "step": 570 }, { "epoch": 0.036347684401829915, "grad_norm": 0.5925000905990601, "learning_rate": 4.991556015283613e-05, "loss": 0.223, "step": 580 }, { "epoch": 0.03697436861565457, "grad_norm": 16.268051147460938, "learning_rate": 4.990500517194064e-05, "loss": 0.181, "step": 590 }, { "epoch": 0.037601052829479224, "grad_norm": 0.11519550532102585, "learning_rate": 4.9894450191045155e-05, "loss": 0.1659, "step": 600 }, { "epoch": 0.03822773704330388, "grad_norm": 0.1412481963634491, "learning_rate": 4.988389521014967e-05, "loss": 0.2244, "step": 610 }, { "epoch": 0.038854421257128534, "grad_norm": 3.1080451011657715, "learning_rate": 4.987334022925419e-05, "loss": 0.0507, "step": 620 }, { "epoch": 0.03948110547095319, "grad_norm": 0.1547059267759323, "learning_rate": 4.98627852483587e-05, "loss": 0.2914, "step": 630 }, { "epoch": 0.04010778968477784, "grad_norm": 0.22034786641597748, "learning_rate": 4.9852230267463215e-05, "loss": 0.223, "step": 640 }, { "epoch": 0.04073447389860249, "grad_norm": 0.7811187505722046, "learning_rate": 4.984167528656773e-05, "loss": 0.2595, "step": 650 }, { "epoch": 0.041361158112427146, "grad_norm": 0.7039777040481567, "learning_rate": 4.983112030567225e-05, "loss": 0.1963, "step": 660 }, { "epoch": 0.0419878423262518, "grad_norm": 15.276497840881348, "learning_rate": 4.9820565324776765e-05, "loss": 0.1323, "step": 670 }, { "epoch": 0.042614526540076456, "grad_norm": 8.510149002075195, "learning_rate": 4.981001034388128e-05, "loss": 0.1898, "step": 680 }, { "epoch": 0.04324121075390111, "grad_norm": 1.145938515663147, "learning_rate": 4.97994553629858e-05, "loss": 0.1372, "step": 690 }, { "epoch": 0.043867894967725765, "grad_norm": 0.04701605811715126, "learning_rate": 4.978890038209031e-05, "loss": 0.1105, "step": 700 }, { "epoch": 0.04449457918155042, "grad_norm": 9.567878723144531, "learning_rate": 4.9778345401194825e-05, "loss": 0.3655, "step": 710 }, { "epoch": 0.04512126339537507, "grad_norm": 14.110966682434082, "learning_rate": 4.976779042029934e-05, "loss": 0.1356, "step": 720 }, { "epoch": 0.04574794760919972, "grad_norm": 0.08877156674861908, "learning_rate": 4.975723543940386e-05, "loss": 0.1217, "step": 730 }, { "epoch": 0.04637463182302438, "grad_norm": 0.07869545370340347, "learning_rate": 4.974668045850837e-05, "loss": 0.1509, "step": 740 }, { "epoch": 0.04700131603684903, "grad_norm": 5.638880252838135, "learning_rate": 4.973612547761289e-05, "loss": 0.1362, "step": 750 }, { "epoch": 0.04762800025067369, "grad_norm": 0.07686334103345871, "learning_rate": 4.972557049671741e-05, "loss": 0.0054, "step": 760 }, { "epoch": 0.04825468446449834, "grad_norm": 0.11897100508213043, "learning_rate": 4.971501551582192e-05, "loss": 0.2837, "step": 770 }, { "epoch": 0.048881368678322996, "grad_norm": 15.471230506896973, "learning_rate": 4.9704460534926435e-05, "loss": 0.0393, "step": 780 }, { "epoch": 0.049508052892147644, "grad_norm": 9.890192985534668, "learning_rate": 4.969390555403095e-05, "loss": 0.1452, "step": 790 }, { "epoch": 0.0501347371059723, "grad_norm": 12.074395179748535, "learning_rate": 4.968335057313547e-05, "loss": 0.4167, "step": 800 }, { "epoch": 0.050761421319796954, "grad_norm": 5.107584476470947, "learning_rate": 4.967279559223998e-05, "loss": 0.171, "step": 810 }, { "epoch": 0.05138810553362161, "grad_norm": 0.09638172388076782, "learning_rate": 4.9662240611344495e-05, "loss": 0.0487, "step": 820 }, { "epoch": 0.05201478974744626, "grad_norm": 6.376888275146484, "learning_rate": 4.965168563044901e-05, "loss": 0.1593, "step": 830 }, { "epoch": 0.05264147396127092, "grad_norm": 2.0702414512634277, "learning_rate": 4.964113064955352e-05, "loss": 0.277, "step": 840 }, { "epoch": 0.053268158175095566, "grad_norm": 0.21229584515094757, "learning_rate": 4.9630575668658045e-05, "loss": 0.1619, "step": 850 }, { "epoch": 0.05389484238892022, "grad_norm": 15.22604751586914, "learning_rate": 4.962002068776256e-05, "loss": 0.1956, "step": 860 }, { "epoch": 0.054521526602744876, "grad_norm": 0.05341089889407158, "learning_rate": 4.960946570686708e-05, "loss": 0.3803, "step": 870 }, { "epoch": 0.05514821081656953, "grad_norm": 0.09002327173948288, "learning_rate": 4.959891072597159e-05, "loss": 0.0758, "step": 880 }, { "epoch": 0.055774895030394185, "grad_norm": 6.2136430740356445, "learning_rate": 4.9588355745076104e-05, "loss": 0.1879, "step": 890 }, { "epoch": 0.05640157924421884, "grad_norm": 0.36718255281448364, "learning_rate": 4.957780076418062e-05, "loss": 0.2352, "step": 900 }, { "epoch": 0.057028263458043495, "grad_norm": 0.7451048493385315, "learning_rate": 4.956724578328513e-05, "loss": 0.1774, "step": 910 }, { "epoch": 0.05765494767186814, "grad_norm": 0.721674382686615, "learning_rate": 4.955669080238965e-05, "loss": 0.1824, "step": 920 }, { "epoch": 0.0582816318856928, "grad_norm": 18.23922348022461, "learning_rate": 4.9546135821494164e-05, "loss": 0.2313, "step": 930 }, { "epoch": 0.05890831609951745, "grad_norm": 10.863558769226074, "learning_rate": 4.953558084059868e-05, "loss": 0.271, "step": 940 }, { "epoch": 0.05953500031334211, "grad_norm": 14.508039474487305, "learning_rate": 4.95250258597032e-05, "loss": 0.1408, "step": 950 }, { "epoch": 0.06016168452716676, "grad_norm": 5.872216701507568, "learning_rate": 4.9514470878807714e-05, "loss": 0.1507, "step": 960 }, { "epoch": 0.060788368740991416, "grad_norm": 7.473176002502441, "learning_rate": 4.950391589791223e-05, "loss": 0.2671, "step": 970 }, { "epoch": 0.06141505295481607, "grad_norm": 3.9476542472839355, "learning_rate": 4.949336091701674e-05, "loss": 0.264, "step": 980 }, { "epoch": 0.06204173716864072, "grad_norm": 0.1643088310956955, "learning_rate": 4.948280593612126e-05, "loss": 0.1544, "step": 990 }, { "epoch": 0.06266842138246538, "grad_norm": 0.5590381026268005, "learning_rate": 4.9472250955225774e-05, "loss": 0.2018, "step": 1000 }, { "epoch": 0.06329510559629004, "grad_norm": 3.769355058670044, "learning_rate": 4.946169597433029e-05, "loss": 0.2406, "step": 1010 }, { "epoch": 0.06392178981011468, "grad_norm": 11.80628776550293, "learning_rate": 4.94511409934348e-05, "loss": 0.1427, "step": 1020 }, { "epoch": 0.06454847402393933, "grad_norm": 11.33843994140625, "learning_rate": 4.944058601253932e-05, "loss": 0.3128, "step": 1030 }, { "epoch": 0.06517515823776399, "grad_norm": 1.8439558744430542, "learning_rate": 4.9430031031643834e-05, "loss": 0.0682, "step": 1040 }, { "epoch": 0.06580184245158864, "grad_norm": 0.4288315176963806, "learning_rate": 4.941947605074835e-05, "loss": 0.1597, "step": 1050 }, { "epoch": 0.0664285266654133, "grad_norm": 0.11158425360918045, "learning_rate": 4.940892106985287e-05, "loss": 0.0531, "step": 1060 }, { "epoch": 0.06705521087923795, "grad_norm": 1.80587899684906, "learning_rate": 4.9398366088957384e-05, "loss": 0.1558, "step": 1070 }, { "epoch": 0.0676818950930626, "grad_norm": 9.482861518859863, "learning_rate": 4.93878111080619e-05, "loss": 0.1914, "step": 1080 }, { "epoch": 0.06830857930688726, "grad_norm": 3.83756685256958, "learning_rate": 4.937725612716641e-05, "loss": 0.1753, "step": 1090 }, { "epoch": 0.06893526352071191, "grad_norm": 0.12418217211961746, "learning_rate": 4.936670114627093e-05, "loss": 0.134, "step": 1100 }, { "epoch": 0.06956194773453657, "grad_norm": 3.1548798084259033, "learning_rate": 4.9356146165375444e-05, "loss": 0.1173, "step": 1110 }, { "epoch": 0.07018863194836122, "grad_norm": 2.56209135055542, "learning_rate": 4.934559118447996e-05, "loss": 0.2539, "step": 1120 }, { "epoch": 0.07081531616218588, "grad_norm": 3.5509636402130127, "learning_rate": 4.933503620358447e-05, "loss": 0.2879, "step": 1130 }, { "epoch": 0.07144200037601053, "grad_norm": 0.764828622341156, "learning_rate": 4.932448122268899e-05, "loss": 0.2342, "step": 1140 }, { "epoch": 0.07206868458983519, "grad_norm": 4.222464561462402, "learning_rate": 4.9313926241793503e-05, "loss": 0.0773, "step": 1150 }, { "epoch": 0.07269536880365983, "grad_norm": 0.05031255632638931, "learning_rate": 4.930337126089802e-05, "loss": 0.0528, "step": 1160 }, { "epoch": 0.07332205301748448, "grad_norm": 12.016239166259766, "learning_rate": 4.929281628000254e-05, "loss": 0.4079, "step": 1170 }, { "epoch": 0.07394873723130914, "grad_norm": 0.22898195683956146, "learning_rate": 4.928226129910705e-05, "loss": 0.1286, "step": 1180 }, { "epoch": 0.0745754214451338, "grad_norm": 1.5522078275680542, "learning_rate": 4.927170631821157e-05, "loss": 0.1678, "step": 1190 }, { "epoch": 0.07520210565895845, "grad_norm": 8.510868072509766, "learning_rate": 4.926115133731608e-05, "loss": 0.1897, "step": 1200 }, { "epoch": 0.0758287898727831, "grad_norm": 15.368112564086914, "learning_rate": 4.9250596356420597e-05, "loss": 0.1667, "step": 1210 }, { "epoch": 0.07645547408660776, "grad_norm": 5.564446926116943, "learning_rate": 4.924004137552511e-05, "loss": 0.0653, "step": 1220 }, { "epoch": 0.07708215830043241, "grad_norm": 4.638933181762695, "learning_rate": 4.922948639462962e-05, "loss": 0.2175, "step": 1230 }, { "epoch": 0.07770884251425707, "grad_norm": 11.21340274810791, "learning_rate": 4.921893141373414e-05, "loss": 0.1778, "step": 1240 }, { "epoch": 0.07833552672808172, "grad_norm": 0.5090659260749817, "learning_rate": 4.920837643283866e-05, "loss": 0.0966, "step": 1250 }, { "epoch": 0.07896221094190638, "grad_norm": 0.08244244009256363, "learning_rate": 4.919782145194318e-05, "loss": 0.1893, "step": 1260 }, { "epoch": 0.07958889515573103, "grad_norm": 0.22240492701530457, "learning_rate": 4.918726647104769e-05, "loss": 0.1542, "step": 1270 }, { "epoch": 0.08021557936955569, "grad_norm": 0.508705198764801, "learning_rate": 4.9176711490152206e-05, "loss": 0.179, "step": 1280 }, { "epoch": 0.08084226358338034, "grad_norm": 4.623276710510254, "learning_rate": 4.916615650925672e-05, "loss": 0.3569, "step": 1290 }, { "epoch": 0.08146894779720498, "grad_norm": 2.1783127784729004, "learning_rate": 4.915560152836123e-05, "loss": 0.1525, "step": 1300 }, { "epoch": 0.08209563201102964, "grad_norm": 0.3979020118713379, "learning_rate": 4.914504654746575e-05, "loss": 0.0309, "step": 1310 }, { "epoch": 0.08272231622485429, "grad_norm": 0.14067301154136658, "learning_rate": 4.9134491566570266e-05, "loss": 0.1543, "step": 1320 }, { "epoch": 0.08334900043867895, "grad_norm": 0.15461118519306183, "learning_rate": 4.912393658567478e-05, "loss": 0.0617, "step": 1330 }, { "epoch": 0.0839756846525036, "grad_norm": 0.0635291114449501, "learning_rate": 4.911338160477929e-05, "loss": 0.1736, "step": 1340 }, { "epoch": 0.08460236886632826, "grad_norm": 0.07697498798370361, "learning_rate": 4.9102826623883816e-05, "loss": 0.0851, "step": 1350 }, { "epoch": 0.08522905308015291, "grad_norm": 3.9091408252716064, "learning_rate": 4.909227164298833e-05, "loss": 0.2063, "step": 1360 }, { "epoch": 0.08585573729397757, "grad_norm": 0.49504515528678894, "learning_rate": 4.908171666209284e-05, "loss": 0.0456, "step": 1370 }, { "epoch": 0.08648242150780222, "grad_norm": 3.6886634826660156, "learning_rate": 4.907116168119736e-05, "loss": 0.1798, "step": 1380 }, { "epoch": 0.08710910572162688, "grad_norm": 24.821048736572266, "learning_rate": 4.9060606700301876e-05, "loss": 0.0989, "step": 1390 }, { "epoch": 0.08773578993545153, "grad_norm": 10.899468421936035, "learning_rate": 4.905005171940639e-05, "loss": 0.4166, "step": 1400 }, { "epoch": 0.08836247414927619, "grad_norm": 2.6948788166046143, "learning_rate": 4.90394967385109e-05, "loss": 0.3788, "step": 1410 }, { "epoch": 0.08898915836310084, "grad_norm": 2.010342836380005, "learning_rate": 4.902894175761542e-05, "loss": 0.1184, "step": 1420 }, { "epoch": 0.08961584257692548, "grad_norm": 0.03161928057670593, "learning_rate": 4.9018386776719936e-05, "loss": 0.0987, "step": 1430 }, { "epoch": 0.09024252679075014, "grad_norm": 11.630290031433105, "learning_rate": 4.900783179582445e-05, "loss": 0.1251, "step": 1440 }, { "epoch": 0.09086921100457479, "grad_norm": 0.29562464356422424, "learning_rate": 4.899727681492897e-05, "loss": 0.1273, "step": 1450 }, { "epoch": 0.09149589521839945, "grad_norm": 55.65250015258789, "learning_rate": 4.8986721834033486e-05, "loss": 0.1763, "step": 1460 }, { "epoch": 0.0921225794322241, "grad_norm": 2.1838998794555664, "learning_rate": 4.8976166853138e-05, "loss": 0.33, "step": 1470 }, { "epoch": 0.09274926364604875, "grad_norm": 7.990715980529785, "learning_rate": 4.896561187224251e-05, "loss": 0.1245, "step": 1480 }, { "epoch": 0.09337594785987341, "grad_norm": 8.314751625061035, "learning_rate": 4.895505689134703e-05, "loss": 0.093, "step": 1490 }, { "epoch": 0.09400263207369806, "grad_norm": 2.740541458129883, "learning_rate": 4.8944501910451546e-05, "loss": 0.325, "step": 1500 }, { "epoch": 0.09462931628752272, "grad_norm": 0.10673114657402039, "learning_rate": 4.8933946929556055e-05, "loss": 0.228, "step": 1510 }, { "epoch": 0.09525600050134737, "grad_norm": 0.07358872145414352, "learning_rate": 4.892339194866057e-05, "loss": 0.134, "step": 1520 }, { "epoch": 0.09588268471517203, "grad_norm": 0.673576295375824, "learning_rate": 4.891283696776509e-05, "loss": 0.0626, "step": 1530 }, { "epoch": 0.09650936892899668, "grad_norm": 9.495621681213379, "learning_rate": 4.8902281986869605e-05, "loss": 0.3845, "step": 1540 }, { "epoch": 0.09713605314282134, "grad_norm": 4.396236419677734, "learning_rate": 4.889172700597412e-05, "loss": 0.1065, "step": 1550 }, { "epoch": 0.09776273735664599, "grad_norm": 0.25836917757987976, "learning_rate": 4.888117202507864e-05, "loss": 0.2009, "step": 1560 }, { "epoch": 0.09838942157047063, "grad_norm": 0.12051882594823837, "learning_rate": 4.8870617044183155e-05, "loss": 0.0977, "step": 1570 }, { "epoch": 0.09901610578429529, "grad_norm": 1.9034405946731567, "learning_rate": 4.886006206328767e-05, "loss": 0.1978, "step": 1580 }, { "epoch": 0.09964278999811994, "grad_norm": 0.4317615032196045, "learning_rate": 4.884950708239218e-05, "loss": 0.1936, "step": 1590 }, { "epoch": 0.1002694742119446, "grad_norm": 1.0015511512756348, "learning_rate": 4.88389521014967e-05, "loss": 0.1216, "step": 1600 }, { "epoch": 0.10089615842576925, "grad_norm": 0.307364284992218, "learning_rate": 4.8828397120601215e-05, "loss": 0.0249, "step": 1610 }, { "epoch": 0.10152284263959391, "grad_norm": 0.7230989336967468, "learning_rate": 4.8817842139705725e-05, "loss": 0.1124, "step": 1620 }, { "epoch": 0.10214952685341856, "grad_norm": 3.9494760036468506, "learning_rate": 4.880728715881024e-05, "loss": 0.1477, "step": 1630 }, { "epoch": 0.10277621106724322, "grad_norm": 0.13938945531845093, "learning_rate": 4.879673217791476e-05, "loss": 0.2507, "step": 1640 }, { "epoch": 0.10340289528106787, "grad_norm": 3.76981520652771, "learning_rate": 4.8786177197019275e-05, "loss": 0.2671, "step": 1650 }, { "epoch": 0.10402957949489253, "grad_norm": 0.21893472969532013, "learning_rate": 4.877562221612379e-05, "loss": 0.1446, "step": 1660 }, { "epoch": 0.10465626370871718, "grad_norm": 0.17878833413124084, "learning_rate": 4.876506723522831e-05, "loss": 0.1263, "step": 1670 }, { "epoch": 0.10528294792254184, "grad_norm": 0.6086212396621704, "learning_rate": 4.8754512254332825e-05, "loss": 0.1327, "step": 1680 }, { "epoch": 0.10590963213636649, "grad_norm": 0.3757482171058655, "learning_rate": 4.8743957273437335e-05, "loss": 0.1065, "step": 1690 }, { "epoch": 0.10653631635019113, "grad_norm": 0.045173462480306625, "learning_rate": 4.873340229254185e-05, "loss": 0.1215, "step": 1700 }, { "epoch": 0.10716300056401579, "grad_norm": 0.026341581717133522, "learning_rate": 4.872284731164637e-05, "loss": 0.1255, "step": 1710 }, { "epoch": 0.10778968477784044, "grad_norm": 0.036156751215457916, "learning_rate": 4.8712292330750885e-05, "loss": 0.2899, "step": 1720 }, { "epoch": 0.1084163689916651, "grad_norm": 22.635560989379883, "learning_rate": 4.8701737349855395e-05, "loss": 0.2425, "step": 1730 }, { "epoch": 0.10904305320548975, "grad_norm": 7.215483665466309, "learning_rate": 4.869118236895991e-05, "loss": 0.1708, "step": 1740 }, { "epoch": 0.1096697374193144, "grad_norm": 8.375259399414062, "learning_rate": 4.8680627388064435e-05, "loss": 0.0651, "step": 1750 }, { "epoch": 0.11029642163313906, "grad_norm": 0.029143275693058968, "learning_rate": 4.8670072407168945e-05, "loss": 0.0235, "step": 1760 }, { "epoch": 0.11092310584696372, "grad_norm": 0.640413224697113, "learning_rate": 4.865951742627346e-05, "loss": 0.2324, "step": 1770 }, { "epoch": 0.11154979006078837, "grad_norm": 5.40239143371582, "learning_rate": 4.864896244537798e-05, "loss": 0.1828, "step": 1780 }, { "epoch": 0.11217647427461303, "grad_norm": 6.553516864776611, "learning_rate": 4.8638407464482495e-05, "loss": 0.0174, "step": 1790 }, { "epoch": 0.11280315848843768, "grad_norm": 5.948601722717285, "learning_rate": 4.8627852483587004e-05, "loss": 0.2896, "step": 1800 }, { "epoch": 0.11342984270226233, "grad_norm": 0.044085729867219925, "learning_rate": 4.861729750269152e-05, "loss": 0.1238, "step": 1810 }, { "epoch": 0.11405652691608699, "grad_norm": 1.21904718875885, "learning_rate": 4.860674252179604e-05, "loss": 0.2042, "step": 1820 }, { "epoch": 0.11468321112991164, "grad_norm": 0.3443450927734375, "learning_rate": 4.859618754090055e-05, "loss": 0.0531, "step": 1830 }, { "epoch": 0.11530989534373629, "grad_norm": 5.81721305847168, "learning_rate": 4.858563256000507e-05, "loss": 0.3022, "step": 1840 }, { "epoch": 0.11593657955756094, "grad_norm": 1.6238272190093994, "learning_rate": 4.857507757910959e-05, "loss": 0.2342, "step": 1850 }, { "epoch": 0.1165632637713856, "grad_norm": 1.0162500143051147, "learning_rate": 4.8564522598214104e-05, "loss": 0.0913, "step": 1860 }, { "epoch": 0.11718994798521025, "grad_norm": 0.029920578002929688, "learning_rate": 4.8553967617318614e-05, "loss": 0.167, "step": 1870 }, { "epoch": 0.1178166321990349, "grad_norm": 0.19740080833435059, "learning_rate": 4.854341263642313e-05, "loss": 0.2157, "step": 1880 }, { "epoch": 0.11844331641285956, "grad_norm": 7.18571662902832, "learning_rate": 4.853285765552765e-05, "loss": 0.2966, "step": 1890 }, { "epoch": 0.11907000062668421, "grad_norm": 0.09219271689653397, "learning_rate": 4.852230267463216e-05, "loss": 0.1399, "step": 1900 }, { "epoch": 0.11969668484050887, "grad_norm": 0.7403557896614075, "learning_rate": 4.8511747693736674e-05, "loss": 0.1223, "step": 1910 }, { "epoch": 0.12032336905433352, "grad_norm": 19.517911911010742, "learning_rate": 4.850119271284119e-05, "loss": 0.2518, "step": 1920 }, { "epoch": 0.12095005326815818, "grad_norm": 0.033364780247211456, "learning_rate": 4.849063773194571e-05, "loss": 0.2251, "step": 1930 }, { "epoch": 0.12157673748198283, "grad_norm": 0.16062411665916443, "learning_rate": 4.8480082751050224e-05, "loss": 0.1562, "step": 1940 }, { "epoch": 0.12220342169580749, "grad_norm": 0.25009897351264954, "learning_rate": 4.846952777015474e-05, "loss": 0.0093, "step": 1950 }, { "epoch": 0.12283010590963214, "grad_norm": 0.11893218010663986, "learning_rate": 4.845897278925926e-05, "loss": 0.0805, "step": 1960 }, { "epoch": 0.12345679012345678, "grad_norm": 0.02064370922744274, "learning_rate": 4.8448417808363774e-05, "loss": 0.2079, "step": 1970 }, { "epoch": 0.12408347433728144, "grad_norm": 0.1334376484155655, "learning_rate": 4.8437862827468284e-05, "loss": 0.139, "step": 1980 }, { "epoch": 0.12471015855110609, "grad_norm": 0.07230320572853088, "learning_rate": 4.84273078465728e-05, "loss": 0.1551, "step": 1990 }, { "epoch": 0.12533684276493076, "grad_norm": 0.08805450797080994, "learning_rate": 4.841675286567732e-05, "loss": 0.0033, "step": 2000 }, { "epoch": 0.1259635269787554, "grad_norm": 0.07734368741512299, "learning_rate": 4.840619788478183e-05, "loss": 0.3391, "step": 2010 }, { "epoch": 0.12659021119258007, "grad_norm": 0.12036556750535965, "learning_rate": 4.8395642903886344e-05, "loss": 0.1158, "step": 2020 }, { "epoch": 0.1272168954064047, "grad_norm": 6.732293605804443, "learning_rate": 4.838508792299086e-05, "loss": 0.1857, "step": 2030 }, { "epoch": 0.12784357962022935, "grad_norm": 4.408974647521973, "learning_rate": 4.837453294209538e-05, "loss": 0.0729, "step": 2040 }, { "epoch": 0.12847026383405402, "grad_norm": 3.5390381813049316, "learning_rate": 4.8363977961199894e-05, "loss": 0.1274, "step": 2050 }, { "epoch": 0.12909694804787866, "grad_norm": 0.04466002434492111, "learning_rate": 4.835342298030441e-05, "loss": 0.0609, "step": 2060 }, { "epoch": 0.12972363226170333, "grad_norm": 24.187576293945312, "learning_rate": 4.834286799940893e-05, "loss": 0.028, "step": 2070 }, { "epoch": 0.13035031647552797, "grad_norm": 0.023793770000338554, "learning_rate": 4.833231301851344e-05, "loss": 0.0028, "step": 2080 }, { "epoch": 0.13097700068935264, "grad_norm": 0.20558828115463257, "learning_rate": 4.8321758037617953e-05, "loss": 0.2252, "step": 2090 }, { "epoch": 0.13160368490317728, "grad_norm": 0.06289856880903244, "learning_rate": 4.831120305672247e-05, "loss": 0.0928, "step": 2100 }, { "epoch": 0.13223036911700195, "grad_norm": 7.317686080932617, "learning_rate": 4.830064807582699e-05, "loss": 0.1462, "step": 2110 }, { "epoch": 0.1328570533308266, "grad_norm": 2.1458468437194824, "learning_rate": 4.8290093094931497e-05, "loss": 0.1463, "step": 2120 }, { "epoch": 0.13348373754465126, "grad_norm": 4.286557197570801, "learning_rate": 4.827953811403601e-05, "loss": 0.1568, "step": 2130 }, { "epoch": 0.1341104217584759, "grad_norm": 8.06277084350586, "learning_rate": 4.826898313314053e-05, "loss": 0.0988, "step": 2140 }, { "epoch": 0.13473710597230057, "grad_norm": 37.442718505859375, "learning_rate": 4.8258428152245047e-05, "loss": 0.0563, "step": 2150 }, { "epoch": 0.1353637901861252, "grad_norm": 1.2893574237823486, "learning_rate": 4.824787317134956e-05, "loss": 0.053, "step": 2160 }, { "epoch": 0.13599047439994988, "grad_norm": 28.35834503173828, "learning_rate": 4.823731819045408e-05, "loss": 0.0935, "step": 2170 }, { "epoch": 0.13661715861377452, "grad_norm": 0.25138330459594727, "learning_rate": 4.8226763209558597e-05, "loss": 0.0325, "step": 2180 }, { "epoch": 0.13724384282759916, "grad_norm": 1.9327008724212646, "learning_rate": 4.8216208228663106e-05, "loss": 0.0734, "step": 2190 }, { "epoch": 0.13787052704142383, "grad_norm": 0.01644737273454666, "learning_rate": 4.820565324776762e-05, "loss": 0.1206, "step": 2200 }, { "epoch": 0.13849721125524847, "grad_norm": 0.38374465703964233, "learning_rate": 4.819509826687214e-05, "loss": 0.1869, "step": 2210 }, { "epoch": 0.13912389546907314, "grad_norm": 3.8984410762786865, "learning_rate": 4.818454328597665e-05, "loss": 0.1447, "step": 2220 }, { "epoch": 0.13975057968289778, "grad_norm": 0.06196863204240799, "learning_rate": 4.8173988305081166e-05, "loss": 0.2484, "step": 2230 }, { "epoch": 0.14037726389672245, "grad_norm": 0.36186841130256653, "learning_rate": 4.816343332418568e-05, "loss": 0.068, "step": 2240 }, { "epoch": 0.1410039481105471, "grad_norm": 0.07987497001886368, "learning_rate": 4.8152878343290206e-05, "loss": 0.2028, "step": 2250 }, { "epoch": 0.14163063232437176, "grad_norm": 6.794961929321289, "learning_rate": 4.8142323362394716e-05, "loss": 0.2791, "step": 2260 }, { "epoch": 0.1422573165381964, "grad_norm": 0.14792947471141815, "learning_rate": 4.813176838149923e-05, "loss": 0.097, "step": 2270 }, { "epoch": 0.14288400075202107, "grad_norm": 2.2044332027435303, "learning_rate": 4.812121340060375e-05, "loss": 0.2176, "step": 2280 }, { "epoch": 0.1435106849658457, "grad_norm": 1.7671102285385132, "learning_rate": 4.811065841970826e-05, "loss": 0.1282, "step": 2290 }, { "epoch": 0.14413736917967038, "grad_norm": 1.2018122673034668, "learning_rate": 4.8100103438812776e-05, "loss": 0.0809, "step": 2300 }, { "epoch": 0.14476405339349502, "grad_norm": 0.08096177130937576, "learning_rate": 4.808954845791729e-05, "loss": 0.1052, "step": 2310 }, { "epoch": 0.14539073760731966, "grad_norm": 0.1800854206085205, "learning_rate": 4.807899347702181e-05, "loss": 0.3427, "step": 2320 }, { "epoch": 0.14601742182114433, "grad_norm": 0.6755953431129456, "learning_rate": 4.806843849612632e-05, "loss": 0.1775, "step": 2330 }, { "epoch": 0.14664410603496897, "grad_norm": 0.213535875082016, "learning_rate": 4.805788351523084e-05, "loss": 0.0122, "step": 2340 }, { "epoch": 0.14727079024879364, "grad_norm": 0.19489499926567078, "learning_rate": 4.804732853433536e-05, "loss": 0.1821, "step": 2350 }, { "epoch": 0.14789747446261828, "grad_norm": 0.05088840797543526, "learning_rate": 4.8036773553439876e-05, "loss": 0.2758, "step": 2360 }, { "epoch": 0.14852415867644295, "grad_norm": 3.9418184757232666, "learning_rate": 4.8026218572544386e-05, "loss": 0.1707, "step": 2370 }, { "epoch": 0.1491508428902676, "grad_norm": 0.08057057112455368, "learning_rate": 4.80156635916489e-05, "loss": 0.0479, "step": 2380 }, { "epoch": 0.14977752710409226, "grad_norm": 0.035881925374269485, "learning_rate": 4.800510861075342e-05, "loss": 0.0087, "step": 2390 }, { "epoch": 0.1504042113179169, "grad_norm": 0.07139906287193298, "learning_rate": 4.799455362985793e-05, "loss": 0.1374, "step": 2400 }, { "epoch": 0.15103089553174157, "grad_norm": 1.764007329940796, "learning_rate": 4.7983998648962446e-05, "loss": 0.3789, "step": 2410 }, { "epoch": 0.1516575797455662, "grad_norm": 0.17716698348522186, "learning_rate": 4.797344366806696e-05, "loss": 0.1987, "step": 2420 }, { "epoch": 0.15228426395939088, "grad_norm": 0.5136746764183044, "learning_rate": 4.796288868717148e-05, "loss": 0.1136, "step": 2430 }, { "epoch": 0.15291094817321552, "grad_norm": 8.765165328979492, "learning_rate": 4.7952333706275996e-05, "loss": 0.2001, "step": 2440 }, { "epoch": 0.15353763238704016, "grad_norm": 6.890345573425293, "learning_rate": 4.794177872538051e-05, "loss": 0.1941, "step": 2450 }, { "epoch": 0.15416431660086483, "grad_norm": 2.7363126277923584, "learning_rate": 4.793122374448503e-05, "loss": 0.1513, "step": 2460 }, { "epoch": 0.15479100081468947, "grad_norm": 0.43579980731010437, "learning_rate": 4.792066876358954e-05, "loss": 0.1283, "step": 2470 }, { "epoch": 0.15541768502851414, "grad_norm": 10.213183403015137, "learning_rate": 4.7910113782694055e-05, "loss": 0.211, "step": 2480 }, { "epoch": 0.15604436924233878, "grad_norm": 2.5786595344543457, "learning_rate": 4.789955880179857e-05, "loss": 0.1093, "step": 2490 }, { "epoch": 0.15667105345616344, "grad_norm": 8.773494720458984, "learning_rate": 4.788900382090309e-05, "loss": 0.0228, "step": 2500 }, { "epoch": 0.15729773766998809, "grad_norm": 1.877724528312683, "learning_rate": 4.78784488400076e-05, "loss": 0.0768, "step": 2510 }, { "epoch": 0.15792442188381275, "grad_norm": 0.2219085693359375, "learning_rate": 4.7867893859112115e-05, "loss": 0.1902, "step": 2520 }, { "epoch": 0.1585511060976374, "grad_norm": 2.0254573822021484, "learning_rate": 4.785733887821663e-05, "loss": 0.2101, "step": 2530 }, { "epoch": 0.15917779031146206, "grad_norm": 0.1042730063199997, "learning_rate": 4.784678389732115e-05, "loss": 0.1561, "step": 2540 }, { "epoch": 0.1598044745252867, "grad_norm": 3.1619834899902344, "learning_rate": 4.7836228916425665e-05, "loss": 0.2101, "step": 2550 }, { "epoch": 0.16043115873911137, "grad_norm": 0.6285260319709778, "learning_rate": 4.782567393553018e-05, "loss": 0.0834, "step": 2560 }, { "epoch": 0.16105784295293601, "grad_norm": 0.13654287159442902, "learning_rate": 4.78151189546347e-05, "loss": 0.0928, "step": 2570 }, { "epoch": 0.16168452716676068, "grad_norm": 0.3744922876358032, "learning_rate": 4.780456397373921e-05, "loss": 0.2121, "step": 2580 }, { "epoch": 0.16231121138058532, "grad_norm": 7.18287992477417, "learning_rate": 4.7794008992843725e-05, "loss": 0.1356, "step": 2590 }, { "epoch": 0.16293789559440996, "grad_norm": 0.1572078913450241, "learning_rate": 4.778345401194824e-05, "loss": 0.1453, "step": 2600 }, { "epoch": 0.16356457980823463, "grad_norm": 0.1542951464653015, "learning_rate": 4.777289903105275e-05, "loss": 0.1251, "step": 2610 }, { "epoch": 0.16419126402205927, "grad_norm": 3.625025749206543, "learning_rate": 4.776234405015727e-05, "loss": 0.3419, "step": 2620 }, { "epoch": 0.16481794823588394, "grad_norm": 0.07648801058530807, "learning_rate": 4.7751789069261785e-05, "loss": 0.0818, "step": 2630 }, { "epoch": 0.16544463244970858, "grad_norm": 0.20059090852737427, "learning_rate": 4.77412340883663e-05, "loss": 0.19, "step": 2640 }, { "epoch": 0.16607131666353325, "grad_norm": 0.1197114810347557, "learning_rate": 4.773067910747082e-05, "loss": 0.0858, "step": 2650 }, { "epoch": 0.1666980008773579, "grad_norm": 0.030985107645392418, "learning_rate": 4.7720124126575335e-05, "loss": 0.0555, "step": 2660 }, { "epoch": 0.16732468509118256, "grad_norm": 0.05402218550443649, "learning_rate": 4.770956914567985e-05, "loss": 0.013, "step": 2670 }, { "epoch": 0.1679513693050072, "grad_norm": 0.8109678626060486, "learning_rate": 4.769901416478436e-05, "loss": 0.097, "step": 2680 }, { "epoch": 0.16857805351883187, "grad_norm": 7.592169284820557, "learning_rate": 4.768845918388888e-05, "loss": 0.1593, "step": 2690 }, { "epoch": 0.1692047377326565, "grad_norm": 0.12266740947961807, "learning_rate": 4.7677904202993395e-05, "loss": 0.3517, "step": 2700 }, { "epoch": 0.16983142194648118, "grad_norm": 0.22772647440433502, "learning_rate": 4.766734922209791e-05, "loss": 0.2029, "step": 2710 }, { "epoch": 0.17045810616030582, "grad_norm": 0.8687103986740112, "learning_rate": 4.765679424120242e-05, "loss": 0.0864, "step": 2720 }, { "epoch": 0.17108479037413046, "grad_norm": 0.15315799415111542, "learning_rate": 4.764623926030694e-05, "loss": 0.1457, "step": 2730 }, { "epoch": 0.17171147458795513, "grad_norm": 2.896604537963867, "learning_rate": 4.7635684279411454e-05, "loss": 0.1904, "step": 2740 }, { "epoch": 0.17233815880177977, "grad_norm": 0.11487535387277603, "learning_rate": 4.762512929851598e-05, "loss": 0.2531, "step": 2750 }, { "epoch": 0.17296484301560444, "grad_norm": 5.1275811195373535, "learning_rate": 4.761457431762049e-05, "loss": 0.166, "step": 2760 }, { "epoch": 0.17359152722942908, "grad_norm": 0.2095639705657959, "learning_rate": 4.7604019336725004e-05, "loss": 0.0942, "step": 2770 }, { "epoch": 0.17421821144325375, "grad_norm": 7.469160556793213, "learning_rate": 4.759346435582952e-05, "loss": 0.2922, "step": 2780 }, { "epoch": 0.1748448956570784, "grad_norm": 1.80148184299469, "learning_rate": 4.758290937493403e-05, "loss": 0.1071, "step": 2790 }, { "epoch": 0.17547157987090306, "grad_norm": 0.3815973699092865, "learning_rate": 4.757235439403855e-05, "loss": 0.1167, "step": 2800 }, { "epoch": 0.1760982640847277, "grad_norm": 0.01410369761288166, "learning_rate": 4.7561799413143064e-05, "loss": 0.0313, "step": 2810 }, { "epoch": 0.17672494829855237, "grad_norm": 0.0505596324801445, "learning_rate": 4.755124443224758e-05, "loss": 0.212, "step": 2820 }, { "epoch": 0.177351632512377, "grad_norm": 5.202013969421387, "learning_rate": 4.754068945135209e-05, "loss": 0.1225, "step": 2830 }, { "epoch": 0.17797831672620168, "grad_norm": 0.5114884972572327, "learning_rate": 4.7530134470456614e-05, "loss": 0.0803, "step": 2840 }, { "epoch": 0.17860500094002632, "grad_norm": 0.32649967074394226, "learning_rate": 4.751957948956113e-05, "loss": 0.0959, "step": 2850 }, { "epoch": 0.17923168515385096, "grad_norm": 0.49903982877731323, "learning_rate": 4.750902450866564e-05, "loss": 0.1916, "step": 2860 }, { "epoch": 0.17985836936767563, "grad_norm": 0.5530174970626831, "learning_rate": 4.749846952777016e-05, "loss": 0.0615, "step": 2870 }, { "epoch": 0.18048505358150027, "grad_norm": 0.010926044546067715, "learning_rate": 4.7487914546874674e-05, "loss": 0.0331, "step": 2880 }, { "epoch": 0.18111173779532494, "grad_norm": 8.002918243408203, "learning_rate": 4.747735956597919e-05, "loss": 0.4187, "step": 2890 }, { "epoch": 0.18173842200914958, "grad_norm": 0.23336580395698547, "learning_rate": 4.74668045850837e-05, "loss": 0.0829, "step": 2900 }, { "epoch": 0.18236510622297425, "grad_norm": 2.910646915435791, "learning_rate": 4.745624960418822e-05, "loss": 0.2485, "step": 2910 }, { "epoch": 0.1829917904367989, "grad_norm": 0.33291372656822205, "learning_rate": 4.7445694623292734e-05, "loss": 0.1436, "step": 2920 }, { "epoch": 0.18361847465062356, "grad_norm": 8.978804588317871, "learning_rate": 4.743513964239725e-05, "loss": 0.0809, "step": 2930 }, { "epoch": 0.1842451588644482, "grad_norm": 39.01954650878906, "learning_rate": 4.742458466150177e-05, "loss": 0.1548, "step": 2940 }, { "epoch": 0.18487184307827287, "grad_norm": 0.8386328816413879, "learning_rate": 4.7414029680606284e-05, "loss": 0.2184, "step": 2950 }, { "epoch": 0.1854985272920975, "grad_norm": 0.12659718096256256, "learning_rate": 4.74034746997108e-05, "loss": 0.1347, "step": 2960 }, { "epoch": 0.18612521150592218, "grad_norm": 0.6178060173988342, "learning_rate": 4.739291971881531e-05, "loss": 0.1945, "step": 2970 }, { "epoch": 0.18675189571974682, "grad_norm": 0.10851160436868668, "learning_rate": 4.738236473791983e-05, "loss": 0.1198, "step": 2980 }, { "epoch": 0.18737857993357146, "grad_norm": 2.062321186065674, "learning_rate": 4.7371809757024344e-05, "loss": 0.3923, "step": 2990 }, { "epoch": 0.18800526414739613, "grad_norm": 0.0955008938908577, "learning_rate": 4.7361254776128853e-05, "loss": 0.1186, "step": 3000 }, { "epoch": 0.18863194836122077, "grad_norm": 0.11655368655920029, "learning_rate": 4.735069979523337e-05, "loss": 0.1031, "step": 3010 }, { "epoch": 0.18925863257504544, "grad_norm": 0.2267540544271469, "learning_rate": 4.734014481433789e-05, "loss": 0.1104, "step": 3020 }, { "epoch": 0.18988531678887008, "grad_norm": 12.633221626281738, "learning_rate": 4.7329589833442403e-05, "loss": 0.2682, "step": 3030 }, { "epoch": 0.19051200100269475, "grad_norm": 0.10257907211780548, "learning_rate": 4.731903485254692e-05, "loss": 0.1305, "step": 3040 }, { "epoch": 0.1911386852165194, "grad_norm": 0.056987181305885315, "learning_rate": 4.730847987165144e-05, "loss": 0.0214, "step": 3050 }, { "epoch": 0.19176536943034406, "grad_norm": 0.05389223247766495, "learning_rate": 4.7297924890755953e-05, "loss": 0.0817, "step": 3060 }, { "epoch": 0.1923920536441687, "grad_norm": 0.11531835794448853, "learning_rate": 4.728736990986046e-05, "loss": 0.2344, "step": 3070 }, { "epoch": 0.19301873785799337, "grad_norm": 0.5801142454147339, "learning_rate": 4.727681492896498e-05, "loss": 0.1039, "step": 3080 }, { "epoch": 0.193645422071818, "grad_norm": 2.6055166721343994, "learning_rate": 4.7266259948069497e-05, "loss": 0.2305, "step": 3090 }, { "epoch": 0.19427210628564268, "grad_norm": 0.3381422758102417, "learning_rate": 4.725570496717401e-05, "loss": 0.2068, "step": 3100 }, { "epoch": 0.19489879049946732, "grad_norm": 4.314819812774658, "learning_rate": 4.724514998627852e-05, "loss": 0.0849, "step": 3110 }, { "epoch": 0.19552547471329199, "grad_norm": 0.05561206117272377, "learning_rate": 4.723459500538304e-05, "loss": 0.1866, "step": 3120 }, { "epoch": 0.19615215892711663, "grad_norm": 0.33232080936431885, "learning_rate": 4.7224040024487556e-05, "loss": 0.2515, "step": 3130 }, { "epoch": 0.19677884314094127, "grad_norm": 0.579521656036377, "learning_rate": 4.721348504359207e-05, "loss": 0.1305, "step": 3140 }, { "epoch": 0.19740552735476594, "grad_norm": 0.43451932072639465, "learning_rate": 4.720293006269659e-05, "loss": 0.1847, "step": 3150 }, { "epoch": 0.19803221156859058, "grad_norm": 1.7295702695846558, "learning_rate": 4.7192375081801106e-05, "loss": 0.042, "step": 3160 }, { "epoch": 0.19865889578241525, "grad_norm": 0.11605259031057358, "learning_rate": 4.718182010090562e-05, "loss": 0.1818, "step": 3170 }, { "epoch": 0.1992855799962399, "grad_norm": 3.375051975250244, "learning_rate": 4.717126512001013e-05, "loss": 0.0832, "step": 3180 }, { "epoch": 0.19991226421006456, "grad_norm": 15.903059005737305, "learning_rate": 4.716071013911465e-05, "loss": 0.4062, "step": 3190 }, { "epoch": 0.2005389484238892, "grad_norm": 7.629429817199707, "learning_rate": 4.7150155158219166e-05, "loss": 0.3307, "step": 3200 }, { "epoch": 0.20116563263771386, "grad_norm": 0.11025646328926086, "learning_rate": 4.713960017732368e-05, "loss": 0.0341, "step": 3210 }, { "epoch": 0.2017923168515385, "grad_norm": 5.843116283416748, "learning_rate": 4.712904519642819e-05, "loss": 0.1977, "step": 3220 }, { "epoch": 0.20241900106536317, "grad_norm": 5.388326168060303, "learning_rate": 4.711849021553271e-05, "loss": 0.1191, "step": 3230 }, { "epoch": 0.20304568527918782, "grad_norm": 14.07459831237793, "learning_rate": 4.7107935234637226e-05, "loss": 0.1881, "step": 3240 }, { "epoch": 0.20367236949301248, "grad_norm": 1.0998096466064453, "learning_rate": 4.709738025374174e-05, "loss": 0.1228, "step": 3250 }, { "epoch": 0.20429905370683712, "grad_norm": 1.7342157363891602, "learning_rate": 4.708682527284626e-05, "loss": 0.0381, "step": 3260 }, { "epoch": 0.20492573792066177, "grad_norm": 1.3697205781936646, "learning_rate": 4.7076270291950776e-05, "loss": 0.2799, "step": 3270 }, { "epoch": 0.20555242213448643, "grad_norm": 0.14480531215667725, "learning_rate": 4.706571531105529e-05, "loss": 0.236, "step": 3280 }, { "epoch": 0.20617910634831108, "grad_norm": 11.9490385055542, "learning_rate": 4.70551603301598e-05, "loss": 0.2193, "step": 3290 }, { "epoch": 0.20680579056213574, "grad_norm": 0.29927733540534973, "learning_rate": 4.704460534926432e-05, "loss": 0.1346, "step": 3300 }, { "epoch": 0.20743247477596038, "grad_norm": 1.87830650806427, "learning_rate": 4.7034050368368836e-05, "loss": 0.1282, "step": 3310 }, { "epoch": 0.20805915898978505, "grad_norm": 0.37590470910072327, "learning_rate": 4.7023495387473346e-05, "loss": 0.0127, "step": 3320 }, { "epoch": 0.2086858432036097, "grad_norm": 0.24762612581253052, "learning_rate": 4.701294040657786e-05, "loss": 0.3277, "step": 3330 }, { "epoch": 0.20931252741743436, "grad_norm": 3.689312219619751, "learning_rate": 4.7002385425682386e-05, "loss": 0.1627, "step": 3340 }, { "epoch": 0.209939211631259, "grad_norm": 8.164970397949219, "learning_rate": 4.69918304447869e-05, "loss": 0.2183, "step": 3350 }, { "epoch": 0.21056589584508367, "grad_norm": 5.536794662475586, "learning_rate": 4.698127546389141e-05, "loss": 0.2664, "step": 3360 }, { "epoch": 0.2111925800589083, "grad_norm": 0.11749021708965302, "learning_rate": 4.697072048299593e-05, "loss": 0.0704, "step": 3370 }, { "epoch": 0.21181926427273298, "grad_norm": 0.28687575459480286, "learning_rate": 4.6960165502100446e-05, "loss": 0.1234, "step": 3380 }, { "epoch": 0.21244594848655762, "grad_norm": 4.869693756103516, "learning_rate": 4.6949610521204955e-05, "loss": 0.0407, "step": 3390 }, { "epoch": 0.21307263270038226, "grad_norm": 0.13574935495853424, "learning_rate": 4.693905554030947e-05, "loss": 0.0768, "step": 3400 }, { "epoch": 0.21369931691420693, "grad_norm": 0.0712619423866272, "learning_rate": 4.692850055941399e-05, "loss": 0.1368, "step": 3410 }, { "epoch": 0.21432600112803157, "grad_norm": 0.12375117838382721, "learning_rate": 4.6917945578518505e-05, "loss": 0.1302, "step": 3420 }, { "epoch": 0.21495268534185624, "grad_norm": 0.2046600729227066, "learning_rate": 4.690739059762302e-05, "loss": 0.067, "step": 3430 }, { "epoch": 0.21557936955568088, "grad_norm": 0.017954392358660698, "learning_rate": 4.689683561672754e-05, "loss": 0.0414, "step": 3440 }, { "epoch": 0.21620605376950555, "grad_norm": 0.08368971198797226, "learning_rate": 4.6886280635832055e-05, "loss": 0.2528, "step": 3450 }, { "epoch": 0.2168327379833302, "grad_norm": 1.2165157794952393, "learning_rate": 4.6875725654936565e-05, "loss": 0.0628, "step": 3460 }, { "epoch": 0.21745942219715486, "grad_norm": 0.062159277498722076, "learning_rate": 4.686517067404108e-05, "loss": 0.1981, "step": 3470 }, { "epoch": 0.2180861064109795, "grad_norm": 0.049053970724344254, "learning_rate": 4.68546156931456e-05, "loss": 0.0574, "step": 3480 }, { "epoch": 0.21871279062480417, "grad_norm": 0.01489468477666378, "learning_rate": 4.6844060712250115e-05, "loss": 0.0837, "step": 3490 }, { "epoch": 0.2193394748386288, "grad_norm": 0.02858012728393078, "learning_rate": 4.6833505731354625e-05, "loss": 0.0839, "step": 3500 }, { "epoch": 0.21996615905245348, "grad_norm": 10.893243789672852, "learning_rate": 4.682295075045914e-05, "loss": 0.4084, "step": 3510 }, { "epoch": 0.22059284326627812, "grad_norm": 0.19011631608009338, "learning_rate": 4.681239576956366e-05, "loss": 0.0307, "step": 3520 }, { "epoch": 0.2212195274801028, "grad_norm": 3.5512607097625732, "learning_rate": 4.6801840788668175e-05, "loss": 0.1811, "step": 3530 }, { "epoch": 0.22184621169392743, "grad_norm": 0.42964065074920654, "learning_rate": 4.679128580777269e-05, "loss": 0.2602, "step": 3540 }, { "epoch": 0.22247289590775207, "grad_norm": 0.08508183062076569, "learning_rate": 4.678073082687721e-05, "loss": 0.0838, "step": 3550 }, { "epoch": 0.22309958012157674, "grad_norm": 0.020195091143250465, "learning_rate": 4.6770175845981725e-05, "loss": 0.1333, "step": 3560 }, { "epoch": 0.22372626433540138, "grad_norm": 0.02404479682445526, "learning_rate": 4.6759620865086235e-05, "loss": 0.0295, "step": 3570 }, { "epoch": 0.22435294854922605, "grad_norm": 8.078899383544922, "learning_rate": 4.674906588419075e-05, "loss": 0.2297, "step": 3580 }, { "epoch": 0.2249796327630507, "grad_norm": 3.0931432247161865, "learning_rate": 4.673851090329527e-05, "loss": 0.0381, "step": 3590 }, { "epoch": 0.22560631697687536, "grad_norm": 0.7269296646118164, "learning_rate": 4.6727955922399785e-05, "loss": 0.099, "step": 3600 }, { "epoch": 0.2262330011907, "grad_norm": 0.09635982662439346, "learning_rate": 4.6717400941504295e-05, "loss": 0.1602, "step": 3610 }, { "epoch": 0.22685968540452467, "grad_norm": 0.36591339111328125, "learning_rate": 4.670684596060881e-05, "loss": 0.1373, "step": 3620 }, { "epoch": 0.2274863696183493, "grad_norm": 2.797891616821289, "learning_rate": 4.669629097971333e-05, "loss": 0.2, "step": 3630 }, { "epoch": 0.22811305383217398, "grad_norm": 0.13196179270744324, "learning_rate": 4.6685735998817845e-05, "loss": 0.1875, "step": 3640 }, { "epoch": 0.22873973804599862, "grad_norm": 0.13646236062049866, "learning_rate": 4.667518101792236e-05, "loss": 0.3006, "step": 3650 }, { "epoch": 0.2293664222598233, "grad_norm": 0.4018650949001312, "learning_rate": 4.666462603702688e-05, "loss": 0.4341, "step": 3660 }, { "epoch": 0.22999310647364793, "grad_norm": 0.5656597018241882, "learning_rate": 4.6654071056131395e-05, "loss": 0.0513, "step": 3670 }, { "epoch": 0.23061979068747257, "grad_norm": 10.083354949951172, "learning_rate": 4.6643516075235904e-05, "loss": 0.2332, "step": 3680 }, { "epoch": 0.23124647490129724, "grad_norm": 0.1959206759929657, "learning_rate": 4.663296109434042e-05, "loss": 0.139, "step": 3690 }, { "epoch": 0.23187315911512188, "grad_norm": 0.418293297290802, "learning_rate": 4.662240611344494e-05, "loss": 0.1942, "step": 3700 }, { "epoch": 0.23249984332894655, "grad_norm": 1.2529011964797974, "learning_rate": 4.661185113254945e-05, "loss": 0.0641, "step": 3710 }, { "epoch": 0.2331265275427712, "grad_norm": 41.292633056640625, "learning_rate": 4.6601296151653964e-05, "loss": 0.0563, "step": 3720 }, { "epoch": 0.23375321175659586, "grad_norm": 0.023516474291682243, "learning_rate": 4.659074117075848e-05, "loss": 0.0929, "step": 3730 }, { "epoch": 0.2343798959704205, "grad_norm": 3.448504686355591, "learning_rate": 4.6580186189863e-05, "loss": 0.1342, "step": 3740 }, { "epoch": 0.23500658018424517, "grad_norm": 0.05701204016804695, "learning_rate": 4.6569631208967514e-05, "loss": 0.1256, "step": 3750 }, { "epoch": 0.2356332643980698, "grad_norm": 0.11099798232316971, "learning_rate": 4.655907622807203e-05, "loss": 0.1523, "step": 3760 }, { "epoch": 0.23625994861189448, "grad_norm": 0.04083950072526932, "learning_rate": 4.654852124717655e-05, "loss": 0.0609, "step": 3770 }, { "epoch": 0.23688663282571912, "grad_norm": 6.568485260009766, "learning_rate": 4.653796626628106e-05, "loss": 0.1807, "step": 3780 }, { "epoch": 0.2375133170395438, "grad_norm": 9.865739822387695, "learning_rate": 4.6527411285385574e-05, "loss": 0.1791, "step": 3790 }, { "epoch": 0.23814000125336843, "grad_norm": 0.05502455309033394, "learning_rate": 4.651685630449009e-05, "loss": 0.0791, "step": 3800 }, { "epoch": 0.23876668546719307, "grad_norm": 1.563883662223816, "learning_rate": 4.650630132359461e-05, "loss": 0.2054, "step": 3810 }, { "epoch": 0.23939336968101774, "grad_norm": 0.3699815571308136, "learning_rate": 4.649574634269912e-05, "loss": 0.1275, "step": 3820 }, { "epoch": 0.24002005389484238, "grad_norm": 0.03253927826881409, "learning_rate": 4.6485191361803634e-05, "loss": 0.077, "step": 3830 }, { "epoch": 0.24064673810866705, "grad_norm": 0.44651317596435547, "learning_rate": 4.647463638090816e-05, "loss": 0.1526, "step": 3840 }, { "epoch": 0.2412734223224917, "grad_norm": 0.019008852541446686, "learning_rate": 4.646408140001267e-05, "loss": 0.0415, "step": 3850 }, { "epoch": 0.24190010653631636, "grad_norm": 14.093941688537598, "learning_rate": 4.6453526419117184e-05, "loss": 0.287, "step": 3860 }, { "epoch": 0.242526790750141, "grad_norm": 2.9186899662017822, "learning_rate": 4.64429714382217e-05, "loss": 0.3775, "step": 3870 }, { "epoch": 0.24315347496396567, "grad_norm": 8.830793380737305, "learning_rate": 4.643241645732622e-05, "loss": 0.0785, "step": 3880 }, { "epoch": 0.2437801591777903, "grad_norm": 6.0299072265625, "learning_rate": 4.642186147643073e-05, "loss": 0.2825, "step": 3890 }, { "epoch": 0.24440684339161498, "grad_norm": 0.060153331607580185, "learning_rate": 4.6411306495535244e-05, "loss": 0.1284, "step": 3900 }, { "epoch": 0.24503352760543962, "grad_norm": 0.03562921658158302, "learning_rate": 4.640075151463976e-05, "loss": 0.0256, "step": 3910 }, { "epoch": 0.24566021181926428, "grad_norm": 0.1369025558233261, "learning_rate": 4.639019653374427e-05, "loss": 0.0423, "step": 3920 }, { "epoch": 0.24628689603308893, "grad_norm": 0.3345741331577301, "learning_rate": 4.6379641552848794e-05, "loss": 0.1194, "step": 3930 }, { "epoch": 0.24691358024691357, "grad_norm": 2.964259386062622, "learning_rate": 4.636908657195331e-05, "loss": 0.1352, "step": 3940 }, { "epoch": 0.24754026446073824, "grad_norm": 0.02697465941309929, "learning_rate": 4.635853159105783e-05, "loss": 0.0331, "step": 3950 }, { "epoch": 0.24816694867456288, "grad_norm": 0.025438886135816574, "learning_rate": 4.634797661016234e-05, "loss": 0.2382, "step": 3960 }, { "epoch": 0.24879363288838754, "grad_norm": 0.09332406520843506, "learning_rate": 4.6337421629266853e-05, "loss": 0.3281, "step": 3970 }, { "epoch": 0.24942031710221219, "grad_norm": 0.11696890741586685, "learning_rate": 4.632686664837137e-05, "loss": 0.152, "step": 3980 }, { "epoch": 0.25004700131603685, "grad_norm": 2.8147506713867188, "learning_rate": 4.631631166747589e-05, "loss": 0.1781, "step": 3990 }, { "epoch": 0.2506736855298615, "grad_norm": 0.088865727186203, "learning_rate": 4.63057566865804e-05, "loss": 0.0816, "step": 4000 }, { "epoch": 0.25130036974368614, "grad_norm": 0.10434025526046753, "learning_rate": 4.629520170568491e-05, "loss": 0.0756, "step": 4010 }, { "epoch": 0.2519270539575108, "grad_norm": 8.183455467224121, "learning_rate": 4.628464672478943e-05, "loss": 0.1846, "step": 4020 }, { "epoch": 0.2525537381713355, "grad_norm": 0.05632643774151802, "learning_rate": 4.6274091743893947e-05, "loss": 0.0299, "step": 4030 }, { "epoch": 0.25318042238516014, "grad_norm": 2.964418649673462, "learning_rate": 4.626353676299846e-05, "loss": 0.4379, "step": 4040 }, { "epoch": 0.25380710659898476, "grad_norm": 0.20549556612968445, "learning_rate": 4.625298178210298e-05, "loss": 0.2554, "step": 4050 }, { "epoch": 0.2544337908128094, "grad_norm": 0.5683102011680603, "learning_rate": 4.6242426801207497e-05, "loss": 0.1996, "step": 4060 }, { "epoch": 0.2550604750266341, "grad_norm": 0.22006919980049133, "learning_rate": 4.6231871820312006e-05, "loss": 0.1015, "step": 4070 }, { "epoch": 0.2556871592404587, "grad_norm": 0.06379158049821854, "learning_rate": 4.622131683941652e-05, "loss": 0.1321, "step": 4080 }, { "epoch": 0.2563138434542834, "grad_norm": 0.1231253445148468, "learning_rate": 4.621076185852104e-05, "loss": 0.0246, "step": 4090 }, { "epoch": 0.25694052766810804, "grad_norm": 0.11641805619001389, "learning_rate": 4.620020687762555e-05, "loss": 0.1296, "step": 4100 }, { "epoch": 0.2575672118819327, "grad_norm": 8.17453384399414, "learning_rate": 4.6189651896730066e-05, "loss": 0.1271, "step": 4110 }, { "epoch": 0.2581938960957573, "grad_norm": 2.913849115371704, "learning_rate": 4.617909691583458e-05, "loss": 0.2079, "step": 4120 }, { "epoch": 0.258820580309582, "grad_norm": 5.707304954528809, "learning_rate": 4.61685419349391e-05, "loss": 0.2832, "step": 4130 }, { "epoch": 0.25944726452340666, "grad_norm": 5.583265781402588, "learning_rate": 4.6157986954043616e-05, "loss": 0.1184, "step": 4140 }, { "epoch": 0.26007394873723133, "grad_norm": 0.47731733322143555, "learning_rate": 4.614743197314813e-05, "loss": 0.2857, "step": 4150 }, { "epoch": 0.26070063295105594, "grad_norm": 0.46668675541877747, "learning_rate": 4.613687699225265e-05, "loss": 0.1019, "step": 4160 }, { "epoch": 0.2613273171648806, "grad_norm": 3.6754000186920166, "learning_rate": 4.612632201135716e-05, "loss": 0.2119, "step": 4170 }, { "epoch": 0.2619540013787053, "grad_norm": 0.11781708151102066, "learning_rate": 4.6115767030461676e-05, "loss": 0.059, "step": 4180 }, { "epoch": 0.26258068559252995, "grad_norm": 0.6701149344444275, "learning_rate": 4.610521204956619e-05, "loss": 0.0225, "step": 4190 }, { "epoch": 0.26320736980635456, "grad_norm": 7.709531307220459, "learning_rate": 4.609465706867071e-05, "loss": 0.0726, "step": 4200 }, { "epoch": 0.26383405402017923, "grad_norm": 0.12035151571035385, "learning_rate": 4.608410208777522e-05, "loss": 0.1888, "step": 4210 }, { "epoch": 0.2644607382340039, "grad_norm": 8.956652641296387, "learning_rate": 4.6073547106879736e-05, "loss": 0.2796, "step": 4220 }, { "epoch": 0.2650874224478285, "grad_norm": 0.8151219487190247, "learning_rate": 4.606299212598425e-05, "loss": 0.0759, "step": 4230 }, { "epoch": 0.2657141066616532, "grad_norm": 0.19752751290798187, "learning_rate": 4.605243714508877e-05, "loss": 0.1748, "step": 4240 }, { "epoch": 0.26634079087547785, "grad_norm": 8.772632598876953, "learning_rate": 4.6041882164193286e-05, "loss": 0.1709, "step": 4250 }, { "epoch": 0.2669674750893025, "grad_norm": 0.10757267475128174, "learning_rate": 4.60313271832978e-05, "loss": 0.2218, "step": 4260 }, { "epoch": 0.26759415930312713, "grad_norm": 0.11040544509887695, "learning_rate": 4.602077220240232e-05, "loss": 0.1871, "step": 4270 }, { "epoch": 0.2682208435169518, "grad_norm": 0.30749446153640747, "learning_rate": 4.601021722150683e-05, "loss": 0.0953, "step": 4280 }, { "epoch": 0.26884752773077647, "grad_norm": 0.0919603630900383, "learning_rate": 4.5999662240611346e-05, "loss": 0.2144, "step": 4290 }, { "epoch": 0.26947421194460114, "grad_norm": 0.063777856528759, "learning_rate": 4.598910725971586e-05, "loss": 0.0812, "step": 4300 }, { "epoch": 0.27010089615842575, "grad_norm": 5.391152858734131, "learning_rate": 4.597855227882037e-05, "loss": 0.2445, "step": 4310 }, { "epoch": 0.2707275803722504, "grad_norm": 1.2878516912460327, "learning_rate": 4.596799729792489e-05, "loss": 0.0912, "step": 4320 }, { "epoch": 0.2713542645860751, "grad_norm": 24.52849006652832, "learning_rate": 4.5957442317029405e-05, "loss": 0.042, "step": 4330 }, { "epoch": 0.27198094879989976, "grad_norm": 0.16191601753234863, "learning_rate": 4.594688733613393e-05, "loss": 0.1879, "step": 4340 }, { "epoch": 0.27260763301372437, "grad_norm": 0.04222509637475014, "learning_rate": 4.593633235523844e-05, "loss": 0.0576, "step": 4350 }, { "epoch": 0.27323431722754904, "grad_norm": 0.04624179005622864, "learning_rate": 4.5925777374342955e-05, "loss": 0.1089, "step": 4360 }, { "epoch": 0.2738610014413737, "grad_norm": 3.643953323364258, "learning_rate": 4.591522239344747e-05, "loss": 0.2434, "step": 4370 }, { "epoch": 0.2744876856551983, "grad_norm": 3.4364843368530273, "learning_rate": 4.590466741255199e-05, "loss": 0.2215, "step": 4380 }, { "epoch": 0.275114369869023, "grad_norm": 1.3025727272033691, "learning_rate": 4.58941124316565e-05, "loss": 0.1839, "step": 4390 }, { "epoch": 0.27574105408284766, "grad_norm": 15.097777366638184, "learning_rate": 4.5883557450761015e-05, "loss": 0.1754, "step": 4400 }, { "epoch": 0.2763677382966723, "grad_norm": 0.12557531893253326, "learning_rate": 4.587300246986553e-05, "loss": 0.136, "step": 4410 }, { "epoch": 0.27699442251049694, "grad_norm": 0.080543152987957, "learning_rate": 4.586244748897004e-05, "loss": 0.009, "step": 4420 }, { "epoch": 0.2776211067243216, "grad_norm": 0.035216640681028366, "learning_rate": 4.5851892508074565e-05, "loss": 0.0359, "step": 4430 }, { "epoch": 0.2782477909381463, "grad_norm": 0.03987397626042366, "learning_rate": 4.584133752717908e-05, "loss": 0.1096, "step": 4440 }, { "epoch": 0.27887447515197095, "grad_norm": 0.11862228810787201, "learning_rate": 4.58307825462836e-05, "loss": 0.0751, "step": 4450 }, { "epoch": 0.27950115936579556, "grad_norm": 172.26731872558594, "learning_rate": 4.582022756538811e-05, "loss": 0.1038, "step": 4460 }, { "epoch": 0.28012784357962023, "grad_norm": 2.120897054672241, "learning_rate": 4.5809672584492625e-05, "loss": 0.2261, "step": 4470 }, { "epoch": 0.2807545277934449, "grad_norm": 14.573572158813477, "learning_rate": 4.579911760359714e-05, "loss": 0.2673, "step": 4480 }, { "epoch": 0.2813812120072695, "grad_norm": 1.8395880460739136, "learning_rate": 4.578856262270165e-05, "loss": 0.115, "step": 4490 }, { "epoch": 0.2820078962210942, "grad_norm": 3.6968624591827393, "learning_rate": 4.577800764180617e-05, "loss": 0.1004, "step": 4500 }, { "epoch": 0.28263458043491885, "grad_norm": 3.105196237564087, "learning_rate": 4.5767452660910685e-05, "loss": 0.1143, "step": 4510 }, { "epoch": 0.2832612646487435, "grad_norm": 5.869255542755127, "learning_rate": 4.57568976800152e-05, "loss": 0.1985, "step": 4520 }, { "epoch": 0.28388794886256813, "grad_norm": 5.56795597076416, "learning_rate": 4.574634269911972e-05, "loss": 0.0755, "step": 4530 }, { "epoch": 0.2845146330763928, "grad_norm": 2.6562743186950684, "learning_rate": 4.5735787718224235e-05, "loss": 0.212, "step": 4540 }, { "epoch": 0.28514131729021747, "grad_norm": 7.3283867835998535, "learning_rate": 4.572523273732875e-05, "loss": 0.1854, "step": 4550 }, { "epoch": 0.28576800150404214, "grad_norm": 0.06201328709721565, "learning_rate": 4.571467775643326e-05, "loss": 0.1102, "step": 4560 }, { "epoch": 0.28639468571786675, "grad_norm": 0.10211961716413498, "learning_rate": 4.570412277553778e-05, "loss": 0.0591, "step": 4570 }, { "epoch": 0.2870213699316914, "grad_norm": 0.08829760551452637, "learning_rate": 4.5693567794642295e-05, "loss": 0.0468, "step": 4580 }, { "epoch": 0.2876480541455161, "grad_norm": 6.4518141746521, "learning_rate": 4.568301281374681e-05, "loss": 0.2678, "step": 4590 }, { "epoch": 0.28827473835934075, "grad_norm": 0.13497485220432281, "learning_rate": 4.567245783285132e-05, "loss": 0.0539, "step": 4600 }, { "epoch": 0.28890142257316537, "grad_norm": 0.0946924239397049, "learning_rate": 4.566190285195584e-05, "loss": 0.1492, "step": 4610 }, { "epoch": 0.28952810678699004, "grad_norm": 0.4621901512145996, "learning_rate": 4.5651347871060354e-05, "loss": 0.0721, "step": 4620 }, { "epoch": 0.2901547910008147, "grad_norm": 4.470915794372559, "learning_rate": 4.564079289016487e-05, "loss": 0.1717, "step": 4630 }, { "epoch": 0.2907814752146393, "grad_norm": 0.09481313824653625, "learning_rate": 4.563023790926939e-05, "loss": 0.1039, "step": 4640 }, { "epoch": 0.291408159428464, "grad_norm": 0.095061294734478, "learning_rate": 4.5619682928373904e-05, "loss": 0.3072, "step": 4650 }, { "epoch": 0.29203484364228866, "grad_norm": 5.283687114715576, "learning_rate": 4.560912794747842e-05, "loss": 0.0254, "step": 4660 }, { "epoch": 0.2926615278561133, "grad_norm": 0.07655642926692963, "learning_rate": 4.559857296658293e-05, "loss": 0.1093, "step": 4670 }, { "epoch": 0.29328821206993794, "grad_norm": 0.12662218511104584, "learning_rate": 4.558801798568745e-05, "loss": 0.0054, "step": 4680 }, { "epoch": 0.2939148962837626, "grad_norm": 6.806703567504883, "learning_rate": 4.5577463004791964e-05, "loss": 0.1459, "step": 4690 }, { "epoch": 0.2945415804975873, "grad_norm": 0.09386972337961197, "learning_rate": 4.5566908023896474e-05, "loss": 0.1345, "step": 4700 }, { "epoch": 0.29516826471141194, "grad_norm": 3.9050133228302, "learning_rate": 4.555635304300099e-05, "loss": 0.2161, "step": 4710 }, { "epoch": 0.29579494892523656, "grad_norm": 0.464335560798645, "learning_rate": 4.554579806210551e-05, "loss": 0.0422, "step": 4720 }, { "epoch": 0.2964216331390612, "grad_norm": 0.06394585967063904, "learning_rate": 4.5535243081210024e-05, "loss": 0.1675, "step": 4730 }, { "epoch": 0.2970483173528859, "grad_norm": 13.138270378112793, "learning_rate": 4.552468810031454e-05, "loss": 0.296, "step": 4740 }, { "epoch": 0.29767500156671056, "grad_norm": 6.973242282867432, "learning_rate": 4.551413311941906e-05, "loss": 0.0738, "step": 4750 }, { "epoch": 0.2983016857805352, "grad_norm": 0.09183468669652939, "learning_rate": 4.5503578138523574e-05, "loss": 0.0267, "step": 4760 }, { "epoch": 0.29892836999435984, "grad_norm": 0.07803864032030106, "learning_rate": 4.549302315762809e-05, "loss": 0.2194, "step": 4770 }, { "epoch": 0.2995550542081845, "grad_norm": 0.13608647882938385, "learning_rate": 4.54824681767326e-05, "loss": 0.2405, "step": 4780 }, { "epoch": 0.3001817384220091, "grad_norm": 0.2874913811683655, "learning_rate": 4.547191319583712e-05, "loss": 0.0569, "step": 4790 }, { "epoch": 0.3008084226358338, "grad_norm": 0.12187890708446503, "learning_rate": 4.5461358214941634e-05, "loss": 0.1173, "step": 4800 }, { "epoch": 0.30143510684965846, "grad_norm": 0.8910410404205322, "learning_rate": 4.5450803234046144e-05, "loss": 0.0923, "step": 4810 }, { "epoch": 0.30206179106348313, "grad_norm": 3.576404571533203, "learning_rate": 4.544024825315066e-05, "loss": 0.1787, "step": 4820 }, { "epoch": 0.30268847527730774, "grad_norm": 5.950135707855225, "learning_rate": 4.5429693272255184e-05, "loss": 0.2014, "step": 4830 }, { "epoch": 0.3033151594911324, "grad_norm": 0.23504792153835297, "learning_rate": 4.54191382913597e-05, "loss": 0.0568, "step": 4840 }, { "epoch": 0.3039418437049571, "grad_norm": 0.1148870512843132, "learning_rate": 4.540858331046421e-05, "loss": 0.193, "step": 4850 }, { "epoch": 0.30456852791878175, "grad_norm": 2.612504482269287, "learning_rate": 4.539802832956873e-05, "loss": 0.1086, "step": 4860 }, { "epoch": 0.30519521213260636, "grad_norm": 0.021087897941470146, "learning_rate": 4.5387473348673244e-05, "loss": 0.0463, "step": 4870 }, { "epoch": 0.30582189634643103, "grad_norm": 6.417503356933594, "learning_rate": 4.5376918367777754e-05, "loss": 0.1688, "step": 4880 }, { "epoch": 0.3064485805602557, "grad_norm": 0.1166616752743721, "learning_rate": 4.536636338688227e-05, "loss": 0.1142, "step": 4890 }, { "epoch": 0.3070752647740803, "grad_norm": 0.11154291033744812, "learning_rate": 4.535580840598679e-05, "loss": 0.1412, "step": 4900 }, { "epoch": 0.307701948987905, "grad_norm": 0.1466520130634308, "learning_rate": 4.5345253425091303e-05, "loss": 0.0189, "step": 4910 }, { "epoch": 0.30832863320172965, "grad_norm": 0.09126675128936768, "learning_rate": 4.533469844419581e-05, "loss": 0.0135, "step": 4920 }, { "epoch": 0.3089553174155543, "grad_norm": 0.06963896751403809, "learning_rate": 4.532414346330034e-05, "loss": 0.3541, "step": 4930 }, { "epoch": 0.30958200162937893, "grad_norm": 0.28274479508399963, "learning_rate": 4.5313588482404853e-05, "loss": 0.1919, "step": 4940 }, { "epoch": 0.3102086858432036, "grad_norm": 1.2750145196914673, "learning_rate": 4.530303350150936e-05, "loss": 0.0554, "step": 4950 }, { "epoch": 0.31083537005702827, "grad_norm": 0.06011820584535599, "learning_rate": 4.529247852061388e-05, "loss": 0.0602, "step": 4960 }, { "epoch": 0.31146205427085294, "grad_norm": 1.432435154914856, "learning_rate": 4.52819235397184e-05, "loss": 0.011, "step": 4970 }, { "epoch": 0.31208873848467755, "grad_norm": 0.08396012336015701, "learning_rate": 4.527136855882291e-05, "loss": 0.0512, "step": 4980 }, { "epoch": 0.3127154226985022, "grad_norm": 0.01751044951379299, "learning_rate": 4.526081357792742e-05, "loss": 0.0981, "step": 4990 }, { "epoch": 0.3133421069123269, "grad_norm": 0.02498529851436615, "learning_rate": 4.525025859703194e-05, "loss": 0.1803, "step": 5000 }, { "epoch": 0.31396879112615156, "grad_norm": 3.77972674369812, "learning_rate": 4.5239703616136456e-05, "loss": 0.2917, "step": 5010 }, { "epoch": 0.31459547533997617, "grad_norm": 0.40921223163604736, "learning_rate": 4.522914863524097e-05, "loss": 0.1298, "step": 5020 }, { "epoch": 0.31522215955380084, "grad_norm": 11.053905487060547, "learning_rate": 4.521859365434549e-05, "loss": 0.1119, "step": 5030 }, { "epoch": 0.3158488437676255, "grad_norm": 12.93051815032959, "learning_rate": 4.5208038673450006e-05, "loss": 0.0669, "step": 5040 }, { "epoch": 0.3164755279814501, "grad_norm": 0.1682542860507965, "learning_rate": 4.519748369255452e-05, "loss": 0.0718, "step": 5050 }, { "epoch": 0.3171022121952748, "grad_norm": 6.569287300109863, "learning_rate": 4.518692871165903e-05, "loss": 0.1773, "step": 5060 }, { "epoch": 0.31772889640909946, "grad_norm": 0.24212561547756195, "learning_rate": 4.517637373076355e-05, "loss": 0.0564, "step": 5070 }, { "epoch": 0.31835558062292413, "grad_norm": 0.04047981649637222, "learning_rate": 4.5165818749868066e-05, "loss": 0.1893, "step": 5080 }, { "epoch": 0.31898226483674874, "grad_norm": 0.6782110333442688, "learning_rate": 4.5155263768972576e-05, "loss": 0.2226, "step": 5090 }, { "epoch": 0.3196089490505734, "grad_norm": 0.14049291610717773, "learning_rate": 4.514470878807709e-05, "loss": 0.091, "step": 5100 }, { "epoch": 0.3202356332643981, "grad_norm": 0.1849367469549179, "learning_rate": 4.513415380718161e-05, "loss": 0.1393, "step": 5110 }, { "epoch": 0.32086231747822275, "grad_norm": 0.1305677443742752, "learning_rate": 4.5123598826286126e-05, "loss": 0.221, "step": 5120 }, { "epoch": 0.32148900169204736, "grad_norm": 0.4029105603694916, "learning_rate": 4.511304384539064e-05, "loss": 0.0189, "step": 5130 }, { "epoch": 0.32211568590587203, "grad_norm": 0.062030546367168427, "learning_rate": 4.510248886449516e-05, "loss": 0.0842, "step": 5140 }, { "epoch": 0.3227423701196967, "grad_norm": 9.721570014953613, "learning_rate": 4.5091933883599676e-05, "loss": 0.2981, "step": 5150 }, { "epoch": 0.32336905433352137, "grad_norm": 0.10169248282909393, "learning_rate": 4.5081378902704186e-05, "loss": 0.1636, "step": 5160 }, { "epoch": 0.323995738547346, "grad_norm": 0.22179928421974182, "learning_rate": 4.50708239218087e-05, "loss": 0.1367, "step": 5170 }, { "epoch": 0.32462242276117065, "grad_norm": 5.547093868255615, "learning_rate": 4.506026894091322e-05, "loss": 0.1028, "step": 5180 }, { "epoch": 0.3252491069749953, "grad_norm": 8.620623588562012, "learning_rate": 4.5049713960017736e-05, "loss": 0.1962, "step": 5190 }, { "epoch": 0.32587579118881993, "grad_norm": 3.4595415592193604, "learning_rate": 4.5039158979122246e-05, "loss": 0.19, "step": 5200 }, { "epoch": 0.3265024754026446, "grad_norm": 0.08240117877721786, "learning_rate": 4.502860399822676e-05, "loss": 0.0581, "step": 5210 }, { "epoch": 0.32712915961646927, "grad_norm": 0.07359209656715393, "learning_rate": 4.501804901733128e-05, "loss": 0.0856, "step": 5220 }, { "epoch": 0.32775584383029394, "grad_norm": 1.3321644067764282, "learning_rate": 4.5007494036435796e-05, "loss": 0.1497, "step": 5230 }, { "epoch": 0.32838252804411855, "grad_norm": 3.477701425552368, "learning_rate": 4.499693905554031e-05, "loss": 0.2044, "step": 5240 }, { "epoch": 0.3290092122579432, "grad_norm": 6.64050817489624, "learning_rate": 4.498638407464483e-05, "loss": 0.3428, "step": 5250 }, { "epoch": 0.3296358964717679, "grad_norm": 0.25289374589920044, "learning_rate": 4.4975829093749346e-05, "loss": 0.1021, "step": 5260 }, { "epoch": 0.33026258068559255, "grad_norm": 0.14394524693489075, "learning_rate": 4.4965274112853856e-05, "loss": 0.1675, "step": 5270 }, { "epoch": 0.33088926489941717, "grad_norm": 3.0389928817749023, "learning_rate": 4.495471913195837e-05, "loss": 0.1389, "step": 5280 }, { "epoch": 0.33151594911324184, "grad_norm": 4.999429702758789, "learning_rate": 4.494416415106289e-05, "loss": 0.1257, "step": 5290 }, { "epoch": 0.3321426333270665, "grad_norm": 9.429051399230957, "learning_rate": 4.4933609170167405e-05, "loss": 0.1603, "step": 5300 }, { "epoch": 0.3327693175408911, "grad_norm": 3.5542385578155518, "learning_rate": 4.4923054189271915e-05, "loss": 0.1632, "step": 5310 }, { "epoch": 0.3333960017547158, "grad_norm": 0.5139914155006409, "learning_rate": 4.491249920837643e-05, "loss": 0.063, "step": 5320 }, { "epoch": 0.33402268596854046, "grad_norm": 0.7108550071716309, "learning_rate": 4.4901944227480955e-05, "loss": 0.0941, "step": 5330 }, { "epoch": 0.3346493701823651, "grad_norm": 0.048141974955797195, "learning_rate": 4.4891389246585465e-05, "loss": 0.1396, "step": 5340 }, { "epoch": 0.33527605439618974, "grad_norm": 2.3768675327301025, "learning_rate": 4.488083426568998e-05, "loss": 0.1904, "step": 5350 }, { "epoch": 0.3359027386100144, "grad_norm": 0.14059720933437347, "learning_rate": 4.48702792847945e-05, "loss": 0.1807, "step": 5360 }, { "epoch": 0.3365294228238391, "grad_norm": 0.13699670135974884, "learning_rate": 4.4859724303899015e-05, "loss": 0.1932, "step": 5370 }, { "epoch": 0.33715610703766374, "grad_norm": 0.14927171170711517, "learning_rate": 4.4849169323003525e-05, "loss": 0.146, "step": 5380 }, { "epoch": 0.33778279125148836, "grad_norm": 1.7202868461608887, "learning_rate": 4.483861434210804e-05, "loss": 0.2793, "step": 5390 }, { "epoch": 0.338409475465313, "grad_norm": 0.2133670300245285, "learning_rate": 4.482805936121256e-05, "loss": 0.0996, "step": 5400 }, { "epoch": 0.3390361596791377, "grad_norm": 0.10931286215782166, "learning_rate": 4.481750438031707e-05, "loss": 0.1403, "step": 5410 }, { "epoch": 0.33966284389296236, "grad_norm": 0.11824043095111847, "learning_rate": 4.4806949399421585e-05, "loss": 0.1866, "step": 5420 }, { "epoch": 0.340289528106787, "grad_norm": 3.111504554748535, "learning_rate": 4.479639441852611e-05, "loss": 0.1339, "step": 5430 }, { "epoch": 0.34091621232061164, "grad_norm": 0.15324468910694122, "learning_rate": 4.4785839437630625e-05, "loss": 0.1496, "step": 5440 }, { "epoch": 0.3415428965344363, "grad_norm": 15.096182823181152, "learning_rate": 4.4775284456735135e-05, "loss": 0.2691, "step": 5450 }, { "epoch": 0.3421695807482609, "grad_norm": 6.626861095428467, "learning_rate": 4.476472947583965e-05, "loss": 0.2139, "step": 5460 }, { "epoch": 0.3427962649620856, "grad_norm": 0.3588835895061493, "learning_rate": 4.475417449494417e-05, "loss": 0.0899, "step": 5470 }, { "epoch": 0.34342294917591026, "grad_norm": 3.0588839054107666, "learning_rate": 4.474361951404868e-05, "loss": 0.1258, "step": 5480 }, { "epoch": 0.34404963338973493, "grad_norm": 0.17878539860248566, "learning_rate": 4.4733064533153195e-05, "loss": 0.1772, "step": 5490 }, { "epoch": 0.34467631760355955, "grad_norm": 1.422548532485962, "learning_rate": 4.472250955225771e-05, "loss": 0.1118, "step": 5500 }, { "epoch": 0.3453030018173842, "grad_norm": 0.09782126545906067, "learning_rate": 4.471195457136223e-05, "loss": 0.0883, "step": 5510 }, { "epoch": 0.3459296860312089, "grad_norm": 1.7103350162506104, "learning_rate": 4.4701399590466745e-05, "loss": 0.0905, "step": 5520 }, { "epoch": 0.34655637024503355, "grad_norm": 0.24084354937076569, "learning_rate": 4.469084460957126e-05, "loss": 0.157, "step": 5530 }, { "epoch": 0.34718305445885816, "grad_norm": 0.39323166012763977, "learning_rate": 4.468028962867578e-05, "loss": 0.165, "step": 5540 }, { "epoch": 0.34780973867268283, "grad_norm": 4.8996124267578125, "learning_rate": 4.466973464778029e-05, "loss": 0.1703, "step": 5550 }, { "epoch": 0.3484364228865075, "grad_norm": 0.06810665875673294, "learning_rate": 4.4659179666884805e-05, "loss": 0.1135, "step": 5560 }, { "epoch": 0.3490631071003321, "grad_norm": 0.4743961989879608, "learning_rate": 4.464862468598932e-05, "loss": 0.1448, "step": 5570 }, { "epoch": 0.3496897913141568, "grad_norm": 1.2920023202896118, "learning_rate": 4.463806970509384e-05, "loss": 0.1119, "step": 5580 }, { "epoch": 0.35031647552798145, "grad_norm": 3.457261323928833, "learning_rate": 4.462751472419835e-05, "loss": 0.1757, "step": 5590 }, { "epoch": 0.3509431597418061, "grad_norm": 3.696157932281494, "learning_rate": 4.4616959743302864e-05, "loss": 0.186, "step": 5600 }, { "epoch": 0.35156984395563073, "grad_norm": 0.24386389553546906, "learning_rate": 4.460640476240738e-05, "loss": 0.1244, "step": 5610 }, { "epoch": 0.3521965281694554, "grad_norm": 26.246967315673828, "learning_rate": 4.45958497815119e-05, "loss": 0.1493, "step": 5620 }, { "epoch": 0.35282321238328007, "grad_norm": 0.29406628012657166, "learning_rate": 4.4585294800616414e-05, "loss": 0.1353, "step": 5630 }, { "epoch": 0.35344989659710474, "grad_norm": 0.42619481682777405, "learning_rate": 4.457473981972093e-05, "loss": 0.1239, "step": 5640 }, { "epoch": 0.35407658081092935, "grad_norm": 0.22796322405338287, "learning_rate": 4.456418483882545e-05, "loss": 0.1861, "step": 5650 }, { "epoch": 0.354703265024754, "grad_norm": 0.15700025856494904, "learning_rate": 4.455362985792996e-05, "loss": 0.1362, "step": 5660 }, { "epoch": 0.3553299492385787, "grad_norm": 0.25178059935569763, "learning_rate": 4.4543074877034474e-05, "loss": 0.1262, "step": 5670 }, { "epoch": 0.35595663345240336, "grad_norm": 0.06967005878686905, "learning_rate": 4.453251989613899e-05, "loss": 0.1283, "step": 5680 }, { "epoch": 0.356583317666228, "grad_norm": 0.48487424850463867, "learning_rate": 4.452196491524351e-05, "loss": 0.1818, "step": 5690 }, { "epoch": 0.35721000188005264, "grad_norm": 1.9316906929016113, "learning_rate": 4.451140993434802e-05, "loss": 0.0954, "step": 5700 }, { "epoch": 0.3578366860938773, "grad_norm": 0.09588257223367691, "learning_rate": 4.4500854953452534e-05, "loss": 0.0795, "step": 5710 }, { "epoch": 0.3584633703077019, "grad_norm": 1.1819512844085693, "learning_rate": 4.449029997255705e-05, "loss": 0.2532, "step": 5720 }, { "epoch": 0.3590900545215266, "grad_norm": 0.40093863010406494, "learning_rate": 4.447974499166157e-05, "loss": 0.0591, "step": 5730 }, { "epoch": 0.35971673873535126, "grad_norm": 9.683305740356445, "learning_rate": 4.4469190010766084e-05, "loss": 0.0656, "step": 5740 }, { "epoch": 0.36034342294917593, "grad_norm": 0.07515076547861099, "learning_rate": 4.44586350298706e-05, "loss": 0.1718, "step": 5750 }, { "epoch": 0.36097010716300054, "grad_norm": 0.0691744014620781, "learning_rate": 4.444808004897512e-05, "loss": 0.2273, "step": 5760 }, { "epoch": 0.3615967913768252, "grad_norm": 8.280632019042969, "learning_rate": 4.443752506807963e-05, "loss": 0.1762, "step": 5770 }, { "epoch": 0.3622234755906499, "grad_norm": 0.13394978642463684, "learning_rate": 4.4426970087184144e-05, "loss": 0.168, "step": 5780 }, { "epoch": 0.36285015980447455, "grad_norm": 0.17555440962314606, "learning_rate": 4.441641510628866e-05, "loss": 0.085, "step": 5790 }, { "epoch": 0.36347684401829916, "grad_norm": 13.172547340393066, "learning_rate": 4.440586012539317e-05, "loss": 0.2534, "step": 5800 }, { "epoch": 0.36410352823212383, "grad_norm": 0.8912064433097839, "learning_rate": 4.439530514449769e-05, "loss": 0.087, "step": 5810 }, { "epoch": 0.3647302124459485, "grad_norm": 0.07865786552429199, "learning_rate": 4.4384750163602204e-05, "loss": 0.0783, "step": 5820 }, { "epoch": 0.36535689665977317, "grad_norm": 7.192616939544678, "learning_rate": 4.437419518270673e-05, "loss": 0.0805, "step": 5830 }, { "epoch": 0.3659835808735978, "grad_norm": 6.647584915161133, "learning_rate": 4.436364020181124e-05, "loss": 0.3214, "step": 5840 }, { "epoch": 0.36661026508742245, "grad_norm": 3.0235965251922607, "learning_rate": 4.4353085220915754e-05, "loss": 0.2507, "step": 5850 }, { "epoch": 0.3672369493012471, "grad_norm": 0.1376418173313141, "learning_rate": 4.434253024002027e-05, "loss": 0.0591, "step": 5860 }, { "epoch": 0.36786363351507173, "grad_norm": 0.08546671271324158, "learning_rate": 4.433197525912478e-05, "loss": 0.0486, "step": 5870 }, { "epoch": 0.3684903177288964, "grad_norm": 3.4542911052703857, "learning_rate": 4.43214202782293e-05, "loss": 0.1363, "step": 5880 }, { "epoch": 0.36911700194272107, "grad_norm": 0.18823327124118805, "learning_rate": 4.431086529733381e-05, "loss": 0.0364, "step": 5890 }, { "epoch": 0.36974368615654574, "grad_norm": 0.26859134435653687, "learning_rate": 4.430031031643833e-05, "loss": 0.0906, "step": 5900 }, { "epoch": 0.37037037037037035, "grad_norm": 67.65911102294922, "learning_rate": 4.428975533554284e-05, "loss": 0.0909, "step": 5910 }, { "epoch": 0.370997054584195, "grad_norm": 15.827759742736816, "learning_rate": 4.427920035464736e-05, "loss": 0.1871, "step": 5920 }, { "epoch": 0.3716237387980197, "grad_norm": 4.99617862701416, "learning_rate": 4.426864537375188e-05, "loss": 0.155, "step": 5930 }, { "epoch": 0.37225042301184436, "grad_norm": 8.309243202209473, "learning_rate": 4.425809039285639e-05, "loss": 0.4023, "step": 5940 }, { "epoch": 0.37287710722566897, "grad_norm": 0.054077792912721634, "learning_rate": 4.4247535411960906e-05, "loss": 0.1171, "step": 5950 }, { "epoch": 0.37350379143949364, "grad_norm": 7.583477020263672, "learning_rate": 4.423698043106542e-05, "loss": 0.1549, "step": 5960 }, { "epoch": 0.3741304756533183, "grad_norm": 0.06153470277786255, "learning_rate": 4.422642545016994e-05, "loss": 0.0672, "step": 5970 }, { "epoch": 0.3747571598671429, "grad_norm": 0.42908042669296265, "learning_rate": 4.421587046927445e-05, "loss": 0.081, "step": 5980 }, { "epoch": 0.3753838440809676, "grad_norm": 0.088747039437294, "learning_rate": 4.4205315488378966e-05, "loss": 0.1062, "step": 5990 }, { "epoch": 0.37601052829479226, "grad_norm": 0.06233467161655426, "learning_rate": 4.419476050748348e-05, "loss": 0.0545, "step": 6000 }, { "epoch": 0.3766372125086169, "grad_norm": 0.06656718254089355, "learning_rate": 4.4184205526588e-05, "loss": 0.1114, "step": 6010 }, { "epoch": 0.37726389672244154, "grad_norm": 0.16433706879615784, "learning_rate": 4.4173650545692516e-05, "loss": 0.0839, "step": 6020 }, { "epoch": 0.3778905809362662, "grad_norm": 0.40656861662864685, "learning_rate": 4.416309556479703e-05, "loss": 0.0803, "step": 6030 }, { "epoch": 0.3785172651500909, "grad_norm": 0.031963735818862915, "learning_rate": 4.415254058390155e-05, "loss": 0.09, "step": 6040 }, { "epoch": 0.37914394936391554, "grad_norm": 0.03380054607987404, "learning_rate": 4.414198560300606e-05, "loss": 0.307, "step": 6050 }, { "epoch": 0.37977063357774016, "grad_norm": 0.06074713170528412, "learning_rate": 4.4131430622110576e-05, "loss": 0.0836, "step": 6060 }, { "epoch": 0.3803973177915648, "grad_norm": 0.25206536054611206, "learning_rate": 4.412087564121509e-05, "loss": 0.0627, "step": 6070 }, { "epoch": 0.3810240020053895, "grad_norm": 0.6263360381126404, "learning_rate": 4.411032066031961e-05, "loss": 0.0528, "step": 6080 }, { "epoch": 0.38165068621921416, "grad_norm": 0.07583242654800415, "learning_rate": 4.409976567942412e-05, "loss": 0.4302, "step": 6090 }, { "epoch": 0.3822773704330388, "grad_norm": 0.07200941443443298, "learning_rate": 4.4089210698528636e-05, "loss": 0.0395, "step": 6100 }, { "epoch": 0.38290405464686345, "grad_norm": 9.109925270080566, "learning_rate": 4.407865571763315e-05, "loss": 0.2408, "step": 6110 }, { "epoch": 0.3835307388606881, "grad_norm": 48.827877044677734, "learning_rate": 4.406810073673767e-05, "loss": 0.1589, "step": 6120 }, { "epoch": 0.3841574230745127, "grad_norm": 3.1407594680786133, "learning_rate": 4.4057545755842186e-05, "loss": 0.1474, "step": 6130 }, { "epoch": 0.3847841072883374, "grad_norm": 0.10355928540229797, "learning_rate": 4.40469907749467e-05, "loss": 0.0277, "step": 6140 }, { "epoch": 0.38541079150216206, "grad_norm": 0.45885977149009705, "learning_rate": 4.403643579405122e-05, "loss": 0.1817, "step": 6150 }, { "epoch": 0.38603747571598673, "grad_norm": 0.1281880885362625, "learning_rate": 4.402588081315573e-05, "loss": 0.1287, "step": 6160 }, { "epoch": 0.38666415992981135, "grad_norm": 0.21207277476787567, "learning_rate": 4.4015325832260246e-05, "loss": 0.1862, "step": 6170 }, { "epoch": 0.387290844143636, "grad_norm": 6.804543495178223, "learning_rate": 4.400477085136476e-05, "loss": 0.2877, "step": 6180 }, { "epoch": 0.3879175283574607, "grad_norm": 0.1996593177318573, "learning_rate": 4.399421587046927e-05, "loss": 0.1129, "step": 6190 }, { "epoch": 0.38854421257128535, "grad_norm": 0.15772663056850433, "learning_rate": 4.398366088957379e-05, "loss": 0.1105, "step": 6200 }, { "epoch": 0.38917089678510997, "grad_norm": 0.14258632063865662, "learning_rate": 4.3973105908678306e-05, "loss": 0.1717, "step": 6210 }, { "epoch": 0.38979758099893463, "grad_norm": 3.9509053230285645, "learning_rate": 4.396255092778282e-05, "loss": 0.176, "step": 6220 }, { "epoch": 0.3904242652127593, "grad_norm": 0.16813886165618896, "learning_rate": 4.395199594688734e-05, "loss": 0.0789, "step": 6230 }, { "epoch": 0.39105094942658397, "grad_norm": 0.6697870492935181, "learning_rate": 4.3941440965991855e-05, "loss": 0.0794, "step": 6240 }, { "epoch": 0.3916776336404086, "grad_norm": 2.156053066253662, "learning_rate": 4.393088598509637e-05, "loss": 0.207, "step": 6250 }, { "epoch": 0.39230431785423325, "grad_norm": 0.9125531911849976, "learning_rate": 4.392033100420088e-05, "loss": 0.2005, "step": 6260 }, { "epoch": 0.3929310020680579, "grad_norm": 1.7858306169509888, "learning_rate": 4.39097760233054e-05, "loss": 0.126, "step": 6270 }, { "epoch": 0.39355768628188253, "grad_norm": 4.1872687339782715, "learning_rate": 4.3899221042409915e-05, "loss": 0.2317, "step": 6280 }, { "epoch": 0.3941843704957072, "grad_norm": 0.26265501976013184, "learning_rate": 4.388866606151443e-05, "loss": 0.144, "step": 6290 }, { "epoch": 0.3948110547095319, "grad_norm": 0.15700949728488922, "learning_rate": 4.387811108061894e-05, "loss": 0.1866, "step": 6300 }, { "epoch": 0.39543773892335654, "grad_norm": 3.4927055835723877, "learning_rate": 4.386755609972346e-05, "loss": 0.1554, "step": 6310 }, { "epoch": 0.39606442313718115, "grad_norm": 0.31977131962776184, "learning_rate": 4.3857001118827975e-05, "loss": 0.249, "step": 6320 }, { "epoch": 0.3966911073510058, "grad_norm": 0.11661989986896515, "learning_rate": 4.384644613793249e-05, "loss": 0.1728, "step": 6330 }, { "epoch": 0.3973177915648305, "grad_norm": 3.7508764266967773, "learning_rate": 4.383589115703701e-05, "loss": 0.3801, "step": 6340 }, { "epoch": 0.39794447577865516, "grad_norm": 0.44953954219818115, "learning_rate": 4.3825336176141525e-05, "loss": 0.0824, "step": 6350 }, { "epoch": 0.3985711599924798, "grad_norm": 0.4894031286239624, "learning_rate": 4.381478119524604e-05, "loss": 0.1609, "step": 6360 }, { "epoch": 0.39919784420630444, "grad_norm": 0.49157753586769104, "learning_rate": 4.380422621435055e-05, "loss": 0.135, "step": 6370 }, { "epoch": 0.3998245284201291, "grad_norm": 0.10516194254159927, "learning_rate": 4.379367123345507e-05, "loss": 0.1034, "step": 6380 }, { "epoch": 0.4004512126339537, "grad_norm": 0.24221737682819366, "learning_rate": 4.3783116252559585e-05, "loss": 0.1595, "step": 6390 }, { "epoch": 0.4010778968477784, "grad_norm": 0.21182815730571747, "learning_rate": 4.37725612716641e-05, "loss": 0.0767, "step": 6400 }, { "epoch": 0.40170458106160306, "grad_norm": 0.17457489669322968, "learning_rate": 4.376200629076861e-05, "loss": 0.1248, "step": 6410 }, { "epoch": 0.40233126527542773, "grad_norm": 3.25508975982666, "learning_rate": 4.3751451309873135e-05, "loss": 0.1442, "step": 6420 }, { "epoch": 0.40295794948925234, "grad_norm": 0.06830067932605743, "learning_rate": 4.374089632897765e-05, "loss": 0.1064, "step": 6430 }, { "epoch": 0.403584633703077, "grad_norm": 4.027711868286133, "learning_rate": 4.373034134808216e-05, "loss": 0.2826, "step": 6440 }, { "epoch": 0.4042113179169017, "grad_norm": 6.067227840423584, "learning_rate": 4.371978636718668e-05, "loss": 0.1604, "step": 6450 }, { "epoch": 0.40483800213072635, "grad_norm": 0.2599928081035614, "learning_rate": 4.3709231386291195e-05, "loss": 0.3021, "step": 6460 }, { "epoch": 0.40546468634455096, "grad_norm": 0.2688082456588745, "learning_rate": 4.369867640539571e-05, "loss": 0.2037, "step": 6470 }, { "epoch": 0.40609137055837563, "grad_norm": 0.13320320844650269, "learning_rate": 4.368812142450022e-05, "loss": 0.0776, "step": 6480 }, { "epoch": 0.4067180547722003, "grad_norm": 0.8527864813804626, "learning_rate": 4.367756644360474e-05, "loss": 0.1801, "step": 6490 }, { "epoch": 0.40734473898602497, "grad_norm": 14.777264595031738, "learning_rate": 4.3667011462709255e-05, "loss": 0.117, "step": 6500 }, { "epoch": 0.4079714231998496, "grad_norm": 0.06564256548881531, "learning_rate": 4.365645648181377e-05, "loss": 0.1407, "step": 6510 }, { "epoch": 0.40859810741367425, "grad_norm": 93.907470703125, "learning_rate": 4.364590150091829e-05, "loss": 0.1947, "step": 6520 }, { "epoch": 0.4092247916274989, "grad_norm": 0.08441044390201569, "learning_rate": 4.3635346520022804e-05, "loss": 0.0147, "step": 6530 }, { "epoch": 0.40985147584132353, "grad_norm": 6.032024383544922, "learning_rate": 4.362479153912732e-05, "loss": 0.1261, "step": 6540 }, { "epoch": 0.4104781600551482, "grad_norm": 0.7132280468940735, "learning_rate": 4.361423655823183e-05, "loss": 0.1141, "step": 6550 }, { "epoch": 0.41110484426897287, "grad_norm": 0.052206575870513916, "learning_rate": 4.360368157733635e-05, "loss": 0.0464, "step": 6560 }, { "epoch": 0.41173152848279754, "grad_norm": 0.04777556285262108, "learning_rate": 4.3593126596440864e-05, "loss": 0.0635, "step": 6570 }, { "epoch": 0.41235821269662215, "grad_norm": 0.02872728556394577, "learning_rate": 4.3582571615545374e-05, "loss": 0.0768, "step": 6580 }, { "epoch": 0.4129848969104468, "grad_norm": 0.03338625654578209, "learning_rate": 4.357201663464989e-05, "loss": 0.1111, "step": 6590 }, { "epoch": 0.4136115811242715, "grad_norm": 0.46986547112464905, "learning_rate": 4.356146165375441e-05, "loss": 0.1058, "step": 6600 }, { "epoch": 0.41423826533809616, "grad_norm": 0.296297162771225, "learning_rate": 4.3550906672858924e-05, "loss": 0.1546, "step": 6610 }, { "epoch": 0.41486494955192077, "grad_norm": 4.170858383178711, "learning_rate": 4.354035169196344e-05, "loss": 0.1412, "step": 6620 }, { "epoch": 0.41549163376574544, "grad_norm": 0.05129874125123024, "learning_rate": 4.352979671106796e-05, "loss": 0.0897, "step": 6630 }, { "epoch": 0.4161183179795701, "grad_norm": 3.248608112335205, "learning_rate": 4.3519241730172474e-05, "loss": 0.2496, "step": 6640 }, { "epoch": 0.4167450021933948, "grad_norm": 4.730010509490967, "learning_rate": 4.3508686749276984e-05, "loss": 0.0941, "step": 6650 }, { "epoch": 0.4173716864072194, "grad_norm": 0.11405862867832184, "learning_rate": 4.34981317683815e-05, "loss": 0.0985, "step": 6660 }, { "epoch": 0.41799837062104406, "grad_norm": 0.29627180099487305, "learning_rate": 4.348757678748602e-05, "loss": 0.1299, "step": 6670 }, { "epoch": 0.4186250548348687, "grad_norm": 0.11819622665643692, "learning_rate": 4.3477021806590534e-05, "loss": 0.2367, "step": 6680 }, { "epoch": 0.41925173904869334, "grad_norm": 7.079653739929199, "learning_rate": 4.3466466825695044e-05, "loss": 0.0979, "step": 6690 }, { "epoch": 0.419878423262518, "grad_norm": 0.5191217660903931, "learning_rate": 4.345591184479956e-05, "loss": 0.1518, "step": 6700 }, { "epoch": 0.4205051074763427, "grad_norm": 3.152400493621826, "learning_rate": 4.344535686390408e-05, "loss": 0.252, "step": 6710 }, { "epoch": 0.42113179169016735, "grad_norm": 1.489100456237793, "learning_rate": 4.3434801883008594e-05, "loss": 0.2649, "step": 6720 }, { "epoch": 0.42175847590399196, "grad_norm": 0.1367146521806717, "learning_rate": 4.342424690211311e-05, "loss": 0.0611, "step": 6730 }, { "epoch": 0.4223851601178166, "grad_norm": 0.22044914960861206, "learning_rate": 4.341369192121763e-05, "loss": 0.1254, "step": 6740 }, { "epoch": 0.4230118443316413, "grad_norm": 0.045484308153390884, "learning_rate": 4.3403136940322144e-05, "loss": 0.0632, "step": 6750 }, { "epoch": 0.42363852854546596, "grad_norm": 0.32636168599128723, "learning_rate": 4.3392581959426654e-05, "loss": 0.0894, "step": 6760 }, { "epoch": 0.4242652127592906, "grad_norm": 0.35098814964294434, "learning_rate": 4.338202697853117e-05, "loss": 0.3407, "step": 6770 }, { "epoch": 0.42489189697311525, "grad_norm": 0.25456687808036804, "learning_rate": 4.337147199763569e-05, "loss": 0.1445, "step": 6780 }, { "epoch": 0.4255185811869399, "grad_norm": 0.39973339438438416, "learning_rate": 4.33609170167402e-05, "loss": 0.204, "step": 6790 }, { "epoch": 0.42614526540076453, "grad_norm": 0.36315247416496277, "learning_rate": 4.3350362035844713e-05, "loss": 0.2102, "step": 6800 }, { "epoch": 0.4267719496145892, "grad_norm": 0.31502068042755127, "learning_rate": 4.333980705494923e-05, "loss": 0.1505, "step": 6810 }, { "epoch": 0.42739863382841387, "grad_norm": 0.1232970803976059, "learning_rate": 4.332925207405375e-05, "loss": 0.2021, "step": 6820 }, { "epoch": 0.42802531804223853, "grad_norm": 3.6902544498443604, "learning_rate": 4.331869709315826e-05, "loss": 0.1584, "step": 6830 }, { "epoch": 0.42865200225606315, "grad_norm": 0.17084215581417084, "learning_rate": 4.330814211226278e-05, "loss": 0.0913, "step": 6840 }, { "epoch": 0.4292786864698878, "grad_norm": 0.18105857074260712, "learning_rate": 4.32975871313673e-05, "loss": 0.1581, "step": 6850 }, { "epoch": 0.4299053706837125, "grad_norm": 1.236863613128662, "learning_rate": 4.328703215047181e-05, "loss": 0.1888, "step": 6860 }, { "epoch": 0.43053205489753715, "grad_norm": 0.0806477889418602, "learning_rate": 4.327647716957632e-05, "loss": 0.0882, "step": 6870 }, { "epoch": 0.43115873911136177, "grad_norm": 0.1351771354675293, "learning_rate": 4.326592218868084e-05, "loss": 0.0103, "step": 6880 }, { "epoch": 0.43178542332518643, "grad_norm": 0.04418093338608742, "learning_rate": 4.3255367207785356e-05, "loss": 0.1063, "step": 6890 }, { "epoch": 0.4324121075390111, "grad_norm": 0.06696956604719162, "learning_rate": 4.3244812226889866e-05, "loss": 0.1186, "step": 6900 }, { "epoch": 0.43303879175283577, "grad_norm": 2.718379020690918, "learning_rate": 4.323425724599438e-05, "loss": 0.2461, "step": 6910 }, { "epoch": 0.4336654759666604, "grad_norm": 3.3854384422302246, "learning_rate": 4.3223702265098906e-05, "loss": 0.2563, "step": 6920 }, { "epoch": 0.43429216018048505, "grad_norm": 6.008489608764648, "learning_rate": 4.321314728420342e-05, "loss": 0.1759, "step": 6930 }, { "epoch": 0.4349188443943097, "grad_norm": 0.43417903780937195, "learning_rate": 4.320259230330793e-05, "loss": 0.0919, "step": 6940 }, { "epoch": 0.43554552860813434, "grad_norm": 10.172123908996582, "learning_rate": 4.319203732241245e-05, "loss": 0.2124, "step": 6950 }, { "epoch": 0.436172212821959, "grad_norm": 0.26110365986824036, "learning_rate": 4.3181482341516966e-05, "loss": 0.2002, "step": 6960 }, { "epoch": 0.4367988970357837, "grad_norm": 3.4817702770233154, "learning_rate": 4.3170927360621476e-05, "loss": 0.1704, "step": 6970 }, { "epoch": 0.43742558124960834, "grad_norm": 2.7794384956359863, "learning_rate": 4.316037237972599e-05, "loss": 0.1826, "step": 6980 }, { "epoch": 0.43805226546343295, "grad_norm": 1.7215393781661987, "learning_rate": 4.314981739883051e-05, "loss": 0.1402, "step": 6990 }, { "epoch": 0.4386789496772576, "grad_norm": 0.0389961376786232, "learning_rate": 4.3139262417935026e-05, "loss": 0.0581, "step": 7000 }, { "epoch": 0.4393056338910823, "grad_norm": 0.11511804163455963, "learning_rate": 4.312870743703954e-05, "loss": 0.1763, "step": 7010 }, { "epoch": 0.43993231810490696, "grad_norm": 0.4011329412460327, "learning_rate": 4.311815245614406e-05, "loss": 0.012, "step": 7020 }, { "epoch": 0.4405590023187316, "grad_norm": 8.781197547912598, "learning_rate": 4.3107597475248576e-05, "loss": 0.084, "step": 7030 }, { "epoch": 0.44118568653255624, "grad_norm": 1.0521060228347778, "learning_rate": 4.3097042494353086e-05, "loss": 0.2094, "step": 7040 }, { "epoch": 0.4418123707463809, "grad_norm": 4.4334330558776855, "learning_rate": 4.30864875134576e-05, "loss": 0.1565, "step": 7050 }, { "epoch": 0.4424390549602056, "grad_norm": 3.078625440597534, "learning_rate": 4.307593253256212e-05, "loss": 0.1848, "step": 7060 }, { "epoch": 0.4430657391740302, "grad_norm": 0.9323499202728271, "learning_rate": 4.3065377551666636e-05, "loss": 0.1174, "step": 7070 }, { "epoch": 0.44369242338785486, "grad_norm": 0.04099570959806442, "learning_rate": 4.3054822570771146e-05, "loss": 0.2271, "step": 7080 }, { "epoch": 0.44431910760167953, "grad_norm": 4.120002746582031, "learning_rate": 4.304426758987566e-05, "loss": 0.0977, "step": 7090 }, { "epoch": 0.44494579181550414, "grad_norm": 0.0420520082116127, "learning_rate": 4.303371260898018e-05, "loss": 0.0655, "step": 7100 }, { "epoch": 0.4455724760293288, "grad_norm": 0.03804453834891319, "learning_rate": 4.3023157628084696e-05, "loss": 0.0494, "step": 7110 }, { "epoch": 0.4461991602431535, "grad_norm": 3.8197927474975586, "learning_rate": 4.301260264718921e-05, "loss": 0.1237, "step": 7120 }, { "epoch": 0.44682584445697815, "grad_norm": 3.5626463890075684, "learning_rate": 4.300204766629373e-05, "loss": 0.1843, "step": 7130 }, { "epoch": 0.44745252867080276, "grad_norm": 0.12504714727401733, "learning_rate": 4.2991492685398246e-05, "loss": 0.122, "step": 7140 }, { "epoch": 0.44807921288462743, "grad_norm": 2.9400346279144287, "learning_rate": 4.2980937704502756e-05, "loss": 0.2058, "step": 7150 }, { "epoch": 0.4487058970984521, "grad_norm": 0.2937048375606537, "learning_rate": 4.297038272360727e-05, "loss": 0.2501, "step": 7160 }, { "epoch": 0.44933258131227677, "grad_norm": 0.18238890171051025, "learning_rate": 4.295982774271179e-05, "loss": 0.0663, "step": 7170 }, { "epoch": 0.4499592655261014, "grad_norm": 0.12388616055250168, "learning_rate": 4.29492727618163e-05, "loss": 0.0381, "step": 7180 }, { "epoch": 0.45058594973992605, "grad_norm": 5.601403713226318, "learning_rate": 4.2938717780920815e-05, "loss": 0.2703, "step": 7190 }, { "epoch": 0.4512126339537507, "grad_norm": 0.20709013938903809, "learning_rate": 4.292816280002533e-05, "loss": 0.0724, "step": 7200 }, { "epoch": 0.45183931816757533, "grad_norm": 3.6804709434509277, "learning_rate": 4.291760781912985e-05, "loss": 0.1287, "step": 7210 }, { "epoch": 0.4524660023814, "grad_norm": 6.19010066986084, "learning_rate": 4.2907052838234365e-05, "loss": 0.3109, "step": 7220 }, { "epoch": 0.45309268659522467, "grad_norm": 0.1047079935669899, "learning_rate": 4.289649785733888e-05, "loss": 0.1165, "step": 7230 }, { "epoch": 0.45371937080904934, "grad_norm": 2.2670769691467285, "learning_rate": 4.28859428764434e-05, "loss": 0.0588, "step": 7240 }, { "epoch": 0.45434605502287395, "grad_norm": 39.598915100097656, "learning_rate": 4.2875387895547915e-05, "loss": 0.3843, "step": 7250 }, { "epoch": 0.4549727392366986, "grad_norm": 0.05779840797185898, "learning_rate": 4.2864832914652425e-05, "loss": 0.0862, "step": 7260 }, { "epoch": 0.4555994234505233, "grad_norm": 0.09832815825939178, "learning_rate": 4.285427793375694e-05, "loss": 0.2489, "step": 7270 }, { "epoch": 0.45622610766434796, "grad_norm": 0.26292064785957336, "learning_rate": 4.284372295286146e-05, "loss": 0.2071, "step": 7280 }, { "epoch": 0.45685279187817257, "grad_norm": 5.3300557136535645, "learning_rate": 4.283316797196597e-05, "loss": 0.1721, "step": 7290 }, { "epoch": 0.45747947609199724, "grad_norm": 0.23609036207199097, "learning_rate": 4.2822612991070485e-05, "loss": 0.0499, "step": 7300 }, { "epoch": 0.4581061603058219, "grad_norm": 1.547780990600586, "learning_rate": 4.2812058010175e-05, "loss": 0.0725, "step": 7310 }, { "epoch": 0.4587328445196466, "grad_norm": 2.538231611251831, "learning_rate": 4.280150302927952e-05, "loss": 0.2541, "step": 7320 }, { "epoch": 0.4593595287334712, "grad_norm": 0.7122913002967834, "learning_rate": 4.2790948048384035e-05, "loss": 0.0331, "step": 7330 }, { "epoch": 0.45998621294729586, "grad_norm": 0.13575519621372223, "learning_rate": 4.278039306748855e-05, "loss": 0.1148, "step": 7340 }, { "epoch": 0.4606128971611205, "grad_norm": 0.37606069445610046, "learning_rate": 4.276983808659307e-05, "loss": 0.169, "step": 7350 }, { "epoch": 0.46123958137494514, "grad_norm": 0.06538957357406616, "learning_rate": 4.275928310569758e-05, "loss": 0.1728, "step": 7360 }, { "epoch": 0.4618662655887698, "grad_norm": 0.12389545887708664, "learning_rate": 4.2748728124802095e-05, "loss": 0.1081, "step": 7370 }, { "epoch": 0.4624929498025945, "grad_norm": 0.15639159083366394, "learning_rate": 4.273817314390661e-05, "loss": 0.1994, "step": 7380 }, { "epoch": 0.46311963401641915, "grad_norm": 2.5816802978515625, "learning_rate": 4.272761816301113e-05, "loss": 0.1382, "step": 7390 }, { "epoch": 0.46374631823024376, "grad_norm": 0.1728868931531906, "learning_rate": 4.271706318211564e-05, "loss": 0.061, "step": 7400 }, { "epoch": 0.46437300244406843, "grad_norm": 0.4691903293132782, "learning_rate": 4.2706508201220155e-05, "loss": 0.0113, "step": 7410 }, { "epoch": 0.4649996866578931, "grad_norm": 0.23932453989982605, "learning_rate": 4.269595322032468e-05, "loss": 0.2199, "step": 7420 }, { "epoch": 0.46562637087171777, "grad_norm": 0.6389980912208557, "learning_rate": 4.268539823942919e-05, "loss": 0.1017, "step": 7430 }, { "epoch": 0.4662530550855424, "grad_norm": 2.4998295307159424, "learning_rate": 4.2674843258533705e-05, "loss": 0.222, "step": 7440 }, { "epoch": 0.46687973929936705, "grad_norm": 0.17659592628479004, "learning_rate": 4.266428827763822e-05, "loss": 0.213, "step": 7450 }, { "epoch": 0.4675064235131917, "grad_norm": 0.48938411474227905, "learning_rate": 4.265373329674274e-05, "loss": 0.1021, "step": 7460 }, { "epoch": 0.46813310772701633, "grad_norm": 3.5176033973693848, "learning_rate": 4.264317831584725e-05, "loss": 0.2315, "step": 7470 }, { "epoch": 0.468759791940841, "grad_norm": 3.836408853530884, "learning_rate": 4.2632623334951764e-05, "loss": 0.2891, "step": 7480 }, { "epoch": 0.46938647615466567, "grad_norm": 0.2693319618701935, "learning_rate": 4.262206835405628e-05, "loss": 0.1225, "step": 7490 }, { "epoch": 0.47001316036849033, "grad_norm": 0.3623167872428894, "learning_rate": 4.261151337316079e-05, "loss": 0.0797, "step": 7500 }, { "epoch": 0.47063984458231495, "grad_norm": 0.27036330103874207, "learning_rate": 4.2600958392265314e-05, "loss": 0.1628, "step": 7510 }, { "epoch": 0.4712665287961396, "grad_norm": 0.13844487071037292, "learning_rate": 4.259040341136983e-05, "loss": 0.1731, "step": 7520 }, { "epoch": 0.4718932130099643, "grad_norm": 0.19326917827129364, "learning_rate": 4.257984843047435e-05, "loss": 0.0219, "step": 7530 }, { "epoch": 0.47251989722378895, "grad_norm": 0.03843018412590027, "learning_rate": 4.256929344957886e-05, "loss": 0.0881, "step": 7540 }, { "epoch": 0.47314658143761357, "grad_norm": 1.3060014247894287, "learning_rate": 4.2558738468683374e-05, "loss": 0.2043, "step": 7550 }, { "epoch": 0.47377326565143824, "grad_norm": 0.21384786069393158, "learning_rate": 4.254818348778789e-05, "loss": 0.3474, "step": 7560 }, { "epoch": 0.4743999498652629, "grad_norm": 1.6711466312408447, "learning_rate": 4.25376285068924e-05, "loss": 0.171, "step": 7570 }, { "epoch": 0.4750266340790876, "grad_norm": 0.1102273091673851, "learning_rate": 4.252707352599692e-05, "loss": 0.0721, "step": 7580 }, { "epoch": 0.4756533182929122, "grad_norm": 0.3417176902294159, "learning_rate": 4.2516518545101434e-05, "loss": 0.1601, "step": 7590 }, { "epoch": 0.47628000250673685, "grad_norm": 0.049307115375995636, "learning_rate": 4.250596356420595e-05, "loss": 0.0476, "step": 7600 }, { "epoch": 0.4769066867205615, "grad_norm": 0.051750484853982925, "learning_rate": 4.249540858331047e-05, "loss": 0.0123, "step": 7610 }, { "epoch": 0.47753337093438614, "grad_norm": 0.17750048637390137, "learning_rate": 4.2484853602414984e-05, "loss": 0.2373, "step": 7620 }, { "epoch": 0.4781600551482108, "grad_norm": 0.0933203399181366, "learning_rate": 4.24742986215195e-05, "loss": 0.0497, "step": 7630 }, { "epoch": 0.4787867393620355, "grad_norm": 0.035495299845933914, "learning_rate": 4.246374364062402e-05, "loss": 0.0878, "step": 7640 }, { "epoch": 0.47941342357586014, "grad_norm": 0.04313305392861366, "learning_rate": 4.245318865972853e-05, "loss": 0.1121, "step": 7650 }, { "epoch": 0.48004010778968476, "grad_norm": 0.3072851002216339, "learning_rate": 4.2442633678833044e-05, "loss": 0.3325, "step": 7660 }, { "epoch": 0.4806667920035094, "grad_norm": 0.527976393699646, "learning_rate": 4.243207869793756e-05, "loss": 0.1979, "step": 7670 }, { "epoch": 0.4812934762173341, "grad_norm": 0.2459559291601181, "learning_rate": 4.242152371704207e-05, "loss": 0.1636, "step": 7680 }, { "epoch": 0.48192016043115876, "grad_norm": 0.31813257932662964, "learning_rate": 4.241096873614659e-05, "loss": 0.1305, "step": 7690 }, { "epoch": 0.4825468446449834, "grad_norm": 2.2913360595703125, "learning_rate": 4.2400413755251104e-05, "loss": 0.1391, "step": 7700 }, { "epoch": 0.48317352885880804, "grad_norm": 0.3729984760284424, "learning_rate": 4.238985877435562e-05, "loss": 0.045, "step": 7710 }, { "epoch": 0.4838002130726327, "grad_norm": 3.880664587020874, "learning_rate": 4.237930379346014e-05, "loss": 0.1644, "step": 7720 }, { "epoch": 0.4844268972864574, "grad_norm": 0.22369036078453064, "learning_rate": 4.2368748812564654e-05, "loss": 0.1288, "step": 7730 }, { "epoch": 0.485053581500282, "grad_norm": 13.893917083740234, "learning_rate": 4.235819383166917e-05, "loss": 0.1569, "step": 7740 }, { "epoch": 0.48568026571410666, "grad_norm": 4.144253730773926, "learning_rate": 4.234763885077368e-05, "loss": 0.0984, "step": 7750 }, { "epoch": 0.48630694992793133, "grad_norm": 0.7604568004608154, "learning_rate": 4.23370838698782e-05, "loss": 0.1548, "step": 7760 }, { "epoch": 0.48693363414175594, "grad_norm": 2.1528584957122803, "learning_rate": 4.232652888898271e-05, "loss": 0.1772, "step": 7770 }, { "epoch": 0.4875603183555806, "grad_norm": 3.7687759399414062, "learning_rate": 4.231597390808723e-05, "loss": 0.2759, "step": 7780 }, { "epoch": 0.4881870025694053, "grad_norm": 0.08065836876630783, "learning_rate": 4.230541892719174e-05, "loss": 0.1073, "step": 7790 }, { "epoch": 0.48881368678322995, "grad_norm": 0.07277223467826843, "learning_rate": 4.2294863946296257e-05, "loss": 0.1238, "step": 7800 }, { "epoch": 0.48944037099705456, "grad_norm": 0.6644709706306458, "learning_rate": 4.228430896540077e-05, "loss": 0.0536, "step": 7810 }, { "epoch": 0.49006705521087923, "grad_norm": 3.4870352745056152, "learning_rate": 4.227375398450529e-05, "loss": 0.1726, "step": 7820 }, { "epoch": 0.4906937394247039, "grad_norm": 0.24318358302116394, "learning_rate": 4.2263199003609807e-05, "loss": 0.042, "step": 7830 }, { "epoch": 0.49132042363852857, "grad_norm": 0.18962280452251434, "learning_rate": 4.225264402271432e-05, "loss": 0.2307, "step": 7840 }, { "epoch": 0.4919471078523532, "grad_norm": 0.16015306115150452, "learning_rate": 4.224208904181884e-05, "loss": 0.1665, "step": 7850 }, { "epoch": 0.49257379206617785, "grad_norm": 0.22559410333633423, "learning_rate": 4.223153406092335e-05, "loss": 0.117, "step": 7860 }, { "epoch": 0.4932004762800025, "grad_norm": 5.2421793937683105, "learning_rate": 4.2220979080027866e-05, "loss": 0.1174, "step": 7870 }, { "epoch": 0.49382716049382713, "grad_norm": 2.1046204566955566, "learning_rate": 4.221042409913238e-05, "loss": 0.0429, "step": 7880 }, { "epoch": 0.4944538447076518, "grad_norm": 0.19564154744148254, "learning_rate": 4.219986911823689e-05, "loss": 0.1396, "step": 7890 }, { "epoch": 0.49508052892147647, "grad_norm": 4.799094200134277, "learning_rate": 4.218931413734141e-05, "loss": 0.1839, "step": 7900 }, { "epoch": 0.49570721313530114, "grad_norm": 0.08568393439054489, "learning_rate": 4.2178759156445926e-05, "loss": 0.082, "step": 7910 }, { "epoch": 0.49633389734912575, "grad_norm": 2.0710394382476807, "learning_rate": 4.216820417555045e-05, "loss": 0.187, "step": 7920 }, { "epoch": 0.4969605815629504, "grad_norm": 5.502629280090332, "learning_rate": 4.215764919465496e-05, "loss": 0.2082, "step": 7930 }, { "epoch": 0.4975872657767751, "grad_norm": 1.7179450988769531, "learning_rate": 4.2147094213759476e-05, "loss": 0.1262, "step": 7940 }, { "epoch": 0.49821394999059976, "grad_norm": 0.49480941891670227, "learning_rate": 4.213653923286399e-05, "loss": 0.1315, "step": 7950 }, { "epoch": 0.49884063420442437, "grad_norm": 0.3045163154602051, "learning_rate": 4.21259842519685e-05, "loss": 0.032, "step": 7960 }, { "epoch": 0.49946731841824904, "grad_norm": 0.22712041437625885, "learning_rate": 4.211542927107302e-05, "loss": 0.1132, "step": 7970 }, { "epoch": 0.5000940026320737, "grad_norm": 0.0418301597237587, "learning_rate": 4.2104874290177536e-05, "loss": 0.1, "step": 7980 }, { "epoch": 0.5007206868458983, "grad_norm": 0.1894550770521164, "learning_rate": 4.209431930928205e-05, "loss": 0.0973, "step": 7990 }, { "epoch": 0.501347371059723, "grad_norm": 0.19991464912891388, "learning_rate": 4.208376432838656e-05, "loss": 0.058, "step": 8000 }, { "epoch": 0.5019740552735477, "grad_norm": 0.02192305587232113, "learning_rate": 4.2073209347491086e-05, "loss": 0.1147, "step": 8010 }, { "epoch": 0.5026007394873723, "grad_norm": 0.163367360830307, "learning_rate": 4.20626543665956e-05, "loss": 0.112, "step": 8020 }, { "epoch": 0.503227423701197, "grad_norm": 0.17916612327098846, "learning_rate": 4.205209938570012e-05, "loss": 0.1368, "step": 8030 }, { "epoch": 0.5038541079150216, "grad_norm": 2.2590155601501465, "learning_rate": 4.204154440480463e-05, "loss": 0.1517, "step": 8040 }, { "epoch": 0.5044807921288462, "grad_norm": 0.16151118278503418, "learning_rate": 4.2030989423909146e-05, "loss": 0.145, "step": 8050 }, { "epoch": 0.505107476342671, "grad_norm": 2.493932008743286, "learning_rate": 4.202043444301366e-05, "loss": 0.2022, "step": 8060 }, { "epoch": 0.5057341605564956, "grad_norm": 6.127912998199463, "learning_rate": 4.200987946211817e-05, "loss": 0.1948, "step": 8070 }, { "epoch": 0.5063608447703203, "grad_norm": 2.1237828731536865, "learning_rate": 4.199932448122269e-05, "loss": 0.1675, "step": 8080 }, { "epoch": 0.5069875289841449, "grad_norm": 2.187483549118042, "learning_rate": 4.1988769500327206e-05, "loss": 0.1937, "step": 8090 }, { "epoch": 0.5076142131979695, "grad_norm": 0.5250789523124695, "learning_rate": 4.197821451943172e-05, "loss": 0.182, "step": 8100 }, { "epoch": 0.5082408974117942, "grad_norm": 0.0238255113363266, "learning_rate": 4.196765953853624e-05, "loss": 0.0511, "step": 8110 }, { "epoch": 0.5088675816256188, "grad_norm": 0.04943560063838959, "learning_rate": 4.1957104557640756e-05, "loss": 0.1251, "step": 8120 }, { "epoch": 0.5094942658394435, "grad_norm": 0.1290494054555893, "learning_rate": 4.194654957674527e-05, "loss": 0.1943, "step": 8130 }, { "epoch": 0.5101209500532682, "grad_norm": 0.400713175535202, "learning_rate": 4.193599459584978e-05, "loss": 0.2568, "step": 8140 }, { "epoch": 0.5107476342670928, "grad_norm": 0.07351066917181015, "learning_rate": 4.19254396149543e-05, "loss": 0.0159, "step": 8150 }, { "epoch": 0.5113743184809174, "grad_norm": 3.4980196952819824, "learning_rate": 4.1914884634058815e-05, "loss": 0.2556, "step": 8160 }, { "epoch": 0.5120010026947421, "grad_norm": 0.1484570950269699, "learning_rate": 4.190432965316333e-05, "loss": 0.0522, "step": 8170 }, { "epoch": 0.5126276869085667, "grad_norm": 0.07907593250274658, "learning_rate": 4.189377467226784e-05, "loss": 0.0977, "step": 8180 }, { "epoch": 0.5132543711223915, "grad_norm": 0.056584157049655914, "learning_rate": 4.188321969137236e-05, "loss": 0.1622, "step": 8190 }, { "epoch": 0.5138810553362161, "grad_norm": 0.1531023234128952, "learning_rate": 4.1872664710476875e-05, "loss": 0.0139, "step": 8200 }, { "epoch": 0.5145077395500407, "grad_norm": 0.12977644801139832, "learning_rate": 4.186210972958139e-05, "loss": 0.1056, "step": 8210 }, { "epoch": 0.5151344237638654, "grad_norm": 2.399658679962158, "learning_rate": 4.185155474868591e-05, "loss": 0.2637, "step": 8220 }, { "epoch": 0.51576110797769, "grad_norm": 0.1842947155237198, "learning_rate": 4.1840999767790425e-05, "loss": 0.0058, "step": 8230 }, { "epoch": 0.5163877921915146, "grad_norm": 0.04608267545700073, "learning_rate": 4.183044478689494e-05, "loss": 0.0202, "step": 8240 }, { "epoch": 0.5170144764053394, "grad_norm": 0.1500760167837143, "learning_rate": 4.181988980599945e-05, "loss": 0.1144, "step": 8250 }, { "epoch": 0.517641160619164, "grad_norm": 3.4322853088378906, "learning_rate": 4.180933482510397e-05, "loss": 0.1559, "step": 8260 }, { "epoch": 0.5182678448329887, "grad_norm": 0.09054571390151978, "learning_rate": 4.1798779844208485e-05, "loss": 0.1007, "step": 8270 }, { "epoch": 0.5188945290468133, "grad_norm": 0.19511841237545013, "learning_rate": 4.1788224863312995e-05, "loss": 0.0458, "step": 8280 }, { "epoch": 0.5195212132606379, "grad_norm": 0.062319837510585785, "learning_rate": 4.177766988241751e-05, "loss": 0.0829, "step": 8290 }, { "epoch": 0.5201478974744627, "grad_norm": 0.17525716125965118, "learning_rate": 4.176711490152203e-05, "loss": 0.1351, "step": 8300 }, { "epoch": 0.5207745816882873, "grad_norm": 0.09585020691156387, "learning_rate": 4.1756559920626545e-05, "loss": 0.0419, "step": 8310 }, { "epoch": 0.5214012659021119, "grad_norm": 0.04737556353211403, "learning_rate": 4.174600493973106e-05, "loss": 0.0599, "step": 8320 }, { "epoch": 0.5220279501159366, "grad_norm": 0.3379499912261963, "learning_rate": 4.173544995883558e-05, "loss": 0.1293, "step": 8330 }, { "epoch": 0.5226546343297612, "grad_norm": 0.03692694380879402, "learning_rate": 4.1724894977940095e-05, "loss": 0.0598, "step": 8340 }, { "epoch": 0.5232813185435858, "grad_norm": 0.20106247067451477, "learning_rate": 4.1714339997044605e-05, "loss": 0.0969, "step": 8350 }, { "epoch": 0.5239080027574106, "grad_norm": 0.07053105533123016, "learning_rate": 4.170378501614912e-05, "loss": 0.2857, "step": 8360 }, { "epoch": 0.5245346869712352, "grad_norm": 0.15336304903030396, "learning_rate": 4.169323003525364e-05, "loss": 0.1083, "step": 8370 }, { "epoch": 0.5251613711850599, "grad_norm": 0.30457621812820435, "learning_rate": 4.1682675054358155e-05, "loss": 0.0545, "step": 8380 }, { "epoch": 0.5257880553988845, "grad_norm": 0.03971443325281143, "learning_rate": 4.1672120073462664e-05, "loss": 0.0083, "step": 8390 }, { "epoch": 0.5264147396127091, "grad_norm": 0.09428033977746964, "learning_rate": 4.166156509256718e-05, "loss": 0.1767, "step": 8400 }, { "epoch": 0.5270414238265339, "grad_norm": 0.18962548673152924, "learning_rate": 4.16510101116717e-05, "loss": 0.1163, "step": 8410 }, { "epoch": 0.5276681080403585, "grad_norm": 10.36578369140625, "learning_rate": 4.1640455130776214e-05, "loss": 0.2371, "step": 8420 }, { "epoch": 0.5282947922541831, "grad_norm": 331.3830261230469, "learning_rate": 4.162990014988073e-05, "loss": 0.102, "step": 8430 }, { "epoch": 0.5289214764680078, "grad_norm": 0.3419227600097656, "learning_rate": 4.161934516898525e-05, "loss": 0.0567, "step": 8440 }, { "epoch": 0.5295481606818324, "grad_norm": 0.0799892470240593, "learning_rate": 4.1608790188089764e-05, "loss": 0.0113, "step": 8450 }, { "epoch": 0.530174844895657, "grad_norm": 0.36934471130371094, "learning_rate": 4.1598235207194274e-05, "loss": 0.2659, "step": 8460 }, { "epoch": 0.5308015291094818, "grad_norm": 0.4492112398147583, "learning_rate": 4.158768022629879e-05, "loss": 0.127, "step": 8470 }, { "epoch": 0.5314282133233064, "grad_norm": 0.19504284858703613, "learning_rate": 4.157712524540331e-05, "loss": 0.1399, "step": 8480 }, { "epoch": 0.5320548975371311, "grad_norm": 0.7225850224494934, "learning_rate": 4.1566570264507824e-05, "loss": 0.0433, "step": 8490 }, { "epoch": 0.5326815817509557, "grad_norm": 11.716758728027344, "learning_rate": 4.1556015283612334e-05, "loss": 0.2159, "step": 8500 }, { "epoch": 0.5333082659647803, "grad_norm": 2.664440393447876, "learning_rate": 4.154546030271686e-05, "loss": 0.1738, "step": 8510 }, { "epoch": 0.533934950178605, "grad_norm": 2.3211724758148193, "learning_rate": 4.1534905321821374e-05, "loss": 0.1761, "step": 8520 }, { "epoch": 0.5345616343924297, "grad_norm": 0.3916930854320526, "learning_rate": 4.1524350340925884e-05, "loss": 0.1548, "step": 8530 }, { "epoch": 0.5351883186062543, "grad_norm": 0.0378059558570385, "learning_rate": 4.15137953600304e-05, "loss": 0.1087, "step": 8540 }, { "epoch": 0.535815002820079, "grad_norm": 4.720902919769287, "learning_rate": 4.150324037913492e-05, "loss": 0.2359, "step": 8550 }, { "epoch": 0.5364416870339036, "grad_norm": 0.04153915494680405, "learning_rate": 4.1492685398239434e-05, "loss": 0.1439, "step": 8560 }, { "epoch": 0.5370683712477282, "grad_norm": 0.03464338183403015, "learning_rate": 4.1482130417343944e-05, "loss": 0.1784, "step": 8570 }, { "epoch": 0.5376950554615529, "grad_norm": 0.17074334621429443, "learning_rate": 4.147157543644846e-05, "loss": 0.1967, "step": 8580 }, { "epoch": 0.5383217396753776, "grad_norm": 0.5356113314628601, "learning_rate": 4.146102045555298e-05, "loss": 0.2086, "step": 8590 }, { "epoch": 0.5389484238892023, "grad_norm": 0.6513091921806335, "learning_rate": 4.1450465474657494e-05, "loss": 0.1085, "step": 8600 }, { "epoch": 0.5395751081030269, "grad_norm": 0.09826638549566269, "learning_rate": 4.143991049376201e-05, "loss": 0.073, "step": 8610 }, { "epoch": 0.5402017923168515, "grad_norm": 3.7576205730438232, "learning_rate": 4.142935551286653e-05, "loss": 0.1715, "step": 8620 }, { "epoch": 0.5408284765306762, "grad_norm": 2.084686279296875, "learning_rate": 4.1418800531971044e-05, "loss": 0.1808, "step": 8630 }, { "epoch": 0.5414551607445008, "grad_norm": 6.913938999176025, "learning_rate": 4.1408245551075554e-05, "loss": 0.0754, "step": 8640 }, { "epoch": 0.5420818449583255, "grad_norm": 0.18210375308990479, "learning_rate": 4.139769057018007e-05, "loss": 0.0596, "step": 8650 }, { "epoch": 0.5427085291721502, "grad_norm": 0.15195395052433014, "learning_rate": 4.138713558928459e-05, "loss": 0.1203, "step": 8660 }, { "epoch": 0.5433352133859748, "grad_norm": 2.44944429397583, "learning_rate": 4.13765806083891e-05, "loss": 0.1983, "step": 8670 }, { "epoch": 0.5439618975997995, "grad_norm": 0.9037182331085205, "learning_rate": 4.1366025627493613e-05, "loss": 0.0473, "step": 8680 }, { "epoch": 0.5445885818136241, "grad_norm": 0.0421769879758358, "learning_rate": 4.135547064659813e-05, "loss": 0.2814, "step": 8690 }, { "epoch": 0.5452152660274487, "grad_norm": 0.23261401057243347, "learning_rate": 4.134491566570265e-05, "loss": 0.1457, "step": 8700 }, { "epoch": 0.5458419502412735, "grad_norm": 0.0654134452342987, "learning_rate": 4.1334360684807163e-05, "loss": 0.026, "step": 8710 }, { "epoch": 0.5464686344550981, "grad_norm": 4.39208459854126, "learning_rate": 4.132380570391168e-05, "loss": 0.0946, "step": 8720 }, { "epoch": 0.5470953186689227, "grad_norm": 0.5686179399490356, "learning_rate": 4.13132507230162e-05, "loss": 0.1669, "step": 8730 }, { "epoch": 0.5477220028827474, "grad_norm": 4.334964275360107, "learning_rate": 4.1302695742120707e-05, "loss": 0.1729, "step": 8740 }, { "epoch": 0.548348687096572, "grad_norm": 0.15098807215690613, "learning_rate": 4.129214076122522e-05, "loss": 0.1916, "step": 8750 }, { "epoch": 0.5489753713103966, "grad_norm": 0.24327068030834198, "learning_rate": 4.128158578032974e-05, "loss": 0.1069, "step": 8760 }, { "epoch": 0.5496020555242214, "grad_norm": 0.160555899143219, "learning_rate": 4.1271030799434257e-05, "loss": 0.1482, "step": 8770 }, { "epoch": 0.550228739738046, "grad_norm": 0.2681258022785187, "learning_rate": 4.1260475818538766e-05, "loss": 0.0383, "step": 8780 }, { "epoch": 0.5508554239518707, "grad_norm": 0.02957313507795334, "learning_rate": 4.124992083764328e-05, "loss": 0.006, "step": 8790 }, { "epoch": 0.5514821081656953, "grad_norm": 0.49831515550613403, "learning_rate": 4.12393658567478e-05, "loss": 0.1116, "step": 8800 }, { "epoch": 0.5521087923795199, "grad_norm": 0.022528450936079025, "learning_rate": 4.1228810875852316e-05, "loss": 0.0708, "step": 8810 }, { "epoch": 0.5527354765933447, "grad_norm": 0.4816558063030243, "learning_rate": 4.121825589495683e-05, "loss": 0.2754, "step": 8820 }, { "epoch": 0.5533621608071693, "grad_norm": 0.4593820571899414, "learning_rate": 4.120770091406135e-05, "loss": 0.1499, "step": 8830 }, { "epoch": 0.5539888450209939, "grad_norm": 0.5131998062133789, "learning_rate": 4.1197145933165866e-05, "loss": 0.1313, "step": 8840 }, { "epoch": 0.5546155292348186, "grad_norm": 0.5724000930786133, "learning_rate": 4.1186590952270376e-05, "loss": 0.2599, "step": 8850 }, { "epoch": 0.5552422134486432, "grad_norm": 0.21304863691329956, "learning_rate": 4.117603597137489e-05, "loss": 0.2477, "step": 8860 }, { "epoch": 0.5558688976624678, "grad_norm": 0.1989622563123703, "learning_rate": 4.116548099047941e-05, "loss": 0.1082, "step": 8870 }, { "epoch": 0.5564955818762926, "grad_norm": 0.16896386444568634, "learning_rate": 4.1154926009583926e-05, "loss": 0.1285, "step": 8880 }, { "epoch": 0.5571222660901172, "grad_norm": 0.305745005607605, "learning_rate": 4.1144371028688436e-05, "loss": 0.2389, "step": 8890 }, { "epoch": 0.5577489503039419, "grad_norm": 0.23722957074642181, "learning_rate": 4.113381604779295e-05, "loss": 0.2969, "step": 8900 }, { "epoch": 0.5583756345177665, "grad_norm": 0.3046024739742279, "learning_rate": 4.1123261066897476e-05, "loss": 0.1207, "step": 8910 }, { "epoch": 0.5590023187315911, "grad_norm": 5.19449520111084, "learning_rate": 4.1112706086001986e-05, "loss": 0.1542, "step": 8920 }, { "epoch": 0.5596290029454158, "grad_norm": 0.25576645135879517, "learning_rate": 4.11021511051065e-05, "loss": 0.0896, "step": 8930 }, { "epoch": 0.5602556871592405, "grad_norm": 0.04636983945965767, "learning_rate": 4.109159612421102e-05, "loss": 0.0247, "step": 8940 }, { "epoch": 0.5608823713730651, "grad_norm": 1.0605615377426147, "learning_rate": 4.1081041143315536e-05, "loss": 0.2397, "step": 8950 }, { "epoch": 0.5615090555868898, "grad_norm": 0.077987901866436, "learning_rate": 4.1070486162420046e-05, "loss": 0.0849, "step": 8960 }, { "epoch": 0.5621357398007144, "grad_norm": 0.4862576425075531, "learning_rate": 4.105993118152456e-05, "loss": 0.2118, "step": 8970 }, { "epoch": 0.562762424014539, "grad_norm": 0.3730028569698334, "learning_rate": 4.104937620062908e-05, "loss": 0.2261, "step": 8980 }, { "epoch": 0.5633891082283637, "grad_norm": 0.4581931233406067, "learning_rate": 4.103882121973359e-05, "loss": 0.1878, "step": 8990 }, { "epoch": 0.5640157924421884, "grad_norm": 0.2025977373123169, "learning_rate": 4.1028266238838106e-05, "loss": 0.2524, "step": 9000 }, { "epoch": 0.5646424766560131, "grad_norm": 0.7682130336761475, "learning_rate": 4.101771125794263e-05, "loss": 0.1354, "step": 9010 }, { "epoch": 0.5652691608698377, "grad_norm": 0.149651437997818, "learning_rate": 4.1007156277047146e-05, "loss": 0.0785, "step": 9020 }, { "epoch": 0.5658958450836623, "grad_norm": 0.07458806782960892, "learning_rate": 4.0996601296151656e-05, "loss": 0.0196, "step": 9030 }, { "epoch": 0.566522529297487, "grad_norm": 44.61407470703125, "learning_rate": 4.098604631525617e-05, "loss": 0.0744, "step": 9040 }, { "epoch": 0.5671492135113116, "grad_norm": 0.7599819898605347, "learning_rate": 4.097549133436069e-05, "loss": 0.1664, "step": 9050 }, { "epoch": 0.5677758977251363, "grad_norm": 0.8320948481559753, "learning_rate": 4.09649363534652e-05, "loss": 0.2156, "step": 9060 }, { "epoch": 0.568402581938961, "grad_norm": 0.09615202993154526, "learning_rate": 4.0954381372569715e-05, "loss": 0.0353, "step": 9070 }, { "epoch": 0.5690292661527856, "grad_norm": 3.353734254837036, "learning_rate": 4.094382639167423e-05, "loss": 0.1515, "step": 9080 }, { "epoch": 0.5696559503666103, "grad_norm": 0.161566361784935, "learning_rate": 4.093327141077875e-05, "loss": 0.2015, "step": 9090 }, { "epoch": 0.5702826345804349, "grad_norm": 0.31871262192726135, "learning_rate": 4.0922716429883265e-05, "loss": 0.1474, "step": 9100 }, { "epoch": 0.5709093187942595, "grad_norm": 2.3990204334259033, "learning_rate": 4.091216144898778e-05, "loss": 0.2111, "step": 9110 }, { "epoch": 0.5715360030080843, "grad_norm": 0.35437873005867004, "learning_rate": 4.09016064680923e-05, "loss": 0.1354, "step": 9120 }, { "epoch": 0.5721626872219089, "grad_norm": 0.26824936270713806, "learning_rate": 4.089105148719681e-05, "loss": 0.2922, "step": 9130 }, { "epoch": 0.5727893714357335, "grad_norm": 0.1944427490234375, "learning_rate": 4.0880496506301325e-05, "loss": 0.0781, "step": 9140 }, { "epoch": 0.5734160556495582, "grad_norm": 0.08825784921646118, "learning_rate": 4.086994152540584e-05, "loss": 0.1337, "step": 9150 }, { "epoch": 0.5740427398633828, "grad_norm": 0.08061042428016663, "learning_rate": 4.085938654451036e-05, "loss": 0.0975, "step": 9160 }, { "epoch": 0.5746694240772074, "grad_norm": 0.2497250884771347, "learning_rate": 4.084883156361487e-05, "loss": 0.0555, "step": 9170 }, { "epoch": 0.5752961082910322, "grad_norm": 0.11670225113630295, "learning_rate": 4.0838276582719385e-05, "loss": 0.0195, "step": 9180 }, { "epoch": 0.5759227925048568, "grad_norm": 0.09469518810510635, "learning_rate": 4.08277216018239e-05, "loss": 0.1508, "step": 9190 }, { "epoch": 0.5765494767186815, "grad_norm": 0.04566322639584541, "learning_rate": 4.081716662092842e-05, "loss": 0.1271, "step": 9200 }, { "epoch": 0.5771761609325061, "grad_norm": 0.6333693861961365, "learning_rate": 4.0806611640032935e-05, "loss": 0.2478, "step": 9210 }, { "epoch": 0.5778028451463307, "grad_norm": 0.05764749273657799, "learning_rate": 4.079605665913745e-05, "loss": 0.0117, "step": 9220 }, { "epoch": 0.5784295293601555, "grad_norm": 0.6019806265830994, "learning_rate": 4.078550167824197e-05, "loss": 0.1013, "step": 9230 }, { "epoch": 0.5790562135739801, "grad_norm": 0.11212746053934097, "learning_rate": 4.077494669734648e-05, "loss": 0.0529, "step": 9240 }, { "epoch": 0.5796828977878047, "grad_norm": 0.045619022101163864, "learning_rate": 4.0764391716450995e-05, "loss": 0.2901, "step": 9250 }, { "epoch": 0.5803095820016294, "grad_norm": 0.20387081801891327, "learning_rate": 4.075383673555551e-05, "loss": 0.3294, "step": 9260 }, { "epoch": 0.580936266215454, "grad_norm": 0.2026294320821762, "learning_rate": 4.074328175466003e-05, "loss": 0.0859, "step": 9270 }, { "epoch": 0.5815629504292786, "grad_norm": 0.07742750644683838, "learning_rate": 4.073272677376454e-05, "loss": 0.0185, "step": 9280 }, { "epoch": 0.5821896346431034, "grad_norm": 0.06963720172643661, "learning_rate": 4.0722171792869055e-05, "loss": 0.2457, "step": 9290 }, { "epoch": 0.582816318856928, "grad_norm": 0.0938970223069191, "learning_rate": 4.071161681197357e-05, "loss": 0.0966, "step": 9300 }, { "epoch": 0.5834430030707527, "grad_norm": 0.07957907766103745, "learning_rate": 4.070106183107809e-05, "loss": 0.0115, "step": 9310 }, { "epoch": 0.5840696872845773, "grad_norm": 0.06466685980558395, "learning_rate": 4.0690506850182605e-05, "loss": 0.1959, "step": 9320 }, { "epoch": 0.5846963714984019, "grad_norm": 8.07848834991455, "learning_rate": 4.067995186928712e-05, "loss": 0.1698, "step": 9330 }, { "epoch": 0.5853230557122266, "grad_norm": 3.3286468982696533, "learning_rate": 4.066939688839164e-05, "loss": 0.1606, "step": 9340 }, { "epoch": 0.5859497399260513, "grad_norm": 0.11040914058685303, "learning_rate": 4.065884190749615e-05, "loss": 0.1459, "step": 9350 }, { "epoch": 0.5865764241398759, "grad_norm": 0.13724346458911896, "learning_rate": 4.0648286926600664e-05, "loss": 0.0829, "step": 9360 }, { "epoch": 0.5872031083537006, "grad_norm": 0.22262322902679443, "learning_rate": 4.063773194570518e-05, "loss": 0.1914, "step": 9370 }, { "epoch": 0.5878297925675252, "grad_norm": 0.16995050013065338, "learning_rate": 4.062717696480969e-05, "loss": 0.2103, "step": 9380 }, { "epoch": 0.5884564767813498, "grad_norm": 0.22897568345069885, "learning_rate": 4.061662198391421e-05, "loss": 0.1384, "step": 9390 }, { "epoch": 0.5890831609951745, "grad_norm": 3.172271490097046, "learning_rate": 4.0606067003018724e-05, "loss": 0.0875, "step": 9400 }, { "epoch": 0.5897098452089992, "grad_norm": 0.12132357060909271, "learning_rate": 4.059551202212325e-05, "loss": 0.1005, "step": 9410 }, { "epoch": 0.5903365294228239, "grad_norm": 0.09609333425760269, "learning_rate": 4.058495704122776e-05, "loss": 0.046, "step": 9420 }, { "epoch": 0.5909632136366485, "grad_norm": 3.4071130752563477, "learning_rate": 4.0574402060332274e-05, "loss": 0.1476, "step": 9430 }, { "epoch": 0.5915898978504731, "grad_norm": 0.08280141651630402, "learning_rate": 4.056384707943679e-05, "loss": 0.1378, "step": 9440 }, { "epoch": 0.5922165820642978, "grad_norm": 0.13492263853549957, "learning_rate": 4.05532920985413e-05, "loss": 0.2137, "step": 9450 }, { "epoch": 0.5928432662781224, "grad_norm": 2.974963665008545, "learning_rate": 4.054273711764582e-05, "loss": 0.183, "step": 9460 }, { "epoch": 0.5934699504919471, "grad_norm": 2.765462636947632, "learning_rate": 4.0532182136750334e-05, "loss": 0.168, "step": 9470 }, { "epoch": 0.5940966347057718, "grad_norm": 0.14917774498462677, "learning_rate": 4.052162715585485e-05, "loss": 0.1352, "step": 9480 }, { "epoch": 0.5947233189195964, "grad_norm": 0.1324753761291504, "learning_rate": 4.051107217495936e-05, "loss": 0.0498, "step": 9490 }, { "epoch": 0.5953500031334211, "grad_norm": 3.4256207942962646, "learning_rate": 4.050051719406388e-05, "loss": 0.3867, "step": 9500 }, { "epoch": 0.5959766873472457, "grad_norm": 2.0906643867492676, "learning_rate": 4.04899622131684e-05, "loss": 0.295, "step": 9510 }, { "epoch": 0.5966033715610704, "grad_norm": 0.2582828104496002, "learning_rate": 4.047940723227291e-05, "loss": 0.2008, "step": 9520 }, { "epoch": 0.5972300557748951, "grad_norm": 0.3075207769870758, "learning_rate": 4.046885225137743e-05, "loss": 0.0913, "step": 9530 }, { "epoch": 0.5978567399887197, "grad_norm": 0.5495119690895081, "learning_rate": 4.0458297270481944e-05, "loss": 0.115, "step": 9540 }, { "epoch": 0.5984834242025443, "grad_norm": 0.28728142380714417, "learning_rate": 4.044774228958646e-05, "loss": 0.1403, "step": 9550 }, { "epoch": 0.599110108416369, "grad_norm": 0.25113600492477417, "learning_rate": 4.043718730869097e-05, "loss": 0.0978, "step": 9560 }, { "epoch": 0.5997367926301936, "grad_norm": 11.35430908203125, "learning_rate": 4.042663232779549e-05, "loss": 0.2455, "step": 9570 }, { "epoch": 0.6003634768440183, "grad_norm": 0.2805491089820862, "learning_rate": 4.0416077346900004e-05, "loss": 0.1732, "step": 9580 }, { "epoch": 0.600990161057843, "grad_norm": 0.5016766786575317, "learning_rate": 4.0405522366004514e-05, "loss": 0.1413, "step": 9590 }, { "epoch": 0.6016168452716676, "grad_norm": 0.1248757615685463, "learning_rate": 4.039496738510904e-05, "loss": 0.2073, "step": 9600 }, { "epoch": 0.6022435294854923, "grad_norm": 0.18444593250751495, "learning_rate": 4.0384412404213554e-05, "loss": 0.1955, "step": 9610 }, { "epoch": 0.6028702136993169, "grad_norm": 1.08712899684906, "learning_rate": 4.037385742331807e-05, "loss": 0.1148, "step": 9620 }, { "epoch": 0.6034968979131415, "grad_norm": 0.23765408992767334, "learning_rate": 4.036330244242258e-05, "loss": 0.0364, "step": 9630 }, { "epoch": 0.6041235821269663, "grad_norm": 0.44067704677581787, "learning_rate": 4.03527474615271e-05, "loss": 0.2301, "step": 9640 }, { "epoch": 0.6047502663407909, "grad_norm": 0.13342992961406708, "learning_rate": 4.0342192480631613e-05, "loss": 0.0981, "step": 9650 }, { "epoch": 0.6053769505546155, "grad_norm": 14.491141319274902, "learning_rate": 4.033163749973613e-05, "loss": 0.2648, "step": 9660 }, { "epoch": 0.6060036347684402, "grad_norm": 0.32491153478622437, "learning_rate": 4.032108251884064e-05, "loss": 0.058, "step": 9670 }, { "epoch": 0.6066303189822648, "grad_norm": 0.49822214245796204, "learning_rate": 4.0310527537945157e-05, "loss": 0.1284, "step": 9680 }, { "epoch": 0.6072570031960894, "grad_norm": 0.06606928259134293, "learning_rate": 4.029997255704967e-05, "loss": 0.1202, "step": 9690 }, { "epoch": 0.6078836874099142, "grad_norm": 0.10034257173538208, "learning_rate": 4.028941757615419e-05, "loss": 0.1395, "step": 9700 }, { "epoch": 0.6085103716237388, "grad_norm": 3.284310817718506, "learning_rate": 4.0278862595258707e-05, "loss": 0.2535, "step": 9710 }, { "epoch": 0.6091370558375635, "grad_norm": 3.463360548019409, "learning_rate": 4.026830761436322e-05, "loss": 0.1223, "step": 9720 }, { "epoch": 0.6097637400513881, "grad_norm": 0.18776389956474304, "learning_rate": 4.025775263346774e-05, "loss": 0.0571, "step": 9730 }, { "epoch": 0.6103904242652127, "grad_norm": 0.12362643331289291, "learning_rate": 4.024719765257225e-05, "loss": 0.0691, "step": 9740 }, { "epoch": 0.6110171084790375, "grad_norm": 6.846742630004883, "learning_rate": 4.0236642671676766e-05, "loss": 0.1198, "step": 9750 }, { "epoch": 0.6116437926928621, "grad_norm": 3.5688319206237793, "learning_rate": 4.022608769078128e-05, "loss": 0.2297, "step": 9760 }, { "epoch": 0.6122704769066867, "grad_norm": 8.02388858795166, "learning_rate": 4.021553270988579e-05, "loss": 0.1796, "step": 9770 }, { "epoch": 0.6128971611205114, "grad_norm": 0.3712387681007385, "learning_rate": 4.020497772899031e-05, "loss": 0.2178, "step": 9780 }, { "epoch": 0.613523845334336, "grad_norm": 0.2477940171957016, "learning_rate": 4.0194422748094826e-05, "loss": 0.0652, "step": 9790 }, { "epoch": 0.6141505295481606, "grad_norm": 4.8273210525512695, "learning_rate": 4.018386776719934e-05, "loss": 0.1627, "step": 9800 }, { "epoch": 0.6147772137619854, "grad_norm": 0.5402454137802124, "learning_rate": 4.017331278630386e-05, "loss": 0.1057, "step": 9810 }, { "epoch": 0.61540389797581, "grad_norm": 0.2719340920448303, "learning_rate": 4.0162757805408376e-05, "loss": 0.2223, "step": 9820 }, { "epoch": 0.6160305821896347, "grad_norm": 0.45198434591293335, "learning_rate": 4.015220282451289e-05, "loss": 0.1297, "step": 9830 }, { "epoch": 0.6166572664034593, "grad_norm": 2.249263048171997, "learning_rate": 4.01416478436174e-05, "loss": 0.3277, "step": 9840 }, { "epoch": 0.6172839506172839, "grad_norm": 2.3682870864868164, "learning_rate": 4.013109286272192e-05, "loss": 0.0852, "step": 9850 }, { "epoch": 0.6179106348311086, "grad_norm": 0.9312769770622253, "learning_rate": 4.0120537881826436e-05, "loss": 0.1365, "step": 9860 }, { "epoch": 0.6185373190449333, "grad_norm": 0.24179698526859283, "learning_rate": 4.010998290093095e-05, "loss": 0.1263, "step": 9870 }, { "epoch": 0.6191640032587579, "grad_norm": 0.24173693358898163, "learning_rate": 4.009942792003546e-05, "loss": 0.1996, "step": 9880 }, { "epoch": 0.6197906874725826, "grad_norm": 0.07061693072319031, "learning_rate": 4.008887293913998e-05, "loss": 0.2337, "step": 9890 }, { "epoch": 0.6204173716864072, "grad_norm": 0.650595486164093, "learning_rate": 4.0078317958244496e-05, "loss": 0.1252, "step": 9900 }, { "epoch": 0.6210440559002319, "grad_norm": 0.09584268927574158, "learning_rate": 4.006776297734901e-05, "loss": 0.0382, "step": 9910 }, { "epoch": 0.6216707401140565, "grad_norm": 61.076053619384766, "learning_rate": 4.005720799645353e-05, "loss": 0.0504, "step": 9920 }, { "epoch": 0.6222974243278812, "grad_norm": 0.37995097041130066, "learning_rate": 4.0046653015558046e-05, "loss": 0.0523, "step": 9930 }, { "epoch": 0.6229241085417059, "grad_norm": 15.864818572998047, "learning_rate": 4.003609803466256e-05, "loss": 0.2305, "step": 9940 }, { "epoch": 0.6235507927555305, "grad_norm": 31.124530792236328, "learning_rate": 4.002554305376707e-05, "loss": 0.0783, "step": 9950 }, { "epoch": 0.6241774769693551, "grad_norm": 3.1052074432373047, "learning_rate": 4.001498807287159e-05, "loss": 0.0864, "step": 9960 }, { "epoch": 0.6248041611831798, "grad_norm": 0.4385007619857788, "learning_rate": 4.0004433091976106e-05, "loss": 0.323, "step": 9970 }, { "epoch": 0.6254308453970044, "grad_norm": 2.410914659500122, "learning_rate": 3.9993878111080615e-05, "loss": 0.0486, "step": 9980 }, { "epoch": 0.626057529610829, "grad_norm": 0.31448647379875183, "learning_rate": 3.998332313018513e-05, "loss": 0.1045, "step": 9990 }, { "epoch": 0.6266842138246538, "grad_norm": 0.04953588917851448, "learning_rate": 3.9972768149289656e-05, "loss": 0.0343, "step": 10000 }, { "epoch": 0.6273108980384784, "grad_norm": 0.04356538504362106, "learning_rate": 3.996221316839417e-05, "loss": 0.2221, "step": 10010 }, { "epoch": 0.6279375822523031, "grad_norm": 0.3184496760368347, "learning_rate": 3.995165818749868e-05, "loss": 0.2251, "step": 10020 }, { "epoch": 0.6285642664661277, "grad_norm": 0.6410167217254639, "learning_rate": 3.99411032066032e-05, "loss": 0.0652, "step": 10030 }, { "epoch": 0.6291909506799523, "grad_norm": 3.2516913414001465, "learning_rate": 3.9930548225707715e-05, "loss": 0.1548, "step": 10040 }, { "epoch": 0.6298176348937771, "grad_norm": 0.24519649147987366, "learning_rate": 3.991999324481223e-05, "loss": 0.1789, "step": 10050 }, { "epoch": 0.6304443191076017, "grad_norm": 0.1177930012345314, "learning_rate": 3.990943826391674e-05, "loss": 0.1067, "step": 10060 }, { "epoch": 0.6310710033214263, "grad_norm": 0.08309656381607056, "learning_rate": 3.989888328302126e-05, "loss": 0.0873, "step": 10070 }, { "epoch": 0.631697687535251, "grad_norm": 0.3515457212924957, "learning_rate": 3.9888328302125775e-05, "loss": 0.3244, "step": 10080 }, { "epoch": 0.6323243717490756, "grad_norm": 0.4474419355392456, "learning_rate": 3.9877773321230285e-05, "loss": 0.1005, "step": 10090 }, { "epoch": 0.6329510559629002, "grad_norm": 0.13745348155498505, "learning_rate": 3.986721834033481e-05, "loss": 0.2018, "step": 10100 }, { "epoch": 0.633577740176725, "grad_norm": 0.32032960653305054, "learning_rate": 3.9856663359439325e-05, "loss": 0.069, "step": 10110 }, { "epoch": 0.6342044243905496, "grad_norm": 2.037172317504883, "learning_rate": 3.984610837854384e-05, "loss": 0.0672, "step": 10120 }, { "epoch": 0.6348311086043743, "grad_norm": 5.471995830535889, "learning_rate": 3.983555339764835e-05, "loss": 0.2724, "step": 10130 }, { "epoch": 0.6354577928181989, "grad_norm": 0.14681348204612732, "learning_rate": 3.982499841675287e-05, "loss": 0.1718, "step": 10140 }, { "epoch": 0.6360844770320235, "grad_norm": 1.499589443206787, "learning_rate": 3.9814443435857385e-05, "loss": 0.1107, "step": 10150 }, { "epoch": 0.6367111612458483, "grad_norm": 17.15564727783203, "learning_rate": 3.9803888454961895e-05, "loss": 0.1474, "step": 10160 }, { "epoch": 0.6373378454596729, "grad_norm": 2.377847909927368, "learning_rate": 3.979333347406641e-05, "loss": 0.0473, "step": 10170 }, { "epoch": 0.6379645296734975, "grad_norm": 1.3150869607925415, "learning_rate": 3.978277849317093e-05, "loss": 0.2218, "step": 10180 }, { "epoch": 0.6385912138873222, "grad_norm": 0.0968065857887268, "learning_rate": 3.9772223512275445e-05, "loss": 0.1458, "step": 10190 }, { "epoch": 0.6392178981011468, "grad_norm": 0.0983181819319725, "learning_rate": 3.976166853137996e-05, "loss": 0.0453, "step": 10200 }, { "epoch": 0.6398445823149714, "grad_norm": 0.128788560628891, "learning_rate": 3.975111355048448e-05, "loss": 0.077, "step": 10210 }, { "epoch": 0.6404712665287962, "grad_norm": 0.09855977445840836, "learning_rate": 3.9740558569588995e-05, "loss": 0.1964, "step": 10220 }, { "epoch": 0.6410979507426208, "grad_norm": 0.07261627167463303, "learning_rate": 3.9730003588693505e-05, "loss": 0.117, "step": 10230 }, { "epoch": 0.6417246349564455, "grad_norm": 0.1142692044377327, "learning_rate": 3.971944860779802e-05, "loss": 0.0995, "step": 10240 }, { "epoch": 0.6423513191702701, "grad_norm": 0.12659485638141632, "learning_rate": 3.970889362690254e-05, "loss": 0.0944, "step": 10250 }, { "epoch": 0.6429780033840947, "grad_norm": 7.10390567779541, "learning_rate": 3.9698338646007055e-05, "loss": 0.3312, "step": 10260 }, { "epoch": 0.6436046875979194, "grad_norm": 0.6107752919197083, "learning_rate": 3.9687783665111564e-05, "loss": 0.0876, "step": 10270 }, { "epoch": 0.6442313718117441, "grad_norm": 0.16075240075588226, "learning_rate": 3.967722868421608e-05, "loss": 0.0315, "step": 10280 }, { "epoch": 0.6448580560255687, "grad_norm": 0.17103694379329681, "learning_rate": 3.96666737033206e-05, "loss": 0.1716, "step": 10290 }, { "epoch": 0.6454847402393934, "grad_norm": 0.08142802864313126, "learning_rate": 3.9656118722425114e-05, "loss": 0.1019, "step": 10300 }, { "epoch": 0.646111424453218, "grad_norm": 0.07293316721916199, "learning_rate": 3.964556374152963e-05, "loss": 0.1578, "step": 10310 }, { "epoch": 0.6467381086670427, "grad_norm": 0.09897245466709137, "learning_rate": 3.963500876063415e-05, "loss": 0.1362, "step": 10320 }, { "epoch": 0.6473647928808673, "grad_norm": 3.195725202560425, "learning_rate": 3.9624453779738664e-05, "loss": 0.073, "step": 10330 }, { "epoch": 0.647991477094692, "grad_norm": 2.7055459022521973, "learning_rate": 3.9613898798843174e-05, "loss": 0.3671, "step": 10340 }, { "epoch": 0.6486181613085167, "grad_norm": 0.9218514561653137, "learning_rate": 3.960334381794769e-05, "loss": 0.0901, "step": 10350 }, { "epoch": 0.6492448455223413, "grad_norm": 0.10140139609575272, "learning_rate": 3.959278883705221e-05, "loss": 0.1005, "step": 10360 }, { "epoch": 0.6498715297361659, "grad_norm": 0.12973734736442566, "learning_rate": 3.958223385615672e-05, "loss": 0.098, "step": 10370 }, { "epoch": 0.6504982139499906, "grad_norm": 0.13121546804904938, "learning_rate": 3.9571678875261234e-05, "loss": 0.0977, "step": 10380 }, { "epoch": 0.6511248981638152, "grad_norm": 0.043394092470407486, "learning_rate": 3.956112389436575e-05, "loss": 0.2903, "step": 10390 }, { "epoch": 0.6517515823776399, "grad_norm": 0.1369188278913498, "learning_rate": 3.955056891347027e-05, "loss": 0.0737, "step": 10400 }, { "epoch": 0.6523782665914646, "grad_norm": 0.2751493752002716, "learning_rate": 3.9540013932574784e-05, "loss": 0.0196, "step": 10410 }, { "epoch": 0.6530049508052892, "grad_norm": 11.32926082611084, "learning_rate": 3.95294589516793e-05, "loss": 0.1617, "step": 10420 }, { "epoch": 0.6536316350191139, "grad_norm": 0.5413795113563538, "learning_rate": 3.951890397078382e-05, "loss": 0.1212, "step": 10430 }, { "epoch": 0.6542583192329385, "grad_norm": 0.045053768903017044, "learning_rate": 3.950834898988833e-05, "loss": 0.2104, "step": 10440 }, { "epoch": 0.6548850034467631, "grad_norm": 3.709371328353882, "learning_rate": 3.9497794008992844e-05, "loss": 0.1815, "step": 10450 }, { "epoch": 0.6555116876605879, "grad_norm": 0.1551133543252945, "learning_rate": 3.948723902809736e-05, "loss": 0.2463, "step": 10460 }, { "epoch": 0.6561383718744125, "grad_norm": 0.2704763114452362, "learning_rate": 3.947668404720188e-05, "loss": 0.2314, "step": 10470 }, { "epoch": 0.6567650560882371, "grad_norm": 0.2972564399242401, "learning_rate": 3.946612906630639e-05, "loss": 0.2011, "step": 10480 }, { "epoch": 0.6573917403020618, "grad_norm": 3.8430252075195312, "learning_rate": 3.9455574085410904e-05, "loss": 0.1605, "step": 10490 }, { "epoch": 0.6580184245158864, "grad_norm": 2.99223256111145, "learning_rate": 3.944501910451543e-05, "loss": 0.2343, "step": 10500 }, { "epoch": 0.658645108729711, "grad_norm": 2.406914710998535, "learning_rate": 3.9434464123619944e-05, "loss": 0.3027, "step": 10510 }, { "epoch": 0.6592717929435358, "grad_norm": 5.0974249839782715, "learning_rate": 3.9423909142724454e-05, "loss": 0.2039, "step": 10520 }, { "epoch": 0.6598984771573604, "grad_norm": 0.17414742708206177, "learning_rate": 3.941335416182897e-05, "loss": 0.0915, "step": 10530 }, { "epoch": 0.6605251613711851, "grad_norm": 2.815800189971924, "learning_rate": 3.940279918093349e-05, "loss": 0.1515, "step": 10540 }, { "epoch": 0.6611518455850097, "grad_norm": 0.3265558183193207, "learning_rate": 3.9392244200038e-05, "loss": 0.0477, "step": 10550 }, { "epoch": 0.6617785297988343, "grad_norm": 0.16620047390460968, "learning_rate": 3.9381689219142513e-05, "loss": 0.1954, "step": 10560 }, { "epoch": 0.6624052140126591, "grad_norm": 3.4621222019195557, "learning_rate": 3.937113423824703e-05, "loss": 0.2202, "step": 10570 }, { "epoch": 0.6630318982264837, "grad_norm": 0.23589849472045898, "learning_rate": 3.936057925735155e-05, "loss": 0.1349, "step": 10580 }, { "epoch": 0.6636585824403083, "grad_norm": 0.573025643825531, "learning_rate": 3.935002427645606e-05, "loss": 0.1222, "step": 10590 }, { "epoch": 0.664285266654133, "grad_norm": 0.22413015365600586, "learning_rate": 3.933946929556058e-05, "loss": 0.1157, "step": 10600 }, { "epoch": 0.6649119508679576, "grad_norm": 0.8842704892158508, "learning_rate": 3.93289143146651e-05, "loss": 0.0963, "step": 10610 }, { "epoch": 0.6655386350817822, "grad_norm": 2.976203680038452, "learning_rate": 3.931835933376961e-05, "loss": 0.1808, "step": 10620 }, { "epoch": 0.666165319295607, "grad_norm": 0.048292968422174454, "learning_rate": 3.930780435287412e-05, "loss": 0.0867, "step": 10630 }, { "epoch": 0.6667920035094316, "grad_norm": 0.40534451603889465, "learning_rate": 3.929724937197864e-05, "loss": 0.2062, "step": 10640 }, { "epoch": 0.6674186877232563, "grad_norm": 0.3869112432003021, "learning_rate": 3.9286694391083157e-05, "loss": 0.0908, "step": 10650 }, { "epoch": 0.6680453719370809, "grad_norm": 3.531752824783325, "learning_rate": 3.9276139410187666e-05, "loss": 0.197, "step": 10660 }, { "epoch": 0.6686720561509055, "grad_norm": 1.0175777673721313, "learning_rate": 3.926558442929218e-05, "loss": 0.1265, "step": 10670 }, { "epoch": 0.6692987403647302, "grad_norm": 55.90322494506836, "learning_rate": 3.92550294483967e-05, "loss": 0.0909, "step": 10680 }, { "epoch": 0.6699254245785549, "grad_norm": 0.06375054270029068, "learning_rate": 3.9244474467501216e-05, "loss": 0.3127, "step": 10690 }, { "epoch": 0.6705521087923795, "grad_norm": 1.1217492818832397, "learning_rate": 3.923391948660573e-05, "loss": 0.0276, "step": 10700 }, { "epoch": 0.6711787930062042, "grad_norm": 0.06201806664466858, "learning_rate": 3.922336450571025e-05, "loss": 0.009, "step": 10710 }, { "epoch": 0.6718054772200288, "grad_norm": 28.970144271850586, "learning_rate": 3.9212809524814766e-05, "loss": 0.016, "step": 10720 }, { "epoch": 0.6724321614338534, "grad_norm": 0.061413247138261795, "learning_rate": 3.9202254543919276e-05, "loss": 0.2635, "step": 10730 }, { "epoch": 0.6730588456476782, "grad_norm": 10.785362243652344, "learning_rate": 3.919169956302379e-05, "loss": 0.325, "step": 10740 }, { "epoch": 0.6736855298615028, "grad_norm": 0.165009006857872, "learning_rate": 3.918114458212831e-05, "loss": 0.2889, "step": 10750 }, { "epoch": 0.6743122140753275, "grad_norm": 0.1995621770620346, "learning_rate": 3.917058960123282e-05, "loss": 0.0757, "step": 10760 }, { "epoch": 0.6749388982891521, "grad_norm": 12.344490051269531, "learning_rate": 3.9160034620337336e-05, "loss": 0.0832, "step": 10770 }, { "epoch": 0.6755655825029767, "grad_norm": 0.12043475359678268, "learning_rate": 3.914947963944185e-05, "loss": 0.1071, "step": 10780 }, { "epoch": 0.6761922667168014, "grad_norm": 3.2827420234680176, "learning_rate": 3.913892465854637e-05, "loss": 0.2466, "step": 10790 }, { "epoch": 0.676818950930626, "grad_norm": 2.662658929824829, "learning_rate": 3.9128369677650886e-05, "loss": 0.2005, "step": 10800 }, { "epoch": 0.6774456351444507, "grad_norm": 3.157073974609375, "learning_rate": 3.91178146967554e-05, "loss": 0.3099, "step": 10810 }, { "epoch": 0.6780723193582754, "grad_norm": 0.13340094685554504, "learning_rate": 3.910725971585992e-05, "loss": 0.0161, "step": 10820 }, { "epoch": 0.6786990035721, "grad_norm": 0.18897636234760284, "learning_rate": 3.909670473496443e-05, "loss": 0.189, "step": 10830 }, { "epoch": 0.6793256877859247, "grad_norm": 0.26442772150039673, "learning_rate": 3.9086149754068946e-05, "loss": 0.2095, "step": 10840 }, { "epoch": 0.6799523719997493, "grad_norm": 2.244516134262085, "learning_rate": 3.907559477317346e-05, "loss": 0.21, "step": 10850 }, { "epoch": 0.680579056213574, "grad_norm": 54.9011116027832, "learning_rate": 3.906503979227798e-05, "loss": 0.0742, "step": 10860 }, { "epoch": 0.6812057404273987, "grad_norm": 0.1011706218123436, "learning_rate": 3.905448481138249e-05, "loss": 0.0987, "step": 10870 }, { "epoch": 0.6818324246412233, "grad_norm": 1.3590539693832397, "learning_rate": 3.9043929830487006e-05, "loss": 0.0191, "step": 10880 }, { "epoch": 0.6824591088550479, "grad_norm": 0.1384275257587433, "learning_rate": 3.903337484959152e-05, "loss": 0.1315, "step": 10890 }, { "epoch": 0.6830857930688726, "grad_norm": 0.08217030763626099, "learning_rate": 3.902281986869604e-05, "loss": 0.1368, "step": 10900 }, { "epoch": 0.6837124772826972, "grad_norm": 0.11884766072034836, "learning_rate": 3.9012264887800556e-05, "loss": 0.1184, "step": 10910 }, { "epoch": 0.6843391614965219, "grad_norm": 0.09507952630519867, "learning_rate": 3.900170990690507e-05, "loss": 0.1407, "step": 10920 }, { "epoch": 0.6849658457103466, "grad_norm": 0.34754639863967896, "learning_rate": 3.899115492600959e-05, "loss": 0.0828, "step": 10930 }, { "epoch": 0.6855925299241712, "grad_norm": 0.1255376785993576, "learning_rate": 3.89805999451141e-05, "loss": 0.1745, "step": 10940 }, { "epoch": 0.6862192141379959, "grad_norm": 3.3651392459869385, "learning_rate": 3.8970044964218615e-05, "loss": 0.2798, "step": 10950 }, { "epoch": 0.6868458983518205, "grad_norm": 0.08753059059381485, "learning_rate": 3.895948998332313e-05, "loss": 0.0903, "step": 10960 }, { "epoch": 0.6874725825656451, "grad_norm": 0.11063142120838165, "learning_rate": 3.894893500242765e-05, "loss": 0.1449, "step": 10970 }, { "epoch": 0.6880992667794699, "grad_norm": 0.11355997622013092, "learning_rate": 3.893838002153216e-05, "loss": 0.1304, "step": 10980 }, { "epoch": 0.6887259509932945, "grad_norm": 0.0895422026515007, "learning_rate": 3.8927825040636675e-05, "loss": 0.0438, "step": 10990 }, { "epoch": 0.6893526352071191, "grad_norm": 0.05675465241074562, "learning_rate": 3.89172700597412e-05, "loss": 0.0408, "step": 11000 }, { "epoch": 0.6899793194209438, "grad_norm": 0.7115016579627991, "learning_rate": 3.890671507884571e-05, "loss": 0.1766, "step": 11010 }, { "epoch": 0.6906060036347684, "grad_norm": 0.09861718118190765, "learning_rate": 3.8896160097950225e-05, "loss": 0.1257, "step": 11020 }, { "epoch": 0.691232687848593, "grad_norm": 0.9432891607284546, "learning_rate": 3.888560511705474e-05, "loss": 0.2082, "step": 11030 }, { "epoch": 0.6918593720624178, "grad_norm": 0.3203168213367462, "learning_rate": 3.887505013615926e-05, "loss": 0.1755, "step": 11040 }, { "epoch": 0.6924860562762424, "grad_norm": 0.22065997123718262, "learning_rate": 3.886449515526377e-05, "loss": 0.1434, "step": 11050 }, { "epoch": 0.6931127404900671, "grad_norm": 0.18533888459205627, "learning_rate": 3.8853940174368285e-05, "loss": 0.036, "step": 11060 }, { "epoch": 0.6937394247038917, "grad_norm": 0.09828941524028778, "learning_rate": 3.88433851934728e-05, "loss": 0.0293, "step": 11070 }, { "epoch": 0.6943661089177163, "grad_norm": 16.122100830078125, "learning_rate": 3.883283021257731e-05, "loss": 0.1103, "step": 11080 }, { "epoch": 0.694992793131541, "grad_norm": 0.05723113566637039, "learning_rate": 3.8822275231681835e-05, "loss": 0.1173, "step": 11090 }, { "epoch": 0.6956194773453657, "grad_norm": 8.0460205078125, "learning_rate": 3.881172025078635e-05, "loss": 0.1969, "step": 11100 }, { "epoch": 0.6962461615591903, "grad_norm": 0.07896614819765091, "learning_rate": 3.880116526989087e-05, "loss": 0.0467, "step": 11110 }, { "epoch": 0.696872845773015, "grad_norm": 5.321878433227539, "learning_rate": 3.879061028899538e-05, "loss": 0.1794, "step": 11120 }, { "epoch": 0.6974995299868396, "grad_norm": 0.0769563838839531, "learning_rate": 3.8780055308099895e-05, "loss": 0.0785, "step": 11130 }, { "epoch": 0.6981262142006642, "grad_norm": 2.8381550312042236, "learning_rate": 3.876950032720441e-05, "loss": 0.0109, "step": 11140 }, { "epoch": 0.698752898414489, "grad_norm": 0.10958783328533173, "learning_rate": 3.875894534630892e-05, "loss": 0.1323, "step": 11150 }, { "epoch": 0.6993795826283136, "grad_norm": 0.050287846475839615, "learning_rate": 3.874839036541344e-05, "loss": 0.1248, "step": 11160 }, { "epoch": 0.7000062668421383, "grad_norm": 0.07455500215291977, "learning_rate": 3.8737835384517955e-05, "loss": 0.1604, "step": 11170 }, { "epoch": 0.7006329510559629, "grad_norm": 0.1684502214193344, "learning_rate": 3.872728040362247e-05, "loss": 0.308, "step": 11180 }, { "epoch": 0.7012596352697875, "grad_norm": 0.10313006490468979, "learning_rate": 3.871672542272699e-05, "loss": 0.1469, "step": 11190 }, { "epoch": 0.7018863194836122, "grad_norm": 0.11166546493768692, "learning_rate": 3.8706170441831505e-05, "loss": 0.2037, "step": 11200 }, { "epoch": 0.7025130036974369, "grad_norm": 3.201416015625, "learning_rate": 3.869561546093602e-05, "loss": 0.1579, "step": 11210 }, { "epoch": 0.7031396879112615, "grad_norm": 0.16494852304458618, "learning_rate": 3.868506048004053e-05, "loss": 0.1543, "step": 11220 }, { "epoch": 0.7037663721250862, "grad_norm": 3.2456631660461426, "learning_rate": 3.867450549914505e-05, "loss": 0.1165, "step": 11230 }, { "epoch": 0.7043930563389108, "grad_norm": 1.6267149448394775, "learning_rate": 3.8663950518249564e-05, "loss": 0.1047, "step": 11240 }, { "epoch": 0.7050197405527355, "grad_norm": 0.4653446078300476, "learning_rate": 3.865339553735408e-05, "loss": 0.09, "step": 11250 }, { "epoch": 0.7056464247665601, "grad_norm": 6.371344089508057, "learning_rate": 3.864284055645859e-05, "loss": 0.3287, "step": 11260 }, { "epoch": 0.7062731089803848, "grad_norm": 0.11281926184892654, "learning_rate": 3.863228557556311e-05, "loss": 0.0415, "step": 11270 }, { "epoch": 0.7068997931942095, "grad_norm": 0.0942821130156517, "learning_rate": 3.8621730594667624e-05, "loss": 0.0532, "step": 11280 }, { "epoch": 0.7075264774080341, "grad_norm": 0.2059144377708435, "learning_rate": 3.861117561377214e-05, "loss": 0.2404, "step": 11290 }, { "epoch": 0.7081531616218587, "grad_norm": 0.08037558943033218, "learning_rate": 3.860062063287666e-05, "loss": 0.1778, "step": 11300 }, { "epoch": 0.7087798458356834, "grad_norm": 9.817173957824707, "learning_rate": 3.8590065651981174e-05, "loss": 0.202, "step": 11310 }, { "epoch": 0.709406530049508, "grad_norm": 0.15982958674430847, "learning_rate": 3.857951067108569e-05, "loss": 0.2136, "step": 11320 }, { "epoch": 0.7100332142633327, "grad_norm": 0.2970063388347626, "learning_rate": 3.85689556901902e-05, "loss": 0.1994, "step": 11330 }, { "epoch": 0.7106598984771574, "grad_norm": 0.29430776834487915, "learning_rate": 3.855840070929472e-05, "loss": 0.2018, "step": 11340 }, { "epoch": 0.711286582690982, "grad_norm": 0.15369923412799835, "learning_rate": 3.8547845728399234e-05, "loss": 0.0427, "step": 11350 }, { "epoch": 0.7119132669048067, "grad_norm": 0.20897263288497925, "learning_rate": 3.853729074750375e-05, "loss": 0.2163, "step": 11360 }, { "epoch": 0.7125399511186313, "grad_norm": 9.025155067443848, "learning_rate": 3.852673576660826e-05, "loss": 0.2577, "step": 11370 }, { "epoch": 0.713166635332456, "grad_norm": 11.082793235778809, "learning_rate": 3.851618078571278e-05, "loss": 0.3279, "step": 11380 }, { "epoch": 0.7137933195462807, "grad_norm": 7.146550178527832, "learning_rate": 3.8505625804817294e-05, "loss": 0.2959, "step": 11390 }, { "epoch": 0.7144200037601053, "grad_norm": 0.25585150718688965, "learning_rate": 3.849507082392181e-05, "loss": 0.149, "step": 11400 }, { "epoch": 0.7150466879739299, "grad_norm": 5.5770263671875, "learning_rate": 3.848451584302633e-05, "loss": 0.1444, "step": 11410 }, { "epoch": 0.7156733721877546, "grad_norm": 8.812726020812988, "learning_rate": 3.8473960862130844e-05, "loss": 0.1645, "step": 11420 }, { "epoch": 0.7163000564015792, "grad_norm": 0.081893190741539, "learning_rate": 3.846340588123536e-05, "loss": 0.0176, "step": 11430 }, { "epoch": 0.7169267406154038, "grad_norm": 0.06406759470701218, "learning_rate": 3.845285090033987e-05, "loss": 0.1763, "step": 11440 }, { "epoch": 0.7175534248292286, "grad_norm": 0.17364154756069183, "learning_rate": 3.844229591944439e-05, "loss": 0.1932, "step": 11450 }, { "epoch": 0.7181801090430532, "grad_norm": 0.14099130034446716, "learning_rate": 3.8431740938548904e-05, "loss": 0.1635, "step": 11460 }, { "epoch": 0.7188067932568779, "grad_norm": 0.25819849967956543, "learning_rate": 3.8421185957653414e-05, "loss": 0.0816, "step": 11470 }, { "epoch": 0.7194334774707025, "grad_norm": 0.05951720103621483, "learning_rate": 3.841063097675793e-05, "loss": 0.0603, "step": 11480 }, { "epoch": 0.7200601616845271, "grad_norm": 0.12080153822898865, "learning_rate": 3.840007599586245e-05, "loss": 0.2307, "step": 11490 }, { "epoch": 0.7206868458983519, "grad_norm": 0.12720264494419098, "learning_rate": 3.838952101496697e-05, "loss": 0.1466, "step": 11500 }, { "epoch": 0.7213135301121765, "grad_norm": 4.570608615875244, "learning_rate": 3.837896603407148e-05, "loss": 0.3491, "step": 11510 }, { "epoch": 0.7219402143260011, "grad_norm": 1.9697740077972412, "learning_rate": 3.8368411053176e-05, "loss": 0.0256, "step": 11520 }, { "epoch": 0.7225668985398258, "grad_norm": 1.388838291168213, "learning_rate": 3.8357856072280513e-05, "loss": 0.0135, "step": 11530 }, { "epoch": 0.7231935827536504, "grad_norm": 0.06155795976519585, "learning_rate": 3.834730109138502e-05, "loss": 0.0743, "step": 11540 }, { "epoch": 0.723820266967475, "grad_norm": 0.09652787446975708, "learning_rate": 3.833674611048954e-05, "loss": 0.3616, "step": 11550 }, { "epoch": 0.7244469511812998, "grad_norm": 0.12950001657009125, "learning_rate": 3.832619112959406e-05, "loss": 0.1245, "step": 11560 }, { "epoch": 0.7250736353951244, "grad_norm": 2.212172031402588, "learning_rate": 3.831563614869857e-05, "loss": 0.2461, "step": 11570 }, { "epoch": 0.7257003196089491, "grad_norm": 0.1823960393667221, "learning_rate": 3.830508116780308e-05, "loss": 0.182, "step": 11580 }, { "epoch": 0.7263270038227737, "grad_norm": 0.16792289912700653, "learning_rate": 3.8294526186907607e-05, "loss": 0.0991, "step": 11590 }, { "epoch": 0.7269536880365983, "grad_norm": 8.197361946105957, "learning_rate": 3.828397120601212e-05, "loss": 0.2588, "step": 11600 }, { "epoch": 0.727580372250423, "grad_norm": 0.7492303252220154, "learning_rate": 3.827341622511663e-05, "loss": 0.0935, "step": 11610 }, { "epoch": 0.7282070564642477, "grad_norm": 0.12691091001033783, "learning_rate": 3.826286124422115e-05, "loss": 0.1913, "step": 11620 }, { "epoch": 0.7288337406780723, "grad_norm": 0.3421546220779419, "learning_rate": 3.8252306263325666e-05, "loss": 0.1469, "step": 11630 }, { "epoch": 0.729460424891897, "grad_norm": 0.10125494748353958, "learning_rate": 3.824175128243018e-05, "loss": 0.0964, "step": 11640 }, { "epoch": 0.7300871091057216, "grad_norm": 0.06703011691570282, "learning_rate": 3.823119630153469e-05, "loss": 0.0761, "step": 11650 }, { "epoch": 0.7307137933195463, "grad_norm": 3.4559688568115234, "learning_rate": 3.822064132063921e-05, "loss": 0.1696, "step": 11660 }, { "epoch": 0.731340477533371, "grad_norm": 0.1432192325592041, "learning_rate": 3.8210086339743726e-05, "loss": 0.1375, "step": 11670 }, { "epoch": 0.7319671617471956, "grad_norm": 1.798836588859558, "learning_rate": 3.819953135884824e-05, "loss": 0.0559, "step": 11680 }, { "epoch": 0.7325938459610203, "grad_norm": 1.9000601768493652, "learning_rate": 3.818897637795276e-05, "loss": 0.0461, "step": 11690 }, { "epoch": 0.7332205301748449, "grad_norm": 0.08201713860034943, "learning_rate": 3.8178421397057276e-05, "loss": 0.0311, "step": 11700 }, { "epoch": 0.7338472143886695, "grad_norm": 0.09136554598808289, "learning_rate": 3.816786641616179e-05, "loss": 0.1495, "step": 11710 }, { "epoch": 0.7344738986024942, "grad_norm": 0.08365706354379654, "learning_rate": 3.81573114352663e-05, "loss": 0.3126, "step": 11720 }, { "epoch": 0.7351005828163188, "grad_norm": 0.06393969058990479, "learning_rate": 3.814675645437082e-05, "loss": 0.1961, "step": 11730 }, { "epoch": 0.7357272670301435, "grad_norm": 0.34764620661735535, "learning_rate": 3.8136201473475336e-05, "loss": 0.0926, "step": 11740 }, { "epoch": 0.7363539512439682, "grad_norm": 14.139609336853027, "learning_rate": 3.812564649257985e-05, "loss": 0.1091, "step": 11750 }, { "epoch": 0.7369806354577928, "grad_norm": 0.06565447896718979, "learning_rate": 3.811509151168436e-05, "loss": 0.2524, "step": 11760 }, { "epoch": 0.7376073196716175, "grad_norm": 0.4182433784008026, "learning_rate": 3.810453653078888e-05, "loss": 0.2786, "step": 11770 }, { "epoch": 0.7382340038854421, "grad_norm": 5.026593208312988, "learning_rate": 3.8093981549893396e-05, "loss": 0.1597, "step": 11780 }, { "epoch": 0.7388606880992667, "grad_norm": 0.27505579590797424, "learning_rate": 3.808342656899791e-05, "loss": 0.0146, "step": 11790 }, { "epoch": 0.7394873723130915, "grad_norm": 5.809450626373291, "learning_rate": 3.807287158810243e-05, "loss": 0.1832, "step": 11800 }, { "epoch": 0.7401140565269161, "grad_norm": 6.0504679679870605, "learning_rate": 3.8062316607206946e-05, "loss": 0.1652, "step": 11810 }, { "epoch": 0.7407407407407407, "grad_norm": 0.35781294107437134, "learning_rate": 3.805176162631146e-05, "loss": 0.1793, "step": 11820 }, { "epoch": 0.7413674249545654, "grad_norm": 0.09163035452365875, "learning_rate": 3.804120664541597e-05, "loss": 0.1139, "step": 11830 }, { "epoch": 0.74199410916839, "grad_norm": 68.03575897216797, "learning_rate": 3.803065166452049e-05, "loss": 0.085, "step": 11840 }, { "epoch": 0.7426207933822147, "grad_norm": 0.11376772820949554, "learning_rate": 3.8020096683625006e-05, "loss": 0.0952, "step": 11850 }, { "epoch": 0.7432474775960394, "grad_norm": 2.6789710521698, "learning_rate": 3.8009541702729516e-05, "loss": 0.1616, "step": 11860 }, { "epoch": 0.743874161809864, "grad_norm": 0.31956803798675537, "learning_rate": 3.799898672183403e-05, "loss": 0.1641, "step": 11870 }, { "epoch": 0.7445008460236887, "grad_norm": 0.1959105134010315, "learning_rate": 3.798843174093855e-05, "loss": 0.1341, "step": 11880 }, { "epoch": 0.7451275302375133, "grad_norm": 2.517446517944336, "learning_rate": 3.7977876760043065e-05, "loss": 0.1314, "step": 11890 }, { "epoch": 0.7457542144513379, "grad_norm": 0.07533478736877441, "learning_rate": 3.796732177914758e-05, "loss": 0.1315, "step": 11900 }, { "epoch": 0.7463808986651627, "grad_norm": 0.7237932682037354, "learning_rate": 3.79567667982521e-05, "loss": 0.1167, "step": 11910 }, { "epoch": 0.7470075828789873, "grad_norm": 6.23661994934082, "learning_rate": 3.7946211817356615e-05, "loss": 0.1283, "step": 11920 }, { "epoch": 0.7476342670928119, "grad_norm": 3.1848607063293457, "learning_rate": 3.7935656836461125e-05, "loss": 0.1034, "step": 11930 }, { "epoch": 0.7482609513066366, "grad_norm": 2.650261878967285, "learning_rate": 3.792510185556564e-05, "loss": 0.1176, "step": 11940 }, { "epoch": 0.7488876355204612, "grad_norm": 0.06356266140937805, "learning_rate": 3.791454687467016e-05, "loss": 0.1833, "step": 11950 }, { "epoch": 0.7495143197342858, "grad_norm": 2.3670597076416016, "learning_rate": 3.7903991893774675e-05, "loss": 0.2214, "step": 11960 }, { "epoch": 0.7501410039481106, "grad_norm": 0.07301408797502518, "learning_rate": 3.7893436912879185e-05, "loss": 0.308, "step": 11970 }, { "epoch": 0.7507676881619352, "grad_norm": 13.731446266174316, "learning_rate": 3.78828819319837e-05, "loss": 0.2203, "step": 11980 }, { "epoch": 0.7513943723757599, "grad_norm": 2.0269956588745117, "learning_rate": 3.787232695108822e-05, "loss": 0.2015, "step": 11990 }, { "epoch": 0.7520210565895845, "grad_norm": 0.23687097430229187, "learning_rate": 3.7861771970192735e-05, "loss": 0.0128, "step": 12000 }, { "epoch": 0.7526477408034091, "grad_norm": 0.08494888991117477, "learning_rate": 3.785121698929725e-05, "loss": 0.0982, "step": 12010 }, { "epoch": 0.7532744250172339, "grad_norm": 0.0715617686510086, "learning_rate": 3.784066200840177e-05, "loss": 0.0946, "step": 12020 }, { "epoch": 0.7539011092310585, "grad_norm": 0.3059249520301819, "learning_rate": 3.7830107027506285e-05, "loss": 0.2947, "step": 12030 }, { "epoch": 0.7545277934448831, "grad_norm": 0.4487674832344055, "learning_rate": 3.7819552046610795e-05, "loss": 0.1457, "step": 12040 }, { "epoch": 0.7551544776587078, "grad_norm": 28.634532928466797, "learning_rate": 3.780899706571531e-05, "loss": 0.0408, "step": 12050 }, { "epoch": 0.7557811618725324, "grad_norm": 3.7225265502929688, "learning_rate": 3.779844208481983e-05, "loss": 0.2168, "step": 12060 }, { "epoch": 0.7564078460863571, "grad_norm": 0.8148830533027649, "learning_rate": 3.778788710392434e-05, "loss": 0.1995, "step": 12070 }, { "epoch": 0.7570345303001818, "grad_norm": 0.11411413550376892, "learning_rate": 3.7777332123028855e-05, "loss": 0.1426, "step": 12080 }, { "epoch": 0.7576612145140064, "grad_norm": 0.48122769594192505, "learning_rate": 3.776677714213338e-05, "loss": 0.1247, "step": 12090 }, { "epoch": 0.7582878987278311, "grad_norm": 0.4169093370437622, "learning_rate": 3.7756222161237895e-05, "loss": 0.1047, "step": 12100 }, { "epoch": 0.7589145829416557, "grad_norm": 0.06136476248502731, "learning_rate": 3.7745667180342405e-05, "loss": 0.0406, "step": 12110 }, { "epoch": 0.7595412671554803, "grad_norm": 16.241806030273438, "learning_rate": 3.773511219944692e-05, "loss": 0.1553, "step": 12120 }, { "epoch": 0.760167951369305, "grad_norm": 0.404888778924942, "learning_rate": 3.772455721855144e-05, "loss": 0.0967, "step": 12130 }, { "epoch": 0.7607946355831297, "grad_norm": 0.5528884530067444, "learning_rate": 3.7714002237655955e-05, "loss": 0.0098, "step": 12140 }, { "epoch": 0.7614213197969543, "grad_norm": 2.322392225265503, "learning_rate": 3.7703447256760465e-05, "loss": 0.1899, "step": 12150 }, { "epoch": 0.762048004010779, "grad_norm": 0.03181945160031319, "learning_rate": 3.769289227586498e-05, "loss": 0.1481, "step": 12160 }, { "epoch": 0.7626746882246036, "grad_norm": 2.7455575466156006, "learning_rate": 3.76823372949695e-05, "loss": 0.4789, "step": 12170 }, { "epoch": 0.7633013724384283, "grad_norm": 0.22958771884441376, "learning_rate": 3.7671782314074014e-05, "loss": 0.0377, "step": 12180 }, { "epoch": 0.7639280566522529, "grad_norm": 0.05740810185670853, "learning_rate": 3.766122733317853e-05, "loss": 0.1482, "step": 12190 }, { "epoch": 0.7645547408660776, "grad_norm": 0.37747031450271606, "learning_rate": 3.765067235228305e-05, "loss": 0.0657, "step": 12200 }, { "epoch": 0.7651814250799023, "grad_norm": 0.056075967848300934, "learning_rate": 3.7640117371387564e-05, "loss": 0.0356, "step": 12210 }, { "epoch": 0.7658081092937269, "grad_norm": 5.565336227416992, "learning_rate": 3.7629562390492074e-05, "loss": 0.1372, "step": 12220 }, { "epoch": 0.7664347935075515, "grad_norm": 0.4846305847167969, "learning_rate": 3.761900740959659e-05, "loss": 0.0483, "step": 12230 }, { "epoch": 0.7670614777213762, "grad_norm": 0.037832580506801605, "learning_rate": 3.760845242870111e-05, "loss": 0.1366, "step": 12240 }, { "epoch": 0.7676881619352008, "grad_norm": 4.363157272338867, "learning_rate": 3.759789744780562e-05, "loss": 0.1575, "step": 12250 }, { "epoch": 0.7683148461490255, "grad_norm": 0.21504797041416168, "learning_rate": 3.7587342466910134e-05, "loss": 0.2839, "step": 12260 }, { "epoch": 0.7689415303628502, "grad_norm": 0.05504310503602028, "learning_rate": 3.757678748601465e-05, "loss": 0.0095, "step": 12270 }, { "epoch": 0.7695682145766748, "grad_norm": 0.08995038270950317, "learning_rate": 3.756623250511917e-05, "loss": 0.1848, "step": 12280 }, { "epoch": 0.7701948987904995, "grad_norm": 2.675936222076416, "learning_rate": 3.7555677524223684e-05, "loss": 0.049, "step": 12290 }, { "epoch": 0.7708215830043241, "grad_norm": 0.15427015721797943, "learning_rate": 3.75451225433282e-05, "loss": 0.1101, "step": 12300 }, { "epoch": 0.7714482672181487, "grad_norm": 3.6860060691833496, "learning_rate": 3.753456756243272e-05, "loss": 0.235, "step": 12310 }, { "epoch": 0.7720749514319735, "grad_norm": 1.6558562517166138, "learning_rate": 3.752401258153723e-05, "loss": 0.0861, "step": 12320 }, { "epoch": 0.7727016356457981, "grad_norm": 0.12261590361595154, "learning_rate": 3.7513457600641744e-05, "loss": 0.1569, "step": 12330 }, { "epoch": 0.7733283198596227, "grad_norm": 5.633775234222412, "learning_rate": 3.750290261974626e-05, "loss": 0.1584, "step": 12340 }, { "epoch": 0.7739550040734474, "grad_norm": 0.30017462372779846, "learning_rate": 3.749234763885078e-05, "loss": 0.1295, "step": 12350 }, { "epoch": 0.774581688287272, "grad_norm": 2.7208046913146973, "learning_rate": 3.748179265795529e-05, "loss": 0.3396, "step": 12360 }, { "epoch": 0.7752083725010966, "grad_norm": 2.70462965965271, "learning_rate": 3.7471237677059804e-05, "loss": 0.2365, "step": 12370 }, { "epoch": 0.7758350567149214, "grad_norm": 0.3144044876098633, "learning_rate": 3.746068269616432e-05, "loss": 0.1355, "step": 12380 }, { "epoch": 0.776461740928746, "grad_norm": 0.20189404487609863, "learning_rate": 3.745012771526884e-05, "loss": 0.0728, "step": 12390 }, { "epoch": 0.7770884251425707, "grad_norm": 0.15650856494903564, "learning_rate": 3.7439572734373354e-05, "loss": 0.1454, "step": 12400 }, { "epoch": 0.7777151093563953, "grad_norm": 0.17184951901435852, "learning_rate": 3.742901775347787e-05, "loss": 0.1337, "step": 12410 }, { "epoch": 0.7783417935702199, "grad_norm": 0.21894590556621552, "learning_rate": 3.741846277258239e-05, "loss": 0.2187, "step": 12420 }, { "epoch": 0.7789684777840447, "grad_norm": 0.22736823558807373, "learning_rate": 3.74079077916869e-05, "loss": 0.1133, "step": 12430 }, { "epoch": 0.7795951619978693, "grad_norm": 13.00343132019043, "learning_rate": 3.7397352810791414e-05, "loss": 0.0324, "step": 12440 }, { "epoch": 0.7802218462116939, "grad_norm": 11.851655006408691, "learning_rate": 3.738679782989593e-05, "loss": 0.1891, "step": 12450 }, { "epoch": 0.7808485304255186, "grad_norm": 36.18693161010742, "learning_rate": 3.737624284900044e-05, "loss": 0.3295, "step": 12460 }, { "epoch": 0.7814752146393432, "grad_norm": 0.1305285394191742, "learning_rate": 3.736568786810496e-05, "loss": 0.1744, "step": 12470 }, { "epoch": 0.7821018988531679, "grad_norm": 0.8213036060333252, "learning_rate": 3.735513288720947e-05, "loss": 0.2016, "step": 12480 }, { "epoch": 0.7827285830669926, "grad_norm": 3.187350034713745, "learning_rate": 3.734457790631399e-05, "loss": 0.1567, "step": 12490 }, { "epoch": 0.7833552672808172, "grad_norm": 0.19200462102890015, "learning_rate": 3.733402292541851e-05, "loss": 0.0711, "step": 12500 }, { "epoch": 0.7839819514946419, "grad_norm": 0.19528482854366302, "learning_rate": 3.732346794452302e-05, "loss": 0.1273, "step": 12510 }, { "epoch": 0.7846086357084665, "grad_norm": 0.19674736261367798, "learning_rate": 3.731291296362754e-05, "loss": 0.281, "step": 12520 }, { "epoch": 0.7852353199222911, "grad_norm": 0.2751675844192505, "learning_rate": 3.7302357982732057e-05, "loss": 0.1607, "step": 12530 }, { "epoch": 0.7858620041361158, "grad_norm": 0.3155404031276703, "learning_rate": 3.7291803001836566e-05, "loss": 0.1767, "step": 12540 }, { "epoch": 0.7864886883499405, "grad_norm": 0.41322872042655945, "learning_rate": 3.728124802094108e-05, "loss": 0.1663, "step": 12550 }, { "epoch": 0.7871153725637651, "grad_norm": 0.31705963611602783, "learning_rate": 3.72706930400456e-05, "loss": 0.0903, "step": 12560 }, { "epoch": 0.7877420567775898, "grad_norm": 0.1984623223543167, "learning_rate": 3.726013805915011e-05, "loss": 0.0583, "step": 12570 }, { "epoch": 0.7883687409914144, "grad_norm": 0.13793553411960602, "learning_rate": 3.7249583078254626e-05, "loss": 0.1318, "step": 12580 }, { "epoch": 0.7889954252052391, "grad_norm": 5.596744060516357, "learning_rate": 3.723902809735915e-05, "loss": 0.0879, "step": 12590 }, { "epoch": 0.7896221094190637, "grad_norm": 0.12211534380912781, "learning_rate": 3.7228473116463666e-05, "loss": 0.0783, "step": 12600 }, { "epoch": 0.7902487936328884, "grad_norm": 3.553812026977539, "learning_rate": 3.7217918135568176e-05, "loss": 0.1936, "step": 12610 }, { "epoch": 0.7908754778467131, "grad_norm": 0.12860919535160065, "learning_rate": 3.720736315467269e-05, "loss": 0.0552, "step": 12620 }, { "epoch": 0.7915021620605377, "grad_norm": 3.718738079071045, "learning_rate": 3.719680817377721e-05, "loss": 0.2397, "step": 12630 }, { "epoch": 0.7921288462743623, "grad_norm": 0.7922436594963074, "learning_rate": 3.718625319288172e-05, "loss": 0.0915, "step": 12640 }, { "epoch": 0.792755530488187, "grad_norm": 0.11940032988786697, "learning_rate": 3.7175698211986236e-05, "loss": 0.1205, "step": 12650 }, { "epoch": 0.7933822147020116, "grad_norm": 0.10362546145915985, "learning_rate": 3.716514323109075e-05, "loss": 0.0583, "step": 12660 }, { "epoch": 0.7940088989158363, "grad_norm": 3.585110902786255, "learning_rate": 3.715458825019527e-05, "loss": 0.0795, "step": 12670 }, { "epoch": 0.794635583129661, "grad_norm": 0.06763280928134918, "learning_rate": 3.7144033269299786e-05, "loss": 0.2659, "step": 12680 }, { "epoch": 0.7952622673434856, "grad_norm": 0.1489865779876709, "learning_rate": 3.71334782884043e-05, "loss": 0.0992, "step": 12690 }, { "epoch": 0.7958889515573103, "grad_norm": 3.47560977935791, "learning_rate": 3.712292330750882e-05, "loss": 0.1278, "step": 12700 }, { "epoch": 0.7965156357711349, "grad_norm": 3.3073196411132812, "learning_rate": 3.711236832661333e-05, "loss": 0.1825, "step": 12710 }, { "epoch": 0.7971423199849595, "grad_norm": 0.14257051050662994, "learning_rate": 3.7101813345717846e-05, "loss": 0.0773, "step": 12720 }, { "epoch": 0.7977690041987843, "grad_norm": 0.48788848519325256, "learning_rate": 3.709125836482236e-05, "loss": 0.2049, "step": 12730 }, { "epoch": 0.7983956884126089, "grad_norm": 3.0784692764282227, "learning_rate": 3.708070338392688e-05, "loss": 0.2214, "step": 12740 }, { "epoch": 0.7990223726264335, "grad_norm": 0.27575093507766724, "learning_rate": 3.707014840303139e-05, "loss": 0.1934, "step": 12750 }, { "epoch": 0.7996490568402582, "grad_norm": 0.2657164931297302, "learning_rate": 3.7059593422135906e-05, "loss": 0.134, "step": 12760 }, { "epoch": 0.8002757410540828, "grad_norm": 0.9468491673469543, "learning_rate": 3.704903844124042e-05, "loss": 0.0772, "step": 12770 }, { "epoch": 0.8009024252679074, "grad_norm": 0.15965603291988373, "learning_rate": 3.703848346034494e-05, "loss": 0.1017, "step": 12780 }, { "epoch": 0.8015291094817322, "grad_norm": 0.19264058768749237, "learning_rate": 3.7027928479449456e-05, "loss": 0.1909, "step": 12790 }, { "epoch": 0.8021557936955568, "grad_norm": 13.75501537322998, "learning_rate": 3.701737349855397e-05, "loss": 0.0429, "step": 12800 }, { "epoch": 0.8027824779093815, "grad_norm": 0.10453546047210693, "learning_rate": 3.700681851765849e-05, "loss": 0.2287, "step": 12810 }, { "epoch": 0.8034091621232061, "grad_norm": 3.390267848968506, "learning_rate": 3.6996263536763e-05, "loss": 0.2462, "step": 12820 }, { "epoch": 0.8040358463370307, "grad_norm": 3.221275568008423, "learning_rate": 3.6985708555867515e-05, "loss": 0.1346, "step": 12830 }, { "epoch": 0.8046625305508555, "grad_norm": 3.498892307281494, "learning_rate": 3.697515357497203e-05, "loss": 0.1513, "step": 12840 }, { "epoch": 0.8052892147646801, "grad_norm": 6.34757661819458, "learning_rate": 3.696459859407654e-05, "loss": 0.1866, "step": 12850 }, { "epoch": 0.8059158989785047, "grad_norm": 2.874657392501831, "learning_rate": 3.695404361318106e-05, "loss": 0.1909, "step": 12860 }, { "epoch": 0.8065425831923294, "grad_norm": 1.1204040050506592, "learning_rate": 3.6943488632285575e-05, "loss": 0.0936, "step": 12870 }, { "epoch": 0.807169267406154, "grad_norm": 0.156513050198555, "learning_rate": 3.693293365139009e-05, "loss": 0.0151, "step": 12880 }, { "epoch": 0.8077959516199787, "grad_norm": 3.5642528533935547, "learning_rate": 3.692237867049461e-05, "loss": 0.1636, "step": 12890 }, { "epoch": 0.8084226358338034, "grad_norm": 27.339061737060547, "learning_rate": 3.6911823689599125e-05, "loss": 0.0421, "step": 12900 }, { "epoch": 0.809049320047628, "grad_norm": 0.1077108234167099, "learning_rate": 3.690126870870364e-05, "loss": 0.0325, "step": 12910 }, { "epoch": 0.8096760042614527, "grad_norm": 7.458876132965088, "learning_rate": 3.689071372780816e-05, "loss": 0.122, "step": 12920 }, { "epoch": 0.8103026884752773, "grad_norm": 0.0756208524107933, "learning_rate": 3.688015874691267e-05, "loss": 0.0061, "step": 12930 }, { "epoch": 0.8109293726891019, "grad_norm": 0.06975262612104416, "learning_rate": 3.6869603766017185e-05, "loss": 0.19, "step": 12940 }, { "epoch": 0.8115560569029266, "grad_norm": 0.07189065963029861, "learning_rate": 3.68590487851217e-05, "loss": 0.22, "step": 12950 }, { "epoch": 0.8121827411167513, "grad_norm": 12.745660781860352, "learning_rate": 3.684849380422621e-05, "loss": 0.315, "step": 12960 }, { "epoch": 0.8128094253305759, "grad_norm": 1.2097328901290894, "learning_rate": 3.683793882333073e-05, "loss": 0.1202, "step": 12970 }, { "epoch": 0.8134361095444006, "grad_norm": 0.16218720376491547, "learning_rate": 3.6827383842435245e-05, "loss": 0.0863, "step": 12980 }, { "epoch": 0.8140627937582252, "grad_norm": 0.09889566898345947, "learning_rate": 3.681682886153976e-05, "loss": 0.0592, "step": 12990 }, { "epoch": 0.8146894779720499, "grad_norm": 0.6426899433135986, "learning_rate": 3.680627388064428e-05, "loss": 0.1792, "step": 13000 }, { "epoch": 0.8153161621858745, "grad_norm": 12.053089141845703, "learning_rate": 3.6795718899748795e-05, "loss": 0.1879, "step": 13010 }, { "epoch": 0.8159428463996992, "grad_norm": 0.1389077603816986, "learning_rate": 3.678516391885331e-05, "loss": 0.2683, "step": 13020 }, { "epoch": 0.8165695306135239, "grad_norm": 0.29173538088798523, "learning_rate": 3.677460893795782e-05, "loss": 0.124, "step": 13030 }, { "epoch": 0.8171962148273485, "grad_norm": 0.14425741136074066, "learning_rate": 3.676405395706234e-05, "loss": 0.1329, "step": 13040 }, { "epoch": 0.8178228990411731, "grad_norm": 5.297956943511963, "learning_rate": 3.6753498976166855e-05, "loss": 0.1597, "step": 13050 }, { "epoch": 0.8184495832549978, "grad_norm": 0.12852726876735687, "learning_rate": 3.674294399527137e-05, "loss": 0.0384, "step": 13060 }, { "epoch": 0.8190762674688224, "grad_norm": 0.08532652258872986, "learning_rate": 3.673238901437588e-05, "loss": 0.1388, "step": 13070 }, { "epoch": 0.8197029516826471, "grad_norm": 3.585350275039673, "learning_rate": 3.67218340334804e-05, "loss": 0.1586, "step": 13080 }, { "epoch": 0.8203296358964718, "grad_norm": 67.14936065673828, "learning_rate": 3.671127905258492e-05, "loss": 0.2627, "step": 13090 }, { "epoch": 0.8209563201102964, "grad_norm": 0.07265251874923706, "learning_rate": 3.670072407168943e-05, "loss": 0.0716, "step": 13100 }, { "epoch": 0.8215830043241211, "grad_norm": 3.681473731994629, "learning_rate": 3.669016909079395e-05, "loss": 0.2819, "step": 13110 }, { "epoch": 0.8222096885379457, "grad_norm": 3.0709450244903564, "learning_rate": 3.6679614109898464e-05, "loss": 0.2773, "step": 13120 }, { "epoch": 0.8228363727517704, "grad_norm": 0.2906370460987091, "learning_rate": 3.666905912900298e-05, "loss": 0.1433, "step": 13130 }, { "epoch": 0.8234630569655951, "grad_norm": 0.34257593750953674, "learning_rate": 3.665850414810749e-05, "loss": 0.0968, "step": 13140 }, { "epoch": 0.8240897411794197, "grad_norm": 1.3226487636566162, "learning_rate": 3.664794916721201e-05, "loss": 0.0643, "step": 13150 }, { "epoch": 0.8247164253932443, "grad_norm": 0.18398089706897736, "learning_rate": 3.6637394186316524e-05, "loss": 0.3557, "step": 13160 }, { "epoch": 0.825343109607069, "grad_norm": 0.13415154814720154, "learning_rate": 3.6626839205421034e-05, "loss": 0.0636, "step": 13170 }, { "epoch": 0.8259697938208936, "grad_norm": 3.044423818588257, "learning_rate": 3.661628422452556e-05, "loss": 0.233, "step": 13180 }, { "epoch": 0.8265964780347183, "grad_norm": 0.11876463890075684, "learning_rate": 3.6605729243630074e-05, "loss": 0.0823, "step": 13190 }, { "epoch": 0.827223162248543, "grad_norm": 0.46708282828330994, "learning_rate": 3.659517426273459e-05, "loss": 0.0813, "step": 13200 }, { "epoch": 0.8278498464623676, "grad_norm": 3.5009450912475586, "learning_rate": 3.65846192818391e-05, "loss": 0.2123, "step": 13210 }, { "epoch": 0.8284765306761923, "grad_norm": 2.2133095264434814, "learning_rate": 3.657406430094362e-05, "loss": 0.0367, "step": 13220 }, { "epoch": 0.8291032148900169, "grad_norm": 0.09307032078504562, "learning_rate": 3.6563509320048134e-05, "loss": 0.0462, "step": 13230 }, { "epoch": 0.8297298991038415, "grad_norm": 0.2503710687160492, "learning_rate": 3.6552954339152644e-05, "loss": 0.2158, "step": 13240 }, { "epoch": 0.8303565833176663, "grad_norm": 0.09940878301858902, "learning_rate": 3.654239935825716e-05, "loss": 0.0863, "step": 13250 }, { "epoch": 0.8309832675314909, "grad_norm": 0.10309536755084991, "learning_rate": 3.653184437736168e-05, "loss": 0.1642, "step": 13260 }, { "epoch": 0.8316099517453155, "grad_norm": 0.09203781932592392, "learning_rate": 3.6521289396466194e-05, "loss": 0.0441, "step": 13270 }, { "epoch": 0.8322366359591402, "grad_norm": 0.08066492527723312, "learning_rate": 3.651073441557071e-05, "loss": 0.0075, "step": 13280 }, { "epoch": 0.8328633201729648, "grad_norm": 0.08222663402557373, "learning_rate": 3.650017943467523e-05, "loss": 0.2368, "step": 13290 }, { "epoch": 0.8334900043867896, "grad_norm": 0.062067195773124695, "learning_rate": 3.6489624453779744e-05, "loss": 0.0664, "step": 13300 }, { "epoch": 0.8341166886006142, "grad_norm": 0.07864390313625336, "learning_rate": 3.647906947288426e-05, "loss": 0.1451, "step": 13310 }, { "epoch": 0.8347433728144388, "grad_norm": 0.19397737085819244, "learning_rate": 3.646851449198877e-05, "loss": 0.1125, "step": 13320 }, { "epoch": 0.8353700570282635, "grad_norm": 0.06277598440647125, "learning_rate": 3.645795951109329e-05, "loss": 0.1009, "step": 13330 }, { "epoch": 0.8359967412420881, "grad_norm": 0.06773136556148529, "learning_rate": 3.6447404530197804e-05, "loss": 0.1635, "step": 13340 }, { "epoch": 0.8366234254559127, "grad_norm": 0.11217823624610901, "learning_rate": 3.6436849549302314e-05, "loss": 0.0174, "step": 13350 }, { "epoch": 0.8372501096697375, "grad_norm": 3.266810417175293, "learning_rate": 3.642629456840683e-05, "loss": 0.1214, "step": 13360 }, { "epoch": 0.8378767938835621, "grad_norm": 0.14396525919437408, "learning_rate": 3.641573958751135e-05, "loss": 0.1602, "step": 13370 }, { "epoch": 0.8385034780973867, "grad_norm": 0.25807830691337585, "learning_rate": 3.6405184606615864e-05, "loss": 0.2454, "step": 13380 }, { "epoch": 0.8391301623112114, "grad_norm": 4.361166477203369, "learning_rate": 3.639462962572038e-05, "loss": 0.1265, "step": 13390 }, { "epoch": 0.839756846525036, "grad_norm": 0.07289406657218933, "learning_rate": 3.63840746448249e-05, "loss": 0.0458, "step": 13400 }, { "epoch": 0.8403835307388607, "grad_norm": 1.979023814201355, "learning_rate": 3.6373519663929413e-05, "loss": 0.03, "step": 13410 }, { "epoch": 0.8410102149526854, "grad_norm": 0.047339342534542084, "learning_rate": 3.636296468303392e-05, "loss": 0.0726, "step": 13420 }, { "epoch": 0.84163689916651, "grad_norm": 0.05762796103954315, "learning_rate": 3.635240970213844e-05, "loss": 0.1629, "step": 13430 }, { "epoch": 0.8422635833803347, "grad_norm": 0.08475717902183533, "learning_rate": 3.634185472124296e-05, "loss": 0.1693, "step": 13440 }, { "epoch": 0.8428902675941593, "grad_norm": 19.972970962524414, "learning_rate": 3.633129974034747e-05, "loss": 0.0781, "step": 13450 }, { "epoch": 0.8435169518079839, "grad_norm": 6.1189985275268555, "learning_rate": 3.632074475945198e-05, "loss": 0.0217, "step": 13460 }, { "epoch": 0.8441436360218086, "grad_norm": 0.04247862473130226, "learning_rate": 3.63101897785565e-05, "loss": 0.3026, "step": 13470 }, { "epoch": 0.8447703202356333, "grad_norm": 10.373451232910156, "learning_rate": 3.6299634797661017e-05, "loss": 0.2465, "step": 13480 }, { "epoch": 0.8453970044494579, "grad_norm": 0.16475176811218262, "learning_rate": 3.628907981676553e-05, "loss": 0.1451, "step": 13490 }, { "epoch": 0.8460236886632826, "grad_norm": 0.053600408136844635, "learning_rate": 3.627852483587005e-05, "loss": 0.0733, "step": 13500 }, { "epoch": 0.8466503728771072, "grad_norm": 0.06090028956532478, "learning_rate": 3.6267969854974566e-05, "loss": 0.2185, "step": 13510 }, { "epoch": 0.8472770570909319, "grad_norm": 22.72749137878418, "learning_rate": 3.625741487407908e-05, "loss": 0.1009, "step": 13520 }, { "epoch": 0.8479037413047565, "grad_norm": 3.8991730213165283, "learning_rate": 3.624685989318359e-05, "loss": 0.1963, "step": 13530 }, { "epoch": 0.8485304255185812, "grad_norm": 0.0632360652089119, "learning_rate": 3.623630491228811e-05, "loss": 0.1906, "step": 13540 }, { "epoch": 0.8491571097324059, "grad_norm": 10.589201927185059, "learning_rate": 3.6225749931392626e-05, "loss": 0.1771, "step": 13550 }, { "epoch": 0.8497837939462305, "grad_norm": 0.08310031145811081, "learning_rate": 3.6215194950497136e-05, "loss": 0.1889, "step": 13560 }, { "epoch": 0.8504104781600551, "grad_norm": 0.14444205164909363, "learning_rate": 3.620463996960165e-05, "loss": 0.1354, "step": 13570 }, { "epoch": 0.8510371623738798, "grad_norm": 1.6947962045669556, "learning_rate": 3.619408498870617e-05, "loss": 0.1037, "step": 13580 }, { "epoch": 0.8516638465877044, "grad_norm": 2.187366485595703, "learning_rate": 3.618353000781069e-05, "loss": 0.1027, "step": 13590 }, { "epoch": 0.8522905308015291, "grad_norm": 8.265816688537598, "learning_rate": 3.61729750269152e-05, "loss": 0.0939, "step": 13600 }, { "epoch": 0.8529172150153538, "grad_norm": 11.722594261169434, "learning_rate": 3.616242004601972e-05, "loss": 0.0769, "step": 13610 }, { "epoch": 0.8535438992291784, "grad_norm": 0.04671710357069969, "learning_rate": 3.6151865065124236e-05, "loss": 0.0638, "step": 13620 }, { "epoch": 0.8541705834430031, "grad_norm": 14.175790786743164, "learning_rate": 3.6141310084228746e-05, "loss": 0.2084, "step": 13630 }, { "epoch": 0.8547972676568277, "grad_norm": 2.082923412322998, "learning_rate": 3.613075510333326e-05, "loss": 0.1194, "step": 13640 }, { "epoch": 0.8554239518706523, "grad_norm": 0.38090240955352783, "learning_rate": 3.612020012243778e-05, "loss": 0.2383, "step": 13650 }, { "epoch": 0.8560506360844771, "grad_norm": 5.69649076461792, "learning_rate": 3.6109645141542296e-05, "loss": 0.0534, "step": 13660 }, { "epoch": 0.8566773202983017, "grad_norm": 0.22433985769748688, "learning_rate": 3.6099090160646806e-05, "loss": 0.2355, "step": 13670 }, { "epoch": 0.8573040045121263, "grad_norm": 9.8493013381958, "learning_rate": 3.608853517975133e-05, "loss": 0.0305, "step": 13680 }, { "epoch": 0.857930688725951, "grad_norm": 0.031805284321308136, "learning_rate": 3.6077980198855846e-05, "loss": 0.0292, "step": 13690 }, { "epoch": 0.8585573729397756, "grad_norm": 0.20661821961402893, "learning_rate": 3.6067425217960356e-05, "loss": 0.0627, "step": 13700 }, { "epoch": 0.8591840571536004, "grad_norm": 0.8476600050926208, "learning_rate": 3.605687023706487e-05, "loss": 0.0846, "step": 13710 }, { "epoch": 0.859810741367425, "grad_norm": 4.03338098526001, "learning_rate": 3.604631525616939e-05, "loss": 0.2247, "step": 13720 }, { "epoch": 0.8604374255812496, "grad_norm": 0.04489768296480179, "learning_rate": 3.6035760275273906e-05, "loss": 0.238, "step": 13730 }, { "epoch": 0.8610641097950743, "grad_norm": 1.559618353843689, "learning_rate": 3.6025205294378416e-05, "loss": 0.0871, "step": 13740 }, { "epoch": 0.8616907940088989, "grad_norm": 0.12418634444475174, "learning_rate": 3.601465031348293e-05, "loss": 0.1338, "step": 13750 }, { "epoch": 0.8623174782227235, "grad_norm": 3.787989854812622, "learning_rate": 3.600409533258745e-05, "loss": 0.1617, "step": 13760 }, { "epoch": 0.8629441624365483, "grad_norm": 2.1943676471710205, "learning_rate": 3.5993540351691966e-05, "loss": 0.0946, "step": 13770 }, { "epoch": 0.8635708466503729, "grad_norm": 0.0763244777917862, "learning_rate": 3.598298537079648e-05, "loss": 0.1639, "step": 13780 }, { "epoch": 0.8641975308641975, "grad_norm": 0.08010061830282211, "learning_rate": 3.5972430389901e-05, "loss": 0.0546, "step": 13790 }, { "epoch": 0.8648242150780222, "grad_norm": 0.07747522741556168, "learning_rate": 3.5961875409005515e-05, "loss": 0.0335, "step": 13800 }, { "epoch": 0.8654508992918468, "grad_norm": 4.978739261627197, "learning_rate": 3.5951320428110025e-05, "loss": 0.2478, "step": 13810 }, { "epoch": 0.8660775835056715, "grad_norm": 0.06660338491201401, "learning_rate": 3.594076544721454e-05, "loss": 0.11, "step": 13820 }, { "epoch": 0.8667042677194962, "grad_norm": 0.0601072795689106, "learning_rate": 3.593021046631906e-05, "loss": 0.0817, "step": 13830 }, { "epoch": 0.8673309519333208, "grad_norm": 0.0625920370221138, "learning_rate": 3.5919655485423575e-05, "loss": 0.1403, "step": 13840 }, { "epoch": 0.8679576361471455, "grad_norm": 3.843177556991577, "learning_rate": 3.5909100504528085e-05, "loss": 0.1269, "step": 13850 }, { "epoch": 0.8685843203609701, "grad_norm": 3.897458791732788, "learning_rate": 3.58985455236326e-05, "loss": 0.1432, "step": 13860 }, { "epoch": 0.8692110045747947, "grad_norm": 1.523318886756897, "learning_rate": 3.588799054273712e-05, "loss": 0.1433, "step": 13870 }, { "epoch": 0.8698376887886194, "grad_norm": 0.05766427889466286, "learning_rate": 3.5877435561841635e-05, "loss": 0.0306, "step": 13880 }, { "epoch": 0.8704643730024441, "grad_norm": 1.394804835319519, "learning_rate": 3.586688058094615e-05, "loss": 0.0264, "step": 13890 }, { "epoch": 0.8710910572162687, "grad_norm": 0.05665925145149231, "learning_rate": 3.585632560005067e-05, "loss": 0.1586, "step": 13900 }, { "epoch": 0.8717177414300934, "grad_norm": 0.06234462931752205, "learning_rate": 3.5845770619155185e-05, "loss": 0.1324, "step": 13910 }, { "epoch": 0.872344425643918, "grad_norm": 0.08005021512508392, "learning_rate": 3.5835215638259695e-05, "loss": 0.1808, "step": 13920 }, { "epoch": 0.8729711098577427, "grad_norm": 0.11479296535253525, "learning_rate": 3.582466065736421e-05, "loss": 0.1577, "step": 13930 }, { "epoch": 0.8735977940715673, "grad_norm": 0.12691444158554077, "learning_rate": 3.581410567646873e-05, "loss": 0.099, "step": 13940 }, { "epoch": 0.874224478285392, "grad_norm": 0.1263216882944107, "learning_rate": 3.580355069557324e-05, "loss": 0.0946, "step": 13950 }, { "epoch": 0.8748511624992167, "grad_norm": 20.947620391845703, "learning_rate": 3.5792995714677755e-05, "loss": 0.2432, "step": 13960 }, { "epoch": 0.8754778467130413, "grad_norm": 0.9176009893417358, "learning_rate": 3.578244073378227e-05, "loss": 0.1903, "step": 13970 }, { "epoch": 0.8761045309268659, "grad_norm": 7.211978435516357, "learning_rate": 3.577188575288679e-05, "loss": 0.1683, "step": 13980 }, { "epoch": 0.8767312151406906, "grad_norm": 0.17308202385902405, "learning_rate": 3.5761330771991305e-05, "loss": 0.041, "step": 13990 }, { "epoch": 0.8773578993545152, "grad_norm": 0.30790331959724426, "learning_rate": 3.575077579109582e-05, "loss": 0.2975, "step": 14000 }, { "epoch": 0.8779845835683399, "grad_norm": 0.08187547326087952, "learning_rate": 3.574022081020034e-05, "loss": 0.2002, "step": 14010 }, { "epoch": 0.8786112677821646, "grad_norm": 0.38137057423591614, "learning_rate": 3.572966582930485e-05, "loss": 0.037, "step": 14020 }, { "epoch": 0.8792379519959892, "grad_norm": 0.10382096469402313, "learning_rate": 3.5719110848409365e-05, "loss": 0.1482, "step": 14030 }, { "epoch": 0.8798646362098139, "grad_norm": 0.13245761394500732, "learning_rate": 3.570855586751388e-05, "loss": 0.1223, "step": 14040 }, { "epoch": 0.8804913204236385, "grad_norm": 0.06675305962562561, "learning_rate": 3.56980008866184e-05, "loss": 0.0169, "step": 14050 }, { "epoch": 0.8811180046374631, "grad_norm": 0.0645369216799736, "learning_rate": 3.568744590572291e-05, "loss": 0.0948, "step": 14060 }, { "epoch": 0.8817446888512879, "grad_norm": 0.06949496269226074, "learning_rate": 3.5676890924827424e-05, "loss": 0.0758, "step": 14070 }, { "epoch": 0.8823713730651125, "grad_norm": 0.13635949790477753, "learning_rate": 3.566633594393195e-05, "loss": 0.2345, "step": 14080 }, { "epoch": 0.8829980572789371, "grad_norm": 19.03074073791504, "learning_rate": 3.565578096303646e-05, "loss": 0.0708, "step": 14090 }, { "epoch": 0.8836247414927618, "grad_norm": 0.17024590075016022, "learning_rate": 3.5645225982140974e-05, "loss": 0.1785, "step": 14100 }, { "epoch": 0.8842514257065864, "grad_norm": 0.13776808977127075, "learning_rate": 3.563467100124549e-05, "loss": 0.0187, "step": 14110 }, { "epoch": 0.8848781099204112, "grad_norm": 0.1499529480934143, "learning_rate": 3.562411602035001e-05, "loss": 0.1989, "step": 14120 }, { "epoch": 0.8855047941342358, "grad_norm": 3.0967087745666504, "learning_rate": 3.561356103945452e-05, "loss": 0.2515, "step": 14130 }, { "epoch": 0.8861314783480604, "grad_norm": 0.36061203479766846, "learning_rate": 3.5603006058559034e-05, "loss": 0.0717, "step": 14140 }, { "epoch": 0.8867581625618851, "grad_norm": 0.13822808861732483, "learning_rate": 3.559245107766355e-05, "loss": 0.1449, "step": 14150 }, { "epoch": 0.8873848467757097, "grad_norm": 0.12512069940567017, "learning_rate": 3.558189609676807e-05, "loss": 0.277, "step": 14160 }, { "epoch": 0.8880115309895343, "grad_norm": 0.12213852256536484, "learning_rate": 3.557134111587258e-05, "loss": 0.0714, "step": 14170 }, { "epoch": 0.8886382152033591, "grad_norm": 6.950976848602295, "learning_rate": 3.55607861349771e-05, "loss": 0.0562, "step": 14180 }, { "epoch": 0.8892648994171837, "grad_norm": 3.2038068771362305, "learning_rate": 3.555023115408162e-05, "loss": 0.1579, "step": 14190 }, { "epoch": 0.8898915836310083, "grad_norm": 0.07791581749916077, "learning_rate": 3.553967617318613e-05, "loss": 0.0488, "step": 14200 }, { "epoch": 0.890518267844833, "grad_norm": 0.6226568222045898, "learning_rate": 3.5529121192290644e-05, "loss": 0.128, "step": 14210 }, { "epoch": 0.8911449520586576, "grad_norm": 3.773237466812134, "learning_rate": 3.551856621139516e-05, "loss": 0.2278, "step": 14220 }, { "epoch": 0.8917716362724823, "grad_norm": 3.38862943649292, "learning_rate": 3.550801123049968e-05, "loss": 0.2659, "step": 14230 }, { "epoch": 0.892398320486307, "grad_norm": 2.387253522872925, "learning_rate": 3.549745624960419e-05, "loss": 0.1546, "step": 14240 }, { "epoch": 0.8930250047001316, "grad_norm": 0.2753084599971771, "learning_rate": 3.5486901268708704e-05, "loss": 0.2394, "step": 14250 }, { "epoch": 0.8936516889139563, "grad_norm": 0.22734327614307404, "learning_rate": 3.547634628781322e-05, "loss": 0.0995, "step": 14260 }, { "epoch": 0.8942783731277809, "grad_norm": 0.2785840928554535, "learning_rate": 3.546579130691774e-05, "loss": 0.2001, "step": 14270 }, { "epoch": 0.8949050573416055, "grad_norm": 12.311131477355957, "learning_rate": 3.5455236326022254e-05, "loss": 0.0849, "step": 14280 }, { "epoch": 0.8955317415554302, "grad_norm": 0.11820970475673676, "learning_rate": 3.544468134512677e-05, "loss": 0.0703, "step": 14290 }, { "epoch": 0.8961584257692549, "grad_norm": 0.13383793830871582, "learning_rate": 3.543412636423129e-05, "loss": 0.1594, "step": 14300 }, { "epoch": 0.8967851099830795, "grad_norm": 0.1279149353504181, "learning_rate": 3.54235713833358e-05, "loss": 0.1361, "step": 14310 }, { "epoch": 0.8974117941969042, "grad_norm": 0.11653663963079453, "learning_rate": 3.5413016402440314e-05, "loss": 0.2208, "step": 14320 }, { "epoch": 0.8980384784107288, "grad_norm": 6.753573894500732, "learning_rate": 3.540246142154483e-05, "loss": 0.3227, "step": 14330 }, { "epoch": 0.8986651626245535, "grad_norm": 0.18352077901363373, "learning_rate": 3.539190644064934e-05, "loss": 0.1691, "step": 14340 }, { "epoch": 0.8992918468383782, "grad_norm": 0.22739467024803162, "learning_rate": 3.538135145975386e-05, "loss": 0.1916, "step": 14350 }, { "epoch": 0.8999185310522028, "grad_norm": 3.4141712188720703, "learning_rate": 3.5370796478858373e-05, "loss": 0.0496, "step": 14360 }, { "epoch": 0.9005452152660275, "grad_norm": 4.359954357147217, "learning_rate": 3.536024149796289e-05, "loss": 0.3205, "step": 14370 }, { "epoch": 0.9011718994798521, "grad_norm": 0.2058669477701187, "learning_rate": 3.534968651706741e-05, "loss": 0.0168, "step": 14380 }, { "epoch": 0.9017985836936767, "grad_norm": 0.16354796290397644, "learning_rate": 3.533913153617192e-05, "loss": 0.1409, "step": 14390 }, { "epoch": 0.9024252679075014, "grad_norm": 0.184243842959404, "learning_rate": 3.532857655527644e-05, "loss": 0.2108, "step": 14400 }, { "epoch": 0.903051952121326, "grad_norm": 0.19970452785491943, "learning_rate": 3.531802157438095e-05, "loss": 0.0982, "step": 14410 }, { "epoch": 0.9036786363351507, "grad_norm": 5.539926052093506, "learning_rate": 3.5307466593485467e-05, "loss": 0.0542, "step": 14420 }, { "epoch": 0.9043053205489754, "grad_norm": 0.11390367895364761, "learning_rate": 3.529691161258998e-05, "loss": 0.1596, "step": 14430 }, { "epoch": 0.9049320047628, "grad_norm": 3.1585936546325684, "learning_rate": 3.52863566316945e-05, "loss": 0.1152, "step": 14440 }, { "epoch": 0.9055586889766247, "grad_norm": 0.20199960470199585, "learning_rate": 3.527580165079901e-05, "loss": 0.297, "step": 14450 }, { "epoch": 0.9061853731904493, "grad_norm": 0.20266208052635193, "learning_rate": 3.5265246669903526e-05, "loss": 0.0973, "step": 14460 }, { "epoch": 0.906812057404274, "grad_norm": 0.13221389055252075, "learning_rate": 3.525469168900804e-05, "loss": 0.0184, "step": 14470 }, { "epoch": 0.9074387416180987, "grad_norm": 0.10203318297863007, "learning_rate": 3.524413670811256e-05, "loss": 0.0098, "step": 14480 }, { "epoch": 0.9080654258319233, "grad_norm": 0.12804029881954193, "learning_rate": 3.5233581727217076e-05, "loss": 0.0036, "step": 14490 }, { "epoch": 0.9086921100457479, "grad_norm": 0.0833655372262001, "learning_rate": 3.522302674632159e-05, "loss": 0.2275, "step": 14500 }, { "epoch": 0.9093187942595726, "grad_norm": 60.491127014160156, "learning_rate": 3.521247176542611e-05, "loss": 0.2654, "step": 14510 }, { "epoch": 0.9099454784733972, "grad_norm": 4.894341468811035, "learning_rate": 3.520191678453062e-05, "loss": 0.021, "step": 14520 }, { "epoch": 0.910572162687222, "grad_norm": 3.7257537841796875, "learning_rate": 3.5191361803635136e-05, "loss": 0.2712, "step": 14530 }, { "epoch": 0.9111988469010466, "grad_norm": 25.76340675354004, "learning_rate": 3.518080682273965e-05, "loss": 0.1857, "step": 14540 }, { "epoch": 0.9118255311148712, "grad_norm": 3.0573976039886475, "learning_rate": 3.517025184184417e-05, "loss": 0.1953, "step": 14550 }, { "epoch": 0.9124522153286959, "grad_norm": 3.152639389038086, "learning_rate": 3.515969686094868e-05, "loss": 0.2077, "step": 14560 }, { "epoch": 0.9130788995425205, "grad_norm": 0.7926061153411865, "learning_rate": 3.5149141880053196e-05, "loss": 0.2346, "step": 14570 }, { "epoch": 0.9137055837563451, "grad_norm": 0.23289482295513153, "learning_rate": 3.513858689915772e-05, "loss": 0.1212, "step": 14580 }, { "epoch": 0.9143322679701699, "grad_norm": 0.28908389806747437, "learning_rate": 3.512803191826223e-05, "loss": 0.1442, "step": 14590 }, { "epoch": 0.9149589521839945, "grad_norm": 0.2696382701396942, "learning_rate": 3.5117476937366746e-05, "loss": 0.1659, "step": 14600 }, { "epoch": 0.9155856363978191, "grad_norm": 0.15111982822418213, "learning_rate": 3.510692195647126e-05, "loss": 0.2219, "step": 14610 }, { "epoch": 0.9162123206116438, "grad_norm": 36.52037048339844, "learning_rate": 3.509636697557578e-05, "loss": 0.1229, "step": 14620 }, { "epoch": 0.9168390048254684, "grad_norm": 0.141217902302742, "learning_rate": 3.508581199468029e-05, "loss": 0.0352, "step": 14630 }, { "epoch": 0.9174656890392932, "grad_norm": 0.11265815049409866, "learning_rate": 3.5075257013784806e-05, "loss": 0.1197, "step": 14640 }, { "epoch": 0.9180923732531178, "grad_norm": 6.607141971588135, "learning_rate": 3.506470203288932e-05, "loss": 0.2776, "step": 14650 }, { "epoch": 0.9187190574669424, "grad_norm": 0.35055074095726013, "learning_rate": 3.505414705199383e-05, "loss": 0.2109, "step": 14660 }, { "epoch": 0.9193457416807671, "grad_norm": 0.12104617059230804, "learning_rate": 3.504359207109835e-05, "loss": 0.1882, "step": 14670 }, { "epoch": 0.9199724258945917, "grad_norm": 0.1288590133190155, "learning_rate": 3.503303709020287e-05, "loss": 0.1052, "step": 14680 }, { "epoch": 0.9205991101084163, "grad_norm": 0.472810834646225, "learning_rate": 3.502248210930739e-05, "loss": 0.1659, "step": 14690 }, { "epoch": 0.921225794322241, "grad_norm": 0.09575945138931274, "learning_rate": 3.50119271284119e-05, "loss": 0.0759, "step": 14700 }, { "epoch": 0.9218524785360657, "grad_norm": 5.256383419036865, "learning_rate": 3.5001372147516416e-05, "loss": 0.4578, "step": 14710 }, { "epoch": 0.9224791627498903, "grad_norm": 3.0405874252319336, "learning_rate": 3.499081716662093e-05, "loss": 0.2224, "step": 14720 }, { "epoch": 0.923105846963715, "grad_norm": 185.64772033691406, "learning_rate": 3.498026218572544e-05, "loss": 0.0856, "step": 14730 }, { "epoch": 0.9237325311775396, "grad_norm": 0.2516983449459076, "learning_rate": 3.496970720482996e-05, "loss": 0.1241, "step": 14740 }, { "epoch": 0.9243592153913643, "grad_norm": 3.004882574081421, "learning_rate": 3.4959152223934475e-05, "loss": 0.0792, "step": 14750 }, { "epoch": 0.924985899605189, "grad_norm": 0.12979790568351746, "learning_rate": 3.494859724303899e-05, "loss": 0.2953, "step": 14760 }, { "epoch": 0.9256125838190136, "grad_norm": 0.11266639828681946, "learning_rate": 3.493804226214351e-05, "loss": 0.1118, "step": 14770 }, { "epoch": 0.9262392680328383, "grad_norm": 0.18054163455963135, "learning_rate": 3.4927487281248025e-05, "loss": 0.1235, "step": 14780 }, { "epoch": 0.9268659522466629, "grad_norm": 0.07736846059560776, "learning_rate": 3.491693230035254e-05, "loss": 0.0064, "step": 14790 }, { "epoch": 0.9274926364604875, "grad_norm": 0.08575043082237244, "learning_rate": 3.490637731945705e-05, "loss": 0.1972, "step": 14800 }, { "epoch": 0.9281193206743122, "grad_norm": 2.6329586505889893, "learning_rate": 3.489582233856157e-05, "loss": 0.1617, "step": 14810 }, { "epoch": 0.9287460048881369, "grad_norm": 3.1636767387390137, "learning_rate": 3.4885267357666085e-05, "loss": 0.1465, "step": 14820 }, { "epoch": 0.9293726891019615, "grad_norm": 1.366560697555542, "learning_rate": 3.48747123767706e-05, "loss": 0.2379, "step": 14830 }, { "epoch": 0.9299993733157862, "grad_norm": 1.945754051208496, "learning_rate": 3.486415739587511e-05, "loss": 0.1275, "step": 14840 }, { "epoch": 0.9306260575296108, "grad_norm": 3.5652472972869873, "learning_rate": 3.485360241497963e-05, "loss": 0.1644, "step": 14850 }, { "epoch": 0.9312527417434355, "grad_norm": 0.10736887902021408, "learning_rate": 3.4843047434084145e-05, "loss": 0.0855, "step": 14860 }, { "epoch": 0.9318794259572601, "grad_norm": 0.14352063834667206, "learning_rate": 3.483249245318866e-05, "loss": 0.1366, "step": 14870 }, { "epoch": 0.9325061101710848, "grad_norm": 0.2069852501153946, "learning_rate": 3.482193747229318e-05, "loss": 0.131, "step": 14880 }, { "epoch": 0.9331327943849095, "grad_norm": 2.4952032566070557, "learning_rate": 3.4811382491397695e-05, "loss": 0.043, "step": 14890 }, { "epoch": 0.9337594785987341, "grad_norm": 0.050907865166664124, "learning_rate": 3.480082751050221e-05, "loss": 0.1149, "step": 14900 }, { "epoch": 0.9343861628125587, "grad_norm": 0.11749767512083054, "learning_rate": 3.479027252960672e-05, "loss": 0.0684, "step": 14910 }, { "epoch": 0.9350128470263834, "grad_norm": 94.1059341430664, "learning_rate": 3.477971754871124e-05, "loss": 0.2177, "step": 14920 }, { "epoch": 0.935639531240208, "grad_norm": 0.7639022469520569, "learning_rate": 3.4769162567815755e-05, "loss": 0.3833, "step": 14930 }, { "epoch": 0.9362662154540327, "grad_norm": 5.576292514801025, "learning_rate": 3.475860758692027e-05, "loss": 0.3064, "step": 14940 }, { "epoch": 0.9368928996678574, "grad_norm": 0.25809335708618164, "learning_rate": 3.474805260602478e-05, "loss": 0.0185, "step": 14950 }, { "epoch": 0.937519583881682, "grad_norm": 0.10612110793590546, "learning_rate": 3.47374976251293e-05, "loss": 0.0613, "step": 14960 }, { "epoch": 0.9381462680955067, "grad_norm": 0.11804734915494919, "learning_rate": 3.4726942644233815e-05, "loss": 0.0727, "step": 14970 }, { "epoch": 0.9387729523093313, "grad_norm": 0.1256975531578064, "learning_rate": 3.471638766333833e-05, "loss": 0.1307, "step": 14980 }, { "epoch": 0.939399636523156, "grad_norm": 1.1834015846252441, "learning_rate": 3.470583268244285e-05, "loss": 0.1811, "step": 14990 }, { "epoch": 0.9400263207369807, "grad_norm": 0.09870567917823792, "learning_rate": 3.4695277701547365e-05, "loss": 0.1701, "step": 15000 }, { "epoch": 0.9406530049508053, "grad_norm": 0.08171737194061279, "learning_rate": 3.468472272065188e-05, "loss": 0.1085, "step": 15010 }, { "epoch": 0.9412796891646299, "grad_norm": 0.08438348025083542, "learning_rate": 3.467416773975639e-05, "loss": 0.021, "step": 15020 }, { "epoch": 0.9419063733784546, "grad_norm": 19.801450729370117, "learning_rate": 3.466361275886091e-05, "loss": 0.0466, "step": 15030 }, { "epoch": 0.9425330575922792, "grad_norm": 4.311177730560303, "learning_rate": 3.4653057777965424e-05, "loss": 0.1786, "step": 15040 }, { "epoch": 0.943159741806104, "grad_norm": 3.4445955753326416, "learning_rate": 3.4642502797069934e-05, "loss": 0.177, "step": 15050 }, { "epoch": 0.9437864260199286, "grad_norm": 0.06916206330060959, "learning_rate": 3.463194781617445e-05, "loss": 0.0445, "step": 15060 }, { "epoch": 0.9444131102337532, "grad_norm": 0.06181073561310768, "learning_rate": 3.462139283527897e-05, "loss": 0.1036, "step": 15070 }, { "epoch": 0.9450397944475779, "grad_norm": 0.6313286423683167, "learning_rate": 3.461083785438349e-05, "loss": 0.1272, "step": 15080 }, { "epoch": 0.9456664786614025, "grad_norm": 0.22247269749641418, "learning_rate": 3.4600282873488e-05, "loss": 0.195, "step": 15090 }, { "epoch": 0.9462931628752271, "grad_norm": 0.09834704548120499, "learning_rate": 3.458972789259252e-05, "loss": 0.0369, "step": 15100 }, { "epoch": 0.9469198470890519, "grad_norm": 41.505699157714844, "learning_rate": 3.4579172911697034e-05, "loss": 0.2066, "step": 15110 }, { "epoch": 0.9475465313028765, "grad_norm": 0.06724511831998825, "learning_rate": 3.4568617930801544e-05, "loss": 0.0907, "step": 15120 }, { "epoch": 0.9481732155167011, "grad_norm": 0.7390651702880859, "learning_rate": 3.455806294990606e-05, "loss": 0.1586, "step": 15130 }, { "epoch": 0.9487998997305258, "grad_norm": 1.8994289636611938, "learning_rate": 3.454750796901058e-05, "loss": 0.146, "step": 15140 }, { "epoch": 0.9494265839443504, "grad_norm": 0.05240581929683685, "learning_rate": 3.4536952988115094e-05, "loss": 0.0999, "step": 15150 }, { "epoch": 0.9500532681581751, "grad_norm": 0.12572909891605377, "learning_rate": 3.4526398007219604e-05, "loss": 0.2249, "step": 15160 }, { "epoch": 0.9506799523719998, "grad_norm": 6.992523670196533, "learning_rate": 3.451584302632413e-05, "loss": 0.2247, "step": 15170 }, { "epoch": 0.9513066365858244, "grad_norm": 0.2796679437160492, "learning_rate": 3.4505288045428644e-05, "loss": 0.0354, "step": 15180 }, { "epoch": 0.9519333207996491, "grad_norm": 0.16293998062610626, "learning_rate": 3.4494733064533154e-05, "loss": 0.122, "step": 15190 }, { "epoch": 0.9525600050134737, "grad_norm": 0.2495272159576416, "learning_rate": 3.448417808363767e-05, "loss": 0.0535, "step": 15200 }, { "epoch": 0.9531866892272983, "grad_norm": 59.95820617675781, "learning_rate": 3.447362310274219e-05, "loss": 0.1418, "step": 15210 }, { "epoch": 0.953813373441123, "grad_norm": 2.4325592517852783, "learning_rate": 3.4463068121846704e-05, "loss": 0.0899, "step": 15220 }, { "epoch": 0.9544400576549477, "grad_norm": 0.05558519810438156, "learning_rate": 3.4452513140951214e-05, "loss": 0.0207, "step": 15230 }, { "epoch": 0.9550667418687723, "grad_norm": 0.07892566174268723, "learning_rate": 3.444195816005573e-05, "loss": 0.151, "step": 15240 }, { "epoch": 0.955693426082597, "grad_norm": 0.07319292426109314, "learning_rate": 3.443140317916025e-05, "loss": 0.0545, "step": 15250 }, { "epoch": 0.9563201102964216, "grad_norm": 0.5627008676528931, "learning_rate": 3.442084819826476e-05, "loss": 0.3364, "step": 15260 }, { "epoch": 0.9569467945102463, "grad_norm": 0.2820579409599304, "learning_rate": 3.441029321736928e-05, "loss": 0.0516, "step": 15270 }, { "epoch": 0.957573478724071, "grad_norm": 2.9985768795013428, "learning_rate": 3.43997382364738e-05, "loss": 0.2052, "step": 15280 }, { "epoch": 0.9582001629378956, "grad_norm": 0.43202292919158936, "learning_rate": 3.4389183255578314e-05, "loss": 0.1178, "step": 15290 }, { "epoch": 0.9588268471517203, "grad_norm": 0.08718843758106232, "learning_rate": 3.4378628274682823e-05, "loss": 0.161, "step": 15300 }, { "epoch": 0.9594535313655449, "grad_norm": 0.2119925618171692, "learning_rate": 3.436807329378734e-05, "loss": 0.0193, "step": 15310 }, { "epoch": 0.9600802155793695, "grad_norm": 0.25086766481399536, "learning_rate": 3.435751831289186e-05, "loss": 0.0914, "step": 15320 }, { "epoch": 0.9607068997931942, "grad_norm": 0.19801168143749237, "learning_rate": 3.434696333199637e-05, "loss": 0.0216, "step": 15330 }, { "epoch": 0.9613335840070188, "grad_norm": 1.4686822891235352, "learning_rate": 3.433640835110088e-05, "loss": 0.1907, "step": 15340 }, { "epoch": 0.9619602682208435, "grad_norm": 7.100295066833496, "learning_rate": 3.43258533702054e-05, "loss": 0.2471, "step": 15350 }, { "epoch": 0.9625869524346682, "grad_norm": 1.9733787775039673, "learning_rate": 3.4315298389309917e-05, "loss": 0.2039, "step": 15360 }, { "epoch": 0.9632136366484928, "grad_norm": 0.7774206399917603, "learning_rate": 3.430474340841443e-05, "loss": 0.033, "step": 15370 }, { "epoch": 0.9638403208623175, "grad_norm": 0.05901852250099182, "learning_rate": 3.429418842751895e-05, "loss": 0.1349, "step": 15380 }, { "epoch": 0.9644670050761421, "grad_norm": 3.6407742500305176, "learning_rate": 3.4283633446623466e-05, "loss": 0.1869, "step": 15390 }, { "epoch": 0.9650936892899667, "grad_norm": 6.53506326675415, "learning_rate": 3.427307846572798e-05, "loss": 0.2079, "step": 15400 }, { "epoch": 0.9657203735037915, "grad_norm": 0.1483003944158554, "learning_rate": 3.426252348483249e-05, "loss": 0.0777, "step": 15410 }, { "epoch": 0.9663470577176161, "grad_norm": 0.16284331679344177, "learning_rate": 3.425196850393701e-05, "loss": 0.1663, "step": 15420 }, { "epoch": 0.9669737419314407, "grad_norm": 0.08627568185329437, "learning_rate": 3.4241413523041526e-05, "loss": 0.0315, "step": 15430 }, { "epoch": 0.9676004261452654, "grad_norm": 0.3580785393714905, "learning_rate": 3.4230858542146036e-05, "loss": 0.0236, "step": 15440 }, { "epoch": 0.96822711035909, "grad_norm": 2.2506258487701416, "learning_rate": 3.422030356125055e-05, "loss": 0.1656, "step": 15450 }, { "epoch": 0.9688537945729148, "grad_norm": 0.042580630630254745, "learning_rate": 3.420974858035507e-05, "loss": 0.0181, "step": 15460 }, { "epoch": 0.9694804787867394, "grad_norm": 4.1472344398498535, "learning_rate": 3.4199193599459586e-05, "loss": 0.1191, "step": 15470 }, { "epoch": 0.970107163000564, "grad_norm": 0.04683535918593407, "learning_rate": 3.41886386185641e-05, "loss": 0.0648, "step": 15480 }, { "epoch": 0.9707338472143887, "grad_norm": 0.19521360099315643, "learning_rate": 3.417808363766862e-05, "loss": 0.0304, "step": 15490 }, { "epoch": 0.9713605314282133, "grad_norm": 0.04040757939219475, "learning_rate": 3.4167528656773136e-05, "loss": 0.0192, "step": 15500 }, { "epoch": 0.9719872156420379, "grad_norm": 0.8133098483085632, "learning_rate": 3.4156973675877646e-05, "loss": 0.2299, "step": 15510 }, { "epoch": 0.9726138998558627, "grad_norm": 1.3788111209869385, "learning_rate": 3.414641869498216e-05, "loss": 0.0283, "step": 15520 }, { "epoch": 0.9732405840696873, "grad_norm": 0.045829709619283676, "learning_rate": 3.413586371408668e-05, "loss": 0.1146, "step": 15530 }, { "epoch": 0.9738672682835119, "grad_norm": 0.03582155704498291, "learning_rate": 3.4125308733191196e-05, "loss": 0.0558, "step": 15540 }, { "epoch": 0.9744939524973366, "grad_norm": 0.06874176859855652, "learning_rate": 3.4114753752295706e-05, "loss": 0.3368, "step": 15550 }, { "epoch": 0.9751206367111612, "grad_norm": 0.11948882043361664, "learning_rate": 3.410419877140022e-05, "loss": 0.134, "step": 15560 }, { "epoch": 0.975747320924986, "grad_norm": 3.4153361320495605, "learning_rate": 3.409364379050474e-05, "loss": 0.2266, "step": 15570 }, { "epoch": 0.9763740051388106, "grad_norm": 0.24882833659648895, "learning_rate": 3.4083088809609256e-05, "loss": 0.0638, "step": 15580 }, { "epoch": 0.9770006893526352, "grad_norm": 2.382737874984741, "learning_rate": 3.407253382871377e-05, "loss": 0.1365, "step": 15590 }, { "epoch": 0.9776273735664599, "grad_norm": 0.11060065776109695, "learning_rate": 3.406197884781829e-05, "loss": 0.1752, "step": 15600 }, { "epoch": 0.9782540577802845, "grad_norm": 0.1280880719423294, "learning_rate": 3.4051423866922806e-05, "loss": 0.0645, "step": 15610 }, { "epoch": 0.9788807419941091, "grad_norm": 4.298983097076416, "learning_rate": 3.4040868886027316e-05, "loss": 0.1975, "step": 15620 }, { "epoch": 0.9795074262079339, "grad_norm": 0.4864870607852936, "learning_rate": 3.403031390513183e-05, "loss": 0.0377, "step": 15630 }, { "epoch": 0.9801341104217585, "grad_norm": 0.08115755021572113, "learning_rate": 3.401975892423635e-05, "loss": 0.0884, "step": 15640 }, { "epoch": 0.9807607946355831, "grad_norm": 0.27335405349731445, "learning_rate": 3.400920394334086e-05, "loss": 0.3032, "step": 15650 }, { "epoch": 0.9813874788494078, "grad_norm": 0.18419836461544037, "learning_rate": 3.3998648962445375e-05, "loss": 0.0266, "step": 15660 }, { "epoch": 0.9820141630632324, "grad_norm": 0.09136651456356049, "learning_rate": 3.39880939815499e-05, "loss": 0.0558, "step": 15670 }, { "epoch": 0.9826408472770571, "grad_norm": 0.06150569021701813, "learning_rate": 3.3977539000654416e-05, "loss": 0.2052, "step": 15680 }, { "epoch": 0.9832675314908818, "grad_norm": 0.3006777763366699, "learning_rate": 3.3966984019758925e-05, "loss": 0.004, "step": 15690 }, { "epoch": 0.9838942157047064, "grad_norm": 0.06138232722878456, "learning_rate": 3.395642903886344e-05, "loss": 0.1408, "step": 15700 }, { "epoch": 0.9845208999185311, "grad_norm": 5.341544151306152, "learning_rate": 3.394587405796796e-05, "loss": 0.1527, "step": 15710 }, { "epoch": 0.9851475841323557, "grad_norm": 1.9161176681518555, "learning_rate": 3.393531907707247e-05, "loss": 0.1775, "step": 15720 }, { "epoch": 0.9857742683461803, "grad_norm": 0.08122947812080383, "learning_rate": 3.3924764096176985e-05, "loss": 0.111, "step": 15730 }, { "epoch": 0.986400952560005, "grad_norm": 0.06884922832250595, "learning_rate": 3.39142091152815e-05, "loss": 0.0213, "step": 15740 }, { "epoch": 0.9870276367738297, "grad_norm": 0.06372876465320587, "learning_rate": 3.390365413438602e-05, "loss": 0.0811, "step": 15750 }, { "epoch": 0.9876543209876543, "grad_norm": 0.07776562124490738, "learning_rate": 3.3893099153490535e-05, "loss": 0.0972, "step": 15760 }, { "epoch": 0.988281005201479, "grad_norm": 3.338428258895874, "learning_rate": 3.388254417259505e-05, "loss": 0.1325, "step": 15770 }, { "epoch": 0.9889076894153036, "grad_norm": 17.43355941772461, "learning_rate": 3.387198919169957e-05, "loss": 0.1241, "step": 15780 }, { "epoch": 0.9895343736291283, "grad_norm": 0.08450421690940857, "learning_rate": 3.3861434210804085e-05, "loss": 0.1008, "step": 15790 }, { "epoch": 0.9901610578429529, "grad_norm": 1.0543551445007324, "learning_rate": 3.3850879229908595e-05, "loss": 0.1727, "step": 15800 }, { "epoch": 0.9907877420567776, "grad_norm": 0.10355456173419952, "learning_rate": 3.384032424901311e-05, "loss": 0.1414, "step": 15810 }, { "epoch": 0.9914144262706023, "grad_norm": 0.32224273681640625, "learning_rate": 3.382976926811763e-05, "loss": 0.2139, "step": 15820 }, { "epoch": 0.9920411104844269, "grad_norm": 1.6528878211975098, "learning_rate": 3.381921428722214e-05, "loss": 0.0422, "step": 15830 }, { "epoch": 0.9926677946982515, "grad_norm": 0.08288286626338959, "learning_rate": 3.3808659306326655e-05, "loss": 0.1074, "step": 15840 }, { "epoch": 0.9932944789120762, "grad_norm": 0.5336832404136658, "learning_rate": 3.379810432543117e-05, "loss": 0.0239, "step": 15850 }, { "epoch": 0.9939211631259008, "grad_norm": 0.07496319711208344, "learning_rate": 3.378754934453569e-05, "loss": 0.2356, "step": 15860 }, { "epoch": 0.9945478473397256, "grad_norm": 20.032400131225586, "learning_rate": 3.3776994363640205e-05, "loss": 0.3109, "step": 15870 }, { "epoch": 0.9951745315535502, "grad_norm": 0.8633710145950317, "learning_rate": 3.376643938274472e-05, "loss": 0.1438, "step": 15880 }, { "epoch": 0.9958012157673748, "grad_norm": 3.0096535682678223, "learning_rate": 3.375588440184924e-05, "loss": 0.3369, "step": 15890 }, { "epoch": 0.9964278999811995, "grad_norm": 0.24672941863536835, "learning_rate": 3.374532942095375e-05, "loss": 0.0294, "step": 15900 }, { "epoch": 0.9970545841950241, "grad_norm": 0.13079603016376495, "learning_rate": 3.3734774440058265e-05, "loss": 0.1003, "step": 15910 }, { "epoch": 0.9976812684088487, "grad_norm": 0.10342691093683243, "learning_rate": 3.372421945916278e-05, "loss": 0.0169, "step": 15920 }, { "epoch": 0.9983079526226735, "grad_norm": 3.7852447032928467, "learning_rate": 3.37136644782673e-05, "loss": 0.248, "step": 15930 }, { "epoch": 0.9989346368364981, "grad_norm": 0.4074271023273468, "learning_rate": 3.370310949737181e-05, "loss": 0.1473, "step": 15940 }, { "epoch": 0.9995613210503227, "grad_norm": 0.073146291077137, "learning_rate": 3.3692554516476324e-05, "loss": 0.0179, "step": 15950 }, { "epoch": 1.0, "eval_accuracy": 0.9645621181262729, "eval_f1": 0.9639957278227224, "eval_loss": 0.1336221545934677, "eval_precision": 0.9636563068483206, "eval_recall": 0.9645621181262729, "eval_runtime": 288.363, "eval_samples_per_second": 110.676, "eval_steps_per_second": 13.837, "step": 15957 }, { "epoch": 1.0001880052641474, "grad_norm": 1.1847440004348755, "learning_rate": 3.368199953558084e-05, "loss": 0.0828, "step": 15960 }, { "epoch": 1.0008146894779721, "grad_norm": 0.06779574602842331, "learning_rate": 3.367144455468536e-05, "loss": 0.1321, "step": 15970 }, { "epoch": 1.0014413736917966, "grad_norm": 0.06015975400805473, "learning_rate": 3.3660889573789874e-05, "loss": 0.0055, "step": 15980 }, { "epoch": 1.0020680579056214, "grad_norm": 0.04776463657617569, "learning_rate": 3.365033459289439e-05, "loss": 0.1423, "step": 15990 }, { "epoch": 1.002694742119446, "grad_norm": 3.770029306411743, "learning_rate": 3.363977961199891e-05, "loss": 0.1954, "step": 16000 }, { "epoch": 1.0033214263332706, "grad_norm": 1.7021197080612183, "learning_rate": 3.362922463110342e-05, "loss": 0.1411, "step": 16010 }, { "epoch": 1.0039481105470953, "grad_norm": 0.05614056438207626, "learning_rate": 3.3618669650207934e-05, "loss": 0.1156, "step": 16020 }, { "epoch": 1.00457479476092, "grad_norm": 0.10399208217859268, "learning_rate": 3.360811466931245e-05, "loss": 0.1231, "step": 16030 }, { "epoch": 1.0052014789747445, "grad_norm": 0.07346727699041367, "learning_rate": 3.359755968841696e-05, "loss": 0.0193, "step": 16040 }, { "epoch": 1.0058281631885693, "grad_norm": 0.5521314740180969, "learning_rate": 3.358700470752148e-05, "loss": 0.0412, "step": 16050 }, { "epoch": 1.006454847402394, "grad_norm": 0.34978896379470825, "learning_rate": 3.3576449726625994e-05, "loss": 0.0532, "step": 16060 }, { "epoch": 1.0070815316162185, "grad_norm": 0.03547803685069084, "learning_rate": 3.356589474573051e-05, "loss": 0.1295, "step": 16070 }, { "epoch": 1.0077082158300432, "grad_norm": 0.036940619349479675, "learning_rate": 3.355533976483503e-05, "loss": 0.0885, "step": 16080 }, { "epoch": 1.008334900043868, "grad_norm": 0.6497445106506348, "learning_rate": 3.3544784783939544e-05, "loss": 0.1189, "step": 16090 }, { "epoch": 1.0089615842576924, "grad_norm": 6.425511360168457, "learning_rate": 3.353422980304406e-05, "loss": 0.2203, "step": 16100 }, { "epoch": 1.0095882684715172, "grad_norm": 0.05597083643078804, "learning_rate": 3.352367482214857e-05, "loss": 0.1882, "step": 16110 }, { "epoch": 1.010214952685342, "grad_norm": 4.08495569229126, "learning_rate": 3.351311984125309e-05, "loss": 0.1379, "step": 16120 }, { "epoch": 1.0108416368991666, "grad_norm": 4.696819305419922, "learning_rate": 3.3502564860357604e-05, "loss": 0.2121, "step": 16130 }, { "epoch": 1.0114683211129911, "grad_norm": 0.11512627452611923, "learning_rate": 3.349200987946212e-05, "loss": 0.0728, "step": 16140 }, { "epoch": 1.0120950053268158, "grad_norm": 2.806684970855713, "learning_rate": 3.348145489856663e-05, "loss": 0.0728, "step": 16150 }, { "epoch": 1.0127216895406406, "grad_norm": 0.11947732418775558, "learning_rate": 3.347089991767115e-05, "loss": 0.0752, "step": 16160 }, { "epoch": 1.013348373754465, "grad_norm": 0.5134645700454712, "learning_rate": 3.346034493677567e-05, "loss": 0.1281, "step": 16170 }, { "epoch": 1.0139750579682898, "grad_norm": 0.08177413046360016, "learning_rate": 3.344978995588019e-05, "loss": 0.2551, "step": 16180 }, { "epoch": 1.0146017421821145, "grad_norm": 2.666290760040283, "learning_rate": 3.34392349749847e-05, "loss": 0.0988, "step": 16190 }, { "epoch": 1.015228426395939, "grad_norm": 0.20499251782894135, "learning_rate": 3.3428679994089214e-05, "loss": 0.1529, "step": 16200 }, { "epoch": 1.0158551106097637, "grad_norm": 0.056084733456373215, "learning_rate": 3.341812501319373e-05, "loss": 0.1688, "step": 16210 }, { "epoch": 1.0164817948235885, "grad_norm": 0.2051260620355606, "learning_rate": 3.340757003229824e-05, "loss": 0.1783, "step": 16220 }, { "epoch": 1.017108479037413, "grad_norm": 0.20934149622917175, "learning_rate": 3.339701505140276e-05, "loss": 0.0797, "step": 16230 }, { "epoch": 1.0177351632512377, "grad_norm": 8.540879249572754, "learning_rate": 3.3386460070507273e-05, "loss": 0.1156, "step": 16240 }, { "epoch": 1.0183618474650624, "grad_norm": 0.07769911736249924, "learning_rate": 3.337590508961179e-05, "loss": 0.2469, "step": 16250 }, { "epoch": 1.018988531678887, "grad_norm": 0.10938192158937454, "learning_rate": 3.336535010871631e-05, "loss": 0.1786, "step": 16260 }, { "epoch": 1.0196152158927116, "grad_norm": 0.061687126755714417, "learning_rate": 3.335479512782082e-05, "loss": 0.0405, "step": 16270 }, { "epoch": 1.0202419001065364, "grad_norm": 0.0524788424372673, "learning_rate": 3.334424014692534e-05, "loss": 0.0525, "step": 16280 }, { "epoch": 1.0208685843203609, "grad_norm": 1.4585634469985962, "learning_rate": 3.333368516602985e-05, "loss": 0.1014, "step": 16290 }, { "epoch": 1.0214952685341856, "grad_norm": 0.05513199046254158, "learning_rate": 3.3323130185134367e-05, "loss": 0.1112, "step": 16300 }, { "epoch": 1.0221219527480103, "grad_norm": 0.04087262228131294, "learning_rate": 3.331257520423888e-05, "loss": 0.0553, "step": 16310 }, { "epoch": 1.0227486369618348, "grad_norm": 0.03982210531830788, "learning_rate": 3.33020202233434e-05, "loss": 0.1507, "step": 16320 }, { "epoch": 1.0233753211756595, "grad_norm": 0.3110230267047882, "learning_rate": 3.329146524244791e-05, "loss": 0.1162, "step": 16330 }, { "epoch": 1.0240020053894843, "grad_norm": 0.05139019340276718, "learning_rate": 3.3280910261552426e-05, "loss": 0.1322, "step": 16340 }, { "epoch": 1.024628689603309, "grad_norm": 4.573475360870361, "learning_rate": 3.327035528065694e-05, "loss": 0.2322, "step": 16350 }, { "epoch": 1.0252553738171335, "grad_norm": 0.8857044577598572, "learning_rate": 3.325980029976146e-05, "loss": 0.0564, "step": 16360 }, { "epoch": 1.0258820580309582, "grad_norm": 15.947443962097168, "learning_rate": 3.3249245318865976e-05, "loss": 0.111, "step": 16370 }, { "epoch": 1.026508742244783, "grad_norm": 0.10292017459869385, "learning_rate": 3.323869033797049e-05, "loss": 0.0134, "step": 16380 }, { "epoch": 1.0271354264586074, "grad_norm": 0.04638553783297539, "learning_rate": 3.322813535707501e-05, "loss": 0.0824, "step": 16390 }, { "epoch": 1.0277621106724322, "grad_norm": 0.07538238167762756, "learning_rate": 3.321758037617952e-05, "loss": 0.0706, "step": 16400 }, { "epoch": 1.028388794886257, "grad_norm": 22.8479061126709, "learning_rate": 3.3207025395284036e-05, "loss": 0.1156, "step": 16410 }, { "epoch": 1.0290154791000814, "grad_norm": 0.6563345789909363, "learning_rate": 3.319647041438855e-05, "loss": 0.064, "step": 16420 }, { "epoch": 1.0296421633139061, "grad_norm": 1.6031055450439453, "learning_rate": 3.318591543349306e-05, "loss": 0.1772, "step": 16430 }, { "epoch": 1.0302688475277308, "grad_norm": 9.018767356872559, "learning_rate": 3.317536045259758e-05, "loss": 0.1231, "step": 16440 }, { "epoch": 1.0308955317415553, "grad_norm": 0.10965459048748016, "learning_rate": 3.3164805471702096e-05, "loss": 0.2208, "step": 16450 }, { "epoch": 1.03152221595538, "grad_norm": 1.207930326461792, "learning_rate": 3.315425049080661e-05, "loss": 0.0507, "step": 16460 }, { "epoch": 1.0321489001692048, "grad_norm": 5.185263633728027, "learning_rate": 3.314369550991113e-05, "loss": 0.1507, "step": 16470 }, { "epoch": 1.0327755843830293, "grad_norm": 0.04256342723965645, "learning_rate": 3.3133140529015646e-05, "loss": 0.2055, "step": 16480 }, { "epoch": 1.033402268596854, "grad_norm": 0.7863683104515076, "learning_rate": 3.312258554812016e-05, "loss": 0.2199, "step": 16490 }, { "epoch": 1.0340289528106787, "grad_norm": 0.0994349867105484, "learning_rate": 3.311203056722467e-05, "loss": 0.0236, "step": 16500 }, { "epoch": 1.0346556370245032, "grad_norm": 0.051592059433460236, "learning_rate": 3.310147558632919e-05, "loss": 0.0409, "step": 16510 }, { "epoch": 1.035282321238328, "grad_norm": 0.2096475213766098, "learning_rate": 3.3090920605433706e-05, "loss": 0.0062, "step": 16520 }, { "epoch": 1.0359090054521527, "grad_norm": 0.027175744995474815, "learning_rate": 3.308036562453822e-05, "loss": 0.028, "step": 16530 }, { "epoch": 1.0365356896659774, "grad_norm": 0.5925734043121338, "learning_rate": 3.306981064364273e-05, "loss": 0.0372, "step": 16540 }, { "epoch": 1.037162373879802, "grad_norm": 0.03400746360421181, "learning_rate": 3.305925566274725e-05, "loss": 0.0195, "step": 16550 }, { "epoch": 1.0377890580936266, "grad_norm": 0.03515337407588959, "learning_rate": 3.3048700681851766e-05, "loss": 0.1732, "step": 16560 }, { "epoch": 1.0384157423074514, "grad_norm": 17.625837326049805, "learning_rate": 3.303814570095628e-05, "loss": 0.1175, "step": 16570 }, { "epoch": 1.0390424265212759, "grad_norm": 0.02709149569272995, "learning_rate": 3.30275907200608e-05, "loss": 0.0583, "step": 16580 }, { "epoch": 1.0396691107351006, "grad_norm": 3.532773971557617, "learning_rate": 3.3017035739165316e-05, "loss": 0.2537, "step": 16590 }, { "epoch": 1.0402957949489253, "grad_norm": 3.3194799423217773, "learning_rate": 3.300648075826983e-05, "loss": 0.2859, "step": 16600 }, { "epoch": 1.0409224791627498, "grad_norm": 0.24675169587135315, "learning_rate": 3.299592577737434e-05, "loss": 0.0852, "step": 16610 }, { "epoch": 1.0415491633765745, "grad_norm": 3.3506369590759277, "learning_rate": 3.298537079647886e-05, "loss": 0.1883, "step": 16620 }, { "epoch": 1.0421758475903993, "grad_norm": 0.6482851505279541, "learning_rate": 3.2974815815583375e-05, "loss": 0.0887, "step": 16630 }, { "epoch": 1.0428025318042238, "grad_norm": 0.10803534835577011, "learning_rate": 3.296426083468789e-05, "loss": 0.0358, "step": 16640 }, { "epoch": 1.0434292160180485, "grad_norm": 0.07975927740335464, "learning_rate": 3.29537058537924e-05, "loss": 0.211, "step": 16650 }, { "epoch": 1.0440559002318732, "grad_norm": 0.0813465490937233, "learning_rate": 3.294315087289692e-05, "loss": 0.0042, "step": 16660 }, { "epoch": 1.0446825844456977, "grad_norm": 0.09860538691282272, "learning_rate": 3.293259589200144e-05, "loss": 0.0623, "step": 16670 }, { "epoch": 1.0453092686595225, "grad_norm": 0.06293389201164246, "learning_rate": 3.292204091110595e-05, "loss": 0.0985, "step": 16680 }, { "epoch": 1.0459359528733472, "grad_norm": 0.148453950881958, "learning_rate": 3.291148593021047e-05, "loss": 0.048, "step": 16690 }, { "epoch": 1.0465626370871717, "grad_norm": 0.07422909885644913, "learning_rate": 3.2900930949314985e-05, "loss": 0.167, "step": 16700 }, { "epoch": 1.0471893213009964, "grad_norm": 0.07262309640645981, "learning_rate": 3.28903759684195e-05, "loss": 0.0609, "step": 16710 }, { "epoch": 1.0478160055148211, "grad_norm": 0.14245441555976868, "learning_rate": 3.287982098752401e-05, "loss": 0.0112, "step": 16720 }, { "epoch": 1.0484426897286458, "grad_norm": 0.3390648663043976, "learning_rate": 3.286926600662853e-05, "loss": 0.1155, "step": 16730 }, { "epoch": 1.0490693739424704, "grad_norm": 0.23542775213718414, "learning_rate": 3.2858711025733045e-05, "loss": 0.0935, "step": 16740 }, { "epoch": 1.049696058156295, "grad_norm": 21.997127532958984, "learning_rate": 3.2848156044837555e-05, "loss": 0.0458, "step": 16750 }, { "epoch": 1.0503227423701198, "grad_norm": 0.0299488827586174, "learning_rate": 3.283760106394208e-05, "loss": 0.2047, "step": 16760 }, { "epoch": 1.0509494265839443, "grad_norm": 0.046134356409311295, "learning_rate": 3.2827046083046595e-05, "loss": 0.1051, "step": 16770 }, { "epoch": 1.051576110797769, "grad_norm": 2.58500075340271, "learning_rate": 3.281649110215111e-05, "loss": 0.0448, "step": 16780 }, { "epoch": 1.0522027950115938, "grad_norm": 2.5090854167938232, "learning_rate": 3.280593612125562e-05, "loss": 0.1437, "step": 16790 }, { "epoch": 1.0528294792254183, "grad_norm": 0.15016499161720276, "learning_rate": 3.279538114036014e-05, "loss": 0.0185, "step": 16800 }, { "epoch": 1.053456163439243, "grad_norm": 0.15622135996818542, "learning_rate": 3.2784826159464655e-05, "loss": 0.2281, "step": 16810 }, { "epoch": 1.0540828476530677, "grad_norm": 0.16764254868030548, "learning_rate": 3.2774271178569165e-05, "loss": 0.0489, "step": 16820 }, { "epoch": 1.0547095318668922, "grad_norm": 13.331740379333496, "learning_rate": 3.276371619767368e-05, "loss": 0.2832, "step": 16830 }, { "epoch": 1.055336216080717, "grad_norm": 0.08065997809171677, "learning_rate": 3.27531612167782e-05, "loss": 0.1134, "step": 16840 }, { "epoch": 1.0559629002945417, "grad_norm": 1.8846511840820312, "learning_rate": 3.2742606235882715e-05, "loss": 0.0899, "step": 16850 }, { "epoch": 1.0565895845083662, "grad_norm": 0.03695497661828995, "learning_rate": 3.273205125498723e-05, "loss": 0.122, "step": 16860 }, { "epoch": 1.0572162687221909, "grad_norm": 1.0204274654388428, "learning_rate": 3.272149627409175e-05, "loss": 0.0522, "step": 16870 }, { "epoch": 1.0578429529360156, "grad_norm": 0.9126099944114685, "learning_rate": 3.2710941293196265e-05, "loss": 0.0771, "step": 16880 }, { "epoch": 1.05846963714984, "grad_norm": 20.570974349975586, "learning_rate": 3.2700386312300774e-05, "loss": 0.0991, "step": 16890 }, { "epoch": 1.0590963213636648, "grad_norm": 30.219226837158203, "learning_rate": 3.268983133140529e-05, "loss": 0.2042, "step": 16900 }, { "epoch": 1.0597230055774896, "grad_norm": 0.02713117003440857, "learning_rate": 3.267927635050981e-05, "loss": 0.0576, "step": 16910 }, { "epoch": 1.060349689791314, "grad_norm": 15.484114646911621, "learning_rate": 3.2668721369614324e-05, "loss": 0.0664, "step": 16920 }, { "epoch": 1.0609763740051388, "grad_norm": 0.24145345389842987, "learning_rate": 3.2658166388718834e-05, "loss": 0.1132, "step": 16930 }, { "epoch": 1.0616030582189635, "grad_norm": 10.560787200927734, "learning_rate": 3.264761140782335e-05, "loss": 0.1444, "step": 16940 }, { "epoch": 1.062229742432788, "grad_norm": 0.014589796774089336, "learning_rate": 3.263705642692787e-05, "loss": 0.1334, "step": 16950 }, { "epoch": 1.0628564266466127, "grad_norm": 3.2847418785095215, "learning_rate": 3.2626501446032384e-05, "loss": 0.1642, "step": 16960 }, { "epoch": 1.0634831108604375, "grad_norm": 0.014080416411161423, "learning_rate": 3.26159464651369e-05, "loss": 0.068, "step": 16970 }, { "epoch": 1.0641097950742622, "grad_norm": 1.794739842414856, "learning_rate": 3.260539148424142e-05, "loss": 0.1811, "step": 16980 }, { "epoch": 1.0647364792880867, "grad_norm": 0.10375296324491501, "learning_rate": 3.2594836503345934e-05, "loss": 0.0504, "step": 16990 }, { "epoch": 1.0653631635019114, "grad_norm": 0.10705555230379105, "learning_rate": 3.2584281522450444e-05, "loss": 0.1058, "step": 17000 }, { "epoch": 1.0659898477157361, "grad_norm": 0.012350371107459068, "learning_rate": 3.257372654155496e-05, "loss": 0.1455, "step": 17010 }, { "epoch": 1.0666165319295606, "grad_norm": 3.1003317832946777, "learning_rate": 3.256317156065948e-05, "loss": 0.1223, "step": 17020 }, { "epoch": 1.0672432161433854, "grad_norm": 7.691588878631592, "learning_rate": 3.2552616579763994e-05, "loss": 0.1669, "step": 17030 }, { "epoch": 1.06786990035721, "grad_norm": 0.1589193344116211, "learning_rate": 3.2542061598868504e-05, "loss": 0.0303, "step": 17040 }, { "epoch": 1.0684965845710346, "grad_norm": 6.517906665802002, "learning_rate": 3.253150661797302e-05, "loss": 0.1478, "step": 17050 }, { "epoch": 1.0691232687848593, "grad_norm": 2.8963265419006348, "learning_rate": 3.252095163707754e-05, "loss": 0.1978, "step": 17060 }, { "epoch": 1.069749952998684, "grad_norm": 0.056512974202632904, "learning_rate": 3.2510396656182054e-05, "loss": 0.0276, "step": 17070 }, { "epoch": 1.0703766372125085, "grad_norm": 0.2997797131538391, "learning_rate": 3.249984167528657e-05, "loss": 0.0334, "step": 17080 }, { "epoch": 1.0710033214263333, "grad_norm": 0.012841691263020039, "learning_rate": 3.248928669439109e-05, "loss": 0.0108, "step": 17090 }, { "epoch": 1.071630005640158, "grad_norm": 0.11793465912342072, "learning_rate": 3.2478731713495604e-05, "loss": 0.1028, "step": 17100 }, { "epoch": 1.0722566898539825, "grad_norm": 0.20693303644657135, "learning_rate": 3.2468176732600114e-05, "loss": 0.2159, "step": 17110 }, { "epoch": 1.0728833740678072, "grad_norm": 0.15515050292015076, "learning_rate": 3.245762175170463e-05, "loss": 0.0425, "step": 17120 }, { "epoch": 1.073510058281632, "grad_norm": 3.174474000930786, "learning_rate": 3.244706677080915e-05, "loss": 0.0634, "step": 17130 }, { "epoch": 1.0741367424954564, "grad_norm": 0.009767886251211166, "learning_rate": 3.243651178991366e-05, "loss": 0.004, "step": 17140 }, { "epoch": 1.0747634267092812, "grad_norm": 0.013338018208742142, "learning_rate": 3.2425956809018173e-05, "loss": 0.0869, "step": 17150 }, { "epoch": 1.0753901109231059, "grad_norm": 0.009402146562933922, "learning_rate": 3.241540182812269e-05, "loss": 0.0134, "step": 17160 }, { "epoch": 1.0760167951369306, "grad_norm": 4.5529375076293945, "learning_rate": 3.2404846847227214e-05, "loss": 0.3214, "step": 17170 }, { "epoch": 1.076643479350755, "grad_norm": 0.06630541384220123, "learning_rate": 3.2394291866331723e-05, "loss": 0.1297, "step": 17180 }, { "epoch": 1.0772701635645798, "grad_norm": 0.06347658485174179, "learning_rate": 3.238373688543624e-05, "loss": 0.007, "step": 17190 }, { "epoch": 1.0778968477784046, "grad_norm": 0.060882568359375, "learning_rate": 3.237318190454076e-05, "loss": 0.0988, "step": 17200 }, { "epoch": 1.078523531992229, "grad_norm": 0.44615989923477173, "learning_rate": 3.236262692364527e-05, "loss": 0.1414, "step": 17210 }, { "epoch": 1.0791502162060538, "grad_norm": 0.9382216334342957, "learning_rate": 3.235207194274978e-05, "loss": 0.1092, "step": 17220 }, { "epoch": 1.0797769004198785, "grad_norm": 3.53702974319458, "learning_rate": 3.23415169618543e-05, "loss": 0.0298, "step": 17230 }, { "epoch": 1.080403584633703, "grad_norm": 0.23474286496639252, "learning_rate": 3.2330961980958817e-05, "loss": 0.0526, "step": 17240 }, { "epoch": 1.0810302688475277, "grad_norm": 0.4255073666572571, "learning_rate": 3.2320407000063326e-05, "loss": 0.1293, "step": 17250 }, { "epoch": 1.0816569530613525, "grad_norm": 9.479437828063965, "learning_rate": 3.230985201916785e-05, "loss": 0.2246, "step": 17260 }, { "epoch": 1.082283637275177, "grad_norm": 5.882570743560791, "learning_rate": 3.2299297038272367e-05, "loss": 0.1457, "step": 17270 }, { "epoch": 1.0829103214890017, "grad_norm": 0.03978152573108673, "learning_rate": 3.2288742057376876e-05, "loss": 0.0228, "step": 17280 }, { "epoch": 1.0835370057028264, "grad_norm": 0.022685598582029343, "learning_rate": 3.227818707648139e-05, "loss": 0.0577, "step": 17290 }, { "epoch": 1.084163689916651, "grad_norm": 0.03258649632334709, "learning_rate": 3.226763209558591e-05, "loss": 0.116, "step": 17300 }, { "epoch": 1.0847903741304756, "grad_norm": 3.8251702785491943, "learning_rate": 3.2257077114690426e-05, "loss": 0.166, "step": 17310 }, { "epoch": 1.0854170583443004, "grad_norm": 4.577685832977295, "learning_rate": 3.2246522133794936e-05, "loss": 0.0057, "step": 17320 }, { "epoch": 1.0860437425581249, "grad_norm": 0.073435477912426, "learning_rate": 3.223596715289945e-05, "loss": 0.0874, "step": 17330 }, { "epoch": 1.0866704267719496, "grad_norm": 3.804044723510742, "learning_rate": 3.222541217200397e-05, "loss": 0.1952, "step": 17340 }, { "epoch": 1.0872971109857743, "grad_norm": 0.16419921815395355, "learning_rate": 3.2214857191108486e-05, "loss": 0.1542, "step": 17350 }, { "epoch": 1.087923795199599, "grad_norm": 0.1198701560497284, "learning_rate": 3.2204302210213e-05, "loss": 0.0191, "step": 17360 }, { "epoch": 1.0885504794134235, "grad_norm": 0.06243205815553665, "learning_rate": 3.219374722931752e-05, "loss": 0.0052, "step": 17370 }, { "epoch": 1.0891771636272483, "grad_norm": 0.048908885568380356, "learning_rate": 3.2183192248422036e-05, "loss": 0.1644, "step": 17380 }, { "epoch": 1.089803847841073, "grad_norm": 4.926666259765625, "learning_rate": 3.2172637267526546e-05, "loss": 0.1288, "step": 17390 }, { "epoch": 1.0904305320548975, "grad_norm": 0.06150239333510399, "learning_rate": 3.216208228663106e-05, "loss": 0.0992, "step": 17400 }, { "epoch": 1.0910572162687222, "grad_norm": 0.06462900340557098, "learning_rate": 3.215152730573558e-05, "loss": 0.0036, "step": 17410 }, { "epoch": 1.091683900482547, "grad_norm": 0.4827132523059845, "learning_rate": 3.2140972324840096e-05, "loss": 0.0116, "step": 17420 }, { "epoch": 1.0923105846963714, "grad_norm": 4.205441474914551, "learning_rate": 3.2130417343944606e-05, "loss": 0.0806, "step": 17430 }, { "epoch": 1.0929372689101962, "grad_norm": 0.048327527940273285, "learning_rate": 3.211986236304912e-05, "loss": 0.3297, "step": 17440 }, { "epoch": 1.0935639531240209, "grad_norm": 3.5729031562805176, "learning_rate": 3.210930738215364e-05, "loss": 0.1418, "step": 17450 }, { "epoch": 1.0941906373378454, "grad_norm": 0.17036008834838867, "learning_rate": 3.2098752401258156e-05, "loss": 0.2521, "step": 17460 }, { "epoch": 1.09481732155167, "grad_norm": 0.14171738922595978, "learning_rate": 3.208819742036267e-05, "loss": 0.0681, "step": 17470 }, { "epoch": 1.0954440057654948, "grad_norm": 3.630122184753418, "learning_rate": 3.207764243946719e-05, "loss": 0.1545, "step": 17480 }, { "epoch": 1.0960706899793193, "grad_norm": 6.238872528076172, "learning_rate": 3.2067087458571706e-05, "loss": 0.0847, "step": 17490 }, { "epoch": 1.096697374193144, "grad_norm": 3.537527561187744, "learning_rate": 3.2056532477676216e-05, "loss": 0.1196, "step": 17500 }, { "epoch": 1.0973240584069688, "grad_norm": 0.525282084941864, "learning_rate": 3.204597749678073e-05, "loss": 0.1227, "step": 17510 }, { "epoch": 1.0979507426207933, "grad_norm": 0.16825850307941437, "learning_rate": 3.203542251588525e-05, "loss": 0.0205, "step": 17520 }, { "epoch": 1.098577426834618, "grad_norm": 33.61454391479492, "learning_rate": 3.202486753498976e-05, "loss": 0.1907, "step": 17530 }, { "epoch": 1.0992041110484427, "grad_norm": 1.6062792539596558, "learning_rate": 3.2014312554094275e-05, "loss": 0.0417, "step": 17540 }, { "epoch": 1.0998307952622675, "grad_norm": 0.08073863387107849, "learning_rate": 3.200375757319879e-05, "loss": 0.0344, "step": 17550 }, { "epoch": 1.100457479476092, "grad_norm": 0.09482072293758392, "learning_rate": 3.199320259230331e-05, "loss": 0.1362, "step": 17560 }, { "epoch": 1.1010841636899167, "grad_norm": 12.88849925994873, "learning_rate": 3.1982647611407825e-05, "loss": 0.0964, "step": 17570 }, { "epoch": 1.1017108479037414, "grad_norm": 4.788585186004639, "learning_rate": 3.197209263051234e-05, "loss": 0.1735, "step": 17580 }, { "epoch": 1.102337532117566, "grad_norm": 0.06468937546014786, "learning_rate": 3.196153764961686e-05, "loss": 0.1058, "step": 17590 }, { "epoch": 1.1029642163313906, "grad_norm": 0.057759225368499756, "learning_rate": 3.195098266872137e-05, "loss": 0.0705, "step": 17600 }, { "epoch": 1.1035909005452154, "grad_norm": 0.7674767374992371, "learning_rate": 3.1940427687825885e-05, "loss": 0.1252, "step": 17610 }, { "epoch": 1.1042175847590399, "grad_norm": 0.2729140818119049, "learning_rate": 3.19298727069304e-05, "loss": 0.1003, "step": 17620 }, { "epoch": 1.1048442689728646, "grad_norm": 0.08846903592348099, "learning_rate": 3.191931772603492e-05, "loss": 0.1048, "step": 17630 }, { "epoch": 1.1054709531866893, "grad_norm": 0.16635166108608246, "learning_rate": 3.190876274513943e-05, "loss": 0.0924, "step": 17640 }, { "epoch": 1.1060976374005138, "grad_norm": 0.08288455754518509, "learning_rate": 3.1898207764243945e-05, "loss": 0.1139, "step": 17650 }, { "epoch": 1.1067243216143385, "grad_norm": 22.163753509521484, "learning_rate": 3.188765278334846e-05, "loss": 0.1695, "step": 17660 }, { "epoch": 1.1073510058281633, "grad_norm": 4.949541091918945, "learning_rate": 3.187709780245298e-05, "loss": 0.0646, "step": 17670 }, { "epoch": 1.1079776900419878, "grad_norm": 0.12406647950410843, "learning_rate": 3.1866542821557495e-05, "loss": 0.1347, "step": 17680 }, { "epoch": 1.1086043742558125, "grad_norm": 0.058781981468200684, "learning_rate": 3.185598784066201e-05, "loss": 0.0371, "step": 17690 }, { "epoch": 1.1092310584696372, "grad_norm": 0.07266169041395187, "learning_rate": 3.184543285976653e-05, "loss": 0.0737, "step": 17700 }, { "epoch": 1.1098577426834617, "grad_norm": 3.22371244430542, "learning_rate": 3.183487787887104e-05, "loss": 0.1593, "step": 17710 }, { "epoch": 1.1104844268972864, "grad_norm": 0.045021601021289825, "learning_rate": 3.1824322897975555e-05, "loss": 0.0134, "step": 17720 }, { "epoch": 1.1111111111111112, "grad_norm": 0.021073054522275925, "learning_rate": 3.181376791708007e-05, "loss": 0.0462, "step": 17730 }, { "epoch": 1.1117377953249357, "grad_norm": 0.6529107689857483, "learning_rate": 3.180321293618458e-05, "loss": 0.0337, "step": 17740 }, { "epoch": 1.1123644795387604, "grad_norm": 12.296225547790527, "learning_rate": 3.17926579552891e-05, "loss": 0.1708, "step": 17750 }, { "epoch": 1.112991163752585, "grad_norm": 0.07589639723300934, "learning_rate": 3.178210297439362e-05, "loss": 0.1422, "step": 17760 }, { "epoch": 1.1136178479664096, "grad_norm": 4.349477767944336, "learning_rate": 3.177154799349814e-05, "loss": 0.2205, "step": 17770 }, { "epoch": 1.1142445321802343, "grad_norm": 0.06528756022453308, "learning_rate": 3.176099301260265e-05, "loss": 0.1047, "step": 17780 }, { "epoch": 1.114871216394059, "grad_norm": 12.607084274291992, "learning_rate": 3.1750438031707165e-05, "loss": 0.1682, "step": 17790 }, { "epoch": 1.1154979006078838, "grad_norm": 6.911495208740234, "learning_rate": 3.173988305081168e-05, "loss": 0.2568, "step": 17800 }, { "epoch": 1.1161245848217083, "grad_norm": 9.881560325622559, "learning_rate": 3.17293280699162e-05, "loss": 0.117, "step": 17810 }, { "epoch": 1.116751269035533, "grad_norm": 0.3809300363063812, "learning_rate": 3.171877308902071e-05, "loss": 0.1061, "step": 17820 }, { "epoch": 1.1173779532493577, "grad_norm": 0.023420821875333786, "learning_rate": 3.1708218108125224e-05, "loss": 0.0837, "step": 17830 }, { "epoch": 1.1180046374631822, "grad_norm": 2.3442838191986084, "learning_rate": 3.169766312722974e-05, "loss": 0.186, "step": 17840 }, { "epoch": 1.118631321677007, "grad_norm": 1.4598783254623413, "learning_rate": 3.168710814633426e-05, "loss": 0.0194, "step": 17850 }, { "epoch": 1.1192580058908317, "grad_norm": 0.017658187076449394, "learning_rate": 3.1676553165438774e-05, "loss": 0.1068, "step": 17860 }, { "epoch": 1.1198846901046562, "grad_norm": 0.015302395448088646, "learning_rate": 3.166599818454329e-05, "loss": 0.0331, "step": 17870 }, { "epoch": 1.120511374318481, "grad_norm": 0.8182141184806824, "learning_rate": 3.165544320364781e-05, "loss": 0.2699, "step": 17880 }, { "epoch": 1.1211380585323056, "grad_norm": 0.027354877442121506, "learning_rate": 3.164488822275232e-05, "loss": 0.0664, "step": 17890 }, { "epoch": 1.1217647427461301, "grad_norm": 0.11983570456504822, "learning_rate": 3.1634333241856834e-05, "loss": 0.0471, "step": 17900 }, { "epoch": 1.1223914269599549, "grad_norm": 0.33365902304649353, "learning_rate": 3.162377826096135e-05, "loss": 0.0432, "step": 17910 }, { "epoch": 1.1230181111737796, "grad_norm": 3.265024423599243, "learning_rate": 3.161322328006586e-05, "loss": 0.3213, "step": 17920 }, { "epoch": 1.123644795387604, "grad_norm": 4.623913288116455, "learning_rate": 3.160266829917038e-05, "loss": 0.2395, "step": 17930 }, { "epoch": 1.1242714796014288, "grad_norm": 0.09318087249994278, "learning_rate": 3.1592113318274894e-05, "loss": 0.039, "step": 17940 }, { "epoch": 1.1248981638152535, "grad_norm": 0.1346893608570099, "learning_rate": 3.158155833737941e-05, "loss": 0.0821, "step": 17950 }, { "epoch": 1.125524848029078, "grad_norm": 2.6381444931030273, "learning_rate": 3.157100335648393e-05, "loss": 0.0319, "step": 17960 }, { "epoch": 1.1261515322429028, "grad_norm": 8.567200660705566, "learning_rate": 3.1560448375588444e-05, "loss": 0.044, "step": 17970 }, { "epoch": 1.1267782164567275, "grad_norm": 0.02193138189613819, "learning_rate": 3.154989339469296e-05, "loss": 0.0932, "step": 17980 }, { "epoch": 1.1274049006705522, "grad_norm": 1.1436899900436401, "learning_rate": 3.153933841379747e-05, "loss": 0.0233, "step": 17990 }, { "epoch": 1.1280315848843767, "grad_norm": 1.2188142538070679, "learning_rate": 3.152878343290199e-05, "loss": 0.2734, "step": 18000 }, { "epoch": 1.1286582690982014, "grad_norm": 14.177412033081055, "learning_rate": 3.1518228452006504e-05, "loss": 0.0656, "step": 18010 }, { "epoch": 1.1292849533120262, "grad_norm": 0.12195923179388046, "learning_rate": 3.150767347111102e-05, "loss": 0.1494, "step": 18020 }, { "epoch": 1.1299116375258507, "grad_norm": 5.3001389503479, "learning_rate": 3.149711849021553e-05, "loss": 0.0758, "step": 18030 }, { "epoch": 1.1305383217396754, "grad_norm": 0.7896440625190735, "learning_rate": 3.148656350932005e-05, "loss": 0.0049, "step": 18040 }, { "epoch": 1.1311650059535001, "grad_norm": 0.3458056151866913, "learning_rate": 3.1476008528424564e-05, "loss": 0.2096, "step": 18050 }, { "epoch": 1.1317916901673246, "grad_norm": 0.04001154750585556, "learning_rate": 3.146545354752908e-05, "loss": 0.0157, "step": 18060 }, { "epoch": 1.1324183743811493, "grad_norm": 0.3892626166343689, "learning_rate": 3.14548985666336e-05, "loss": 0.1478, "step": 18070 }, { "epoch": 1.133045058594974, "grad_norm": 1.3462265729904175, "learning_rate": 3.1444343585738114e-05, "loss": 0.0465, "step": 18080 }, { "epoch": 1.1336717428087986, "grad_norm": 0.8281564116477966, "learning_rate": 3.143378860484263e-05, "loss": 0.1297, "step": 18090 }, { "epoch": 1.1342984270226233, "grad_norm": 0.05272606387734413, "learning_rate": 3.142323362394714e-05, "loss": 0.1197, "step": 18100 }, { "epoch": 1.134925111236448, "grad_norm": 0.06631779670715332, "learning_rate": 3.141267864305166e-05, "loss": 0.1193, "step": 18110 }, { "epoch": 1.1355517954502725, "grad_norm": 0.7379500865936279, "learning_rate": 3.1402123662156173e-05, "loss": 0.1209, "step": 18120 }, { "epoch": 1.1361784796640972, "grad_norm": 0.08505575358867645, "learning_rate": 3.139156868126068e-05, "loss": 0.1933, "step": 18130 }, { "epoch": 1.136805163877922, "grad_norm": 0.052564263343811035, "learning_rate": 3.13810137003652e-05, "loss": 0.0058, "step": 18140 }, { "epoch": 1.1374318480917465, "grad_norm": 0.10592754930257797, "learning_rate": 3.137045871946972e-05, "loss": 0.2424, "step": 18150 }, { "epoch": 1.1380585323055712, "grad_norm": 0.08464646339416504, "learning_rate": 3.135990373857424e-05, "loss": 0.0235, "step": 18160 }, { "epoch": 1.138685216519396, "grad_norm": 0.025046486407518387, "learning_rate": 3.134934875767875e-05, "loss": 0.0385, "step": 18170 }, { "epoch": 1.1393119007332206, "grad_norm": 0.019679777324199677, "learning_rate": 3.1338793776783267e-05, "loss": 0.0631, "step": 18180 }, { "epoch": 1.1399385849470451, "grad_norm": 0.4192022383213043, "learning_rate": 3.132823879588778e-05, "loss": 0.1764, "step": 18190 }, { "epoch": 1.1405652691608699, "grad_norm": 0.09451547265052795, "learning_rate": 3.13176838149923e-05, "loss": 0.1082, "step": 18200 }, { "epoch": 1.1411919533746944, "grad_norm": 0.034728892147541046, "learning_rate": 3.130712883409681e-05, "loss": 0.1421, "step": 18210 }, { "epoch": 1.141818637588519, "grad_norm": 2.024574041366577, "learning_rate": 3.1296573853201326e-05, "loss": 0.0126, "step": 18220 }, { "epoch": 1.1424453218023438, "grad_norm": 11.830857276916504, "learning_rate": 3.128601887230584e-05, "loss": 0.1805, "step": 18230 }, { "epoch": 1.1430720060161685, "grad_norm": 8.058704376220703, "learning_rate": 3.127546389141035e-05, "loss": 0.1096, "step": 18240 }, { "epoch": 1.143698690229993, "grad_norm": 18.651052474975586, "learning_rate": 3.126490891051487e-05, "loss": 0.2482, "step": 18250 }, { "epoch": 1.1443253744438178, "grad_norm": 0.1312084197998047, "learning_rate": 3.125435392961939e-05, "loss": 0.0713, "step": 18260 }, { "epoch": 1.1449520586576425, "grad_norm": 5.29236364364624, "learning_rate": 3.124379894872391e-05, "loss": 0.159, "step": 18270 }, { "epoch": 1.145578742871467, "grad_norm": 3.630553960800171, "learning_rate": 3.123324396782842e-05, "loss": 0.1035, "step": 18280 }, { "epoch": 1.1462054270852917, "grad_norm": 0.05378236994147301, "learning_rate": 3.1222688986932936e-05, "loss": 0.0035, "step": 18290 }, { "epoch": 1.1468321112991164, "grad_norm": 0.08268694579601288, "learning_rate": 3.121213400603745e-05, "loss": 0.2101, "step": 18300 }, { "epoch": 1.147458795512941, "grad_norm": 0.09726406633853912, "learning_rate": 3.120157902514196e-05, "loss": 0.1929, "step": 18310 }, { "epoch": 1.1480854797267657, "grad_norm": 0.07776237279176712, "learning_rate": 3.119102404424648e-05, "loss": 0.074, "step": 18320 }, { "epoch": 1.1487121639405904, "grad_norm": 0.05973588675260544, "learning_rate": 3.1180469063350996e-05, "loss": 0.0628, "step": 18330 }, { "epoch": 1.149338848154415, "grad_norm": 0.030430462211370468, "learning_rate": 3.116991408245551e-05, "loss": 0.0241, "step": 18340 }, { "epoch": 1.1499655323682396, "grad_norm": 6.213319301605225, "learning_rate": 3.115935910156003e-05, "loss": 0.0966, "step": 18350 }, { "epoch": 1.1505922165820643, "grad_norm": 0.06798451393842697, "learning_rate": 3.1148804120664546e-05, "loss": 0.3239, "step": 18360 }, { "epoch": 1.151218900795889, "grad_norm": 0.03651350736618042, "learning_rate": 3.113824913976906e-05, "loss": 0.1506, "step": 18370 }, { "epoch": 1.1518455850097136, "grad_norm": 10.000849723815918, "learning_rate": 3.112769415887357e-05, "loss": 0.1306, "step": 18380 }, { "epoch": 1.1524722692235383, "grad_norm": 0.05033571273088455, "learning_rate": 3.111713917797809e-05, "loss": 0.148, "step": 18390 }, { "epoch": 1.1530989534373628, "grad_norm": 0.07155164331197739, "learning_rate": 3.1106584197082606e-05, "loss": 0.1108, "step": 18400 }, { "epoch": 1.1537256376511875, "grad_norm": 17.460355758666992, "learning_rate": 3.109602921618712e-05, "loss": 0.2014, "step": 18410 }, { "epoch": 1.1543523218650122, "grad_norm": 0.22391580045223236, "learning_rate": 3.108547423529163e-05, "loss": 0.0475, "step": 18420 }, { "epoch": 1.154979006078837, "grad_norm": 0.08404719829559326, "learning_rate": 3.107491925439615e-05, "loss": 0.0335, "step": 18430 }, { "epoch": 1.1556056902926615, "grad_norm": 4.082967758178711, "learning_rate": 3.1064364273500666e-05, "loss": 0.0251, "step": 18440 }, { "epoch": 1.1562323745064862, "grad_norm": 0.021450785920023918, "learning_rate": 3.105380929260518e-05, "loss": 0.1285, "step": 18450 }, { "epoch": 1.156859058720311, "grad_norm": 0.05551962926983833, "learning_rate": 3.10432543117097e-05, "loss": 0.0565, "step": 18460 }, { "epoch": 1.1574857429341354, "grad_norm": 0.031009657308459282, "learning_rate": 3.1032699330814216e-05, "loss": 0.1844, "step": 18470 }, { "epoch": 1.1581124271479601, "grad_norm": 0.03998725861310959, "learning_rate": 3.102214434991873e-05, "loss": 0.0639, "step": 18480 }, { "epoch": 1.1587391113617849, "grad_norm": 6.586679458618164, "learning_rate": 3.101158936902324e-05, "loss": 0.1592, "step": 18490 }, { "epoch": 1.1593657955756094, "grad_norm": 9.556639671325684, "learning_rate": 3.100103438812776e-05, "loss": 0.0797, "step": 18500 }, { "epoch": 1.159992479789434, "grad_norm": 0.06767041236162186, "learning_rate": 3.0990479407232275e-05, "loss": 0.0086, "step": 18510 }, { "epoch": 1.1606191640032588, "grad_norm": 0.03151436150074005, "learning_rate": 3.0979924426336785e-05, "loss": 0.0038, "step": 18520 }, { "epoch": 1.1612458482170833, "grad_norm": 0.08840905874967575, "learning_rate": 3.09693694454413e-05, "loss": 0.0063, "step": 18530 }, { "epoch": 1.161872532430908, "grad_norm": 0.03952464088797569, "learning_rate": 3.095881446454582e-05, "loss": 0.1326, "step": 18540 }, { "epoch": 1.1624992166447328, "grad_norm": 0.06938064843416214, "learning_rate": 3.0948259483650335e-05, "loss": 0.1138, "step": 18550 }, { "epoch": 1.1631259008585575, "grad_norm": 4.21276330947876, "learning_rate": 3.093770450275485e-05, "loss": 0.1467, "step": 18560 }, { "epoch": 1.163752585072382, "grad_norm": 2.3090898990631104, "learning_rate": 3.092714952185937e-05, "loss": 0.1913, "step": 18570 }, { "epoch": 1.1643792692862067, "grad_norm": 8.786662101745605, "learning_rate": 3.0916594540963885e-05, "loss": 0.127, "step": 18580 }, { "epoch": 1.1650059535000312, "grad_norm": 0.15764133632183075, "learning_rate": 3.09060395600684e-05, "loss": 0.0588, "step": 18590 }, { "epoch": 1.165632637713856, "grad_norm": 0.08247308433055878, "learning_rate": 3.089548457917291e-05, "loss": 0.0592, "step": 18600 }, { "epoch": 1.1662593219276807, "grad_norm": 0.4140074849128723, "learning_rate": 3.088492959827743e-05, "loss": 0.0054, "step": 18610 }, { "epoch": 1.1668860061415054, "grad_norm": 0.03695308417081833, "learning_rate": 3.0874374617381945e-05, "loss": 0.1292, "step": 18620 }, { "epoch": 1.16751269035533, "grad_norm": 9.783849716186523, "learning_rate": 3.0863819636486455e-05, "loss": 0.2868, "step": 18630 }, { "epoch": 1.1681393745691546, "grad_norm": 0.05571041628718376, "learning_rate": 3.085326465559097e-05, "loss": 0.1062, "step": 18640 }, { "epoch": 1.1687660587829793, "grad_norm": 0.5784240961074829, "learning_rate": 3.084270967469549e-05, "loss": 0.0648, "step": 18650 }, { "epoch": 1.1693927429968038, "grad_norm": 7.899927139282227, "learning_rate": 3.083215469380001e-05, "loss": 0.1259, "step": 18660 }, { "epoch": 1.1700194272106286, "grad_norm": 1.123893141746521, "learning_rate": 3.082159971290452e-05, "loss": 0.1433, "step": 18670 }, { "epoch": 1.1706461114244533, "grad_norm": 1.1794332265853882, "learning_rate": 3.081104473200904e-05, "loss": 0.223, "step": 18680 }, { "epoch": 1.1712727956382778, "grad_norm": 0.06373187899589539, "learning_rate": 3.0800489751113555e-05, "loss": 0.0366, "step": 18690 }, { "epoch": 1.1718994798521025, "grad_norm": 0.5772104263305664, "learning_rate": 3.0789934770218065e-05, "loss": 0.1949, "step": 18700 }, { "epoch": 1.1725261640659272, "grad_norm": 3.3197052478790283, "learning_rate": 3.077937978932258e-05, "loss": 0.058, "step": 18710 }, { "epoch": 1.1731528482797517, "grad_norm": 1.338387131690979, "learning_rate": 3.07688248084271e-05, "loss": 0.0886, "step": 18720 }, { "epoch": 1.1737795324935765, "grad_norm": 0.7524656653404236, "learning_rate": 3.0758269827531615e-05, "loss": 0.0612, "step": 18730 }, { "epoch": 1.1744062167074012, "grad_norm": 0.05872412398457527, "learning_rate": 3.0747714846636125e-05, "loss": 0.1389, "step": 18740 }, { "epoch": 1.1750329009212257, "grad_norm": 0.036100588738918304, "learning_rate": 3.073715986574064e-05, "loss": 0.1411, "step": 18750 }, { "epoch": 1.1756595851350504, "grad_norm": 0.06897371262311935, "learning_rate": 3.0726604884845165e-05, "loss": 0.0841, "step": 18760 }, { "epoch": 1.1762862693488751, "grad_norm": 0.0995137169957161, "learning_rate": 3.0716049903949674e-05, "loss": 0.0393, "step": 18770 }, { "epoch": 1.1769129535626996, "grad_norm": 0.04892328381538391, "learning_rate": 3.070549492305419e-05, "loss": 0.0086, "step": 18780 }, { "epoch": 1.1775396377765244, "grad_norm": 0.08550518751144409, "learning_rate": 3.069493994215871e-05, "loss": 0.0755, "step": 18790 }, { "epoch": 1.178166321990349, "grad_norm": 0.03711172938346863, "learning_rate": 3.0684384961263224e-05, "loss": 0.109, "step": 18800 }, { "epoch": 1.1787930062041738, "grad_norm": 0.09902484714984894, "learning_rate": 3.0673829980367734e-05, "loss": 0.1456, "step": 18810 }, { "epoch": 1.1794196904179983, "grad_norm": 2.862833261489868, "learning_rate": 3.066327499947225e-05, "loss": 0.1037, "step": 18820 }, { "epoch": 1.180046374631823, "grad_norm": 0.29862895607948303, "learning_rate": 3.065272001857677e-05, "loss": 0.1776, "step": 18830 }, { "epoch": 1.1806730588456478, "grad_norm": 2.7370612621307373, "learning_rate": 3.064216503768128e-05, "loss": 0.0548, "step": 18840 }, { "epoch": 1.1812997430594723, "grad_norm": 2.9477670192718506, "learning_rate": 3.06316100567858e-05, "loss": 0.1779, "step": 18850 }, { "epoch": 1.181926427273297, "grad_norm": 0.349155455827713, "learning_rate": 3.062105507589032e-05, "loss": 0.0444, "step": 18860 }, { "epoch": 1.1825531114871217, "grad_norm": 0.0302012600004673, "learning_rate": 3.0610500094994834e-05, "loss": 0.0967, "step": 18870 }, { "epoch": 1.1831797957009462, "grad_norm": 4.532477855682373, "learning_rate": 3.0599945114099344e-05, "loss": 0.335, "step": 18880 }, { "epoch": 1.183806479914771, "grad_norm": 0.14545802772045135, "learning_rate": 3.058939013320386e-05, "loss": 0.1045, "step": 18890 }, { "epoch": 1.1844331641285957, "grad_norm": 8.656229019165039, "learning_rate": 3.057883515230838e-05, "loss": 0.3195, "step": 18900 }, { "epoch": 1.1850598483424202, "grad_norm": 8.323212623596191, "learning_rate": 3.056828017141289e-05, "loss": 0.0389, "step": 18910 }, { "epoch": 1.185686532556245, "grad_norm": 0.051557403057813644, "learning_rate": 3.0557725190517404e-05, "loss": 0.0346, "step": 18920 }, { "epoch": 1.1863132167700696, "grad_norm": 0.1489444226026535, "learning_rate": 3.054717020962192e-05, "loss": 0.0837, "step": 18930 }, { "epoch": 1.1869399009838941, "grad_norm": 0.04929697886109352, "learning_rate": 3.053661522872644e-05, "loss": 0.1245, "step": 18940 }, { "epoch": 1.1875665851977188, "grad_norm": 4.793705463409424, "learning_rate": 3.0526060247830954e-05, "loss": 0.0366, "step": 18950 }, { "epoch": 1.1881932694115436, "grad_norm": 38.849334716796875, "learning_rate": 3.0515505266935467e-05, "loss": 0.2209, "step": 18960 }, { "epoch": 1.188819953625368, "grad_norm": 12.780930519104004, "learning_rate": 3.0504950286039984e-05, "loss": 0.135, "step": 18970 }, { "epoch": 1.1894466378391928, "grad_norm": 0.12625622749328613, "learning_rate": 3.0494395305144497e-05, "loss": 0.2881, "step": 18980 }, { "epoch": 1.1900733220530175, "grad_norm": 0.12638124823570251, "learning_rate": 3.0483840324249014e-05, "loss": 0.0959, "step": 18990 }, { "epoch": 1.1907000062668422, "grad_norm": 3.6278929710388184, "learning_rate": 3.047328534335353e-05, "loss": 0.0529, "step": 19000 }, { "epoch": 1.1913266904806668, "grad_norm": 0.10436367988586426, "learning_rate": 3.0462730362458047e-05, "loss": 0.0325, "step": 19010 }, { "epoch": 1.1919533746944915, "grad_norm": 0.0621584877371788, "learning_rate": 3.045217538156256e-05, "loss": 0.1517, "step": 19020 }, { "epoch": 1.192580058908316, "grad_norm": 1.4065021276474, "learning_rate": 3.0441620400667077e-05, "loss": 0.0239, "step": 19030 }, { "epoch": 1.1932067431221407, "grad_norm": 0.06358971446752548, "learning_rate": 3.0431065419771594e-05, "loss": 0.0019, "step": 19040 }, { "epoch": 1.1938334273359654, "grad_norm": 0.11690735816955566, "learning_rate": 3.042051043887611e-05, "loss": 0.2751, "step": 19050 }, { "epoch": 1.1944601115497901, "grad_norm": 0.029574111104011536, "learning_rate": 3.040995545798062e-05, "loss": 0.101, "step": 19060 }, { "epoch": 1.1950867957636147, "grad_norm": 2.2352538108825684, "learning_rate": 3.0399400477085137e-05, "loss": 0.0831, "step": 19070 }, { "epoch": 1.1957134799774394, "grad_norm": 0.24031168222427368, "learning_rate": 3.0388845496189657e-05, "loss": 0.1658, "step": 19080 }, { "epoch": 1.196340164191264, "grad_norm": 3.322831392288208, "learning_rate": 3.0378290515294167e-05, "loss": 0.0086, "step": 19090 }, { "epoch": 1.1969668484050886, "grad_norm": 0.19900059700012207, "learning_rate": 3.0367735534398683e-05, "loss": 0.082, "step": 19100 }, { "epoch": 1.1975935326189133, "grad_norm": 1.1015654802322388, "learning_rate": 3.03571805535032e-05, "loss": 0.1187, "step": 19110 }, { "epoch": 1.198220216832738, "grad_norm": 0.13343365490436554, "learning_rate": 3.0346625572607717e-05, "loss": 0.1798, "step": 19120 }, { "epoch": 1.1988469010465626, "grad_norm": 0.04171142354607582, "learning_rate": 3.033607059171223e-05, "loss": 0.0758, "step": 19130 }, { "epoch": 1.1994735852603873, "grad_norm": 0.08893130719661713, "learning_rate": 3.0325515610816747e-05, "loss": 0.0852, "step": 19140 }, { "epoch": 1.200100269474212, "grad_norm": 7.023293495178223, "learning_rate": 3.0314960629921263e-05, "loss": 0.0773, "step": 19150 }, { "epoch": 1.2007269536880365, "grad_norm": 0.7998108267784119, "learning_rate": 3.0304405649025773e-05, "loss": 0.1002, "step": 19160 }, { "epoch": 1.2013536379018612, "grad_norm": 0.13294047117233276, "learning_rate": 3.029385066813029e-05, "loss": 0.2127, "step": 19170 }, { "epoch": 1.201980322115686, "grad_norm": 0.12403099983930588, "learning_rate": 3.028329568723481e-05, "loss": 0.0417, "step": 19180 }, { "epoch": 1.2026070063295107, "grad_norm": 0.17409439384937286, "learning_rate": 3.0272740706339326e-05, "loss": 0.208, "step": 19190 }, { "epoch": 1.2032336905433352, "grad_norm": 0.4515335261821747, "learning_rate": 3.0262185725443836e-05, "loss": 0.0736, "step": 19200 }, { "epoch": 1.20386037475716, "grad_norm": 1.3704180717468262, "learning_rate": 3.0251630744548353e-05, "loss": 0.1135, "step": 19210 }, { "epoch": 1.2044870589709844, "grad_norm": 10.743453979492188, "learning_rate": 3.024107576365287e-05, "loss": 0.0932, "step": 19220 }, { "epoch": 1.2051137431848091, "grad_norm": 0.02900216355919838, "learning_rate": 3.0230520782757383e-05, "loss": 0.182, "step": 19230 }, { "epoch": 1.2057404273986339, "grad_norm": 0.03926697000861168, "learning_rate": 3.02199658018619e-05, "loss": 0.0693, "step": 19240 }, { "epoch": 1.2063671116124586, "grad_norm": 0.3335837423801422, "learning_rate": 3.0209410820966416e-05, "loss": 0.1682, "step": 19250 }, { "epoch": 1.206993795826283, "grad_norm": 0.15186485648155212, "learning_rate": 3.0198855840070933e-05, "loss": 0.0481, "step": 19260 }, { "epoch": 1.2076204800401078, "grad_norm": 0.017995532602071762, "learning_rate": 3.0188300859175446e-05, "loss": 0.0787, "step": 19270 }, { "epoch": 1.2082471642539325, "grad_norm": 0.07066605240106583, "learning_rate": 3.0177745878279963e-05, "loss": 0.0587, "step": 19280 }, { "epoch": 1.208873848467757, "grad_norm": 2.0296781063079834, "learning_rate": 3.016719089738448e-05, "loss": 0.1208, "step": 19290 }, { "epoch": 1.2095005326815818, "grad_norm": 0.06564827263355255, "learning_rate": 3.015663591648899e-05, "loss": 0.143, "step": 19300 }, { "epoch": 1.2101272168954065, "grad_norm": 0.47882741689682007, "learning_rate": 3.0146080935593506e-05, "loss": 0.0057, "step": 19310 }, { "epoch": 1.210753901109231, "grad_norm": 0.044205307960510254, "learning_rate": 3.0135525954698023e-05, "loss": 0.0397, "step": 19320 }, { "epoch": 1.2113805853230557, "grad_norm": 0.08370712399482727, "learning_rate": 3.0124970973802543e-05, "loss": 0.1867, "step": 19330 }, { "epoch": 1.2120072695368804, "grad_norm": 0.859338641166687, "learning_rate": 3.0114415992907052e-05, "loss": 0.056, "step": 19340 }, { "epoch": 1.212633953750705, "grad_norm": 0.03434952720999718, "learning_rate": 3.010386101201157e-05, "loss": 0.0506, "step": 19350 }, { "epoch": 1.2132606379645297, "grad_norm": 0.1044652909040451, "learning_rate": 3.0093306031116086e-05, "loss": 0.0492, "step": 19360 }, { "epoch": 1.2138873221783544, "grad_norm": 0.012052880600094795, "learning_rate": 3.00827510502206e-05, "loss": 0.0333, "step": 19370 }, { "epoch": 1.214514006392179, "grad_norm": 0.021363429725170135, "learning_rate": 3.0072196069325116e-05, "loss": 0.1359, "step": 19380 }, { "epoch": 1.2151406906060036, "grad_norm": 0.07590274512767792, "learning_rate": 3.0061641088429632e-05, "loss": 0.1013, "step": 19390 }, { "epoch": 1.2157673748198283, "grad_norm": 0.4024260640144348, "learning_rate": 3.005108610753415e-05, "loss": 0.0974, "step": 19400 }, { "epoch": 1.2163940590336528, "grad_norm": 0.035482216626405716, "learning_rate": 3.004053112663866e-05, "loss": 0.0986, "step": 19410 }, { "epoch": 1.2170207432474776, "grad_norm": 0.7663148045539856, "learning_rate": 3.0029976145743176e-05, "loss": 0.1216, "step": 19420 }, { "epoch": 1.2176474274613023, "grad_norm": 0.33147960901260376, "learning_rate": 3.0019421164847696e-05, "loss": 0.0986, "step": 19430 }, { "epoch": 1.218274111675127, "grad_norm": 0.25695696473121643, "learning_rate": 3.0008866183952212e-05, "loss": 0.1765, "step": 19440 }, { "epoch": 1.2189007958889515, "grad_norm": 0.01739318296313286, "learning_rate": 2.9998311203056722e-05, "loss": 0.126, "step": 19450 }, { "epoch": 1.2195274801027762, "grad_norm": 1.8818984031677246, "learning_rate": 2.998775622216124e-05, "loss": 0.0637, "step": 19460 }, { "epoch": 1.220154164316601, "grad_norm": 0.01870718225836754, "learning_rate": 2.9977201241265755e-05, "loss": 0.094, "step": 19470 }, { "epoch": 1.2207808485304255, "grad_norm": 0.12540331482887268, "learning_rate": 2.996664626037027e-05, "loss": 0.108, "step": 19480 }, { "epoch": 1.2214075327442502, "grad_norm": 3.370209217071533, "learning_rate": 2.9956091279474785e-05, "loss": 0.1678, "step": 19490 }, { "epoch": 1.222034216958075, "grad_norm": 3.6439049243927, "learning_rate": 2.9945536298579302e-05, "loss": 0.244, "step": 19500 }, { "epoch": 1.2226609011718994, "grad_norm": 0.9234188199043274, "learning_rate": 2.993498131768382e-05, "loss": 0.0357, "step": 19510 }, { "epoch": 1.2232875853857241, "grad_norm": 0.31463149189949036, "learning_rate": 2.9924426336788332e-05, "loss": 0.0155, "step": 19520 }, { "epoch": 1.2239142695995489, "grad_norm": 0.22235389053821564, "learning_rate": 2.991387135589285e-05, "loss": 0.1766, "step": 19530 }, { "epoch": 1.2245409538133734, "grad_norm": 0.1038447692990303, "learning_rate": 2.9903316374997365e-05, "loss": 0.0483, "step": 19540 }, { "epoch": 1.225167638027198, "grad_norm": 0.10996226221323013, "learning_rate": 2.9892761394101875e-05, "loss": 0.0979, "step": 19550 }, { "epoch": 1.2257943222410228, "grad_norm": 0.05760728940367699, "learning_rate": 2.988220641320639e-05, "loss": 0.1704, "step": 19560 }, { "epoch": 1.2264210064548473, "grad_norm": 0.3645038604736328, "learning_rate": 2.987165143231091e-05, "loss": 0.1405, "step": 19570 }, { "epoch": 1.227047690668672, "grad_norm": 3.712083101272583, "learning_rate": 2.986109645141543e-05, "loss": 0.0835, "step": 19580 }, { "epoch": 1.2276743748824968, "grad_norm": 0.049852531403303146, "learning_rate": 2.9850541470519938e-05, "loss": 0.1232, "step": 19590 }, { "epoch": 1.2283010590963213, "grad_norm": 0.0666368305683136, "learning_rate": 2.9839986489624455e-05, "loss": 0.1781, "step": 19600 }, { "epoch": 1.228927743310146, "grad_norm": 0.15124481916427612, "learning_rate": 2.982943150872897e-05, "loss": 0.2627, "step": 19610 }, { "epoch": 1.2295544275239707, "grad_norm": 0.1682569831609726, "learning_rate": 2.9818876527833485e-05, "loss": 0.0087, "step": 19620 }, { "epoch": 1.2301811117377954, "grad_norm": 0.1204722449183464, "learning_rate": 2.9808321546938e-05, "loss": 0.013, "step": 19630 }, { "epoch": 1.23080779595162, "grad_norm": 5.767759799957275, "learning_rate": 2.9797766566042518e-05, "loss": 0.25, "step": 19640 }, { "epoch": 1.2314344801654447, "grad_norm": 0.25917211174964905, "learning_rate": 2.9787211585147035e-05, "loss": 0.1393, "step": 19650 }, { "epoch": 1.2320611643792694, "grad_norm": 0.15553507208824158, "learning_rate": 2.9776656604251545e-05, "loss": 0.0623, "step": 19660 }, { "epoch": 1.2326878485930939, "grad_norm": 3.4714977741241455, "learning_rate": 2.9766101623356065e-05, "loss": 0.157, "step": 19670 }, { "epoch": 1.2333145328069186, "grad_norm": 0.20367176830768585, "learning_rate": 2.975554664246058e-05, "loss": 0.0247, "step": 19680 }, { "epoch": 1.2339412170207433, "grad_norm": 0.09446282684803009, "learning_rate": 2.974499166156509e-05, "loss": 0.139, "step": 19690 }, { "epoch": 1.2345679012345678, "grad_norm": 0.0783671885728836, "learning_rate": 2.9734436680669608e-05, "loss": 0.0595, "step": 19700 }, { "epoch": 1.2351945854483926, "grad_norm": 0.07558191567659378, "learning_rate": 2.9723881699774125e-05, "loss": 0.0884, "step": 19710 }, { "epoch": 1.2358212696622173, "grad_norm": 0.12064258009195328, "learning_rate": 2.971332671887864e-05, "loss": 0.0598, "step": 19720 }, { "epoch": 1.2364479538760418, "grad_norm": 22.905738830566406, "learning_rate": 2.9702771737983154e-05, "loss": 0.0969, "step": 19730 }, { "epoch": 1.2370746380898665, "grad_norm": 0.04529252275824547, "learning_rate": 2.969221675708767e-05, "loss": 0.0691, "step": 19740 }, { "epoch": 1.2377013223036912, "grad_norm": 0.08773695677518845, "learning_rate": 2.9681661776192188e-05, "loss": 0.142, "step": 19750 }, { "epoch": 1.2383280065175157, "grad_norm": 0.12956632673740387, "learning_rate": 2.9671106795296698e-05, "loss": 0.0385, "step": 19760 }, { "epoch": 1.2389546907313405, "grad_norm": 0.10451506078243256, "learning_rate": 2.9660551814401218e-05, "loss": 0.1296, "step": 19770 }, { "epoch": 1.2395813749451652, "grad_norm": 0.03981676697731018, "learning_rate": 2.9649996833505734e-05, "loss": 0.0055, "step": 19780 }, { "epoch": 1.2402080591589897, "grad_norm": 5.25244665145874, "learning_rate": 2.963944185261025e-05, "loss": 0.011, "step": 19790 }, { "epoch": 1.2408347433728144, "grad_norm": 0.01934889517724514, "learning_rate": 2.962888687171476e-05, "loss": 0.12, "step": 19800 }, { "epoch": 1.2414614275866391, "grad_norm": 0.018547803163528442, "learning_rate": 2.9618331890819277e-05, "loss": 0.0381, "step": 19810 }, { "epoch": 1.2420881118004639, "grad_norm": 0.03331906720995903, "learning_rate": 2.9607776909923794e-05, "loss": 0.1396, "step": 19820 }, { "epoch": 1.2427147960142884, "grad_norm": 0.04414796829223633, "learning_rate": 2.9597221929028314e-05, "loss": 0.0023, "step": 19830 }, { "epoch": 1.243341480228113, "grad_norm": 0.03142017871141434, "learning_rate": 2.9586666948132824e-05, "loss": 0.1793, "step": 19840 }, { "epoch": 1.2439681644419376, "grad_norm": 0.02367275580763817, "learning_rate": 2.957611196723734e-05, "loss": 0.1257, "step": 19850 }, { "epoch": 1.2445948486557623, "grad_norm": 24.037185668945312, "learning_rate": 2.9565556986341857e-05, "loss": 0.1726, "step": 19860 }, { "epoch": 1.245221532869587, "grad_norm": 0.10329566150903702, "learning_rate": 2.955500200544637e-05, "loss": 0.1484, "step": 19870 }, { "epoch": 1.2458482170834118, "grad_norm": 0.04350671544671059, "learning_rate": 2.9544447024550887e-05, "loss": 0.0351, "step": 19880 }, { "epoch": 1.2464749012972363, "grad_norm": 10.928489685058594, "learning_rate": 2.9533892043655404e-05, "loss": 0.2075, "step": 19890 }, { "epoch": 1.247101585511061, "grad_norm": 0.06471390277147293, "learning_rate": 2.952333706275992e-05, "loss": 0.0423, "step": 19900 }, { "epoch": 1.2477282697248857, "grad_norm": 0.08085936307907104, "learning_rate": 2.951278208186443e-05, "loss": 0.1876, "step": 19910 }, { "epoch": 1.2483549539387102, "grad_norm": 0.16564467549324036, "learning_rate": 2.950222710096895e-05, "loss": 0.161, "step": 19920 }, { "epoch": 1.248981638152535, "grad_norm": 0.34986400604248047, "learning_rate": 2.9491672120073467e-05, "loss": 0.0752, "step": 19930 }, { "epoch": 1.2496083223663597, "grad_norm": 0.02159041538834572, "learning_rate": 2.9481117139177977e-05, "loss": 0.1011, "step": 19940 }, { "epoch": 1.2502350065801844, "grad_norm": 0.03885908052325249, "learning_rate": 2.9470562158282494e-05, "loss": 0.0151, "step": 19950 }, { "epoch": 1.2508616907940089, "grad_norm": 0.02896152250468731, "learning_rate": 2.946000717738701e-05, "loss": 0.189, "step": 19960 }, { "epoch": 1.2514883750078336, "grad_norm": 4.279455661773682, "learning_rate": 2.9449452196491527e-05, "loss": 0.1141, "step": 19970 }, { "epoch": 1.252115059221658, "grad_norm": 0.07367860525846481, "learning_rate": 2.943889721559604e-05, "loss": 0.0584, "step": 19980 }, { "epoch": 1.2527417434354828, "grad_norm": 1.4007365703582764, "learning_rate": 2.9428342234700557e-05, "loss": 0.1464, "step": 19990 }, { "epoch": 1.2533684276493076, "grad_norm": 3.3747365474700928, "learning_rate": 2.9417787253805074e-05, "loss": 0.0838, "step": 20000 }, { "epoch": 1.2539951118631323, "grad_norm": 3.90412974357605, "learning_rate": 2.9407232272909583e-05, "loss": 0.0122, "step": 20010 }, { "epoch": 1.2546217960769568, "grad_norm": 0.6516067981719971, "learning_rate": 2.9396677292014103e-05, "loss": 0.1077, "step": 20020 }, { "epoch": 1.2552484802907815, "grad_norm": 10.695253372192383, "learning_rate": 2.938612231111862e-05, "loss": 0.2566, "step": 20030 }, { "epoch": 1.255875164504606, "grad_norm": 0.032031357288360596, "learning_rate": 2.9375567330223137e-05, "loss": 0.1136, "step": 20040 }, { "epoch": 1.2565018487184307, "grad_norm": 0.04882671311497688, "learning_rate": 2.9365012349327647e-05, "loss": 0.0501, "step": 20050 }, { "epoch": 1.2571285329322555, "grad_norm": 0.25139033794403076, "learning_rate": 2.9354457368432163e-05, "loss": 0.1784, "step": 20060 }, { "epoch": 1.2577552171460802, "grad_norm": 1.5231380462646484, "learning_rate": 2.934390238753668e-05, "loss": 0.0535, "step": 20070 }, { "epoch": 1.2583819013599047, "grad_norm": 5.682507514953613, "learning_rate": 2.9333347406641193e-05, "loss": 0.1149, "step": 20080 }, { "epoch": 1.2590085855737294, "grad_norm": 5.259180545806885, "learning_rate": 2.932279242574571e-05, "loss": 0.0943, "step": 20090 }, { "epoch": 1.259635269787554, "grad_norm": 0.047352973371744156, "learning_rate": 2.9312237444850226e-05, "loss": 0.0046, "step": 20100 }, { "epoch": 1.2602619540013786, "grad_norm": 6.0193328857421875, "learning_rate": 2.9301682463954743e-05, "loss": 0.1647, "step": 20110 }, { "epoch": 1.2608886382152034, "grad_norm": 14.669256210327148, "learning_rate": 2.9291127483059256e-05, "loss": 0.0057, "step": 20120 }, { "epoch": 1.261515322429028, "grad_norm": 12.382940292358398, "learning_rate": 2.9280572502163773e-05, "loss": 0.0715, "step": 20130 }, { "epoch": 1.2621420066428526, "grad_norm": 0.02463279478251934, "learning_rate": 2.927001752126829e-05, "loss": 0.0771, "step": 20140 }, { "epoch": 1.2627686908566773, "grad_norm": 0.10457153618335724, "learning_rate": 2.92594625403728e-05, "loss": 0.1199, "step": 20150 }, { "epoch": 1.263395375070502, "grad_norm": 8.55932903289795, "learning_rate": 2.9248907559477316e-05, "loss": 0.1022, "step": 20160 }, { "epoch": 1.2640220592843265, "grad_norm": 0.043147992342710495, "learning_rate": 2.9238352578581836e-05, "loss": 0.0658, "step": 20170 }, { "epoch": 1.2646487434981513, "grad_norm": 27.947301864624023, "learning_rate": 2.9227797597686353e-05, "loss": 0.0859, "step": 20180 }, { "epoch": 1.265275427711976, "grad_norm": 0.02526325173676014, "learning_rate": 2.9217242616790863e-05, "loss": 0.0019, "step": 20190 }, { "epoch": 1.2659021119258007, "grad_norm": 0.09650743752717972, "learning_rate": 2.920668763589538e-05, "loss": 0.0923, "step": 20200 }, { "epoch": 1.2665287961396252, "grad_norm": 0.6603000164031982, "learning_rate": 2.9196132654999896e-05, "loss": 0.0079, "step": 20210 }, { "epoch": 1.26715548035345, "grad_norm": 3.661068916320801, "learning_rate": 2.9185577674104413e-05, "loss": 0.167, "step": 20220 }, { "epoch": 1.2677821645672744, "grad_norm": 0.8906214833259583, "learning_rate": 2.9175022693208926e-05, "loss": 0.1106, "step": 20230 }, { "epoch": 1.2684088487810992, "grad_norm": 0.2638370990753174, "learning_rate": 2.9164467712313443e-05, "loss": 0.1843, "step": 20240 }, { "epoch": 1.2690355329949239, "grad_norm": 3.85107684135437, "learning_rate": 2.915391273141796e-05, "loss": 0.1137, "step": 20250 }, { "epoch": 1.2696622172087486, "grad_norm": 0.170636385679245, "learning_rate": 2.914335775052247e-05, "loss": 0.1177, "step": 20260 }, { "epoch": 1.2702889014225731, "grad_norm": 0.25960955023765564, "learning_rate": 2.913280276962699e-05, "loss": 0.1968, "step": 20270 }, { "epoch": 1.2709155856363978, "grad_norm": 4.221624851226807, "learning_rate": 2.9122247788731506e-05, "loss": 0.0598, "step": 20280 }, { "epoch": 1.2715422698502223, "grad_norm": 1.4436137676239014, "learning_rate": 2.9111692807836023e-05, "loss": 0.0213, "step": 20290 }, { "epoch": 1.272168954064047, "grad_norm": 0.39366188645362854, "learning_rate": 2.9101137826940532e-05, "loss": 0.1929, "step": 20300 }, { "epoch": 1.2727956382778718, "grad_norm": 0.37669408321380615, "learning_rate": 2.909058284604505e-05, "loss": 0.0468, "step": 20310 }, { "epoch": 1.2734223224916965, "grad_norm": 0.015775645151734352, "learning_rate": 2.9080027865149566e-05, "loss": 0.1132, "step": 20320 }, { "epoch": 1.274049006705521, "grad_norm": 0.24999144673347473, "learning_rate": 2.906947288425408e-05, "loss": 0.1911, "step": 20330 }, { "epoch": 1.2746756909193457, "grad_norm": 0.21520374715328217, "learning_rate": 2.9058917903358596e-05, "loss": 0.0826, "step": 20340 }, { "epoch": 1.2753023751331705, "grad_norm": 2.923757314682007, "learning_rate": 2.9048362922463112e-05, "loss": 0.1644, "step": 20350 }, { "epoch": 1.275929059346995, "grad_norm": 1.0977178812026978, "learning_rate": 2.903780794156763e-05, "loss": 0.0665, "step": 20360 }, { "epoch": 1.2765557435608197, "grad_norm": 1.5767126083374023, "learning_rate": 2.9027252960672142e-05, "loss": 0.0443, "step": 20370 }, { "epoch": 1.2771824277746444, "grad_norm": 1.1978888511657715, "learning_rate": 2.901669797977666e-05, "loss": 0.0876, "step": 20380 }, { "epoch": 1.2778091119884691, "grad_norm": 0.14035820960998535, "learning_rate": 2.9006142998881175e-05, "loss": 0.1417, "step": 20390 }, { "epoch": 1.2784357962022936, "grad_norm": 0.056992143392562866, "learning_rate": 2.8995588017985685e-05, "loss": 0.1431, "step": 20400 }, { "epoch": 1.2790624804161184, "grad_norm": 0.09089019894599915, "learning_rate": 2.8985033037090202e-05, "loss": 0.0829, "step": 20410 }, { "epoch": 1.2796891646299429, "grad_norm": 0.05624585598707199, "learning_rate": 2.8974478056194722e-05, "loss": 0.0377, "step": 20420 }, { "epoch": 1.2803158488437676, "grad_norm": 27.855571746826172, "learning_rate": 2.896392307529924e-05, "loss": 0.3182, "step": 20430 }, { "epoch": 1.2809425330575923, "grad_norm": 0.08453124761581421, "learning_rate": 2.895336809440375e-05, "loss": 0.1624, "step": 20440 }, { "epoch": 1.281569217271417, "grad_norm": 0.27798300981521606, "learning_rate": 2.8942813113508265e-05, "loss": 0.0519, "step": 20450 }, { "epoch": 1.2821959014852415, "grad_norm": 3.423630714416504, "learning_rate": 2.8932258132612782e-05, "loss": 0.0672, "step": 20460 }, { "epoch": 1.2828225856990663, "grad_norm": 1.1507163047790527, "learning_rate": 2.8921703151717295e-05, "loss": 0.0965, "step": 20470 }, { "epoch": 1.2834492699128908, "grad_norm": 0.03121231682598591, "learning_rate": 2.8911148170821812e-05, "loss": 0.007, "step": 20480 }, { "epoch": 1.2840759541267155, "grad_norm": 0.4905284345149994, "learning_rate": 2.890059318992633e-05, "loss": 0.0541, "step": 20490 }, { "epoch": 1.2847026383405402, "grad_norm": 0.020438535138964653, "learning_rate": 2.8890038209030845e-05, "loss": 0.002, "step": 20500 }, { "epoch": 1.285329322554365, "grad_norm": 5.502569198608398, "learning_rate": 2.887948322813536e-05, "loss": 0.293, "step": 20510 }, { "epoch": 1.2859560067681894, "grad_norm": 0.04871942475438118, "learning_rate": 2.8868928247239875e-05, "loss": 0.08, "step": 20520 }, { "epoch": 1.2865826909820142, "grad_norm": 0.3004648685455322, "learning_rate": 2.885837326634439e-05, "loss": 0.0476, "step": 20530 }, { "epoch": 1.287209375195839, "grad_norm": 6.2745041847229, "learning_rate": 2.88478182854489e-05, "loss": 0.169, "step": 20540 }, { "epoch": 1.2878360594096634, "grad_norm": 0.2036275714635849, "learning_rate": 2.8837263304553418e-05, "loss": 0.0132, "step": 20550 }, { "epoch": 1.2884627436234881, "grad_norm": 0.04330754280090332, "learning_rate": 2.8826708323657935e-05, "loss": 0.132, "step": 20560 }, { "epoch": 1.2890894278373128, "grad_norm": 0.04521123319864273, "learning_rate": 2.881615334276245e-05, "loss": 0.0356, "step": 20570 }, { "epoch": 1.2897161120511376, "grad_norm": 1.2175931930541992, "learning_rate": 2.8805598361866965e-05, "loss": 0.0142, "step": 20580 }, { "epoch": 1.290342796264962, "grad_norm": 0.34771111607551575, "learning_rate": 2.879504338097148e-05, "loss": 0.0052, "step": 20590 }, { "epoch": 1.2909694804787868, "grad_norm": 2.1885266304016113, "learning_rate": 2.8784488400075998e-05, "loss": 0.3319, "step": 20600 }, { "epoch": 1.2915961646926113, "grad_norm": 0.10941804945468903, "learning_rate": 2.877393341918051e-05, "loss": 0.1175, "step": 20610 }, { "epoch": 1.292222848906436, "grad_norm": 0.07854396849870682, "learning_rate": 2.8763378438285028e-05, "loss": 0.2803, "step": 20620 }, { "epoch": 1.2928495331202607, "grad_norm": 0.07036060094833374, "learning_rate": 2.8752823457389545e-05, "loss": 0.0268, "step": 20630 }, { "epoch": 1.2934762173340855, "grad_norm": 8.92553424835205, "learning_rate": 2.874226847649406e-05, "loss": 0.2271, "step": 20640 }, { "epoch": 1.29410290154791, "grad_norm": 0.1353772133588791, "learning_rate": 2.873171349559857e-05, "loss": 0.0537, "step": 20650 }, { "epoch": 1.2947295857617347, "grad_norm": 1.585835337638855, "learning_rate": 2.8721158514703088e-05, "loss": 0.1439, "step": 20660 }, { "epoch": 1.2953562699755592, "grad_norm": 0.1559787541627884, "learning_rate": 2.8710603533807608e-05, "loss": 0.0652, "step": 20670 }, { "epoch": 1.295982954189384, "grad_norm": 6.339298725128174, "learning_rate": 2.8700048552912124e-05, "loss": 0.1248, "step": 20680 }, { "epoch": 1.2966096384032086, "grad_norm": 4.56957483291626, "learning_rate": 2.8689493572016634e-05, "loss": 0.0374, "step": 20690 }, { "epoch": 1.2972363226170334, "grad_norm": 0.2088044285774231, "learning_rate": 2.867893859112115e-05, "loss": 0.0817, "step": 20700 }, { "epoch": 1.2978630068308579, "grad_norm": 0.021302178502082825, "learning_rate": 2.8668383610225668e-05, "loss": 0.0559, "step": 20710 }, { "epoch": 1.2984896910446826, "grad_norm": 0.028453296050429344, "learning_rate": 2.865782862933018e-05, "loss": 0.2479, "step": 20720 }, { "epoch": 1.2991163752585073, "grad_norm": 0.03374246507883072, "learning_rate": 2.8647273648434698e-05, "loss": 0.1142, "step": 20730 }, { "epoch": 1.2997430594723318, "grad_norm": 0.05600728467106819, "learning_rate": 2.8636718667539214e-05, "loss": 0.1438, "step": 20740 }, { "epoch": 1.3003697436861565, "grad_norm": 0.056089840829372406, "learning_rate": 2.862616368664373e-05, "loss": 0.0818, "step": 20750 }, { "epoch": 1.3009964278999813, "grad_norm": 0.04712797328829765, "learning_rate": 2.8615608705748244e-05, "loss": 0.1265, "step": 20760 }, { "epoch": 1.301623112113806, "grad_norm": 0.1540490686893463, "learning_rate": 2.860505372485276e-05, "loss": 0.0057, "step": 20770 }, { "epoch": 1.3022497963276305, "grad_norm": 4.8552565574646, "learning_rate": 2.8594498743957277e-05, "loss": 0.3901, "step": 20780 }, { "epoch": 1.3028764805414552, "grad_norm": 0.19207793474197388, "learning_rate": 2.8583943763061787e-05, "loss": 0.0407, "step": 20790 }, { "epoch": 1.3035031647552797, "grad_norm": 3.7182164192199707, "learning_rate": 2.8573388782166304e-05, "loss": 0.2068, "step": 20800 }, { "epoch": 1.3041298489691044, "grad_norm": 0.1881396323442459, "learning_rate": 2.856283380127082e-05, "loss": 0.126, "step": 20810 }, { "epoch": 1.3047565331829292, "grad_norm": 0.10974586009979248, "learning_rate": 2.8552278820375337e-05, "loss": 0.0115, "step": 20820 }, { "epoch": 1.305383217396754, "grad_norm": 0.01864585094153881, "learning_rate": 2.854172383947985e-05, "loss": 0.0115, "step": 20830 }, { "epoch": 1.3060099016105784, "grad_norm": 2.640251398086548, "learning_rate": 2.8531168858584367e-05, "loss": 0.1351, "step": 20840 }, { "epoch": 1.3066365858244031, "grad_norm": 0.036722589284181595, "learning_rate": 2.8520613877688884e-05, "loss": 0.0089, "step": 20850 }, { "epoch": 1.3072632700382276, "grad_norm": 0.7262831926345825, "learning_rate": 2.8510058896793397e-05, "loss": 0.1035, "step": 20860 }, { "epoch": 1.3078899542520523, "grad_norm": 9.36829662322998, "learning_rate": 2.8499503915897914e-05, "loss": 0.2305, "step": 20870 }, { "epoch": 1.308516638465877, "grad_norm": 0.041101399809122086, "learning_rate": 2.848894893500243e-05, "loss": 0.1464, "step": 20880 }, { "epoch": 1.3091433226797018, "grad_norm": 0.08307519555091858, "learning_rate": 2.8478393954106947e-05, "loss": 0.191, "step": 20890 }, { "epoch": 1.3097700068935263, "grad_norm": 0.17687416076660156, "learning_rate": 2.8467838973211457e-05, "loss": 0.2236, "step": 20900 }, { "epoch": 1.310396691107351, "grad_norm": 0.32803231477737427, "learning_rate": 2.8457283992315974e-05, "loss": 0.1196, "step": 20910 }, { "epoch": 1.3110233753211755, "grad_norm": 0.19486212730407715, "learning_rate": 2.8446729011420494e-05, "loss": 0.1044, "step": 20920 }, { "epoch": 1.3116500595350002, "grad_norm": 0.38289082050323486, "learning_rate": 2.8436174030525003e-05, "loss": 0.1773, "step": 20930 }, { "epoch": 1.312276743748825, "grad_norm": 0.06538600474596024, "learning_rate": 2.842561904962952e-05, "loss": 0.1115, "step": 20940 }, { "epoch": 1.3129034279626497, "grad_norm": 0.19104982912540436, "learning_rate": 2.8415064068734037e-05, "loss": 0.0813, "step": 20950 }, { "epoch": 1.3135301121764742, "grad_norm": 0.0975765660405159, "learning_rate": 2.8404509087838553e-05, "loss": 0.0691, "step": 20960 }, { "epoch": 1.314156796390299, "grad_norm": 0.1305026262998581, "learning_rate": 2.8393954106943067e-05, "loss": 0.1324, "step": 20970 }, { "epoch": 1.3147834806041236, "grad_norm": 0.05575420334935188, "learning_rate": 2.8383399126047583e-05, "loss": 0.0043, "step": 20980 }, { "epoch": 1.3154101648179481, "grad_norm": 0.028829995542764664, "learning_rate": 2.83728441451521e-05, "loss": 0.0905, "step": 20990 }, { "epoch": 1.3160368490317729, "grad_norm": 0.043897613883018494, "learning_rate": 2.836228916425661e-05, "loss": 0.0019, "step": 21000 }, { "epoch": 1.3166635332455976, "grad_norm": 0.029821641743183136, "learning_rate": 2.835173418336113e-05, "loss": 0.2736, "step": 21010 }, { "epoch": 1.3172902174594223, "grad_norm": 0.39863696694374084, "learning_rate": 2.8341179202465647e-05, "loss": 0.2053, "step": 21020 }, { "epoch": 1.3179169016732468, "grad_norm": 2.2105536460876465, "learning_rate": 2.8330624221570163e-05, "loss": 0.1366, "step": 21030 }, { "epoch": 1.3185435858870715, "grad_norm": 0.6516872048377991, "learning_rate": 2.8320069240674673e-05, "loss": 0.0166, "step": 21040 }, { "epoch": 1.319170270100896, "grad_norm": 0.0926242247223854, "learning_rate": 2.830951425977919e-05, "loss": 0.0083, "step": 21050 }, { "epoch": 1.3197969543147208, "grad_norm": 0.03947974368929863, "learning_rate": 2.8298959278883706e-05, "loss": 0.098, "step": 21060 }, { "epoch": 1.3204236385285455, "grad_norm": 0.01785365492105484, "learning_rate": 2.8288404297988223e-05, "loss": 0.0004, "step": 21070 }, { "epoch": 1.3210503227423702, "grad_norm": 0.11209467053413391, "learning_rate": 2.8277849317092736e-05, "loss": 0.1898, "step": 21080 }, { "epoch": 1.3216770069561947, "grad_norm": 0.022274712100625038, "learning_rate": 2.8267294336197253e-05, "loss": 0.2389, "step": 21090 }, { "epoch": 1.3223036911700194, "grad_norm": 0.02103334292769432, "learning_rate": 2.825673935530177e-05, "loss": 0.1084, "step": 21100 }, { "epoch": 1.322930375383844, "grad_norm": 0.03857764974236488, "learning_rate": 2.8246184374406283e-05, "loss": 0.0906, "step": 21110 }, { "epoch": 1.3235570595976687, "grad_norm": 0.04737214371562004, "learning_rate": 2.82356293935108e-05, "loss": 0.0023, "step": 21120 }, { "epoch": 1.3241837438114934, "grad_norm": 9.90311050415039, "learning_rate": 2.8225074412615316e-05, "loss": 0.2539, "step": 21130 }, { "epoch": 1.3248104280253181, "grad_norm": 0.11373288929462433, "learning_rate": 2.8214519431719833e-05, "loss": 0.0601, "step": 21140 }, { "epoch": 1.3254371122391426, "grad_norm": 9.301064491271973, "learning_rate": 2.8203964450824343e-05, "loss": 0.1733, "step": 21150 }, { "epoch": 1.3260637964529673, "grad_norm": 2.312943935394287, "learning_rate": 2.819340946992886e-05, "loss": 0.0314, "step": 21160 }, { "epoch": 1.326690480666792, "grad_norm": 0.01871352083981037, "learning_rate": 2.818285448903338e-05, "loss": 0.0727, "step": 21170 }, { "epoch": 1.3273171648806166, "grad_norm": 0.024634309113025665, "learning_rate": 2.817229950813789e-05, "loss": 0.0738, "step": 21180 }, { "epoch": 1.3279438490944413, "grad_norm": 0.04853309318423271, "learning_rate": 2.8161744527242406e-05, "loss": 0.1316, "step": 21190 }, { "epoch": 1.328570533308266, "grad_norm": 0.021960796788334846, "learning_rate": 2.8151189546346923e-05, "loss": 0.0776, "step": 21200 }, { "epoch": 1.3291972175220907, "grad_norm": 0.014680769294500351, "learning_rate": 2.814063456545144e-05, "loss": 0.1248, "step": 21210 }, { "epoch": 1.3298239017359152, "grad_norm": 0.012380057014524937, "learning_rate": 2.8130079584555952e-05, "loss": 0.0794, "step": 21220 }, { "epoch": 1.33045058594974, "grad_norm": 0.014920385554432869, "learning_rate": 2.811952460366047e-05, "loss": 0.0538, "step": 21230 }, { "epoch": 1.3310772701635645, "grad_norm": 4.893551349639893, "learning_rate": 2.8108969622764986e-05, "loss": 0.2239, "step": 21240 }, { "epoch": 1.3317039543773892, "grad_norm": 0.04235278442502022, "learning_rate": 2.8098414641869496e-05, "loss": 0.0165, "step": 21250 }, { "epoch": 1.332330638591214, "grad_norm": 0.85734623670578, "learning_rate": 2.8087859660974016e-05, "loss": 0.1398, "step": 21260 }, { "epoch": 1.3329573228050386, "grad_norm": 0.03186863660812378, "learning_rate": 2.8077304680078532e-05, "loss": 0.0695, "step": 21270 }, { "epoch": 1.3335840070188631, "grad_norm": 0.18006360530853271, "learning_rate": 2.806674969918305e-05, "loss": 0.1032, "step": 21280 }, { "epoch": 1.3342106912326879, "grad_norm": 0.03093225508928299, "learning_rate": 2.805619471828756e-05, "loss": 0.0228, "step": 21290 }, { "epoch": 1.3348373754465124, "grad_norm": 2.2488553524017334, "learning_rate": 2.8045639737392076e-05, "loss": 0.0423, "step": 21300 }, { "epoch": 1.335464059660337, "grad_norm": 0.012684832327067852, "learning_rate": 2.8035084756496592e-05, "loss": 0.0628, "step": 21310 }, { "epoch": 1.3360907438741618, "grad_norm": 0.013988979160785675, "learning_rate": 2.8024529775601105e-05, "loss": 0.0138, "step": 21320 }, { "epoch": 1.3367174280879865, "grad_norm": 0.020398080348968506, "learning_rate": 2.8013974794705622e-05, "loss": 0.1118, "step": 21330 }, { "epoch": 1.337344112301811, "grad_norm": 3.608513832092285, "learning_rate": 2.800341981381014e-05, "loss": 0.0097, "step": 21340 }, { "epoch": 1.3379707965156358, "grad_norm": 0.012503975071012974, "learning_rate": 2.7992864832914655e-05, "loss": 0.013, "step": 21350 }, { "epoch": 1.3385974807294605, "grad_norm": 0.14755772054195404, "learning_rate": 2.798230985201917e-05, "loss": 0.1064, "step": 21360 }, { "epoch": 1.339224164943285, "grad_norm": 0.04783150553703308, "learning_rate": 2.7971754871123685e-05, "loss": 0.1094, "step": 21370 }, { "epoch": 1.3398508491571097, "grad_norm": 0.027297774329781532, "learning_rate": 2.7961199890228202e-05, "loss": 0.0789, "step": 21380 }, { "epoch": 1.3404775333709344, "grad_norm": 0.02567318268120289, "learning_rate": 2.7950644909332712e-05, "loss": 0.0552, "step": 21390 }, { "epoch": 1.3411042175847592, "grad_norm": 10.832972526550293, "learning_rate": 2.794008992843723e-05, "loss": 0.2971, "step": 21400 }, { "epoch": 1.3417309017985837, "grad_norm": 0.05646326020359993, "learning_rate": 2.7929534947541745e-05, "loss": 0.1354, "step": 21410 }, { "epoch": 1.3423575860124084, "grad_norm": 0.16988039016723633, "learning_rate": 2.7918979966646265e-05, "loss": 0.0885, "step": 21420 }, { "epoch": 1.342984270226233, "grad_norm": 0.1426088958978653, "learning_rate": 2.7908424985750775e-05, "loss": 0.0904, "step": 21430 }, { "epoch": 1.3436109544400576, "grad_norm": 0.07782195508480072, "learning_rate": 2.7897870004855292e-05, "loss": 0.0178, "step": 21440 }, { "epoch": 1.3442376386538823, "grad_norm": 0.051355279982089996, "learning_rate": 2.788731502395981e-05, "loss": 0.095, "step": 21450 }, { "epoch": 1.344864322867707, "grad_norm": 3.4157044887542725, "learning_rate": 2.7876760043064325e-05, "loss": 0.124, "step": 21460 }, { "epoch": 1.3454910070815316, "grad_norm": 5.504192352294922, "learning_rate": 2.7866205062168838e-05, "loss": 0.1536, "step": 21470 }, { "epoch": 1.3461176912953563, "grad_norm": 0.0324571318924427, "learning_rate": 2.7855650081273355e-05, "loss": 0.2083, "step": 21480 }, { "epoch": 1.3467443755091808, "grad_norm": 0.06137267127633095, "learning_rate": 2.784509510037787e-05, "loss": 0.0064, "step": 21490 }, { "epoch": 1.3473710597230055, "grad_norm": 0.20257635414600372, "learning_rate": 2.783454011948238e-05, "loss": 0.2428, "step": 21500 }, { "epoch": 1.3479977439368303, "grad_norm": 0.45073238015174866, "learning_rate": 2.78239851385869e-05, "loss": 0.1297, "step": 21510 }, { "epoch": 1.348624428150655, "grad_norm": 0.13506528735160828, "learning_rate": 2.7813430157691418e-05, "loss": 0.1046, "step": 21520 }, { "epoch": 1.3492511123644795, "grad_norm": 0.137327641248703, "learning_rate": 2.7802875176795935e-05, "loss": 0.0049, "step": 21530 }, { "epoch": 1.3498777965783042, "grad_norm": 0.07374851405620575, "learning_rate": 2.7792320195900445e-05, "loss": 0.171, "step": 21540 }, { "epoch": 1.350504480792129, "grad_norm": 0.07294797897338867, "learning_rate": 2.778176521500496e-05, "loss": 0.0991, "step": 21550 }, { "epoch": 1.3511311650059534, "grad_norm": 0.6325371265411377, "learning_rate": 2.7771210234109478e-05, "loss": 0.0663, "step": 21560 }, { "epoch": 1.3517578492197782, "grad_norm": 0.7016847133636475, "learning_rate": 2.776065525321399e-05, "loss": 0.2094, "step": 21570 }, { "epoch": 1.3523845334336029, "grad_norm": 0.05865400284528732, "learning_rate": 2.7750100272318508e-05, "loss": 0.1343, "step": 21580 }, { "epoch": 1.3530112176474276, "grad_norm": 0.05986565724015236, "learning_rate": 2.7739545291423025e-05, "loss": 0.0565, "step": 21590 }, { "epoch": 1.353637901861252, "grad_norm": 3.4504475593566895, "learning_rate": 2.772899031052754e-05, "loss": 0.0961, "step": 21600 }, { "epoch": 1.3542645860750768, "grad_norm": 0.04019607976078987, "learning_rate": 2.7718435329632054e-05, "loss": 0.0988, "step": 21610 }, { "epoch": 1.3548912702889013, "grad_norm": 0.042770903557538986, "learning_rate": 2.770788034873657e-05, "loss": 0.1854, "step": 21620 }, { "epoch": 1.355517954502726, "grad_norm": 0.040806010365486145, "learning_rate": 2.7697325367841088e-05, "loss": 0.1581, "step": 21630 }, { "epoch": 1.3561446387165508, "grad_norm": 0.1587454378604889, "learning_rate": 2.7686770386945598e-05, "loss": 0.2592, "step": 21640 }, { "epoch": 1.3567713229303755, "grad_norm": 3.8640308380126953, "learning_rate": 2.7676215406050114e-05, "loss": 0.2352, "step": 21650 }, { "epoch": 1.3573980071442, "grad_norm": 6.425248622894287, "learning_rate": 2.766566042515463e-05, "loss": 0.1215, "step": 21660 }, { "epoch": 1.3580246913580247, "grad_norm": 0.16422656178474426, "learning_rate": 2.765510544425915e-05, "loss": 0.1265, "step": 21670 }, { "epoch": 1.3586513755718492, "grad_norm": 1.1562215089797974, "learning_rate": 2.764455046336366e-05, "loss": 0.1549, "step": 21680 }, { "epoch": 1.359278059785674, "grad_norm": 1.9711147546768188, "learning_rate": 2.7633995482468178e-05, "loss": 0.1053, "step": 21690 }, { "epoch": 1.3599047439994987, "grad_norm": 1.314103603363037, "learning_rate": 2.7623440501572694e-05, "loss": 0.0416, "step": 21700 }, { "epoch": 1.3605314282133234, "grad_norm": 0.0646156594157219, "learning_rate": 2.7612885520677207e-05, "loss": 0.0671, "step": 21710 }, { "epoch": 1.361158112427148, "grad_norm": 0.2056860476732254, "learning_rate": 2.7602330539781724e-05, "loss": 0.1479, "step": 21720 }, { "epoch": 1.3617847966409726, "grad_norm": 0.06871373951435089, "learning_rate": 2.759177555888624e-05, "loss": 0.0234, "step": 21730 }, { "epoch": 1.3624114808547971, "grad_norm": 0.07943173497915268, "learning_rate": 2.7581220577990757e-05, "loss": 0.2848, "step": 21740 }, { "epoch": 1.3630381650686219, "grad_norm": 0.18097421526908875, "learning_rate": 2.7570665597095267e-05, "loss": 0.1584, "step": 21750 }, { "epoch": 1.3636648492824466, "grad_norm": 0.20642918348312378, "learning_rate": 2.7560110616199787e-05, "loss": 0.0104, "step": 21760 }, { "epoch": 1.3642915334962713, "grad_norm": 0.0687187910079956, "learning_rate": 2.7549555635304304e-05, "loss": 0.2375, "step": 21770 }, { "epoch": 1.3649182177100958, "grad_norm": 0.3212626874446869, "learning_rate": 2.7539000654408814e-05, "loss": 0.0807, "step": 21780 }, { "epoch": 1.3655449019239205, "grad_norm": 0.07216355204582214, "learning_rate": 2.752844567351333e-05, "loss": 0.0046, "step": 21790 }, { "epoch": 1.3661715861377453, "grad_norm": 2.168639898300171, "learning_rate": 2.7517890692617847e-05, "loss": 0.1583, "step": 21800 }, { "epoch": 1.3667982703515698, "grad_norm": 0.12190362066030502, "learning_rate": 2.7507335711722364e-05, "loss": 0.0594, "step": 21810 }, { "epoch": 1.3674249545653945, "grad_norm": 0.09011233597993851, "learning_rate": 2.7496780730826877e-05, "loss": 0.1266, "step": 21820 }, { "epoch": 1.3680516387792192, "grad_norm": 0.18184854090213776, "learning_rate": 2.7486225749931394e-05, "loss": 0.0919, "step": 21830 }, { "epoch": 1.368678322993044, "grad_norm": 0.058258481323719025, "learning_rate": 2.747567076903591e-05, "loss": 0.0573, "step": 21840 }, { "epoch": 1.3693050072068684, "grad_norm": 4.3048577308654785, "learning_rate": 2.7465115788140427e-05, "loss": 0.0845, "step": 21850 }, { "epoch": 1.3699316914206932, "grad_norm": 0.04196920618414879, "learning_rate": 2.745456080724494e-05, "loss": 0.1047, "step": 21860 }, { "epoch": 1.3705583756345177, "grad_norm": 0.07941604405641556, "learning_rate": 2.7444005826349457e-05, "loss": 0.0029, "step": 21870 }, { "epoch": 1.3711850598483424, "grad_norm": 9.327536582946777, "learning_rate": 2.7433450845453974e-05, "loss": 0.2039, "step": 21880 }, { "epoch": 1.371811744062167, "grad_norm": 0.03211827203631401, "learning_rate": 2.7422895864558483e-05, "loss": 0.002, "step": 21890 }, { "epoch": 1.3724384282759918, "grad_norm": 0.4241490960121155, "learning_rate": 2.7412340883663e-05, "loss": 0.0386, "step": 21900 }, { "epoch": 1.3730651124898163, "grad_norm": 0.02552582323551178, "learning_rate": 2.7401785902767517e-05, "loss": 0.0797, "step": 21910 }, { "epoch": 1.373691796703641, "grad_norm": 0.08526312559843063, "learning_rate": 2.7391230921872037e-05, "loss": 0.1707, "step": 21920 }, { "epoch": 1.3743184809174656, "grad_norm": 0.07207024097442627, "learning_rate": 2.7380675940976547e-05, "loss": 0.0661, "step": 21930 }, { "epoch": 1.3749451651312903, "grad_norm": 0.10189100354909897, "learning_rate": 2.7370120960081063e-05, "loss": 0.2263, "step": 21940 }, { "epoch": 1.375571849345115, "grad_norm": 20.908044815063477, "learning_rate": 2.735956597918558e-05, "loss": 0.13, "step": 21950 }, { "epoch": 1.3761985335589397, "grad_norm": 0.1741807460784912, "learning_rate": 2.7349010998290093e-05, "loss": 0.1285, "step": 21960 }, { "epoch": 1.3768252177727642, "grad_norm": 0.03784070163965225, "learning_rate": 2.733845601739461e-05, "loss": 0.0372, "step": 21970 }, { "epoch": 1.377451901986589, "grad_norm": 0.1834995001554489, "learning_rate": 2.7327901036499127e-05, "loss": 0.0897, "step": 21980 }, { "epoch": 1.3780785862004137, "grad_norm": 0.14013051986694336, "learning_rate": 2.7317346055603643e-05, "loss": 0.0053, "step": 21990 }, { "epoch": 1.3787052704142382, "grad_norm": 0.028027813881635666, "learning_rate": 2.7306791074708153e-05, "loss": 0.0186, "step": 22000 }, { "epoch": 1.379331954628063, "grad_norm": 12.102706909179688, "learning_rate": 2.7296236093812673e-05, "loss": 0.2363, "step": 22010 }, { "epoch": 1.3799586388418876, "grad_norm": 0.07020573318004608, "learning_rate": 2.728568111291719e-05, "loss": 0.1667, "step": 22020 }, { "epoch": 1.3805853230557124, "grad_norm": 3.281811237335205, "learning_rate": 2.72751261320217e-05, "loss": 0.1397, "step": 22030 }, { "epoch": 1.3812120072695369, "grad_norm": 1.4566924571990967, "learning_rate": 2.7264571151126216e-05, "loss": 0.2678, "step": 22040 }, { "epoch": 1.3818386914833616, "grad_norm": 5.477617263793945, "learning_rate": 2.7254016170230733e-05, "loss": 0.2475, "step": 22050 }, { "epoch": 1.382465375697186, "grad_norm": 6.840038776397705, "learning_rate": 2.724346118933525e-05, "loss": 0.1136, "step": 22060 }, { "epoch": 1.3830920599110108, "grad_norm": 0.04201101139187813, "learning_rate": 2.7232906208439763e-05, "loss": 0.039, "step": 22070 }, { "epoch": 1.3837187441248355, "grad_norm": 0.6134777665138245, "learning_rate": 2.722235122754428e-05, "loss": 0.0134, "step": 22080 }, { "epoch": 1.3843454283386603, "grad_norm": 0.3734954595565796, "learning_rate": 2.7211796246648796e-05, "loss": 0.1143, "step": 22090 }, { "epoch": 1.3849721125524848, "grad_norm": 5.686514854431152, "learning_rate": 2.720124126575331e-05, "loss": 0.3968, "step": 22100 }, { "epoch": 1.3855987967663095, "grad_norm": 0.06281892955303192, "learning_rate": 2.7190686284857826e-05, "loss": 0.1116, "step": 22110 }, { "epoch": 1.386225480980134, "grad_norm": 0.1443566530942917, "learning_rate": 2.7180131303962343e-05, "loss": 0.1349, "step": 22120 }, { "epoch": 1.3868521651939587, "grad_norm": 2.338820695877075, "learning_rate": 2.716957632306686e-05, "loss": 0.0705, "step": 22130 }, { "epoch": 1.3874788494077834, "grad_norm": 0.23876045644283295, "learning_rate": 2.715902134217137e-05, "loss": 0.131, "step": 22140 }, { "epoch": 1.3881055336216082, "grad_norm": 0.12372996658086777, "learning_rate": 2.7148466361275886e-05, "loss": 0.0628, "step": 22150 }, { "epoch": 1.3887322178354327, "grad_norm": 0.6853922605514526, "learning_rate": 2.7137911380380403e-05, "loss": 0.0708, "step": 22160 }, { "epoch": 1.3893589020492574, "grad_norm": 0.05106097832322121, "learning_rate": 2.7127356399484916e-05, "loss": 0.085, "step": 22170 }, { "epoch": 1.389985586263082, "grad_norm": 7.483982086181641, "learning_rate": 2.7116801418589432e-05, "loss": 0.0481, "step": 22180 }, { "epoch": 1.3906122704769066, "grad_norm": 3.056711196899414, "learning_rate": 2.710624643769395e-05, "loss": 0.2775, "step": 22190 }, { "epoch": 1.3912389546907313, "grad_norm": 0.42939886450767517, "learning_rate": 2.7095691456798466e-05, "loss": 0.0791, "step": 22200 }, { "epoch": 1.391865638904556, "grad_norm": 0.26278284192085266, "learning_rate": 2.708513647590298e-05, "loss": 0.0883, "step": 22210 }, { "epoch": 1.3924923231183808, "grad_norm": 0.16981381177902222, "learning_rate": 2.7074581495007496e-05, "loss": 0.0157, "step": 22220 }, { "epoch": 1.3931190073322053, "grad_norm": 13.162467002868652, "learning_rate": 2.7064026514112012e-05, "loss": 0.0918, "step": 22230 }, { "epoch": 1.39374569154603, "grad_norm": 0.16399233043193817, "learning_rate": 2.705347153321653e-05, "loss": 0.1045, "step": 22240 }, { "epoch": 1.3943723757598545, "grad_norm": 0.020432839170098305, "learning_rate": 2.704291655232104e-05, "loss": 0.029, "step": 22250 }, { "epoch": 1.3949990599736792, "grad_norm": 0.023673556745052338, "learning_rate": 2.703236157142556e-05, "loss": 0.1443, "step": 22260 }, { "epoch": 1.395625744187504, "grad_norm": 5.3616204261779785, "learning_rate": 2.7021806590530076e-05, "loss": 0.1986, "step": 22270 }, { "epoch": 1.3962524284013287, "grad_norm": 5.972326278686523, "learning_rate": 2.7011251609634585e-05, "loss": 0.1403, "step": 22280 }, { "epoch": 1.3968791126151532, "grad_norm": 1.81098210811615, "learning_rate": 2.7000696628739102e-05, "loss": 0.0171, "step": 22290 }, { "epoch": 1.397505796828978, "grad_norm": 2.2598025798797607, "learning_rate": 2.699014164784362e-05, "loss": 0.0315, "step": 22300 }, { "epoch": 1.3981324810428024, "grad_norm": 0.057428207248449326, "learning_rate": 2.6979586666948135e-05, "loss": 0.0492, "step": 22310 }, { "epoch": 1.3987591652566271, "grad_norm": 11.328914642333984, "learning_rate": 2.696903168605265e-05, "loss": 0.0546, "step": 22320 }, { "epoch": 1.3993858494704519, "grad_norm": 0.03397885710000992, "learning_rate": 2.6958476705157165e-05, "loss": 0.1241, "step": 22330 }, { "epoch": 1.4000125336842766, "grad_norm": 0.1802971065044403, "learning_rate": 2.6947921724261682e-05, "loss": 0.2208, "step": 22340 }, { "epoch": 1.400639217898101, "grad_norm": 0.26217713952064514, "learning_rate": 2.6937366743366195e-05, "loss": 0.0595, "step": 22350 }, { "epoch": 1.4012659021119258, "grad_norm": 0.5535191893577576, "learning_rate": 2.6926811762470712e-05, "loss": 0.0478, "step": 22360 }, { "epoch": 1.4018925863257505, "grad_norm": 4.9853596687316895, "learning_rate": 2.691625678157523e-05, "loss": 0.1245, "step": 22370 }, { "epoch": 1.402519270539575, "grad_norm": 1.5813056230545044, "learning_rate": 2.6905701800679745e-05, "loss": 0.0386, "step": 22380 }, { "epoch": 1.4031459547533998, "grad_norm": 0.043805863708257675, "learning_rate": 2.6895146819784255e-05, "loss": 0.1035, "step": 22390 }, { "epoch": 1.4037726389672245, "grad_norm": 0.27779996395111084, "learning_rate": 2.688459183888877e-05, "loss": 0.129, "step": 22400 }, { "epoch": 1.404399323181049, "grad_norm": 0.3968289792537689, "learning_rate": 2.6874036857993288e-05, "loss": 0.1455, "step": 22410 }, { "epoch": 1.4050260073948737, "grad_norm": 0.04641241207718849, "learning_rate": 2.68634818770978e-05, "loss": 0.1024, "step": 22420 }, { "epoch": 1.4056526916086984, "grad_norm": 0.04604057967662811, "learning_rate": 2.6852926896202318e-05, "loss": 0.0392, "step": 22430 }, { "epoch": 1.406279375822523, "grad_norm": 0.15959686040878296, "learning_rate": 2.6842371915306835e-05, "loss": 0.2027, "step": 22440 }, { "epoch": 1.4069060600363477, "grad_norm": 0.08660081773996353, "learning_rate": 2.683181693441135e-05, "loss": 0.1929, "step": 22450 }, { "epoch": 1.4075327442501724, "grad_norm": 0.03353703022003174, "learning_rate": 2.6821261953515865e-05, "loss": 0.0416, "step": 22460 }, { "epoch": 1.408159428463997, "grad_norm": 0.5888323187828064, "learning_rate": 2.681070697262038e-05, "loss": 0.0217, "step": 22470 }, { "epoch": 1.4087861126778216, "grad_norm": 0.2682591676712036, "learning_rate": 2.6800151991724898e-05, "loss": 0.0112, "step": 22480 }, { "epoch": 1.4094127968916463, "grad_norm": 0.022292733192443848, "learning_rate": 2.6789597010829408e-05, "loss": 0.1059, "step": 22490 }, { "epoch": 1.4100394811054708, "grad_norm": 0.020648401230573654, "learning_rate": 2.6779042029933925e-05, "loss": 0.0329, "step": 22500 }, { "epoch": 1.4106661653192956, "grad_norm": 0.025373414158821106, "learning_rate": 2.6768487049038445e-05, "loss": 0.0957, "step": 22510 }, { "epoch": 1.4112928495331203, "grad_norm": 0.016700396314263344, "learning_rate": 2.675793206814296e-05, "loss": 0.0694, "step": 22520 }, { "epoch": 1.411919533746945, "grad_norm": 2.5584840774536133, "learning_rate": 2.674737708724747e-05, "loss": 0.0216, "step": 22530 }, { "epoch": 1.4125462179607695, "grad_norm": 0.023159755393862724, "learning_rate": 2.6736822106351988e-05, "loss": 0.1943, "step": 22540 }, { "epoch": 1.4131729021745942, "grad_norm": 0.09470424056053162, "learning_rate": 2.6726267125456504e-05, "loss": 0.0548, "step": 22550 }, { "epoch": 1.4137995863884187, "grad_norm": 0.02677854523062706, "learning_rate": 2.6715712144561018e-05, "loss": 0.1369, "step": 22560 }, { "epoch": 1.4144262706022435, "grad_norm": 0.03269219398498535, "learning_rate": 2.6705157163665534e-05, "loss": 0.0458, "step": 22570 }, { "epoch": 1.4150529548160682, "grad_norm": 2.119086742401123, "learning_rate": 2.669460218277005e-05, "loss": 0.0234, "step": 22580 }, { "epoch": 1.415679639029893, "grad_norm": 0.02962580882012844, "learning_rate": 2.6684047201874568e-05, "loss": 0.1117, "step": 22590 }, { "epoch": 1.4163063232437174, "grad_norm": 0.0268696341663599, "learning_rate": 2.667349222097908e-05, "loss": 0.0093, "step": 22600 }, { "epoch": 1.4169330074575421, "grad_norm": 0.020729854702949524, "learning_rate": 2.6662937240083598e-05, "loss": 0.1382, "step": 22610 }, { "epoch": 1.4175596916713669, "grad_norm": 9.315117835998535, "learning_rate": 2.6652382259188114e-05, "loss": 0.2076, "step": 22620 }, { "epoch": 1.4181863758851914, "grad_norm": 0.24118176102638245, "learning_rate": 2.6641827278292624e-05, "loss": 0.0406, "step": 22630 }, { "epoch": 1.418813060099016, "grad_norm": 6.347595691680908, "learning_rate": 2.663127229739714e-05, "loss": 0.0539, "step": 22640 }, { "epoch": 1.4194397443128408, "grad_norm": 0.05370037630200386, "learning_rate": 2.6620717316501657e-05, "loss": 0.1254, "step": 22650 }, { "epoch": 1.4200664285266655, "grad_norm": 0.020342102274298668, "learning_rate": 2.6610162335606174e-05, "loss": 0.0645, "step": 22660 }, { "epoch": 1.42069311274049, "grad_norm": 0.10898140072822571, "learning_rate": 2.6599607354710687e-05, "loss": 0.1335, "step": 22670 }, { "epoch": 1.4213197969543148, "grad_norm": 0.1842038780450821, "learning_rate": 2.6589052373815204e-05, "loss": 0.1147, "step": 22680 }, { "epoch": 1.4219464811681393, "grad_norm": 1.9294404983520508, "learning_rate": 2.657849739291972e-05, "loss": 0.0036, "step": 22690 }, { "epoch": 1.422573165381964, "grad_norm": 0.10110870003700256, "learning_rate": 2.6567942412024237e-05, "loss": 0.1219, "step": 22700 }, { "epoch": 1.4231998495957887, "grad_norm": 0.06406152248382568, "learning_rate": 2.655738743112875e-05, "loss": 0.092, "step": 22710 }, { "epoch": 1.4238265338096134, "grad_norm": 0.2047063410282135, "learning_rate": 2.6546832450233267e-05, "loss": 0.115, "step": 22720 }, { "epoch": 1.424453218023438, "grad_norm": 0.02261444926261902, "learning_rate": 2.6536277469337784e-05, "loss": 0.0672, "step": 22730 }, { "epoch": 1.4250799022372627, "grad_norm": 0.10477691888809204, "learning_rate": 2.6525722488442294e-05, "loss": 0.1383, "step": 22740 }, { "epoch": 1.4257065864510872, "grad_norm": 4.8798747062683105, "learning_rate": 2.651516750754681e-05, "loss": 0.1659, "step": 22750 }, { "epoch": 1.426333270664912, "grad_norm": 0.061641011387109756, "learning_rate": 2.650461252665133e-05, "loss": 0.1158, "step": 22760 }, { "epoch": 1.4269599548787366, "grad_norm": 0.09502819925546646, "learning_rate": 2.6494057545755847e-05, "loss": 0.1218, "step": 22770 }, { "epoch": 1.4275866390925613, "grad_norm": 0.12448341399431229, "learning_rate": 2.6483502564860357e-05, "loss": 0.1604, "step": 22780 }, { "epoch": 1.4282133233063858, "grad_norm": 0.3269909620285034, "learning_rate": 2.6472947583964874e-05, "loss": 0.152, "step": 22790 }, { "epoch": 1.4288400075202106, "grad_norm": 0.2900196611881256, "learning_rate": 2.646239260306939e-05, "loss": 0.0525, "step": 22800 }, { "epoch": 1.4294666917340353, "grad_norm": 0.027310775592923164, "learning_rate": 2.6451837622173904e-05, "loss": 0.0276, "step": 22810 }, { "epoch": 1.4300933759478598, "grad_norm": 0.02277687005698681, "learning_rate": 2.644128264127842e-05, "loss": 0.0277, "step": 22820 }, { "epoch": 1.4307200601616845, "grad_norm": 0.021468304097652435, "learning_rate": 2.6430727660382937e-05, "loss": 0.1039, "step": 22830 }, { "epoch": 1.4313467443755092, "grad_norm": 1.4674038887023926, "learning_rate": 2.6420172679487453e-05, "loss": 0.0692, "step": 22840 }, { "epoch": 1.431973428589334, "grad_norm": 0.0249196607619524, "learning_rate": 2.6409617698591967e-05, "loss": 0.1458, "step": 22850 }, { "epoch": 1.4326001128031585, "grad_norm": 0.018479831516742706, "learning_rate": 2.6399062717696483e-05, "loss": 0.0304, "step": 22860 }, { "epoch": 1.4332267970169832, "grad_norm": 0.020352143794298172, "learning_rate": 2.6388507736801e-05, "loss": 0.0045, "step": 22870 }, { "epoch": 1.4338534812308077, "grad_norm": 0.026012783870100975, "learning_rate": 2.637795275590551e-05, "loss": 0.1779, "step": 22880 }, { "epoch": 1.4344801654446324, "grad_norm": 0.030733680352568626, "learning_rate": 2.6367397775010027e-05, "loss": 0.0091, "step": 22890 }, { "epoch": 1.4351068496584571, "grad_norm": 0.16905340552330017, "learning_rate": 2.6356842794114543e-05, "loss": 0.1538, "step": 22900 }, { "epoch": 1.4357335338722819, "grad_norm": 0.07733534276485443, "learning_rate": 2.6346287813219063e-05, "loss": 0.0962, "step": 22910 }, { "epoch": 1.4363602180861064, "grad_norm": 0.12916617095470428, "learning_rate": 2.6335732832323573e-05, "loss": 0.0904, "step": 22920 }, { "epoch": 1.436986902299931, "grad_norm": 0.05547713115811348, "learning_rate": 2.632517785142809e-05, "loss": 0.0416, "step": 22930 }, { "epoch": 1.4376135865137556, "grad_norm": 4.29659366607666, "learning_rate": 2.6314622870532606e-05, "loss": 0.1677, "step": 22940 }, { "epoch": 1.4382402707275803, "grad_norm": 0.09266739338636398, "learning_rate": 2.630406788963712e-05, "loss": 0.1797, "step": 22950 }, { "epoch": 1.438866954941405, "grad_norm": 0.14676466584205627, "learning_rate": 2.6293512908741636e-05, "loss": 0.1898, "step": 22960 }, { "epoch": 1.4394936391552298, "grad_norm": 13.134486198425293, "learning_rate": 2.6282957927846153e-05, "loss": 0.2278, "step": 22970 }, { "epoch": 1.4401203233690543, "grad_norm": 0.40627726912498474, "learning_rate": 2.627240294695067e-05, "loss": 0.107, "step": 22980 }, { "epoch": 1.440747007582879, "grad_norm": 0.21462000906467438, "learning_rate": 2.626184796605518e-05, "loss": 0.0617, "step": 22990 }, { "epoch": 1.4413736917967037, "grad_norm": 0.14315448701381683, "learning_rate": 2.6251292985159696e-05, "loss": 0.2208, "step": 23000 }, { "epoch": 1.4420003760105282, "grad_norm": 0.1582183539867401, "learning_rate": 2.6240738004264216e-05, "loss": 0.0407, "step": 23010 }, { "epoch": 1.442627060224353, "grad_norm": 3.420944929122925, "learning_rate": 2.6230183023368726e-05, "loss": 0.2166, "step": 23020 }, { "epoch": 1.4432537444381777, "grad_norm": 0.08734721690416336, "learning_rate": 2.6219628042473243e-05, "loss": 0.0895, "step": 23030 }, { "epoch": 1.4438804286520024, "grad_norm": 3.684218406677246, "learning_rate": 2.620907306157776e-05, "loss": 0.1275, "step": 23040 }, { "epoch": 1.444507112865827, "grad_norm": 57.07538604736328, "learning_rate": 2.6198518080682276e-05, "loss": 0.0456, "step": 23050 }, { "epoch": 1.4451337970796516, "grad_norm": 0.13820041716098785, "learning_rate": 2.618796309978679e-05, "loss": 0.0564, "step": 23060 }, { "epoch": 1.4457604812934761, "grad_norm": 0.377420037984848, "learning_rate": 2.6177408118891306e-05, "loss": 0.3262, "step": 23070 }, { "epoch": 1.4463871655073008, "grad_norm": 14.740819931030273, "learning_rate": 2.6166853137995823e-05, "loss": 0.1122, "step": 23080 }, { "epoch": 1.4470138497211256, "grad_norm": 0.07182967662811279, "learning_rate": 2.615629815710034e-05, "loss": 0.0342, "step": 23090 }, { "epoch": 1.4476405339349503, "grad_norm": 0.36316248774528503, "learning_rate": 2.6145743176204853e-05, "loss": 0.2078, "step": 23100 }, { "epoch": 1.4482672181487748, "grad_norm": 0.03609143942594528, "learning_rate": 2.613518819530937e-05, "loss": 0.0505, "step": 23110 }, { "epoch": 1.4488939023625995, "grad_norm": 0.12309886515140533, "learning_rate": 2.6124633214413886e-05, "loss": 0.0456, "step": 23120 }, { "epoch": 1.449520586576424, "grad_norm": 0.06193877011537552, "learning_rate": 2.6114078233518396e-05, "loss": 0.0057, "step": 23130 }, { "epoch": 1.4501472707902487, "grad_norm": 0.04505753889679909, "learning_rate": 2.6103523252622912e-05, "loss": 0.2234, "step": 23140 }, { "epoch": 1.4507739550040735, "grad_norm": 0.039813585579395294, "learning_rate": 2.609296827172743e-05, "loss": 0.137, "step": 23150 }, { "epoch": 1.4514006392178982, "grad_norm": 0.10670791566371918, "learning_rate": 2.608241329083195e-05, "loss": 0.114, "step": 23160 }, { "epoch": 1.4520273234317227, "grad_norm": 0.05462603643536568, "learning_rate": 2.607185830993646e-05, "loss": 0.1053, "step": 23170 }, { "epoch": 1.4526540076455474, "grad_norm": 0.12284897267818451, "learning_rate": 2.6061303329040976e-05, "loss": 0.0603, "step": 23180 }, { "epoch": 1.4532806918593721, "grad_norm": 0.1386725902557373, "learning_rate": 2.6050748348145492e-05, "loss": 0.0096, "step": 23190 }, { "epoch": 1.4539073760731966, "grad_norm": 18.32283592224121, "learning_rate": 2.6040193367250005e-05, "loss": 0.1813, "step": 23200 }, { "epoch": 1.4545340602870214, "grad_norm": 0.04442450776696205, "learning_rate": 2.6029638386354522e-05, "loss": 0.0997, "step": 23210 }, { "epoch": 1.455160744500846, "grad_norm": 0.032071322202682495, "learning_rate": 2.601908340545904e-05, "loss": 0.0718, "step": 23220 }, { "epoch": 1.4557874287146706, "grad_norm": 0.02288047969341278, "learning_rate": 2.6008528424563555e-05, "loss": 0.003, "step": 23230 }, { "epoch": 1.4564141129284953, "grad_norm": 6.418118000030518, "learning_rate": 2.5997973443668065e-05, "loss": 0.0605, "step": 23240 }, { "epoch": 1.45704079714232, "grad_norm": 0.02514999732375145, "learning_rate": 2.5987418462772582e-05, "loss": 0.1446, "step": 23250 }, { "epoch": 1.4576674813561445, "grad_norm": 14.600542068481445, "learning_rate": 2.5976863481877102e-05, "loss": 0.0825, "step": 23260 }, { "epoch": 1.4582941655699693, "grad_norm": 0.05681881681084633, "learning_rate": 2.5966308500981612e-05, "loss": 0.0676, "step": 23270 }, { "epoch": 1.458920849783794, "grad_norm": 0.17089250683784485, "learning_rate": 2.595575352008613e-05, "loss": 0.1669, "step": 23280 }, { "epoch": 1.4595475339976187, "grad_norm": 0.09021216630935669, "learning_rate": 2.5945198539190645e-05, "loss": 0.1839, "step": 23290 }, { "epoch": 1.4601742182114432, "grad_norm": 0.03588192164897919, "learning_rate": 2.5934643558295162e-05, "loss": 0.0296, "step": 23300 }, { "epoch": 1.460800902425268, "grad_norm": 0.7508852481842041, "learning_rate": 2.5924088577399675e-05, "loss": 0.1099, "step": 23310 }, { "epoch": 1.4614275866390924, "grad_norm": 2.826711416244507, "learning_rate": 2.5913533596504192e-05, "loss": 0.1877, "step": 23320 }, { "epoch": 1.4620542708529172, "grad_norm": 2.2574098110198975, "learning_rate": 2.590297861560871e-05, "loss": 0.1415, "step": 23330 }, { "epoch": 1.462680955066742, "grad_norm": 2.780092239379883, "learning_rate": 2.5892423634713218e-05, "loss": 0.0736, "step": 23340 }, { "epoch": 1.4633076392805666, "grad_norm": 0.1354340761899948, "learning_rate": 2.588186865381774e-05, "loss": 0.2455, "step": 23350 }, { "epoch": 1.4639343234943911, "grad_norm": 0.03604722395539284, "learning_rate": 2.5871313672922255e-05, "loss": 0.0966, "step": 23360 }, { "epoch": 1.4645610077082158, "grad_norm": 0.07439220696687698, "learning_rate": 2.586075869202677e-05, "loss": 0.1228, "step": 23370 }, { "epoch": 1.4651876919220403, "grad_norm": 19.66285514831543, "learning_rate": 2.585020371113128e-05, "loss": 0.1963, "step": 23380 }, { "epoch": 1.465814376135865, "grad_norm": 10.755349159240723, "learning_rate": 2.5839648730235798e-05, "loss": 0.1596, "step": 23390 }, { "epoch": 1.4664410603496898, "grad_norm": 0.3825264871120453, "learning_rate": 2.5829093749340315e-05, "loss": 0.0476, "step": 23400 }, { "epoch": 1.4670677445635145, "grad_norm": 0.985248863697052, "learning_rate": 2.5818538768444828e-05, "loss": 0.1206, "step": 23410 }, { "epoch": 1.467694428777339, "grad_norm": 0.5918242335319519, "learning_rate": 2.5807983787549345e-05, "loss": 0.0314, "step": 23420 }, { "epoch": 1.4683211129911637, "grad_norm": 0.038249701261520386, "learning_rate": 2.579742880665386e-05, "loss": 0.0369, "step": 23430 }, { "epoch": 1.4689477972049885, "grad_norm": 0.04049490764737129, "learning_rate": 2.5786873825758378e-05, "loss": 0.1652, "step": 23440 }, { "epoch": 1.469574481418813, "grad_norm": 0.027155594900250435, "learning_rate": 2.577631884486289e-05, "loss": 0.1038, "step": 23450 }, { "epoch": 1.4702011656326377, "grad_norm": 1.8956443071365356, "learning_rate": 2.5765763863967408e-05, "loss": 0.135, "step": 23460 }, { "epoch": 1.4708278498464624, "grad_norm": 0.02950974553823471, "learning_rate": 2.5755208883071925e-05, "loss": 0.2385, "step": 23470 }, { "epoch": 1.4714545340602871, "grad_norm": 0.09720432013273239, "learning_rate": 2.574465390217644e-05, "loss": 0.0588, "step": 23480 }, { "epoch": 1.4720812182741116, "grad_norm": 0.08742736279964447, "learning_rate": 2.573409892128095e-05, "loss": 0.0941, "step": 23490 }, { "epoch": 1.4727079024879364, "grad_norm": 4.164999485015869, "learning_rate": 2.5723543940385468e-05, "loss": 0.2624, "step": 23500 }, { "epoch": 1.4733345867017609, "grad_norm": 0.3092673420906067, "learning_rate": 2.5712988959489988e-05, "loss": 0.0784, "step": 23510 }, { "epoch": 1.4739612709155856, "grad_norm": 0.16006037592887878, "learning_rate": 2.5702433978594498e-05, "loss": 0.0763, "step": 23520 }, { "epoch": 1.4745879551294103, "grad_norm": 7.904965400695801, "learning_rate": 2.5691878997699014e-05, "loss": 0.2243, "step": 23530 }, { "epoch": 1.475214639343235, "grad_norm": 7.310710430145264, "learning_rate": 2.568132401680353e-05, "loss": 0.1202, "step": 23540 }, { "epoch": 1.4758413235570595, "grad_norm": 3.766096830368042, "learning_rate": 2.5670769035908048e-05, "loss": 0.1949, "step": 23550 }, { "epoch": 1.4764680077708843, "grad_norm": 0.040846891701221466, "learning_rate": 2.566021405501256e-05, "loss": 0.1089, "step": 23560 }, { "epoch": 1.4770946919847088, "grad_norm": 8.319687843322754, "learning_rate": 2.5649659074117078e-05, "loss": 0.1005, "step": 23570 }, { "epoch": 1.4777213761985335, "grad_norm": 0.08896784484386444, "learning_rate": 2.5639104093221594e-05, "loss": 0.0395, "step": 23580 }, { "epoch": 1.4783480604123582, "grad_norm": 0.05673737823963165, "learning_rate": 2.5628549112326104e-05, "loss": 0.0738, "step": 23590 }, { "epoch": 1.478974744626183, "grad_norm": 0.14433351159095764, "learning_rate": 2.5617994131430624e-05, "loss": 0.111, "step": 23600 }, { "epoch": 1.4796014288400074, "grad_norm": 0.09921615570783615, "learning_rate": 2.560743915053514e-05, "loss": 0.2772, "step": 23610 }, { "epoch": 1.4802281130538322, "grad_norm": 0.5152047276496887, "learning_rate": 2.5596884169639657e-05, "loss": 0.0656, "step": 23620 }, { "epoch": 1.480854797267657, "grad_norm": 0.6339969038963318, "learning_rate": 2.5586329188744167e-05, "loss": 0.0307, "step": 23630 }, { "epoch": 1.4814814814814814, "grad_norm": 0.10018188506364822, "learning_rate": 2.5575774207848684e-05, "loss": 0.2019, "step": 23640 }, { "epoch": 1.4821081656953061, "grad_norm": 11.523775100708008, "learning_rate": 2.55652192269532e-05, "loss": 0.0213, "step": 23650 }, { "epoch": 1.4827348499091308, "grad_norm": 0.04751148447394371, "learning_rate": 2.5554664246057714e-05, "loss": 0.1503, "step": 23660 }, { "epoch": 1.4833615341229556, "grad_norm": 0.05950698256492615, "learning_rate": 2.554410926516223e-05, "loss": 0.0822, "step": 23670 }, { "epoch": 1.48398821833678, "grad_norm": 2.1161811351776123, "learning_rate": 2.5533554284266747e-05, "loss": 0.1776, "step": 23680 }, { "epoch": 1.4846149025506048, "grad_norm": 0.06839194893836975, "learning_rate": 2.5522999303371264e-05, "loss": 0.0641, "step": 23690 }, { "epoch": 1.4852415867644293, "grad_norm": 0.21065391600131989, "learning_rate": 2.5512444322475777e-05, "loss": 0.0117, "step": 23700 }, { "epoch": 1.485868270978254, "grad_norm": 9.681075096130371, "learning_rate": 2.5501889341580294e-05, "loss": 0.1, "step": 23710 }, { "epoch": 1.4864949551920787, "grad_norm": 0.11779667437076569, "learning_rate": 2.549133436068481e-05, "loss": 0.0378, "step": 23720 }, { "epoch": 1.4871216394059035, "grad_norm": 1.3199436664581299, "learning_rate": 2.548077937978932e-05, "loss": 0.1533, "step": 23730 }, { "epoch": 1.487748323619728, "grad_norm": 0.03990645334124565, "learning_rate": 2.5470224398893837e-05, "loss": 0.0279, "step": 23740 }, { "epoch": 1.4883750078335527, "grad_norm": 0.021615929901599884, "learning_rate": 2.5459669417998357e-05, "loss": 0.1285, "step": 23750 }, { "epoch": 1.4890016920473772, "grad_norm": 0.013920389115810394, "learning_rate": 2.5449114437102874e-05, "loss": 0.2084, "step": 23760 }, { "epoch": 1.489628376261202, "grad_norm": 0.08621327579021454, "learning_rate": 2.5438559456207383e-05, "loss": 0.1109, "step": 23770 }, { "epoch": 1.4902550604750266, "grad_norm": 0.018220432102680206, "learning_rate": 2.54280044753119e-05, "loss": 0.1331, "step": 23780 }, { "epoch": 1.4908817446888514, "grad_norm": 1.2734665870666504, "learning_rate": 2.5417449494416417e-05, "loss": 0.0118, "step": 23790 }, { "epoch": 1.4915084289026759, "grad_norm": 22.070463180541992, "learning_rate": 2.540689451352093e-05, "loss": 0.0705, "step": 23800 }, { "epoch": 1.4921351131165006, "grad_norm": 228.2330780029297, "learning_rate": 2.5396339532625447e-05, "loss": 0.0961, "step": 23810 }, { "epoch": 1.4927617973303253, "grad_norm": 6.635834217071533, "learning_rate": 2.5385784551729963e-05, "loss": 0.2087, "step": 23820 }, { "epoch": 1.4933884815441498, "grad_norm": 0.03844684734940529, "learning_rate": 2.537522957083448e-05, "loss": 0.1419, "step": 23830 }, { "epoch": 1.4940151657579746, "grad_norm": 0.1389332115650177, "learning_rate": 2.536467458993899e-05, "loss": 0.2215, "step": 23840 }, { "epoch": 1.4946418499717993, "grad_norm": 0.015771828591823578, "learning_rate": 2.535411960904351e-05, "loss": 0.0782, "step": 23850 }, { "epoch": 1.495268534185624, "grad_norm": 0.6827368140220642, "learning_rate": 2.5343564628148027e-05, "loss": 0.0727, "step": 23860 }, { "epoch": 1.4958952183994485, "grad_norm": 0.11785943806171417, "learning_rate": 2.5333009647252543e-05, "loss": 0.0739, "step": 23870 }, { "epoch": 1.4965219026132732, "grad_norm": 0.11551948636770248, "learning_rate": 2.5322454666357053e-05, "loss": 0.1316, "step": 23880 }, { "epoch": 1.4971485868270977, "grad_norm": 9.339971542358398, "learning_rate": 2.531189968546157e-05, "loss": 0.0852, "step": 23890 }, { "epoch": 1.4977752710409225, "grad_norm": 0.8613000512123108, "learning_rate": 2.5301344704566086e-05, "loss": 0.094, "step": 23900 }, { "epoch": 1.4984019552547472, "grad_norm": 0.05517628416419029, "learning_rate": 2.52907897236706e-05, "loss": 0.2456, "step": 23910 }, { "epoch": 1.499028639468572, "grad_norm": 0.04880961403250694, "learning_rate": 2.5280234742775116e-05, "loss": 0.1071, "step": 23920 }, { "epoch": 1.4996553236823964, "grad_norm": 0.04997780919075012, "learning_rate": 2.5269679761879633e-05, "loss": 0.0426, "step": 23930 }, { "epoch": 1.5002820078962211, "grad_norm": 0.047217972576618195, "learning_rate": 2.525912478098415e-05, "loss": 0.1243, "step": 23940 }, { "epoch": 1.5009086921100456, "grad_norm": 0.09410211443901062, "learning_rate": 2.5248569800088663e-05, "loss": 0.0575, "step": 23950 }, { "epoch": 1.5015353763238704, "grad_norm": 3.0829415321350098, "learning_rate": 2.523801481919318e-05, "loss": 0.1165, "step": 23960 }, { "epoch": 1.502162060537695, "grad_norm": 0.029031164944171906, "learning_rate": 2.5227459838297696e-05, "loss": 0.1641, "step": 23970 }, { "epoch": 1.5027887447515198, "grad_norm": 0.09065140038728714, "learning_rate": 2.5216904857402206e-05, "loss": 0.0072, "step": 23980 }, { "epoch": 1.5034154289653445, "grad_norm": 0.08739957213401794, "learning_rate": 2.5206349876506723e-05, "loss": 0.1343, "step": 23990 }, { "epoch": 1.504042113179169, "grad_norm": 0.4279542863368988, "learning_rate": 2.5195794895611243e-05, "loss": 0.0366, "step": 24000 }, { "epoch": 1.5046687973929935, "grad_norm": 0.09616062790155411, "learning_rate": 2.518523991471576e-05, "loss": 0.0371, "step": 24010 }, { "epoch": 1.5052954816068183, "grad_norm": 0.1089567318558693, "learning_rate": 2.517468493382027e-05, "loss": 0.2015, "step": 24020 }, { "epoch": 1.505922165820643, "grad_norm": 0.04283067211508751, "learning_rate": 2.5164129952924786e-05, "loss": 0.1507, "step": 24030 }, { "epoch": 1.5065488500344677, "grad_norm": 0.09019152820110321, "learning_rate": 2.5153574972029303e-05, "loss": 0.0534, "step": 24040 }, { "epoch": 1.5071755342482924, "grad_norm": 0.11618775874376297, "learning_rate": 2.5143019991133816e-05, "loss": 0.1746, "step": 24050 }, { "epoch": 1.507802218462117, "grad_norm": 1.8899295330047607, "learning_rate": 2.5132465010238332e-05, "loss": 0.0871, "step": 24060 }, { "epoch": 1.5084289026759414, "grad_norm": 0.05963942036032677, "learning_rate": 2.512191002934285e-05, "loss": 0.073, "step": 24070 }, { "epoch": 1.5090555868897662, "grad_norm": 12.7268705368042, "learning_rate": 2.5111355048447366e-05, "loss": 0.128, "step": 24080 }, { "epoch": 1.5096822711035909, "grad_norm": 0.5341861844062805, "learning_rate": 2.5100800067551876e-05, "loss": 0.0791, "step": 24090 }, { "epoch": 1.5103089553174156, "grad_norm": 2.8714306354522705, "learning_rate": 2.5090245086656396e-05, "loss": 0.0371, "step": 24100 }, { "epoch": 1.5109356395312403, "grad_norm": 0.1821633130311966, "learning_rate": 2.5079690105760912e-05, "loss": 0.0986, "step": 24110 }, { "epoch": 1.5115623237450648, "grad_norm": 1.7185479402542114, "learning_rate": 2.5069135124865422e-05, "loss": 0.139, "step": 24120 }, { "epoch": 1.5121890079588896, "grad_norm": 4.6903977394104, "learning_rate": 2.505858014396994e-05, "loss": 0.1248, "step": 24130 }, { "epoch": 1.512815692172714, "grad_norm": 9.343801498413086, "learning_rate": 2.5048025163074456e-05, "loss": 0.1528, "step": 24140 }, { "epoch": 1.5134423763865388, "grad_norm": 0.07593511790037155, "learning_rate": 2.5037470182178972e-05, "loss": 0.0746, "step": 24150 }, { "epoch": 1.5140690606003635, "grad_norm": 0.07884012162685394, "learning_rate": 2.5026915201283485e-05, "loss": 0.1444, "step": 24160 }, { "epoch": 1.5146957448141882, "grad_norm": 1.0196356773376465, "learning_rate": 2.5016360220388002e-05, "loss": 0.014, "step": 24170 }, { "epoch": 1.5153224290280127, "grad_norm": 3.953526735305786, "learning_rate": 2.500580523949252e-05, "loss": 0.1165, "step": 24180 }, { "epoch": 1.5159491132418375, "grad_norm": 0.19802182912826538, "learning_rate": 2.4995250258597032e-05, "loss": 0.0583, "step": 24190 }, { "epoch": 1.516575797455662, "grad_norm": 0.1346462517976761, "learning_rate": 2.498469527770155e-05, "loss": 0.0169, "step": 24200 }, { "epoch": 1.5172024816694867, "grad_norm": 0.08564218133687973, "learning_rate": 2.4974140296806065e-05, "loss": 0.0021, "step": 24210 }, { "epoch": 1.5178291658833114, "grad_norm": 3.5072386264801025, "learning_rate": 2.496358531591058e-05, "loss": 0.0853, "step": 24220 }, { "epoch": 1.5184558500971361, "grad_norm": 0.09622887521982193, "learning_rate": 2.4953030335015095e-05, "loss": 0.1146, "step": 24230 }, { "epoch": 1.5190825343109609, "grad_norm": 0.17273660004138947, "learning_rate": 2.494247535411961e-05, "loss": 0.0363, "step": 24240 }, { "epoch": 1.5197092185247854, "grad_norm": 0.019315486773848534, "learning_rate": 2.493192037322413e-05, "loss": 0.1146, "step": 24250 }, { "epoch": 1.5203359027386099, "grad_norm": 0.017586050555109978, "learning_rate": 2.4921365392328642e-05, "loss": 0.1751, "step": 24260 }, { "epoch": 1.5209625869524346, "grad_norm": 5.868565559387207, "learning_rate": 2.4910810411433155e-05, "loss": 0.1345, "step": 24270 }, { "epoch": 1.5215892711662593, "grad_norm": 0.08116251975297928, "learning_rate": 2.490025543053767e-05, "loss": 0.309, "step": 24280 }, { "epoch": 1.522215955380084, "grad_norm": 2.3880562782287598, "learning_rate": 2.4889700449642185e-05, "loss": 0.2768, "step": 24290 }, { "epoch": 1.5228426395939088, "grad_norm": 0.20012100040912628, "learning_rate": 2.4879145468746705e-05, "loss": 0.0256, "step": 24300 }, { "epoch": 1.5234693238077333, "grad_norm": 0.08985351771116257, "learning_rate": 2.4868590487851218e-05, "loss": 0.0712, "step": 24310 }, { "epoch": 1.524096008021558, "grad_norm": 0.023399904370307922, "learning_rate": 2.4858035506955735e-05, "loss": 0.0897, "step": 24320 }, { "epoch": 1.5247226922353825, "grad_norm": 0.08582006394863129, "learning_rate": 2.4847480526060248e-05, "loss": 0.0974, "step": 24330 }, { "epoch": 1.5253493764492072, "grad_norm": 0.03750643879175186, "learning_rate": 2.483692554516476e-05, "loss": 0.1763, "step": 24340 }, { "epoch": 1.525976060663032, "grad_norm": 0.04496735706925392, "learning_rate": 2.482637056426928e-05, "loss": 0.0854, "step": 24350 }, { "epoch": 1.5266027448768567, "grad_norm": 0.06347126513719559, "learning_rate": 2.4815815583373795e-05, "loss": 0.0259, "step": 24360 }, { "epoch": 1.5272294290906812, "grad_norm": 0.05704139918088913, "learning_rate": 2.480526060247831e-05, "loss": 0.1453, "step": 24370 }, { "epoch": 1.5278561133045059, "grad_norm": 0.04704032838344574, "learning_rate": 2.4794705621582825e-05, "loss": 0.0647, "step": 24380 }, { "epoch": 1.5284827975183304, "grad_norm": 0.05256137251853943, "learning_rate": 2.478415064068734e-05, "loss": 0.0646, "step": 24390 }, { "epoch": 1.529109481732155, "grad_norm": 0.05079631507396698, "learning_rate": 2.4773595659791858e-05, "loss": 0.1603, "step": 24400 }, { "epoch": 1.5297361659459798, "grad_norm": 0.0261594969779253, "learning_rate": 2.4763040678896375e-05, "loss": 0.0042, "step": 24410 }, { "epoch": 1.5303628501598046, "grad_norm": 7.628668785095215, "learning_rate": 2.4752485698000888e-05, "loss": 0.2903, "step": 24420 }, { "epoch": 1.5309895343736293, "grad_norm": 3.294372081756592, "learning_rate": 2.47419307171054e-05, "loss": 0.2368, "step": 24430 }, { "epoch": 1.5316162185874538, "grad_norm": 4.897356986999512, "learning_rate": 2.4731375736209918e-05, "loss": 0.1527, "step": 24440 }, { "epoch": 1.5322429028012783, "grad_norm": 0.15121100842952728, "learning_rate": 2.4720820755314434e-05, "loss": 0.0644, "step": 24450 }, { "epoch": 1.532869587015103, "grad_norm": 14.861132621765137, "learning_rate": 2.471026577441895e-05, "loss": 0.1316, "step": 24460 }, { "epoch": 1.5334962712289277, "grad_norm": 0.042586345225572586, "learning_rate": 2.4699710793523464e-05, "loss": 0.286, "step": 24470 }, { "epoch": 1.5341229554427525, "grad_norm": 1.1130905151367188, "learning_rate": 2.468915581262798e-05, "loss": 0.1371, "step": 24480 }, { "epoch": 1.5347496396565772, "grad_norm": 0.2588573396205902, "learning_rate": 2.4678600831732494e-05, "loss": 0.0332, "step": 24490 }, { "epoch": 1.5353763238704017, "grad_norm": 0.22446982562541962, "learning_rate": 2.466804585083701e-05, "loss": 0.0147, "step": 24500 }, { "epoch": 1.5360030080842264, "grad_norm": 0.251001238822937, "learning_rate": 2.4657490869941528e-05, "loss": 0.0458, "step": 24510 }, { "epoch": 1.536629692298051, "grad_norm": 4.180436134338379, "learning_rate": 2.464693588904604e-05, "loss": 0.1391, "step": 24520 }, { "epoch": 1.5372563765118756, "grad_norm": 0.04928579553961754, "learning_rate": 2.4636380908150557e-05, "loss": 0.1121, "step": 24530 }, { "epoch": 1.5378830607257004, "grad_norm": 0.12214578688144684, "learning_rate": 2.462582592725507e-05, "loss": 0.2212, "step": 24540 }, { "epoch": 1.538509744939525, "grad_norm": 0.13402099907398224, "learning_rate": 2.461527094635959e-05, "loss": 0.1176, "step": 24550 }, { "epoch": 1.5391364291533496, "grad_norm": 0.04098990187048912, "learning_rate": 2.4604715965464104e-05, "loss": 0.0895, "step": 24560 }, { "epoch": 1.5397631133671743, "grad_norm": 0.22888420522212982, "learning_rate": 2.4594160984568617e-05, "loss": 0.0536, "step": 24570 }, { "epoch": 1.5403897975809988, "grad_norm": 0.09464392066001892, "learning_rate": 2.4583606003673134e-05, "loss": 0.0432, "step": 24580 }, { "epoch": 1.5410164817948235, "grad_norm": 0.012952485121786594, "learning_rate": 2.457305102277765e-05, "loss": 0.0356, "step": 24590 }, { "epoch": 1.5416431660086483, "grad_norm": 2.1808762550354004, "learning_rate": 2.4562496041882167e-05, "loss": 0.2454, "step": 24600 }, { "epoch": 1.542269850222473, "grad_norm": 0.05670209601521492, "learning_rate": 2.455194106098668e-05, "loss": 0.0498, "step": 24610 }, { "epoch": 1.5428965344362977, "grad_norm": 0.48406851291656494, "learning_rate": 2.4541386080091197e-05, "loss": 0.0974, "step": 24620 }, { "epoch": 1.5435232186501222, "grad_norm": 0.687736988067627, "learning_rate": 2.453083109919571e-05, "loss": 0.027, "step": 24630 }, { "epoch": 1.5441499028639467, "grad_norm": 0.5006890892982483, "learning_rate": 2.4520276118300227e-05, "loss": 0.093, "step": 24640 }, { "epoch": 1.5447765870777714, "grad_norm": 0.007649289909750223, "learning_rate": 2.4509721137404744e-05, "loss": 0.0057, "step": 24650 }, { "epoch": 1.5454032712915962, "grad_norm": 0.055010851472616196, "learning_rate": 2.4499166156509257e-05, "loss": 0.108, "step": 24660 }, { "epoch": 1.5460299555054209, "grad_norm": 0.07023721933364868, "learning_rate": 2.4488611175613774e-05, "loss": 0.0254, "step": 24670 }, { "epoch": 1.5466566397192456, "grad_norm": 0.15422725677490234, "learning_rate": 2.4478056194718287e-05, "loss": 0.1481, "step": 24680 }, { "epoch": 1.54728332393307, "grad_norm": 0.0953349694609642, "learning_rate": 2.4467501213822804e-05, "loss": 0.0028, "step": 24690 }, { "epoch": 1.5479100081468948, "grad_norm": 0.03705024719238281, "learning_rate": 2.445694623292732e-05, "loss": 0.187, "step": 24700 }, { "epoch": 1.5485366923607193, "grad_norm": 0.224687397480011, "learning_rate": 2.4446391252031837e-05, "loss": 0.1034, "step": 24710 }, { "epoch": 1.549163376574544, "grad_norm": 0.056175027042627335, "learning_rate": 2.443583627113635e-05, "loss": 0.0122, "step": 24720 }, { "epoch": 1.5497900607883688, "grad_norm": 3.002753496170044, "learning_rate": 2.4425281290240863e-05, "loss": 0.1492, "step": 24730 }, { "epoch": 1.5504167450021935, "grad_norm": 0.20835314691066742, "learning_rate": 2.441472630934538e-05, "loss": 0.0698, "step": 24740 }, { "epoch": 1.551043429216018, "grad_norm": 40.73422622680664, "learning_rate": 2.4404171328449897e-05, "loss": 0.2096, "step": 24750 }, { "epoch": 1.5516701134298427, "grad_norm": 0.05524049699306488, "learning_rate": 2.4393616347554413e-05, "loss": 0.0075, "step": 24760 }, { "epoch": 1.5522967976436672, "grad_norm": 0.08031153678894043, "learning_rate": 2.4383061366658927e-05, "loss": 0.2378, "step": 24770 }, { "epoch": 1.552923481857492, "grad_norm": 0.09899525344371796, "learning_rate": 2.4372506385763443e-05, "loss": 0.1061, "step": 24780 }, { "epoch": 1.5535501660713167, "grad_norm": 0.06336953490972519, "learning_rate": 2.4361951404867957e-05, "loss": 0.0193, "step": 24790 }, { "epoch": 1.5541768502851414, "grad_norm": 0.03865973278880119, "learning_rate": 2.4351396423972477e-05, "loss": 0.2515, "step": 24800 }, { "epoch": 1.5548035344989661, "grad_norm": 0.33897021412849426, "learning_rate": 2.434084144307699e-05, "loss": 0.0617, "step": 24810 }, { "epoch": 1.5554302187127906, "grad_norm": 6.761913776397705, "learning_rate": 2.4330286462181503e-05, "loss": 0.1185, "step": 24820 }, { "epoch": 1.5560569029266151, "grad_norm": 0.1353708952665329, "learning_rate": 2.431973148128602e-05, "loss": 0.048, "step": 24830 }, { "epoch": 1.5566835871404399, "grad_norm": 0.38180655241012573, "learning_rate": 2.4309176500390536e-05, "loss": 0.098, "step": 24840 }, { "epoch": 1.5573102713542646, "grad_norm": 0.05842465162277222, "learning_rate": 2.4298621519495053e-05, "loss": 0.1091, "step": 24850 }, { "epoch": 1.5579369555680893, "grad_norm": 27.77541160583496, "learning_rate": 2.4288066538599566e-05, "loss": 0.1175, "step": 24860 }, { "epoch": 1.558563639781914, "grad_norm": 0.08940772712230682, "learning_rate": 2.4277511557704083e-05, "loss": 0.0748, "step": 24870 }, { "epoch": 1.5591903239957385, "grad_norm": 0.2604180872440338, "learning_rate": 2.4266956576808596e-05, "loss": 0.11, "step": 24880 }, { "epoch": 1.559817008209563, "grad_norm": 0.12213179469108582, "learning_rate": 2.4256401595913113e-05, "loss": 0.0412, "step": 24890 }, { "epoch": 1.5604436924233878, "grad_norm": 9.398280143737793, "learning_rate": 2.424584661501763e-05, "loss": 0.0677, "step": 24900 }, { "epoch": 1.5610703766372125, "grad_norm": 10.272821426391602, "learning_rate": 2.4235291634122143e-05, "loss": 0.1476, "step": 24910 }, { "epoch": 1.5616970608510372, "grad_norm": 0.024012258276343346, "learning_rate": 2.422473665322666e-05, "loss": 0.0965, "step": 24920 }, { "epoch": 1.562323745064862, "grad_norm": 0.02501060627400875, "learning_rate": 2.4214181672331173e-05, "loss": 0.0118, "step": 24930 }, { "epoch": 1.5629504292786864, "grad_norm": 5.121936321258545, "learning_rate": 2.420362669143569e-05, "loss": 0.1386, "step": 24940 }, { "epoch": 1.5635771134925112, "grad_norm": 0.07374687492847443, "learning_rate": 2.4193071710540206e-05, "loss": 0.0618, "step": 24950 }, { "epoch": 1.5642037977063357, "grad_norm": 0.05653063580393791, "learning_rate": 2.418251672964472e-05, "loss": 0.0351, "step": 24960 }, { "epoch": 1.5648304819201604, "grad_norm": 89.24818420410156, "learning_rate": 2.4171961748749236e-05, "loss": 0.5215, "step": 24970 }, { "epoch": 1.5654571661339851, "grad_norm": 7.567296504974365, "learning_rate": 2.416140676785375e-05, "loss": 0.244, "step": 24980 }, { "epoch": 1.5660838503478098, "grad_norm": 0.25020354986190796, "learning_rate": 2.4150851786958266e-05, "loss": 0.008, "step": 24990 }, { "epoch": 1.5667105345616343, "grad_norm": 0.6312108635902405, "learning_rate": 2.4140296806062782e-05, "loss": 0.0212, "step": 25000 }, { "epoch": 1.567337218775459, "grad_norm": 0.08929482102394104, "learning_rate": 2.41297418251673e-05, "loss": 0.1959, "step": 25010 }, { "epoch": 1.5679639029892836, "grad_norm": 4.561793327331543, "learning_rate": 2.4119186844271812e-05, "loss": 0.2022, "step": 25020 }, { "epoch": 1.5685905872031083, "grad_norm": 0.09889715164899826, "learning_rate": 2.410863186337633e-05, "loss": 0.165, "step": 25030 }, { "epoch": 1.569217271416933, "grad_norm": 0.21414883434772491, "learning_rate": 2.4098076882480842e-05, "loss": 0.152, "step": 25040 }, { "epoch": 1.5698439556307577, "grad_norm": 0.10287700593471527, "learning_rate": 2.408752190158536e-05, "loss": 0.0582, "step": 25050 }, { "epoch": 1.5704706398445825, "grad_norm": 0.013686907477676868, "learning_rate": 2.4076966920689876e-05, "loss": 0.0469, "step": 25060 }, { "epoch": 1.571097324058407, "grad_norm": 2.523181915283203, "learning_rate": 2.406641193979439e-05, "loss": 0.0847, "step": 25070 }, { "epoch": 1.5717240082722315, "grad_norm": 0.11900023370981216, "learning_rate": 2.4055856958898906e-05, "loss": 0.185, "step": 25080 }, { "epoch": 1.5723506924860562, "grad_norm": 9.391716003417969, "learning_rate": 2.4045301978003422e-05, "loss": 0.1941, "step": 25090 }, { "epoch": 1.572977376699881, "grad_norm": 0.12244875729084015, "learning_rate": 2.403474699710794e-05, "loss": 0.1257, "step": 25100 }, { "epoch": 1.5736040609137056, "grad_norm": 6.436902046203613, "learning_rate": 2.4024192016212452e-05, "loss": 0.0995, "step": 25110 }, { "epoch": 1.5742307451275304, "grad_norm": 10.673419952392578, "learning_rate": 2.4013637035316965e-05, "loss": 0.1573, "step": 25120 }, { "epoch": 1.5748574293413549, "grad_norm": 0.02762596309185028, "learning_rate": 2.4003082054421482e-05, "loss": 0.0856, "step": 25130 }, { "epoch": 1.5754841135551796, "grad_norm": 0.051068298518657684, "learning_rate": 2.3992527073526e-05, "loss": 0.0035, "step": 25140 }, { "epoch": 1.576110797769004, "grad_norm": 0.03830769285559654, "learning_rate": 2.3981972092630515e-05, "loss": 0.065, "step": 25150 }, { "epoch": 1.5767374819828288, "grad_norm": 5.371109962463379, "learning_rate": 2.397141711173503e-05, "loss": 0.1066, "step": 25160 }, { "epoch": 1.5773641661966535, "grad_norm": 0.01236297283321619, "learning_rate": 2.3960862130839545e-05, "loss": 0.1319, "step": 25170 }, { "epoch": 1.5779908504104783, "grad_norm": 8.42038345336914, "learning_rate": 2.395030714994406e-05, "loss": 0.1462, "step": 25180 }, { "epoch": 1.5786175346243028, "grad_norm": 13.973833084106445, "learning_rate": 2.3939752169048575e-05, "loss": 0.0665, "step": 25190 }, { "epoch": 1.5792442188381275, "grad_norm": 0.06125205382704735, "learning_rate": 2.3929197188153092e-05, "loss": 0.0698, "step": 25200 }, { "epoch": 1.579870903051952, "grad_norm": 0.10730314999818802, "learning_rate": 2.3918642207257605e-05, "loss": 0.0668, "step": 25210 }, { "epoch": 1.5804975872657767, "grad_norm": 10.271492958068848, "learning_rate": 2.3908087226362122e-05, "loss": 0.148, "step": 25220 }, { "epoch": 1.5811242714796014, "grad_norm": 0.1615249067544937, "learning_rate": 2.3897532245466635e-05, "loss": 0.0969, "step": 25230 }, { "epoch": 1.5817509556934262, "grad_norm": 0.05994172394275665, "learning_rate": 2.388697726457115e-05, "loss": 0.0318, "step": 25240 }, { "epoch": 1.5823776399072509, "grad_norm": 0.10029243677854538, "learning_rate": 2.3876422283675668e-05, "loss": 0.1715, "step": 25250 }, { "epoch": 1.5830043241210754, "grad_norm": 0.036729391664266586, "learning_rate": 2.3865867302780185e-05, "loss": 0.1219, "step": 25260 }, { "epoch": 1.5836310083349, "grad_norm": 6.817121505737305, "learning_rate": 2.3855312321884698e-05, "loss": 0.0587, "step": 25270 }, { "epoch": 1.5842576925487246, "grad_norm": 0.25154295563697815, "learning_rate": 2.384475734098921e-05, "loss": 0.2074, "step": 25280 }, { "epoch": 1.5848843767625493, "grad_norm": 0.04093783348798752, "learning_rate": 2.3834202360093728e-05, "loss": 0.1101, "step": 25290 }, { "epoch": 1.585511060976374, "grad_norm": 0.13500647246837616, "learning_rate": 2.3823647379198245e-05, "loss": 0.0332, "step": 25300 }, { "epoch": 1.5861377451901988, "grad_norm": 0.03615393117070198, "learning_rate": 2.381309239830276e-05, "loss": 0.1278, "step": 25310 }, { "epoch": 1.5867644294040233, "grad_norm": 0.051414694637060165, "learning_rate": 2.3802537417407275e-05, "loss": 0.1995, "step": 25320 }, { "epoch": 1.587391113617848, "grad_norm": 1.4407033920288086, "learning_rate": 2.379198243651179e-05, "loss": 0.0081, "step": 25330 }, { "epoch": 1.5880177978316725, "grad_norm": 9.254353523254395, "learning_rate": 2.3781427455616308e-05, "loss": 0.1209, "step": 25340 }, { "epoch": 1.5886444820454972, "grad_norm": 0.05505969375371933, "learning_rate": 2.377087247472082e-05, "loss": 0.2014, "step": 25350 }, { "epoch": 1.589271166259322, "grad_norm": 0.13277609646320343, "learning_rate": 2.3760317493825338e-05, "loss": 0.1068, "step": 25360 }, { "epoch": 1.5898978504731467, "grad_norm": 0.5146042108535767, "learning_rate": 2.374976251292985e-05, "loss": 0.0216, "step": 25370 }, { "epoch": 1.5905245346869712, "grad_norm": 0.35294288396835327, "learning_rate": 2.3739207532034368e-05, "loss": 0.0596, "step": 25380 }, { "epoch": 1.591151218900796, "grad_norm": 28.5668888092041, "learning_rate": 2.3728652551138884e-05, "loss": 0.1636, "step": 25390 }, { "epoch": 1.5917779031146204, "grad_norm": 0.3631909489631653, "learning_rate": 2.37180975702434e-05, "loss": 0.0676, "step": 25400 }, { "epoch": 1.5924045873284451, "grad_norm": 0.10231323540210724, "learning_rate": 2.3707542589347914e-05, "loss": 0.1439, "step": 25410 }, { "epoch": 1.5930312715422699, "grad_norm": 6.930258274078369, "learning_rate": 2.369698760845243e-05, "loss": 0.2736, "step": 25420 }, { "epoch": 1.5936579557560946, "grad_norm": 4.582469940185547, "learning_rate": 2.3686432627556944e-05, "loss": 0.0806, "step": 25430 }, { "epoch": 1.5942846399699193, "grad_norm": 0.033723387867212296, "learning_rate": 2.367587764666146e-05, "loss": 0.0098, "step": 25440 }, { "epoch": 1.5949113241837438, "grad_norm": 1.0349847078323364, "learning_rate": 2.3665322665765978e-05, "loss": 0.0684, "step": 25450 }, { "epoch": 1.5955380083975683, "grad_norm": 0.03423527255654335, "learning_rate": 2.365476768487049e-05, "loss": 0.1973, "step": 25460 }, { "epoch": 1.596164692611393, "grad_norm": 8.22195053100586, "learning_rate": 2.3644212703975008e-05, "loss": 0.0391, "step": 25470 }, { "epoch": 1.5967913768252178, "grad_norm": 1.6201304197311401, "learning_rate": 2.363365772307952e-05, "loss": 0.1338, "step": 25480 }, { "epoch": 1.5974180610390425, "grad_norm": 1.193275809288025, "learning_rate": 2.3623102742184037e-05, "loss": 0.0354, "step": 25490 }, { "epoch": 1.5980447452528672, "grad_norm": 0.09546542167663574, "learning_rate": 2.3612547761288554e-05, "loss": 0.1602, "step": 25500 }, { "epoch": 1.5986714294666917, "grad_norm": 0.06019541993737221, "learning_rate": 2.3601992780393067e-05, "loss": 0.0713, "step": 25510 }, { "epoch": 1.5992981136805164, "grad_norm": 0.02078704722225666, "learning_rate": 2.3591437799497584e-05, "loss": 0.012, "step": 25520 }, { "epoch": 1.599924797894341, "grad_norm": 0.054111577570438385, "learning_rate": 2.3580882818602097e-05, "loss": 0.0506, "step": 25530 }, { "epoch": 1.6005514821081657, "grad_norm": 0.15975138545036316, "learning_rate": 2.3570327837706614e-05, "loss": 0.1461, "step": 25540 }, { "epoch": 1.6011781663219904, "grad_norm": 0.020044539123773575, "learning_rate": 2.355977285681113e-05, "loss": 0.0946, "step": 25550 }, { "epoch": 1.6018048505358151, "grad_norm": 0.08575562387704849, "learning_rate": 2.3549217875915647e-05, "loss": 0.1476, "step": 25560 }, { "epoch": 1.6024315347496396, "grad_norm": 0.009003080427646637, "learning_rate": 2.353866289502016e-05, "loss": 0.0111, "step": 25570 }, { "epoch": 1.6030582189634643, "grad_norm": 7.464236736297607, "learning_rate": 2.3528107914124674e-05, "loss": 0.0868, "step": 25580 }, { "epoch": 1.6036849031772888, "grad_norm": 23.07075309753418, "learning_rate": 2.3517552933229194e-05, "loss": 0.0932, "step": 25590 }, { "epoch": 1.6043115873911136, "grad_norm": 0.007414340041577816, "learning_rate": 2.3506997952333707e-05, "loss": 0.0806, "step": 25600 }, { "epoch": 1.6049382716049383, "grad_norm": 0.017645269632339478, "learning_rate": 2.3496442971438224e-05, "loss": 0.0524, "step": 25610 }, { "epoch": 1.605564955818763, "grad_norm": 0.018196620047092438, "learning_rate": 2.3485887990542737e-05, "loss": 0.0024, "step": 25620 }, { "epoch": 1.6061916400325877, "grad_norm": 1.105287790298462, "learning_rate": 2.3475333009647254e-05, "loss": 0.0663, "step": 25630 }, { "epoch": 1.6068183242464122, "grad_norm": 0.007714731153100729, "learning_rate": 2.346477802875177e-05, "loss": 0.0115, "step": 25640 }, { "epoch": 1.6074450084602367, "grad_norm": 7.8072075843811035, "learning_rate": 2.3454223047856287e-05, "loss": 0.1591, "step": 25650 }, { "epoch": 1.6080716926740615, "grad_norm": 10.7738676071167, "learning_rate": 2.34436680669608e-05, "loss": 0.0779, "step": 25660 }, { "epoch": 1.6086983768878862, "grad_norm": 0.04062759503722191, "learning_rate": 2.3433113086065313e-05, "loss": 0.0041, "step": 25670 }, { "epoch": 1.609325061101711, "grad_norm": 0.18196269869804382, "learning_rate": 2.342255810516983e-05, "loss": 0.112, "step": 25680 }, { "epoch": 1.6099517453155356, "grad_norm": 0.016388777643442154, "learning_rate": 2.3412003124274347e-05, "loss": 0.0033, "step": 25690 }, { "epoch": 1.6105784295293601, "grad_norm": 0.022291744127869606, "learning_rate": 2.3401448143378863e-05, "loss": 0.0592, "step": 25700 }, { "epoch": 1.6112051137431846, "grad_norm": 0.016995355486869812, "learning_rate": 2.3390893162483377e-05, "loss": 0.1401, "step": 25710 }, { "epoch": 1.6118317979570094, "grad_norm": 0.19162079691886902, "learning_rate": 2.3380338181587893e-05, "loss": 0.2771, "step": 25720 }, { "epoch": 1.612458482170834, "grad_norm": 0.0328703373670578, "learning_rate": 2.3369783200692407e-05, "loss": 0.011, "step": 25730 }, { "epoch": 1.6130851663846588, "grad_norm": 0.04259718582034111, "learning_rate": 2.3359228219796923e-05, "loss": 0.1075, "step": 25740 }, { "epoch": 1.6137118505984835, "grad_norm": 7.747127532958984, "learning_rate": 2.334867323890144e-05, "loss": 0.1878, "step": 25750 }, { "epoch": 1.614338534812308, "grad_norm": 5.813741683959961, "learning_rate": 2.3338118258005953e-05, "loss": 0.0926, "step": 25760 }, { "epoch": 1.6149652190261328, "grad_norm": 0.09612107276916504, "learning_rate": 2.332756327711047e-05, "loss": 0.2835, "step": 25770 }, { "epoch": 1.6155919032399573, "grad_norm": 0.4905875325202942, "learning_rate": 2.3317008296214983e-05, "loss": 0.2207, "step": 25780 }, { "epoch": 1.616218587453782, "grad_norm": 0.11081667244434357, "learning_rate": 2.3306453315319503e-05, "loss": 0.1188, "step": 25790 }, { "epoch": 1.6168452716676067, "grad_norm": 0.1327042430639267, "learning_rate": 2.3295898334424016e-05, "loss": 0.0168, "step": 25800 }, { "epoch": 1.6174719558814314, "grad_norm": 0.2520522177219391, "learning_rate": 2.3285343353528533e-05, "loss": 0.1639, "step": 25810 }, { "epoch": 1.618098640095256, "grad_norm": 0.11177418380975723, "learning_rate": 2.3274788372633046e-05, "loss": 0.0033, "step": 25820 }, { "epoch": 1.6187253243090807, "grad_norm": 5.912029266357422, "learning_rate": 2.326423339173756e-05, "loss": 0.2516, "step": 25830 }, { "epoch": 1.6193520085229052, "grad_norm": 0.013550223782658577, "learning_rate": 2.325367841084208e-05, "loss": 0.0348, "step": 25840 }, { "epoch": 1.61997869273673, "grad_norm": 2.5849907398223877, "learning_rate": 2.3243123429946593e-05, "loss": 0.0329, "step": 25850 }, { "epoch": 1.6206053769505546, "grad_norm": 0.012683117762207985, "learning_rate": 2.323256844905111e-05, "loss": 0.0381, "step": 25860 }, { "epoch": 1.6212320611643793, "grad_norm": 0.009466142393648624, "learning_rate": 2.3222013468155623e-05, "loss": 0.08, "step": 25870 }, { "epoch": 1.621858745378204, "grad_norm": 2.259847402572632, "learning_rate": 2.321145848726014e-05, "loss": 0.0716, "step": 25880 }, { "epoch": 1.6224854295920286, "grad_norm": 0.009451135993003845, "learning_rate": 2.3200903506364656e-05, "loss": 0.0905, "step": 25890 }, { "epoch": 1.623112113805853, "grad_norm": 0.06618939340114594, "learning_rate": 2.319034852546917e-05, "loss": 0.1083, "step": 25900 }, { "epoch": 1.6237387980196778, "grad_norm": 0.7064309120178223, "learning_rate": 2.3179793544573686e-05, "loss": 0.0481, "step": 25910 }, { "epoch": 1.6243654822335025, "grad_norm": 0.02758204936981201, "learning_rate": 2.31692385636782e-05, "loss": 0.0686, "step": 25920 }, { "epoch": 1.6249921664473272, "grad_norm": 0.40926969051361084, "learning_rate": 2.3158683582782716e-05, "loss": 0.0348, "step": 25930 }, { "epoch": 1.625618850661152, "grad_norm": 0.03991227596998215, "learning_rate": 2.3148128601887233e-05, "loss": 0.1037, "step": 25940 }, { "epoch": 1.6262455348749765, "grad_norm": 0.029415365308523178, "learning_rate": 2.313757362099175e-05, "loss": 0.1124, "step": 25950 }, { "epoch": 1.6268722190888012, "grad_norm": 0.28045380115509033, "learning_rate": 2.3127018640096262e-05, "loss": 0.1201, "step": 25960 }, { "epoch": 1.6274989033026257, "grad_norm": 0.016988355666399002, "learning_rate": 2.3116463659200776e-05, "loss": 0.0075, "step": 25970 }, { "epoch": 1.6281255875164504, "grad_norm": 0.010528198443353176, "learning_rate": 2.3105908678305292e-05, "loss": 0.0829, "step": 25980 }, { "epoch": 1.6287522717302751, "grad_norm": 0.007346003782004118, "learning_rate": 2.309535369740981e-05, "loss": 0.0659, "step": 25990 }, { "epoch": 1.6293789559440999, "grad_norm": 0.12764669954776764, "learning_rate": 2.3084798716514326e-05, "loss": 0.1241, "step": 26000 }, { "epoch": 1.6300056401579244, "grad_norm": 0.8179148435592651, "learning_rate": 2.307424373561884e-05, "loss": 0.0656, "step": 26010 }, { "epoch": 1.630632324371749, "grad_norm": 0.22236420214176178, "learning_rate": 2.3063688754723356e-05, "loss": 0.0303, "step": 26020 }, { "epoch": 1.6312590085855736, "grad_norm": 0.006591562181711197, "learning_rate": 2.305313377382787e-05, "loss": 0.0267, "step": 26030 }, { "epoch": 1.6318856927993983, "grad_norm": 0.7650337219238281, "learning_rate": 2.304257879293239e-05, "loss": 0.0687, "step": 26040 }, { "epoch": 1.632512377013223, "grad_norm": 0.09074480086565018, "learning_rate": 2.3032023812036902e-05, "loss": 0.193, "step": 26050 }, { "epoch": 1.6331390612270478, "grad_norm": 0.035380203276872635, "learning_rate": 2.3021468831141415e-05, "loss": 0.1951, "step": 26060 }, { "epoch": 1.6337657454408725, "grad_norm": 19.835845947265625, "learning_rate": 2.3010913850245932e-05, "loss": 0.0323, "step": 26070 }, { "epoch": 1.634392429654697, "grad_norm": 0.12629912793636322, "learning_rate": 2.3000358869350445e-05, "loss": 0.1472, "step": 26080 }, { "epoch": 1.6350191138685215, "grad_norm": 0.1876828670501709, "learning_rate": 2.2989803888454965e-05, "loss": 0.0306, "step": 26090 }, { "epoch": 1.6356457980823462, "grad_norm": 1.093254804611206, "learning_rate": 2.297924890755948e-05, "loss": 0.0094, "step": 26100 }, { "epoch": 1.636272482296171, "grad_norm": 0.009309528395533562, "learning_rate": 2.2968693926663995e-05, "loss": 0.0102, "step": 26110 }, { "epoch": 1.6368991665099957, "grad_norm": 0.012318827211856842, "learning_rate": 2.295813894576851e-05, "loss": 0.1555, "step": 26120 }, { "epoch": 1.6375258507238204, "grad_norm": 0.028864048421382904, "learning_rate": 2.2947583964873022e-05, "loss": 0.0332, "step": 26130 }, { "epoch": 1.638152534937645, "grad_norm": 0.008642313070595264, "learning_rate": 2.2937028983977542e-05, "loss": 0.0884, "step": 26140 }, { "epoch": 1.6387792191514696, "grad_norm": 10.992156028747559, "learning_rate": 2.2926474003082055e-05, "loss": 0.3043, "step": 26150 }, { "epoch": 1.6394059033652941, "grad_norm": 0.035156864672899246, "learning_rate": 2.2915919022186572e-05, "loss": 0.0967, "step": 26160 }, { "epoch": 1.6400325875791189, "grad_norm": 0.07959800213575363, "learning_rate": 2.2905364041291085e-05, "loss": 0.1759, "step": 26170 }, { "epoch": 1.6406592717929436, "grad_norm": 1.4073405265808105, "learning_rate": 2.28948090603956e-05, "loss": 0.1282, "step": 26180 }, { "epoch": 1.6412859560067683, "grad_norm": 0.11637117713689804, "learning_rate": 2.2884254079500118e-05, "loss": 0.0229, "step": 26190 }, { "epoch": 1.6419126402205928, "grad_norm": 0.058900900185108185, "learning_rate": 2.2873699098604635e-05, "loss": 0.269, "step": 26200 }, { "epoch": 1.6425393244344175, "grad_norm": 0.06711501628160477, "learning_rate": 2.2863144117709148e-05, "loss": 0.0458, "step": 26210 }, { "epoch": 1.643166008648242, "grad_norm": 0.12759794294834137, "learning_rate": 2.285258913681366e-05, "loss": 0.1195, "step": 26220 }, { "epoch": 1.6437926928620668, "grad_norm": 3.203655481338501, "learning_rate": 2.2842034155918178e-05, "loss": 0.0766, "step": 26230 }, { "epoch": 1.6444193770758915, "grad_norm": 0.0073426892049610615, "learning_rate": 2.2831479175022695e-05, "loss": 0.0895, "step": 26240 }, { "epoch": 1.6450460612897162, "grad_norm": 0.2755219042301178, "learning_rate": 2.282092419412721e-05, "loss": 0.1019, "step": 26250 }, { "epoch": 1.645672745503541, "grad_norm": 0.01606954075396061, "learning_rate": 2.2810369213231725e-05, "loss": 0.0815, "step": 26260 }, { "epoch": 1.6462994297173654, "grad_norm": 0.13661789894104004, "learning_rate": 2.279981423233624e-05, "loss": 0.0622, "step": 26270 }, { "epoch": 1.64692611393119, "grad_norm": 0.4233393371105194, "learning_rate": 2.2789259251440755e-05, "loss": 0.0526, "step": 26280 }, { "epoch": 1.6475527981450147, "grad_norm": 0.007650166749954224, "learning_rate": 2.277870427054527e-05, "loss": 0.0034, "step": 26290 }, { "epoch": 1.6481794823588394, "grad_norm": 0.06980214267969131, "learning_rate": 2.2768149289649788e-05, "loss": 0.0229, "step": 26300 }, { "epoch": 1.648806166572664, "grad_norm": 0.05770709365606308, "learning_rate": 2.27575943087543e-05, "loss": 0.045, "step": 26310 }, { "epoch": 1.6494328507864888, "grad_norm": 0.006944793742150068, "learning_rate": 2.2747039327858818e-05, "loss": 0.0595, "step": 26320 }, { "epoch": 1.6500595350003133, "grad_norm": 0.010900170542299747, "learning_rate": 2.273648434696333e-05, "loss": 0.3436, "step": 26330 }, { "epoch": 1.650686219214138, "grad_norm": 6.457736968994141, "learning_rate": 2.272592936606785e-05, "loss": 0.1194, "step": 26340 }, { "epoch": 1.6513129034279626, "grad_norm": 0.021840889006853104, "learning_rate": 2.2715374385172364e-05, "loss": 0.1054, "step": 26350 }, { "epoch": 1.6519395876417873, "grad_norm": 0.07167918235063553, "learning_rate": 2.2704819404276878e-05, "loss": 0.1127, "step": 26360 }, { "epoch": 1.652566271855612, "grad_norm": 4.3837432861328125, "learning_rate": 2.2694264423381394e-05, "loss": 0.1142, "step": 26370 }, { "epoch": 1.6531929560694367, "grad_norm": 2.342277765274048, "learning_rate": 2.2683709442485908e-05, "loss": 0.1701, "step": 26380 }, { "epoch": 1.6538196402832612, "grad_norm": 3.2521579265594482, "learning_rate": 2.2673154461590428e-05, "loss": 0.1781, "step": 26390 }, { "epoch": 1.654446324497086, "grad_norm": 0.09720861911773682, "learning_rate": 2.266259948069494e-05, "loss": 0.0441, "step": 26400 }, { "epoch": 1.6550730087109105, "grad_norm": 1.4399570226669312, "learning_rate": 2.2652044499799458e-05, "loss": 0.2398, "step": 26410 }, { "epoch": 1.6556996929247352, "grad_norm": 3.2033612728118896, "learning_rate": 2.264148951890397e-05, "loss": 0.127, "step": 26420 }, { "epoch": 1.65632637713856, "grad_norm": 0.40961018204689026, "learning_rate": 2.2630934538008487e-05, "loss": 0.0384, "step": 26430 }, { "epoch": 1.6569530613523846, "grad_norm": 0.07372618466615677, "learning_rate": 2.2620379557113004e-05, "loss": 0.0118, "step": 26440 }, { "epoch": 1.6575797455662094, "grad_norm": 0.006588101852685213, "learning_rate": 2.2609824576217517e-05, "loss": 0.0775, "step": 26450 }, { "epoch": 1.6582064297800339, "grad_norm": 0.11311575025320053, "learning_rate": 2.2599269595322034e-05, "loss": 0.2475, "step": 26460 }, { "epoch": 1.6588331139938584, "grad_norm": 0.10983302444219589, "learning_rate": 2.2588714614426547e-05, "loss": 0.0448, "step": 26470 }, { "epoch": 1.659459798207683, "grad_norm": 0.3269750773906708, "learning_rate": 2.2578159633531064e-05, "loss": 0.1573, "step": 26480 }, { "epoch": 1.6600864824215078, "grad_norm": 9.705367088317871, "learning_rate": 2.256760465263558e-05, "loss": 0.0499, "step": 26490 }, { "epoch": 1.6607131666353325, "grad_norm": 0.08202777802944183, "learning_rate": 2.2557049671740097e-05, "loss": 0.0541, "step": 26500 }, { "epoch": 1.6613398508491573, "grad_norm": 24.44474220275879, "learning_rate": 2.254649469084461e-05, "loss": 0.1385, "step": 26510 }, { "epoch": 1.6619665350629818, "grad_norm": 0.00329552311450243, "learning_rate": 2.2535939709949124e-05, "loss": 0.0697, "step": 26520 }, { "epoch": 1.6625932192768063, "grad_norm": 0.04606331139802933, "learning_rate": 2.252538472905364e-05, "loss": 0.186, "step": 26530 }, { "epoch": 1.663219903490631, "grad_norm": 0.09263589978218079, "learning_rate": 2.2514829748158157e-05, "loss": 0.006, "step": 26540 }, { "epoch": 1.6638465877044557, "grad_norm": 0.07792260497808456, "learning_rate": 2.2504274767262674e-05, "loss": 0.0679, "step": 26550 }, { "epoch": 1.6644732719182804, "grad_norm": 0.6075377464294434, "learning_rate": 2.2493719786367187e-05, "loss": 0.0973, "step": 26560 }, { "epoch": 1.6650999561321052, "grad_norm": 0.011001525446772575, "learning_rate": 2.2483164805471704e-05, "loss": 0.1077, "step": 26570 }, { "epoch": 1.6657266403459297, "grad_norm": 3.2274794578552246, "learning_rate": 2.2472609824576217e-05, "loss": 0.1959, "step": 26580 }, { "epoch": 1.6663533245597544, "grad_norm": 0.2066841572523117, "learning_rate": 2.2462054843680734e-05, "loss": 0.0543, "step": 26590 }, { "epoch": 1.6669800087735789, "grad_norm": 6.34409761428833, "learning_rate": 2.245149986278525e-05, "loss": 0.0598, "step": 26600 }, { "epoch": 1.6676066929874036, "grad_norm": 6.800547122955322, "learning_rate": 2.2440944881889763e-05, "loss": 0.1238, "step": 26610 }, { "epoch": 1.6682333772012283, "grad_norm": 3.621145725250244, "learning_rate": 2.243038990099428e-05, "loss": 0.2065, "step": 26620 }, { "epoch": 1.668860061415053, "grad_norm": 0.10276742279529572, "learning_rate": 2.2419834920098793e-05, "loss": 0.0092, "step": 26630 }, { "epoch": 1.6694867456288776, "grad_norm": 0.07901106029748917, "learning_rate": 2.2409279939203313e-05, "loss": 0.0909, "step": 26640 }, { "epoch": 1.6701134298427023, "grad_norm": 0.20364725589752197, "learning_rate": 2.2398724958307827e-05, "loss": 0.0745, "step": 26650 }, { "epoch": 1.6707401140565268, "grad_norm": 0.005757195875048637, "learning_rate": 2.2388169977412343e-05, "loss": 0.0874, "step": 26660 }, { "epoch": 1.6713667982703515, "grad_norm": 5.623529434204102, "learning_rate": 2.2377614996516857e-05, "loss": 0.0379, "step": 26670 }, { "epoch": 1.6719934824841762, "grad_norm": 3.7896459102630615, "learning_rate": 2.2367060015621373e-05, "loss": 0.166, "step": 26680 }, { "epoch": 1.672620166698001, "grad_norm": 0.09007629007101059, "learning_rate": 2.235650503472589e-05, "loss": 0.0028, "step": 26690 }, { "epoch": 1.6732468509118257, "grad_norm": 0.03563718497753143, "learning_rate": 2.2345950053830403e-05, "loss": 0.0511, "step": 26700 }, { "epoch": 1.6738735351256502, "grad_norm": 5.399358749389648, "learning_rate": 2.233539507293492e-05, "loss": 0.2271, "step": 26710 }, { "epoch": 1.6745002193394747, "grad_norm": 8.996834754943848, "learning_rate": 2.2324840092039433e-05, "loss": 0.1034, "step": 26720 }, { "epoch": 1.6751269035532994, "grad_norm": 0.01371771190315485, "learning_rate": 2.231428511114395e-05, "loss": 0.1111, "step": 26730 }, { "epoch": 1.6757535877671241, "grad_norm": 0.4695885479450226, "learning_rate": 2.2303730130248466e-05, "loss": 0.0503, "step": 26740 }, { "epoch": 1.6763802719809489, "grad_norm": 7.742033004760742, "learning_rate": 2.229317514935298e-05, "loss": 0.0578, "step": 26750 }, { "epoch": 1.6770069561947736, "grad_norm": 0.6429186463356018, "learning_rate": 2.2282620168457496e-05, "loss": 0.1231, "step": 26760 }, { "epoch": 1.677633640408598, "grad_norm": 0.336631178855896, "learning_rate": 2.227206518756201e-05, "loss": 0.0489, "step": 26770 }, { "epoch": 1.6782603246224228, "grad_norm": 4.629275798797607, "learning_rate": 2.2261510206666526e-05, "loss": 0.2321, "step": 26780 }, { "epoch": 1.6788870088362473, "grad_norm": 0.511132001876831, "learning_rate": 2.2250955225771043e-05, "loss": 0.0051, "step": 26790 }, { "epoch": 1.679513693050072, "grad_norm": 0.4719582796096802, "learning_rate": 2.224040024487556e-05, "loss": 0.0036, "step": 26800 }, { "epoch": 1.6801403772638968, "grad_norm": 3.879145622253418, "learning_rate": 2.2229845263980073e-05, "loss": 0.1711, "step": 26810 }, { "epoch": 1.6807670614777215, "grad_norm": 0.018831124529242516, "learning_rate": 2.221929028308459e-05, "loss": 0.151, "step": 26820 }, { "epoch": 1.681393745691546, "grad_norm": 0.2846979796886444, "learning_rate": 2.2208735302189103e-05, "loss": 0.0052, "step": 26830 }, { "epoch": 1.6820204299053707, "grad_norm": 0.11228816211223602, "learning_rate": 2.219818032129362e-05, "loss": 0.1743, "step": 26840 }, { "epoch": 1.6826471141191952, "grad_norm": 0.2110615074634552, "learning_rate": 2.2187625340398136e-05, "loss": 0.1674, "step": 26850 }, { "epoch": 1.68327379833302, "grad_norm": 1.6484986543655396, "learning_rate": 2.217707035950265e-05, "loss": 0.1076, "step": 26860 }, { "epoch": 1.6839004825468447, "grad_norm": 0.018023191019892693, "learning_rate": 2.2166515378607166e-05, "loss": 0.0119, "step": 26870 }, { "epoch": 1.6845271667606694, "grad_norm": 3.573692560195923, "learning_rate": 2.2155960397711683e-05, "loss": 0.0043, "step": 26880 }, { "epoch": 1.685153850974494, "grad_norm": 0.022061511874198914, "learning_rate": 2.21454054168162e-05, "loss": 0.149, "step": 26890 }, { "epoch": 1.6857805351883186, "grad_norm": 0.05415266752243042, "learning_rate": 2.2134850435920712e-05, "loss": 0.0696, "step": 26900 }, { "epoch": 1.686407219402143, "grad_norm": 0.31539714336395264, "learning_rate": 2.2124295455025226e-05, "loss": 0.11, "step": 26910 }, { "epoch": 1.6870339036159678, "grad_norm": 0.010716657154262066, "learning_rate": 2.2113740474129742e-05, "loss": 0.1094, "step": 26920 }, { "epoch": 1.6876605878297926, "grad_norm": 0.013285728171467781, "learning_rate": 2.210318549323426e-05, "loss": 0.1406, "step": 26930 }, { "epoch": 1.6882872720436173, "grad_norm": 10.243504524230957, "learning_rate": 2.2092630512338776e-05, "loss": 0.2042, "step": 26940 }, { "epoch": 1.688913956257442, "grad_norm": 0.20027025043964386, "learning_rate": 2.208207553144329e-05, "loss": 0.0606, "step": 26950 }, { "epoch": 1.6895406404712665, "grad_norm": 0.3113136291503906, "learning_rate": 2.2071520550547806e-05, "loss": 0.0528, "step": 26960 }, { "epoch": 1.6901673246850912, "grad_norm": 0.06819582730531693, "learning_rate": 2.206096556965232e-05, "loss": 0.0528, "step": 26970 }, { "epoch": 1.6907940088989157, "grad_norm": 0.016161400824785233, "learning_rate": 2.2050410588756835e-05, "loss": 0.1354, "step": 26980 }, { "epoch": 1.6914206931127405, "grad_norm": 0.014996406622231007, "learning_rate": 2.2039855607861352e-05, "loss": 0.054, "step": 26990 }, { "epoch": 1.6920473773265652, "grad_norm": 1.241360068321228, "learning_rate": 2.2029300626965865e-05, "loss": 0.1431, "step": 27000 }, { "epoch": 1.69267406154039, "grad_norm": 0.02852221392095089, "learning_rate": 2.2018745646070382e-05, "loss": 0.1485, "step": 27010 }, { "epoch": 1.6933007457542144, "grad_norm": 2.456637382507324, "learning_rate": 2.2008190665174895e-05, "loss": 0.1193, "step": 27020 }, { "epoch": 1.6939274299680391, "grad_norm": 10.817239761352539, "learning_rate": 2.1997635684279412e-05, "loss": 0.1513, "step": 27030 }, { "epoch": 1.6945541141818636, "grad_norm": 0.029310384765267372, "learning_rate": 2.198708070338393e-05, "loss": 0.0419, "step": 27040 }, { "epoch": 1.6951807983956884, "grad_norm": 0.3586648404598236, "learning_rate": 2.1976525722488445e-05, "loss": 0.0393, "step": 27050 }, { "epoch": 1.695807482609513, "grad_norm": 0.38860127329826355, "learning_rate": 2.196597074159296e-05, "loss": 0.0953, "step": 27060 }, { "epoch": 1.6964341668233378, "grad_norm": 0.016696346923708916, "learning_rate": 2.1955415760697472e-05, "loss": 0.0227, "step": 27070 }, { "epoch": 1.6970608510371625, "grad_norm": 0.018799861893057823, "learning_rate": 2.194486077980199e-05, "loss": 0.1207, "step": 27080 }, { "epoch": 1.697687535250987, "grad_norm": 0.029151970520615578, "learning_rate": 2.1934305798906505e-05, "loss": 0.2022, "step": 27090 }, { "epoch": 1.6983142194648115, "grad_norm": 0.7271300554275513, "learning_rate": 2.1923750818011022e-05, "loss": 0.2016, "step": 27100 }, { "epoch": 1.6989409036786363, "grad_norm": 0.06217074394226074, "learning_rate": 2.1913195837115535e-05, "loss": 0.0028, "step": 27110 }, { "epoch": 1.699567587892461, "grad_norm": 0.047516193240880966, "learning_rate": 2.190264085622005e-05, "loss": 0.1477, "step": 27120 }, { "epoch": 1.7001942721062857, "grad_norm": 0.05786709114909172, "learning_rate": 2.189208587532457e-05, "loss": 0.2008, "step": 27130 }, { "epoch": 1.7008209563201104, "grad_norm": 1.7550990581512451, "learning_rate": 2.188153089442908e-05, "loss": 0.0092, "step": 27140 }, { "epoch": 1.701447640533935, "grad_norm": 0.052369292825460434, "learning_rate": 2.1870975913533598e-05, "loss": 0.0517, "step": 27150 }, { "epoch": 1.7020743247477597, "grad_norm": 3.845099925994873, "learning_rate": 2.186042093263811e-05, "loss": 0.1853, "step": 27160 }, { "epoch": 1.7027010089615842, "grad_norm": 1.763410210609436, "learning_rate": 2.1849865951742628e-05, "loss": 0.1128, "step": 27170 }, { "epoch": 1.7033276931754089, "grad_norm": 7.384634017944336, "learning_rate": 2.1839310970847145e-05, "loss": 0.1175, "step": 27180 }, { "epoch": 1.7039543773892336, "grad_norm": 0.035043444484472275, "learning_rate": 2.182875598995166e-05, "loss": 0.1171, "step": 27190 }, { "epoch": 1.7045810616030583, "grad_norm": 0.16392628848552704, "learning_rate": 2.1818201009056175e-05, "loss": 0.1787, "step": 27200 }, { "epoch": 1.7052077458168828, "grad_norm": 4.580605506896973, "learning_rate": 2.1807646028160688e-05, "loss": 0.0822, "step": 27210 }, { "epoch": 1.7058344300307076, "grad_norm": 0.17701353132724762, "learning_rate": 2.1797091047265205e-05, "loss": 0.0689, "step": 27220 }, { "epoch": 1.706461114244532, "grad_norm": 0.10425002127885818, "learning_rate": 2.178653606636972e-05, "loss": 0.0782, "step": 27230 }, { "epoch": 1.7070877984583568, "grad_norm": 0.0539705716073513, "learning_rate": 2.1775981085474238e-05, "loss": 0.0702, "step": 27240 }, { "epoch": 1.7077144826721815, "grad_norm": 0.09829293191432953, "learning_rate": 2.176542610457875e-05, "loss": 0.0966, "step": 27250 }, { "epoch": 1.7083411668860062, "grad_norm": 0.05815896391868591, "learning_rate": 2.1754871123683268e-05, "loss": 0.1604, "step": 27260 }, { "epoch": 1.708967851099831, "grad_norm": 0.022052781656384468, "learning_rate": 2.174431614278778e-05, "loss": 0.0384, "step": 27270 }, { "epoch": 1.7095945353136555, "grad_norm": 16.015851974487305, "learning_rate": 2.1733761161892298e-05, "loss": 0.0064, "step": 27280 }, { "epoch": 1.71022121952748, "grad_norm": 0.05524298548698425, "learning_rate": 2.1723206180996814e-05, "loss": 0.0611, "step": 27290 }, { "epoch": 1.7108479037413047, "grad_norm": 0.027206147089600563, "learning_rate": 2.1712651200101328e-05, "loss": 0.0416, "step": 27300 }, { "epoch": 1.7114745879551294, "grad_norm": 0.024542508646845818, "learning_rate": 2.1702096219205844e-05, "loss": 0.1534, "step": 27310 }, { "epoch": 1.7121012721689541, "grad_norm": 501.8453674316406, "learning_rate": 2.1691541238310358e-05, "loss": 0.1655, "step": 27320 }, { "epoch": 1.7127279563827789, "grad_norm": 0.027051694691181183, "learning_rate": 2.1680986257414874e-05, "loss": 0.2394, "step": 27330 }, { "epoch": 1.7133546405966034, "grad_norm": 0.17021749913692474, "learning_rate": 2.167043127651939e-05, "loss": 0.1643, "step": 27340 }, { "epoch": 1.7139813248104279, "grad_norm": 0.1182183101773262, "learning_rate": 2.1659876295623908e-05, "loss": 0.1188, "step": 27350 }, { "epoch": 1.7146080090242526, "grad_norm": 3.9816792011260986, "learning_rate": 2.164932131472842e-05, "loss": 0.2683, "step": 27360 }, { "epoch": 1.7152346932380773, "grad_norm": 0.11562171578407288, "learning_rate": 2.1638766333832934e-05, "loss": 0.0492, "step": 27370 }, { "epoch": 1.715861377451902, "grad_norm": 0.18632589280605316, "learning_rate": 2.1628211352937454e-05, "loss": 0.1284, "step": 27380 }, { "epoch": 1.7164880616657268, "grad_norm": 0.08556509763002396, "learning_rate": 2.1617656372041967e-05, "loss": 0.02, "step": 27390 }, { "epoch": 1.7171147458795513, "grad_norm": 12.92889404296875, "learning_rate": 2.1607101391146484e-05, "loss": 0.1641, "step": 27400 }, { "epoch": 1.717741430093376, "grad_norm": 6.711617469787598, "learning_rate": 2.1596546410250997e-05, "loss": 0.086, "step": 27410 }, { "epoch": 1.7183681143072005, "grad_norm": 0.4091886579990387, "learning_rate": 2.1585991429355514e-05, "loss": 0.3211, "step": 27420 }, { "epoch": 1.7189947985210252, "grad_norm": 0.7776539921760559, "learning_rate": 2.157543644846003e-05, "loss": 0.1452, "step": 27430 }, { "epoch": 1.71962148273485, "grad_norm": 0.05936276540160179, "learning_rate": 2.1564881467564547e-05, "loss": 0.0849, "step": 27440 }, { "epoch": 1.7202481669486747, "grad_norm": 13.701384544372559, "learning_rate": 2.155432648666906e-05, "loss": 0.0801, "step": 27450 }, { "epoch": 1.7208748511624992, "grad_norm": 5.974350929260254, "learning_rate": 2.1543771505773574e-05, "loss": 0.0845, "step": 27460 }, { "epoch": 1.7215015353763239, "grad_norm": 0.09645674377679825, "learning_rate": 2.153321652487809e-05, "loss": 0.0393, "step": 27470 }, { "epoch": 1.7221282195901484, "grad_norm": 0.05956216901540756, "learning_rate": 2.1522661543982607e-05, "loss": 0.1193, "step": 27480 }, { "epoch": 1.7227549038039731, "grad_norm": 19.65474510192871, "learning_rate": 2.1512106563087124e-05, "loss": 0.0617, "step": 27490 }, { "epoch": 1.7233815880177978, "grad_norm": 0.23389559984207153, "learning_rate": 2.1501551582191637e-05, "loss": 0.0382, "step": 27500 }, { "epoch": 1.7240082722316226, "grad_norm": 0.03733931854367256, "learning_rate": 2.1490996601296154e-05, "loss": 0.0807, "step": 27510 }, { "epoch": 1.7246349564454473, "grad_norm": 0.14345206320285797, "learning_rate": 2.1480441620400667e-05, "loss": 0.1723, "step": 27520 }, { "epoch": 1.7252616406592718, "grad_norm": 0.28000232577323914, "learning_rate": 2.1469886639505184e-05, "loss": 0.014, "step": 27530 }, { "epoch": 1.7258883248730963, "grad_norm": 0.3158402740955353, "learning_rate": 2.14593316586097e-05, "loss": 0.0806, "step": 27540 }, { "epoch": 1.726515009086921, "grad_norm": 0.1986808329820633, "learning_rate": 2.1448776677714213e-05, "loss": 0.169, "step": 27550 }, { "epoch": 1.7271416933007457, "grad_norm": 0.2472744733095169, "learning_rate": 2.143822169681873e-05, "loss": 0.1522, "step": 27560 }, { "epoch": 1.7277683775145705, "grad_norm": 2.0546441078186035, "learning_rate": 2.1427666715923243e-05, "loss": 0.036, "step": 27570 }, { "epoch": 1.7283950617283952, "grad_norm": 0.36932218074798584, "learning_rate": 2.141711173502776e-05, "loss": 0.056, "step": 27580 }, { "epoch": 1.7290217459422197, "grad_norm": 0.0200295839458704, "learning_rate": 2.1406556754132277e-05, "loss": 0.0711, "step": 27590 }, { "epoch": 1.7296484301560444, "grad_norm": 0.016135990619659424, "learning_rate": 2.139600177323679e-05, "loss": 0.0932, "step": 27600 }, { "epoch": 1.730275114369869, "grad_norm": 0.15928195416927338, "learning_rate": 2.1385446792341307e-05, "loss": 0.1799, "step": 27610 }, { "epoch": 1.7309017985836936, "grad_norm": 0.020300181582570076, "learning_rate": 2.137489181144582e-05, "loss": 0.2728, "step": 27620 }, { "epoch": 1.7315284827975184, "grad_norm": 7.829452991485596, "learning_rate": 2.136433683055034e-05, "loss": 0.1462, "step": 27630 }, { "epoch": 1.732155167011343, "grad_norm": 0.7304818034172058, "learning_rate": 2.1353781849654853e-05, "loss": 0.2374, "step": 27640 }, { "epoch": 1.7327818512251676, "grad_norm": 0.22246751189231873, "learning_rate": 2.134322686875937e-05, "loss": 0.007, "step": 27650 }, { "epoch": 1.7334085354389923, "grad_norm": 0.46738430857658386, "learning_rate": 2.1332671887863883e-05, "loss": 0.0589, "step": 27660 }, { "epoch": 1.7340352196528168, "grad_norm": 2.720977783203125, "learning_rate": 2.13221169069684e-05, "loss": 0.1276, "step": 27670 }, { "epoch": 1.7346619038666415, "grad_norm": 0.04329752177000046, "learning_rate": 2.1311561926072916e-05, "loss": 0.1775, "step": 27680 }, { "epoch": 1.7352885880804663, "grad_norm": 0.5023499131202698, "learning_rate": 2.130100694517743e-05, "loss": 0.0093, "step": 27690 }, { "epoch": 1.735915272294291, "grad_norm": 0.09492196887731552, "learning_rate": 2.1290451964281946e-05, "loss": 0.0215, "step": 27700 }, { "epoch": 1.7365419565081157, "grad_norm": 0.034697309136390686, "learning_rate": 2.127989698338646e-05, "loss": 0.0845, "step": 27710 }, { "epoch": 1.7371686407219402, "grad_norm": 0.04510561749339104, "learning_rate": 2.1269342002490976e-05, "loss": 0.1532, "step": 27720 }, { "epoch": 1.7377953249357647, "grad_norm": 0.8914774656295776, "learning_rate": 2.1258787021595493e-05, "loss": 0.0917, "step": 27730 }, { "epoch": 1.7384220091495894, "grad_norm": 0.5372912287712097, "learning_rate": 2.124823204070001e-05, "loss": 0.1138, "step": 27740 }, { "epoch": 1.7390486933634142, "grad_norm": 1.636864185333252, "learning_rate": 2.1237677059804523e-05, "loss": 0.1581, "step": 27750 }, { "epoch": 1.739675377577239, "grad_norm": 0.4151596128940582, "learning_rate": 2.1227122078909036e-05, "loss": 0.0094, "step": 27760 }, { "epoch": 1.7403020617910636, "grad_norm": 0.06015501543879509, "learning_rate": 2.1216567098013553e-05, "loss": 0.2099, "step": 27770 }, { "epoch": 1.7409287460048881, "grad_norm": 0.28171807527542114, "learning_rate": 2.120601211711807e-05, "loss": 0.0634, "step": 27780 }, { "epoch": 1.7415554302187128, "grad_norm": 0.23028428852558136, "learning_rate": 2.1195457136222586e-05, "loss": 0.1892, "step": 27790 }, { "epoch": 1.7421821144325373, "grad_norm": 0.060233842581510544, "learning_rate": 2.11849021553271e-05, "loss": 0.0961, "step": 27800 }, { "epoch": 1.742808798646362, "grad_norm": 0.3065778315067291, "learning_rate": 2.1174347174431616e-05, "loss": 0.032, "step": 27810 }, { "epoch": 1.7434354828601868, "grad_norm": 20.94129753112793, "learning_rate": 2.116379219353613e-05, "loss": 0.0605, "step": 27820 }, { "epoch": 1.7440621670740115, "grad_norm": 6.300323009490967, "learning_rate": 2.1153237212640646e-05, "loss": 0.2072, "step": 27830 }, { "epoch": 1.744688851287836, "grad_norm": 2.5606863498687744, "learning_rate": 2.1142682231745162e-05, "loss": 0.1082, "step": 27840 }, { "epoch": 1.7453155355016607, "grad_norm": 0.05400048941373825, "learning_rate": 2.1132127250849676e-05, "loss": 0.1643, "step": 27850 }, { "epoch": 1.7459422197154852, "grad_norm": 6.569550037384033, "learning_rate": 2.1121572269954192e-05, "loss": 0.0357, "step": 27860 }, { "epoch": 1.74656890392931, "grad_norm": 10.0947265625, "learning_rate": 2.1111017289058706e-05, "loss": 0.0658, "step": 27870 }, { "epoch": 1.7471955881431347, "grad_norm": 0.07155396044254303, "learning_rate": 2.1100462308163226e-05, "loss": 0.1062, "step": 27880 }, { "epoch": 1.7478222723569594, "grad_norm": 0.055783960968256, "learning_rate": 2.108990732726774e-05, "loss": 0.0037, "step": 27890 }, { "epoch": 1.7484489565707841, "grad_norm": 20.741167068481445, "learning_rate": 2.1079352346372256e-05, "loss": 0.0626, "step": 27900 }, { "epoch": 1.7490756407846086, "grad_norm": 0.012712010182440281, "learning_rate": 2.106879736547677e-05, "loss": 0.1363, "step": 27910 }, { "epoch": 1.7497023249984331, "grad_norm": 13.166600227355957, "learning_rate": 2.1058242384581282e-05, "loss": 0.2561, "step": 27920 }, { "epoch": 1.7503290092122579, "grad_norm": 0.2196955382823944, "learning_rate": 2.1047687403685802e-05, "loss": 0.4069, "step": 27930 }, { "epoch": 1.7509556934260826, "grad_norm": 1.294158935546875, "learning_rate": 2.1037132422790315e-05, "loss": 0.0529, "step": 27940 }, { "epoch": 1.7515823776399073, "grad_norm": 0.22262226045131683, "learning_rate": 2.1026577441894832e-05, "loss": 0.0859, "step": 27950 }, { "epoch": 1.752209061853732, "grad_norm": 0.1377834677696228, "learning_rate": 2.1016022460999345e-05, "loss": 0.2275, "step": 27960 }, { "epoch": 1.7528357460675565, "grad_norm": 5.08927583694458, "learning_rate": 2.1005467480103862e-05, "loss": 0.1528, "step": 27970 }, { "epoch": 1.753462430281381, "grad_norm": 4.58087158203125, "learning_rate": 2.099491249920838e-05, "loss": 0.101, "step": 27980 }, { "epoch": 1.7540891144952058, "grad_norm": 0.10001590102910995, "learning_rate": 2.0984357518312892e-05, "loss": 0.0529, "step": 27990 }, { "epoch": 1.7547157987090305, "grad_norm": 1.8981465101242065, "learning_rate": 2.097380253741741e-05, "loss": 0.08, "step": 28000 }, { "epoch": 1.7553424829228552, "grad_norm": 0.06657546013593674, "learning_rate": 2.0963247556521922e-05, "loss": 0.0844, "step": 28010 }, { "epoch": 1.75596916713668, "grad_norm": 0.08251860737800598, "learning_rate": 2.095269257562644e-05, "loss": 0.0366, "step": 28020 }, { "epoch": 1.7565958513505044, "grad_norm": 0.30270349979400635, "learning_rate": 2.0942137594730955e-05, "loss": 0.1185, "step": 28030 }, { "epoch": 1.7572225355643292, "grad_norm": 0.3146784007549286, "learning_rate": 2.0931582613835472e-05, "loss": 0.1438, "step": 28040 }, { "epoch": 1.7578492197781537, "grad_norm": 0.2562068998813629, "learning_rate": 2.0921027632939985e-05, "loss": 0.0087, "step": 28050 }, { "epoch": 1.7584759039919784, "grad_norm": 0.011984322220087051, "learning_rate": 2.09104726520445e-05, "loss": 0.0678, "step": 28060 }, { "epoch": 1.7591025882058031, "grad_norm": 0.019979266449809074, "learning_rate": 2.0899917671149015e-05, "loss": 0.0312, "step": 28070 }, { "epoch": 1.7597292724196278, "grad_norm": 0.5983748435974121, "learning_rate": 2.088936269025353e-05, "loss": 0.0317, "step": 28080 }, { "epoch": 1.7603559566334526, "grad_norm": 0.33651646971702576, "learning_rate": 2.0878807709358048e-05, "loss": 0.088, "step": 28090 }, { "epoch": 1.760982640847277, "grad_norm": 4.7786455154418945, "learning_rate": 2.086825272846256e-05, "loss": 0.1714, "step": 28100 }, { "epoch": 1.7616093250611016, "grad_norm": 0.6487839221954346, "learning_rate": 2.0857697747567078e-05, "loss": 0.1049, "step": 28110 }, { "epoch": 1.7622360092749263, "grad_norm": 11.593941688537598, "learning_rate": 2.084714276667159e-05, "loss": 0.1995, "step": 28120 }, { "epoch": 1.762862693488751, "grad_norm": 0.08931588381528854, "learning_rate": 2.083658778577611e-05, "loss": 0.046, "step": 28130 }, { "epoch": 1.7634893777025757, "grad_norm": 0.11976440995931625, "learning_rate": 2.0826032804880625e-05, "loss": 0.1334, "step": 28140 }, { "epoch": 1.7641160619164005, "grad_norm": 0.256796658039093, "learning_rate": 2.0815477823985138e-05, "loss": 0.0965, "step": 28150 }, { "epoch": 1.764742746130225, "grad_norm": 0.02123934030532837, "learning_rate": 2.0804922843089655e-05, "loss": 0.0506, "step": 28160 }, { "epoch": 1.7653694303440495, "grad_norm": 0.1435829997062683, "learning_rate": 2.0794367862194168e-05, "loss": 0.0735, "step": 28170 }, { "epoch": 1.7659961145578742, "grad_norm": 0.043175362050533295, "learning_rate": 2.0783812881298688e-05, "loss": 0.05, "step": 28180 }, { "epoch": 1.766622798771699, "grad_norm": 0.02113891765475273, "learning_rate": 2.07732579004032e-05, "loss": 0.0836, "step": 28190 }, { "epoch": 1.7672494829855236, "grad_norm": 0.014169618487358093, "learning_rate": 2.0762702919507718e-05, "loss": 0.284, "step": 28200 }, { "epoch": 1.7678761671993484, "grad_norm": 0.4836975336074829, "learning_rate": 2.075214793861223e-05, "loss": 0.0589, "step": 28210 }, { "epoch": 1.7685028514131729, "grad_norm": 0.10142585635185242, "learning_rate": 2.0741592957716748e-05, "loss": 0.0098, "step": 28220 }, { "epoch": 1.7691295356269976, "grad_norm": 0.17484275996685028, "learning_rate": 2.0731037976821264e-05, "loss": 0.2536, "step": 28230 }, { "epoch": 1.769756219840822, "grad_norm": 9.866804122924805, "learning_rate": 2.0720482995925778e-05, "loss": 0.1497, "step": 28240 }, { "epoch": 1.7703829040546468, "grad_norm": 0.033019669353961945, "learning_rate": 2.0709928015030294e-05, "loss": 0.0064, "step": 28250 }, { "epoch": 1.7710095882684715, "grad_norm": 0.03401433676481247, "learning_rate": 2.0699373034134808e-05, "loss": 0.045, "step": 28260 }, { "epoch": 1.7716362724822963, "grad_norm": 0.028515907004475594, "learning_rate": 2.0688818053239324e-05, "loss": 0.0711, "step": 28270 }, { "epoch": 1.7722629566961208, "grad_norm": 0.21594108641147614, "learning_rate": 2.067826307234384e-05, "loss": 0.0076, "step": 28280 }, { "epoch": 1.7728896409099455, "grad_norm": 0.03385074809193611, "learning_rate": 2.0667708091448358e-05, "loss": 0.1374, "step": 28290 }, { "epoch": 1.77351632512377, "grad_norm": 0.03485338017344475, "learning_rate": 2.065715311055287e-05, "loss": 0.1325, "step": 28300 }, { "epoch": 1.7741430093375947, "grad_norm": 0.013920055702328682, "learning_rate": 2.0646598129657384e-05, "loss": 0.0993, "step": 28310 }, { "epoch": 1.7747696935514194, "grad_norm": 3.2841951847076416, "learning_rate": 2.06360431487619e-05, "loss": 0.0898, "step": 28320 }, { "epoch": 1.7753963777652442, "grad_norm": 0.015438011847436428, "learning_rate": 2.0625488167866417e-05, "loss": 0.003, "step": 28330 }, { "epoch": 1.776023061979069, "grad_norm": 0.01668211817741394, "learning_rate": 2.0614933186970934e-05, "loss": 0.0454, "step": 28340 }, { "epoch": 1.7766497461928934, "grad_norm": 0.31149980425834656, "learning_rate": 2.0604378206075447e-05, "loss": 0.247, "step": 28350 }, { "epoch": 1.777276430406718, "grad_norm": 0.03061065450310707, "learning_rate": 2.0593823225179964e-05, "loss": 0.0469, "step": 28360 }, { "epoch": 1.7779031146205426, "grad_norm": 0.28417980670928955, "learning_rate": 2.0583268244284477e-05, "loss": 0.0652, "step": 28370 }, { "epoch": 1.7785297988343673, "grad_norm": 0.0512668751180172, "learning_rate": 2.0572713263388994e-05, "loss": 0.0497, "step": 28380 }, { "epoch": 1.779156483048192, "grad_norm": 0.41348955035209656, "learning_rate": 2.056215828249351e-05, "loss": 0.0958, "step": 28390 }, { "epoch": 1.7797831672620168, "grad_norm": 0.01674189791083336, "learning_rate": 2.0551603301598024e-05, "loss": 0.0602, "step": 28400 }, { "epoch": 1.7804098514758413, "grad_norm": 0.3735155165195465, "learning_rate": 2.054104832070254e-05, "loss": 0.0747, "step": 28410 }, { "epoch": 1.781036535689666, "grad_norm": 2.6698782444000244, "learning_rate": 2.0530493339807054e-05, "loss": 0.0906, "step": 28420 }, { "epoch": 1.7816632199034905, "grad_norm": 0.012423294596374035, "learning_rate": 2.0519938358911574e-05, "loss": 0.1468, "step": 28430 }, { "epoch": 1.7822899041173152, "grad_norm": 9.054516792297363, "learning_rate": 2.0509383378016087e-05, "loss": 0.1158, "step": 28440 }, { "epoch": 1.78291658833114, "grad_norm": 0.5451701879501343, "learning_rate": 2.0498828397120604e-05, "loss": 0.0784, "step": 28450 }, { "epoch": 1.7835432725449647, "grad_norm": 0.028169279918074608, "learning_rate": 2.0488273416225117e-05, "loss": 0.1311, "step": 28460 }, { "epoch": 1.7841699567587892, "grad_norm": 0.05051959306001663, "learning_rate": 2.0477718435329634e-05, "loss": 0.1032, "step": 28470 }, { "epoch": 1.784796640972614, "grad_norm": 4.341728210449219, "learning_rate": 2.046716345443415e-05, "loss": 0.2776, "step": 28480 }, { "epoch": 1.7854233251864384, "grad_norm": 0.04859687015414238, "learning_rate": 2.0456608473538663e-05, "loss": 0.0333, "step": 28490 }, { "epoch": 1.7860500094002631, "grad_norm": 0.02066011354327202, "learning_rate": 2.044605349264318e-05, "loss": 0.0276, "step": 28500 }, { "epoch": 1.7866766936140879, "grad_norm": 0.0390385240316391, "learning_rate": 2.0435498511747693e-05, "loss": 0.0314, "step": 28510 }, { "epoch": 1.7873033778279126, "grad_norm": 0.014242850244045258, "learning_rate": 2.042494353085221e-05, "loss": 0.073, "step": 28520 }, { "epoch": 1.7879300620417373, "grad_norm": 0.02367878518998623, "learning_rate": 2.0414388549956727e-05, "loss": 0.1714, "step": 28530 }, { "epoch": 1.7885567462555618, "grad_norm": 0.019421106204390526, "learning_rate": 2.040383356906124e-05, "loss": 0.0617, "step": 28540 }, { "epoch": 1.7891834304693863, "grad_norm": 0.059307683259248734, "learning_rate": 2.0393278588165757e-05, "loss": 0.0595, "step": 28550 }, { "epoch": 1.789810114683211, "grad_norm": 0.02156263403594494, "learning_rate": 2.038272360727027e-05, "loss": 0.0034, "step": 28560 }, { "epoch": 1.7904367988970358, "grad_norm": 0.013104693964123726, "learning_rate": 2.0372168626374787e-05, "loss": 0.0526, "step": 28570 }, { "epoch": 1.7910634831108605, "grad_norm": 0.12851563096046448, "learning_rate": 2.0361613645479303e-05, "loss": 0.0343, "step": 28580 }, { "epoch": 1.7916901673246852, "grad_norm": 0.008035060949623585, "learning_rate": 2.035105866458382e-05, "loss": 0.0237, "step": 28590 }, { "epoch": 1.7923168515385097, "grad_norm": 0.013913143426179886, "learning_rate": 2.0340503683688333e-05, "loss": 0.1845, "step": 28600 }, { "epoch": 1.7929435357523344, "grad_norm": 0.7216804623603821, "learning_rate": 2.0329948702792846e-05, "loss": 0.1203, "step": 28610 }, { "epoch": 1.793570219966159, "grad_norm": 0.04569417983293533, "learning_rate": 2.0319393721897363e-05, "loss": 0.0288, "step": 28620 }, { "epoch": 1.7941969041799837, "grad_norm": 0.02900240570306778, "learning_rate": 2.030883874100188e-05, "loss": 0.0895, "step": 28630 }, { "epoch": 1.7948235883938084, "grad_norm": 0.0236115250736475, "learning_rate": 2.0298283760106396e-05, "loss": 0.0794, "step": 28640 }, { "epoch": 1.7954502726076331, "grad_norm": 0.03295353055000305, "learning_rate": 2.028772877921091e-05, "loss": 0.148, "step": 28650 }, { "epoch": 1.7960769568214576, "grad_norm": 0.03861570730805397, "learning_rate": 2.0277173798315426e-05, "loss": 0.1315, "step": 28660 }, { "epoch": 1.7967036410352824, "grad_norm": 0.03682591766119003, "learning_rate": 2.026661881741994e-05, "loss": 0.1394, "step": 28670 }, { "epoch": 1.7973303252491069, "grad_norm": 0.04726806655526161, "learning_rate": 2.025606383652446e-05, "loss": 0.0256, "step": 28680 }, { "epoch": 1.7979570094629316, "grad_norm": 0.476746141910553, "learning_rate": 2.0245508855628973e-05, "loss": 0.009, "step": 28690 }, { "epoch": 1.7985836936767563, "grad_norm": 0.4977225959300995, "learning_rate": 2.0234953874733486e-05, "loss": 0.1839, "step": 28700 }, { "epoch": 1.799210377890581, "grad_norm": 0.742252767086029, "learning_rate": 2.0224398893838003e-05, "loss": 0.1367, "step": 28710 }, { "epoch": 1.7998370621044057, "grad_norm": 1.1202590465545654, "learning_rate": 2.021384391294252e-05, "loss": 0.0563, "step": 28720 }, { "epoch": 1.8004637463182303, "grad_norm": 0.5046806335449219, "learning_rate": 2.0203288932047036e-05, "loss": 0.1431, "step": 28730 }, { "epoch": 1.8010904305320548, "grad_norm": 0.024776900187134743, "learning_rate": 2.019273395115155e-05, "loss": 0.0273, "step": 28740 }, { "epoch": 1.8017171147458795, "grad_norm": 0.03161754831671715, "learning_rate": 2.0182178970256066e-05, "loss": 0.2066, "step": 28750 }, { "epoch": 1.8023437989597042, "grad_norm": 31.176733016967773, "learning_rate": 2.017162398936058e-05, "loss": 0.1162, "step": 28760 }, { "epoch": 1.802970483173529, "grad_norm": 0.052438825368881226, "learning_rate": 2.0161069008465096e-05, "loss": 0.1372, "step": 28770 }, { "epoch": 1.8035971673873537, "grad_norm": 0.04112754762172699, "learning_rate": 2.0150514027569612e-05, "loss": 0.0298, "step": 28780 }, { "epoch": 1.8042238516011782, "grad_norm": 0.6142888069152832, "learning_rate": 2.0139959046674126e-05, "loss": 0.0805, "step": 28790 }, { "epoch": 1.8048505358150027, "grad_norm": 0.06450961530208588, "learning_rate": 2.0129404065778642e-05, "loss": 0.0734, "step": 28800 }, { "epoch": 1.8054772200288274, "grad_norm": 4.156648635864258, "learning_rate": 2.0118849084883156e-05, "loss": 0.0913, "step": 28810 }, { "epoch": 1.806103904242652, "grad_norm": 0.024707753211259842, "learning_rate": 2.0108294103987672e-05, "loss": 0.0503, "step": 28820 }, { "epoch": 1.8067305884564768, "grad_norm": 0.017378469929099083, "learning_rate": 2.009773912309219e-05, "loss": 0.0306, "step": 28830 }, { "epoch": 1.8073572726703016, "grad_norm": 0.022257640957832336, "learning_rate": 2.0087184142196706e-05, "loss": 0.2535, "step": 28840 }, { "epoch": 1.807983956884126, "grad_norm": 14.888116836547852, "learning_rate": 2.007662916130122e-05, "loss": 0.1329, "step": 28850 }, { "epoch": 1.8086106410979508, "grad_norm": 0.7564731240272522, "learning_rate": 2.0066074180405732e-05, "loss": 0.0147, "step": 28860 }, { "epoch": 1.8092373253117753, "grad_norm": 0.06251378357410431, "learning_rate": 2.005551919951025e-05, "loss": 0.2076, "step": 28870 }, { "epoch": 1.8098640095256, "grad_norm": 0.03549632057547569, "learning_rate": 2.0044964218614765e-05, "loss": 0.064, "step": 28880 }, { "epoch": 1.8104906937394247, "grad_norm": 9.972016334533691, "learning_rate": 2.0034409237719282e-05, "loss": 0.2049, "step": 28890 }, { "epoch": 1.8111173779532495, "grad_norm": 0.5740152597427368, "learning_rate": 2.0023854256823795e-05, "loss": 0.0966, "step": 28900 }, { "epoch": 1.8117440621670742, "grad_norm": 0.9303169250488281, "learning_rate": 2.0013299275928312e-05, "loss": 0.0953, "step": 28910 }, { "epoch": 1.8123707463808987, "grad_norm": 0.337392657995224, "learning_rate": 2.000274429503283e-05, "loss": 0.118, "step": 28920 }, { "epoch": 1.8129974305947232, "grad_norm": 0.04707547277212143, "learning_rate": 1.9992189314137342e-05, "loss": 0.1107, "step": 28930 }, { "epoch": 1.813624114808548, "grad_norm": 0.08423315733671188, "learning_rate": 1.998163433324186e-05, "loss": 0.0055, "step": 28940 }, { "epoch": 1.8142507990223726, "grad_norm": 0.021297458559274673, "learning_rate": 1.9971079352346372e-05, "loss": 0.0501, "step": 28950 }, { "epoch": 1.8148774832361974, "grad_norm": 0.040586020797491074, "learning_rate": 1.996052437145089e-05, "loss": 0.0848, "step": 28960 }, { "epoch": 1.815504167450022, "grad_norm": 5.234199047088623, "learning_rate": 1.9949969390555405e-05, "loss": 0.1686, "step": 28970 }, { "epoch": 1.8161308516638466, "grad_norm": 0.32946547865867615, "learning_rate": 1.9939414409659922e-05, "loss": 0.0032, "step": 28980 }, { "epoch": 1.816757535877671, "grad_norm": 0.26657867431640625, "learning_rate": 1.9928859428764435e-05, "loss": 0.2807, "step": 28990 }, { "epoch": 1.8173842200914958, "grad_norm": 0.054265476763248444, "learning_rate": 1.991830444786895e-05, "loss": 0.0384, "step": 29000 }, { "epoch": 1.8180109043053205, "grad_norm": 12.620076179504395, "learning_rate": 1.9907749466973465e-05, "loss": 0.17, "step": 29010 }, { "epoch": 1.8186375885191453, "grad_norm": 0.30357375741004944, "learning_rate": 1.989719448607798e-05, "loss": 0.1859, "step": 29020 }, { "epoch": 1.81926427273297, "grad_norm": 0.42700129747390747, "learning_rate": 1.9886639505182498e-05, "loss": 0.0094, "step": 29030 }, { "epoch": 1.8198909569467945, "grad_norm": 0.030239397659897804, "learning_rate": 1.987608452428701e-05, "loss": 0.1341, "step": 29040 }, { "epoch": 1.8205176411606192, "grad_norm": 0.03529753535985947, "learning_rate": 1.9865529543391528e-05, "loss": 0.0422, "step": 29050 }, { "epoch": 1.8211443253744437, "grad_norm": 0.7819442749023438, "learning_rate": 1.985497456249604e-05, "loss": 0.1731, "step": 29060 }, { "epoch": 1.8217710095882684, "grad_norm": 0.051512762904167175, "learning_rate": 1.9844419581600558e-05, "loss": 0.2309, "step": 29070 }, { "epoch": 1.8223976938020932, "grad_norm": 6.448638439178467, "learning_rate": 1.9833864600705075e-05, "loss": 0.1451, "step": 29080 }, { "epoch": 1.8230243780159179, "grad_norm": 0.08587915450334549, "learning_rate": 1.9823309619809588e-05, "loss": 0.0288, "step": 29090 }, { "epoch": 1.8236510622297424, "grad_norm": 19.13962173461914, "learning_rate": 1.9812754638914105e-05, "loss": 0.1057, "step": 29100 }, { "epoch": 1.824277746443567, "grad_norm": 0.018747830763459206, "learning_rate": 1.9802199658018618e-05, "loss": 0.0739, "step": 29110 }, { "epoch": 1.8249044306573916, "grad_norm": 0.01629248820245266, "learning_rate": 1.9791644677123135e-05, "loss": 0.0735, "step": 29120 }, { "epoch": 1.8255311148712163, "grad_norm": 15.566570281982422, "learning_rate": 1.978108969622765e-05, "loss": 0.0868, "step": 29130 }, { "epoch": 1.826157799085041, "grad_norm": 0.03689432889223099, "learning_rate": 1.9770534715332168e-05, "loss": 0.1567, "step": 29140 }, { "epoch": 1.8267844832988658, "grad_norm": 0.22734884917736053, "learning_rate": 1.975997973443668e-05, "loss": 0.023, "step": 29150 }, { "epoch": 1.8274111675126905, "grad_norm": 0.0562271811068058, "learning_rate": 1.9749424753541194e-05, "loss": 0.1313, "step": 29160 }, { "epoch": 1.828037851726515, "grad_norm": 0.5230720043182373, "learning_rate": 1.9738869772645714e-05, "loss": 0.0257, "step": 29170 }, { "epoch": 1.8286645359403395, "grad_norm": 0.1260448396205902, "learning_rate": 1.9728314791750228e-05, "loss": 0.0753, "step": 29180 }, { "epoch": 1.8292912201541642, "grad_norm": 19.483396530151367, "learning_rate": 1.9717759810854744e-05, "loss": 0.0821, "step": 29190 }, { "epoch": 1.829917904367989, "grad_norm": 0.10309408605098724, "learning_rate": 1.9707204829959258e-05, "loss": 0.1264, "step": 29200 }, { "epoch": 1.8305445885818137, "grad_norm": 8.662972450256348, "learning_rate": 1.9696649849063774e-05, "loss": 0.1339, "step": 29210 }, { "epoch": 1.8311712727956384, "grad_norm": 0.4835667908191681, "learning_rate": 1.968609486816829e-05, "loss": 0.0097, "step": 29220 }, { "epoch": 1.831797957009463, "grad_norm": 0.02930905483663082, "learning_rate": 1.9675539887272804e-05, "loss": 0.0038, "step": 29230 }, { "epoch": 1.8324246412232876, "grad_norm": 0.10878875106573105, "learning_rate": 1.966498490637732e-05, "loss": 0.0659, "step": 29240 }, { "epoch": 1.8330513254371121, "grad_norm": 0.05935356393456459, "learning_rate": 1.9654429925481834e-05, "loss": 0.1068, "step": 29250 }, { "epoch": 1.8336780096509369, "grad_norm": 0.020778510719537735, "learning_rate": 1.964387494458635e-05, "loss": 0.1359, "step": 29260 }, { "epoch": 1.8343046938647616, "grad_norm": 10.65028190612793, "learning_rate": 1.9633319963690867e-05, "loss": 0.1287, "step": 29270 }, { "epoch": 1.8349313780785863, "grad_norm": 1.7528663873672485, "learning_rate": 1.9622764982795384e-05, "loss": 0.0258, "step": 29280 }, { "epoch": 1.8355580622924108, "grad_norm": 0.010283700190484524, "learning_rate": 1.9612210001899897e-05, "loss": 0.0024, "step": 29290 }, { "epoch": 1.8361847465062355, "grad_norm": 1.0101772546768188, "learning_rate": 1.9601655021004414e-05, "loss": 0.1069, "step": 29300 }, { "epoch": 1.83681143072006, "grad_norm": 0.014128070324659348, "learning_rate": 1.9591100040108927e-05, "loss": 0.0097, "step": 29310 }, { "epoch": 1.8374381149338848, "grad_norm": 0.009399221278727055, "learning_rate": 1.9580545059213444e-05, "loss": 0.1276, "step": 29320 }, { "epoch": 1.8380647991477095, "grad_norm": 2.2243754863739014, "learning_rate": 1.956999007831796e-05, "loss": 0.1947, "step": 29330 }, { "epoch": 1.8386914833615342, "grad_norm": 0.06796874105930328, "learning_rate": 1.9559435097422474e-05, "loss": 0.1281, "step": 29340 }, { "epoch": 1.839318167575359, "grad_norm": 1.3128525018692017, "learning_rate": 1.954888011652699e-05, "loss": 0.0821, "step": 29350 }, { "epoch": 1.8399448517891834, "grad_norm": 9.305420875549316, "learning_rate": 1.9538325135631504e-05, "loss": 0.5251, "step": 29360 }, { "epoch": 1.840571536003008, "grad_norm": 0.2909277379512787, "learning_rate": 1.952777015473602e-05, "loss": 0.087, "step": 29370 }, { "epoch": 1.8411982202168327, "grad_norm": 0.8060989379882812, "learning_rate": 1.9517215173840537e-05, "loss": 0.0592, "step": 29380 }, { "epoch": 1.8418249044306574, "grad_norm": 0.1159672886133194, "learning_rate": 1.950666019294505e-05, "loss": 0.0417, "step": 29390 }, { "epoch": 1.842451588644482, "grad_norm": 3.1500940322875977, "learning_rate": 1.9496105212049567e-05, "loss": 0.2034, "step": 29400 }, { "epoch": 1.8430782728583068, "grad_norm": 0.062305234372615814, "learning_rate": 1.948555023115408e-05, "loss": 0.156, "step": 29410 }, { "epoch": 1.8437049570721313, "grad_norm": 0.06603378802537918, "learning_rate": 1.94749952502586e-05, "loss": 0.2016, "step": 29420 }, { "epoch": 1.844331641285956, "grad_norm": 0.5237300992012024, "learning_rate": 1.9464440269363114e-05, "loss": 0.0637, "step": 29430 }, { "epoch": 1.8449583254997806, "grad_norm": 0.3289642632007599, "learning_rate": 1.945388528846763e-05, "loss": 0.102, "step": 29440 }, { "epoch": 1.8455850097136053, "grad_norm": 4.283963203430176, "learning_rate": 1.9443330307572143e-05, "loss": 0.3358, "step": 29450 }, { "epoch": 1.84621169392743, "grad_norm": 8.56247615814209, "learning_rate": 1.943277532667666e-05, "loss": 0.1617, "step": 29460 }, { "epoch": 1.8468383781412547, "grad_norm": 0.1259661316871643, "learning_rate": 1.9422220345781177e-05, "loss": 0.1126, "step": 29470 }, { "epoch": 1.8474650623550792, "grad_norm": 0.09990125149488449, "learning_rate": 1.941166536488569e-05, "loss": 0.0875, "step": 29480 }, { "epoch": 1.848091746568904, "grad_norm": 0.11462511122226715, "learning_rate": 1.9401110383990207e-05, "loss": 0.1446, "step": 29490 }, { "epoch": 1.8487184307827285, "grad_norm": 0.11472202092409134, "learning_rate": 1.939055540309472e-05, "loss": 0.088, "step": 29500 }, { "epoch": 1.8493451149965532, "grad_norm": 1.1895668506622314, "learning_rate": 1.9380000422199237e-05, "loss": 0.036, "step": 29510 }, { "epoch": 1.849971799210378, "grad_norm": 0.19908933341503143, "learning_rate": 1.9369445441303753e-05, "loss": 0.0983, "step": 29520 }, { "epoch": 1.8505984834242026, "grad_norm": 1.47406005859375, "learning_rate": 1.935889046040827e-05, "loss": 0.0543, "step": 29530 }, { "epoch": 1.8512251676380274, "grad_norm": 0.06669019162654877, "learning_rate": 1.9348335479512783e-05, "loss": 0.0428, "step": 29540 }, { "epoch": 1.8518518518518519, "grad_norm": 0.05538703128695488, "learning_rate": 1.9337780498617296e-05, "loss": 0.1216, "step": 29550 }, { "epoch": 1.8524785360656764, "grad_norm": 0.0620872937142849, "learning_rate": 1.9327225517721813e-05, "loss": 0.035, "step": 29560 }, { "epoch": 1.853105220279501, "grad_norm": 4.431196212768555, "learning_rate": 1.931667053682633e-05, "loss": 0.1035, "step": 29570 }, { "epoch": 1.8537319044933258, "grad_norm": 0.05511202663183212, "learning_rate": 1.9306115555930846e-05, "loss": 0.105, "step": 29580 }, { "epoch": 1.8543585887071505, "grad_norm": 0.05063912644982338, "learning_rate": 1.929556057503536e-05, "loss": 0.1448, "step": 29590 }, { "epoch": 1.8549852729209753, "grad_norm": 0.08396095782518387, "learning_rate": 1.9285005594139876e-05, "loss": 0.0884, "step": 29600 }, { "epoch": 1.8556119571347998, "grad_norm": 0.22409754991531372, "learning_rate": 1.927445061324439e-05, "loss": 0.0645, "step": 29610 }, { "epoch": 1.8562386413486243, "grad_norm": 0.05794045329093933, "learning_rate": 1.9263895632348906e-05, "loss": 0.0719, "step": 29620 }, { "epoch": 1.856865325562449, "grad_norm": 0.15742503106594086, "learning_rate": 1.9253340651453423e-05, "loss": 0.3772, "step": 29630 }, { "epoch": 1.8574920097762737, "grad_norm": 0.21620403230190277, "learning_rate": 1.9242785670557936e-05, "loss": 0.0839, "step": 29640 }, { "epoch": 1.8581186939900984, "grad_norm": 0.09392804652452469, "learning_rate": 1.9232230689662453e-05, "loss": 0.0643, "step": 29650 }, { "epoch": 1.8587453782039232, "grad_norm": 0.05274158716201782, "learning_rate": 1.9221675708766966e-05, "loss": 0.1715, "step": 29660 }, { "epoch": 1.8593720624177477, "grad_norm": 0.04212874546647072, "learning_rate": 1.9211120727871486e-05, "loss": 0.0605, "step": 29670 }, { "epoch": 1.8599987466315724, "grad_norm": 0.03563644737005234, "learning_rate": 1.9200565746976e-05, "loss": 0.1432, "step": 29680 }, { "epoch": 1.8606254308453969, "grad_norm": 5.44032621383667, "learning_rate": 1.9190010766080516e-05, "loss": 0.1619, "step": 29690 }, { "epoch": 1.8612521150592216, "grad_norm": 0.13287276029586792, "learning_rate": 1.917945578518503e-05, "loss": 0.1115, "step": 29700 }, { "epoch": 1.8618787992730463, "grad_norm": 3.4286482334136963, "learning_rate": 1.9168900804289542e-05, "loss": 0.1127, "step": 29710 }, { "epoch": 1.862505483486871, "grad_norm": 0.27782508730888367, "learning_rate": 1.9158345823394063e-05, "loss": 0.0366, "step": 29720 }, { "epoch": 1.8631321677006958, "grad_norm": 0.06644205003976822, "learning_rate": 1.9147790842498576e-05, "loss": 0.0781, "step": 29730 }, { "epoch": 1.8637588519145203, "grad_norm": 0.0695660188794136, "learning_rate": 1.9137235861603092e-05, "loss": 0.074, "step": 29740 }, { "epoch": 1.8643855361283448, "grad_norm": 0.05306769534945488, "learning_rate": 1.9126680880707606e-05, "loss": 0.0964, "step": 29750 }, { "epoch": 1.8650122203421695, "grad_norm": 0.053244441747665405, "learning_rate": 1.9116125899812122e-05, "loss": 0.0169, "step": 29760 }, { "epoch": 1.8656389045559942, "grad_norm": 0.16310220956802368, "learning_rate": 1.910557091891664e-05, "loss": 0.105, "step": 29770 }, { "epoch": 1.866265588769819, "grad_norm": 0.07683579623699188, "learning_rate": 1.9095015938021152e-05, "loss": 0.0391, "step": 29780 }, { "epoch": 1.8668922729836437, "grad_norm": 0.33125782012939453, "learning_rate": 1.908446095712567e-05, "loss": 0.082, "step": 29790 }, { "epoch": 1.8675189571974682, "grad_norm": 0.02789173647761345, "learning_rate": 1.9073905976230182e-05, "loss": 0.1035, "step": 29800 }, { "epoch": 1.8681456414112927, "grad_norm": 0.03273582458496094, "learning_rate": 1.90633509953347e-05, "loss": 0.002, "step": 29810 }, { "epoch": 1.8687723256251174, "grad_norm": 0.04846423864364624, "learning_rate": 1.9052796014439215e-05, "loss": 0.0543, "step": 29820 }, { "epoch": 1.8693990098389421, "grad_norm": 0.04357713833451271, "learning_rate": 1.9042241033543732e-05, "loss": 0.0017, "step": 29830 }, { "epoch": 1.8700256940527669, "grad_norm": 9.48665714263916, "learning_rate": 1.9031686052648245e-05, "loss": 0.2533, "step": 29840 }, { "epoch": 1.8706523782665916, "grad_norm": 0.02235112152993679, "learning_rate": 1.902113107175276e-05, "loss": 0.1493, "step": 29850 }, { "epoch": 1.871279062480416, "grad_norm": 7.93790864944458, "learning_rate": 1.9010576090857275e-05, "loss": 0.2031, "step": 29860 }, { "epoch": 1.8719057466942408, "grad_norm": 0.4831400513648987, "learning_rate": 1.9000021109961792e-05, "loss": 0.0422, "step": 29870 }, { "epoch": 1.8725324309080653, "grad_norm": 12.699409484863281, "learning_rate": 1.898946612906631e-05, "loss": 0.1498, "step": 29880 }, { "epoch": 1.87315911512189, "grad_norm": 0.47280260920524597, "learning_rate": 1.8978911148170822e-05, "loss": 0.1014, "step": 29890 }, { "epoch": 1.8737857993357148, "grad_norm": 0.07105378061532974, "learning_rate": 1.896835616727534e-05, "loss": 0.1349, "step": 29900 }, { "epoch": 1.8744124835495395, "grad_norm": 7.203770160675049, "learning_rate": 1.8957801186379852e-05, "loss": 0.2386, "step": 29910 }, { "epoch": 1.875039167763364, "grad_norm": 0.05521472170948982, "learning_rate": 1.8947246205484372e-05, "loss": 0.0533, "step": 29920 }, { "epoch": 1.8756658519771887, "grad_norm": 1.2736027240753174, "learning_rate": 1.8936691224588885e-05, "loss": 0.107, "step": 29930 }, { "epoch": 1.8762925361910132, "grad_norm": 0.05932888761162758, "learning_rate": 1.89261362436934e-05, "loss": 0.1205, "step": 29940 }, { "epoch": 1.876919220404838, "grad_norm": 0.3483729064464569, "learning_rate": 1.8915581262797915e-05, "loss": 0.0029, "step": 29950 }, { "epoch": 1.8775459046186627, "grad_norm": 0.05726746469736099, "learning_rate": 1.8905026281902428e-05, "loss": 0.0772, "step": 29960 }, { "epoch": 1.8781725888324874, "grad_norm": 5.409941673278809, "learning_rate": 1.8894471301006948e-05, "loss": 0.1557, "step": 29970 }, { "epoch": 1.8787992730463121, "grad_norm": 0.017120420932769775, "learning_rate": 1.888391632011146e-05, "loss": 0.1835, "step": 29980 }, { "epoch": 1.8794259572601366, "grad_norm": 1.3128411769866943, "learning_rate": 1.8873361339215978e-05, "loss": 0.2665, "step": 29990 }, { "epoch": 1.8800526414739611, "grad_norm": 0.05831111595034599, "learning_rate": 1.886280635832049e-05, "loss": 0.0641, "step": 30000 }, { "epoch": 1.8806793256877858, "grad_norm": 0.04687171056866646, "learning_rate": 1.8852251377425008e-05, "loss": 0.0097, "step": 30010 }, { "epoch": 1.8813060099016106, "grad_norm": 0.08850085735321045, "learning_rate": 1.8841696396529525e-05, "loss": 0.0698, "step": 30020 }, { "epoch": 1.8819326941154353, "grad_norm": 3.7799575328826904, "learning_rate": 1.8831141415634038e-05, "loss": 0.0373, "step": 30030 }, { "epoch": 1.88255937832926, "grad_norm": 0.03460925817489624, "learning_rate": 1.8820586434738555e-05, "loss": 0.0017, "step": 30040 }, { "epoch": 1.8831860625430845, "grad_norm": 0.3564239740371704, "learning_rate": 1.8810031453843068e-05, "loss": 0.2192, "step": 30050 }, { "epoch": 1.8838127467569092, "grad_norm": 0.014373106881976128, "learning_rate": 1.8799476472947585e-05, "loss": 0.0709, "step": 30060 }, { "epoch": 1.8844394309707337, "grad_norm": 0.24445849657058716, "learning_rate": 1.87889214920521e-05, "loss": 0.0971, "step": 30070 }, { "epoch": 1.8850661151845585, "grad_norm": 3.6444427967071533, "learning_rate": 1.8778366511156618e-05, "loss": 0.0798, "step": 30080 }, { "epoch": 1.8856927993983832, "grad_norm": 3.2659192085266113, "learning_rate": 1.876781153026113e-05, "loss": 0.0547, "step": 30090 }, { "epoch": 1.886319483612208, "grad_norm": 0.009950400330126286, "learning_rate": 1.8757256549365644e-05, "loss": 0.0039, "step": 30100 }, { "epoch": 1.8869461678260324, "grad_norm": 0.030715815722942352, "learning_rate": 1.874670156847016e-05, "loss": 0.0666, "step": 30110 }, { "epoch": 1.8875728520398571, "grad_norm": 0.03627600520849228, "learning_rate": 1.8736146587574678e-05, "loss": 0.0218, "step": 30120 }, { "epoch": 1.8881995362536816, "grad_norm": 0.0974053218960762, "learning_rate": 1.8725591606679194e-05, "loss": 0.1929, "step": 30130 }, { "epoch": 1.8888262204675064, "grad_norm": 6.635987281799316, "learning_rate": 1.8715036625783708e-05, "loss": 0.1065, "step": 30140 }, { "epoch": 1.889452904681331, "grad_norm": 0.1497088223695755, "learning_rate": 1.8704481644888224e-05, "loss": 0.0606, "step": 30150 }, { "epoch": 1.8900795888951558, "grad_norm": 0.03236452117562294, "learning_rate": 1.8693926663992738e-05, "loss": 0.1933, "step": 30160 }, { "epoch": 1.8907062731089805, "grad_norm": 5.778759002685547, "learning_rate": 1.8683371683097254e-05, "loss": 0.0899, "step": 30170 }, { "epoch": 1.891332957322805, "grad_norm": 0.8521557450294495, "learning_rate": 1.867281670220177e-05, "loss": 0.2154, "step": 30180 }, { "epoch": 1.8919596415366295, "grad_norm": 0.08743500709533691, "learning_rate": 1.8662261721306284e-05, "loss": 0.0842, "step": 30190 }, { "epoch": 1.8925863257504543, "grad_norm": 0.06743310391902924, "learning_rate": 1.86517067404108e-05, "loss": 0.0971, "step": 30200 }, { "epoch": 1.893213009964279, "grad_norm": 2.5880138874053955, "learning_rate": 1.8641151759515314e-05, "loss": 0.0598, "step": 30210 }, { "epoch": 1.8938396941781037, "grad_norm": 0.16789746284484863, "learning_rate": 1.8630596778619834e-05, "loss": 0.0475, "step": 30220 }, { "epoch": 1.8944663783919284, "grad_norm": 0.9268659949302673, "learning_rate": 1.8620041797724347e-05, "loss": 0.041, "step": 30230 }, { "epoch": 1.895093062605753, "grad_norm": 0.8711900115013123, "learning_rate": 1.860948681682886e-05, "loss": 0.1084, "step": 30240 }, { "epoch": 1.8957197468195777, "grad_norm": 0.029752174392342567, "learning_rate": 1.8598931835933377e-05, "loss": 0.3856, "step": 30250 }, { "epoch": 1.8963464310334022, "grad_norm": 0.6014415621757507, "learning_rate": 1.8588376855037894e-05, "loss": 0.1008, "step": 30260 }, { "epoch": 1.896973115247227, "grad_norm": 1.2103004455566406, "learning_rate": 1.857782187414241e-05, "loss": 0.0331, "step": 30270 }, { "epoch": 1.8975997994610516, "grad_norm": 0.16703811287879944, "learning_rate": 1.8567266893246924e-05, "loss": 0.0546, "step": 30280 }, { "epoch": 1.8982264836748763, "grad_norm": 5.745260715484619, "learning_rate": 1.855671191235144e-05, "loss": 0.1985, "step": 30290 }, { "epoch": 1.8988531678887008, "grad_norm": 0.5710949897766113, "learning_rate": 1.8546156931455954e-05, "loss": 0.0237, "step": 30300 }, { "epoch": 1.8994798521025256, "grad_norm": 0.0267245564609766, "learning_rate": 1.853560195056047e-05, "loss": 0.0519, "step": 30310 }, { "epoch": 1.90010653631635, "grad_norm": 3.8505678176879883, "learning_rate": 1.8525046969664987e-05, "loss": 0.1425, "step": 30320 }, { "epoch": 1.9007332205301748, "grad_norm": 0.6326449513435364, "learning_rate": 1.85144919887695e-05, "loss": 0.0259, "step": 30330 }, { "epoch": 1.9013599047439995, "grad_norm": 3.883139133453369, "learning_rate": 1.8503937007874017e-05, "loss": 0.1314, "step": 30340 }, { "epoch": 1.9019865889578242, "grad_norm": 7.332738876342773, "learning_rate": 1.849338202697853e-05, "loss": 0.1249, "step": 30350 }, { "epoch": 1.902613273171649, "grad_norm": 8.143255233764648, "learning_rate": 1.8482827046083047e-05, "loss": 0.2064, "step": 30360 }, { "epoch": 1.9032399573854735, "grad_norm": 0.0913747027516365, "learning_rate": 1.8472272065187564e-05, "loss": 0.1017, "step": 30370 }, { "epoch": 1.903866641599298, "grad_norm": 2.4039127826690674, "learning_rate": 1.846171708429208e-05, "loss": 0.1219, "step": 30380 }, { "epoch": 1.9044933258131227, "grad_norm": 2.748976469039917, "learning_rate": 1.8451162103396593e-05, "loss": 0.0252, "step": 30390 }, { "epoch": 1.9051200100269474, "grad_norm": 0.049904074519872665, "learning_rate": 1.8440607122501107e-05, "loss": 0.0039, "step": 30400 }, { "epoch": 1.9057466942407721, "grad_norm": 0.015348660759627819, "learning_rate": 1.8430052141605623e-05, "loss": 0.0532, "step": 30410 }, { "epoch": 1.9063733784545969, "grad_norm": 0.03833635151386261, "learning_rate": 1.841949716071014e-05, "loss": 0.1586, "step": 30420 }, { "epoch": 1.9070000626684214, "grad_norm": 0.11076361685991287, "learning_rate": 1.8408942179814657e-05, "loss": 0.0743, "step": 30430 }, { "epoch": 1.9076267468822459, "grad_norm": 2.92325758934021, "learning_rate": 1.839838719891917e-05, "loss": 0.0999, "step": 30440 }, { "epoch": 1.9082534310960706, "grad_norm": 0.04999106749892235, "learning_rate": 1.8387832218023687e-05, "loss": 0.143, "step": 30450 }, { "epoch": 1.9088801153098953, "grad_norm": 0.04987865686416626, "learning_rate": 1.83772772371282e-05, "loss": 0.0169, "step": 30460 }, { "epoch": 1.90950679952372, "grad_norm": 0.12620976567268372, "learning_rate": 1.836672225623272e-05, "loss": 0.1632, "step": 30470 }, { "epoch": 1.9101334837375448, "grad_norm": 0.0377831868827343, "learning_rate": 1.8356167275337233e-05, "loss": 0.0457, "step": 30480 }, { "epoch": 1.9107601679513693, "grad_norm": 0.10486970096826553, "learning_rate": 1.8345612294441746e-05, "loss": 0.0926, "step": 30490 }, { "epoch": 1.911386852165194, "grad_norm": 0.10072964429855347, "learning_rate": 1.8335057313546263e-05, "loss": 0.0815, "step": 30500 }, { "epoch": 1.9120135363790185, "grad_norm": 0.10867179930210114, "learning_rate": 1.832450233265078e-05, "loss": 0.0788, "step": 30510 }, { "epoch": 1.9126402205928432, "grad_norm": 0.022253375500440598, "learning_rate": 1.8313947351755296e-05, "loss": 0.076, "step": 30520 }, { "epoch": 1.913266904806668, "grad_norm": 0.15047207474708557, "learning_rate": 1.830339237085981e-05, "loss": 0.0035, "step": 30530 }, { "epoch": 1.9138935890204927, "grad_norm": 0.06832699477672577, "learning_rate": 1.8292837389964326e-05, "loss": 0.0788, "step": 30540 }, { "epoch": 1.9145202732343174, "grad_norm": 7.129006385803223, "learning_rate": 1.828228240906884e-05, "loss": 0.1998, "step": 30550 }, { "epoch": 1.915146957448142, "grad_norm": 0.014667839743196964, "learning_rate": 1.8271727428173356e-05, "loss": 0.0204, "step": 30560 }, { "epoch": 1.9157736416619664, "grad_norm": 0.0635824203491211, "learning_rate": 1.8261172447277873e-05, "loss": 0.0038, "step": 30570 }, { "epoch": 1.9164003258757911, "grad_norm": 0.14155304431915283, "learning_rate": 1.8250617466382386e-05, "loss": 0.1168, "step": 30580 }, { "epoch": 1.9170270100896158, "grad_norm": 15.040778160095215, "learning_rate": 1.8240062485486903e-05, "loss": 0.147, "step": 30590 }, { "epoch": 1.9176536943034406, "grad_norm": 0.07141759246587753, "learning_rate": 1.8229507504591416e-05, "loss": 0.0787, "step": 30600 }, { "epoch": 1.9182803785172653, "grad_norm": 33.47121810913086, "learning_rate": 1.8218952523695933e-05, "loss": 0.0976, "step": 30610 }, { "epoch": 1.9189070627310898, "grad_norm": 15.086108207702637, "learning_rate": 1.820839754280045e-05, "loss": 0.1981, "step": 30620 }, { "epoch": 1.9195337469449143, "grad_norm": 1.0487416982650757, "learning_rate": 1.8197842561904963e-05, "loss": 0.0605, "step": 30630 }, { "epoch": 1.920160431158739, "grad_norm": 0.14224369823932648, "learning_rate": 1.818728758100948e-05, "loss": 0.1471, "step": 30640 }, { "epoch": 1.9207871153725637, "grad_norm": 2.3341376781463623, "learning_rate": 1.8176732600113992e-05, "loss": 0.0515, "step": 30650 }, { "epoch": 1.9214137995863885, "grad_norm": 2.8665385246276855, "learning_rate": 1.816617761921851e-05, "loss": 0.0349, "step": 30660 }, { "epoch": 1.9220404838002132, "grad_norm": 0.08486692607402802, "learning_rate": 1.8155622638323026e-05, "loss": 0.0768, "step": 30670 }, { "epoch": 1.9226671680140377, "grad_norm": 5.3784332275390625, "learning_rate": 1.8145067657427542e-05, "loss": 0.1577, "step": 30680 }, { "epoch": 1.9232938522278624, "grad_norm": 0.010095058009028435, "learning_rate": 1.8134512676532056e-05, "loss": 0.0529, "step": 30690 }, { "epoch": 1.923920536441687, "grad_norm": 0.10281635820865631, "learning_rate": 1.8123957695636572e-05, "loss": 0.1099, "step": 30700 }, { "epoch": 1.9245472206555116, "grad_norm": 7.003781318664551, "learning_rate": 1.8113402714741086e-05, "loss": 0.162, "step": 30710 }, { "epoch": 1.9251739048693364, "grad_norm": 0.09196191281080246, "learning_rate": 1.8102847733845602e-05, "loss": 0.0531, "step": 30720 }, { "epoch": 1.925800589083161, "grad_norm": 0.5653356909751892, "learning_rate": 1.809229275295012e-05, "loss": 0.1701, "step": 30730 }, { "epoch": 1.9264272732969856, "grad_norm": 0.48378825187683105, "learning_rate": 1.8081737772054632e-05, "loss": 0.1326, "step": 30740 }, { "epoch": 1.9270539575108103, "grad_norm": 0.2154448926448822, "learning_rate": 1.807118279115915e-05, "loss": 0.1484, "step": 30750 }, { "epoch": 1.9276806417246348, "grad_norm": 0.0348966158926487, "learning_rate": 1.8060627810263665e-05, "loss": 0.0638, "step": 30760 }, { "epoch": 1.9283073259384595, "grad_norm": 0.03106912225484848, "learning_rate": 1.8050072829368182e-05, "loss": 0.002, "step": 30770 }, { "epoch": 1.9289340101522843, "grad_norm": 0.030033491551876068, "learning_rate": 1.8039517848472695e-05, "loss": 0.1182, "step": 30780 }, { "epoch": 1.929560694366109, "grad_norm": 0.008149535395205021, "learning_rate": 1.802896286757721e-05, "loss": 0.1093, "step": 30790 }, { "epoch": 1.9301873785799337, "grad_norm": 0.029186800122261047, "learning_rate": 1.8018407886681725e-05, "loss": 0.0311, "step": 30800 }, { "epoch": 1.9308140627937582, "grad_norm": 0.24340951442718506, "learning_rate": 1.8007852905786242e-05, "loss": 0.0338, "step": 30810 }, { "epoch": 1.9314407470075827, "grad_norm": 0.02873971126973629, "learning_rate": 1.799729792489076e-05, "loss": 0.0698, "step": 30820 }, { "epoch": 1.9320674312214074, "grad_norm": 0.01757604442536831, "learning_rate": 1.7986742943995272e-05, "loss": 0.0096, "step": 30830 }, { "epoch": 1.9326941154352322, "grad_norm": 0.012573912739753723, "learning_rate": 1.797618796309979e-05, "loss": 0.1414, "step": 30840 }, { "epoch": 1.933320799649057, "grad_norm": 7.6997480392456055, "learning_rate": 1.7965632982204302e-05, "loss": 0.1177, "step": 30850 }, { "epoch": 1.9339474838628816, "grad_norm": 0.007678247056901455, "learning_rate": 1.795507800130882e-05, "loss": 0.045, "step": 30860 }, { "epoch": 1.9345741680767061, "grad_norm": 0.006309192162007093, "learning_rate": 1.7944523020413335e-05, "loss": 0.0119, "step": 30870 }, { "epoch": 1.9352008522905308, "grad_norm": 0.0472506545484066, "learning_rate": 1.793396803951785e-05, "loss": 0.1314, "step": 30880 }, { "epoch": 1.9358275365043554, "grad_norm": 0.019679518416523933, "learning_rate": 1.7923413058622365e-05, "loss": 0.0072, "step": 30890 }, { "epoch": 1.93645422071818, "grad_norm": 0.025182457640767097, "learning_rate": 1.7912858077726878e-05, "loss": 0.0436, "step": 30900 }, { "epoch": 1.9370809049320048, "grad_norm": 20.712900161743164, "learning_rate": 1.7902303096831395e-05, "loss": 0.235, "step": 30910 }, { "epoch": 1.9377075891458295, "grad_norm": 0.10453003644943237, "learning_rate": 1.789174811593591e-05, "loss": 0.1638, "step": 30920 }, { "epoch": 1.938334273359654, "grad_norm": 0.261699914932251, "learning_rate": 1.7881193135040428e-05, "loss": 0.0507, "step": 30930 }, { "epoch": 1.9389609575734787, "grad_norm": 0.3301703631877899, "learning_rate": 1.787063815414494e-05, "loss": 0.1241, "step": 30940 }, { "epoch": 1.9395876417873033, "grad_norm": 0.11412771791219711, "learning_rate": 1.7860083173249455e-05, "loss": 0.0123, "step": 30950 }, { "epoch": 1.940214326001128, "grad_norm": 0.10817641019821167, "learning_rate": 1.7849528192353975e-05, "loss": 0.2673, "step": 30960 }, { "epoch": 1.9408410102149527, "grad_norm": 3.031972646713257, "learning_rate": 1.7838973211458488e-05, "loss": 0.0864, "step": 30970 }, { "epoch": 1.9414676944287774, "grad_norm": 0.1671517789363861, "learning_rate": 1.7828418230563005e-05, "loss": 0.1065, "step": 30980 }, { "epoch": 1.9420943786426021, "grad_norm": 0.06491947174072266, "learning_rate": 1.7817863249667518e-05, "loss": 0.046, "step": 30990 }, { "epoch": 1.9427210628564267, "grad_norm": 0.005951672792434692, "learning_rate": 1.7807308268772035e-05, "loss": 0.0031, "step": 31000 }, { "epoch": 1.9433477470702512, "grad_norm": 0.060672201216220856, "learning_rate": 1.779675328787655e-05, "loss": 0.124, "step": 31010 }, { "epoch": 1.9439744312840759, "grad_norm": 0.004647328983992338, "learning_rate": 1.7786198306981065e-05, "loss": 0.0021, "step": 31020 }, { "epoch": 1.9446011154979006, "grad_norm": 2.4108574390411377, "learning_rate": 1.777564332608558e-05, "loss": 0.0126, "step": 31030 }, { "epoch": 1.9452277997117253, "grad_norm": 0.024586204439401627, "learning_rate": 1.7765088345190094e-05, "loss": 0.0686, "step": 31040 }, { "epoch": 1.94585448392555, "grad_norm": 3.447274923324585, "learning_rate": 1.775453336429461e-05, "loss": 0.0141, "step": 31050 }, { "epoch": 1.9464811681393746, "grad_norm": 0.02935156226158142, "learning_rate": 1.7743978383399128e-05, "loss": 0.0688, "step": 31060 }, { "epoch": 1.9471078523531993, "grad_norm": 0.04828619956970215, "learning_rate": 1.7733423402503644e-05, "loss": 0.0683, "step": 31070 }, { "epoch": 1.9477345365670238, "grad_norm": 0.02683708630502224, "learning_rate": 1.7722868421608158e-05, "loss": 0.1011, "step": 31080 }, { "epoch": 1.9483612207808485, "grad_norm": 0.03948148712515831, "learning_rate": 1.7712313440712674e-05, "loss": 0.1203, "step": 31090 }, { "epoch": 1.9489879049946732, "grad_norm": 0.0707787424325943, "learning_rate": 1.7701758459817188e-05, "loss": 0.1169, "step": 31100 }, { "epoch": 1.949614589208498, "grad_norm": 0.21310770511627197, "learning_rate": 1.7691203478921704e-05, "loss": 0.0081, "step": 31110 }, { "epoch": 1.9502412734223225, "grad_norm": 0.798648476600647, "learning_rate": 1.768064849802622e-05, "loss": 0.0931, "step": 31120 }, { "epoch": 1.9508679576361472, "grad_norm": 0.0065465401858091354, "learning_rate": 1.7670093517130734e-05, "loss": 0.2703, "step": 31130 }, { "epoch": 1.9514946418499717, "grad_norm": 0.060641299933195114, "learning_rate": 1.765953853623525e-05, "loss": 0.088, "step": 31140 }, { "epoch": 1.9521213260637964, "grad_norm": 64.52513122558594, "learning_rate": 1.7648983555339764e-05, "loss": 0.1919, "step": 31150 }, { "epoch": 1.9527480102776211, "grad_norm": 0.5025072693824768, "learning_rate": 1.763842857444428e-05, "loss": 0.1127, "step": 31160 }, { "epoch": 1.9533746944914459, "grad_norm": 0.23550131916999817, "learning_rate": 1.7627873593548797e-05, "loss": 0.0814, "step": 31170 }, { "epoch": 1.9540013787052706, "grad_norm": 0.006434514187276363, "learning_rate": 1.761731861265331e-05, "loss": 0.0089, "step": 31180 }, { "epoch": 1.954628062919095, "grad_norm": 0.3607669174671173, "learning_rate": 1.7606763631757827e-05, "loss": 0.0767, "step": 31190 }, { "epoch": 1.9552547471329196, "grad_norm": 0.012200615368783474, "learning_rate": 1.759620865086234e-05, "loss": 0.007, "step": 31200 }, { "epoch": 1.9558814313467443, "grad_norm": 0.04041421413421631, "learning_rate": 1.758565366996686e-05, "loss": 0.0545, "step": 31210 }, { "epoch": 1.956508115560569, "grad_norm": 0.015096294693648815, "learning_rate": 1.7575098689071374e-05, "loss": 0.0678, "step": 31220 }, { "epoch": 1.9571347997743938, "grad_norm": 0.08720281720161438, "learning_rate": 1.756454370817589e-05, "loss": 0.1194, "step": 31230 }, { "epoch": 1.9577614839882185, "grad_norm": 0.013159573078155518, "learning_rate": 1.7553988727280404e-05, "loss": 0.0914, "step": 31240 }, { "epoch": 1.958388168202043, "grad_norm": 4.2139458656311035, "learning_rate": 1.7543433746384917e-05, "loss": 0.2414, "step": 31250 }, { "epoch": 1.9590148524158675, "grad_norm": 0.3159102499485016, "learning_rate": 1.7532878765489437e-05, "loss": 0.0105, "step": 31260 }, { "epoch": 1.9596415366296922, "grad_norm": 0.03543320670723915, "learning_rate": 1.752232378459395e-05, "loss": 0.076, "step": 31270 }, { "epoch": 1.960268220843517, "grad_norm": 0.19333113729953766, "learning_rate": 1.7511768803698467e-05, "loss": 0.036, "step": 31280 }, { "epoch": 1.9608949050573417, "grad_norm": 0.017696138471364975, "learning_rate": 1.750121382280298e-05, "loss": 0.0557, "step": 31290 }, { "epoch": 1.9615215892711664, "grad_norm": 0.19619330763816833, "learning_rate": 1.7490658841907497e-05, "loss": 0.0709, "step": 31300 }, { "epoch": 1.9621482734849909, "grad_norm": 2.6561007499694824, "learning_rate": 1.7480103861012014e-05, "loss": 0.0741, "step": 31310 }, { "epoch": 1.9627749576988156, "grad_norm": 0.20458722114562988, "learning_rate": 1.746954888011653e-05, "loss": 0.1059, "step": 31320 }, { "epoch": 1.96340164191264, "grad_norm": 0.01259886845946312, "learning_rate": 1.7458993899221043e-05, "loss": 0.2263, "step": 31330 }, { "epoch": 1.9640283261264648, "grad_norm": 0.17994363605976105, "learning_rate": 1.7448438918325557e-05, "loss": 0.044, "step": 31340 }, { "epoch": 1.9646550103402896, "grad_norm": 0.0707567110657692, "learning_rate": 1.7437883937430073e-05, "loss": 0.0222, "step": 31350 }, { "epoch": 1.9652816945541143, "grad_norm": 0.019019365310668945, "learning_rate": 1.742732895653459e-05, "loss": 0.0041, "step": 31360 }, { "epoch": 1.965908378767939, "grad_norm": 0.030839603394269943, "learning_rate": 1.7416773975639107e-05, "loss": 0.1864, "step": 31370 }, { "epoch": 1.9665350629817635, "grad_norm": 0.028285950422286987, "learning_rate": 1.740621899474362e-05, "loss": 0.0039, "step": 31380 }, { "epoch": 1.967161747195588, "grad_norm": 0.055145930498838425, "learning_rate": 1.7395664013848137e-05, "loss": 0.0156, "step": 31390 }, { "epoch": 1.9677884314094127, "grad_norm": 0.01977921463549137, "learning_rate": 1.738510903295265e-05, "loss": 0.1479, "step": 31400 }, { "epoch": 1.9684151156232375, "grad_norm": 0.034482572227716446, "learning_rate": 1.7374554052057167e-05, "loss": 0.0572, "step": 31410 }, { "epoch": 1.9690417998370622, "grad_norm": 6.365627288818359, "learning_rate": 1.7363999071161683e-05, "loss": 0.1508, "step": 31420 }, { "epoch": 1.969668484050887, "grad_norm": 0.18992501497268677, "learning_rate": 1.7353444090266196e-05, "loss": 0.1212, "step": 31430 }, { "epoch": 1.9702951682647114, "grad_norm": 95.36531066894531, "learning_rate": 1.7342889109370713e-05, "loss": 0.0771, "step": 31440 }, { "epoch": 1.970921852478536, "grad_norm": 0.13923226296901703, "learning_rate": 1.7332334128475226e-05, "loss": 0.074, "step": 31450 }, { "epoch": 1.9715485366923606, "grad_norm": 0.07367946207523346, "learning_rate": 1.7321779147579746e-05, "loss": 0.244, "step": 31460 }, { "epoch": 1.9721752209061854, "grad_norm": 0.11902827024459839, "learning_rate": 1.731122416668426e-05, "loss": 0.1009, "step": 31470 }, { "epoch": 1.97280190512001, "grad_norm": 4.948719501495361, "learning_rate": 1.7300669185788776e-05, "loss": 0.0484, "step": 31480 }, { "epoch": 1.9734285893338348, "grad_norm": 6.367574691772461, "learning_rate": 1.729011420489329e-05, "loss": 0.0909, "step": 31490 }, { "epoch": 1.9740552735476593, "grad_norm": 0.18085089325904846, "learning_rate": 1.7279559223997803e-05, "loss": 0.1165, "step": 31500 }, { "epoch": 1.974681957761484, "grad_norm": 1.338959813117981, "learning_rate": 1.7269004243102323e-05, "loss": 0.044, "step": 31510 }, { "epoch": 1.9753086419753085, "grad_norm": 0.16457167267799377, "learning_rate": 1.7258449262206836e-05, "loss": 0.1149, "step": 31520 }, { "epoch": 1.9759353261891333, "grad_norm": 0.05906875804066658, "learning_rate": 1.7247894281311353e-05, "loss": 0.0059, "step": 31530 }, { "epoch": 1.976562010402958, "grad_norm": 0.09920290857553482, "learning_rate": 1.7237339300415866e-05, "loss": 0.3544, "step": 31540 }, { "epoch": 1.9771886946167827, "grad_norm": 0.11022662371397018, "learning_rate": 1.7226784319520383e-05, "loss": 0.1279, "step": 31550 }, { "epoch": 1.9778153788306072, "grad_norm": 0.21044260263442993, "learning_rate": 1.72162293386249e-05, "loss": 0.2122, "step": 31560 }, { "epoch": 1.978442063044432, "grad_norm": 3.5601449012756348, "learning_rate": 1.7205674357729413e-05, "loss": 0.0426, "step": 31570 }, { "epoch": 1.9790687472582564, "grad_norm": 0.15041138231754303, "learning_rate": 1.719511937683393e-05, "loss": 0.0905, "step": 31580 }, { "epoch": 1.9796954314720812, "grad_norm": 0.2093459963798523, "learning_rate": 1.7184564395938443e-05, "loss": 0.151, "step": 31590 }, { "epoch": 1.9803221156859059, "grad_norm": 0.0759732648730278, "learning_rate": 1.717400941504296e-05, "loss": 0.0065, "step": 31600 }, { "epoch": 1.9809487998997306, "grad_norm": 6.305356502532959, "learning_rate": 1.7163454434147476e-05, "loss": 0.2713, "step": 31610 }, { "epoch": 1.9815754841135553, "grad_norm": 0.15519599616527557, "learning_rate": 1.7152899453251992e-05, "loss": 0.0993, "step": 31620 }, { "epoch": 1.9822021683273798, "grad_norm": 0.022300608456134796, "learning_rate": 1.7142344472356506e-05, "loss": 0.1047, "step": 31630 }, { "epoch": 1.9828288525412043, "grad_norm": 0.06441289931535721, "learning_rate": 1.713178949146102e-05, "loss": 0.0446, "step": 31640 }, { "epoch": 1.983455536755029, "grad_norm": 0.13338226079940796, "learning_rate": 1.7121234510565536e-05, "loss": 0.0501, "step": 31650 }, { "epoch": 1.9840822209688538, "grad_norm": 0.06936602294445038, "learning_rate": 1.7110679529670052e-05, "loss": 0.0919, "step": 31660 }, { "epoch": 1.9847089051826785, "grad_norm": 0.03395620360970497, "learning_rate": 1.710012454877457e-05, "loss": 0.1159, "step": 31670 }, { "epoch": 1.9853355893965032, "grad_norm": 3.857844591140747, "learning_rate": 1.7089569567879082e-05, "loss": 0.0358, "step": 31680 }, { "epoch": 1.9859622736103277, "grad_norm": 12.544042587280273, "learning_rate": 1.70790145869836e-05, "loss": 0.1129, "step": 31690 }, { "epoch": 1.9865889578241525, "grad_norm": 0.047209981828927994, "learning_rate": 1.7068459606088112e-05, "loss": 0.0967, "step": 31700 }, { "epoch": 1.987215642037977, "grad_norm": 0.01187932025641203, "learning_rate": 1.7057904625192632e-05, "loss": 0.0957, "step": 31710 }, { "epoch": 1.9878423262518017, "grad_norm": 0.009124066680669785, "learning_rate": 1.7047349644297145e-05, "loss": 0.0752, "step": 31720 }, { "epoch": 1.9884690104656264, "grad_norm": 0.06783958524465561, "learning_rate": 1.703679466340166e-05, "loss": 0.0324, "step": 31730 }, { "epoch": 1.9890956946794511, "grad_norm": 4.436092376708984, "learning_rate": 1.7026239682506175e-05, "loss": 0.0974, "step": 31740 }, { "epoch": 1.9897223788932756, "grad_norm": 8.211563110351562, "learning_rate": 1.701568470161069e-05, "loss": 0.1157, "step": 31750 }, { "epoch": 1.9903490631071004, "grad_norm": 6.582380294799805, "learning_rate": 1.700512972071521e-05, "loss": 0.2133, "step": 31760 }, { "epoch": 1.9909757473209249, "grad_norm": 0.029442772269248962, "learning_rate": 1.6994574739819722e-05, "loss": 0.0751, "step": 31770 }, { "epoch": 1.9916024315347496, "grad_norm": 10.132109642028809, "learning_rate": 1.698401975892424e-05, "loss": 0.1387, "step": 31780 }, { "epoch": 1.9922291157485743, "grad_norm": 7.6260576248168945, "learning_rate": 1.6973464778028752e-05, "loss": 0.0754, "step": 31790 }, { "epoch": 1.992855799962399, "grad_norm": 0.12112826108932495, "learning_rate": 1.696290979713327e-05, "loss": 0.0355, "step": 31800 }, { "epoch": 1.9934824841762238, "grad_norm": 0.12398527562618256, "learning_rate": 1.6952354816237785e-05, "loss": 0.1311, "step": 31810 }, { "epoch": 1.9941091683900483, "grad_norm": 0.023617800325155258, "learning_rate": 1.69417998353423e-05, "loss": 0.0997, "step": 31820 }, { "epoch": 1.9947358526038728, "grad_norm": 0.919659435749054, "learning_rate": 1.6931244854446815e-05, "loss": 0.0071, "step": 31830 }, { "epoch": 1.9953625368176975, "grad_norm": 0.2467774897813797, "learning_rate": 1.6920689873551328e-05, "loss": 0.0057, "step": 31840 }, { "epoch": 1.9959892210315222, "grad_norm": 7.434739112854004, "learning_rate": 1.6910134892655845e-05, "loss": 0.1482, "step": 31850 }, { "epoch": 1.996615905245347, "grad_norm": 0.011252244003117085, "learning_rate": 1.689957991176036e-05, "loss": 0.1319, "step": 31860 }, { "epoch": 1.9972425894591717, "grad_norm": 0.03588128834962845, "learning_rate": 1.6889024930864875e-05, "loss": 0.0033, "step": 31870 }, { "epoch": 1.9978692736729962, "grad_norm": 0.03404240682721138, "learning_rate": 1.687846994996939e-05, "loss": 0.1253, "step": 31880 }, { "epoch": 1.9984959578868209, "grad_norm": 5.999427318572998, "learning_rate": 1.6867914969073905e-05, "loss": 0.2037, "step": 31890 }, { "epoch": 1.9991226421006454, "grad_norm": 0.19319628179073334, "learning_rate": 1.685735998817842e-05, "loss": 0.0178, "step": 31900 }, { "epoch": 1.99974932631447, "grad_norm": 8.734156608581543, "learning_rate": 1.6846805007282938e-05, "loss": 0.1435, "step": 31910 }, { "epoch": 2.0, "eval_accuracy": 0.9659094469685101, "eval_f1": 0.9651640718639931, "eval_loss": 0.1303279846906662, "eval_precision": 0.9648435825911362, "eval_recall": 0.9659094469685101, "eval_runtime": 288.0737, "eval_samples_per_second": 110.788, "eval_steps_per_second": 13.851, "step": 31914 }, { "epoch": 2.000376010528295, "grad_norm": 0.5314622521400452, "learning_rate": 1.6836250026387455e-05, "loss": 0.0033, "step": 31920 }, { "epoch": 2.0010026947421196, "grad_norm": 0.09729882329702377, "learning_rate": 1.6825695045491968e-05, "loss": 0.0044, "step": 31930 }, { "epoch": 2.0016293789559443, "grad_norm": 0.02791699394583702, "learning_rate": 1.6815140064596485e-05, "loss": 0.0358, "step": 31940 }, { "epoch": 2.0022560631697686, "grad_norm": 9.531925201416016, "learning_rate": 1.6804585083700998e-05, "loss": 0.0777, "step": 31950 }, { "epoch": 2.0028827473835933, "grad_norm": 0.03661125525832176, "learning_rate": 1.6794030102805515e-05, "loss": 0.1605, "step": 31960 }, { "epoch": 2.003509431597418, "grad_norm": 0.055404722690582275, "learning_rate": 1.678347512191003e-05, "loss": 0.1094, "step": 31970 }, { "epoch": 2.0041361158112427, "grad_norm": 18.206174850463867, "learning_rate": 1.6772920141014544e-05, "loss": 0.0729, "step": 31980 }, { "epoch": 2.0047628000250675, "grad_norm": 0.02194293774664402, "learning_rate": 1.676236516011906e-05, "loss": 0.0643, "step": 31990 }, { "epoch": 2.005389484238892, "grad_norm": 0.046822380274534225, "learning_rate": 1.6751810179223574e-05, "loss": 0.0398, "step": 32000 }, { "epoch": 2.0060161684527165, "grad_norm": 0.021439900621771812, "learning_rate": 1.6741255198328094e-05, "loss": 0.0673, "step": 32010 }, { "epoch": 2.006642852666541, "grad_norm": 0.1626669466495514, "learning_rate": 1.6730700217432608e-05, "loss": 0.158, "step": 32020 }, { "epoch": 2.007269536880366, "grad_norm": 0.24504497647285461, "learning_rate": 1.672014523653712e-05, "loss": 0.0311, "step": 32030 }, { "epoch": 2.0078962210941906, "grad_norm": 0.02000507339835167, "learning_rate": 1.6709590255641638e-05, "loss": 0.0054, "step": 32040 }, { "epoch": 2.0085229053080154, "grad_norm": 1.3739451169967651, "learning_rate": 1.6699035274746154e-05, "loss": 0.0444, "step": 32050 }, { "epoch": 2.00914958952184, "grad_norm": 0.028412913903594017, "learning_rate": 1.668848029385067e-05, "loss": 0.0455, "step": 32060 }, { "epoch": 2.009776273735665, "grad_norm": 0.13429409265518188, "learning_rate": 1.6677925312955184e-05, "loss": 0.0704, "step": 32070 }, { "epoch": 2.010402957949489, "grad_norm": 0.25238388776779175, "learning_rate": 1.66673703320597e-05, "loss": 0.0777, "step": 32080 }, { "epoch": 2.011029642163314, "grad_norm": 0.03479380160570145, "learning_rate": 1.6656815351164214e-05, "loss": 0.025, "step": 32090 }, { "epoch": 2.0116563263771385, "grad_norm": 0.01721784472465515, "learning_rate": 1.664626037026873e-05, "loss": 0.033, "step": 32100 }, { "epoch": 2.0122830105909633, "grad_norm": 0.024397028610110283, "learning_rate": 1.6635705389373247e-05, "loss": 0.1333, "step": 32110 }, { "epoch": 2.012909694804788, "grad_norm": 0.009649792686104774, "learning_rate": 1.662515040847776e-05, "loss": 0.0016, "step": 32120 }, { "epoch": 2.0135363790186127, "grad_norm": 0.037589482963085175, "learning_rate": 1.6614595427582277e-05, "loss": 0.0221, "step": 32130 }, { "epoch": 2.014163063232437, "grad_norm": 0.04780086502432823, "learning_rate": 1.660404044668679e-05, "loss": 0.1299, "step": 32140 }, { "epoch": 2.0147897474462617, "grad_norm": 14.039616584777832, "learning_rate": 1.6593485465791307e-05, "loss": 0.0447, "step": 32150 }, { "epoch": 2.0154164316600864, "grad_norm": 0.18234844505786896, "learning_rate": 1.6582930484895824e-05, "loss": 0.1608, "step": 32160 }, { "epoch": 2.016043115873911, "grad_norm": 44.759552001953125, "learning_rate": 1.657237550400034e-05, "loss": 0.1685, "step": 32170 }, { "epoch": 2.016669800087736, "grad_norm": 6.447089672088623, "learning_rate": 1.6561820523104854e-05, "loss": 0.0545, "step": 32180 }, { "epoch": 2.0172964843015606, "grad_norm": 8.790861129760742, "learning_rate": 1.6551265542209367e-05, "loss": 0.0656, "step": 32190 }, { "epoch": 2.017923168515385, "grad_norm": 0.033416662365198135, "learning_rate": 1.6540710561313884e-05, "loss": 0.0715, "step": 32200 }, { "epoch": 2.0185498527292096, "grad_norm": 0.03852026164531708, "learning_rate": 1.65301555804184e-05, "loss": 0.0336, "step": 32210 }, { "epoch": 2.0191765369430343, "grad_norm": 0.06000113487243652, "learning_rate": 1.6519600599522917e-05, "loss": 0.1005, "step": 32220 }, { "epoch": 2.019803221156859, "grad_norm": 0.05467413365840912, "learning_rate": 1.650904561862743e-05, "loss": 0.0024, "step": 32230 }, { "epoch": 2.020429905370684, "grad_norm": 0.5257253646850586, "learning_rate": 1.6498490637731947e-05, "loss": 0.1164, "step": 32240 }, { "epoch": 2.0210565895845085, "grad_norm": 0.007564585190266371, "learning_rate": 1.648793565683646e-05, "loss": 0.0011, "step": 32250 }, { "epoch": 2.0216832737983332, "grad_norm": 0.01452536229044199, "learning_rate": 1.6477380675940977e-05, "loss": 0.1252, "step": 32260 }, { "epoch": 2.0223099580121575, "grad_norm": 0.05928044021129608, "learning_rate": 1.6466825695045493e-05, "loss": 0.1359, "step": 32270 }, { "epoch": 2.0229366422259822, "grad_norm": 0.030591899529099464, "learning_rate": 1.6456270714150007e-05, "loss": 0.1113, "step": 32280 }, { "epoch": 2.023563326439807, "grad_norm": 0.016560450196266174, "learning_rate": 1.6445715733254523e-05, "loss": 0.0031, "step": 32290 }, { "epoch": 2.0241900106536317, "grad_norm": 0.04848318174481392, "learning_rate": 1.643516075235904e-05, "loss": 0.1171, "step": 32300 }, { "epoch": 2.0248166948674564, "grad_norm": 0.062350522726774216, "learning_rate": 1.6424605771463557e-05, "loss": 0.0333, "step": 32310 }, { "epoch": 2.025443379081281, "grad_norm": 0.1351897269487381, "learning_rate": 1.641405079056807e-05, "loss": 0.0522, "step": 32320 }, { "epoch": 2.0260700632951054, "grad_norm": 0.12988221645355225, "learning_rate": 1.6403495809672587e-05, "loss": 0.0363, "step": 32330 }, { "epoch": 2.02669674750893, "grad_norm": 14.142264366149902, "learning_rate": 1.63929408287771e-05, "loss": 0.1323, "step": 32340 }, { "epoch": 2.027323431722755, "grad_norm": 0.017919203266501427, "learning_rate": 1.6382385847881617e-05, "loss": 0.0035, "step": 32350 }, { "epoch": 2.0279501159365796, "grad_norm": 0.04357292130589485, "learning_rate": 1.6371830866986133e-05, "loss": 0.0871, "step": 32360 }, { "epoch": 2.0285768001504043, "grad_norm": 0.6641724705696106, "learning_rate": 1.6361275886090646e-05, "loss": 0.0236, "step": 32370 }, { "epoch": 2.029203484364229, "grad_norm": 0.39285650849342346, "learning_rate": 1.6350720905195163e-05, "loss": 0.0043, "step": 32380 }, { "epoch": 2.0298301685780533, "grad_norm": 0.14640651643276215, "learning_rate": 1.6340165924299676e-05, "loss": 0.0778, "step": 32390 }, { "epoch": 2.030456852791878, "grad_norm": 0.006762874778360128, "learning_rate": 1.6329610943404193e-05, "loss": 0.1194, "step": 32400 }, { "epoch": 2.0310835370057028, "grad_norm": 0.7191759943962097, "learning_rate": 1.631905596250871e-05, "loss": 0.0866, "step": 32410 }, { "epoch": 2.0317102212195275, "grad_norm": 0.007037244271486998, "learning_rate": 1.6308500981613223e-05, "loss": 0.1469, "step": 32420 }, { "epoch": 2.032336905433352, "grad_norm": 0.46012216806411743, "learning_rate": 1.629794600071774e-05, "loss": 0.0055, "step": 32430 }, { "epoch": 2.032963589647177, "grad_norm": 10.74429702758789, "learning_rate": 1.6287391019822253e-05, "loss": 0.1375, "step": 32440 }, { "epoch": 2.0335902738610017, "grad_norm": 0.01474764384329319, "learning_rate": 1.627683603892677e-05, "loss": 0.0703, "step": 32450 }, { "epoch": 2.034216958074826, "grad_norm": 0.51497882604599, "learning_rate": 1.6266281058031286e-05, "loss": 0.0588, "step": 32460 }, { "epoch": 2.0348436422886507, "grad_norm": 5.204861164093018, "learning_rate": 1.6255726077135803e-05, "loss": 0.0556, "step": 32470 }, { "epoch": 2.0354703265024754, "grad_norm": 0.11711680889129639, "learning_rate": 1.6245171096240316e-05, "loss": 0.0082, "step": 32480 }, { "epoch": 2.0360970107163, "grad_norm": 0.026732532307505608, "learning_rate": 1.623461611534483e-05, "loss": 0.0287, "step": 32490 }, { "epoch": 2.036723694930125, "grad_norm": 0.21844738721847534, "learning_rate": 1.6224061134449346e-05, "loss": 0.1675, "step": 32500 }, { "epoch": 2.0373503791439496, "grad_norm": 0.10609474033117294, "learning_rate": 1.6213506153553863e-05, "loss": 0.0282, "step": 32510 }, { "epoch": 2.037977063357774, "grad_norm": 0.40876927971839905, "learning_rate": 1.620295117265838e-05, "loss": 0.0028, "step": 32520 }, { "epoch": 2.0386037475715986, "grad_norm": 0.03981143981218338, "learning_rate": 1.6192396191762893e-05, "loss": 0.0257, "step": 32530 }, { "epoch": 2.0392304317854233, "grad_norm": 0.20432570576667786, "learning_rate": 1.618184121086741e-05, "loss": 0.1235, "step": 32540 }, { "epoch": 2.039857115999248, "grad_norm": 0.09158885478973389, "learning_rate": 1.6171286229971926e-05, "loss": 0.0567, "step": 32550 }, { "epoch": 2.0404838002130727, "grad_norm": 0.09947940707206726, "learning_rate": 1.6160731249076442e-05, "loss": 0.1098, "step": 32560 }, { "epoch": 2.0411104844268975, "grad_norm": 0.004586049355566502, "learning_rate": 1.6150176268180956e-05, "loss": 0.0011, "step": 32570 }, { "epoch": 2.0417371686407217, "grad_norm": 0.02537638321518898, "learning_rate": 1.613962128728547e-05, "loss": 0.0755, "step": 32580 }, { "epoch": 2.0423638528545465, "grad_norm": 3.6461362838745117, "learning_rate": 1.6129066306389986e-05, "loss": 0.0686, "step": 32590 }, { "epoch": 2.042990537068371, "grad_norm": 0.021860415115952492, "learning_rate": 1.6118511325494502e-05, "loss": 0.0257, "step": 32600 }, { "epoch": 2.043617221282196, "grad_norm": 0.004400935955345631, "learning_rate": 1.610795634459902e-05, "loss": 0.1238, "step": 32610 }, { "epoch": 2.0442439054960206, "grad_norm": 0.009431498125195503, "learning_rate": 1.6097401363703532e-05, "loss": 0.0238, "step": 32620 }, { "epoch": 2.0448705897098454, "grad_norm": 0.026406893506646156, "learning_rate": 1.608684638280805e-05, "loss": 0.1584, "step": 32630 }, { "epoch": 2.0454972739236696, "grad_norm": 0.01052199024707079, "learning_rate": 1.6076291401912562e-05, "loss": 0.0757, "step": 32640 }, { "epoch": 2.0461239581374944, "grad_norm": 0.012562397867441177, "learning_rate": 1.606573642101708e-05, "loss": 0.0015, "step": 32650 }, { "epoch": 2.046750642351319, "grad_norm": 6.99421501159668, "learning_rate": 1.6055181440121595e-05, "loss": 0.1914, "step": 32660 }, { "epoch": 2.047377326565144, "grad_norm": 0.0524737723171711, "learning_rate": 1.604462645922611e-05, "loss": 0.0706, "step": 32670 }, { "epoch": 2.0480040107789685, "grad_norm": 0.019907524809241295, "learning_rate": 1.6034071478330625e-05, "loss": 0.0021, "step": 32680 }, { "epoch": 2.0486306949927933, "grad_norm": 0.44236546754837036, "learning_rate": 1.602351649743514e-05, "loss": 0.1079, "step": 32690 }, { "epoch": 2.049257379206618, "grad_norm": 0.02534150891005993, "learning_rate": 1.6012961516539655e-05, "loss": 0.0519, "step": 32700 }, { "epoch": 2.0498840634204423, "grad_norm": 0.19184216856956482, "learning_rate": 1.6002406535644172e-05, "loss": 0.0912, "step": 32710 }, { "epoch": 2.050510747634267, "grad_norm": 0.3994498550891876, "learning_rate": 1.599185155474869e-05, "loss": 0.0373, "step": 32720 }, { "epoch": 2.0511374318480917, "grad_norm": 0.004602975212037563, "learning_rate": 1.5981296573853202e-05, "loss": 0.0745, "step": 32730 }, { "epoch": 2.0517641160619164, "grad_norm": 0.057611722499132156, "learning_rate": 1.5970741592957715e-05, "loss": 0.0013, "step": 32740 }, { "epoch": 2.052390800275741, "grad_norm": 0.004064835608005524, "learning_rate": 1.5960186612062232e-05, "loss": 0.0731, "step": 32750 }, { "epoch": 2.053017484489566, "grad_norm": 0.04619257152080536, "learning_rate": 1.594963163116675e-05, "loss": 0.0281, "step": 32760 }, { "epoch": 2.05364416870339, "grad_norm": 0.8780233263969421, "learning_rate": 1.5939076650271265e-05, "loss": 0.1676, "step": 32770 }, { "epoch": 2.054270852917215, "grad_norm": 4.63099479675293, "learning_rate": 1.592852166937578e-05, "loss": 0.0789, "step": 32780 }, { "epoch": 2.0548975371310396, "grad_norm": 0.012038362212479115, "learning_rate": 1.5917966688480295e-05, "loss": 0.0009, "step": 32790 }, { "epoch": 2.0555242213448643, "grad_norm": 0.006544423755258322, "learning_rate": 1.590741170758481e-05, "loss": 0.0943, "step": 32800 }, { "epoch": 2.056150905558689, "grad_norm": 0.023845335468649864, "learning_rate": 1.5896856726689325e-05, "loss": 0.0077, "step": 32810 }, { "epoch": 2.056777589772514, "grad_norm": 21.74514389038086, "learning_rate": 1.588630174579384e-05, "loss": 0.0326, "step": 32820 }, { "epoch": 2.057404273986338, "grad_norm": 0.04374400153756142, "learning_rate": 1.5875746764898355e-05, "loss": 0.3699, "step": 32830 }, { "epoch": 2.058030958200163, "grad_norm": 0.160582035779953, "learning_rate": 1.586519178400287e-05, "loss": 0.1196, "step": 32840 }, { "epoch": 2.0586576424139875, "grad_norm": 0.36026671528816223, "learning_rate": 1.5854636803107388e-05, "loss": 0.0051, "step": 32850 }, { "epoch": 2.0592843266278122, "grad_norm": 0.07786229997873306, "learning_rate": 1.5844081822211905e-05, "loss": 0.0447, "step": 32860 }, { "epoch": 2.059911010841637, "grad_norm": 0.37609755992889404, "learning_rate": 1.5833526841316418e-05, "loss": 0.0557, "step": 32870 }, { "epoch": 2.0605376950554617, "grad_norm": 3.595386028289795, "learning_rate": 1.582297186042093e-05, "loss": 0.1036, "step": 32880 }, { "epoch": 2.0611643792692864, "grad_norm": 0.2091987282037735, "learning_rate": 1.5812416879525448e-05, "loss": 0.1075, "step": 32890 }, { "epoch": 2.0617910634831107, "grad_norm": 0.09946735948324203, "learning_rate": 1.5801861898629965e-05, "loss": 0.0028, "step": 32900 }, { "epoch": 2.0624177476969354, "grad_norm": 0.05795929208397865, "learning_rate": 1.579130691773448e-05, "loss": 0.0835, "step": 32910 }, { "epoch": 2.06304443191076, "grad_norm": 0.017441829666495323, "learning_rate": 1.5780751936838994e-05, "loss": 0.128, "step": 32920 }, { "epoch": 2.063671116124585, "grad_norm": 0.10752920806407928, "learning_rate": 1.577019695594351e-05, "loss": 0.042, "step": 32930 }, { "epoch": 2.0642978003384096, "grad_norm": 0.06938701122999191, "learning_rate": 1.5759641975048024e-05, "loss": 0.0017, "step": 32940 }, { "epoch": 2.0649244845522343, "grad_norm": 0.4177788496017456, "learning_rate": 1.574908699415254e-05, "loss": 0.0966, "step": 32950 }, { "epoch": 2.0655511687660586, "grad_norm": 0.1389980912208557, "learning_rate": 1.5738532013257058e-05, "loss": 0.0348, "step": 32960 }, { "epoch": 2.0661778529798833, "grad_norm": 0.0164639949798584, "learning_rate": 1.572797703236157e-05, "loss": 0.0025, "step": 32970 }, { "epoch": 2.066804537193708, "grad_norm": 0.043135009706020355, "learning_rate": 1.5717422051466088e-05, "loss": 0.0307, "step": 32980 }, { "epoch": 2.0674312214075328, "grad_norm": 0.011760778725147247, "learning_rate": 1.57068670705706e-05, "loss": 0.0655, "step": 32990 }, { "epoch": 2.0680579056213575, "grad_norm": 5.448158264160156, "learning_rate": 1.569631208967512e-05, "loss": 0.0796, "step": 33000 }, { "epoch": 2.068684589835182, "grad_norm": 1.601997971534729, "learning_rate": 1.5685757108779634e-05, "loss": 0.0354, "step": 33010 }, { "epoch": 2.0693112740490065, "grad_norm": 0.006840604357421398, "learning_rate": 1.567520212788415e-05, "loss": 0.1773, "step": 33020 }, { "epoch": 2.0699379582628312, "grad_norm": 11.032891273498535, "learning_rate": 1.5664647146988664e-05, "loss": 0.1046, "step": 33030 }, { "epoch": 2.070564642476656, "grad_norm": 0.19698861241340637, "learning_rate": 1.5654092166093177e-05, "loss": 0.2586, "step": 33040 }, { "epoch": 2.0711913266904807, "grad_norm": 0.009995955042541027, "learning_rate": 1.5643537185197697e-05, "loss": 0.1307, "step": 33050 }, { "epoch": 2.0718180109043054, "grad_norm": 0.4533624053001404, "learning_rate": 1.563298220430221e-05, "loss": 0.057, "step": 33060 }, { "epoch": 2.07244469511813, "grad_norm": 0.14136387407779694, "learning_rate": 1.5622427223406727e-05, "loss": 0.1027, "step": 33070 }, { "epoch": 2.073071379331955, "grad_norm": 0.011321029625833035, "learning_rate": 1.561187224251124e-05, "loss": 0.0377, "step": 33080 }, { "epoch": 2.073698063545779, "grad_norm": 0.09532028436660767, "learning_rate": 1.5601317261615757e-05, "loss": 0.0028, "step": 33090 }, { "epoch": 2.074324747759604, "grad_norm": 0.013551967218518257, "learning_rate": 1.5590762280720274e-05, "loss": 0.3631, "step": 33100 }, { "epoch": 2.0749514319734286, "grad_norm": 0.023686746135354042, "learning_rate": 1.558020729982479e-05, "loss": 0.0351, "step": 33110 }, { "epoch": 2.0755781161872533, "grad_norm": 0.0980924665927887, "learning_rate": 1.5569652318929304e-05, "loss": 0.0031, "step": 33120 }, { "epoch": 2.076204800401078, "grad_norm": 0.024400917813181877, "learning_rate": 1.5559097338033817e-05, "loss": 0.0048, "step": 33130 }, { "epoch": 2.0768314846149027, "grad_norm": 0.01752445660531521, "learning_rate": 1.5548542357138334e-05, "loss": 0.0031, "step": 33140 }, { "epoch": 2.077458168828727, "grad_norm": 11.409141540527344, "learning_rate": 1.553798737624285e-05, "loss": 0.1458, "step": 33150 }, { "epoch": 2.0780848530425517, "grad_norm": 0.10126788914203644, "learning_rate": 1.5527432395347367e-05, "loss": 0.0606, "step": 33160 }, { "epoch": 2.0787115372563765, "grad_norm": 0.03489801287651062, "learning_rate": 1.551687741445188e-05, "loss": 0.004, "step": 33170 }, { "epoch": 2.079338221470201, "grad_norm": 0.07026004046201706, "learning_rate": 1.5506322433556397e-05, "loss": 0.1357, "step": 33180 }, { "epoch": 2.079964905684026, "grad_norm": 0.2072756290435791, "learning_rate": 1.549576745266091e-05, "loss": 0.0958, "step": 33190 }, { "epoch": 2.0805915898978506, "grad_norm": 9.07013988494873, "learning_rate": 1.5485212471765427e-05, "loss": 0.2482, "step": 33200 }, { "epoch": 2.081218274111675, "grad_norm": 4.005678653717041, "learning_rate": 1.5474657490869943e-05, "loss": 0.2936, "step": 33210 }, { "epoch": 2.0818449583254997, "grad_norm": 0.03268539160490036, "learning_rate": 1.5464102509974457e-05, "loss": 0.1032, "step": 33220 }, { "epoch": 2.0824716425393244, "grad_norm": 0.9119426608085632, "learning_rate": 1.5453547529078973e-05, "loss": 0.0428, "step": 33230 }, { "epoch": 2.083098326753149, "grad_norm": 0.14467692375183105, "learning_rate": 1.5442992548183487e-05, "loss": 0.0198, "step": 33240 }, { "epoch": 2.083725010966974, "grad_norm": 0.19485610723495483, "learning_rate": 1.5432437567288007e-05, "loss": 0.0031, "step": 33250 }, { "epoch": 2.0843516951807985, "grad_norm": 0.09321003407239914, "learning_rate": 1.542188258639252e-05, "loss": 0.1007, "step": 33260 }, { "epoch": 2.084978379394623, "grad_norm": 0.01489013247191906, "learning_rate": 1.5411327605497033e-05, "loss": 0.0854, "step": 33270 }, { "epoch": 2.0856050636084476, "grad_norm": 0.022184649482369423, "learning_rate": 1.540077262460155e-05, "loss": 0.0329, "step": 33280 }, { "epoch": 2.0862317478222723, "grad_norm": 0.942670464515686, "learning_rate": 1.5390217643706063e-05, "loss": 0.1323, "step": 33290 }, { "epoch": 2.086858432036097, "grad_norm": 1.3526219129562378, "learning_rate": 1.5379662662810583e-05, "loss": 0.1154, "step": 33300 }, { "epoch": 2.0874851162499217, "grad_norm": 0.050650764256715775, "learning_rate": 1.5369107681915096e-05, "loss": 0.0038, "step": 33310 }, { "epoch": 2.0881118004637464, "grad_norm": 0.15342023968696594, "learning_rate": 1.5358552701019613e-05, "loss": 0.0759, "step": 33320 }, { "epoch": 2.088738484677571, "grad_norm": 0.021919699385762215, "learning_rate": 1.5347997720124126e-05, "loss": 0.1006, "step": 33330 }, { "epoch": 2.0893651688913955, "grad_norm": 53.63493728637695, "learning_rate": 1.5337442739228643e-05, "loss": 0.1111, "step": 33340 }, { "epoch": 2.08999185310522, "grad_norm": 4.8673810958862305, "learning_rate": 1.532688775833316e-05, "loss": 0.0968, "step": 33350 }, { "epoch": 2.090618537319045, "grad_norm": 0.17812536656856537, "learning_rate": 1.5316332777437673e-05, "loss": 0.0357, "step": 33360 }, { "epoch": 2.0912452215328696, "grad_norm": 0.13300246000289917, "learning_rate": 1.530577779654219e-05, "loss": 0.0692, "step": 33370 }, { "epoch": 2.0918719057466943, "grad_norm": 0.08312509953975677, "learning_rate": 1.5295222815646703e-05, "loss": 0.0561, "step": 33380 }, { "epoch": 2.092498589960519, "grad_norm": 0.06152992323040962, "learning_rate": 1.528466783475122e-05, "loss": 0.0272, "step": 33390 }, { "epoch": 2.0931252741743434, "grad_norm": 0.13552305102348328, "learning_rate": 1.5274112853855736e-05, "loss": 0.1758, "step": 33400 }, { "epoch": 2.093751958388168, "grad_norm": 0.19440507888793945, "learning_rate": 1.5263557872960253e-05, "loss": 0.0442, "step": 33410 }, { "epoch": 2.094378642601993, "grad_norm": 21.867822647094727, "learning_rate": 1.5253002892064766e-05, "loss": 0.0412, "step": 33420 }, { "epoch": 2.0950053268158175, "grad_norm": 9.543660163879395, "learning_rate": 1.5242447911169281e-05, "loss": 0.0348, "step": 33430 }, { "epoch": 2.0956320110296422, "grad_norm": 0.027133401483297348, "learning_rate": 1.5231892930273798e-05, "loss": 0.0583, "step": 33440 }, { "epoch": 2.096258695243467, "grad_norm": 0.3416101038455963, "learning_rate": 1.5221337949378311e-05, "loss": 0.0022, "step": 33450 }, { "epoch": 2.0968853794572917, "grad_norm": 8.35288143157959, "learning_rate": 1.521078296848283e-05, "loss": 0.0551, "step": 33460 }, { "epoch": 2.097512063671116, "grad_norm": 0.008955095894634724, "learning_rate": 1.5200227987587343e-05, "loss": 0.0178, "step": 33470 }, { "epoch": 2.0981387478849407, "grad_norm": 6.15145206451416, "learning_rate": 1.518967300669186e-05, "loss": 0.1334, "step": 33480 }, { "epoch": 2.0987654320987654, "grad_norm": 0.016260214149951935, "learning_rate": 1.5179118025796374e-05, "loss": 0.026, "step": 33490 }, { "epoch": 2.09939211631259, "grad_norm": 0.5407869219779968, "learning_rate": 1.5168563044900887e-05, "loss": 0.0199, "step": 33500 }, { "epoch": 2.100018800526415, "grad_norm": 0.01398592721670866, "learning_rate": 1.5158008064005406e-05, "loss": 0.0227, "step": 33510 }, { "epoch": 2.1006454847402396, "grad_norm": 0.0240841805934906, "learning_rate": 1.5147453083109919e-05, "loss": 0.2086, "step": 33520 }, { "epoch": 2.101272168954064, "grad_norm": 4.998007297515869, "learning_rate": 1.5136898102214436e-05, "loss": 0.0781, "step": 33530 }, { "epoch": 2.1018988531678886, "grad_norm": 1.8973308801651, "learning_rate": 1.512634312131895e-05, "loss": 0.1497, "step": 33540 }, { "epoch": 2.1025255373817133, "grad_norm": 0.041549600660800934, "learning_rate": 1.5115788140423467e-05, "loss": 0.0448, "step": 33550 }, { "epoch": 2.103152221595538, "grad_norm": 1.6395976543426514, "learning_rate": 1.5105233159527982e-05, "loss": 0.0626, "step": 33560 }, { "epoch": 2.1037789058093628, "grad_norm": 0.01420541387051344, "learning_rate": 1.5094678178632499e-05, "loss": 0.04, "step": 33570 }, { "epoch": 2.1044055900231875, "grad_norm": 5.16315221786499, "learning_rate": 1.5084123197737012e-05, "loss": 0.145, "step": 33580 }, { "epoch": 2.105032274237012, "grad_norm": 0.022046558558940887, "learning_rate": 1.5073568216841527e-05, "loss": 0.0271, "step": 33590 }, { "epoch": 2.1056589584508365, "grad_norm": 0.05051284655928612, "learning_rate": 1.5063013235946044e-05, "loss": 0.0417, "step": 33600 }, { "epoch": 2.1062856426646612, "grad_norm": 0.23544947803020477, "learning_rate": 1.5052458255050559e-05, "loss": 0.0307, "step": 33610 }, { "epoch": 2.106912326878486, "grad_norm": 0.01835622452199459, "learning_rate": 1.5041903274155075e-05, "loss": 0.0437, "step": 33620 }, { "epoch": 2.1075390110923107, "grad_norm": 0.07070131599903107, "learning_rate": 1.503134829325959e-05, "loss": 0.056, "step": 33630 }, { "epoch": 2.1081656953061354, "grad_norm": 0.02240445651113987, "learning_rate": 1.5020793312364107e-05, "loss": 0.1872, "step": 33640 }, { "epoch": 2.1087923795199597, "grad_norm": 0.16485477983951569, "learning_rate": 1.501023833146862e-05, "loss": 0.0741, "step": 33650 }, { "epoch": 2.1094190637337844, "grad_norm": 6.441537857055664, "learning_rate": 1.4999683350573135e-05, "loss": 0.091, "step": 33660 }, { "epoch": 2.110045747947609, "grad_norm": 7.90993070602417, "learning_rate": 1.4989128369677652e-05, "loss": 0.037, "step": 33670 }, { "epoch": 2.110672432161434, "grad_norm": 0.027580685913562775, "learning_rate": 1.4978573388782167e-05, "loss": 0.0665, "step": 33680 }, { "epoch": 2.1112991163752586, "grad_norm": 0.021920882165431976, "learning_rate": 1.4968018407886683e-05, "loss": 0.0015, "step": 33690 }, { "epoch": 2.1119258005890833, "grad_norm": 0.0395321287214756, "learning_rate": 1.4957463426991197e-05, "loss": 0.1043, "step": 33700 }, { "epoch": 2.112552484802908, "grad_norm": 0.015341048128902912, "learning_rate": 1.4946908446095715e-05, "loss": 0.0992, "step": 33710 }, { "epoch": 2.1131791690167323, "grad_norm": 0.032132405787706375, "learning_rate": 1.4936353465200228e-05, "loss": 0.0482, "step": 33720 }, { "epoch": 2.113805853230557, "grad_norm": 0.016636233776807785, "learning_rate": 1.4925798484304745e-05, "loss": 0.099, "step": 33730 }, { "epoch": 2.1144325374443818, "grad_norm": 0.021241918206214905, "learning_rate": 1.491524350340926e-05, "loss": 0.0018, "step": 33740 }, { "epoch": 2.1150592216582065, "grad_norm": 0.01900082267820835, "learning_rate": 1.4904688522513773e-05, "loss": 0.1138, "step": 33750 }, { "epoch": 2.115685905872031, "grad_norm": 10.862110137939453, "learning_rate": 1.4894133541618292e-05, "loss": 0.0826, "step": 33760 }, { "epoch": 2.116312590085856, "grad_norm": 0.825947642326355, "learning_rate": 1.4883578560722805e-05, "loss": 0.0336, "step": 33770 }, { "epoch": 2.11693927429968, "grad_norm": 35.69391632080078, "learning_rate": 1.4873023579827321e-05, "loss": 0.0781, "step": 33780 }, { "epoch": 2.117565958513505, "grad_norm": 0.02140156924724579, "learning_rate": 1.4862468598931836e-05, "loss": 0.0309, "step": 33790 }, { "epoch": 2.1181926427273297, "grad_norm": 0.17433586716651917, "learning_rate": 1.4851913618036353e-05, "loss": 0.001, "step": 33800 }, { "epoch": 2.1188193269411544, "grad_norm": 0.011882697232067585, "learning_rate": 1.4841358637140868e-05, "loss": 0.0749, "step": 33810 }, { "epoch": 2.119446011154979, "grad_norm": 7.203928470611572, "learning_rate": 1.4830803656245381e-05, "loss": 0.171, "step": 33820 }, { "epoch": 2.120072695368804, "grad_norm": 0.020683668553829193, "learning_rate": 1.4820248675349898e-05, "loss": 0.0225, "step": 33830 }, { "epoch": 2.120699379582628, "grad_norm": 4.708274841308594, "learning_rate": 1.4809693694454413e-05, "loss": 0.1266, "step": 33840 }, { "epoch": 2.121326063796453, "grad_norm": 0.2545260190963745, "learning_rate": 1.479913871355893e-05, "loss": 0.0061, "step": 33850 }, { "epoch": 2.1219527480102776, "grad_norm": 0.06264783442020416, "learning_rate": 1.4788583732663445e-05, "loss": 0.0348, "step": 33860 }, { "epoch": 2.1225794322241023, "grad_norm": 0.04944666475057602, "learning_rate": 1.4778028751767961e-05, "loss": 0.0564, "step": 33870 }, { "epoch": 2.123206116437927, "grad_norm": 0.05640007555484772, "learning_rate": 1.4767473770872476e-05, "loss": 0.1237, "step": 33880 }, { "epoch": 2.1238328006517517, "grad_norm": 0.031840428709983826, "learning_rate": 1.475691878997699e-05, "loss": 0.062, "step": 33890 }, { "epoch": 2.124459484865576, "grad_norm": 0.7252791523933411, "learning_rate": 1.4746363809081506e-05, "loss": 0.0555, "step": 33900 }, { "epoch": 2.1250861690794007, "grad_norm": 0.09943141043186188, "learning_rate": 1.4735808828186021e-05, "loss": 0.1402, "step": 33910 }, { "epoch": 2.1257128532932255, "grad_norm": 0.040643591433763504, "learning_rate": 1.4725253847290538e-05, "loss": 0.002, "step": 33920 }, { "epoch": 2.12633953750705, "grad_norm": 0.06584443897008896, "learning_rate": 1.4714698866395053e-05, "loss": 0.0877, "step": 33930 }, { "epoch": 2.126966221720875, "grad_norm": 0.03503456711769104, "learning_rate": 1.470414388549957e-05, "loss": 0.0065, "step": 33940 }, { "epoch": 2.1275929059346996, "grad_norm": 14.166348457336426, "learning_rate": 1.4693588904604083e-05, "loss": 0.143, "step": 33950 }, { "epoch": 2.1282195901485244, "grad_norm": 35.32097625732422, "learning_rate": 1.4683033923708601e-05, "loss": 0.1444, "step": 33960 }, { "epoch": 2.1288462743623486, "grad_norm": 0.06533791869878769, "learning_rate": 1.4672478942813114e-05, "loss": 0.0792, "step": 33970 }, { "epoch": 2.1294729585761734, "grad_norm": 0.1732708066701889, "learning_rate": 1.4661923961917629e-05, "loss": 0.1347, "step": 33980 }, { "epoch": 2.130099642789998, "grad_norm": 11.223489761352539, "learning_rate": 1.4651368981022146e-05, "loss": 0.0402, "step": 33990 }, { "epoch": 2.130726327003823, "grad_norm": 0.30682894587516785, "learning_rate": 1.4640814000126659e-05, "loss": 0.0591, "step": 34000 }, { "epoch": 2.1313530112176475, "grad_norm": 0.0331999771296978, "learning_rate": 1.4630259019231177e-05, "loss": 0.0475, "step": 34010 }, { "epoch": 2.1319796954314723, "grad_norm": 16.945070266723633, "learning_rate": 1.461970403833569e-05, "loss": 0.0869, "step": 34020 }, { "epoch": 2.1326063796452965, "grad_norm": 0.034137096256017685, "learning_rate": 1.4609149057440207e-05, "loss": 0.1776, "step": 34030 }, { "epoch": 2.1332330638591213, "grad_norm": 2.4567253589630127, "learning_rate": 1.4598594076544722e-05, "loss": 0.0038, "step": 34040 }, { "epoch": 2.133859748072946, "grad_norm": 0.06618314236402512, "learning_rate": 1.4588039095649237e-05, "loss": 0.009, "step": 34050 }, { "epoch": 2.1344864322867707, "grad_norm": 0.010522538796067238, "learning_rate": 1.4577484114753754e-05, "loss": 0.1303, "step": 34060 }, { "epoch": 2.1351131165005954, "grad_norm": 17.482473373413086, "learning_rate": 1.4566929133858267e-05, "loss": 0.1436, "step": 34070 }, { "epoch": 2.13573980071442, "grad_norm": 0.016001539304852486, "learning_rate": 1.4556374152962784e-05, "loss": 0.0007, "step": 34080 }, { "epoch": 2.136366484928245, "grad_norm": 0.04239490255713463, "learning_rate": 1.4545819172067299e-05, "loss": 0.0855, "step": 34090 }, { "epoch": 2.136993169142069, "grad_norm": 0.5522505640983582, "learning_rate": 1.4535264191171815e-05, "loss": 0.0036, "step": 34100 }, { "epoch": 2.137619853355894, "grad_norm": 0.01569908857345581, "learning_rate": 1.452470921027633e-05, "loss": 0.0346, "step": 34110 }, { "epoch": 2.1382465375697186, "grad_norm": 1.0340025424957275, "learning_rate": 1.4514154229380847e-05, "loss": 0.2189, "step": 34120 }, { "epoch": 2.1388732217835433, "grad_norm": 0.019147315993905067, "learning_rate": 1.4503599248485362e-05, "loss": 0.0875, "step": 34130 }, { "epoch": 2.139499905997368, "grad_norm": 0.047466620802879333, "learning_rate": 1.4493044267589875e-05, "loss": 0.0955, "step": 34140 }, { "epoch": 2.140126590211193, "grad_norm": 0.020558997988700867, "learning_rate": 1.4482489286694392e-05, "loss": 0.1108, "step": 34150 }, { "epoch": 2.140753274425017, "grad_norm": 0.02180035598576069, "learning_rate": 1.4471934305798907e-05, "loss": 0.0826, "step": 34160 }, { "epoch": 2.141379958638842, "grad_norm": 0.024248367175459862, "learning_rate": 1.4461379324903423e-05, "loss": 0.0022, "step": 34170 }, { "epoch": 2.1420066428526665, "grad_norm": 0.026557935401797295, "learning_rate": 1.4450824344007938e-05, "loss": 0.0256, "step": 34180 }, { "epoch": 2.1426333270664912, "grad_norm": 0.013800345361232758, "learning_rate": 1.4440269363112455e-05, "loss": 0.1043, "step": 34190 }, { "epoch": 2.143260011280316, "grad_norm": 0.0492568239569664, "learning_rate": 1.4429714382216968e-05, "loss": 0.0024, "step": 34200 }, { "epoch": 2.1438866954941407, "grad_norm": 0.08210790902376175, "learning_rate": 1.4419159401321483e-05, "loss": 0.0627, "step": 34210 }, { "epoch": 2.144513379707965, "grad_norm": 2.664041042327881, "learning_rate": 1.4408604420426e-05, "loss": 0.1104, "step": 34220 }, { "epoch": 2.1451400639217897, "grad_norm": 0.6660194396972656, "learning_rate": 1.4398049439530515e-05, "loss": 0.0381, "step": 34230 }, { "epoch": 2.1457667481356144, "grad_norm": 0.07248642295598984, "learning_rate": 1.4387494458635032e-05, "loss": 0.0531, "step": 34240 }, { "epoch": 2.146393432349439, "grad_norm": 1.3321788311004639, "learning_rate": 1.4376939477739545e-05, "loss": 0.0295, "step": 34250 }, { "epoch": 2.147020116563264, "grad_norm": 24.10909080505371, "learning_rate": 1.4366384496844063e-05, "loss": 0.0952, "step": 34260 }, { "epoch": 2.1476468007770886, "grad_norm": 0.015267537906765938, "learning_rate": 1.4355829515948576e-05, "loss": 0.0007, "step": 34270 }, { "epoch": 2.148273484990913, "grad_norm": 0.04566666856408119, "learning_rate": 1.4345274535053091e-05, "loss": 0.0016, "step": 34280 }, { "epoch": 2.1489001692047376, "grad_norm": 0.007647485937923193, "learning_rate": 1.4334719554157608e-05, "loss": 0.0042, "step": 34290 }, { "epoch": 2.1495268534185623, "grad_norm": 0.4660249352455139, "learning_rate": 1.4324164573262123e-05, "loss": 0.0145, "step": 34300 }, { "epoch": 2.150153537632387, "grad_norm": 0.008947279304265976, "learning_rate": 1.431360959236664e-05, "loss": 0.0371, "step": 34310 }, { "epoch": 2.1507802218462118, "grad_norm": 0.012975979596376419, "learning_rate": 1.4303054611471153e-05, "loss": 0.1095, "step": 34320 }, { "epoch": 2.1514069060600365, "grad_norm": 0.011418702080845833, "learning_rate": 1.429249963057567e-05, "loss": 0.2002, "step": 34330 }, { "epoch": 2.152033590273861, "grad_norm": 0.35795190930366516, "learning_rate": 1.4281944649680184e-05, "loss": 0.0012, "step": 34340 }, { "epoch": 2.1526602744876855, "grad_norm": 0.06372092664241791, "learning_rate": 1.4271389668784701e-05, "loss": 0.0485, "step": 34350 }, { "epoch": 2.15328695870151, "grad_norm": 0.06595498323440552, "learning_rate": 1.4260834687889216e-05, "loss": 0.0305, "step": 34360 }, { "epoch": 2.153913642915335, "grad_norm": 0.010245050303637981, "learning_rate": 1.425027970699373e-05, "loss": 0.0007, "step": 34370 }, { "epoch": 2.1545403271291597, "grad_norm": 0.04346427321434021, "learning_rate": 1.4239724726098248e-05, "loss": 0.0013, "step": 34380 }, { "epoch": 2.1551670113429844, "grad_norm": 0.012852686457335949, "learning_rate": 1.4229169745202761e-05, "loss": 0.1434, "step": 34390 }, { "epoch": 2.155793695556809, "grad_norm": 0.056614551693201065, "learning_rate": 1.4218614764307278e-05, "loss": 0.0815, "step": 34400 }, { "epoch": 2.1564203797706334, "grad_norm": 0.054879337549209595, "learning_rate": 1.4208059783411793e-05, "loss": 0.1237, "step": 34410 }, { "epoch": 2.157047063984458, "grad_norm": 0.010255315341055393, "learning_rate": 1.419750480251631e-05, "loss": 0.0008, "step": 34420 }, { "epoch": 2.157673748198283, "grad_norm": 0.009069728665053844, "learning_rate": 1.4186949821620824e-05, "loss": 0.1256, "step": 34430 }, { "epoch": 2.1583004324121076, "grad_norm": 0.047382794320583344, "learning_rate": 1.4176394840725337e-05, "loss": 0.0012, "step": 34440 }, { "epoch": 2.1589271166259323, "grad_norm": 0.12266150861978531, "learning_rate": 1.4165839859829854e-05, "loss": 0.0589, "step": 34450 }, { "epoch": 2.159553800839757, "grad_norm": 0.04495285078883171, "learning_rate": 1.4155284878934369e-05, "loss": 0.1503, "step": 34460 }, { "epoch": 2.1601804850535817, "grad_norm": 0.0057646301575005054, "learning_rate": 1.4144729898038886e-05, "loss": 0.0806, "step": 34470 }, { "epoch": 2.160807169267406, "grad_norm": 0.48330438137054443, "learning_rate": 1.41341749171434e-05, "loss": 0.0041, "step": 34480 }, { "epoch": 2.1614338534812307, "grad_norm": 0.10639398545026779, "learning_rate": 1.4123619936247917e-05, "loss": 0.0579, "step": 34490 }, { "epoch": 2.1620605376950555, "grad_norm": 0.01347813569009304, "learning_rate": 1.411306495535243e-05, "loss": 0.1687, "step": 34500 }, { "epoch": 2.16268722190888, "grad_norm": 0.029893875122070312, "learning_rate": 1.4102509974456946e-05, "loss": 0.001, "step": 34510 }, { "epoch": 2.163313906122705, "grad_norm": 0.16543273627758026, "learning_rate": 1.4091954993561462e-05, "loss": 0.0327, "step": 34520 }, { "epoch": 2.163940590336529, "grad_norm": 7.4403815269470215, "learning_rate": 1.4081400012665977e-05, "loss": 0.2158, "step": 34530 }, { "epoch": 2.164567274550354, "grad_norm": 0.01477138139307499, "learning_rate": 1.4070845031770494e-05, "loss": 0.042, "step": 34540 }, { "epoch": 2.1651939587641786, "grad_norm": 0.01747424341738224, "learning_rate": 1.4060290050875009e-05, "loss": 0.0589, "step": 34550 }, { "epoch": 2.1658206429780034, "grad_norm": 0.01443378534168005, "learning_rate": 1.4049735069979525e-05, "loss": 0.1629, "step": 34560 }, { "epoch": 2.166447327191828, "grad_norm": 0.06920278072357178, "learning_rate": 1.4039180089084039e-05, "loss": 0.0235, "step": 34570 }, { "epoch": 2.167074011405653, "grad_norm": 0.01657126098871231, "learning_rate": 1.4028625108188555e-05, "loss": 0.1257, "step": 34580 }, { "epoch": 2.1677006956194775, "grad_norm": 0.009466479532420635, "learning_rate": 1.401807012729307e-05, "loss": 0.0129, "step": 34590 }, { "epoch": 2.168327379833302, "grad_norm": 0.4203064739704132, "learning_rate": 1.4007515146397585e-05, "loss": 0.0021, "step": 34600 }, { "epoch": 2.1689540640471265, "grad_norm": 0.03957168757915497, "learning_rate": 1.3996960165502102e-05, "loss": 0.0192, "step": 34610 }, { "epoch": 2.1695807482609513, "grad_norm": 0.006051171105355024, "learning_rate": 1.3986405184606615e-05, "loss": 0.0307, "step": 34620 }, { "epoch": 2.170207432474776, "grad_norm": 0.02826111391186714, "learning_rate": 1.3975850203711133e-05, "loss": 0.0751, "step": 34630 }, { "epoch": 2.1708341166886007, "grad_norm": 0.007988009601831436, "learning_rate": 1.3965295222815647e-05, "loss": 0.0098, "step": 34640 }, { "epoch": 2.1714608009024254, "grad_norm": 0.0047339689917862415, "learning_rate": 1.3954740241920163e-05, "loss": 0.05, "step": 34650 }, { "epoch": 2.1720874851162497, "grad_norm": 0.04309682920575142, "learning_rate": 1.3944185261024678e-05, "loss": 0.0921, "step": 34660 }, { "epoch": 2.1727141693300744, "grad_norm": 0.020489204674959183, "learning_rate": 1.3933630280129192e-05, "loss": 0.0826, "step": 34670 }, { "epoch": 2.173340853543899, "grad_norm": 0.011994308792054653, "learning_rate": 1.392307529923371e-05, "loss": 0.0534, "step": 34680 }, { "epoch": 2.173967537757724, "grad_norm": 0.0060248589143157005, "learning_rate": 1.3912520318338223e-05, "loss": 0.1518, "step": 34690 }, { "epoch": 2.1745942219715486, "grad_norm": 2.8406407833099365, "learning_rate": 1.390196533744274e-05, "loss": 0.1138, "step": 34700 }, { "epoch": 2.1752209061853733, "grad_norm": 0.010623175650835037, "learning_rate": 1.3891410356547255e-05, "loss": 0.0644, "step": 34710 }, { "epoch": 2.175847590399198, "grad_norm": 24.322790145874023, "learning_rate": 1.3880855375651771e-05, "loss": 0.0389, "step": 34720 }, { "epoch": 2.1764742746130223, "grad_norm": 0.01590089127421379, "learning_rate": 1.3870300394756286e-05, "loss": 0.0032, "step": 34730 }, { "epoch": 2.177100958826847, "grad_norm": 0.015220998786389828, "learning_rate": 1.3859745413860803e-05, "loss": 0.0071, "step": 34740 }, { "epoch": 2.177727643040672, "grad_norm": 0.01890176720917225, "learning_rate": 1.3849190432965316e-05, "loss": 0.002, "step": 34750 }, { "epoch": 2.1783543272544965, "grad_norm": 0.08504509925842285, "learning_rate": 1.3838635452069831e-05, "loss": 0.0022, "step": 34760 }, { "epoch": 2.1789810114683212, "grad_norm": 0.007461244240403175, "learning_rate": 1.3828080471174348e-05, "loss": 0.0841, "step": 34770 }, { "epoch": 2.179607695682146, "grad_norm": 0.19550983607769012, "learning_rate": 1.3817525490278863e-05, "loss": 0.122, "step": 34780 }, { "epoch": 2.1802343798959702, "grad_norm": 0.07259407639503479, "learning_rate": 1.380697050938338e-05, "loss": 0.0708, "step": 34790 }, { "epoch": 2.180861064109795, "grad_norm": 0.02365555614233017, "learning_rate": 1.3796415528487895e-05, "loss": 0.0666, "step": 34800 }, { "epoch": 2.1814877483236197, "grad_norm": 16.392606735229492, "learning_rate": 1.3785860547592411e-05, "loss": 0.0461, "step": 34810 }, { "epoch": 2.1821144325374444, "grad_norm": 19.294736862182617, "learning_rate": 1.3775305566696924e-05, "loss": 0.0895, "step": 34820 }, { "epoch": 2.182741116751269, "grad_norm": 0.03826337680220604, "learning_rate": 1.376475058580144e-05, "loss": 0.0013, "step": 34830 }, { "epoch": 2.183367800965094, "grad_norm": 0.016204219311475754, "learning_rate": 1.3754195604905956e-05, "loss": 0.0407, "step": 34840 }, { "epoch": 2.183994485178918, "grad_norm": 0.017499983310699463, "learning_rate": 1.3743640624010471e-05, "loss": 0.0585, "step": 34850 }, { "epoch": 2.184621169392743, "grad_norm": 0.13106019794940948, "learning_rate": 1.3733085643114988e-05, "loss": 0.1402, "step": 34860 }, { "epoch": 2.1852478536065676, "grad_norm": 0.005508528556674719, "learning_rate": 1.3722530662219501e-05, "loss": 0.0515, "step": 34870 }, { "epoch": 2.1858745378203923, "grad_norm": 0.009193802252411842, "learning_rate": 1.371197568132402e-05, "loss": 0.0038, "step": 34880 }, { "epoch": 2.186501222034217, "grad_norm": 0.035130925476551056, "learning_rate": 1.3701420700428533e-05, "loss": 0.0014, "step": 34890 }, { "epoch": 2.1871279062480418, "grad_norm": 0.006802499294281006, "learning_rate": 1.3690865719533047e-05, "loss": 0.0543, "step": 34900 }, { "epoch": 2.187754590461866, "grad_norm": 0.0060197836719453335, "learning_rate": 1.3680310738637564e-05, "loss": 0.0665, "step": 34910 }, { "epoch": 2.1883812746756908, "grad_norm": 0.02188221924006939, "learning_rate": 1.3669755757742077e-05, "loss": 0.0215, "step": 34920 }, { "epoch": 2.1890079588895155, "grad_norm": 0.14490945637226105, "learning_rate": 1.3659200776846596e-05, "loss": 0.169, "step": 34930 }, { "epoch": 2.18963464310334, "grad_norm": 0.03352600708603859, "learning_rate": 1.3648645795951109e-05, "loss": 0.0012, "step": 34940 }, { "epoch": 2.190261327317165, "grad_norm": 0.1054094210267067, "learning_rate": 1.3638090815055626e-05, "loss": 0.0891, "step": 34950 }, { "epoch": 2.1908880115309897, "grad_norm": 0.0154159776866436, "learning_rate": 1.362753583416014e-05, "loss": 0.1988, "step": 34960 }, { "epoch": 2.1915146957448144, "grad_norm": 0.010679114609956741, "learning_rate": 1.3616980853264657e-05, "loss": 0.1249, "step": 34970 }, { "epoch": 2.1921413799586387, "grad_norm": 0.11450926214456558, "learning_rate": 1.3606425872369172e-05, "loss": 0.0023, "step": 34980 }, { "epoch": 2.1927680641724634, "grad_norm": 15.463875770568848, "learning_rate": 1.3595870891473685e-05, "loss": 0.2882, "step": 34990 }, { "epoch": 2.193394748386288, "grad_norm": 5.67316198348999, "learning_rate": 1.3585315910578202e-05, "loss": 0.0922, "step": 35000 }, { "epoch": 2.194021432600113, "grad_norm": 0.02195173315703869, "learning_rate": 1.3574760929682717e-05, "loss": 0.1378, "step": 35010 }, { "epoch": 2.1946481168139376, "grad_norm": 0.22539135813713074, "learning_rate": 1.3564205948787234e-05, "loss": 0.0881, "step": 35020 }, { "epoch": 2.1952748010277623, "grad_norm": 0.022844787687063217, "learning_rate": 1.3553650967891749e-05, "loss": 0.0723, "step": 35030 }, { "epoch": 2.1959014852415866, "grad_norm": 0.058049276471138, "learning_rate": 1.3543095986996265e-05, "loss": 0.0071, "step": 35040 }, { "epoch": 2.1965281694554113, "grad_norm": 0.035429976880550385, "learning_rate": 1.353254100610078e-05, "loss": 0.1019, "step": 35050 }, { "epoch": 2.197154853669236, "grad_norm": 3.9574286937713623, "learning_rate": 1.3521986025205294e-05, "loss": 0.154, "step": 35060 }, { "epoch": 2.1977815378830607, "grad_norm": 0.02437640354037285, "learning_rate": 1.351143104430981e-05, "loss": 0.1157, "step": 35070 }, { "epoch": 2.1984082220968855, "grad_norm": 0.3660740256309509, "learning_rate": 1.3500876063414325e-05, "loss": 0.082, "step": 35080 }, { "epoch": 2.19903490631071, "grad_norm": 22.220230102539062, "learning_rate": 1.3490321082518842e-05, "loss": 0.0268, "step": 35090 }, { "epoch": 2.199661590524535, "grad_norm": 405.9232482910156, "learning_rate": 1.3479766101623357e-05, "loss": 0.1237, "step": 35100 }, { "epoch": 2.200288274738359, "grad_norm": 0.029608746990561485, "learning_rate": 1.3469211120727873e-05, "loss": 0.1838, "step": 35110 }, { "epoch": 2.200914958952184, "grad_norm": 0.038986582309007645, "learning_rate": 1.3458656139832387e-05, "loss": 0.0051, "step": 35120 }, { "epoch": 2.2015416431660086, "grad_norm": 0.04025193676352501, "learning_rate": 1.3448101158936902e-05, "loss": 0.0287, "step": 35130 }, { "epoch": 2.2021683273798334, "grad_norm": 0.25155943632125854, "learning_rate": 1.3437546178041418e-05, "loss": 0.0335, "step": 35140 }, { "epoch": 2.202795011593658, "grad_norm": 0.10960265249013901, "learning_rate": 1.3426991197145933e-05, "loss": 0.003, "step": 35150 }, { "epoch": 2.203421695807483, "grad_norm": 4.629335403442383, "learning_rate": 1.341643621625045e-05, "loss": 0.0887, "step": 35160 }, { "epoch": 2.204048380021307, "grad_norm": 0.01714794896543026, "learning_rate": 1.3405881235354963e-05, "loss": 0.0731, "step": 35170 }, { "epoch": 2.204675064235132, "grad_norm": 0.04572465270757675, "learning_rate": 1.3395326254459482e-05, "loss": 0.1256, "step": 35180 }, { "epoch": 2.2053017484489565, "grad_norm": 1.0245270729064941, "learning_rate": 1.3384771273563995e-05, "loss": 0.0712, "step": 35190 }, { "epoch": 2.2059284326627813, "grad_norm": 0.020694352686405182, "learning_rate": 1.3374216292668511e-05, "loss": 0.0021, "step": 35200 }, { "epoch": 2.206555116876606, "grad_norm": 0.025215351954102516, "learning_rate": 1.3363661311773026e-05, "loss": 0.0019, "step": 35210 }, { "epoch": 2.2071818010904307, "grad_norm": 0.019033333286643028, "learning_rate": 1.3353106330877541e-05, "loss": 0.0031, "step": 35220 }, { "epoch": 2.207808485304255, "grad_norm": 2.6878879070281982, "learning_rate": 1.3342551349982058e-05, "loss": 0.0039, "step": 35230 }, { "epoch": 2.2084351695180797, "grad_norm": 0.020640341565012932, "learning_rate": 1.3331996369086571e-05, "loss": 0.0841, "step": 35240 }, { "epoch": 2.2090618537319044, "grad_norm": 0.016223404556512833, "learning_rate": 1.332144138819109e-05, "loss": 0.0448, "step": 35250 }, { "epoch": 2.209688537945729, "grad_norm": 0.005702883470803499, "learning_rate": 1.3310886407295603e-05, "loss": 0.0283, "step": 35260 }, { "epoch": 2.210315222159554, "grad_norm": 13.36315631866455, "learning_rate": 1.330033142640012e-05, "loss": 0.121, "step": 35270 }, { "epoch": 2.2109419063733786, "grad_norm": 0.007761610671877861, "learning_rate": 1.3289776445504634e-05, "loss": 0.0029, "step": 35280 }, { "epoch": 2.211568590587203, "grad_norm": 0.05677232891321182, "learning_rate": 1.3279221464609148e-05, "loss": 0.1566, "step": 35290 }, { "epoch": 2.2121952748010276, "grad_norm": 0.01307602971792221, "learning_rate": 1.3268666483713666e-05, "loss": 0.0427, "step": 35300 }, { "epoch": 2.2128219590148523, "grad_norm": 0.021906176581978798, "learning_rate": 1.325811150281818e-05, "loss": 0.0512, "step": 35310 }, { "epoch": 2.213448643228677, "grad_norm": 0.008934159763157368, "learning_rate": 1.3247556521922696e-05, "loss": 0.0585, "step": 35320 }, { "epoch": 2.214075327442502, "grad_norm": 0.05048960819840431, "learning_rate": 1.3237001541027211e-05, "loss": 0.1755, "step": 35330 }, { "epoch": 2.2147020116563265, "grad_norm": 0.03687850758433342, "learning_rate": 1.3226446560131728e-05, "loss": 0.1197, "step": 35340 }, { "epoch": 2.2153286958701512, "grad_norm": 0.07645319402217865, "learning_rate": 1.3215891579236243e-05, "loss": 0.0426, "step": 35350 }, { "epoch": 2.2159553800839755, "grad_norm": 0.07259127497673035, "learning_rate": 1.320533659834076e-05, "loss": 0.0037, "step": 35360 }, { "epoch": 2.2165820642978002, "grad_norm": 0.034717291593551636, "learning_rate": 1.3194781617445272e-05, "loss": 0.0022, "step": 35370 }, { "epoch": 2.217208748511625, "grad_norm": 0.19604039192199707, "learning_rate": 1.3184226636549787e-05, "loss": 0.0881, "step": 35380 }, { "epoch": 2.2178354327254497, "grad_norm": 0.08138164132833481, "learning_rate": 1.3173671655654304e-05, "loss": 0.0602, "step": 35390 }, { "epoch": 2.2184621169392744, "grad_norm": 0.012671887874603271, "learning_rate": 1.3163116674758819e-05, "loss": 0.0523, "step": 35400 }, { "epoch": 2.219088801153099, "grad_norm": 0.08288644999265671, "learning_rate": 1.3152561693863336e-05, "loss": 0.0015, "step": 35410 }, { "epoch": 2.2197154853669234, "grad_norm": 0.5564950108528137, "learning_rate": 1.3142006712967849e-05, "loss": 0.1163, "step": 35420 }, { "epoch": 2.220342169580748, "grad_norm": 0.03301873803138733, "learning_rate": 1.3131451732072367e-05, "loss": 0.0419, "step": 35430 }, { "epoch": 2.220968853794573, "grad_norm": 0.021805711090564728, "learning_rate": 1.312089675117688e-05, "loss": 0.0042, "step": 35440 }, { "epoch": 2.2215955380083976, "grad_norm": 0.07749617099761963, "learning_rate": 1.3110341770281396e-05, "loss": 0.0013, "step": 35450 }, { "epoch": 2.2222222222222223, "grad_norm": 0.007691920269280672, "learning_rate": 1.3099786789385912e-05, "loss": 0.0307, "step": 35460 }, { "epoch": 2.222848906436047, "grad_norm": 0.006902730092406273, "learning_rate": 1.3089231808490427e-05, "loss": 0.0014, "step": 35470 }, { "epoch": 2.2234755906498713, "grad_norm": 0.005276334937661886, "learning_rate": 1.3078676827594944e-05, "loss": 0.1014, "step": 35480 }, { "epoch": 2.224102274863696, "grad_norm": 0.010618511587381363, "learning_rate": 1.3068121846699457e-05, "loss": 0.001, "step": 35490 }, { "epoch": 2.2247289590775208, "grad_norm": 0.005693093407899141, "learning_rate": 1.3057566865803975e-05, "loss": 0.1915, "step": 35500 }, { "epoch": 2.2253556432913455, "grad_norm": 0.014188366010785103, "learning_rate": 1.3047011884908489e-05, "loss": 0.0016, "step": 35510 }, { "epoch": 2.22598232750517, "grad_norm": 0.008730873465538025, "learning_rate": 1.3036456904013004e-05, "loss": 0.0006, "step": 35520 }, { "epoch": 2.226609011718995, "grad_norm": 0.007341964635998011, "learning_rate": 1.302590192311752e-05, "loss": 0.0014, "step": 35530 }, { "epoch": 2.2272356959328192, "grad_norm": 0.00883073452860117, "learning_rate": 1.3015346942222034e-05, "loss": 0.0698, "step": 35540 }, { "epoch": 2.227862380146644, "grad_norm": 0.12473952025175095, "learning_rate": 1.3004791961326552e-05, "loss": 0.267, "step": 35550 }, { "epoch": 2.2284890643604687, "grad_norm": 8.776956558227539, "learning_rate": 1.2994236980431065e-05, "loss": 0.1964, "step": 35560 }, { "epoch": 2.2291157485742934, "grad_norm": 0.05307682231068611, "learning_rate": 1.2983681999535582e-05, "loss": 0.0166, "step": 35570 }, { "epoch": 2.229742432788118, "grad_norm": 0.13624274730682373, "learning_rate": 1.2973127018640097e-05, "loss": 0.0555, "step": 35580 }, { "epoch": 2.230369117001943, "grad_norm": 0.45357584953308105, "learning_rate": 1.2962572037744613e-05, "loss": 0.0374, "step": 35590 }, { "epoch": 2.2309958012157676, "grad_norm": 0.016800403594970703, "learning_rate": 1.2952017056849128e-05, "loss": 0.0024, "step": 35600 }, { "epoch": 2.231622485429592, "grad_norm": 0.010358024388551712, "learning_rate": 1.2941462075953642e-05, "loss": 0.002, "step": 35610 }, { "epoch": 2.2322491696434166, "grad_norm": 0.20369577407836914, "learning_rate": 1.2930907095058158e-05, "loss": 0.1755, "step": 35620 }, { "epoch": 2.2328758538572413, "grad_norm": 0.012203558348119259, "learning_rate": 1.2920352114162673e-05, "loss": 0.1723, "step": 35630 }, { "epoch": 2.233502538071066, "grad_norm": 0.026246270164847374, "learning_rate": 1.290979713326719e-05, "loss": 0.176, "step": 35640 }, { "epoch": 2.2341292222848907, "grad_norm": 0.014504744671285152, "learning_rate": 1.2899242152371705e-05, "loss": 0.0751, "step": 35650 }, { "epoch": 2.2347559064987155, "grad_norm": 0.010934627614915371, "learning_rate": 1.2888687171476222e-05, "loss": 0.0406, "step": 35660 }, { "epoch": 2.2353825907125398, "grad_norm": 0.009612313471734524, "learning_rate": 1.2878132190580735e-05, "loss": 0.0758, "step": 35670 }, { "epoch": 2.2360092749263645, "grad_norm": 0.18484322726726532, "learning_rate": 1.286757720968525e-05, "loss": 0.043, "step": 35680 }, { "epoch": 2.236635959140189, "grad_norm": 0.17664383351802826, "learning_rate": 1.2857022228789766e-05, "loss": 0.0918, "step": 35690 }, { "epoch": 2.237262643354014, "grad_norm": 0.05666724219918251, "learning_rate": 1.2846467247894281e-05, "loss": 0.0608, "step": 35700 }, { "epoch": 2.2378893275678386, "grad_norm": 0.1784476339817047, "learning_rate": 1.2835912266998798e-05, "loss": 0.2159, "step": 35710 }, { "epoch": 2.2385160117816634, "grad_norm": 0.6217782497406006, "learning_rate": 1.2825357286103313e-05, "loss": 0.0619, "step": 35720 }, { "epoch": 2.239142695995488, "grad_norm": 0.1981956660747528, "learning_rate": 1.281480230520783e-05, "loss": 0.0399, "step": 35730 }, { "epoch": 2.2397693802093124, "grad_norm": 0.08834876865148544, "learning_rate": 1.2804247324312343e-05, "loss": 0.0421, "step": 35740 }, { "epoch": 2.240396064423137, "grad_norm": 5.194336414337158, "learning_rate": 1.2793692343416861e-05, "loss": 0.0668, "step": 35750 }, { "epoch": 2.241022748636962, "grad_norm": 0.005984729155898094, "learning_rate": 1.2783137362521374e-05, "loss": 0.0331, "step": 35760 }, { "epoch": 2.2416494328507865, "grad_norm": 0.056345418095588684, "learning_rate": 1.277258238162589e-05, "loss": 0.1728, "step": 35770 }, { "epoch": 2.2422761170646113, "grad_norm": 0.10154221206903458, "learning_rate": 1.2762027400730406e-05, "loss": 0.0279, "step": 35780 }, { "epoch": 2.242902801278436, "grad_norm": 0.15458165109157562, "learning_rate": 1.275147241983492e-05, "loss": 0.0027, "step": 35790 }, { "epoch": 2.2435294854922603, "grad_norm": 7.0228271484375, "learning_rate": 1.2740917438939438e-05, "loss": 0.1917, "step": 35800 }, { "epoch": 2.244156169706085, "grad_norm": 0.019665498286485672, "learning_rate": 1.2730362458043951e-05, "loss": 0.0937, "step": 35810 }, { "epoch": 2.2447828539199097, "grad_norm": 0.11862511932849884, "learning_rate": 1.2719807477148468e-05, "loss": 0.0025, "step": 35820 }, { "epoch": 2.2454095381337345, "grad_norm": 0.0292394757270813, "learning_rate": 1.2709252496252983e-05, "loss": 0.0008, "step": 35830 }, { "epoch": 2.246036222347559, "grad_norm": 0.5504111051559448, "learning_rate": 1.2698697515357496e-05, "loss": 0.1048, "step": 35840 }, { "epoch": 2.246662906561384, "grad_norm": 5.584314823150635, "learning_rate": 1.2688142534462014e-05, "loss": 0.2176, "step": 35850 }, { "epoch": 2.247289590775208, "grad_norm": 0.19631190598011017, "learning_rate": 1.2677587553566527e-05, "loss": 0.1025, "step": 35860 }, { "epoch": 2.247916274989033, "grad_norm": 0.12277902662754059, "learning_rate": 1.2667032572671044e-05, "loss": 0.0933, "step": 35870 }, { "epoch": 2.2485429592028576, "grad_norm": 0.18572159111499786, "learning_rate": 1.2656477591775559e-05, "loss": 0.0701, "step": 35880 }, { "epoch": 2.2491696434166824, "grad_norm": 0.09474160522222519, "learning_rate": 1.2645922610880076e-05, "loss": 0.0376, "step": 35890 }, { "epoch": 2.249796327630507, "grad_norm": 0.10830563306808472, "learning_rate": 1.263536762998459e-05, "loss": 0.0533, "step": 35900 }, { "epoch": 2.250423011844332, "grad_norm": 0.0841013714671135, "learning_rate": 1.2624812649089104e-05, "loss": 0.0689, "step": 35910 }, { "epoch": 2.251049696058156, "grad_norm": 0.005682028364390135, "learning_rate": 1.2614257668193622e-05, "loss": 0.0077, "step": 35920 }, { "epoch": 2.251676380271981, "grad_norm": 5.086600303649902, "learning_rate": 1.2603702687298136e-05, "loss": 0.1592, "step": 35930 }, { "epoch": 2.2523030644858055, "grad_norm": 0.01670365035533905, "learning_rate": 1.2593147706402652e-05, "loss": 0.0045, "step": 35940 }, { "epoch": 2.2529297486996303, "grad_norm": 3.9989254474639893, "learning_rate": 1.2582592725507167e-05, "loss": 0.0359, "step": 35950 }, { "epoch": 2.253556432913455, "grad_norm": 0.015506722964346409, "learning_rate": 1.2572037744611684e-05, "loss": 0.0014, "step": 35960 }, { "epoch": 2.2541831171272797, "grad_norm": 0.11642977595329285, "learning_rate": 1.2561482763716199e-05, "loss": 0.0708, "step": 35970 }, { "epoch": 2.2548098013411044, "grad_norm": 0.35174962878227234, "learning_rate": 1.2550927782820715e-05, "loss": 0.002, "step": 35980 }, { "epoch": 2.2554364855549287, "grad_norm": 0.6952834129333496, "learning_rate": 1.2540372801925229e-05, "loss": 0.0513, "step": 35990 }, { "epoch": 2.2560631697687534, "grad_norm": 0.007165560964494944, "learning_rate": 1.2529817821029744e-05, "loss": 0.1342, "step": 36000 }, { "epoch": 2.256689853982578, "grad_norm": 0.006440062541514635, "learning_rate": 1.251926284013426e-05, "loss": 0.043, "step": 36010 }, { "epoch": 2.257316538196403, "grad_norm": 0.013440284878015518, "learning_rate": 1.2508707859238775e-05, "loss": 0.0136, "step": 36020 }, { "epoch": 2.2579432224102276, "grad_norm": 0.5947048664093018, "learning_rate": 1.249815287834329e-05, "loss": 0.082, "step": 36030 }, { "epoch": 2.2585699066240523, "grad_norm": 0.6158246994018555, "learning_rate": 1.2487597897447805e-05, "loss": 0.084, "step": 36040 }, { "epoch": 2.2591965908378766, "grad_norm": 0.008292706683278084, "learning_rate": 1.2477042916552322e-05, "loss": 0.0303, "step": 36050 }, { "epoch": 2.2598232750517013, "grad_norm": 0.2527766823768616, "learning_rate": 1.2466487935656837e-05, "loss": 0.0815, "step": 36060 }, { "epoch": 2.260449959265526, "grad_norm": 1.2495615482330322, "learning_rate": 1.2455932954761353e-05, "loss": 0.0471, "step": 36070 }, { "epoch": 2.261076643479351, "grad_norm": 0.5814064145088196, "learning_rate": 1.2445377973865868e-05, "loss": 0.2085, "step": 36080 }, { "epoch": 2.2617033276931755, "grad_norm": 0.005435166414827108, "learning_rate": 1.2434822992970383e-05, "loss": 0.0408, "step": 36090 }, { "epoch": 2.2623300119070002, "grad_norm": 0.0057927523739635944, "learning_rate": 1.24242680120749e-05, "loss": 0.07, "step": 36100 }, { "epoch": 2.262956696120825, "grad_norm": 0.06232810765504837, "learning_rate": 1.2413713031179413e-05, "loss": 0.0007, "step": 36110 }, { "epoch": 2.2635833803346492, "grad_norm": 5.673398017883301, "learning_rate": 1.240315805028393e-05, "loss": 0.1493, "step": 36120 }, { "epoch": 2.264210064548474, "grad_norm": 0.03220819681882858, "learning_rate": 1.2392603069388445e-05, "loss": 0.0644, "step": 36130 }, { "epoch": 2.2648367487622987, "grad_norm": 0.004986956249922514, "learning_rate": 1.238204808849296e-05, "loss": 0.0758, "step": 36140 }, { "epoch": 2.2654634329761234, "grad_norm": 3.61641263961792, "learning_rate": 1.2371493107597476e-05, "loss": 0.0821, "step": 36150 }, { "epoch": 2.266090117189948, "grad_norm": 0.0038919805083423853, "learning_rate": 1.2360938126701991e-05, "loss": 0.0038, "step": 36160 }, { "epoch": 2.2667168014037724, "grad_norm": 0.14399461448192596, "learning_rate": 1.2350383145806508e-05, "loss": 0.2721, "step": 36170 }, { "epoch": 2.267343485617597, "grad_norm": 0.1366736739873886, "learning_rate": 1.2339828164911021e-05, "loss": 0.1095, "step": 36180 }, { "epoch": 2.267970169831422, "grad_norm": 0.020060712471604347, "learning_rate": 1.2329273184015536e-05, "loss": 0.0027, "step": 36190 }, { "epoch": 2.2685968540452466, "grad_norm": 0.09323687851428986, "learning_rate": 1.2318718203120053e-05, "loss": 0.067, "step": 36200 }, { "epoch": 2.2692235382590713, "grad_norm": 0.34588101506233215, "learning_rate": 1.2308163222224568e-05, "loss": 0.077, "step": 36210 }, { "epoch": 2.269850222472896, "grad_norm": 0.16162216663360596, "learning_rate": 1.2297608241329085e-05, "loss": 0.0386, "step": 36220 }, { "epoch": 2.2704769066867208, "grad_norm": 0.35178256034851074, "learning_rate": 1.22870532604336e-05, "loss": 0.004, "step": 36230 }, { "epoch": 2.271103590900545, "grad_norm": 0.08593368530273438, "learning_rate": 1.2276498279538114e-05, "loss": 0.0616, "step": 36240 }, { "epoch": 2.2717302751143698, "grad_norm": 0.6215272545814514, "learning_rate": 1.2265943298642631e-05, "loss": 0.0053, "step": 36250 }, { "epoch": 2.2723569593281945, "grad_norm": 0.3504977524280548, "learning_rate": 1.2255388317747144e-05, "loss": 0.0525, "step": 36260 }, { "epoch": 2.272983643542019, "grad_norm": 3.6675524711608887, "learning_rate": 1.2244833336851661e-05, "loss": 0.1653, "step": 36270 }, { "epoch": 2.273610327755844, "grad_norm": 0.012513170018792152, "learning_rate": 1.2234278355956176e-05, "loss": 0.0449, "step": 36280 }, { "epoch": 2.2742370119696687, "grad_norm": 0.009431272745132446, "learning_rate": 1.2223723375060691e-05, "loss": 0.0756, "step": 36290 }, { "epoch": 2.274863696183493, "grad_norm": 3.7812867164611816, "learning_rate": 1.2213168394165208e-05, "loss": 0.1541, "step": 36300 }, { "epoch": 2.2754903803973177, "grad_norm": 0.033403124660253525, "learning_rate": 1.2202613413269723e-05, "loss": 0.0012, "step": 36310 }, { "epoch": 2.2761170646111424, "grad_norm": 0.046253349632024765, "learning_rate": 1.219205843237424e-05, "loss": 0.0517, "step": 36320 }, { "epoch": 2.276743748824967, "grad_norm": 0.2164696902036667, "learning_rate": 1.2181503451478754e-05, "loss": 0.0435, "step": 36330 }, { "epoch": 2.277370433038792, "grad_norm": 0.24268116056919098, "learning_rate": 1.2170948470583269e-05, "loss": 0.0321, "step": 36340 }, { "epoch": 2.2779971172526166, "grad_norm": 0.017050549387931824, "learning_rate": 1.2160393489687784e-05, "loss": 0.2641, "step": 36350 }, { "epoch": 2.2786238014664413, "grad_norm": 8.202871322631836, "learning_rate": 1.2149838508792299e-05, "loss": 0.1438, "step": 36360 }, { "epoch": 2.2792504856802656, "grad_norm": 0.2853762209415436, "learning_rate": 1.2139283527896816e-05, "loss": 0.0668, "step": 36370 }, { "epoch": 2.2798771698940903, "grad_norm": 0.03244467452168465, "learning_rate": 1.212872854700133e-05, "loss": 0.0191, "step": 36380 }, { "epoch": 2.280503854107915, "grad_norm": 0.03982846438884735, "learning_rate": 1.2118173566105846e-05, "loss": 0.1769, "step": 36390 }, { "epoch": 2.2811305383217397, "grad_norm": 0.10278554260730743, "learning_rate": 1.2107618585210362e-05, "loss": 0.0697, "step": 36400 }, { "epoch": 2.2817572225355645, "grad_norm": 0.051736317574977875, "learning_rate": 1.2097063604314877e-05, "loss": 0.1525, "step": 36410 }, { "epoch": 2.2823839067493887, "grad_norm": 0.10186845809221268, "learning_rate": 1.2086508623419392e-05, "loss": 0.0365, "step": 36420 }, { "epoch": 2.2830105909632135, "grad_norm": 0.057203132659196854, "learning_rate": 1.2075953642523907e-05, "loss": 0.0055, "step": 36430 }, { "epoch": 2.283637275177038, "grad_norm": 0.024866662919521332, "learning_rate": 1.2065398661628422e-05, "loss": 0.0431, "step": 36440 }, { "epoch": 2.284263959390863, "grad_norm": 1.0920522212982178, "learning_rate": 1.2054843680732939e-05, "loss": 0.005, "step": 36450 }, { "epoch": 2.2848906436046876, "grad_norm": 0.01022693607956171, "learning_rate": 1.2044288699837454e-05, "loss": 0.1027, "step": 36460 }, { "epoch": 2.2855173278185124, "grad_norm": 0.356696754693985, "learning_rate": 1.203373371894197e-05, "loss": 0.0805, "step": 36470 }, { "epoch": 2.286144012032337, "grad_norm": 0.008136897347867489, "learning_rate": 1.2023178738046485e-05, "loss": 0.0251, "step": 36480 }, { "epoch": 2.286770696246162, "grad_norm": 0.017162851989269257, "learning_rate": 1.2012623757151e-05, "loss": 0.0505, "step": 36490 }, { "epoch": 2.287397380459986, "grad_norm": 0.7460328936576843, "learning_rate": 1.2002068776255515e-05, "loss": 0.0285, "step": 36500 }, { "epoch": 2.288024064673811, "grad_norm": 0.008685870096087456, "learning_rate": 1.199151379536003e-05, "loss": 0.0679, "step": 36510 }, { "epoch": 2.2886507488876355, "grad_norm": 0.008240882307291031, "learning_rate": 1.1980958814464547e-05, "loss": 0.1384, "step": 36520 }, { "epoch": 2.2892774331014603, "grad_norm": 0.008877246640622616, "learning_rate": 1.1970403833569062e-05, "loss": 0.1276, "step": 36530 }, { "epoch": 2.289904117315285, "grad_norm": 0.0377153716981411, "learning_rate": 1.1959848852673577e-05, "loss": 0.0019, "step": 36540 }, { "epoch": 2.2905308015291093, "grad_norm": 0.007972978055477142, "learning_rate": 1.1949293871778093e-05, "loss": 0.0022, "step": 36550 }, { "epoch": 2.291157485742934, "grad_norm": 0.22672541439533234, "learning_rate": 1.1938738890882608e-05, "loss": 0.074, "step": 36560 }, { "epoch": 2.2917841699567587, "grad_norm": 0.011422897689044476, "learning_rate": 1.1928183909987123e-05, "loss": 0.0549, "step": 36570 }, { "epoch": 2.2924108541705834, "grad_norm": 3.5832436084747314, "learning_rate": 1.1917628929091638e-05, "loss": 0.0674, "step": 36580 }, { "epoch": 2.293037538384408, "grad_norm": 0.006303721573203802, "learning_rate": 1.1907073948196155e-05, "loss": 0.0947, "step": 36590 }, { "epoch": 2.293664222598233, "grad_norm": 0.005968243815004826, "learning_rate": 1.189651896730067e-05, "loss": 0.0781, "step": 36600 }, { "epoch": 2.2942909068120576, "grad_norm": 0.005579926539212465, "learning_rate": 1.1885963986405185e-05, "loss": 0.0021, "step": 36610 }, { "epoch": 2.294917591025882, "grad_norm": 0.006298782769590616, "learning_rate": 1.1875409005509701e-05, "loss": 0.1184, "step": 36620 }, { "epoch": 2.2955442752397066, "grad_norm": 0.005551437381654978, "learning_rate": 1.1864854024614216e-05, "loss": 0.0034, "step": 36630 }, { "epoch": 2.2961709594535313, "grad_norm": 0.3854537606239319, "learning_rate": 1.1854299043718731e-05, "loss": 0.0348, "step": 36640 }, { "epoch": 2.296797643667356, "grad_norm": 6.585789203643799, "learning_rate": 1.1843744062823246e-05, "loss": 0.1863, "step": 36650 }, { "epoch": 2.297424327881181, "grad_norm": 0.12899911403656006, "learning_rate": 1.1833189081927761e-05, "loss": 0.0734, "step": 36660 }, { "epoch": 2.2980510120950055, "grad_norm": 0.006385709624737501, "learning_rate": 1.1822634101032278e-05, "loss": 0.0007, "step": 36670 }, { "epoch": 2.29867769630883, "grad_norm": 0.1899183690547943, "learning_rate": 1.1812079120136793e-05, "loss": 0.0053, "step": 36680 }, { "epoch": 2.2993043805226545, "grad_norm": 1.0379557609558105, "learning_rate": 1.1801524139241308e-05, "loss": 0.002, "step": 36690 }, { "epoch": 2.2999310647364792, "grad_norm": 8.715649604797363, "learning_rate": 1.1790969158345824e-05, "loss": 0.1757, "step": 36700 }, { "epoch": 2.300557748950304, "grad_norm": 0.014531532302498817, "learning_rate": 1.178041417745034e-05, "loss": 0.1319, "step": 36710 }, { "epoch": 2.3011844331641287, "grad_norm": 0.00932616088539362, "learning_rate": 1.1769859196554856e-05, "loss": 0.0015, "step": 36720 }, { "epoch": 2.3018111173779534, "grad_norm": 0.008489527739584446, "learning_rate": 1.175930421565937e-05, "loss": 0.0736, "step": 36730 }, { "epoch": 2.302437801591778, "grad_norm": 0.04834457114338875, "learning_rate": 1.1748749234763886e-05, "loss": 0.0771, "step": 36740 }, { "epoch": 2.3030644858056024, "grad_norm": 0.26478925347328186, "learning_rate": 1.1738194253868401e-05, "loss": 0.0638, "step": 36750 }, { "epoch": 2.303691170019427, "grad_norm": 0.0104445219039917, "learning_rate": 1.1727639272972916e-05, "loss": 0.0027, "step": 36760 }, { "epoch": 2.304317854233252, "grad_norm": 0.008162743411958218, "learning_rate": 1.1717084292077433e-05, "loss": 0.0778, "step": 36770 }, { "epoch": 2.3049445384470766, "grad_norm": 0.2298077791929245, "learning_rate": 1.1706529311181948e-05, "loss": 0.0465, "step": 36780 }, { "epoch": 2.3055712226609013, "grad_norm": 0.008726351894438267, "learning_rate": 1.1695974330286462e-05, "loss": 0.0401, "step": 36790 }, { "epoch": 2.3061979068747256, "grad_norm": 0.24333196878433228, "learning_rate": 1.1685419349390977e-05, "loss": 0.2434, "step": 36800 }, { "epoch": 2.3068245910885503, "grad_norm": 0.011418229900300503, "learning_rate": 1.1674864368495492e-05, "loss": 0.0016, "step": 36810 }, { "epoch": 2.307451275302375, "grad_norm": 0.11549999564886093, "learning_rate": 1.1664309387600009e-05, "loss": 0.0032, "step": 36820 }, { "epoch": 2.3080779595161998, "grad_norm": 5.21661376953125, "learning_rate": 1.1653754406704524e-05, "loss": 0.2179, "step": 36830 }, { "epoch": 2.3087046437300245, "grad_norm": 0.016975464299321175, "learning_rate": 1.164319942580904e-05, "loss": 0.0251, "step": 36840 }, { "epoch": 2.309331327943849, "grad_norm": 38.22617721557617, "learning_rate": 1.1632644444913556e-05, "loss": 0.052, "step": 36850 }, { "epoch": 2.309958012157674, "grad_norm": 5.6047444343566895, "learning_rate": 1.162208946401807e-05, "loss": 0.0952, "step": 36860 }, { "epoch": 2.310584696371498, "grad_norm": 0.04268406331539154, "learning_rate": 1.1611534483122587e-05, "loss": 0.1451, "step": 36870 }, { "epoch": 2.311211380585323, "grad_norm": 0.7725669145584106, "learning_rate": 1.16009795022271e-05, "loss": 0.0431, "step": 36880 }, { "epoch": 2.3118380647991477, "grad_norm": 0.016580281779170036, "learning_rate": 1.1590424521331617e-05, "loss": 0.033, "step": 36890 }, { "epoch": 2.3124647490129724, "grad_norm": 6.017152309417725, "learning_rate": 1.1579869540436132e-05, "loss": 0.2377, "step": 36900 }, { "epoch": 2.313091433226797, "grad_norm": 0.042723577469587326, "learning_rate": 1.1569314559540647e-05, "loss": 0.0019, "step": 36910 }, { "epoch": 2.313718117440622, "grad_norm": 26.877918243408203, "learning_rate": 1.1558759578645164e-05, "loss": 0.0296, "step": 36920 }, { "epoch": 2.314344801654446, "grad_norm": 0.01965474896132946, "learning_rate": 1.1548204597749679e-05, "loss": 0.0037, "step": 36930 }, { "epoch": 2.314971485868271, "grad_norm": 12.622506141662598, "learning_rate": 1.1537649616854195e-05, "loss": 0.2077, "step": 36940 }, { "epoch": 2.3155981700820956, "grad_norm": 0.08890029788017273, "learning_rate": 1.152709463595871e-05, "loss": 0.0894, "step": 36950 }, { "epoch": 2.3162248542959203, "grad_norm": 0.08074666559696198, "learning_rate": 1.1516539655063224e-05, "loss": 0.0451, "step": 36960 }, { "epoch": 2.316851538509745, "grad_norm": 6.843764305114746, "learning_rate": 1.150598467416774e-05, "loss": 0.0899, "step": 36970 }, { "epoch": 2.3174782227235697, "grad_norm": 0.012340700253844261, "learning_rate": 1.1495429693272255e-05, "loss": 0.0809, "step": 36980 }, { "epoch": 2.3181049069373945, "grad_norm": 0.07329646497964859, "learning_rate": 1.1484874712376772e-05, "loss": 0.145, "step": 36990 }, { "epoch": 2.3187315911512187, "grad_norm": 0.22346799075603485, "learning_rate": 1.1474319731481287e-05, "loss": 0.0036, "step": 37000 }, { "epoch": 2.3193582753650435, "grad_norm": 0.43645739555358887, "learning_rate": 1.1463764750585802e-05, "loss": 0.0983, "step": 37010 }, { "epoch": 2.319984959578868, "grad_norm": 0.11260388046503067, "learning_rate": 1.1453209769690318e-05, "loss": 0.1085, "step": 37020 }, { "epoch": 2.320611643792693, "grad_norm": 0.09059934318065643, "learning_rate": 1.1442654788794833e-05, "loss": 0.0394, "step": 37030 }, { "epoch": 2.3212383280065176, "grad_norm": 3.420612335205078, "learning_rate": 1.1432099807899348e-05, "loss": 0.0855, "step": 37040 }, { "epoch": 2.3218650122203424, "grad_norm": 0.07549533247947693, "learning_rate": 1.1421544827003863e-05, "loss": 0.0024, "step": 37050 }, { "epoch": 2.3224916964341666, "grad_norm": 0.01121055893599987, "learning_rate": 1.1410989846108378e-05, "loss": 0.0828, "step": 37060 }, { "epoch": 2.3231183806479914, "grad_norm": 0.03671666979789734, "learning_rate": 1.1400434865212895e-05, "loss": 0.0616, "step": 37070 }, { "epoch": 2.323745064861816, "grad_norm": 0.01181852724403143, "learning_rate": 1.138987988431741e-05, "loss": 0.0378, "step": 37080 }, { "epoch": 2.324371749075641, "grad_norm": 0.634685218334198, "learning_rate": 1.1379324903421926e-05, "loss": 0.2232, "step": 37090 }, { "epoch": 2.3249984332894655, "grad_norm": 2.2556233406066895, "learning_rate": 1.1368769922526441e-05, "loss": 0.1033, "step": 37100 }, { "epoch": 2.3256251175032903, "grad_norm": 0.02632150985300541, "learning_rate": 1.1358214941630955e-05, "loss": 0.0698, "step": 37110 }, { "epoch": 2.326251801717115, "grad_norm": 8.871074676513672, "learning_rate": 1.1347659960735471e-05, "loss": 0.0906, "step": 37120 }, { "epoch": 2.3268784859309393, "grad_norm": 1.0586740970611572, "learning_rate": 1.1337104979839986e-05, "loss": 0.0763, "step": 37130 }, { "epoch": 2.327505170144764, "grad_norm": 0.023631436750292778, "learning_rate": 1.1326549998944503e-05, "loss": 0.0057, "step": 37140 }, { "epoch": 2.3281318543585887, "grad_norm": 0.09110420942306519, "learning_rate": 1.1315995018049018e-05, "loss": 0.0792, "step": 37150 }, { "epoch": 2.3287585385724134, "grad_norm": 0.1705956906080246, "learning_rate": 1.1305440037153533e-05, "loss": 0.0269, "step": 37160 }, { "epoch": 2.329385222786238, "grad_norm": 0.04436657577753067, "learning_rate": 1.129488505625805e-05, "loss": 0.153, "step": 37170 }, { "epoch": 2.3300119070000624, "grad_norm": 0.01578608714044094, "learning_rate": 1.1284330075362564e-05, "loss": 0.002, "step": 37180 }, { "epoch": 2.330638591213887, "grad_norm": 0.011682651937007904, "learning_rate": 1.127377509446708e-05, "loss": 0.0338, "step": 37190 }, { "epoch": 2.331265275427712, "grad_norm": 0.02722298726439476, "learning_rate": 1.1263220113571594e-05, "loss": 0.0246, "step": 37200 }, { "epoch": 2.3318919596415366, "grad_norm": 0.03455738350749016, "learning_rate": 1.125266513267611e-05, "loss": 0.0013, "step": 37210 }, { "epoch": 2.3325186438553613, "grad_norm": 0.6584683060646057, "learning_rate": 1.1242110151780626e-05, "loss": 0.3428, "step": 37220 }, { "epoch": 2.333145328069186, "grad_norm": 0.017207475379109383, "learning_rate": 1.1231555170885141e-05, "loss": 0.0411, "step": 37230 }, { "epoch": 2.333772012283011, "grad_norm": 4.051939964294434, "learning_rate": 1.1221000189989658e-05, "loss": 0.1354, "step": 37240 }, { "epoch": 2.334398696496835, "grad_norm": 16.780792236328125, "learning_rate": 1.1210445209094173e-05, "loss": 0.0758, "step": 37250 }, { "epoch": 2.33502538071066, "grad_norm": 0.6355787515640259, "learning_rate": 1.1199890228198687e-05, "loss": 0.1064, "step": 37260 }, { "epoch": 2.3356520649244845, "grad_norm": 0.046288829296827316, "learning_rate": 1.1189335247303202e-05, "loss": 0.0298, "step": 37270 }, { "epoch": 2.3362787491383092, "grad_norm": 5.317742347717285, "learning_rate": 1.1178780266407717e-05, "loss": 0.0783, "step": 37280 }, { "epoch": 2.336905433352134, "grad_norm": 0.026349328458309174, "learning_rate": 1.1168225285512234e-05, "loss": 0.0038, "step": 37290 }, { "epoch": 2.3375321175659587, "grad_norm": 0.026744646951556206, "learning_rate": 1.1157670304616749e-05, "loss": 0.0264, "step": 37300 }, { "epoch": 2.338158801779783, "grad_norm": 0.013837196864187717, "learning_rate": 1.1147115323721264e-05, "loss": 0.0502, "step": 37310 }, { "epoch": 2.3387854859936077, "grad_norm": 0.31136569380760193, "learning_rate": 1.113656034282578e-05, "loss": 0.0065, "step": 37320 }, { "epoch": 2.3394121702074324, "grad_norm": 0.013896308839321136, "learning_rate": 1.1126005361930296e-05, "loss": 0.1444, "step": 37330 }, { "epoch": 2.340038854421257, "grad_norm": 1.2215086221694946, "learning_rate": 1.1115450381034812e-05, "loss": 0.0053, "step": 37340 }, { "epoch": 2.340665538635082, "grad_norm": 0.032671570777893066, "learning_rate": 1.1104895400139325e-05, "loss": 0.0256, "step": 37350 }, { "epoch": 2.3412922228489066, "grad_norm": 21.453617095947266, "learning_rate": 1.1094340419243842e-05, "loss": 0.061, "step": 37360 }, { "epoch": 2.3419189070627313, "grad_norm": 0.21888981759548187, "learning_rate": 1.1083785438348357e-05, "loss": 0.0257, "step": 37370 }, { "epoch": 2.3425455912765556, "grad_norm": 5.909558296203613, "learning_rate": 1.1073230457452872e-05, "loss": 0.0862, "step": 37380 }, { "epoch": 2.3431722754903803, "grad_norm": 0.038163937628269196, "learning_rate": 1.1062675476557389e-05, "loss": 0.0518, "step": 37390 }, { "epoch": 2.343798959704205, "grad_norm": 0.045827217400074005, "learning_rate": 1.1052120495661904e-05, "loss": 0.1041, "step": 37400 }, { "epoch": 2.3444256439180298, "grad_norm": 7.460287570953369, "learning_rate": 1.1041565514766419e-05, "loss": 0.028, "step": 37410 }, { "epoch": 2.3450523281318545, "grad_norm": 0.032291725277900696, "learning_rate": 1.1031010533870935e-05, "loss": 0.0369, "step": 37420 }, { "epoch": 2.3456790123456788, "grad_norm": 0.025432724505662918, "learning_rate": 1.1020455552975449e-05, "loss": 0.0819, "step": 37430 }, { "epoch": 2.3463056965595035, "grad_norm": 0.009814858436584473, "learning_rate": 1.1009900572079965e-05, "loss": 0.1122, "step": 37440 }, { "epoch": 2.346932380773328, "grad_norm": 0.10019972175359726, "learning_rate": 1.099934559118448e-05, "loss": 0.0015, "step": 37450 }, { "epoch": 2.347559064987153, "grad_norm": 5.408548831939697, "learning_rate": 1.0988790610288995e-05, "loss": 0.1065, "step": 37460 }, { "epoch": 2.3481857492009777, "grad_norm": 5.011567115783691, "learning_rate": 1.0978235629393512e-05, "loss": 0.1711, "step": 37470 }, { "epoch": 2.3488124334148024, "grad_norm": 0.2545178234577179, "learning_rate": 1.0967680648498027e-05, "loss": 0.1407, "step": 37480 }, { "epoch": 2.349439117628627, "grad_norm": 0.04436579346656799, "learning_rate": 1.0957125667602543e-05, "loss": 0.1503, "step": 37490 }, { "epoch": 2.3500658018424514, "grad_norm": 18.1503849029541, "learning_rate": 1.0946570686707057e-05, "loss": 0.1198, "step": 37500 }, { "epoch": 2.350692486056276, "grad_norm": 0.2500666379928589, "learning_rate": 1.0936015705811573e-05, "loss": 0.071, "step": 37510 }, { "epoch": 2.351319170270101, "grad_norm": 0.2713271677494049, "learning_rate": 1.0925460724916088e-05, "loss": 0.1216, "step": 37520 }, { "epoch": 2.3519458544839256, "grad_norm": 0.2982094883918762, "learning_rate": 1.0914905744020603e-05, "loss": 0.09, "step": 37530 }, { "epoch": 2.3525725386977503, "grad_norm": 0.24116140604019165, "learning_rate": 1.090435076312512e-05, "loss": 0.0944, "step": 37540 }, { "epoch": 2.353199222911575, "grad_norm": 0.02480962686240673, "learning_rate": 1.0893795782229635e-05, "loss": 0.0286, "step": 37550 }, { "epoch": 2.3538259071253993, "grad_norm": 0.05994749814271927, "learning_rate": 1.088324080133415e-05, "loss": 0.1056, "step": 37560 }, { "epoch": 2.354452591339224, "grad_norm": 4.796363830566406, "learning_rate": 1.0872685820438666e-05, "loss": 0.2076, "step": 37570 }, { "epoch": 2.3550792755530487, "grad_norm": 0.0286555178463459, "learning_rate": 1.086213083954318e-05, "loss": 0.0233, "step": 37580 }, { "epoch": 2.3557059597668735, "grad_norm": 0.052487812936306, "learning_rate": 1.0851575858647696e-05, "loss": 0.04, "step": 37590 }, { "epoch": 2.356332643980698, "grad_norm": 0.03592820465564728, "learning_rate": 1.0841020877752211e-05, "loss": 0.0173, "step": 37600 }, { "epoch": 2.356959328194523, "grad_norm": 0.015476588159799576, "learning_rate": 1.0830465896856728e-05, "loss": 0.0018, "step": 37610 }, { "epoch": 2.3575860124083476, "grad_norm": 0.013065283186733723, "learning_rate": 1.0819910915961243e-05, "loss": 0.0705, "step": 37620 }, { "epoch": 2.358212696622172, "grad_norm": 9.37665843963623, "learning_rate": 1.0809355935065758e-05, "loss": 0.1219, "step": 37630 }, { "epoch": 2.3588393808359966, "grad_norm": 0.2045561969280243, "learning_rate": 1.0798800954170275e-05, "loss": 0.0633, "step": 37640 }, { "epoch": 2.3594660650498214, "grad_norm": 0.18942023813724518, "learning_rate": 1.078824597327479e-05, "loss": 0.0611, "step": 37650 }, { "epoch": 2.360092749263646, "grad_norm": 77.0622787475586, "learning_rate": 1.0777690992379304e-05, "loss": 0.1633, "step": 37660 }, { "epoch": 2.360719433477471, "grad_norm": 0.24678970873355865, "learning_rate": 1.076713601148382e-05, "loss": 0.052, "step": 37670 }, { "epoch": 2.3613461176912955, "grad_norm": 0.008172067813575268, "learning_rate": 1.0756581030588334e-05, "loss": 0.1067, "step": 37680 }, { "epoch": 2.36197280190512, "grad_norm": 0.05612856149673462, "learning_rate": 1.0746026049692851e-05, "loss": 0.1683, "step": 37690 }, { "epoch": 2.3625994861189445, "grad_norm": 0.016600340604782104, "learning_rate": 1.0735471068797366e-05, "loss": 0.0029, "step": 37700 }, { "epoch": 2.3632261703327693, "grad_norm": 0.4090496897697449, "learning_rate": 1.0724916087901881e-05, "loss": 0.0012, "step": 37710 }, { "epoch": 2.363852854546594, "grad_norm": 0.01410214975476265, "learning_rate": 1.0714361107006398e-05, "loss": 0.0022, "step": 37720 }, { "epoch": 2.3644795387604187, "grad_norm": 0.045551199465990067, "learning_rate": 1.0703806126110913e-05, "loss": 0.0283, "step": 37730 }, { "epoch": 2.3651062229742434, "grad_norm": 2.0473251342773438, "learning_rate": 1.0693251145215427e-05, "loss": 0.0508, "step": 37740 }, { "epoch": 2.365732907188068, "grad_norm": 6.74009370803833, "learning_rate": 1.0682696164319942e-05, "loss": 0.0948, "step": 37750 }, { "epoch": 2.3663595914018924, "grad_norm": 0.0833032876253128, "learning_rate": 1.0672141183424459e-05, "loss": 0.0332, "step": 37760 }, { "epoch": 2.366986275615717, "grad_norm": 15.18896484375, "learning_rate": 1.0661586202528974e-05, "loss": 0.0505, "step": 37770 }, { "epoch": 2.367612959829542, "grad_norm": 0.3920779526233673, "learning_rate": 1.0651031221633489e-05, "loss": 0.0363, "step": 37780 }, { "epoch": 2.3682396440433666, "grad_norm": 0.008296458050608635, "learning_rate": 1.0640476240738006e-05, "loss": 0.0467, "step": 37790 }, { "epoch": 2.3688663282571913, "grad_norm": 0.014929085038602352, "learning_rate": 1.062992125984252e-05, "loss": 0.1838, "step": 37800 }, { "epoch": 2.3694930124710156, "grad_norm": 0.024601779878139496, "learning_rate": 1.0619366278947036e-05, "loss": 0.0027, "step": 37810 }, { "epoch": 2.3701196966848403, "grad_norm": 0.1446009874343872, "learning_rate": 1.060881129805155e-05, "loss": 0.0521, "step": 37820 }, { "epoch": 2.370746380898665, "grad_norm": 0.022335579618811607, "learning_rate": 1.0598256317156065e-05, "loss": 0.2932, "step": 37830 }, { "epoch": 2.37137306511249, "grad_norm": 0.03476012498140335, "learning_rate": 1.0587701336260582e-05, "loss": 0.2129, "step": 37840 }, { "epoch": 2.3719997493263145, "grad_norm": 0.10861321538686752, "learning_rate": 1.0577146355365097e-05, "loss": 0.0755, "step": 37850 }, { "epoch": 2.3726264335401392, "grad_norm": 0.04184696078300476, "learning_rate": 1.0566591374469614e-05, "loss": 0.0031, "step": 37860 }, { "epoch": 2.373253117753964, "grad_norm": 6.86499547958374, "learning_rate": 1.0556036393574129e-05, "loss": 0.2977, "step": 37870 }, { "epoch": 2.3738798019677883, "grad_norm": 0.027534818276762962, "learning_rate": 1.0545481412678644e-05, "loss": 0.0055, "step": 37880 }, { "epoch": 2.374506486181613, "grad_norm": 1.948462963104248, "learning_rate": 1.0534926431783159e-05, "loss": 0.0605, "step": 37890 }, { "epoch": 2.3751331703954377, "grad_norm": 8.56440258026123, "learning_rate": 1.0524371450887674e-05, "loss": 0.0694, "step": 37900 }, { "epoch": 2.3757598546092624, "grad_norm": 0.39405912160873413, "learning_rate": 1.051381646999219e-05, "loss": 0.0028, "step": 37910 }, { "epoch": 2.376386538823087, "grad_norm": 0.13846786320209503, "learning_rate": 1.0503261489096705e-05, "loss": 0.0018, "step": 37920 }, { "epoch": 2.377013223036912, "grad_norm": 0.024694940075278282, "learning_rate": 1.049270650820122e-05, "loss": 0.0351, "step": 37930 }, { "epoch": 2.377639907250736, "grad_norm": 0.013253679499030113, "learning_rate": 1.0482151527305737e-05, "loss": 0.0972, "step": 37940 }, { "epoch": 2.378266591464561, "grad_norm": 0.010091162286698818, "learning_rate": 1.0471596546410252e-05, "loss": 0.0472, "step": 37950 }, { "epoch": 2.3788932756783856, "grad_norm": 0.0070129623636603355, "learning_rate": 1.0461041565514768e-05, "loss": 0.1027, "step": 37960 }, { "epoch": 2.3795199598922103, "grad_norm": 0.36958637833595276, "learning_rate": 1.0450486584619282e-05, "loss": 0.1343, "step": 37970 }, { "epoch": 2.380146644106035, "grad_norm": 24.933622360229492, "learning_rate": 1.0439931603723797e-05, "loss": 0.0473, "step": 37980 }, { "epoch": 2.3807733283198598, "grad_norm": 0.015931887552142143, "learning_rate": 1.0429376622828313e-05, "loss": 0.0813, "step": 37990 }, { "epoch": 2.3814000125336845, "grad_norm": 0.03066008910536766, "learning_rate": 1.0418821641932828e-05, "loss": 0.0206, "step": 38000 }, { "epoch": 2.3820266967475088, "grad_norm": 0.011086874641478062, "learning_rate": 1.0408266661037345e-05, "loss": 0.0025, "step": 38010 }, { "epoch": 2.3826533809613335, "grad_norm": 0.011415200307965279, "learning_rate": 1.039771168014186e-05, "loss": 0.051, "step": 38020 }, { "epoch": 2.3832800651751582, "grad_norm": 0.017173422500491142, "learning_rate": 1.0387156699246375e-05, "loss": 0.1283, "step": 38030 }, { "epoch": 2.383906749388983, "grad_norm": 0.03894858434796333, "learning_rate": 1.0376601718350891e-05, "loss": 0.0005, "step": 38040 }, { "epoch": 2.3845334336028077, "grad_norm": 0.020368503406643867, "learning_rate": 1.0366046737455405e-05, "loss": 0.0711, "step": 38050 }, { "epoch": 2.385160117816632, "grad_norm": 0.01607118360698223, "learning_rate": 1.0355491756559921e-05, "loss": 0.0114, "step": 38060 }, { "epoch": 2.3857868020304567, "grad_norm": 14.362074851989746, "learning_rate": 1.0344936775664436e-05, "loss": 0.0455, "step": 38070 }, { "epoch": 2.3864134862442814, "grad_norm": 0.18846359848976135, "learning_rate": 1.0334381794768951e-05, "loss": 0.0081, "step": 38080 }, { "epoch": 2.387040170458106, "grad_norm": 0.020711679011583328, "learning_rate": 1.0323826813873468e-05, "loss": 0.1372, "step": 38090 }, { "epoch": 2.387666854671931, "grad_norm": 0.01883024349808693, "learning_rate": 1.0313271832977983e-05, "loss": 0.0023, "step": 38100 }, { "epoch": 2.3882935388857556, "grad_norm": 0.052873801440000534, "learning_rate": 1.03027168520825e-05, "loss": 0.0184, "step": 38110 }, { "epoch": 2.3889202230995803, "grad_norm": 0.7084928750991821, "learning_rate": 1.0292161871187013e-05, "loss": 0.0062, "step": 38120 }, { "epoch": 2.389546907313405, "grad_norm": 0.040277425199747086, "learning_rate": 1.0281606890291528e-05, "loss": 0.0184, "step": 38130 }, { "epoch": 2.3901735915272293, "grad_norm": 12.49878978729248, "learning_rate": 1.0271051909396044e-05, "loss": 0.0084, "step": 38140 }, { "epoch": 2.390800275741054, "grad_norm": 0.007965478114783764, "learning_rate": 1.026049692850056e-05, "loss": 0.0019, "step": 38150 }, { "epoch": 2.3914269599548788, "grad_norm": 0.0063006095588207245, "learning_rate": 1.0249941947605076e-05, "loss": 0.0573, "step": 38160 }, { "epoch": 2.3920536441687035, "grad_norm": 4.532165050506592, "learning_rate": 1.0239386966709591e-05, "loss": 0.0936, "step": 38170 }, { "epoch": 2.392680328382528, "grad_norm": 0.012650924734771252, "learning_rate": 1.0228831985814106e-05, "loss": 0.0184, "step": 38180 }, { "epoch": 2.3933070125963525, "grad_norm": 0.1698913872241974, "learning_rate": 1.0218277004918623e-05, "loss": 0.1913, "step": 38190 }, { "epoch": 2.393933696810177, "grad_norm": 0.010228320956230164, "learning_rate": 1.0207722024023136e-05, "loss": 0.1132, "step": 38200 }, { "epoch": 2.394560381024002, "grad_norm": 0.036675091832876205, "learning_rate": 1.0197167043127652e-05, "loss": 0.1673, "step": 38210 }, { "epoch": 2.3951870652378267, "grad_norm": 0.07806365191936493, "learning_rate": 1.0186612062232167e-05, "loss": 0.1276, "step": 38220 }, { "epoch": 2.3958137494516514, "grad_norm": 0.01533064991235733, "learning_rate": 1.0176057081336682e-05, "loss": 0.0448, "step": 38230 }, { "epoch": 2.396440433665476, "grad_norm": 19.43166160583496, "learning_rate": 1.0165502100441199e-05, "loss": 0.1547, "step": 38240 }, { "epoch": 2.397067117879301, "grad_norm": 0.04978703707456589, "learning_rate": 1.0154947119545714e-05, "loss": 0.0024, "step": 38250 }, { "epoch": 2.397693802093125, "grad_norm": 0.014913588762283325, "learning_rate": 1.014439213865023e-05, "loss": 0.0507, "step": 38260 }, { "epoch": 2.39832048630695, "grad_norm": 0.029882259666919708, "learning_rate": 1.0133837157754746e-05, "loss": 0.1431, "step": 38270 }, { "epoch": 2.3989471705207746, "grad_norm": 0.27971410751342773, "learning_rate": 1.012328217685926e-05, "loss": 0.0017, "step": 38280 }, { "epoch": 2.3995738547345993, "grad_norm": 0.0225802194327116, "learning_rate": 1.0112727195963776e-05, "loss": 0.0679, "step": 38290 }, { "epoch": 2.400200538948424, "grad_norm": 0.11946109682321548, "learning_rate": 1.010217221506829e-05, "loss": 0.1072, "step": 38300 }, { "epoch": 2.4008272231622487, "grad_norm": 0.018057910725474358, "learning_rate": 1.0091617234172807e-05, "loss": 0.0599, "step": 38310 }, { "epoch": 2.401453907376073, "grad_norm": 0.16458840668201447, "learning_rate": 1.0081062253277322e-05, "loss": 0.0039, "step": 38320 }, { "epoch": 2.4020805915898977, "grad_norm": 0.013043378479778767, "learning_rate": 1.0070507272381837e-05, "loss": 0.0631, "step": 38330 }, { "epoch": 2.4027072758037225, "grad_norm": 0.08399718254804611, "learning_rate": 1.0059952291486354e-05, "loss": 0.065, "step": 38340 }, { "epoch": 2.403333960017547, "grad_norm": 0.020437980070710182, "learning_rate": 1.0049397310590869e-05, "loss": 0.0777, "step": 38350 }, { "epoch": 2.403960644231372, "grad_norm": 5.764674663543701, "learning_rate": 1.0038842329695384e-05, "loss": 0.0664, "step": 38360 }, { "epoch": 2.4045873284451966, "grad_norm": 0.020989634096622467, "learning_rate": 1.0028287348799899e-05, "loss": 0.0679, "step": 38370 }, { "epoch": 2.4052140126590213, "grad_norm": 0.12513042986392975, "learning_rate": 1.0017732367904415e-05, "loss": 0.0018, "step": 38380 }, { "epoch": 2.4058406968728456, "grad_norm": 0.040014710277318954, "learning_rate": 1.000717738700893e-05, "loss": 0.0711, "step": 38390 }, { "epoch": 2.4064673810866704, "grad_norm": 5.037256240844727, "learning_rate": 9.996622406113445e-06, "loss": 0.3477, "step": 38400 }, { "epoch": 2.407094065300495, "grad_norm": 0.5470215678215027, "learning_rate": 9.986067425217962e-06, "loss": 0.0053, "step": 38410 }, { "epoch": 2.40772074951432, "grad_norm": 0.16532324254512787, "learning_rate": 9.975512444322477e-06, "loss": 0.0031, "step": 38420 }, { "epoch": 2.4083474337281445, "grad_norm": 0.011890468187630177, "learning_rate": 9.964957463426992e-06, "loss": 0.1645, "step": 38430 }, { "epoch": 2.408974117941969, "grad_norm": 9.385170936584473, "learning_rate": 9.954402482531507e-06, "loss": 0.1329, "step": 38440 }, { "epoch": 2.4096008021557935, "grad_norm": 0.021074773743748665, "learning_rate": 9.943847501636022e-06, "loss": 0.002, "step": 38450 }, { "epoch": 2.4102274863696183, "grad_norm": 0.01659083366394043, "learning_rate": 9.933292520740538e-06, "loss": 0.0025, "step": 38460 }, { "epoch": 2.410854170583443, "grad_norm": 0.01685764454305172, "learning_rate": 9.922737539845053e-06, "loss": 0.0032, "step": 38470 }, { "epoch": 2.4114808547972677, "grad_norm": 0.007443627342581749, "learning_rate": 9.912182558949568e-06, "loss": 0.0022, "step": 38480 }, { "epoch": 2.4121075390110924, "grad_norm": 0.13265888392925262, "learning_rate": 9.901627578054085e-06, "loss": 0.1902, "step": 38490 }, { "epoch": 2.412734223224917, "grad_norm": 0.04250665754079819, "learning_rate": 9.8910725971586e-06, "loss": 0.0988, "step": 38500 }, { "epoch": 2.4133609074387414, "grad_norm": 0.2933494448661804, "learning_rate": 9.880517616263115e-06, "loss": 0.0713, "step": 38510 }, { "epoch": 2.413987591652566, "grad_norm": 0.030012181028723717, "learning_rate": 9.86996263536763e-06, "loss": 0.0635, "step": 38520 }, { "epoch": 2.414614275866391, "grad_norm": 0.21046006679534912, "learning_rate": 9.859407654472146e-06, "loss": 0.0017, "step": 38530 }, { "epoch": 2.4152409600802156, "grad_norm": 0.006909917574375868, "learning_rate": 9.848852673576661e-06, "loss": 0.2198, "step": 38540 }, { "epoch": 2.4158676442940403, "grad_norm": 0.1460009217262268, "learning_rate": 9.838297692681176e-06, "loss": 0.0016, "step": 38550 }, { "epoch": 2.416494328507865, "grad_norm": 0.49015122652053833, "learning_rate": 9.827742711785693e-06, "loss": 0.0041, "step": 38560 }, { "epoch": 2.4171210127216893, "grad_norm": 0.0862099900841713, "learning_rate": 9.817187730890208e-06, "loss": 0.1908, "step": 38570 }, { "epoch": 2.417747696935514, "grad_norm": 0.007962826639413834, "learning_rate": 9.806632749994723e-06, "loss": 0.0472, "step": 38580 }, { "epoch": 2.418374381149339, "grad_norm": 0.023178890347480774, "learning_rate": 9.796077769099238e-06, "loss": 0.0047, "step": 38590 }, { "epoch": 2.4190010653631635, "grad_norm": 0.013865641318261623, "learning_rate": 9.785522788203753e-06, "loss": 0.0388, "step": 38600 }, { "epoch": 2.4196277495769882, "grad_norm": 0.03907917067408562, "learning_rate": 9.77496780730827e-06, "loss": 0.0341, "step": 38610 }, { "epoch": 2.420254433790813, "grad_norm": 0.25156423449516296, "learning_rate": 9.764412826412784e-06, "loss": 0.0868, "step": 38620 }, { "epoch": 2.4208811180046377, "grad_norm": 5.6584343910217285, "learning_rate": 9.753857845517301e-06, "loss": 0.1857, "step": 38630 }, { "epoch": 2.421507802218462, "grad_norm": 6.962517261505127, "learning_rate": 9.743302864621816e-06, "loss": 0.0857, "step": 38640 }, { "epoch": 2.4221344864322867, "grad_norm": 0.031737297773361206, "learning_rate": 9.732747883726331e-06, "loss": 0.1053, "step": 38650 }, { "epoch": 2.4227611706461114, "grad_norm": 0.24770425260066986, "learning_rate": 9.722192902830848e-06, "loss": 0.0985, "step": 38660 }, { "epoch": 2.423387854859936, "grad_norm": 0.041502561420202255, "learning_rate": 9.71163792193536e-06, "loss": 0.0038, "step": 38670 }, { "epoch": 2.424014539073761, "grad_norm": 0.026886025443673134, "learning_rate": 9.701082941039877e-06, "loss": 0.0027, "step": 38680 }, { "epoch": 2.4246412232875856, "grad_norm": 5.663731575012207, "learning_rate": 9.690527960144392e-06, "loss": 0.1705, "step": 38690 }, { "epoch": 2.42526790750141, "grad_norm": 0.05266406387090683, "learning_rate": 9.679972979248907e-06, "loss": 0.0965, "step": 38700 }, { "epoch": 2.4258945917152346, "grad_norm": 0.025294840335845947, "learning_rate": 9.669417998353424e-06, "loss": 0.0695, "step": 38710 }, { "epoch": 2.4265212759290593, "grad_norm": 24.895811080932617, "learning_rate": 9.658863017457939e-06, "loss": 0.1557, "step": 38720 }, { "epoch": 2.427147960142884, "grad_norm": 0.04512655735015869, "learning_rate": 9.648308036562454e-06, "loss": 0.0673, "step": 38730 }, { "epoch": 2.4277746443567088, "grad_norm": 0.47210386395454407, "learning_rate": 9.637753055666969e-06, "loss": 0.1174, "step": 38740 }, { "epoch": 2.4284013285705335, "grad_norm": 0.07949967682361603, "learning_rate": 9.627198074771484e-06, "loss": 0.0547, "step": 38750 }, { "epoch": 2.429028012784358, "grad_norm": 0.11611676961183548, "learning_rate": 9.616643093876e-06, "loss": 0.1079, "step": 38760 }, { "epoch": 2.4296546969981825, "grad_norm": 9.299555778503418, "learning_rate": 9.606088112980515e-06, "loss": 0.0926, "step": 38770 }, { "epoch": 2.430281381212007, "grad_norm": 0.05925847962498665, "learning_rate": 9.595533132085032e-06, "loss": 0.215, "step": 38780 }, { "epoch": 2.430908065425832, "grad_norm": 0.07853665202856064, "learning_rate": 9.584978151189547e-06, "loss": 0.0559, "step": 38790 }, { "epoch": 2.4315347496396567, "grad_norm": 0.1322367936372757, "learning_rate": 9.574423170294062e-06, "loss": 0.0045, "step": 38800 }, { "epoch": 2.4321614338534814, "grad_norm": 0.22832946479320526, "learning_rate": 9.563868189398579e-06, "loss": 0.1404, "step": 38810 }, { "epoch": 2.4327881180673057, "grad_norm": 0.021371060982346535, "learning_rate": 9.553313208503092e-06, "loss": 0.1971, "step": 38820 }, { "epoch": 2.4334148022811304, "grad_norm": 0.03888564184308052, "learning_rate": 9.542758227607609e-06, "loss": 0.059, "step": 38830 }, { "epoch": 2.434041486494955, "grad_norm": 0.0451219379901886, "learning_rate": 9.532203246712124e-06, "loss": 0.094, "step": 38840 }, { "epoch": 2.43466817070878, "grad_norm": 0.03573371469974518, "learning_rate": 9.521648265816639e-06, "loss": 0.0276, "step": 38850 }, { "epoch": 2.4352948549226046, "grad_norm": 0.05684259161353111, "learning_rate": 9.511093284921155e-06, "loss": 0.0492, "step": 38860 }, { "epoch": 2.4359215391364293, "grad_norm": 0.021530529484152794, "learning_rate": 9.50053830402567e-06, "loss": 0.0352, "step": 38870 }, { "epoch": 2.436548223350254, "grad_norm": 6.627198219299316, "learning_rate": 9.489983323130187e-06, "loss": 0.2246, "step": 38880 }, { "epoch": 2.4371749075640783, "grad_norm": 8.19845962524414, "learning_rate": 9.479428342234702e-06, "loss": 0.1038, "step": 38890 }, { "epoch": 2.437801591777903, "grad_norm": 0.13801322877407074, "learning_rate": 9.468873361339215e-06, "loss": 0.0036, "step": 38900 }, { "epoch": 2.4384282759917277, "grad_norm": 0.16967302560806274, "learning_rate": 9.458318380443732e-06, "loss": 0.0364, "step": 38910 }, { "epoch": 2.4390549602055525, "grad_norm": 0.4350135922431946, "learning_rate": 9.447763399548247e-06, "loss": 0.028, "step": 38920 }, { "epoch": 2.439681644419377, "grad_norm": 4.991820812225342, "learning_rate": 9.437208418652763e-06, "loss": 0.1386, "step": 38930 }, { "epoch": 2.440308328633202, "grad_norm": 0.03626188263297081, "learning_rate": 9.426653437757278e-06, "loss": 0.0017, "step": 38940 }, { "epoch": 2.440935012847026, "grad_norm": 0.006887183059006929, "learning_rate": 9.416098456861793e-06, "loss": 0.1327, "step": 38950 }, { "epoch": 2.441561697060851, "grad_norm": 0.005281991325318813, "learning_rate": 9.40554347596631e-06, "loss": 0.004, "step": 38960 }, { "epoch": 2.4421883812746756, "grad_norm": 0.00498551269993186, "learning_rate": 9.394988495070825e-06, "loss": 0.0388, "step": 38970 }, { "epoch": 2.4428150654885004, "grad_norm": 0.017654692754149437, "learning_rate": 9.38443351417534e-06, "loss": 0.1044, "step": 38980 }, { "epoch": 2.443441749702325, "grad_norm": 0.06669986993074417, "learning_rate": 9.373878533279855e-06, "loss": 0.1428, "step": 38990 }, { "epoch": 2.44406843391615, "grad_norm": 0.19353161752223969, "learning_rate": 9.36332355238437e-06, "loss": 0.0032, "step": 39000 }, { "epoch": 2.4446951181299745, "grad_norm": 0.19353748857975006, "learning_rate": 9.352768571488886e-06, "loss": 0.0608, "step": 39010 }, { "epoch": 2.445321802343799, "grad_norm": 0.011640098877251148, "learning_rate": 9.342213590593401e-06, "loss": 0.021, "step": 39020 }, { "epoch": 2.4459484865576235, "grad_norm": 0.052307840436697006, "learning_rate": 9.331658609697918e-06, "loss": 0.028, "step": 39030 }, { "epoch": 2.4465751707714483, "grad_norm": 0.01141645573079586, "learning_rate": 9.321103628802433e-06, "loss": 0.051, "step": 39040 }, { "epoch": 2.447201854985273, "grad_norm": 0.012516812421381474, "learning_rate": 9.310548647906948e-06, "loss": 0.0318, "step": 39050 }, { "epoch": 2.4478285391990977, "grad_norm": 0.07261917740106583, "learning_rate": 9.299993667011463e-06, "loss": 0.0009, "step": 39060 }, { "epoch": 2.448455223412922, "grad_norm": 0.07534591108560562, "learning_rate": 9.289438686115978e-06, "loss": 0.0406, "step": 39070 }, { "epoch": 2.4490819076267467, "grad_norm": 0.012093325145542622, "learning_rate": 9.278883705220494e-06, "loss": 0.069, "step": 39080 }, { "epoch": 2.4497085918405714, "grad_norm": 0.021784339100122452, "learning_rate": 9.26832872432501e-06, "loss": 0.1769, "step": 39090 }, { "epoch": 2.450335276054396, "grad_norm": 0.08205260336399078, "learning_rate": 9.257773743429524e-06, "loss": 0.0058, "step": 39100 }, { "epoch": 2.450961960268221, "grad_norm": 0.012557939626276493, "learning_rate": 9.247218762534041e-06, "loss": 0.0407, "step": 39110 }, { "epoch": 2.4515886444820456, "grad_norm": 0.00688109640032053, "learning_rate": 9.236663781638556e-06, "loss": 0.0449, "step": 39120 }, { "epoch": 2.4522153286958703, "grad_norm": 0.035959869623184204, "learning_rate": 9.226108800743071e-06, "loss": 0.0687, "step": 39130 }, { "epoch": 2.4528420129096946, "grad_norm": 18.920211791992188, "learning_rate": 9.215553819847586e-06, "loss": 0.1367, "step": 39140 }, { "epoch": 2.4534686971235193, "grad_norm": 0.011474884115159512, "learning_rate": 9.2049988389521e-06, "loss": 0.0033, "step": 39150 }, { "epoch": 2.454095381337344, "grad_norm": 5.881065368652344, "learning_rate": 9.194443858056617e-06, "loss": 0.0946, "step": 39160 }, { "epoch": 2.454722065551169, "grad_norm": 0.00592809310182929, "learning_rate": 9.183888877161132e-06, "loss": 0.0388, "step": 39170 }, { "epoch": 2.4553487497649935, "grad_norm": 0.6982523202896118, "learning_rate": 9.173333896265649e-06, "loss": 0.1002, "step": 39180 }, { "epoch": 2.4559754339788182, "grad_norm": 0.012783760204911232, "learning_rate": 9.162778915370164e-06, "loss": 0.0076, "step": 39190 }, { "epoch": 2.4566021181926425, "grad_norm": 6.998777389526367, "learning_rate": 9.152223934474679e-06, "loss": 0.053, "step": 39200 }, { "epoch": 2.4572288024064672, "grad_norm": 0.18721304833889008, "learning_rate": 9.141668953579194e-06, "loss": 0.0025, "step": 39210 }, { "epoch": 2.457855486620292, "grad_norm": 0.012833227403461933, "learning_rate": 9.131113972683709e-06, "loss": 0.0976, "step": 39220 }, { "epoch": 2.4584821708341167, "grad_norm": 0.014098461717367172, "learning_rate": 9.120558991788226e-06, "loss": 0.0631, "step": 39230 }, { "epoch": 2.4591088550479414, "grad_norm": 0.7037414908409119, "learning_rate": 9.11000401089274e-06, "loss": 0.0014, "step": 39240 }, { "epoch": 2.459735539261766, "grad_norm": 0.0042467243038117886, "learning_rate": 9.099449029997255e-06, "loss": 0.0062, "step": 39250 }, { "epoch": 2.460362223475591, "grad_norm": 0.005058089271187782, "learning_rate": 9.088894049101772e-06, "loss": 0.0743, "step": 39260 }, { "epoch": 2.460988907689415, "grad_norm": 0.009189349599182606, "learning_rate": 9.078339068206287e-06, "loss": 0.1397, "step": 39270 }, { "epoch": 2.46161559190324, "grad_norm": 0.005269974935799837, "learning_rate": 9.067784087310804e-06, "loss": 0.0057, "step": 39280 }, { "epoch": 2.4622422761170646, "grad_norm": 1.0407090187072754, "learning_rate": 9.057229106415317e-06, "loss": 0.043, "step": 39290 }, { "epoch": 2.4628689603308893, "grad_norm": 0.6664018630981445, "learning_rate": 9.046674125519834e-06, "loss": 0.0912, "step": 39300 }, { "epoch": 2.463495644544714, "grad_norm": 0.008333146572113037, "learning_rate": 9.036119144624349e-06, "loss": 0.315, "step": 39310 }, { "epoch": 2.4641223287585388, "grad_norm": 6.8014445304870605, "learning_rate": 9.025564163728864e-06, "loss": 0.0057, "step": 39320 }, { "epoch": 2.464749012972363, "grad_norm": 0.10319343209266663, "learning_rate": 9.01500918283338e-06, "loss": 0.0191, "step": 39330 }, { "epoch": 2.4653756971861878, "grad_norm": 17.48542594909668, "learning_rate": 9.004454201937895e-06, "loss": 0.2083, "step": 39340 }, { "epoch": 2.4660023814000125, "grad_norm": 0.022007020190358162, "learning_rate": 8.99389922104241e-06, "loss": 0.002, "step": 39350 }, { "epoch": 2.466629065613837, "grad_norm": 0.08251410722732544, "learning_rate": 8.983344240146927e-06, "loss": 0.1012, "step": 39360 }, { "epoch": 2.467255749827662, "grad_norm": 0.1723119467496872, "learning_rate": 8.97278925925144e-06, "loss": 0.104, "step": 39370 }, { "epoch": 2.4678824340414867, "grad_norm": 0.06838217377662659, "learning_rate": 8.962234278355957e-06, "loss": 0.002, "step": 39380 }, { "epoch": 2.4685091182553114, "grad_norm": 0.01145200990140438, "learning_rate": 8.951679297460472e-06, "loss": 0.0015, "step": 39390 }, { "epoch": 2.4691358024691357, "grad_norm": 0.01762591302394867, "learning_rate": 8.941124316564988e-06, "loss": 0.0528, "step": 39400 }, { "epoch": 2.4697624866829604, "grad_norm": 0.11748268455266953, "learning_rate": 8.930569335669503e-06, "loss": 0.0992, "step": 39410 }, { "epoch": 2.470389170896785, "grad_norm": 0.04246789962053299, "learning_rate": 8.920014354774018e-06, "loss": 0.0537, "step": 39420 }, { "epoch": 2.47101585511061, "grad_norm": 0.12296553701162338, "learning_rate": 8.909459373878535e-06, "loss": 0.0725, "step": 39430 }, { "epoch": 2.4716425393244346, "grad_norm": 0.008064544759690762, "learning_rate": 8.898904392983048e-06, "loss": 0.04, "step": 39440 }, { "epoch": 2.472269223538259, "grad_norm": 0.005667585413902998, "learning_rate": 8.888349412087565e-06, "loss": 0.0019, "step": 39450 }, { "epoch": 2.4728959077520836, "grad_norm": 0.0050772554241120815, "learning_rate": 8.87779443119208e-06, "loss": 0.0014, "step": 39460 }, { "epoch": 2.4735225919659083, "grad_norm": 0.010119447484612465, "learning_rate": 8.867239450296595e-06, "loss": 0.1343, "step": 39470 }, { "epoch": 2.474149276179733, "grad_norm": 0.0813032016158104, "learning_rate": 8.856684469401111e-06, "loss": 0.0013, "step": 39480 }, { "epoch": 2.4747759603935577, "grad_norm": 2.469439744949341, "learning_rate": 8.846129488505626e-06, "loss": 0.0497, "step": 39490 }, { "epoch": 2.4754026446073825, "grad_norm": 0.027179885655641556, "learning_rate": 8.835574507610141e-06, "loss": 0.0521, "step": 39500 }, { "epoch": 2.476029328821207, "grad_norm": 0.04990369826555252, "learning_rate": 8.825019526714658e-06, "loss": 0.1079, "step": 39510 }, { "epoch": 2.4766560130350315, "grad_norm": 9.644336700439453, "learning_rate": 8.814464545819171e-06, "loss": 0.0785, "step": 39520 }, { "epoch": 2.477282697248856, "grad_norm": 0.008019731380045414, "learning_rate": 8.803909564923688e-06, "loss": 0.0042, "step": 39530 }, { "epoch": 2.477909381462681, "grad_norm": 0.008901724591851234, "learning_rate": 8.793354584028203e-06, "loss": 0.1083, "step": 39540 }, { "epoch": 2.4785360656765056, "grad_norm": 0.05041956529021263, "learning_rate": 8.78279960313272e-06, "loss": 0.1845, "step": 39550 }, { "epoch": 2.4791627498903304, "grad_norm": 0.06697653979063034, "learning_rate": 8.772244622237234e-06, "loss": 0.03, "step": 39560 }, { "epoch": 2.479789434104155, "grad_norm": 0.031245127320289612, "learning_rate": 8.76168964134175e-06, "loss": 0.0018, "step": 39570 }, { "epoch": 2.4804161183179794, "grad_norm": 0.024495404213666916, "learning_rate": 8.751134660446266e-06, "loss": 0.0303, "step": 39580 }, { "epoch": 2.481042802531804, "grad_norm": 0.006389065179973841, "learning_rate": 8.740579679550781e-06, "loss": 0.0641, "step": 39590 }, { "epoch": 2.481669486745629, "grad_norm": 0.29042214155197144, "learning_rate": 8.730024698655296e-06, "loss": 0.0874, "step": 39600 }, { "epoch": 2.4822961709594535, "grad_norm": 0.007173601537942886, "learning_rate": 8.719469717759811e-06, "loss": 0.0814, "step": 39610 }, { "epoch": 2.4829228551732783, "grad_norm": 40.7451286315918, "learning_rate": 8.708914736864326e-06, "loss": 0.0931, "step": 39620 }, { "epoch": 2.483549539387103, "grad_norm": 0.007661072537302971, "learning_rate": 8.698359755968842e-06, "loss": 0.1018, "step": 39630 }, { "epoch": 2.4841762236009277, "grad_norm": 0.0291671734303236, "learning_rate": 8.687804775073357e-06, "loss": 0.0021, "step": 39640 }, { "epoch": 2.484802907814752, "grad_norm": 0.05833710730075836, "learning_rate": 8.677249794177874e-06, "loss": 0.0815, "step": 39650 }, { "epoch": 2.4854295920285767, "grad_norm": 0.09396538883447647, "learning_rate": 8.666694813282389e-06, "loss": 0.1411, "step": 39660 }, { "epoch": 2.4860562762424014, "grad_norm": 9.64147663116455, "learning_rate": 8.656139832386904e-06, "loss": 0.1394, "step": 39670 }, { "epoch": 2.486682960456226, "grad_norm": 0.21708106994628906, "learning_rate": 8.645584851491419e-06, "loss": 0.0018, "step": 39680 }, { "epoch": 2.487309644670051, "grad_norm": 6.932487964630127, "learning_rate": 8.635029870595934e-06, "loss": 0.2451, "step": 39690 }, { "epoch": 2.487936328883875, "grad_norm": 0.5158403515815735, "learning_rate": 8.62447488970045e-06, "loss": 0.0026, "step": 39700 }, { "epoch": 2.4885630130977, "grad_norm": 0.14011597633361816, "learning_rate": 8.613919908804966e-06, "loss": 0.0171, "step": 39710 }, { "epoch": 2.4891896973115246, "grad_norm": 0.01039296854287386, "learning_rate": 8.60336492790948e-06, "loss": 0.0317, "step": 39720 }, { "epoch": 2.4898163815253493, "grad_norm": 0.17363180220127106, "learning_rate": 8.592809947013997e-06, "loss": 0.0335, "step": 39730 }, { "epoch": 2.490443065739174, "grad_norm": 0.46715372800827026, "learning_rate": 8.582254966118512e-06, "loss": 0.0054, "step": 39740 }, { "epoch": 2.491069749952999, "grad_norm": 0.18222229182720184, "learning_rate": 8.571699985223027e-06, "loss": 0.1999, "step": 39750 }, { "epoch": 2.4916964341668235, "grad_norm": 0.03781836852431297, "learning_rate": 8.561145004327542e-06, "loss": 0.0014, "step": 39760 }, { "epoch": 2.4923231183806482, "grad_norm": 4.425987243652344, "learning_rate": 8.550590023432057e-06, "loss": 0.0785, "step": 39770 }, { "epoch": 2.4929498025944725, "grad_norm": 0.0323716439306736, "learning_rate": 8.540035042536574e-06, "loss": 0.1397, "step": 39780 }, { "epoch": 2.4935764868082972, "grad_norm": 0.3504541516304016, "learning_rate": 8.529480061641089e-06, "loss": 0.1509, "step": 39790 }, { "epoch": 2.494203171022122, "grad_norm": 0.018971078097820282, "learning_rate": 8.518925080745605e-06, "loss": 0.0023, "step": 39800 }, { "epoch": 2.4948298552359467, "grad_norm": 0.1483769714832306, "learning_rate": 8.50837009985012e-06, "loss": 0.0789, "step": 39810 }, { "epoch": 2.4954565394497714, "grad_norm": 0.013152056373655796, "learning_rate": 8.497815118954635e-06, "loss": 0.0024, "step": 39820 }, { "epoch": 2.4960832236635957, "grad_norm": 7.379671096801758, "learning_rate": 8.48726013805915e-06, "loss": 0.0829, "step": 39830 }, { "epoch": 2.4967099078774204, "grad_norm": 0.007252856157720089, "learning_rate": 8.476705157163665e-06, "loss": 0.0338, "step": 39840 }, { "epoch": 2.497336592091245, "grad_norm": 0.0384717620909214, "learning_rate": 8.466150176268182e-06, "loss": 0.0011, "step": 39850 }, { "epoch": 2.49796327630507, "grad_norm": 0.05042622238397598, "learning_rate": 8.455595195372697e-06, "loss": 0.1155, "step": 39860 }, { "epoch": 2.4985899605188946, "grad_norm": 0.009568829089403152, "learning_rate": 8.445040214477212e-06, "loss": 0.0009, "step": 39870 }, { "epoch": 2.4992166447327193, "grad_norm": 0.05151621252298355, "learning_rate": 8.434485233581728e-06, "loss": 0.0021, "step": 39880 }, { "epoch": 2.499843328946544, "grad_norm": 2.42625093460083, "learning_rate": 8.423930252686243e-06, "loss": 0.0025, "step": 39890 }, { "epoch": 2.5004700131603688, "grad_norm": 0.03024793043732643, "learning_rate": 8.41337527179076e-06, "loss": 0.0068, "step": 39900 }, { "epoch": 2.501096697374193, "grad_norm": 0.10120806097984314, "learning_rate": 8.402820290895273e-06, "loss": 0.1672, "step": 39910 }, { "epoch": 2.5017233815880178, "grad_norm": 0.055157218128442764, "learning_rate": 8.392265309999788e-06, "loss": 0.0486, "step": 39920 }, { "epoch": 2.5023500658018425, "grad_norm": 0.015098130330443382, "learning_rate": 8.381710329104305e-06, "loss": 0.1388, "step": 39930 }, { "epoch": 2.502976750015667, "grad_norm": 0.17691518366336823, "learning_rate": 8.37115534820882e-06, "loss": 0.0009, "step": 39940 }, { "epoch": 2.5036034342294915, "grad_norm": 0.35355162620544434, "learning_rate": 8.360600367313336e-06, "loss": 0.0522, "step": 39950 }, { "epoch": 2.504230118443316, "grad_norm": 0.04962703213095665, "learning_rate": 8.350045386417851e-06, "loss": 0.0506, "step": 39960 }, { "epoch": 2.504856802657141, "grad_norm": 0.08713936060667038, "learning_rate": 8.339490405522366e-06, "loss": 0.085, "step": 39970 }, { "epoch": 2.5054834868709657, "grad_norm": 8.202126502990723, "learning_rate": 8.328935424626883e-06, "loss": 0.2781, "step": 39980 }, { "epoch": 2.5061101710847904, "grad_norm": 15.094564437866211, "learning_rate": 8.318380443731396e-06, "loss": 0.2105, "step": 39990 }, { "epoch": 2.506736855298615, "grad_norm": 0.005411601159721613, "learning_rate": 8.307825462835913e-06, "loss": 0.0096, "step": 40000 }, { "epoch": 2.50736353951244, "grad_norm": 0.4384141266345978, "learning_rate": 8.297270481940428e-06, "loss": 0.039, "step": 40010 }, { "epoch": 2.5079902237262646, "grad_norm": 10.535191535949707, "learning_rate": 8.286715501044943e-06, "loss": 0.0983, "step": 40020 }, { "epoch": 2.508616907940089, "grad_norm": 17.734485626220703, "learning_rate": 8.27616052014946e-06, "loss": 0.0868, "step": 40030 }, { "epoch": 2.5092435921539136, "grad_norm": 0.14247313141822815, "learning_rate": 8.265605539253974e-06, "loss": 0.0562, "step": 40040 }, { "epoch": 2.5098702763677383, "grad_norm": 10.993691444396973, "learning_rate": 8.255050558358491e-06, "loss": 0.1285, "step": 40050 }, { "epoch": 2.510496960581563, "grad_norm": 116.90253448486328, "learning_rate": 8.244495577463004e-06, "loss": 0.0778, "step": 40060 }, { "epoch": 2.5111236447953877, "grad_norm": 0.01970071718096733, "learning_rate": 8.233940596567521e-06, "loss": 0.0018, "step": 40070 }, { "epoch": 2.511750329009212, "grad_norm": 0.15241803228855133, "learning_rate": 8.223385615672036e-06, "loss": 0.0509, "step": 40080 }, { "epoch": 2.5123770132230367, "grad_norm": 0.17657651007175446, "learning_rate": 8.21283063477655e-06, "loss": 0.1325, "step": 40090 }, { "epoch": 2.5130036974368615, "grad_norm": 0.15709511935710907, "learning_rate": 8.202275653881067e-06, "loss": 0.0016, "step": 40100 }, { "epoch": 2.513630381650686, "grad_norm": 0.06546179950237274, "learning_rate": 8.191720672985582e-06, "loss": 0.0021, "step": 40110 }, { "epoch": 2.514257065864511, "grad_norm": 0.0067861187271773815, "learning_rate": 8.181165692090097e-06, "loss": 0.0012, "step": 40120 }, { "epoch": 2.5148837500783356, "grad_norm": 0.006304751615971327, "learning_rate": 8.170610711194614e-06, "loss": 0.2247, "step": 40130 }, { "epoch": 2.5155104342921604, "grad_norm": 0.013594291172921658, "learning_rate": 8.160055730299127e-06, "loss": 0.0031, "step": 40140 }, { "epoch": 2.516137118505985, "grad_norm": 0.063226617872715, "learning_rate": 8.149500749403644e-06, "loss": 0.0957, "step": 40150 }, { "epoch": 2.5167638027198094, "grad_norm": 5.06606912612915, "learning_rate": 8.138945768508159e-06, "loss": 0.2566, "step": 40160 }, { "epoch": 2.517390486933634, "grad_norm": 0.040200620889663696, "learning_rate": 8.128390787612674e-06, "loss": 0.1106, "step": 40170 }, { "epoch": 2.518017171147459, "grad_norm": 0.04336791858077049, "learning_rate": 8.11783580671719e-06, "loss": 0.0042, "step": 40180 }, { "epoch": 2.5186438553612835, "grad_norm": 0.13072709739208221, "learning_rate": 8.107280825821705e-06, "loss": 0.0505, "step": 40190 }, { "epoch": 2.519270539575108, "grad_norm": 0.13012777268886566, "learning_rate": 8.096725844926222e-06, "loss": 0.0073, "step": 40200 }, { "epoch": 2.5198972237889325, "grad_norm": 0.03830237314105034, "learning_rate": 8.086170864030737e-06, "loss": 0.0673, "step": 40210 }, { "epoch": 2.5205239080027573, "grad_norm": 0.032999083399772644, "learning_rate": 8.075615883135252e-06, "loss": 0.1633, "step": 40220 }, { "epoch": 2.521150592216582, "grad_norm": 0.1449841558933258, "learning_rate": 8.065060902239767e-06, "loss": 0.0378, "step": 40230 }, { "epoch": 2.5217772764304067, "grad_norm": 0.4678923785686493, "learning_rate": 8.054505921344282e-06, "loss": 0.0041, "step": 40240 }, { "epoch": 2.5224039606442314, "grad_norm": 0.1612612009048462, "learning_rate": 8.043950940448799e-06, "loss": 0.0515, "step": 40250 }, { "epoch": 2.523030644858056, "grad_norm": 0.06994244456291199, "learning_rate": 8.033395959553314e-06, "loss": 0.0494, "step": 40260 }, { "epoch": 2.523657329071881, "grad_norm": 0.02159612812101841, "learning_rate": 8.022840978657829e-06, "loss": 0.0418, "step": 40270 }, { "epoch": 2.524284013285705, "grad_norm": 4.538101673126221, "learning_rate": 8.012285997762345e-06, "loss": 0.0343, "step": 40280 }, { "epoch": 2.52491069749953, "grad_norm": 0.0578109547495842, "learning_rate": 8.00173101686686e-06, "loss": 0.1129, "step": 40290 }, { "epoch": 2.5255373817133546, "grad_norm": 6.311728477478027, "learning_rate": 7.991176035971375e-06, "loss": 0.0461, "step": 40300 }, { "epoch": 2.5261640659271793, "grad_norm": 0.0051009259186685085, "learning_rate": 7.98062105507589e-06, "loss": 0.0041, "step": 40310 }, { "epoch": 2.526790750141004, "grad_norm": 0.017606286332011223, "learning_rate": 7.970066074180407e-06, "loss": 0.0028, "step": 40320 }, { "epoch": 2.5274174343548284, "grad_norm": 11.35566520690918, "learning_rate": 7.959511093284922e-06, "loss": 0.2051, "step": 40330 }, { "epoch": 2.528044118568653, "grad_norm": 0.010604508221149445, "learning_rate": 7.948956112389437e-06, "loss": 0.0021, "step": 40340 }, { "epoch": 2.528670802782478, "grad_norm": 0.005866426043212414, "learning_rate": 7.938401131493953e-06, "loss": 0.1692, "step": 40350 }, { "epoch": 2.5292974869963025, "grad_norm": 5.990290641784668, "learning_rate": 7.927846150598468e-06, "loss": 0.0793, "step": 40360 }, { "epoch": 2.5299241712101272, "grad_norm": 0.017411302775144577, "learning_rate": 7.917291169702983e-06, "loss": 0.0348, "step": 40370 }, { "epoch": 2.530550855423952, "grad_norm": 4.7487969398498535, "learning_rate": 7.906736188807498e-06, "loss": 0.0432, "step": 40380 }, { "epoch": 2.5311775396377767, "grad_norm": 0.01979987509548664, "learning_rate": 7.896181207912013e-06, "loss": 0.1079, "step": 40390 }, { "epoch": 2.5318042238516014, "grad_norm": 17.053985595703125, "learning_rate": 7.88562622701653e-06, "loss": 0.074, "step": 40400 }, { "epoch": 2.5324309080654257, "grad_norm": 0.03223740682005882, "learning_rate": 7.875071246121045e-06, "loss": 0.031, "step": 40410 }, { "epoch": 2.5330575922792504, "grad_norm": 0.05917942896485329, "learning_rate": 7.864516265225561e-06, "loss": 0.1192, "step": 40420 }, { "epoch": 2.533684276493075, "grad_norm": 0.008573943749070168, "learning_rate": 7.853961284330076e-06, "loss": 0.0051, "step": 40430 }, { "epoch": 2.5343109607069, "grad_norm": 0.09303145110607147, "learning_rate": 7.843406303434591e-06, "loss": 0.1075, "step": 40440 }, { "epoch": 2.5349376449207246, "grad_norm": 0.004650171380490065, "learning_rate": 7.832851322539106e-06, "loss": 0.0004, "step": 40450 }, { "epoch": 2.535564329134549, "grad_norm": 1.0261204242706299, "learning_rate": 7.822296341643621e-06, "loss": 0.0653, "step": 40460 }, { "epoch": 2.5361910133483736, "grad_norm": 0.06141495332121849, "learning_rate": 7.811741360748138e-06, "loss": 0.1281, "step": 40470 }, { "epoch": 2.5368176975621983, "grad_norm": 0.12824884057044983, "learning_rate": 7.801186379852653e-06, "loss": 0.0246, "step": 40480 }, { "epoch": 2.537444381776023, "grad_norm": 0.0036953743547201157, "learning_rate": 7.790631398957168e-06, "loss": 0.1411, "step": 40490 }, { "epoch": 2.5380710659898478, "grad_norm": 0.012939559295773506, "learning_rate": 7.780076418061684e-06, "loss": 0.1519, "step": 40500 }, { "epoch": 2.5386977502036725, "grad_norm": 0.12337151914834976, "learning_rate": 7.7695214371662e-06, "loss": 0.0698, "step": 40510 }, { "epoch": 2.5393244344174972, "grad_norm": 0.19674107432365417, "learning_rate": 7.758966456270714e-06, "loss": 0.1287, "step": 40520 }, { "epoch": 2.539951118631322, "grad_norm": 0.06755591183900833, "learning_rate": 7.74841147537523e-06, "loss": 0.0695, "step": 40530 }, { "epoch": 2.5405778028451462, "grad_norm": 0.1544746607542038, "learning_rate": 7.737856494479744e-06, "loss": 0.0466, "step": 40540 }, { "epoch": 2.541204487058971, "grad_norm": 0.03073558211326599, "learning_rate": 7.727301513584261e-06, "loss": 0.0462, "step": 40550 }, { "epoch": 2.5418311712727957, "grad_norm": 10.895833015441895, "learning_rate": 7.716746532688776e-06, "loss": 0.069, "step": 40560 }, { "epoch": 2.5424578554866204, "grad_norm": 0.006254229694604874, "learning_rate": 7.706191551793292e-06, "loss": 0.1509, "step": 40570 }, { "epoch": 2.5430845397004447, "grad_norm": 0.10220787674188614, "learning_rate": 7.695636570897807e-06, "loss": 0.032, "step": 40580 }, { "epoch": 2.5437112239142694, "grad_norm": 0.11436332762241364, "learning_rate": 7.685081590002322e-06, "loss": 0.1924, "step": 40590 }, { "epoch": 2.544337908128094, "grad_norm": 0.015846652910113335, "learning_rate": 7.674526609106839e-06, "loss": 0.0413, "step": 40600 }, { "epoch": 2.544964592341919, "grad_norm": 6.154116630554199, "learning_rate": 7.663971628211352e-06, "loss": 0.1555, "step": 40610 }, { "epoch": 2.5455912765557436, "grad_norm": 0.022826502099633217, "learning_rate": 7.653416647315869e-06, "loss": 0.0445, "step": 40620 }, { "epoch": 2.5462179607695683, "grad_norm": 1.5049920082092285, "learning_rate": 7.642861666420384e-06, "loss": 0.0827, "step": 40630 }, { "epoch": 2.546844644983393, "grad_norm": 0.02650361694395542, "learning_rate": 7.632306685524899e-06, "loss": 0.0738, "step": 40640 }, { "epoch": 2.5474713291972177, "grad_norm": 0.0813385620713234, "learning_rate": 7.6217517046294155e-06, "loss": 0.0428, "step": 40650 }, { "epoch": 2.548098013411042, "grad_norm": 0.04428323358297348, "learning_rate": 7.6111967237339305e-06, "loss": 0.1311, "step": 40660 }, { "epoch": 2.5487246976248668, "grad_norm": 0.15579140186309814, "learning_rate": 7.600641742838446e-06, "loss": 0.1653, "step": 40670 }, { "epoch": 2.5493513818386915, "grad_norm": 11.184800148010254, "learning_rate": 7.590086761942962e-06, "loss": 0.061, "step": 40680 }, { "epoch": 2.549978066052516, "grad_norm": 0.03695826232433319, "learning_rate": 7.579531781047476e-06, "loss": 0.0053, "step": 40690 }, { "epoch": 2.550604750266341, "grad_norm": 0.011084161698818207, "learning_rate": 7.568976800151992e-06, "loss": 0.0534, "step": 40700 }, { "epoch": 2.551231434480165, "grad_norm": 0.008540414273738861, "learning_rate": 7.558421819256507e-06, "loss": 0.034, "step": 40710 }, { "epoch": 2.55185811869399, "grad_norm": 0.004470044281333685, "learning_rate": 7.547866838361023e-06, "loss": 0.0365, "step": 40720 }, { "epoch": 2.5524848029078147, "grad_norm": 0.01315213367342949, "learning_rate": 7.5373118574655386e-06, "loss": 0.005, "step": 40730 }, { "epoch": 2.5531114871216394, "grad_norm": 0.006134567316621542, "learning_rate": 7.526756876570054e-06, "loss": 0.0048, "step": 40740 }, { "epoch": 2.553738171335464, "grad_norm": 0.006208804901689291, "learning_rate": 7.516201895674569e-06, "loss": 0.0122, "step": 40750 }, { "epoch": 2.554364855549289, "grad_norm": 0.008568532764911652, "learning_rate": 7.505646914779084e-06, "loss": 0.0436, "step": 40760 }, { "epoch": 2.5549915397631136, "grad_norm": 2.7557194232940674, "learning_rate": 7.495091933883599e-06, "loss": 0.0044, "step": 40770 }, { "epoch": 2.5556182239769383, "grad_norm": 0.019581960514187813, "learning_rate": 7.484536952988115e-06, "loss": 0.0024, "step": 40780 }, { "epoch": 2.5562449081907626, "grad_norm": 8.799368858337402, "learning_rate": 7.473981972092631e-06, "loss": 0.1515, "step": 40790 }, { "epoch": 2.5568715924045873, "grad_norm": 0.006035898812115192, "learning_rate": 7.463426991197147e-06, "loss": 0.003, "step": 40800 }, { "epoch": 2.557498276618412, "grad_norm": 0.008945782668888569, "learning_rate": 7.452872010301662e-06, "loss": 0.1117, "step": 40810 }, { "epoch": 2.5581249608322367, "grad_norm": 51.71331787109375, "learning_rate": 7.442317029406177e-06, "loss": 0.0434, "step": 40820 }, { "epoch": 2.5587516450460615, "grad_norm": 0.0057718148455023766, "learning_rate": 7.431762048510693e-06, "loss": 0.0409, "step": 40830 }, { "epoch": 2.5593783292598857, "grad_norm": 0.006954421754926443, "learning_rate": 7.421207067615207e-06, "loss": 0.0488, "step": 40840 }, { "epoch": 2.5600050134737105, "grad_norm": 0.009053668938577175, "learning_rate": 7.410652086719723e-06, "loss": 0.0025, "step": 40850 }, { "epoch": 2.560631697687535, "grad_norm": 0.00380707997828722, "learning_rate": 7.400097105824239e-06, "loss": 0.1444, "step": 40860 }, { "epoch": 2.56125838190136, "grad_norm": 0.022652175277471542, "learning_rate": 7.389542124928754e-06, "loss": 0.0867, "step": 40870 }, { "epoch": 2.5618850661151846, "grad_norm": 0.3384392559528351, "learning_rate": 7.37898714403327e-06, "loss": 0.0178, "step": 40880 }, { "epoch": 2.5625117503290094, "grad_norm": 0.004380661062896252, "learning_rate": 7.3684321631377855e-06, "loss": 0.0013, "step": 40890 }, { "epoch": 2.563138434542834, "grad_norm": 0.0032064158003777266, "learning_rate": 7.357877182242301e-06, "loss": 0.002, "step": 40900 }, { "epoch": 2.5637651187566584, "grad_norm": 0.055104441940784454, "learning_rate": 7.347322201346816e-06, "loss": 0.0609, "step": 40910 }, { "epoch": 2.564391802970483, "grad_norm": 0.003831064561381936, "learning_rate": 7.33676722045133e-06, "loss": 0.1124, "step": 40920 }, { "epoch": 2.565018487184308, "grad_norm": 0.0033136503770947456, "learning_rate": 7.326212239555846e-06, "loss": 0.0425, "step": 40930 }, { "epoch": 2.5656451713981325, "grad_norm": 0.03835991770029068, "learning_rate": 7.315657258660362e-06, "loss": 0.0009, "step": 40940 }, { "epoch": 2.5662718556119573, "grad_norm": 0.11937529593706131, "learning_rate": 7.305102277764878e-06, "loss": 0.1146, "step": 40950 }, { "epoch": 2.5668985398257815, "grad_norm": 0.13023905456066132, "learning_rate": 7.294547296869393e-06, "loss": 0.127, "step": 40960 }, { "epoch": 2.5675252240396063, "grad_norm": 0.021615734323859215, "learning_rate": 7.2839923159739086e-06, "loss": 0.0035, "step": 40970 }, { "epoch": 2.568151908253431, "grad_norm": 0.03249719366431236, "learning_rate": 7.273437335078424e-06, "loss": 0.0023, "step": 40980 }, { "epoch": 2.5687785924672557, "grad_norm": 0.007845253683626652, "learning_rate": 7.26288235418294e-06, "loss": 0.0853, "step": 40990 }, { "epoch": 2.5694052766810804, "grad_norm": 0.8201372027397156, "learning_rate": 7.252327373287454e-06, "loss": 0.0428, "step": 41000 }, { "epoch": 2.570031960894905, "grad_norm": 0.026965798810124397, "learning_rate": 7.24177239239197e-06, "loss": 0.0226, "step": 41010 }, { "epoch": 2.57065864510873, "grad_norm": 0.39754635095596313, "learning_rate": 7.231217411496485e-06, "loss": 0.0014, "step": 41020 }, { "epoch": 2.5712853293225546, "grad_norm": 0.15909859538078308, "learning_rate": 7.220662430601001e-06, "loss": 0.1904, "step": 41030 }, { "epoch": 2.571912013536379, "grad_norm": 0.04498082771897316, "learning_rate": 7.210107449705517e-06, "loss": 0.0012, "step": 41040 }, { "epoch": 2.5725386977502036, "grad_norm": 0.034305937588214874, "learning_rate": 7.1995524688100324e-06, "loss": 0.0518, "step": 41050 }, { "epoch": 2.5731653819640283, "grad_norm": 17.956905364990234, "learning_rate": 7.188997487914547e-06, "loss": 0.0613, "step": 41060 }, { "epoch": 2.573792066177853, "grad_norm": 0.006127170752733946, "learning_rate": 7.178442507019062e-06, "loss": 0.0619, "step": 41070 }, { "epoch": 2.574418750391678, "grad_norm": 0.14973905682563782, "learning_rate": 7.167887526123577e-06, "loss": 0.1973, "step": 41080 }, { "epoch": 2.575045434605502, "grad_norm": 5.818470001220703, "learning_rate": 7.157332545228093e-06, "loss": 0.0659, "step": 41090 }, { "epoch": 2.575672118819327, "grad_norm": 6.122541427612305, "learning_rate": 7.146777564332609e-06, "loss": 0.0494, "step": 41100 }, { "epoch": 2.5762988030331515, "grad_norm": 0.022952023893594742, "learning_rate": 7.136222583437125e-06, "loss": 0.0709, "step": 41110 }, { "epoch": 2.5769254872469762, "grad_norm": 0.03813765570521355, "learning_rate": 7.12566760254164e-06, "loss": 0.1177, "step": 41120 }, { "epoch": 2.577552171460801, "grad_norm": 0.07932894676923752, "learning_rate": 7.1151126216461555e-06, "loss": 0.0325, "step": 41130 }, { "epoch": 2.5781788556746257, "grad_norm": 2.9672300815582275, "learning_rate": 7.104557640750671e-06, "loss": 0.0839, "step": 41140 }, { "epoch": 2.5788055398884504, "grad_norm": 0.006282791495323181, "learning_rate": 7.094002659855185e-06, "loss": 0.0005, "step": 41150 }, { "epoch": 2.579432224102275, "grad_norm": 0.04274289309978485, "learning_rate": 7.083447678959701e-06, "loss": 0.0583, "step": 41160 }, { "epoch": 2.5800589083160994, "grad_norm": 0.06123344227671623, "learning_rate": 7.072892698064216e-06, "loss": 0.004, "step": 41170 }, { "epoch": 2.580685592529924, "grad_norm": 2.8731517791748047, "learning_rate": 7.062337717168732e-06, "loss": 0.0143, "step": 41180 }, { "epoch": 2.581312276743749, "grad_norm": 0.00873857643455267, "learning_rate": 7.051782736273248e-06, "loss": 0.1282, "step": 41190 }, { "epoch": 2.5819389609575736, "grad_norm": 0.24595175683498383, "learning_rate": 7.041227755377764e-06, "loss": 0.0358, "step": 41200 }, { "epoch": 2.582565645171398, "grad_norm": 0.508924126625061, "learning_rate": 7.0306727744822785e-06, "loss": 0.0816, "step": 41210 }, { "epoch": 2.5831923293852226, "grad_norm": 0.05899544432759285, "learning_rate": 7.020117793586794e-06, "loss": 0.1588, "step": 41220 }, { "epoch": 2.5838190135990473, "grad_norm": 9.842616081237793, "learning_rate": 7.0095628126913085e-06, "loss": 0.0661, "step": 41230 }, { "epoch": 2.584445697812872, "grad_norm": 0.0917615070939064, "learning_rate": 6.999007831795824e-06, "loss": 0.0988, "step": 41240 }, { "epoch": 2.5850723820266968, "grad_norm": 21.74089813232422, "learning_rate": 6.98845285090034e-06, "loss": 0.1004, "step": 41250 }, { "epoch": 2.5856990662405215, "grad_norm": 0.03793327137827873, "learning_rate": 6.977897870004856e-06, "loss": 0.0648, "step": 41260 }, { "epoch": 2.586325750454346, "grad_norm": 8.540681838989258, "learning_rate": 6.967342889109371e-06, "loss": 0.1421, "step": 41270 }, { "epoch": 2.586952434668171, "grad_norm": 0.07502247393131256, "learning_rate": 6.956787908213887e-06, "loss": 0.0398, "step": 41280 }, { "epoch": 2.587579118881995, "grad_norm": 0.7496595978736877, "learning_rate": 6.9462329273184024e-06, "loss": 0.0032, "step": 41290 }, { "epoch": 2.58820580309582, "grad_norm": 0.8887155652046204, "learning_rate": 6.935677946422918e-06, "loss": 0.1033, "step": 41300 }, { "epoch": 2.5888324873096447, "grad_norm": 0.09058801829814911, "learning_rate": 6.925122965527432e-06, "loss": 0.002, "step": 41310 }, { "epoch": 2.5894591715234694, "grad_norm": 0.014632687903940678, "learning_rate": 6.914567984631948e-06, "loss": 0.0641, "step": 41320 }, { "epoch": 2.590085855737294, "grad_norm": 0.035875823348760605, "learning_rate": 6.904013003736463e-06, "loss": 0.1075, "step": 41330 }, { "epoch": 2.5907125399511184, "grad_norm": 0.645311713218689, "learning_rate": 6.893458022840979e-06, "loss": 0.0041, "step": 41340 }, { "epoch": 2.591339224164943, "grad_norm": 6.724026203155518, "learning_rate": 6.882903041945495e-06, "loss": 0.1227, "step": 41350 }, { "epoch": 2.591965908378768, "grad_norm": 0.04724414646625519, "learning_rate": 6.8723480610500105e-06, "loss": 0.2922, "step": 41360 }, { "epoch": 2.5925925925925926, "grad_norm": 0.024513479322195053, "learning_rate": 6.8617930801545255e-06, "loss": 0.0545, "step": 41370 }, { "epoch": 2.5932192768064173, "grad_norm": 0.8417285680770874, "learning_rate": 6.85123809925904e-06, "loss": 0.0518, "step": 41380 }, { "epoch": 2.593845961020242, "grad_norm": 0.06562932580709457, "learning_rate": 6.840683118363555e-06, "loss": 0.0475, "step": 41390 }, { "epoch": 2.5944726452340667, "grad_norm": 0.04768669232726097, "learning_rate": 6.830128137468071e-06, "loss": 0.1743, "step": 41400 }, { "epoch": 2.5950993294478915, "grad_norm": 0.025528786703944206, "learning_rate": 6.819573156572587e-06, "loss": 0.0038, "step": 41410 }, { "epoch": 2.5957260136617157, "grad_norm": 0.1445946991443634, "learning_rate": 6.809018175677102e-06, "loss": 0.0479, "step": 41420 }, { "epoch": 2.5963526978755405, "grad_norm": 5.870827674865723, "learning_rate": 6.798463194781618e-06, "loss": 0.1458, "step": 41430 }, { "epoch": 2.596979382089365, "grad_norm": 0.14257600903511047, "learning_rate": 6.7879082138861336e-06, "loss": 0.0583, "step": 41440 }, { "epoch": 2.59760606630319, "grad_norm": 0.03526608645915985, "learning_rate": 6.777353232990649e-06, "loss": 0.0316, "step": 41450 }, { "epoch": 2.5982327505170146, "grad_norm": 0.2761639654636383, "learning_rate": 6.7667982520951635e-06, "loss": 0.0428, "step": 41460 }, { "epoch": 2.598859434730839, "grad_norm": 0.011171748861670494, "learning_rate": 6.756243271199679e-06, "loss": 0.0828, "step": 41470 }, { "epoch": 2.5994861189446636, "grad_norm": 0.008683438412845135, "learning_rate": 6.745688290304194e-06, "loss": 0.0023, "step": 41480 }, { "epoch": 2.6001128031584884, "grad_norm": 5.977514266967773, "learning_rate": 6.73513330940871e-06, "loss": 0.0436, "step": 41490 }, { "epoch": 2.600739487372313, "grad_norm": 0.011247127316892147, "learning_rate": 6.724578328513226e-06, "loss": 0.0024, "step": 41500 }, { "epoch": 2.601366171586138, "grad_norm": 0.168271005153656, "learning_rate": 6.714023347617742e-06, "loss": 0.0032, "step": 41510 }, { "epoch": 2.6019928557999625, "grad_norm": 0.006584922783076763, "learning_rate": 6.703468366722257e-06, "loss": 0.0281, "step": 41520 }, { "epoch": 2.6026195400137873, "grad_norm": 0.006742222234606743, "learning_rate": 6.692913385826772e-06, "loss": 0.0585, "step": 41530 }, { "epoch": 2.603246224227612, "grad_norm": 9.171943664550781, "learning_rate": 6.6823584049312865e-06, "loss": 0.2837, "step": 41540 }, { "epoch": 2.6038729084414363, "grad_norm": 0.11651241779327393, "learning_rate": 6.671803424035802e-06, "loss": 0.043, "step": 41550 }, { "epoch": 2.604499592655261, "grad_norm": 0.008494174107909203, "learning_rate": 6.661248443140318e-06, "loss": 0.0693, "step": 41560 }, { "epoch": 2.6051262768690857, "grad_norm": 0.08039513230323792, "learning_rate": 6.650693462244834e-06, "loss": 0.0345, "step": 41570 }, { "epoch": 2.6057529610829104, "grad_norm": 0.01966715045273304, "learning_rate": 6.640138481349349e-06, "loss": 0.0016, "step": 41580 }, { "epoch": 2.6063796452967347, "grad_norm": 0.10775923728942871, "learning_rate": 6.629583500453865e-06, "loss": 0.1824, "step": 41590 }, { "epoch": 2.6070063295105594, "grad_norm": 0.11887072771787643, "learning_rate": 6.6190285195583805e-06, "loss": 0.0817, "step": 41600 }, { "epoch": 2.607633013724384, "grad_norm": 0.031048081815242767, "learning_rate": 6.608473538662896e-06, "loss": 0.0473, "step": 41610 }, { "epoch": 2.608259697938209, "grad_norm": 12.19013500213623, "learning_rate": 6.59791855776741e-06, "loss": 0.1918, "step": 41620 }, { "epoch": 2.6088863821520336, "grad_norm": 0.7925893068313599, "learning_rate": 6.587363576871925e-06, "loss": 0.0302, "step": 41630 }, { "epoch": 2.6095130663658583, "grad_norm": 0.013532374054193497, "learning_rate": 6.576808595976441e-06, "loss": 0.1553, "step": 41640 }, { "epoch": 2.610139750579683, "grad_norm": 1.1819570064544678, "learning_rate": 6.566253615080957e-06, "loss": 0.031, "step": 41650 }, { "epoch": 2.610766434793508, "grad_norm": 0.08105959743261337, "learning_rate": 6.555698634185473e-06, "loss": 0.0032, "step": 41660 }, { "epoch": 2.611393119007332, "grad_norm": 0.47787657380104065, "learning_rate": 6.545143653289989e-06, "loss": 0.0621, "step": 41670 }, { "epoch": 2.612019803221157, "grad_norm": 5.609531879425049, "learning_rate": 6.5345886723945035e-06, "loss": 0.0825, "step": 41680 }, { "epoch": 2.6126464874349815, "grad_norm": 0.004527808167040348, "learning_rate": 6.524033691499019e-06, "loss": 0.1428, "step": 41690 }, { "epoch": 2.6132731716488062, "grad_norm": 0.0466812439262867, "learning_rate": 6.5134787106035335e-06, "loss": 0.0012, "step": 41700 }, { "epoch": 2.613899855862631, "grad_norm": 1.6810473203659058, "learning_rate": 6.502923729708049e-06, "loss": 0.0182, "step": 41710 }, { "epoch": 2.6145265400764552, "grad_norm": 2.1184799671173096, "learning_rate": 6.492368748812565e-06, "loss": 0.0049, "step": 41720 }, { "epoch": 2.61515322429028, "grad_norm": 0.01584853045642376, "learning_rate": 6.48181376791708e-06, "loss": 0.1132, "step": 41730 }, { "epoch": 2.6157799085041047, "grad_norm": 0.7091934084892273, "learning_rate": 6.471258787021596e-06, "loss": 0.1384, "step": 41740 }, { "epoch": 2.6164065927179294, "grad_norm": 0.013542457483708858, "learning_rate": 6.460703806126112e-06, "loss": 0.0336, "step": 41750 }, { "epoch": 2.617033276931754, "grad_norm": 0.010229435749351978, "learning_rate": 6.4501488252306274e-06, "loss": 0.0331, "step": 41760 }, { "epoch": 2.617659961145579, "grad_norm": 0.06294869631528854, "learning_rate": 6.4395938443351416e-06, "loss": 0.1182, "step": 41770 }, { "epoch": 2.6182866453594036, "grad_norm": 0.12787538766860962, "learning_rate": 6.429038863439657e-06, "loss": 0.0817, "step": 41780 }, { "epoch": 2.6189133295732283, "grad_norm": 0.7674179673194885, "learning_rate": 6.418483882544172e-06, "loss": 0.0175, "step": 41790 }, { "epoch": 2.6195400137870526, "grad_norm": 13.270587921142578, "learning_rate": 6.407928901648688e-06, "loss": 0.1413, "step": 41800 }, { "epoch": 2.6201666980008773, "grad_norm": 0.11268117278814316, "learning_rate": 6.397373920753204e-06, "loss": 0.0409, "step": 41810 }, { "epoch": 2.620793382214702, "grad_norm": 0.040429167449474335, "learning_rate": 6.38681893985772e-06, "loss": 0.1457, "step": 41820 }, { "epoch": 2.6214200664285268, "grad_norm": 3.8810534477233887, "learning_rate": 6.376263958962235e-06, "loss": 0.118, "step": 41830 }, { "epoch": 2.622046750642351, "grad_norm": 0.08467940986156464, "learning_rate": 6.3657089780667505e-06, "loss": 0.0023, "step": 41840 }, { "epoch": 2.6226734348561758, "grad_norm": 0.016723625361919403, "learning_rate": 6.355153997171265e-06, "loss": 0.1124, "step": 41850 }, { "epoch": 2.6233001190700005, "grad_norm": 0.018312910571694374, "learning_rate": 6.34459901627578e-06, "loss": 0.1396, "step": 41860 }, { "epoch": 2.623926803283825, "grad_norm": 0.4090491235256195, "learning_rate": 6.334044035380296e-06, "loss": 0.1224, "step": 41870 }, { "epoch": 2.62455348749765, "grad_norm": 14.096756935119629, "learning_rate": 6.323489054484812e-06, "loss": 0.092, "step": 41880 }, { "epoch": 2.6251801717114747, "grad_norm": 12.442021369934082, "learning_rate": 6.312934073589327e-06, "loss": 0.0976, "step": 41890 }, { "epoch": 2.6258068559252994, "grad_norm": 10.28456974029541, "learning_rate": 6.302379092693843e-06, "loss": 0.0989, "step": 41900 }, { "epoch": 2.626433540139124, "grad_norm": 0.009201687760651112, "learning_rate": 6.2918241117983586e-06, "loss": 0.041, "step": 41910 }, { "epoch": 2.6270602243529484, "grad_norm": 0.019938096404075623, "learning_rate": 6.281269130902874e-06, "loss": 0.0052, "step": 41920 }, { "epoch": 2.627686908566773, "grad_norm": 10.491694450378418, "learning_rate": 6.2707141500073885e-06, "loss": 0.1354, "step": 41930 }, { "epoch": 2.628313592780598, "grad_norm": 0.3292548358440399, "learning_rate": 6.2601591691119034e-06, "loss": 0.0025, "step": 41940 }, { "epoch": 2.6289402769944226, "grad_norm": 0.320330411195755, "learning_rate": 6.249604188216419e-06, "loss": 0.1281, "step": 41950 }, { "epoch": 2.6295669612082473, "grad_norm": 0.015448945574462414, "learning_rate": 6.239049207320935e-06, "loss": 0.0047, "step": 41960 }, { "epoch": 2.6301936454220716, "grad_norm": 0.2195909321308136, "learning_rate": 6.228494226425451e-06, "loss": 0.0077, "step": 41970 }, { "epoch": 2.6308203296358963, "grad_norm": 0.3505687415599823, "learning_rate": 6.217939245529966e-06, "loss": 0.003, "step": 41980 }, { "epoch": 2.631447013849721, "grad_norm": 0.20360609889030457, "learning_rate": 6.207384264634481e-06, "loss": 0.1084, "step": 41990 }, { "epoch": 2.6320736980635457, "grad_norm": 0.02529897727072239, "learning_rate": 6.196829283738997e-06, "loss": 0.0012, "step": 42000 }, { "epoch": 2.6327003822773705, "grad_norm": 0.02185901068150997, "learning_rate": 6.186274302843512e-06, "loss": 0.041, "step": 42010 }, { "epoch": 2.633327066491195, "grad_norm": 0.006049241870641708, "learning_rate": 6.175719321948028e-06, "loss": 0.0067, "step": 42020 }, { "epoch": 2.63395375070502, "grad_norm": 0.12334437668323517, "learning_rate": 6.165164341052543e-06, "loss": 0.0008, "step": 42030 }, { "epoch": 2.6345804349188446, "grad_norm": 0.004662103019654751, "learning_rate": 6.154609360157058e-06, "loss": 0.003, "step": 42040 }, { "epoch": 2.635207119132669, "grad_norm": 0.04426957294344902, "learning_rate": 6.144054379261574e-06, "loss": 0.1455, "step": 42050 }, { "epoch": 2.6358338033464936, "grad_norm": 0.09200718253850937, "learning_rate": 6.133499398366089e-06, "loss": 0.0919, "step": 42060 }, { "epoch": 2.6364604875603184, "grad_norm": 0.011606945656239986, "learning_rate": 6.122944417470605e-06, "loss": 0.2493, "step": 42070 }, { "epoch": 2.637087171774143, "grad_norm": 0.00984701793640852, "learning_rate": 6.1123894365751205e-06, "loss": 0.0294, "step": 42080 }, { "epoch": 2.637713855987968, "grad_norm": 0.2293330878019333, "learning_rate": 6.1018344556796354e-06, "loss": 0.0982, "step": 42090 }, { "epoch": 2.638340540201792, "grad_norm": 13.595054626464844, "learning_rate": 6.09127947478415e-06, "loss": 0.1903, "step": 42100 }, { "epoch": 2.638967224415617, "grad_norm": 1.0521003007888794, "learning_rate": 6.080724493888666e-06, "loss": 0.088, "step": 42110 }, { "epoch": 2.6395939086294415, "grad_norm": 0.09486385434865952, "learning_rate": 6.070169512993182e-06, "loss": 0.0367, "step": 42120 }, { "epoch": 2.6402205928432663, "grad_norm": 0.05776892229914665, "learning_rate": 6.059614532097698e-06, "loss": 0.0036, "step": 42130 }, { "epoch": 2.640847277057091, "grad_norm": 0.19651919603347778, "learning_rate": 6.049059551202212e-06, "loss": 0.0629, "step": 42140 }, { "epoch": 2.6414739612709157, "grad_norm": 0.419067919254303, "learning_rate": 6.038504570306728e-06, "loss": 0.0035, "step": 42150 }, { "epoch": 2.6421006454847404, "grad_norm": 0.10673335194587708, "learning_rate": 6.0279495894112435e-06, "loss": 0.1531, "step": 42160 }, { "epoch": 2.642727329698565, "grad_norm": 0.013691593892872334, "learning_rate": 6.017394608515759e-06, "loss": 0.061, "step": 42170 }, { "epoch": 2.6433540139123894, "grad_norm": 0.022470830008387566, "learning_rate": 6.006839627620274e-06, "loss": 0.0417, "step": 42180 }, { "epoch": 2.643980698126214, "grad_norm": 0.03461216017603874, "learning_rate": 5.996284646724789e-06, "loss": 0.0546, "step": 42190 }, { "epoch": 2.644607382340039, "grad_norm": 0.02822648175060749, "learning_rate": 5.985729665829305e-06, "loss": 0.0752, "step": 42200 }, { "epoch": 2.6452340665538636, "grad_norm": 4.36279821395874, "learning_rate": 5.975174684933821e-06, "loss": 0.1409, "step": 42210 }, { "epoch": 2.645860750767688, "grad_norm": 0.024720776826143265, "learning_rate": 5.964619704038336e-06, "loss": 0.0329, "step": 42220 }, { "epoch": 2.6464874349815126, "grad_norm": 0.040048062801361084, "learning_rate": 5.954064723142852e-06, "loss": 0.0036, "step": 42230 }, { "epoch": 2.6471141191953373, "grad_norm": 0.05158762261271477, "learning_rate": 5.9435097422473666e-06, "loss": 0.0581, "step": 42240 }, { "epoch": 2.647740803409162, "grad_norm": 0.018962504342198372, "learning_rate": 5.932954761351882e-06, "loss": 0.2953, "step": 42250 }, { "epoch": 2.648367487622987, "grad_norm": 0.021726837381720543, "learning_rate": 5.922399780456397e-06, "loss": 0.0464, "step": 42260 }, { "epoch": 2.6489941718368115, "grad_norm": 0.02034616470336914, "learning_rate": 5.911844799560913e-06, "loss": 0.054, "step": 42270 }, { "epoch": 2.6496208560506362, "grad_norm": 0.04526104778051376, "learning_rate": 5.901289818665429e-06, "loss": 0.1052, "step": 42280 }, { "epoch": 2.650247540264461, "grad_norm": 0.02887248992919922, "learning_rate": 5.890734837769944e-06, "loss": 0.006, "step": 42290 }, { "epoch": 2.6508742244782852, "grad_norm": 5.1538472175598145, "learning_rate": 5.880179856874459e-06, "loss": 0.1128, "step": 42300 }, { "epoch": 2.65150090869211, "grad_norm": 0.046616777777671814, "learning_rate": 5.869624875978975e-06, "loss": 0.1021, "step": 42310 }, { "epoch": 2.6521275929059347, "grad_norm": 0.05382724851369858, "learning_rate": 5.8590698950834905e-06, "loss": 0.0401, "step": 42320 }, { "epoch": 2.6527542771197594, "grad_norm": 0.027329618111252785, "learning_rate": 5.848514914188006e-06, "loss": 0.0029, "step": 42330 }, { "epoch": 2.653380961333584, "grad_norm": 0.05983074754476547, "learning_rate": 5.837959933292521e-06, "loss": 0.0678, "step": 42340 }, { "epoch": 2.6540076455474084, "grad_norm": 0.03666955605149269, "learning_rate": 5.827404952397036e-06, "loss": 0.132, "step": 42350 }, { "epoch": 2.654634329761233, "grad_norm": 0.03756636381149292, "learning_rate": 5.816849971501552e-06, "loss": 0.0721, "step": 42360 }, { "epoch": 2.655261013975058, "grad_norm": 0.02370397001504898, "learning_rate": 5.806294990606067e-06, "loss": 0.0888, "step": 42370 }, { "epoch": 2.6558876981888826, "grad_norm": 0.0890141949057579, "learning_rate": 5.795740009710583e-06, "loss": 0.0935, "step": 42380 }, { "epoch": 2.6565143824027073, "grad_norm": 0.01145352516323328, "learning_rate": 5.7851850288150985e-06, "loss": 0.0907, "step": 42390 }, { "epoch": 2.657141066616532, "grad_norm": 11.319497108459473, "learning_rate": 5.7746300479196135e-06, "loss": 0.0856, "step": 42400 }, { "epoch": 2.6577677508303568, "grad_norm": 1.578118920326233, "learning_rate": 5.7640750670241285e-06, "loss": 0.0597, "step": 42410 }, { "epoch": 2.6583944350441815, "grad_norm": 0.2819182276725769, "learning_rate": 5.753520086128644e-06, "loss": 0.0032, "step": 42420 }, { "epoch": 2.6590211192580058, "grad_norm": 0.009528083726763725, "learning_rate": 5.74296510523316e-06, "loss": 0.0017, "step": 42430 }, { "epoch": 2.6596478034718305, "grad_norm": 2.445085287094116, "learning_rate": 5.732410124337675e-06, "loss": 0.0389, "step": 42440 }, { "epoch": 2.660274487685655, "grad_norm": 0.010715600103139877, "learning_rate": 5.72185514344219e-06, "loss": 0.0801, "step": 42450 }, { "epoch": 2.66090117189948, "grad_norm": 0.009130342863500118, "learning_rate": 5.711300162546706e-06, "loss": 0.0024, "step": 42460 }, { "epoch": 2.6615278561133047, "grad_norm": 14.391807556152344, "learning_rate": 5.700745181651222e-06, "loss": 0.042, "step": 42470 }, { "epoch": 2.662154540327129, "grad_norm": 0.7011982202529907, "learning_rate": 5.690190200755737e-06, "loss": 0.2109, "step": 42480 }, { "epoch": 2.6627812245409537, "grad_norm": 0.0479697659611702, "learning_rate": 5.679635219860252e-06, "loss": 0.0387, "step": 42490 }, { "epoch": 2.6634079087547784, "grad_norm": 0.02062498964369297, "learning_rate": 5.669080238964767e-06, "loss": 0.0018, "step": 42500 }, { "epoch": 2.664034592968603, "grad_norm": 0.01683782786130905, "learning_rate": 5.658525258069283e-06, "loss": 0.0011, "step": 42510 }, { "epoch": 2.664661277182428, "grad_norm": 0.006330487783998251, "learning_rate": 5.647970277173799e-06, "loss": 0.05, "step": 42520 }, { "epoch": 2.6652879613962526, "grad_norm": 0.010017489083111286, "learning_rate": 5.637415296278314e-06, "loss": 0.096, "step": 42530 }, { "epoch": 2.6659146456100773, "grad_norm": 0.01938549615442753, "learning_rate": 5.62686031538283e-06, "loss": 0.1787, "step": 42540 }, { "epoch": 2.6665413298239016, "grad_norm": 0.02217995375394821, "learning_rate": 5.616305334487345e-06, "loss": 0.0657, "step": 42550 }, { "epoch": 2.6671680140377263, "grad_norm": 0.039555083960294724, "learning_rate": 5.6057503535918604e-06, "loss": 0.0936, "step": 42560 }, { "epoch": 2.667794698251551, "grad_norm": 28.735490798950195, "learning_rate": 5.595195372696375e-06, "loss": 0.1826, "step": 42570 }, { "epoch": 2.6684213824653757, "grad_norm": 0.01792915351688862, "learning_rate": 5.584640391800891e-06, "loss": 0.1354, "step": 42580 }, { "epoch": 2.6690480666792005, "grad_norm": 0.5480226278305054, "learning_rate": 5.574085410905407e-06, "loss": 0.1323, "step": 42590 }, { "epoch": 2.6696747508930248, "grad_norm": 0.5054864883422852, "learning_rate": 5.563530430009922e-06, "loss": 0.0902, "step": 42600 }, { "epoch": 2.6703014351068495, "grad_norm": 1.5177534818649292, "learning_rate": 5.552975449114437e-06, "loss": 0.0593, "step": 42610 }, { "epoch": 2.670928119320674, "grad_norm": 0.1519448608160019, "learning_rate": 5.542420468218953e-06, "loss": 0.0512, "step": 42620 }, { "epoch": 2.671554803534499, "grad_norm": 0.008680077269673347, "learning_rate": 5.5318654873234685e-06, "loss": 0.0316, "step": 42630 }, { "epoch": 2.6721814877483236, "grad_norm": 0.04235262796282768, "learning_rate": 5.521310506427984e-06, "loss": 0.0016, "step": 42640 }, { "epoch": 2.6728081719621484, "grad_norm": 0.014904645271599293, "learning_rate": 5.5107555255324984e-06, "loss": 0.0385, "step": 42650 }, { "epoch": 2.673434856175973, "grad_norm": 0.011473514139652252, "learning_rate": 5.500200544637014e-06, "loss": 0.0452, "step": 42660 }, { "epoch": 2.674061540389798, "grad_norm": 20.41715431213379, "learning_rate": 5.48964556374153e-06, "loss": 0.0704, "step": 42670 }, { "epoch": 2.674688224603622, "grad_norm": 0.011435385793447495, "learning_rate": 5.479090582846046e-06, "loss": 0.2295, "step": 42680 }, { "epoch": 2.675314908817447, "grad_norm": 0.013783792965114117, "learning_rate": 5.468535601950561e-06, "loss": 0.1978, "step": 42690 }, { "epoch": 2.6759415930312715, "grad_norm": 0.053753383457660675, "learning_rate": 5.457980621055076e-06, "loss": 0.0721, "step": 42700 }, { "epoch": 2.6765682772450963, "grad_norm": 0.12864500284194946, "learning_rate": 5.4474256401595916e-06, "loss": 0.0034, "step": 42710 }, { "epoch": 2.677194961458921, "grad_norm": 15.265963554382324, "learning_rate": 5.4368706592641065e-06, "loss": 0.0486, "step": 42720 }, { "epoch": 2.6778216456727453, "grad_norm": 0.01681533455848694, "learning_rate": 5.426315678368622e-06, "loss": 0.0048, "step": 42730 }, { "epoch": 2.67844832988657, "grad_norm": 0.6825605630874634, "learning_rate": 5.415760697473138e-06, "loss": 0.1534, "step": 42740 }, { "epoch": 2.6790750141003947, "grad_norm": 0.016611248254776, "learning_rate": 5.405205716577653e-06, "loss": 0.0404, "step": 42750 }, { "epoch": 2.6797016983142194, "grad_norm": 0.017577961087226868, "learning_rate": 5.394650735682168e-06, "loss": 0.139, "step": 42760 }, { "epoch": 2.680328382528044, "grad_norm": 6.641668796539307, "learning_rate": 5.384095754786684e-06, "loss": 0.1234, "step": 42770 }, { "epoch": 2.680955066741869, "grad_norm": 0.4626294672489166, "learning_rate": 5.3735407738912e-06, "loss": 0.0406, "step": 42780 }, { "epoch": 2.6815817509556936, "grad_norm": 10.911255836486816, "learning_rate": 5.3629857929957155e-06, "loss": 0.1176, "step": 42790 }, { "epoch": 2.6822084351695183, "grad_norm": 13.511198997497559, "learning_rate": 5.35243081210023e-06, "loss": 0.0905, "step": 42800 }, { "epoch": 2.6828351193833426, "grad_norm": 0.0641942247748375, "learning_rate": 5.341875831204745e-06, "loss": 0.0522, "step": 42810 }, { "epoch": 2.6834618035971673, "grad_norm": 0.009677493013441563, "learning_rate": 5.331320850309261e-06, "loss": 0.0094, "step": 42820 }, { "epoch": 2.684088487810992, "grad_norm": 0.06767957657575607, "learning_rate": 5.320765869413777e-06, "loss": 0.0327, "step": 42830 }, { "epoch": 2.684715172024817, "grad_norm": 0.049471303820610046, "learning_rate": 5.310210888518292e-06, "loss": 0.072, "step": 42840 }, { "epoch": 2.685341856238641, "grad_norm": 0.06735648959875107, "learning_rate": 5.299655907622808e-06, "loss": 0.2579, "step": 42850 }, { "epoch": 2.685968540452466, "grad_norm": 0.09247054904699326, "learning_rate": 5.289100926727323e-06, "loss": 0.0933, "step": 42860 }, { "epoch": 2.6865952246662905, "grad_norm": 0.04064527153968811, "learning_rate": 5.2785459458318385e-06, "loss": 0.1325, "step": 42870 }, { "epoch": 2.6872219088801153, "grad_norm": 0.009018697775900364, "learning_rate": 5.2679909649363535e-06, "loss": 0.0428, "step": 42880 }, { "epoch": 2.68784859309394, "grad_norm": 6.484643459320068, "learning_rate": 5.257435984040869e-06, "loss": 0.1189, "step": 42890 }, { "epoch": 2.6884752773077647, "grad_norm": 0.013771232217550278, "learning_rate": 5.246881003145385e-06, "loss": 0.0026, "step": 42900 }, { "epoch": 2.6891019615215894, "grad_norm": 0.022826027125120163, "learning_rate": 5.2363260222499e-06, "loss": 0.0361, "step": 42910 }, { "epoch": 2.689728645735414, "grad_norm": 0.02315569669008255, "learning_rate": 5.225771041354415e-06, "loss": 0.0551, "step": 42920 }, { "epoch": 2.6903553299492384, "grad_norm": 0.06086333841085434, "learning_rate": 5.215216060458931e-06, "loss": 0.0964, "step": 42930 }, { "epoch": 2.690982014163063, "grad_norm": 0.3857003152370453, "learning_rate": 5.204661079563447e-06, "loss": 0.1914, "step": 42940 }, { "epoch": 2.691608698376888, "grad_norm": 9.57112979888916, "learning_rate": 5.1941060986679616e-06, "loss": 0.2328, "step": 42950 }, { "epoch": 2.6922353825907126, "grad_norm": 4.99583625793457, "learning_rate": 5.1835511177724765e-06, "loss": 0.1707, "step": 42960 }, { "epoch": 2.6928620668045373, "grad_norm": 0.023815227672457695, "learning_rate": 5.172996136876992e-06, "loss": 0.0038, "step": 42970 }, { "epoch": 2.6934887510183616, "grad_norm": 0.05912555754184723, "learning_rate": 5.162441155981508e-06, "loss": 0.0479, "step": 42980 }, { "epoch": 2.6941154352321863, "grad_norm": 0.14293870329856873, "learning_rate": 5.151886175086024e-06, "loss": 0.1847, "step": 42990 }, { "epoch": 2.694742119446011, "grad_norm": 0.02170496992766857, "learning_rate": 5.141331194190539e-06, "loss": 0.0728, "step": 43000 }, { "epoch": 2.6953688036598358, "grad_norm": 0.4979763925075531, "learning_rate": 5.130776213295054e-06, "loss": 0.1014, "step": 43010 }, { "epoch": 2.6959954878736605, "grad_norm": 4.050300121307373, "learning_rate": 5.12022123239957e-06, "loss": 0.1959, "step": 43020 }, { "epoch": 2.6966221720874852, "grad_norm": 0.0454852357506752, "learning_rate": 5.109666251504085e-06, "loss": 0.1131, "step": 43030 }, { "epoch": 2.69724885630131, "grad_norm": 0.37142908573150635, "learning_rate": 5.0991112706086e-06, "loss": 0.0955, "step": 43040 }, { "epoch": 2.6978755405151347, "grad_norm": 0.08890983462333679, "learning_rate": 5.088556289713116e-06, "loss": 0.073, "step": 43050 }, { "epoch": 2.698502224728959, "grad_norm": 0.04773983359336853, "learning_rate": 5.078001308817631e-06, "loss": 0.1204, "step": 43060 }, { "epoch": 2.6991289089427837, "grad_norm": 0.06058727577328682, "learning_rate": 5.067446327922146e-06, "loss": 0.0382, "step": 43070 }, { "epoch": 2.6997555931566084, "grad_norm": 0.09812531620264053, "learning_rate": 5.056891347026662e-06, "loss": 0.0313, "step": 43080 }, { "epoch": 2.700382277370433, "grad_norm": 0.06097055599093437, "learning_rate": 5.046336366131178e-06, "loss": 0.0044, "step": 43090 }, { "epoch": 2.701008961584258, "grad_norm": 0.1884794533252716, "learning_rate": 5.0357813852356935e-06, "loss": 0.0592, "step": 43100 }, { "epoch": 2.701635645798082, "grad_norm": 0.12037229537963867, "learning_rate": 5.0252264043402085e-06, "loss": 0.0371, "step": 43110 }, { "epoch": 2.702262330011907, "grad_norm": 0.5690087080001831, "learning_rate": 5.0146714234447234e-06, "loss": 0.0464, "step": 43120 }, { "epoch": 2.7028890142257316, "grad_norm": 0.030820699408650398, "learning_rate": 5.004116442549239e-06, "loss": 0.0863, "step": 43130 }, { "epoch": 2.7035156984395563, "grad_norm": 0.033750128000974655, "learning_rate": 4.993561461653755e-06, "loss": 0.0341, "step": 43140 }, { "epoch": 2.704142382653381, "grad_norm": 0.0517287403345108, "learning_rate": 4.98300648075827e-06, "loss": 0.1082, "step": 43150 }, { "epoch": 2.7047690668672058, "grad_norm": 0.01474336814135313, "learning_rate": 4.972451499862785e-06, "loss": 0.0034, "step": 43160 }, { "epoch": 2.7053957510810305, "grad_norm": 0.19128337502479553, "learning_rate": 4.961896518967301e-06, "loss": 0.0952, "step": 43170 }, { "epoch": 2.706022435294855, "grad_norm": 0.1466163694858551, "learning_rate": 4.951341538071817e-06, "loss": 0.038, "step": 43180 }, { "epoch": 2.7066491195086795, "grad_norm": 7.711380958557129, "learning_rate": 4.9407865571763315e-06, "loss": 0.2064, "step": 43190 }, { "epoch": 2.707275803722504, "grad_norm": 0.025164706632494926, "learning_rate": 4.930231576280847e-06, "loss": 0.0944, "step": 43200 }, { "epoch": 2.707902487936329, "grad_norm": 5.459824085235596, "learning_rate": 4.919676595385362e-06, "loss": 0.2002, "step": 43210 }, { "epoch": 2.7085291721501537, "grad_norm": 6.968769550323486, "learning_rate": 4.909121614489878e-06, "loss": 0.1127, "step": 43220 }, { "epoch": 2.709155856363978, "grad_norm": 0.03392601013183594, "learning_rate": 4.898566633594393e-06, "loss": 0.037, "step": 43230 }, { "epoch": 2.7097825405778027, "grad_norm": 0.13544121384620667, "learning_rate": 4.888011652698909e-06, "loss": 0.0745, "step": 43240 }, { "epoch": 2.7104092247916274, "grad_norm": 7.2782301902771, "learning_rate": 4.877456671803425e-06, "loss": 0.0351, "step": 43250 }, { "epoch": 2.711035909005452, "grad_norm": 0.06163564696907997, "learning_rate": 4.86690169090794e-06, "loss": 0.1478, "step": 43260 }, { "epoch": 2.711662593219277, "grad_norm": 0.0893465131521225, "learning_rate": 4.856346710012455e-06, "loss": 0.0038, "step": 43270 }, { "epoch": 2.7122892774331016, "grad_norm": 0.6175368428230286, "learning_rate": 4.84579172911697e-06, "loss": 0.1493, "step": 43280 }, { "epoch": 2.7129159616469263, "grad_norm": 0.24023815989494324, "learning_rate": 4.835236748221486e-06, "loss": 0.0617, "step": 43290 }, { "epoch": 2.713542645860751, "grad_norm": 0.01608484983444214, "learning_rate": 4.824681767326002e-06, "loss": 0.0713, "step": 43300 }, { "epoch": 2.7141693300745753, "grad_norm": 0.17486777901649475, "learning_rate": 4.814126786430517e-06, "loss": 0.002, "step": 43310 }, { "epoch": 2.7147960142884, "grad_norm": 0.24492332339286804, "learning_rate": 4.803571805535032e-06, "loss": 0.0289, "step": 43320 }, { "epoch": 2.7154226985022247, "grad_norm": 7.103209972381592, "learning_rate": 4.793016824639548e-06, "loss": 0.0583, "step": 43330 }, { "epoch": 2.7160493827160495, "grad_norm": 0.01576782949268818, "learning_rate": 4.7824618437440635e-06, "loss": 0.0413, "step": 43340 }, { "epoch": 2.716676066929874, "grad_norm": 0.016768546774983406, "learning_rate": 4.7719068628485785e-06, "loss": 0.0285, "step": 43350 }, { "epoch": 2.7173027511436985, "grad_norm": 0.014253930188715458, "learning_rate": 4.761351881953094e-06, "loss": 0.1184, "step": 43360 }, { "epoch": 2.717929435357523, "grad_norm": 0.009223722852766514, "learning_rate": 4.750796901057609e-06, "loss": 0.099, "step": 43370 }, { "epoch": 2.718556119571348, "grad_norm": 0.06819354742765427, "learning_rate": 4.740241920162124e-06, "loss": 0.0382, "step": 43380 }, { "epoch": 2.7191828037851726, "grad_norm": 0.007454665377736092, "learning_rate": 4.72968693926664e-06, "loss": 0.2593, "step": 43390 }, { "epoch": 2.7198094879989974, "grad_norm": 0.02343241311609745, "learning_rate": 4.719131958371156e-06, "loss": 0.0025, "step": 43400 }, { "epoch": 2.720436172212822, "grad_norm": 0.007642934564501047, "learning_rate": 4.708576977475672e-06, "loss": 0.0171, "step": 43410 }, { "epoch": 2.721062856426647, "grad_norm": 0.006195179186761379, "learning_rate": 4.698021996580186e-06, "loss": 0.1173, "step": 43420 }, { "epoch": 2.7216895406404715, "grad_norm": 0.0281064510345459, "learning_rate": 4.6874670156847015e-06, "loss": 0.0045, "step": 43430 }, { "epoch": 2.722316224854296, "grad_norm": 0.0708283931016922, "learning_rate": 4.676912034789217e-06, "loss": 0.0497, "step": 43440 }, { "epoch": 2.7229429090681205, "grad_norm": 0.009736607782542706, "learning_rate": 4.666357053893733e-06, "loss": 0.0016, "step": 43450 }, { "epoch": 2.7235695932819453, "grad_norm": 0.014862255193293095, "learning_rate": 4.655802072998248e-06, "loss": 0.0018, "step": 43460 }, { "epoch": 2.72419627749577, "grad_norm": 0.3423691689968109, "learning_rate": 4.645247092102763e-06, "loss": 0.1195, "step": 43470 }, { "epoch": 2.7248229617095943, "grad_norm": 1.0811691284179688, "learning_rate": 4.634692111207279e-06, "loss": 0.0056, "step": 43480 }, { "epoch": 2.725449645923419, "grad_norm": 0.31755921244621277, "learning_rate": 4.624137130311795e-06, "loss": 0.3132, "step": 43490 }, { "epoch": 2.7260763301372437, "grad_norm": 6.94637393951416, "learning_rate": 4.61358214941631e-06, "loss": 0.1208, "step": 43500 }, { "epoch": 2.7267030143510684, "grad_norm": 0.23933982849121094, "learning_rate": 4.603027168520825e-06, "loss": 0.1155, "step": 43510 }, { "epoch": 2.727329698564893, "grad_norm": 0.018548715859651566, "learning_rate": 4.59247218762534e-06, "loss": 0.0049, "step": 43520 }, { "epoch": 2.727956382778718, "grad_norm": 0.02049705944955349, "learning_rate": 4.581917206729856e-06, "loss": 0.04, "step": 43530 }, { "epoch": 2.7285830669925426, "grad_norm": 0.28012970089912415, "learning_rate": 4.571362225834371e-06, "loss": 0.0044, "step": 43540 }, { "epoch": 2.7292097512063673, "grad_norm": 0.16389213502407074, "learning_rate": 4.560807244938887e-06, "loss": 0.0456, "step": 43550 }, { "epoch": 2.7298364354201916, "grad_norm": 0.27304649353027344, "learning_rate": 4.550252264043403e-06, "loss": 0.1046, "step": 43560 }, { "epoch": 2.7304631196340163, "grad_norm": 0.28648197650909424, "learning_rate": 4.539697283147918e-06, "loss": 0.0498, "step": 43570 }, { "epoch": 2.731089803847841, "grad_norm": 0.013310969807207584, "learning_rate": 4.529142302252433e-06, "loss": 0.0713, "step": 43580 }, { "epoch": 2.731716488061666, "grad_norm": 0.01846585050225258, "learning_rate": 4.5185873213569485e-06, "loss": 0.0194, "step": 43590 }, { "epoch": 2.7323431722754905, "grad_norm": 10.159256935119629, "learning_rate": 4.508032340461464e-06, "loss": 0.0576, "step": 43600 }, { "epoch": 2.732969856489315, "grad_norm": 0.012528217397630215, "learning_rate": 4.49747735956598e-06, "loss": 0.2147, "step": 43610 }, { "epoch": 2.7335965407031395, "grad_norm": 4.751760482788086, "learning_rate": 4.486922378670495e-06, "loss": 0.0491, "step": 43620 }, { "epoch": 2.7342232249169642, "grad_norm": 0.0792778953909874, "learning_rate": 4.47636739777501e-06, "loss": 0.1169, "step": 43630 }, { "epoch": 2.734849909130789, "grad_norm": 0.20727191865444183, "learning_rate": 4.465812416879526e-06, "loss": 0.0531, "step": 43640 }, { "epoch": 2.7354765933446137, "grad_norm": 0.009766854345798492, "learning_rate": 4.455257435984042e-06, "loss": 0.0051, "step": 43650 }, { "epoch": 2.7361032775584384, "grad_norm": 7.263557434082031, "learning_rate": 4.4447024550885565e-06, "loss": 0.0657, "step": 43660 }, { "epoch": 2.736729961772263, "grad_norm": 0.3022579252719879, "learning_rate": 4.4341474741930715e-06, "loss": 0.0035, "step": 43670 }, { "epoch": 2.737356645986088, "grad_norm": 0.016718635335564613, "learning_rate": 4.423592493297587e-06, "loss": 0.0057, "step": 43680 }, { "epoch": 2.737983330199912, "grad_norm": 0.004681083839386702, "learning_rate": 4.413037512402102e-06, "loss": 0.0038, "step": 43690 }, { "epoch": 2.738610014413737, "grad_norm": 0.05114516615867615, "learning_rate": 4.402482531506618e-06, "loss": 0.0013, "step": 43700 }, { "epoch": 2.7392366986275616, "grad_norm": 0.010205262340605259, "learning_rate": 4.391927550611134e-06, "loss": 0.0857, "step": 43710 }, { "epoch": 2.7398633828413863, "grad_norm": 0.20754745602607727, "learning_rate": 4.381372569715649e-06, "loss": 0.0807, "step": 43720 }, { "epoch": 2.740490067055211, "grad_norm": 25.73821258544922, "learning_rate": 4.370817588820164e-06, "loss": 0.1039, "step": 43730 }, { "epoch": 2.7411167512690353, "grad_norm": 23.46526527404785, "learning_rate": 4.36026260792468e-06, "loss": 0.0647, "step": 43740 }, { "epoch": 2.74174343548286, "grad_norm": 6.308553695678711, "learning_rate": 4.349707627029195e-06, "loss": 0.0606, "step": 43750 }, { "epoch": 2.7423701196966848, "grad_norm": 0.009312220849096775, "learning_rate": 4.339152646133711e-06, "loss": 0.0727, "step": 43760 }, { "epoch": 2.7429968039105095, "grad_norm": 0.026370780542492867, "learning_rate": 4.328597665238226e-06, "loss": 0.0012, "step": 43770 }, { "epoch": 2.743623488124334, "grad_norm": 1.8105483055114746, "learning_rate": 4.318042684342741e-06, "loss": 0.2181, "step": 43780 }, { "epoch": 2.744250172338159, "grad_norm": 0.026521051302552223, "learning_rate": 4.307487703447257e-06, "loss": 0.0558, "step": 43790 }, { "epoch": 2.7448768565519837, "grad_norm": 0.056364260613918304, "learning_rate": 4.296932722551773e-06, "loss": 0.0425, "step": 43800 }, { "epoch": 2.7455035407658084, "grad_norm": 0.056610897183418274, "learning_rate": 4.286377741656288e-06, "loss": 0.1089, "step": 43810 }, { "epoch": 2.7461302249796327, "grad_norm": 0.016765808686614037, "learning_rate": 4.2758227607608035e-06, "loss": 0.0718, "step": 43820 }, { "epoch": 2.7467569091934574, "grad_norm": 1.5205049514770508, "learning_rate": 4.2652677798653184e-06, "loss": 0.1873, "step": 43830 }, { "epoch": 2.747383593407282, "grad_norm": 0.021429814398288727, "learning_rate": 4.254712798969834e-06, "loss": 0.1228, "step": 43840 }, { "epoch": 2.748010277621107, "grad_norm": 0.4221652150154114, "learning_rate": 4.244157818074349e-06, "loss": 0.0062, "step": 43850 }, { "epoch": 2.748636961834931, "grad_norm": 0.024672698229551315, "learning_rate": 4.233602837178865e-06, "loss": 0.0617, "step": 43860 }, { "epoch": 2.749263646048756, "grad_norm": 0.3984994888305664, "learning_rate": 4.223047856283381e-06, "loss": 0.0381, "step": 43870 }, { "epoch": 2.7498903302625806, "grad_norm": 0.023568619042634964, "learning_rate": 4.212492875387896e-06, "loss": 0.0323, "step": 43880 }, { "epoch": 2.7505170144764053, "grad_norm": 0.8084057569503784, "learning_rate": 4.201937894492411e-06, "loss": 0.1702, "step": 43890 }, { "epoch": 2.75114369869023, "grad_norm": 53.31449508666992, "learning_rate": 4.1913829135969265e-06, "loss": 0.1677, "step": 43900 }, { "epoch": 2.7517703829040547, "grad_norm": 0.021764956414699554, "learning_rate": 4.180827932701442e-06, "loss": 0.0817, "step": 43910 }, { "epoch": 2.7523970671178795, "grad_norm": 0.05249863490462303, "learning_rate": 4.170272951805958e-06, "loss": 0.0425, "step": 43920 }, { "epoch": 2.753023751331704, "grad_norm": 6.336426258087158, "learning_rate": 4.159717970910472e-06, "loss": 0.1102, "step": 43930 }, { "epoch": 2.7536504355455285, "grad_norm": 0.8154854774475098, "learning_rate": 4.149162990014988e-06, "loss": 0.0805, "step": 43940 }, { "epoch": 2.754277119759353, "grad_norm": 0.028954818844795227, "learning_rate": 4.138608009119504e-06, "loss": 0.0646, "step": 43950 }, { "epoch": 2.754903803973178, "grad_norm": 0.01812795363366604, "learning_rate": 4.12805302822402e-06, "loss": 0.1344, "step": 43960 }, { "epoch": 2.7555304881870026, "grad_norm": 0.1904141902923584, "learning_rate": 4.117498047328535e-06, "loss": 0.1025, "step": 43970 }, { "epoch": 2.7561571724008274, "grad_norm": 0.15014392137527466, "learning_rate": 4.10694306643305e-06, "loss": 0.0035, "step": 43980 }, { "epoch": 2.7567838566146516, "grad_norm": 0.16566021740436554, "learning_rate": 4.096388085537565e-06, "loss": 0.1814, "step": 43990 }, { "epoch": 2.7574105408284764, "grad_norm": 0.4417349398136139, "learning_rate": 4.085833104642081e-06, "loss": 0.0033, "step": 44000 }, { "epoch": 2.758037225042301, "grad_norm": 0.03424863517284393, "learning_rate": 4.075278123746596e-06, "loss": 0.0012, "step": 44010 }, { "epoch": 2.758663909256126, "grad_norm": 0.02660740353167057, "learning_rate": 4.064723142851112e-06, "loss": 0.0299, "step": 44020 }, { "epoch": 2.7592905934699505, "grad_norm": 0.1258479654788971, "learning_rate": 4.054168161955627e-06, "loss": 0.0038, "step": 44030 }, { "epoch": 2.7599172776837753, "grad_norm": 0.13935960829257965, "learning_rate": 4.043613181060142e-06, "loss": 0.0374, "step": 44040 }, { "epoch": 2.7605439618976, "grad_norm": 0.08836539089679718, "learning_rate": 4.033058200164658e-06, "loss": 0.0768, "step": 44050 }, { "epoch": 2.7611706461114247, "grad_norm": 0.18964870274066925, "learning_rate": 4.0225032192691735e-06, "loss": 0.106, "step": 44060 }, { "epoch": 2.761797330325249, "grad_norm": 0.010779556818306446, "learning_rate": 4.011948238373689e-06, "loss": 0.0014, "step": 44070 }, { "epoch": 2.7624240145390737, "grad_norm": 0.21891902387142181, "learning_rate": 4.001393257478204e-06, "loss": 0.0294, "step": 44080 }, { "epoch": 2.7630506987528984, "grad_norm": 0.1056990772485733, "learning_rate": 3.990838276582719e-06, "loss": 0.0478, "step": 44090 }, { "epoch": 2.763677382966723, "grad_norm": 0.1947309374809265, "learning_rate": 3.980283295687235e-06, "loss": 0.1257, "step": 44100 }, { "epoch": 2.764304067180548, "grad_norm": 0.9363518357276917, "learning_rate": 3.969728314791751e-06, "loss": 0.0757, "step": 44110 }, { "epoch": 2.764930751394372, "grad_norm": 0.006408642511814833, "learning_rate": 3.959173333896266e-06, "loss": 0.0577, "step": 44120 }, { "epoch": 2.765557435608197, "grad_norm": 0.36477547883987427, "learning_rate": 3.9486183530007816e-06, "loss": 0.0014, "step": 44130 }, { "epoch": 2.7661841198220216, "grad_norm": 0.11764029413461685, "learning_rate": 3.9380633721052965e-06, "loss": 0.0283, "step": 44140 }, { "epoch": 2.7668108040358463, "grad_norm": 1.2040200233459473, "learning_rate": 3.927508391209812e-06, "loss": 0.0767, "step": 44150 }, { "epoch": 2.767437488249671, "grad_norm": 0.04894113168120384, "learning_rate": 3.916953410314327e-06, "loss": 0.0558, "step": 44160 }, { "epoch": 2.768064172463496, "grad_norm": 7.974740982055664, "learning_rate": 3.906398429418843e-06, "loss": 0.0612, "step": 44170 }, { "epoch": 2.7686908566773205, "grad_norm": 0.5689123272895813, "learning_rate": 3.895843448523358e-06, "loss": 0.0887, "step": 44180 }, { "epoch": 2.769317540891145, "grad_norm": 0.12117575854063034, "learning_rate": 3.885288467627874e-06, "loss": 0.0454, "step": 44190 }, { "epoch": 2.7699442251049695, "grad_norm": 0.05510355532169342, "learning_rate": 3.874733486732389e-06, "loss": 0.0018, "step": 44200 }, { "epoch": 2.7705709093187942, "grad_norm": 0.34615781903266907, "learning_rate": 3.864178505836905e-06, "loss": 0.1697, "step": 44210 }, { "epoch": 2.771197593532619, "grad_norm": 0.016674628481268883, "learning_rate": 3.85362352494142e-06, "loss": 0.0888, "step": 44220 }, { "epoch": 2.7718242777464437, "grad_norm": 0.009896052069962025, "learning_rate": 3.843068544045935e-06, "loss": 0.004, "step": 44230 }, { "epoch": 2.772450961960268, "grad_norm": 11.549787521362305, "learning_rate": 3.83251356315045e-06, "loss": 0.0569, "step": 44240 }, { "epoch": 2.7730776461740927, "grad_norm": 0.005293003749102354, "learning_rate": 3.821958582254966e-06, "loss": 0.1791, "step": 44250 }, { "epoch": 2.7737043303879174, "grad_norm": 0.01065013650804758, "learning_rate": 3.811403601359482e-06, "loss": 0.0196, "step": 44260 }, { "epoch": 2.774331014601742, "grad_norm": 0.4858344793319702, "learning_rate": 3.8008486204639973e-06, "loss": 0.0956, "step": 44270 }, { "epoch": 2.774957698815567, "grad_norm": 0.007913574576377869, "learning_rate": 3.7902936395685123e-06, "loss": 0.0668, "step": 44280 }, { "epoch": 2.7755843830293916, "grad_norm": 0.36157143115997314, "learning_rate": 3.779738658673028e-06, "loss": 0.1573, "step": 44290 }, { "epoch": 2.7762110672432163, "grad_norm": 0.010120880790054798, "learning_rate": 3.7691836777775435e-06, "loss": 0.0908, "step": 44300 }, { "epoch": 2.776837751457041, "grad_norm": 0.016430944204330444, "learning_rate": 3.7586286968820593e-06, "loss": 0.0009, "step": 44310 }, { "epoch": 2.7774644356708653, "grad_norm": 0.011352479457855225, "learning_rate": 3.7480737159865742e-06, "loss": 0.0313, "step": 44320 }, { "epoch": 2.77809111988469, "grad_norm": 0.011910725384950638, "learning_rate": 3.7375187350910896e-06, "loss": 0.0014, "step": 44330 }, { "epoch": 2.7787178040985148, "grad_norm": 0.03391356021165848, "learning_rate": 3.7269637541956054e-06, "loss": 0.1373, "step": 44340 }, { "epoch": 2.7793444883123395, "grad_norm": 0.06280501186847687, "learning_rate": 3.7164087733001204e-06, "loss": 0.0012, "step": 44350 }, { "epoch": 2.779971172526164, "grad_norm": 0.01791159249842167, "learning_rate": 3.7058537924046357e-06, "loss": 0.0778, "step": 44360 }, { "epoch": 2.7805978567399885, "grad_norm": 0.007814787328243256, "learning_rate": 3.6952988115091515e-06, "loss": 0.0069, "step": 44370 }, { "epoch": 2.781224540953813, "grad_norm": 0.03260798752307892, "learning_rate": 3.684743830613667e-06, "loss": 0.0707, "step": 44380 }, { "epoch": 2.781851225167638, "grad_norm": 0.04626648873090744, "learning_rate": 3.674188849718182e-06, "loss": 0.0044, "step": 44390 }, { "epoch": 2.7824779093814627, "grad_norm": 0.004598037339746952, "learning_rate": 3.6636338688226973e-06, "loss": 0.0621, "step": 44400 }, { "epoch": 2.7831045935952874, "grad_norm": 16.925655364990234, "learning_rate": 3.653078887927213e-06, "loss": 0.0674, "step": 44410 }, { "epoch": 2.783731277809112, "grad_norm": 0.01036140788346529, "learning_rate": 3.642523907031729e-06, "loss": 0.0396, "step": 44420 }, { "epoch": 2.784357962022937, "grad_norm": 0.004241311922669411, "learning_rate": 3.6319689261362434e-06, "loss": 0.1163, "step": 44430 }, { "epoch": 2.7849846462367616, "grad_norm": 0.003902185009792447, "learning_rate": 3.621413945240759e-06, "loss": 0.0272, "step": 44440 }, { "epoch": 2.785611330450586, "grad_norm": 0.474302738904953, "learning_rate": 3.6108589643452746e-06, "loss": 0.0135, "step": 44450 }, { "epoch": 2.7862380146644106, "grad_norm": 0.011814618483185768, "learning_rate": 3.6003039834497904e-06, "loss": 0.1892, "step": 44460 }, { "epoch": 2.7868646988782353, "grad_norm": 0.1012982577085495, "learning_rate": 3.5897490025543053e-06, "loss": 0.1607, "step": 44470 }, { "epoch": 2.78749138309206, "grad_norm": 0.03258698061108589, "learning_rate": 3.5791940216588207e-06, "loss": 0.1343, "step": 44480 }, { "epoch": 2.7881180673058843, "grad_norm": 0.004049960989505053, "learning_rate": 3.5686390407633365e-06, "loss": 0.2204, "step": 44490 }, { "epoch": 2.788744751519709, "grad_norm": 0.008130665868520737, "learning_rate": 3.558084059867852e-06, "loss": 0.0028, "step": 44500 }, { "epoch": 2.7893714357335337, "grad_norm": 9.015934944152832, "learning_rate": 3.547529078972367e-06, "loss": 0.082, "step": 44510 }, { "epoch": 2.7899981199473585, "grad_norm": 0.00387417059391737, "learning_rate": 3.5369740980768827e-06, "loss": 0.0949, "step": 44520 }, { "epoch": 2.790624804161183, "grad_norm": 4.448673725128174, "learning_rate": 3.526419117181398e-06, "loss": 0.0473, "step": 44530 }, { "epoch": 2.791251488375008, "grad_norm": 0.11645185202360153, "learning_rate": 3.515864136285914e-06, "loss": 0.0798, "step": 44540 }, { "epoch": 2.7918781725888326, "grad_norm": 0.047189708799123764, "learning_rate": 3.505309155390429e-06, "loss": 0.0368, "step": 44550 }, { "epoch": 2.7925048568026574, "grad_norm": 16.466087341308594, "learning_rate": 3.494754174494944e-06, "loss": 0.09, "step": 44560 }, { "epoch": 2.7931315410164816, "grad_norm": 0.07427942007780075, "learning_rate": 3.48419919359946e-06, "loss": 0.1788, "step": 44570 }, { "epoch": 2.7937582252303064, "grad_norm": 0.014952383004128933, "learning_rate": 3.4736442127039754e-06, "loss": 0.0356, "step": 44580 }, { "epoch": 2.794384909444131, "grad_norm": 0.5309322476387024, "learning_rate": 3.4630892318084903e-06, "loss": 0.0368, "step": 44590 }, { "epoch": 2.795011593657956, "grad_norm": 4.181199550628662, "learning_rate": 3.452534250913006e-06, "loss": 0.1609, "step": 44600 }, { "epoch": 2.7956382778717805, "grad_norm": 0.31589823961257935, "learning_rate": 3.4419792700175215e-06, "loss": 0.0745, "step": 44610 }, { "epoch": 2.796264962085605, "grad_norm": 4.957091331481934, "learning_rate": 3.4314242891220373e-06, "loss": 0.0768, "step": 44620 }, { "epoch": 2.7968916462994295, "grad_norm": 0.6653972268104553, "learning_rate": 3.4208693082265523e-06, "loss": 0.1005, "step": 44630 }, { "epoch": 2.7975183305132543, "grad_norm": 0.03658928722143173, "learning_rate": 3.4103143273310677e-06, "loss": 0.0479, "step": 44640 }, { "epoch": 2.798145014727079, "grad_norm": 0.028675353154540062, "learning_rate": 3.3997593464355835e-06, "loss": 0.01, "step": 44650 }, { "epoch": 2.7987716989409037, "grad_norm": 0.05703501030802727, "learning_rate": 3.389204365540099e-06, "loss": 0.0476, "step": 44660 }, { "epoch": 2.7993983831547284, "grad_norm": 0.025796543806791306, "learning_rate": 3.378649384644614e-06, "loss": 0.2466, "step": 44670 }, { "epoch": 2.800025067368553, "grad_norm": 0.06946486234664917, "learning_rate": 3.368094403749129e-06, "loss": 0.0533, "step": 44680 }, { "epoch": 2.800651751582378, "grad_norm": 0.03723221644759178, "learning_rate": 3.357539422853645e-06, "loss": 0.1658, "step": 44690 }, { "epoch": 2.801278435796202, "grad_norm": 0.20439843833446503, "learning_rate": 3.34698444195816e-06, "loss": 0.0314, "step": 44700 }, { "epoch": 2.801905120010027, "grad_norm": 9.646413803100586, "learning_rate": 3.3364294610626753e-06, "loss": 0.0337, "step": 44710 }, { "epoch": 2.8025318042238516, "grad_norm": 124.05106353759766, "learning_rate": 3.325874480167191e-06, "loss": 0.0586, "step": 44720 }, { "epoch": 2.8031584884376763, "grad_norm": 0.05693316087126732, "learning_rate": 3.3153194992717065e-06, "loss": 0.0022, "step": 44730 }, { "epoch": 2.803785172651501, "grad_norm": 0.04369988664984703, "learning_rate": 3.3047645183762215e-06, "loss": 0.0427, "step": 44740 }, { "epoch": 2.8044118568653253, "grad_norm": 0.03682061284780502, "learning_rate": 3.2942095374807373e-06, "loss": 0.0777, "step": 44750 }, { "epoch": 2.80503854107915, "grad_norm": 0.3942996859550476, "learning_rate": 3.2836545565852527e-06, "loss": 0.123, "step": 44760 }, { "epoch": 2.805665225292975, "grad_norm": 0.03973536938428879, "learning_rate": 3.2730995756897685e-06, "loss": 0.0912, "step": 44770 }, { "epoch": 2.8062919095067995, "grad_norm": 0.3577216863632202, "learning_rate": 3.2625445947942834e-06, "loss": 0.0692, "step": 44780 }, { "epoch": 2.8069185937206242, "grad_norm": 0.13281461596488953, "learning_rate": 3.251989613898799e-06, "loss": 0.1269, "step": 44790 }, { "epoch": 2.807545277934449, "grad_norm": 0.1817847192287445, "learning_rate": 3.2414346330033146e-06, "loss": 0.0766, "step": 44800 }, { "epoch": 2.8081719621482737, "grad_norm": 0.05994454398751259, "learning_rate": 3.23087965210783e-06, "loss": 0.038, "step": 44810 }, { "epoch": 2.808798646362098, "grad_norm": 0.007000461686402559, "learning_rate": 3.220324671212345e-06, "loss": 0.1416, "step": 44820 }, { "epoch": 2.8094253305759227, "grad_norm": 0.06378606706857681, "learning_rate": 3.2097696903168607e-06, "loss": 0.0422, "step": 44830 }, { "epoch": 2.8100520147897474, "grad_norm": 0.007673150394111872, "learning_rate": 3.199214709421376e-06, "loss": 0.0294, "step": 44840 }, { "epoch": 2.810678699003572, "grad_norm": 0.09554049372673035, "learning_rate": 3.188659728525892e-06, "loss": 0.0039, "step": 44850 }, { "epoch": 2.811305383217397, "grad_norm": 0.07531704008579254, "learning_rate": 3.178104747630407e-06, "loss": 0.0449, "step": 44860 }, { "epoch": 2.811932067431221, "grad_norm": 0.2533637583255768, "learning_rate": 3.1675497667349223e-06, "loss": 0.0041, "step": 44870 }, { "epoch": 2.812558751645046, "grad_norm": 8.10892391204834, "learning_rate": 3.156994785839438e-06, "loss": 0.0659, "step": 44880 }, { "epoch": 2.8131854358588706, "grad_norm": 7.151573657989502, "learning_rate": 3.1464398049439535e-06, "loss": 0.1933, "step": 44890 }, { "epoch": 2.8138121200726953, "grad_norm": 0.3158617913722992, "learning_rate": 3.1358848240484684e-06, "loss": 0.0357, "step": 44900 }, { "epoch": 2.81443880428652, "grad_norm": 0.012602360919117928, "learning_rate": 3.125329843152984e-06, "loss": 0.1188, "step": 44910 }, { "epoch": 2.8150654885003448, "grad_norm": 5.404448509216309, "learning_rate": 3.1147748622574996e-06, "loss": 0.0822, "step": 44920 }, { "epoch": 2.8156921727141695, "grad_norm": 0.0632733628153801, "learning_rate": 3.104219881362015e-06, "loss": 0.0099, "step": 44930 }, { "epoch": 2.816318856927994, "grad_norm": 0.19753728806972504, "learning_rate": 3.0936649004665304e-06, "loss": 0.1135, "step": 44940 }, { "epoch": 2.8169455411418185, "grad_norm": 5.0890092849731445, "learning_rate": 3.0831099195710457e-06, "loss": 0.1338, "step": 44950 }, { "epoch": 2.8175722253556432, "grad_norm": 0.8431811332702637, "learning_rate": 3.072554938675561e-06, "loss": 0.0986, "step": 44960 }, { "epoch": 2.818198909569468, "grad_norm": 0.20394667983055115, "learning_rate": 3.0619999577800765e-06, "loss": 0.0036, "step": 44970 }, { "epoch": 2.8188255937832927, "grad_norm": 0.30946293473243713, "learning_rate": 3.0514449768845923e-06, "loss": 0.0354, "step": 44980 }, { "epoch": 2.8194522779971174, "grad_norm": 0.1506856381893158, "learning_rate": 3.0408899959891073e-06, "loss": 0.0177, "step": 44990 }, { "epoch": 2.8200789622109417, "grad_norm": 0.2589752972126007, "learning_rate": 3.030335015093623e-06, "loss": 0.0036, "step": 45000 }, { "epoch": 2.8207056464247664, "grad_norm": 0.020266905426979065, "learning_rate": 3.0197800341981384e-06, "loss": 0.0016, "step": 45010 }, { "epoch": 2.821332330638591, "grad_norm": 10.223530769348145, "learning_rate": 3.0092250533026534e-06, "loss": 0.1038, "step": 45020 }, { "epoch": 2.821959014852416, "grad_norm": 0.005880449432879686, "learning_rate": 2.998670072407169e-06, "loss": 0.0008, "step": 45030 }, { "epoch": 2.8225856990662406, "grad_norm": 0.00848589837551117, "learning_rate": 2.988115091511684e-06, "loss": 0.0746, "step": 45040 }, { "epoch": 2.8232123832800653, "grad_norm": 0.02407548949122429, "learning_rate": 2.9775601106162e-06, "loss": 0.0358, "step": 45050 }, { "epoch": 2.82383906749389, "grad_norm": 0.058489829301834106, "learning_rate": 2.9670051297207154e-06, "loss": 0.0485, "step": 45060 }, { "epoch": 2.8244657517077147, "grad_norm": 172.251220703125, "learning_rate": 2.9564501488252307e-06, "loss": 0.096, "step": 45070 }, { "epoch": 2.825092435921539, "grad_norm": 0.23265521228313446, "learning_rate": 2.945895167929746e-06, "loss": 0.0534, "step": 45080 }, { "epoch": 2.8257191201353637, "grad_norm": 0.03863035887479782, "learning_rate": 2.9353401870342615e-06, "loss": 0.1314, "step": 45090 }, { "epoch": 2.8263458043491885, "grad_norm": 0.027446454390883446, "learning_rate": 2.924785206138777e-06, "loss": 0.0281, "step": 45100 }, { "epoch": 2.826972488563013, "grad_norm": 20.515501022338867, "learning_rate": 2.9142302252432927e-06, "loss": 0.0892, "step": 45110 }, { "epoch": 2.8275991727768375, "grad_norm": 0.011976310983300209, "learning_rate": 2.9036752443478076e-06, "loss": 0.003, "step": 45120 }, { "epoch": 2.828225856990662, "grad_norm": 0.042540911585092545, "learning_rate": 2.8931202634523234e-06, "loss": 0.1174, "step": 45130 }, { "epoch": 2.828852541204487, "grad_norm": 7.581174373626709, "learning_rate": 2.882565282556839e-06, "loss": 0.0382, "step": 45140 }, { "epoch": 2.8294792254183116, "grad_norm": 0.22137735784053802, "learning_rate": 2.872010301661354e-06, "loss": 0.0034, "step": 45150 }, { "epoch": 2.8301059096321364, "grad_norm": 0.2696286737918854, "learning_rate": 2.8614553207658696e-06, "loss": 0.0279, "step": 45160 }, { "epoch": 2.830732593845961, "grad_norm": 0.15648046135902405, "learning_rate": 2.850900339870385e-06, "loss": 0.0128, "step": 45170 }, { "epoch": 2.831359278059786, "grad_norm": 0.03201805427670479, "learning_rate": 2.8403453589749003e-06, "loss": 0.097, "step": 45180 }, { "epoch": 2.8319859622736105, "grad_norm": 0.17333370447158813, "learning_rate": 2.8297903780794157e-06, "loss": 0.0033, "step": 45190 }, { "epoch": 2.832612646487435, "grad_norm": 0.009709888137876987, "learning_rate": 2.819235397183931e-06, "loss": 0.0018, "step": 45200 }, { "epoch": 2.8332393307012596, "grad_norm": 0.012017948552966118, "learning_rate": 2.808680416288447e-06, "loss": 0.0017, "step": 45210 }, { "epoch": 2.8338660149150843, "grad_norm": 0.01653211936354637, "learning_rate": 2.798125435392962e-06, "loss": 0.0024, "step": 45220 }, { "epoch": 2.834492699128909, "grad_norm": 6.492367744445801, "learning_rate": 2.7875704544974777e-06, "loss": 0.1434, "step": 45230 }, { "epoch": 2.8351193833427337, "grad_norm": 11.736387252807617, "learning_rate": 2.777015473601993e-06, "loss": 0.123, "step": 45240 }, { "epoch": 2.835746067556558, "grad_norm": 0.013091763481497765, "learning_rate": 2.7664604927065084e-06, "loss": 0.1348, "step": 45250 }, { "epoch": 2.8363727517703827, "grad_norm": 3.22715163230896, "learning_rate": 2.755905511811024e-06, "loss": 0.0459, "step": 45260 }, { "epoch": 2.8369994359842075, "grad_norm": 15.400287628173828, "learning_rate": 2.745350530915539e-06, "loss": 0.0604, "step": 45270 }, { "epoch": 2.837626120198032, "grad_norm": 0.014584527350962162, "learning_rate": 2.7347955500200546e-06, "loss": 0.0825, "step": 45280 }, { "epoch": 2.838252804411857, "grad_norm": 0.24854691326618195, "learning_rate": 2.7242405691245704e-06, "loss": 0.002, "step": 45290 }, { "epoch": 2.8388794886256816, "grad_norm": 0.1337384581565857, "learning_rate": 2.7136855882290853e-06, "loss": 0.0022, "step": 45300 }, { "epoch": 2.8395061728395063, "grad_norm": 0.014081398025155067, "learning_rate": 2.703130607333601e-06, "loss": 0.0442, "step": 45310 }, { "epoch": 2.840132857053331, "grad_norm": 0.017172805964946747, "learning_rate": 2.692575626438116e-06, "loss": 0.0428, "step": 45320 }, { "epoch": 2.8407595412671554, "grad_norm": 0.021060975268483162, "learning_rate": 2.682020645542632e-06, "loss": 0.1414, "step": 45330 }, { "epoch": 2.84138622548098, "grad_norm": 0.04201918840408325, "learning_rate": 2.6714656646471473e-06, "loss": 0.0713, "step": 45340 }, { "epoch": 2.842012909694805, "grad_norm": 0.01611120067536831, "learning_rate": 2.6609106837516622e-06, "loss": 0.0448, "step": 45350 }, { "epoch": 2.8426395939086295, "grad_norm": 0.021956700831651688, "learning_rate": 2.650355702856178e-06, "loss": 0.0068, "step": 45360 }, { "epoch": 2.8432662781224542, "grad_norm": 0.031278882175683975, "learning_rate": 2.6398007219606934e-06, "loss": 0.0365, "step": 45370 }, { "epoch": 2.8438929623362785, "grad_norm": 0.059587206691503525, "learning_rate": 2.629245741065209e-06, "loss": 0.0497, "step": 45380 }, { "epoch": 2.8445196465501033, "grad_norm": 0.033658090978860855, "learning_rate": 2.618690760169724e-06, "loss": 0.1026, "step": 45390 }, { "epoch": 2.845146330763928, "grad_norm": 0.4833011329174042, "learning_rate": 2.6081357792742396e-06, "loss": 0.0172, "step": 45400 }, { "epoch": 2.8457730149777527, "grad_norm": 0.017121130600571632, "learning_rate": 2.597580798378755e-06, "loss": 0.1607, "step": 45410 }, { "epoch": 2.8463996991915774, "grad_norm": 0.0939975306391716, "learning_rate": 2.5870258174832703e-06, "loss": 0.0868, "step": 45420 }, { "epoch": 2.847026383405402, "grad_norm": 0.1767207384109497, "learning_rate": 2.5764708365877857e-06, "loss": 0.0336, "step": 45430 }, { "epoch": 2.847653067619227, "grad_norm": 0.012052887119352818, "learning_rate": 2.5659158556923015e-06, "loss": 0.0405, "step": 45440 }, { "epoch": 2.8482797518330516, "grad_norm": 0.10825521498918533, "learning_rate": 2.5553608747968165e-06, "loss": 0.0965, "step": 45450 }, { "epoch": 2.848906436046876, "grad_norm": 0.20288436114788055, "learning_rate": 2.5448058939013323e-06, "loss": 0.0956, "step": 45460 }, { "epoch": 2.8495331202607006, "grad_norm": 0.015252267010509968, "learning_rate": 2.5342509130058477e-06, "loss": 0.0015, "step": 45470 }, { "epoch": 2.8501598044745253, "grad_norm": 0.0075327022932469845, "learning_rate": 2.523695932110363e-06, "loss": 0.0031, "step": 45480 }, { "epoch": 2.85078648868835, "grad_norm": 0.024453936144709587, "learning_rate": 2.5131409512148784e-06, "loss": 0.0186, "step": 45490 }, { "epoch": 2.8514131729021743, "grad_norm": 0.015601756051182747, "learning_rate": 2.502585970319394e-06, "loss": 0.0584, "step": 45500 }, { "epoch": 2.852039857115999, "grad_norm": 0.020709333941340446, "learning_rate": 2.492030989423909e-06, "loss": 0.0372, "step": 45510 }, { "epoch": 2.852666541329824, "grad_norm": 9.91942024230957, "learning_rate": 2.481476008528425e-06, "loss": 0.0665, "step": 45520 }, { "epoch": 2.8532932255436485, "grad_norm": 0.011118386872112751, "learning_rate": 2.47092102763294e-06, "loss": 0.001, "step": 45530 }, { "epoch": 2.8539199097574732, "grad_norm": 0.00901950802654028, "learning_rate": 2.4603660467374557e-06, "loss": 0.0011, "step": 45540 }, { "epoch": 2.854546593971298, "grad_norm": 0.09835624694824219, "learning_rate": 2.4498110658419707e-06, "loss": 0.0642, "step": 45550 }, { "epoch": 2.8551732781851227, "grad_norm": 9.125321388244629, "learning_rate": 2.4392560849464865e-06, "loss": 0.0341, "step": 45560 }, { "epoch": 2.8557999623989474, "grad_norm": 0.007022942416369915, "learning_rate": 2.428701104051002e-06, "loss": 0.0011, "step": 45570 }, { "epoch": 2.8564266466127717, "grad_norm": 0.13939209282398224, "learning_rate": 2.4181461231555173e-06, "loss": 0.0816, "step": 45580 }, { "epoch": 2.8570533308265964, "grad_norm": 0.010702289640903473, "learning_rate": 2.4075911422600326e-06, "loss": 0.0439, "step": 45590 }, { "epoch": 2.857680015040421, "grad_norm": 9.077207565307617, "learning_rate": 2.397036161364548e-06, "loss": 0.0487, "step": 45600 }, { "epoch": 2.858306699254246, "grad_norm": 0.005782410968095064, "learning_rate": 2.3864811804690634e-06, "loss": 0.0007, "step": 45610 }, { "epoch": 2.8589333834680706, "grad_norm": 0.006889980286359787, "learning_rate": 2.375926199573579e-06, "loss": 0.0009, "step": 45620 }, { "epoch": 2.859560067681895, "grad_norm": 0.14798372983932495, "learning_rate": 2.365371218678094e-06, "loss": 0.0306, "step": 45630 }, { "epoch": 2.8601867518957196, "grad_norm": 0.009764185175299644, "learning_rate": 2.35481623778261e-06, "loss": 0.0012, "step": 45640 }, { "epoch": 2.8608134361095443, "grad_norm": 6.804908752441406, "learning_rate": 2.3442612568871254e-06, "loss": 0.12, "step": 45650 }, { "epoch": 2.861440120323369, "grad_norm": 0.008579867891967297, "learning_rate": 2.3337062759916407e-06, "loss": 0.027, "step": 45660 }, { "epoch": 2.8620668045371938, "grad_norm": 0.00904889777302742, "learning_rate": 2.323151295096156e-06, "loss": 0.0005, "step": 45670 }, { "epoch": 2.8626934887510185, "grad_norm": 5.600223541259766, "learning_rate": 2.312596314200671e-06, "loss": 0.1602, "step": 45680 }, { "epoch": 2.863320172964843, "grad_norm": 0.03661729022860527, "learning_rate": 2.302041333305187e-06, "loss": 0.0022, "step": 45690 }, { "epoch": 2.863946857178668, "grad_norm": 0.07130564749240875, "learning_rate": 2.2914863524097023e-06, "loss": 0.0046, "step": 45700 }, { "epoch": 2.864573541392492, "grad_norm": 0.005574720446020365, "learning_rate": 2.2809313715142176e-06, "loss": 0.0299, "step": 45710 }, { "epoch": 2.865200225606317, "grad_norm": 0.006751071196049452, "learning_rate": 2.270376390618733e-06, "loss": 0.0568, "step": 45720 }, { "epoch": 2.8658269098201417, "grad_norm": 0.6684667468070984, "learning_rate": 2.2598214097232484e-06, "loss": 0.0062, "step": 45730 }, { "epoch": 2.8664535940339664, "grad_norm": 1.4935647249221802, "learning_rate": 2.2492664288277638e-06, "loss": 0.1296, "step": 45740 }, { "epoch": 2.867080278247791, "grad_norm": 0.006710459478199482, "learning_rate": 2.2387114479322796e-06, "loss": 0.0375, "step": 45750 }, { "epoch": 2.8677069624616154, "grad_norm": 15.230756759643555, "learning_rate": 2.2281564670367945e-06, "loss": 0.1664, "step": 45760 }, { "epoch": 2.86833364667544, "grad_norm": 1.371279239654541, "learning_rate": 2.2176014861413103e-06, "loss": 0.0035, "step": 45770 }, { "epoch": 2.868960330889265, "grad_norm": 0.05189768597483635, "learning_rate": 2.2070465052458253e-06, "loss": 0.0907, "step": 45780 }, { "epoch": 2.8695870151030896, "grad_norm": 0.06382831186056137, "learning_rate": 2.196491524350341e-06, "loss": 0.2114, "step": 45790 }, { "epoch": 2.8702136993169143, "grad_norm": 0.02207627147436142, "learning_rate": 2.1859365434548565e-06, "loss": 0.0651, "step": 45800 }, { "epoch": 2.870840383530739, "grad_norm": 0.01618020236492157, "learning_rate": 2.175381562559372e-06, "loss": 0.0205, "step": 45810 }, { "epoch": 2.8714670677445637, "grad_norm": 0.27104535698890686, "learning_rate": 2.1648265816638872e-06, "loss": 0.0584, "step": 45820 }, { "epoch": 2.872093751958388, "grad_norm": 0.07692761719226837, "learning_rate": 2.1542716007684026e-06, "loss": 0.112, "step": 45830 }, { "epoch": 2.8727204361722127, "grad_norm": 7.452527046203613, "learning_rate": 2.143716619872918e-06, "loss": 0.0788, "step": 45840 }, { "epoch": 2.8733471203860375, "grad_norm": 0.011884380131959915, "learning_rate": 2.133161638977434e-06, "loss": 0.0006, "step": 45850 }, { "epoch": 2.873973804599862, "grad_norm": 0.00841989554464817, "learning_rate": 2.1226066580819488e-06, "loss": 0.0029, "step": 45860 }, { "epoch": 2.874600488813687, "grad_norm": 0.3218337297439575, "learning_rate": 2.1120516771864646e-06, "loss": 0.1013, "step": 45870 }, { "epoch": 2.875227173027511, "grad_norm": 0.3481862246990204, "learning_rate": 2.10149669629098e-06, "loss": 0.1067, "step": 45880 }, { "epoch": 2.875853857241336, "grad_norm": 0.014908461831510067, "learning_rate": 2.0909417153954953e-06, "loss": 0.0643, "step": 45890 }, { "epoch": 2.8764805414551606, "grad_norm": 0.005841756239533424, "learning_rate": 2.0803867345000107e-06, "loss": 0.0578, "step": 45900 }, { "epoch": 2.8771072256689854, "grad_norm": 0.007631808519363403, "learning_rate": 2.069831753604526e-06, "loss": 0.0035, "step": 45910 }, { "epoch": 2.87773390988281, "grad_norm": 0.00997697003185749, "learning_rate": 2.0592767727090415e-06, "loss": 0.0051, "step": 45920 }, { "epoch": 2.878360594096635, "grad_norm": 0.01217371691018343, "learning_rate": 2.048721791813557e-06, "loss": 0.2205, "step": 45930 }, { "epoch": 2.8789872783104595, "grad_norm": 4.541561603546143, "learning_rate": 2.0381668109180722e-06, "loss": 0.3388, "step": 45940 }, { "epoch": 2.8796139625242843, "grad_norm": 0.1472068428993225, "learning_rate": 2.027611830022588e-06, "loss": 0.0013, "step": 45950 }, { "epoch": 2.8802406467381085, "grad_norm": 0.3544577360153198, "learning_rate": 2.017056849127103e-06, "loss": 0.0574, "step": 45960 }, { "epoch": 2.8808673309519333, "grad_norm": 0.181627556681633, "learning_rate": 2.006501868231619e-06, "loss": 0.0161, "step": 45970 }, { "epoch": 2.881494015165758, "grad_norm": 0.009927576407790184, "learning_rate": 1.995946887336134e-06, "loss": 0.0427, "step": 45980 }, { "epoch": 2.8821206993795827, "grad_norm": 0.010722942650318146, "learning_rate": 1.9853919064406496e-06, "loss": 0.0541, "step": 45990 }, { "epoch": 2.8827473835934074, "grad_norm": 0.087717205286026, "learning_rate": 1.974836925545165e-06, "loss": 0.1293, "step": 46000 }, { "epoch": 2.8833740678072317, "grad_norm": 0.810115396976471, "learning_rate": 1.96428194464968e-06, "loss": 0.1102, "step": 46010 }, { "epoch": 2.8840007520210564, "grad_norm": 0.2306370884180069, "learning_rate": 1.9537269637541957e-06, "loss": 0.0255, "step": 46020 }, { "epoch": 2.884627436234881, "grad_norm": 0.09505550563335419, "learning_rate": 1.943171982858711e-06, "loss": 0.0287, "step": 46030 }, { "epoch": 2.885254120448706, "grad_norm": 0.007353039458394051, "learning_rate": 1.9326170019632265e-06, "loss": 0.0801, "step": 46040 }, { "epoch": 2.8858808046625306, "grad_norm": 0.0182589590549469, "learning_rate": 1.922062021067742e-06, "loss": 0.0027, "step": 46050 }, { "epoch": 2.8865074888763553, "grad_norm": 0.008695291355252266, "learning_rate": 1.9115070401722572e-06, "loss": 0.0059, "step": 46060 }, { "epoch": 2.88713417309018, "grad_norm": 0.08065420389175415, "learning_rate": 1.9009520592767726e-06, "loss": 0.108, "step": 46070 }, { "epoch": 2.887760857304005, "grad_norm": 0.047303736209869385, "learning_rate": 1.8903970783812882e-06, "loss": 0.0261, "step": 46080 }, { "epoch": 2.888387541517829, "grad_norm": 0.21223130822181702, "learning_rate": 1.8798420974858036e-06, "loss": 0.0461, "step": 46090 }, { "epoch": 2.889014225731654, "grad_norm": 0.023573195561766624, "learning_rate": 1.8692871165903192e-06, "loss": 0.0967, "step": 46100 }, { "epoch": 2.8896409099454785, "grad_norm": 0.007523571141064167, "learning_rate": 1.8587321356948343e-06, "loss": 0.0229, "step": 46110 }, { "epoch": 2.8902675941593032, "grad_norm": 0.3306170403957367, "learning_rate": 1.84817715479935e-06, "loss": 0.0654, "step": 46120 }, { "epoch": 2.8908942783731275, "grad_norm": 0.04974093288183212, "learning_rate": 1.8376221739038653e-06, "loss": 0.0948, "step": 46130 }, { "epoch": 2.8915209625869522, "grad_norm": 0.14429162442684174, "learning_rate": 1.827067193008381e-06, "loss": 0.0017, "step": 46140 }, { "epoch": 2.892147646800777, "grad_norm": 0.006346757058054209, "learning_rate": 1.816512212112896e-06, "loss": 0.0255, "step": 46150 }, { "epoch": 2.8927743310146017, "grad_norm": 0.02242646925151348, "learning_rate": 1.8059572312174117e-06, "loss": 0.0803, "step": 46160 }, { "epoch": 2.8934010152284264, "grad_norm": 0.026843460276722908, "learning_rate": 1.7954022503219268e-06, "loss": 0.0557, "step": 46170 }, { "epoch": 2.894027699442251, "grad_norm": 0.005104394629597664, "learning_rate": 1.7848472694264424e-06, "loss": 0.0641, "step": 46180 }, { "epoch": 2.894654383656076, "grad_norm": 16.5250244140625, "learning_rate": 1.7742922885309578e-06, "loss": 0.2605, "step": 46190 }, { "epoch": 2.8952810678699006, "grad_norm": 0.08836544305086136, "learning_rate": 1.7637373076354734e-06, "loss": 0.0622, "step": 46200 }, { "epoch": 2.895907752083725, "grad_norm": 0.011723213829100132, "learning_rate": 1.7531823267399886e-06, "loss": 0.1145, "step": 46210 }, { "epoch": 2.8965344362975496, "grad_norm": 0.03772498294711113, "learning_rate": 1.7426273458445042e-06, "loss": 0.1735, "step": 46220 }, { "epoch": 2.8971611205113743, "grad_norm": 4.869604587554932, "learning_rate": 1.7320723649490195e-06, "loss": 0.0396, "step": 46230 }, { "epoch": 2.897787804725199, "grad_norm": 0.10343295335769653, "learning_rate": 1.7215173840535351e-06, "loss": 0.0015, "step": 46240 }, { "epoch": 2.8984144889390238, "grad_norm": 0.21444806456565857, "learning_rate": 1.7109624031580503e-06, "loss": 0.0045, "step": 46250 }, { "epoch": 2.899041173152848, "grad_norm": 0.011719079688191414, "learning_rate": 1.700407422262566e-06, "loss": 0.04, "step": 46260 }, { "epoch": 2.8996678573666728, "grad_norm": 0.02197698876261711, "learning_rate": 1.6898524413670813e-06, "loss": 0.0023, "step": 46270 }, { "epoch": 2.9002945415804975, "grad_norm": 0.07769487798213959, "learning_rate": 1.6792974604715969e-06, "loss": 0.0963, "step": 46280 }, { "epoch": 2.900921225794322, "grad_norm": 9.129709243774414, "learning_rate": 1.668742479576112e-06, "loss": 0.037, "step": 46290 }, { "epoch": 2.901547910008147, "grad_norm": 0.2846452295780182, "learning_rate": 1.6581874986806276e-06, "loss": 0.2469, "step": 46300 }, { "epoch": 2.9021745942219717, "grad_norm": 0.0827544555068016, "learning_rate": 1.6476325177851428e-06, "loss": 0.0516, "step": 46310 }, { "epoch": 2.9028012784357964, "grad_norm": 0.2681815028190613, "learning_rate": 1.6370775368896584e-06, "loss": 0.0819, "step": 46320 }, { "epoch": 2.903427962649621, "grad_norm": 0.011799097061157227, "learning_rate": 1.6265225559941738e-06, "loss": 0.0402, "step": 46330 }, { "epoch": 2.9040546468634454, "grad_norm": 0.5588979125022888, "learning_rate": 1.615967575098689e-06, "loss": 0.0788, "step": 46340 }, { "epoch": 2.90468133107727, "grad_norm": 0.015121783129870892, "learning_rate": 1.6054125942032045e-06, "loss": 0.0032, "step": 46350 }, { "epoch": 2.905308015291095, "grad_norm": 0.07492361217737198, "learning_rate": 1.59485761330772e-06, "loss": 0.0238, "step": 46360 }, { "epoch": 2.9059346995049196, "grad_norm": 0.0074353525415062904, "learning_rate": 1.5843026324122355e-06, "loss": 0.031, "step": 46370 }, { "epoch": 2.9065613837187443, "grad_norm": 0.057144444435834885, "learning_rate": 1.5737476515167507e-06, "loss": 0.0749, "step": 46380 }, { "epoch": 2.9071880679325686, "grad_norm": 0.32816392183303833, "learning_rate": 1.5631926706212663e-06, "loss": 0.0029, "step": 46390 }, { "epoch": 2.9078147521463933, "grad_norm": 0.15893450379371643, "learning_rate": 1.5526376897257817e-06, "loss": 0.1622, "step": 46400 }, { "epoch": 2.908441436360218, "grad_norm": 0.39106377959251404, "learning_rate": 1.542082708830297e-06, "loss": 0.0057, "step": 46410 }, { "epoch": 2.9090681205740427, "grad_norm": 0.046128787100315094, "learning_rate": 1.5315277279348126e-06, "loss": 0.0526, "step": 46420 }, { "epoch": 2.9096948047878675, "grad_norm": 0.01425962895154953, "learning_rate": 1.520972747039328e-06, "loss": 0.0934, "step": 46430 }, { "epoch": 2.910321489001692, "grad_norm": 0.014097287319600582, "learning_rate": 1.5104177661438434e-06, "loss": 0.002, "step": 46440 }, { "epoch": 2.910948173215517, "grad_norm": 0.09383156150579453, "learning_rate": 1.4998627852483588e-06, "loss": 0.0871, "step": 46450 }, { "epoch": 2.911574857429341, "grad_norm": 0.005777876358479261, "learning_rate": 1.4893078043528744e-06, "loss": 0.0014, "step": 46460 }, { "epoch": 2.912201541643166, "grad_norm": 0.020350627601146698, "learning_rate": 1.4787528234573897e-06, "loss": 0.0513, "step": 46470 }, { "epoch": 2.9128282258569906, "grad_norm": 0.007423871662467718, "learning_rate": 1.4681978425619051e-06, "loss": 0.065, "step": 46480 }, { "epoch": 2.9134549100708154, "grad_norm": 0.33089637756347656, "learning_rate": 1.4576428616664203e-06, "loss": 0.0027, "step": 46490 }, { "epoch": 2.91408159428464, "grad_norm": 0.005864977836608887, "learning_rate": 1.4470878807709359e-06, "loss": 0.002, "step": 46500 }, { "epoch": 2.9147082784984644, "grad_norm": 7.691262722015381, "learning_rate": 1.4365328998754513e-06, "loss": 0.0641, "step": 46510 }, { "epoch": 2.915334962712289, "grad_norm": 0.5212214589118958, "learning_rate": 1.4259779189799666e-06, "loss": 0.1364, "step": 46520 }, { "epoch": 2.915961646926114, "grad_norm": 0.08162923902273178, "learning_rate": 1.415422938084482e-06, "loss": 0.0629, "step": 46530 }, { "epoch": 2.9165883311399385, "grad_norm": 1.3101518154144287, "learning_rate": 1.4048679571889974e-06, "loss": 0.0516, "step": 46540 }, { "epoch": 2.9172150153537633, "grad_norm": 0.06720486283302307, "learning_rate": 1.394312976293513e-06, "loss": 0.04, "step": 46550 }, { "epoch": 2.917841699567588, "grad_norm": 0.2428748905658722, "learning_rate": 1.3837579953980284e-06, "loss": 0.0667, "step": 46560 }, { "epoch": 2.9184683837814127, "grad_norm": 0.19993221759796143, "learning_rate": 1.3732030145025438e-06, "loss": 0.1962, "step": 46570 }, { "epoch": 2.9190950679952374, "grad_norm": 0.02945621870458126, "learning_rate": 1.3626480336070591e-06, "loss": 0.258, "step": 46580 }, { "epoch": 2.9197217522090617, "grad_norm": 0.1247883215546608, "learning_rate": 1.3520930527115745e-06, "loss": 0.1295, "step": 46590 }, { "epoch": 2.9203484364228864, "grad_norm": 0.011887336149811745, "learning_rate": 1.3415380718160901e-06, "loss": 0.0717, "step": 46600 }, { "epoch": 2.920975120636711, "grad_norm": 0.010727292858064175, "learning_rate": 1.3309830909206055e-06, "loss": 0.0272, "step": 46610 }, { "epoch": 2.921601804850536, "grad_norm": 0.19950401782989502, "learning_rate": 1.3204281100251209e-06, "loss": 0.1339, "step": 46620 }, { "epoch": 2.9222284890643606, "grad_norm": 0.018687257543206215, "learning_rate": 1.3098731291296363e-06, "loss": 0.1271, "step": 46630 }, { "epoch": 2.922855173278185, "grad_norm": 0.13185709714889526, "learning_rate": 1.2993181482341519e-06, "loss": 0.1095, "step": 46640 }, { "epoch": 2.9234818574920096, "grad_norm": 0.015231356024742126, "learning_rate": 1.2887631673386672e-06, "loss": 0.0017, "step": 46650 }, { "epoch": 2.9241085417058343, "grad_norm": 0.012139714322984219, "learning_rate": 1.2782081864431826e-06, "loss": 0.0944, "step": 46660 }, { "epoch": 2.924735225919659, "grad_norm": 0.012931933626532555, "learning_rate": 1.267653205547698e-06, "loss": 0.0102, "step": 46670 }, { "epoch": 2.925361910133484, "grad_norm": 0.02020031027495861, "learning_rate": 1.2570982246522134e-06, "loss": 0.0012, "step": 46680 }, { "epoch": 2.9259885943473085, "grad_norm": 0.4495704174041748, "learning_rate": 1.246543243756729e-06, "loss": 0.0709, "step": 46690 }, { "epoch": 2.9266152785611332, "grad_norm": 0.006982484832406044, "learning_rate": 1.2359882628612443e-06, "loss": 0.0012, "step": 46700 }, { "epoch": 2.927241962774958, "grad_norm": 0.005636376328766346, "learning_rate": 1.2254332819657597e-06, "loss": 0.0822, "step": 46710 }, { "epoch": 2.9278686469887822, "grad_norm": 0.010224459692835808, "learning_rate": 1.2148783010702751e-06, "loss": 0.0009, "step": 46720 }, { "epoch": 2.928495331202607, "grad_norm": 0.7171324491500854, "learning_rate": 1.2043233201747905e-06, "loss": 0.1434, "step": 46730 }, { "epoch": 2.9291220154164317, "grad_norm": 0.2830602824687958, "learning_rate": 1.193768339279306e-06, "loss": 0.02, "step": 46740 }, { "epoch": 2.9297486996302564, "grad_norm": 0.011865504086017609, "learning_rate": 1.1832133583838215e-06, "loss": 0.1664, "step": 46750 }, { "epoch": 2.9303753838440807, "grad_norm": 0.015045304782688618, "learning_rate": 1.1726583774883368e-06, "loss": 0.0011, "step": 46760 }, { "epoch": 2.9310020680579054, "grad_norm": 0.02452757954597473, "learning_rate": 1.1621033965928522e-06, "loss": 0.0009, "step": 46770 }, { "epoch": 2.93162875227173, "grad_norm": 0.007630917243659496, "learning_rate": 1.1515484156973678e-06, "loss": 0.0451, "step": 46780 }, { "epoch": 2.932255436485555, "grad_norm": 0.05825293809175491, "learning_rate": 1.1409934348018832e-06, "loss": 0.0955, "step": 46790 }, { "epoch": 2.9328821206993796, "grad_norm": 5.381073951721191, "learning_rate": 1.1304384539063986e-06, "loss": 0.0924, "step": 46800 }, { "epoch": 2.9335088049132043, "grad_norm": 0.2285376489162445, "learning_rate": 1.119883473010914e-06, "loss": 0.0039, "step": 46810 }, { "epoch": 2.934135489127029, "grad_norm": 0.11770034581422806, "learning_rate": 1.1093284921154293e-06, "loss": 0.0365, "step": 46820 }, { "epoch": 2.9347621733408538, "grad_norm": 0.009794503450393677, "learning_rate": 1.0987735112199447e-06, "loss": 0.1226, "step": 46830 }, { "epoch": 2.935388857554678, "grad_norm": 0.010756062343716621, "learning_rate": 1.08821853032446e-06, "loss": 0.0753, "step": 46840 }, { "epoch": 2.9360155417685028, "grad_norm": 2.7289679050445557, "learning_rate": 1.0776635494289755e-06, "loss": 0.0095, "step": 46850 }, { "epoch": 2.9366422259823275, "grad_norm": 0.5125044584274292, "learning_rate": 1.0671085685334909e-06, "loss": 0.0563, "step": 46860 }, { "epoch": 2.937268910196152, "grad_norm": 32.42184066772461, "learning_rate": 1.0565535876380065e-06, "loss": 0.157, "step": 46870 }, { "epoch": 2.937895594409977, "grad_norm": 0.024757886305451393, "learning_rate": 1.0459986067425218e-06, "loss": 0.0593, "step": 46880 }, { "epoch": 2.938522278623801, "grad_norm": 10.523122787475586, "learning_rate": 1.0354436258470372e-06, "loss": 0.0794, "step": 46890 }, { "epoch": 2.939148962837626, "grad_norm": 0.024635594338178635, "learning_rate": 1.0248886449515526e-06, "loss": 0.0035, "step": 46900 }, { "epoch": 2.9397756470514507, "grad_norm": 0.01637181080877781, "learning_rate": 1.014333664056068e-06, "loss": 0.0008, "step": 46910 }, { "epoch": 2.9404023312652754, "grad_norm": 0.008923073299229145, "learning_rate": 1.0037786831605836e-06, "loss": 0.0071, "step": 46920 }, { "epoch": 2.9410290154791, "grad_norm": 0.06598563492298126, "learning_rate": 9.93223702265099e-07, "loss": 0.0846, "step": 46930 }, { "epoch": 2.941655699692925, "grad_norm": 0.06929517537355423, "learning_rate": 9.826687213696143e-07, "loss": 0.039, "step": 46940 }, { "epoch": 2.9422823839067496, "grad_norm": 12.796843528747559, "learning_rate": 9.721137404741297e-07, "loss": 0.1326, "step": 46950 }, { "epoch": 2.9429090681205743, "grad_norm": 0.06685430556535721, "learning_rate": 9.61558759578645e-07, "loss": 0.002, "step": 46960 }, { "epoch": 2.9435357523343986, "grad_norm": 0.004804515279829502, "learning_rate": 9.510037786831606e-07, "loss": 0.1405, "step": 46970 }, { "epoch": 2.9441624365482233, "grad_norm": 1.1889760494232178, "learning_rate": 9.404487977876761e-07, "loss": 0.083, "step": 46980 }, { "epoch": 2.944789120762048, "grad_norm": 0.6792351603507996, "learning_rate": 9.298938168921914e-07, "loss": 0.003, "step": 46990 }, { "epoch": 2.9454158049758727, "grad_norm": 0.012094573117792606, "learning_rate": 9.193388359967069e-07, "loss": 0.138, "step": 47000 }, { "epoch": 2.9460424891896975, "grad_norm": 0.025209341198205948, "learning_rate": 9.087838551012223e-07, "loss": 0.1022, "step": 47010 }, { "epoch": 2.9466691734035217, "grad_norm": 0.015056479722261429, "learning_rate": 8.982288742057378e-07, "loss": 0.087, "step": 47020 }, { "epoch": 2.9472958576173465, "grad_norm": 0.1191815584897995, "learning_rate": 8.876738933102532e-07, "loss": 0.0039, "step": 47030 }, { "epoch": 2.947922541831171, "grad_norm": 0.09055612236261368, "learning_rate": 8.771189124147686e-07, "loss": 0.0542, "step": 47040 }, { "epoch": 2.948549226044996, "grad_norm": 0.031856048852205276, "learning_rate": 8.66563931519284e-07, "loss": 0.0304, "step": 47050 }, { "epoch": 2.9491759102588206, "grad_norm": 0.005399190355092287, "learning_rate": 8.560089506237994e-07, "loss": 0.0488, "step": 47060 }, { "epoch": 2.9498025944726454, "grad_norm": 0.07480963319540024, "learning_rate": 8.454539697283149e-07, "loss": 0.1056, "step": 47070 }, { "epoch": 2.95042927868647, "grad_norm": 0.037359680980443954, "learning_rate": 8.348989888328303e-07, "loss": 0.0486, "step": 47080 }, { "epoch": 2.951055962900295, "grad_norm": 0.03391611948609352, "learning_rate": 8.243440079373458e-07, "loss": 0.075, "step": 47090 }, { "epoch": 2.951682647114119, "grad_norm": 0.3366416096687317, "learning_rate": 8.137890270418612e-07, "loss": 0.1217, "step": 47100 }, { "epoch": 2.952309331327944, "grad_norm": 0.2528597116470337, "learning_rate": 8.032340461463765e-07, "loss": 0.1417, "step": 47110 }, { "epoch": 2.9529360155417685, "grad_norm": 0.1561173051595688, "learning_rate": 7.92679065250892e-07, "loss": 0.0294, "step": 47120 }, { "epoch": 2.9535626997555933, "grad_norm": 0.006439610850065947, "learning_rate": 7.821240843554074e-07, "loss": 0.0761, "step": 47130 }, { "epoch": 2.9541893839694175, "grad_norm": 0.09393543750047684, "learning_rate": 7.715691034599228e-07, "loss": 0.1152, "step": 47140 }, { "epoch": 2.9548160681832423, "grad_norm": 0.015306937508285046, "learning_rate": 7.610141225644382e-07, "loss": 0.1413, "step": 47150 }, { "epoch": 2.955442752397067, "grad_norm": 0.013179498724639416, "learning_rate": 7.504591416689537e-07, "loss": 0.0375, "step": 47160 }, { "epoch": 2.9560694366108917, "grad_norm": 0.02790948562324047, "learning_rate": 7.39904160773469e-07, "loss": 0.0948, "step": 47170 }, { "epoch": 2.9566961208247164, "grad_norm": 0.3824322521686554, "learning_rate": 7.293491798779845e-07, "loss": 0.2018, "step": 47180 }, { "epoch": 2.957322805038541, "grad_norm": 0.12762394547462463, "learning_rate": 7.187941989824999e-07, "loss": 0.1251, "step": 47190 }, { "epoch": 2.957949489252366, "grad_norm": 0.007191898766905069, "learning_rate": 7.082392180870153e-07, "loss": 0.0033, "step": 47200 }, { "epoch": 2.9585761734661906, "grad_norm": 0.03603360056877136, "learning_rate": 6.976842371915308e-07, "loss": 0.0031, "step": 47210 }, { "epoch": 2.959202857680015, "grad_norm": 0.0791572779417038, "learning_rate": 6.871292562960462e-07, "loss": 0.0563, "step": 47220 }, { "epoch": 2.9598295418938396, "grad_norm": 0.15367931127548218, "learning_rate": 6.765742754005615e-07, "loss": 0.0826, "step": 47230 }, { "epoch": 2.9604562261076643, "grad_norm": 9.065729141235352, "learning_rate": 6.660192945050769e-07, "loss": 0.127, "step": 47240 }, { "epoch": 2.961082910321489, "grad_norm": 0.014689859934151173, "learning_rate": 6.554643136095924e-07, "loss": 0.0907, "step": 47250 }, { "epoch": 2.961709594535314, "grad_norm": 0.012969248928129673, "learning_rate": 6.449093327141078e-07, "loss": 0.0636, "step": 47260 }, { "epoch": 2.962336278749138, "grad_norm": 0.0509127713739872, "learning_rate": 6.343543518186233e-07, "loss": 0.0992, "step": 47270 }, { "epoch": 2.962962962962963, "grad_norm": 0.40878334641456604, "learning_rate": 6.237993709231387e-07, "loss": 0.0273, "step": 47280 }, { "epoch": 2.9635896471767875, "grad_norm": 0.0912589356303215, "learning_rate": 6.13244390027654e-07, "loss": 0.0213, "step": 47290 }, { "epoch": 2.9642163313906122, "grad_norm": 0.005653919652104378, "learning_rate": 6.026894091321695e-07, "loss": 0.0164, "step": 47300 }, { "epoch": 2.964843015604437, "grad_norm": 0.01706555485725403, "learning_rate": 5.921344282366849e-07, "loss": 0.0996, "step": 47310 }, { "epoch": 2.9654696998182617, "grad_norm": 0.11770886182785034, "learning_rate": 5.815794473412004e-07, "loss": 0.0924, "step": 47320 }, { "epoch": 2.9660963840320864, "grad_norm": 0.05868373066186905, "learning_rate": 5.710244664457158e-07, "loss": 0.1727, "step": 47330 }, { "epoch": 2.966723068245911, "grad_norm": 10.509180068969727, "learning_rate": 5.604694855502311e-07, "loss": 0.1099, "step": 47340 }, { "epoch": 2.9673497524597354, "grad_norm": 0.5250189900398254, "learning_rate": 5.499145046547466e-07, "loss": 0.0871, "step": 47350 }, { "epoch": 2.96797643667356, "grad_norm": 0.005643198732286692, "learning_rate": 5.39359523759262e-07, "loss": 0.0155, "step": 47360 }, { "epoch": 2.968603120887385, "grad_norm": 0.24792006611824036, "learning_rate": 5.288045428637775e-07, "loss": 0.0357, "step": 47370 }, { "epoch": 2.9692298051012096, "grad_norm": 0.2950761020183563, "learning_rate": 5.182495619682929e-07, "loss": 0.026, "step": 47380 }, { "epoch": 2.9698564893150343, "grad_norm": 0.00568793248385191, "learning_rate": 5.076945810728084e-07, "loss": 0.0264, "step": 47390 }, { "epoch": 2.9704831735288586, "grad_norm": 0.008012909442186356, "learning_rate": 4.971396001773236e-07, "loss": 0.0018, "step": 47400 }, { "epoch": 2.9711098577426833, "grad_norm": 0.04426155611872673, "learning_rate": 4.865846192818391e-07, "loss": 0.0976, "step": 47410 }, { "epoch": 2.971736541956508, "grad_norm": 0.009348000399768353, "learning_rate": 4.760296383863545e-07, "loss": 0.041, "step": 47420 }, { "epoch": 2.9723632261703328, "grad_norm": 0.09541311115026474, "learning_rate": 4.6547465749086994e-07, "loss": 0.0034, "step": 47430 }, { "epoch": 2.9729899103841575, "grad_norm": 0.013340012170374393, "learning_rate": 4.549196765953854e-07, "loss": 0.0014, "step": 47440 }, { "epoch": 2.973616594597982, "grad_norm": 0.04422794282436371, "learning_rate": 4.443646956999008e-07, "loss": 0.0389, "step": 47450 }, { "epoch": 2.974243278811807, "grad_norm": 0.09784085303544998, "learning_rate": 4.338097148044162e-07, "loss": 0.1464, "step": 47460 }, { "epoch": 2.9748699630256312, "grad_norm": 0.00832357257604599, "learning_rate": 4.232547339089316e-07, "loss": 0.03, "step": 47470 }, { "epoch": 2.975496647239456, "grad_norm": 8.156325340270996, "learning_rate": 4.1269975301344706e-07, "loss": 0.1161, "step": 47480 }, { "epoch": 2.9761233314532807, "grad_norm": 6.328774452209473, "learning_rate": 4.021447721179625e-07, "loss": 0.0816, "step": 47490 }, { "epoch": 2.9767500156671054, "grad_norm": 0.010376522317528725, "learning_rate": 3.915897912224779e-07, "loss": 0.0299, "step": 47500 }, { "epoch": 2.97737669988093, "grad_norm": 0.0181084256619215, "learning_rate": 3.8103481032699336e-07, "loss": 0.0526, "step": 47510 }, { "epoch": 2.9780033840947544, "grad_norm": 0.04073436185717583, "learning_rate": 3.7047982943150874e-07, "loss": 0.0011, "step": 47520 }, { "epoch": 2.978630068308579, "grad_norm": 0.2758326232433319, "learning_rate": 3.599248485360242e-07, "loss": 0.0034, "step": 47530 }, { "epoch": 2.979256752522404, "grad_norm": 0.011556997895240784, "learning_rate": 3.4936986764053955e-07, "loss": 0.0011, "step": 47540 }, { "epoch": 2.9798834367362286, "grad_norm": 0.03783591091632843, "learning_rate": 3.38814886745055e-07, "loss": 0.0521, "step": 47550 }, { "epoch": 2.9805101209500533, "grad_norm": 0.5519959926605225, "learning_rate": 3.282599058495704e-07, "loss": 0.0333, "step": 47560 }, { "epoch": 2.981136805163878, "grad_norm": 0.05450139567255974, "learning_rate": 3.1770492495408586e-07, "loss": 0.0707, "step": 47570 }, { "epoch": 2.9817634893777027, "grad_norm": 0.056759271770715714, "learning_rate": 3.071499440586013e-07, "loss": 0.0408, "step": 47580 }, { "epoch": 2.9823901735915275, "grad_norm": 12.344095230102539, "learning_rate": 2.965949631631167e-07, "loss": 0.1028, "step": 47590 }, { "epoch": 2.9830168578053518, "grad_norm": 4.790780544281006, "learning_rate": 2.860399822676321e-07, "loss": 0.1723, "step": 47600 }, { "epoch": 2.9836435420191765, "grad_norm": 0.1742323935031891, "learning_rate": 2.7548500137214754e-07, "loss": 0.105, "step": 47610 }, { "epoch": 2.984270226233001, "grad_norm": 0.011358167044818401, "learning_rate": 2.649300204766629e-07, "loss": 0.2667, "step": 47620 }, { "epoch": 2.984896910446826, "grad_norm": 0.010282701812684536, "learning_rate": 2.5437503958117835e-07, "loss": 0.0754, "step": 47630 }, { "epoch": 2.9855235946606506, "grad_norm": 0.030036158859729767, "learning_rate": 2.438200586856938e-07, "loss": 0.0637, "step": 47640 }, { "epoch": 2.986150278874475, "grad_norm": 0.008501702919602394, "learning_rate": 2.3326507779020922e-07, "loss": 0.1259, "step": 47650 }, { "epoch": 2.9867769630882997, "grad_norm": 0.026741160079836845, "learning_rate": 2.2271009689472465e-07, "loss": 0.0055, "step": 47660 }, { "epoch": 2.9874036473021244, "grad_norm": 0.008301452733576298, "learning_rate": 2.1215511599924006e-07, "loss": 0.2303, "step": 47670 }, { "epoch": 2.988030331515949, "grad_norm": 0.005024346057325602, "learning_rate": 2.016001351037555e-07, "loss": 0.1822, "step": 47680 }, { "epoch": 2.988657015729774, "grad_norm": 0.014699382707476616, "learning_rate": 1.910451542082709e-07, "loss": 0.0462, "step": 47690 }, { "epoch": 2.9892836999435985, "grad_norm": 0.08590447157621384, "learning_rate": 1.8049017331278633e-07, "loss": 0.002, "step": 47700 }, { "epoch": 2.9899103841574233, "grad_norm": 0.08910279721021652, "learning_rate": 1.6993519241730171e-07, "loss": 0.115, "step": 47710 }, { "epoch": 2.990537068371248, "grad_norm": 8.29712963104248, "learning_rate": 1.5938021152181715e-07, "loss": 0.0961, "step": 47720 }, { "epoch": 2.9911637525850723, "grad_norm": 1.3991972208023071, "learning_rate": 1.4882523062633258e-07, "loss": 0.0508, "step": 47730 }, { "epoch": 2.991790436798897, "grad_norm": 4.287667274475098, "learning_rate": 1.3827024973084802e-07, "loss": 0.0809, "step": 47740 }, { "epoch": 2.9924171210127217, "grad_norm": 0.1272421032190323, "learning_rate": 1.277152688353634e-07, "loss": 0.0012, "step": 47750 }, { "epoch": 2.9930438052265464, "grad_norm": 0.025945696979761124, "learning_rate": 1.1716028793987883e-07, "loss": 0.123, "step": 47760 }, { "epoch": 2.9936704894403707, "grad_norm": 0.06106176972389221, "learning_rate": 1.0660530704439426e-07, "loss": 0.0575, "step": 47770 }, { "epoch": 2.9942971736541955, "grad_norm": 0.02119859866797924, "learning_rate": 9.605032614890967e-08, "loss": 0.0722, "step": 47780 }, { "epoch": 2.99492385786802, "grad_norm": 0.01732785254716873, "learning_rate": 8.54953452534251e-08, "loss": 0.0866, "step": 47790 }, { "epoch": 2.995550542081845, "grad_norm": 0.009875914081931114, "learning_rate": 7.494036435794051e-08, "loss": 0.0629, "step": 47800 }, { "epoch": 2.9961772262956696, "grad_norm": 0.043381739407777786, "learning_rate": 6.438538346245594e-08, "loss": 0.0017, "step": 47810 }, { "epoch": 2.9968039105094944, "grad_norm": 0.041683148592710495, "learning_rate": 5.383040256697135e-08, "loss": 0.0534, "step": 47820 }, { "epoch": 2.997430594723319, "grad_norm": 0.05090497434139252, "learning_rate": 4.327542167148678e-08, "loss": 0.0547, "step": 47830 }, { "epoch": 2.998057278937144, "grad_norm": 0.018531568348407745, "learning_rate": 3.27204407760022e-08, "loss": 0.0027, "step": 47840 }, { "epoch": 2.998683963150968, "grad_norm": 0.22370028495788574, "learning_rate": 2.2165459880517617e-08, "loss": 0.0557, "step": 47850 }, { "epoch": 2.999310647364793, "grad_norm": 3.741847038269043, "learning_rate": 1.1610478985033039e-08, "loss": 0.0429, "step": 47860 }, { "epoch": 2.9999373315786175, "grad_norm": 0.8121204376220703, "learning_rate": 1.055498089548458e-09, "loss": 0.0538, "step": 47870 }, { "epoch": 3.0, "eval_accuracy": 0.9655334482218393, "eval_f1": 0.9651329077512896, "eval_loss": 0.1548488736152649, "eval_precision": 0.9648537768510945, "eval_recall": 0.9655334482218393, "eval_runtime": 288.3598, "eval_samples_per_second": 110.678, "eval_steps_per_second": 13.837, "step": 47871 } ], "logging_steps": 10, "max_steps": 47871, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.073077472849101e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }