diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,33578 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 47871, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006266842138246538, + "grad_norm": 3.7908527851104736, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.6843, + "step": 10 + }, + { + "epoch": 0.0012533684276493076, + "grad_norm": 4.450835704803467, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6671, + "step": 20 + }, + { + "epoch": 0.0018800526414739613, + "grad_norm": 4.033405780792236, + "learning_rate": 3e-06, + "loss": 0.6299, + "step": 30 + }, + { + "epoch": 0.002506736855298615, + "grad_norm": 4.196229457855225, + "learning_rate": 4.000000000000001e-06, + "loss": 0.5655, + "step": 40 + }, + { + "epoch": 0.0031334210691232687, + "grad_norm": 2.715080738067627, + "learning_rate": 5e-06, + "loss": 0.4726, + "step": 50 + }, + { + "epoch": 0.0037601052829479226, + "grad_norm": 2.747175455093384, + "learning_rate": 6e-06, + "loss": 0.3862, + "step": 60 + }, + { + "epoch": 0.0043867894967725765, + "grad_norm": 2.173097848892212, + "learning_rate": 7.000000000000001e-06, + "loss": 0.3775, + "step": 70 + }, + { + "epoch": 0.00501347371059723, + "grad_norm": 1.9927845001220703, + "learning_rate": 8.000000000000001e-06, + "loss": 0.3042, + "step": 80 + }, + { + "epoch": 0.0056401579244218835, + "grad_norm": 1.9776703119277954, + "learning_rate": 9e-06, + "loss": 0.2959, + "step": 90 + }, + { + "epoch": 0.006266842138246537, + "grad_norm": 1.9182004928588867, + "learning_rate": 1e-05, + "loss": 0.2846, + "step": 100 + }, + { + "epoch": 0.006893526352071191, + "grad_norm": 2.0619113445281982, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.3035, + "step": 110 + }, + { + "epoch": 0.007520210565895845, + "grad_norm": 0.834303617477417, + "learning_rate": 1.2e-05, + "loss": 0.2613, + "step": 120 + }, + { + "epoch": 0.008146894779720498, + "grad_norm": 2.8216769695281982, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.2193, + "step": 130 + }, + { + "epoch": 0.008773578993545153, + "grad_norm": 0.9458842873573303, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.2789, + "step": 140 + }, + { + "epoch": 0.009400263207369806, + "grad_norm": 2.4860618114471436, + "learning_rate": 1.5e-05, + "loss": 0.2208, + "step": 150 + }, + { + "epoch": 0.01002694742119446, + "grad_norm": 0.7528994083404541, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.3034, + "step": 160 + }, + { + "epoch": 0.010653631635019114, + "grad_norm": 0.22199566662311554, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.1009, + "step": 170 + }, + { + "epoch": 0.011280315848843767, + "grad_norm": 3.4047577381134033, + "learning_rate": 1.8e-05, + "loss": 0.2154, + "step": 180 + }, + { + "epoch": 0.011907000062668422, + "grad_norm": 3.8273696899414062, + "learning_rate": 1.9e-05, + "loss": 0.1748, + "step": 190 + }, + { + "epoch": 0.012533684276493075, + "grad_norm": 0.13623806834220886, + "learning_rate": 2e-05, + "loss": 0.0289, + "step": 200 + }, + { + "epoch": 0.01316036849031773, + "grad_norm": 3.247593879699707, + "learning_rate": 2.1e-05, + "loss": 0.4862, + "step": 210 + }, + { + "epoch": 0.013787052704142383, + "grad_norm": 11.889203071594238, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.3237, + "step": 220 + }, + { + "epoch": 0.014413736917967036, + "grad_norm": 12.991532325744629, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.0925, + "step": 230 + }, + { + "epoch": 0.01504042113179169, + "grad_norm": 0.5583555102348328, + "learning_rate": 2.4e-05, + "loss": 0.1658, + "step": 240 + }, + { + "epoch": 0.015667105345616345, + "grad_norm": 1.1460843086242676, + "learning_rate": 2.5e-05, + "loss": 0.2338, + "step": 250 + }, + { + "epoch": 0.016293789559440996, + "grad_norm": 0.3752973973751068, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.0844, + "step": 260 + }, + { + "epoch": 0.01692047377326565, + "grad_norm": 0.15886737406253815, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.2237, + "step": 270 + }, + { + "epoch": 0.017547157987090306, + "grad_norm": 17.93160629272461, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.1505, + "step": 280 + }, + { + "epoch": 0.018173842200914957, + "grad_norm": 5.02848482131958, + "learning_rate": 2.9e-05, + "loss": 0.2666, + "step": 290 + }, + { + "epoch": 0.018800526414739612, + "grad_norm": 0.12570694088935852, + "learning_rate": 3e-05, + "loss": 0.1358, + "step": 300 + }, + { + "epoch": 0.019427210628564267, + "grad_norm": 7.472323894500732, + "learning_rate": 3.1e-05, + "loss": 0.2556, + "step": 310 + }, + { + "epoch": 0.02005389484238892, + "grad_norm": 0.15033140778541565, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.1621, + "step": 320 + }, + { + "epoch": 0.020680579056213573, + "grad_norm": 0.3948344588279724, + "learning_rate": 3.3e-05, + "loss": 0.021, + "step": 330 + }, + { + "epoch": 0.021307263270038228, + "grad_norm": 0.1560896635055542, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.1498, + "step": 340 + }, + { + "epoch": 0.021933947483862883, + "grad_norm": 0.2152891755104065, + "learning_rate": 3.5e-05, + "loss": 0.1955, + "step": 350 + }, + { + "epoch": 0.022560631697687534, + "grad_norm": 0.12051887810230255, + "learning_rate": 3.6e-05, + "loss": 0.1427, + "step": 360 + }, + { + "epoch": 0.02318731591151219, + "grad_norm": 0.1034441739320755, + "learning_rate": 3.7e-05, + "loss": 0.2262, + "step": 370 + }, + { + "epoch": 0.023814000125336843, + "grad_norm": 7.567387580871582, + "learning_rate": 3.8e-05, + "loss": 0.3546, + "step": 380 + }, + { + "epoch": 0.024440684339161498, + "grad_norm": 12.388092994689941, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.1448, + "step": 390 + }, + { + "epoch": 0.02506736855298615, + "grad_norm": 4.503359317779541, + "learning_rate": 4e-05, + "loss": 0.0713, + "step": 400 + }, + { + "epoch": 0.025694052766810804, + "grad_norm": 0.23637720942497253, + "learning_rate": 4.1e-05, + "loss": 0.2683, + "step": 410 + }, + { + "epoch": 0.02632073698063546, + "grad_norm": 0.2273373007774353, + "learning_rate": 4.2e-05, + "loss": 0.0707, + "step": 420 + }, + { + "epoch": 0.02694742119446011, + "grad_norm": 21.912202835083008, + "learning_rate": 4.3e-05, + "loss": 0.0993, + "step": 430 + }, + { + "epoch": 0.027574105408284765, + "grad_norm": 6.012325763702393, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.1063, + "step": 440 + }, + { + "epoch": 0.02820078962210942, + "grad_norm": 1.8421870470046997, + "learning_rate": 4.5e-05, + "loss": 0.4188, + "step": 450 + }, + { + "epoch": 0.02882747383593407, + "grad_norm": 0.906676173210144, + "learning_rate": 4.600000000000001e-05, + "loss": 0.1205, + "step": 460 + }, + { + "epoch": 0.029454158049758726, + "grad_norm": 9.664088249206543, + "learning_rate": 4.7e-05, + "loss": 0.2815, + "step": 470 + }, + { + "epoch": 0.03008084226358338, + "grad_norm": 0.122682586312294, + "learning_rate": 4.8e-05, + "loss": 0.1559, + "step": 480 + }, + { + "epoch": 0.030707526477408036, + "grad_norm": 0.26290163397789, + "learning_rate": 4.9e-05, + "loss": 0.0847, + "step": 490 + }, + { + "epoch": 0.03133421069123269, + "grad_norm": 0.10128908604383469, + "learning_rate": 5e-05, + "loss": 0.1371, + "step": 500 + }, + { + "epoch": 0.03196089490505734, + "grad_norm": 12.029622077941895, + "learning_rate": 4.998944501910452e-05, + "loss": 0.2847, + "step": 510 + }, + { + "epoch": 0.03258757911888199, + "grad_norm": 0.16734056174755096, + "learning_rate": 4.997889003820903e-05, + "loss": 0.1148, + "step": 520 + }, + { + "epoch": 0.03321426333270665, + "grad_norm": 31.41100311279297, + "learning_rate": 4.9968335057313546e-05, + "loss": 0.195, + "step": 530 + }, + { + "epoch": 0.0338409475465313, + "grad_norm": 3.025646209716797, + "learning_rate": 4.995778007641806e-05, + "loss": 0.2554, + "step": 540 + }, + { + "epoch": 0.03446763176035596, + "grad_norm": 4.002821922302246, + "learning_rate": 4.994722509552258e-05, + "loss": 0.1961, + "step": 550 + }, + { + "epoch": 0.03509431597418061, + "grad_norm": 0.12222342193126678, + "learning_rate": 4.9936670114627096e-05, + "loss": 0.3323, + "step": 560 + }, + { + "epoch": 0.03572100018800527, + "grad_norm": 21.177127838134766, + "learning_rate": 4.992611513373161e-05, + "loss": 0.4417, + "step": 570 + }, + { + "epoch": 0.036347684401829915, + "grad_norm": 0.5925000905990601, + "learning_rate": 4.991556015283613e-05, + "loss": 0.223, + "step": 580 + }, + { + "epoch": 0.03697436861565457, + "grad_norm": 16.268051147460938, + "learning_rate": 4.990500517194064e-05, + "loss": 0.181, + "step": 590 + }, + { + "epoch": 0.037601052829479224, + "grad_norm": 0.11519550532102585, + "learning_rate": 4.9894450191045155e-05, + "loss": 0.1659, + "step": 600 + }, + { + "epoch": 0.03822773704330388, + "grad_norm": 0.1412481963634491, + "learning_rate": 4.988389521014967e-05, + "loss": 0.2244, + "step": 610 + }, + { + "epoch": 0.038854421257128534, + "grad_norm": 3.1080451011657715, + "learning_rate": 4.987334022925419e-05, + "loss": 0.0507, + "step": 620 + }, + { + "epoch": 0.03948110547095319, + "grad_norm": 0.1547059267759323, + "learning_rate": 4.98627852483587e-05, + "loss": 0.2914, + "step": 630 + }, + { + "epoch": 0.04010778968477784, + "grad_norm": 0.22034786641597748, + "learning_rate": 4.9852230267463215e-05, + "loss": 0.223, + "step": 640 + }, + { + "epoch": 0.04073447389860249, + "grad_norm": 0.7811187505722046, + "learning_rate": 4.984167528656773e-05, + "loss": 0.2595, + "step": 650 + }, + { + "epoch": 0.041361158112427146, + "grad_norm": 0.7039777040481567, + "learning_rate": 4.983112030567225e-05, + "loss": 0.1963, + "step": 660 + }, + { + "epoch": 0.0419878423262518, + "grad_norm": 15.276497840881348, + "learning_rate": 4.9820565324776765e-05, + "loss": 0.1323, + "step": 670 + }, + { + "epoch": 0.042614526540076456, + "grad_norm": 8.510149002075195, + "learning_rate": 4.981001034388128e-05, + "loss": 0.1898, + "step": 680 + }, + { + "epoch": 0.04324121075390111, + "grad_norm": 1.145938515663147, + "learning_rate": 4.97994553629858e-05, + "loss": 0.1372, + "step": 690 + }, + { + "epoch": 0.043867894967725765, + "grad_norm": 0.04701605811715126, + "learning_rate": 4.978890038209031e-05, + "loss": 0.1105, + "step": 700 + }, + { + "epoch": 0.04449457918155042, + "grad_norm": 9.567878723144531, + "learning_rate": 4.9778345401194825e-05, + "loss": 0.3655, + "step": 710 + }, + { + "epoch": 0.04512126339537507, + "grad_norm": 14.110966682434082, + "learning_rate": 4.976779042029934e-05, + "loss": 0.1356, + "step": 720 + }, + { + "epoch": 0.04574794760919972, + "grad_norm": 0.08877156674861908, + "learning_rate": 4.975723543940386e-05, + "loss": 0.1217, + "step": 730 + }, + { + "epoch": 0.04637463182302438, + "grad_norm": 0.07869545370340347, + "learning_rate": 4.974668045850837e-05, + "loss": 0.1509, + "step": 740 + }, + { + "epoch": 0.04700131603684903, + "grad_norm": 5.638880252838135, + "learning_rate": 4.973612547761289e-05, + "loss": 0.1362, + "step": 750 + }, + { + "epoch": 0.04762800025067369, + "grad_norm": 0.07686334103345871, + "learning_rate": 4.972557049671741e-05, + "loss": 0.0054, + "step": 760 + }, + { + "epoch": 0.04825468446449834, + "grad_norm": 0.11897100508213043, + "learning_rate": 4.971501551582192e-05, + "loss": 0.2837, + "step": 770 + }, + { + "epoch": 0.048881368678322996, + "grad_norm": 15.471230506896973, + "learning_rate": 4.9704460534926435e-05, + "loss": 0.0393, + "step": 780 + }, + { + "epoch": 0.049508052892147644, + "grad_norm": 9.890192985534668, + "learning_rate": 4.969390555403095e-05, + "loss": 0.1452, + "step": 790 + }, + { + "epoch": 0.0501347371059723, + "grad_norm": 12.074395179748535, + "learning_rate": 4.968335057313547e-05, + "loss": 0.4167, + "step": 800 + }, + { + "epoch": 0.050761421319796954, + "grad_norm": 5.107584476470947, + "learning_rate": 4.967279559223998e-05, + "loss": 0.171, + "step": 810 + }, + { + "epoch": 0.05138810553362161, + "grad_norm": 0.09638172388076782, + "learning_rate": 4.9662240611344495e-05, + "loss": 0.0487, + "step": 820 + }, + { + "epoch": 0.05201478974744626, + "grad_norm": 6.376888275146484, + "learning_rate": 4.965168563044901e-05, + "loss": 0.1593, + "step": 830 + }, + { + "epoch": 0.05264147396127092, + "grad_norm": 2.0702414512634277, + "learning_rate": 4.964113064955352e-05, + "loss": 0.277, + "step": 840 + }, + { + "epoch": 0.053268158175095566, + "grad_norm": 0.21229584515094757, + "learning_rate": 4.9630575668658045e-05, + "loss": 0.1619, + "step": 850 + }, + { + "epoch": 0.05389484238892022, + "grad_norm": 15.22604751586914, + "learning_rate": 4.962002068776256e-05, + "loss": 0.1956, + "step": 860 + }, + { + "epoch": 0.054521526602744876, + "grad_norm": 0.05341089889407158, + "learning_rate": 4.960946570686708e-05, + "loss": 0.3803, + "step": 870 + }, + { + "epoch": 0.05514821081656953, + "grad_norm": 0.09002327173948288, + "learning_rate": 4.959891072597159e-05, + "loss": 0.0758, + "step": 880 + }, + { + "epoch": 0.055774895030394185, + "grad_norm": 6.2136430740356445, + "learning_rate": 4.9588355745076104e-05, + "loss": 0.1879, + "step": 890 + }, + { + "epoch": 0.05640157924421884, + "grad_norm": 0.36718255281448364, + "learning_rate": 4.957780076418062e-05, + "loss": 0.2352, + "step": 900 + }, + { + "epoch": 0.057028263458043495, + "grad_norm": 0.7451048493385315, + "learning_rate": 4.956724578328513e-05, + "loss": 0.1774, + "step": 910 + }, + { + "epoch": 0.05765494767186814, + "grad_norm": 0.721674382686615, + "learning_rate": 4.955669080238965e-05, + "loss": 0.1824, + "step": 920 + }, + { + "epoch": 0.0582816318856928, + "grad_norm": 18.23922348022461, + "learning_rate": 4.9546135821494164e-05, + "loss": 0.2313, + "step": 930 + }, + { + "epoch": 0.05890831609951745, + "grad_norm": 10.863558769226074, + "learning_rate": 4.953558084059868e-05, + "loss": 0.271, + "step": 940 + }, + { + "epoch": 0.05953500031334211, + "grad_norm": 14.508039474487305, + "learning_rate": 4.95250258597032e-05, + "loss": 0.1408, + "step": 950 + }, + { + "epoch": 0.06016168452716676, + "grad_norm": 5.872216701507568, + "learning_rate": 4.9514470878807714e-05, + "loss": 0.1507, + "step": 960 + }, + { + "epoch": 0.060788368740991416, + "grad_norm": 7.473176002502441, + "learning_rate": 4.950391589791223e-05, + "loss": 0.2671, + "step": 970 + }, + { + "epoch": 0.06141505295481607, + "grad_norm": 3.9476542472839355, + "learning_rate": 4.949336091701674e-05, + "loss": 0.264, + "step": 980 + }, + { + "epoch": 0.06204173716864072, + "grad_norm": 0.1643088310956955, + "learning_rate": 4.948280593612126e-05, + "loss": 0.1544, + "step": 990 + }, + { + "epoch": 0.06266842138246538, + "grad_norm": 0.5590381026268005, + "learning_rate": 4.9472250955225774e-05, + "loss": 0.2018, + "step": 1000 + }, + { + "epoch": 0.06329510559629004, + "grad_norm": 3.769355058670044, + "learning_rate": 4.946169597433029e-05, + "loss": 0.2406, + "step": 1010 + }, + { + "epoch": 0.06392178981011468, + "grad_norm": 11.80628776550293, + "learning_rate": 4.94511409934348e-05, + "loss": 0.1427, + "step": 1020 + }, + { + "epoch": 0.06454847402393933, + "grad_norm": 11.33843994140625, + "learning_rate": 4.944058601253932e-05, + "loss": 0.3128, + "step": 1030 + }, + { + "epoch": 0.06517515823776399, + "grad_norm": 1.8439558744430542, + "learning_rate": 4.9430031031643834e-05, + "loss": 0.0682, + "step": 1040 + }, + { + "epoch": 0.06580184245158864, + "grad_norm": 0.4288315176963806, + "learning_rate": 4.941947605074835e-05, + "loss": 0.1597, + "step": 1050 + }, + { + "epoch": 0.0664285266654133, + "grad_norm": 0.11158425360918045, + "learning_rate": 4.940892106985287e-05, + "loss": 0.0531, + "step": 1060 + }, + { + "epoch": 0.06705521087923795, + "grad_norm": 1.80587899684906, + "learning_rate": 4.9398366088957384e-05, + "loss": 0.1558, + "step": 1070 + }, + { + "epoch": 0.0676818950930626, + "grad_norm": 9.482861518859863, + "learning_rate": 4.93878111080619e-05, + "loss": 0.1914, + "step": 1080 + }, + { + "epoch": 0.06830857930688726, + "grad_norm": 3.83756685256958, + "learning_rate": 4.937725612716641e-05, + "loss": 0.1753, + "step": 1090 + }, + { + "epoch": 0.06893526352071191, + "grad_norm": 0.12418217211961746, + "learning_rate": 4.936670114627093e-05, + "loss": 0.134, + "step": 1100 + }, + { + "epoch": 0.06956194773453657, + "grad_norm": 3.1548798084259033, + "learning_rate": 4.9356146165375444e-05, + "loss": 0.1173, + "step": 1110 + }, + { + "epoch": 0.07018863194836122, + "grad_norm": 2.56209135055542, + "learning_rate": 4.934559118447996e-05, + "loss": 0.2539, + "step": 1120 + }, + { + "epoch": 0.07081531616218588, + "grad_norm": 3.5509636402130127, + "learning_rate": 4.933503620358447e-05, + "loss": 0.2879, + "step": 1130 + }, + { + "epoch": 0.07144200037601053, + "grad_norm": 0.764828622341156, + "learning_rate": 4.932448122268899e-05, + "loss": 0.2342, + "step": 1140 + }, + { + "epoch": 0.07206868458983519, + "grad_norm": 4.222464561462402, + "learning_rate": 4.9313926241793503e-05, + "loss": 0.0773, + "step": 1150 + }, + { + "epoch": 0.07269536880365983, + "grad_norm": 0.05031255632638931, + "learning_rate": 4.930337126089802e-05, + "loss": 0.0528, + "step": 1160 + }, + { + "epoch": 0.07332205301748448, + "grad_norm": 12.016239166259766, + "learning_rate": 4.929281628000254e-05, + "loss": 0.4079, + "step": 1170 + }, + { + "epoch": 0.07394873723130914, + "grad_norm": 0.22898195683956146, + "learning_rate": 4.928226129910705e-05, + "loss": 0.1286, + "step": 1180 + }, + { + "epoch": 0.0745754214451338, + "grad_norm": 1.5522078275680542, + "learning_rate": 4.927170631821157e-05, + "loss": 0.1678, + "step": 1190 + }, + { + "epoch": 0.07520210565895845, + "grad_norm": 8.510868072509766, + "learning_rate": 4.926115133731608e-05, + "loss": 0.1897, + "step": 1200 + }, + { + "epoch": 0.0758287898727831, + "grad_norm": 15.368112564086914, + "learning_rate": 4.9250596356420597e-05, + "loss": 0.1667, + "step": 1210 + }, + { + "epoch": 0.07645547408660776, + "grad_norm": 5.564446926116943, + "learning_rate": 4.924004137552511e-05, + "loss": 0.0653, + "step": 1220 + }, + { + "epoch": 0.07708215830043241, + "grad_norm": 4.638933181762695, + "learning_rate": 4.922948639462962e-05, + "loss": 0.2175, + "step": 1230 + }, + { + "epoch": 0.07770884251425707, + "grad_norm": 11.21340274810791, + "learning_rate": 4.921893141373414e-05, + "loss": 0.1778, + "step": 1240 + }, + { + "epoch": 0.07833552672808172, + "grad_norm": 0.5090659260749817, + "learning_rate": 4.920837643283866e-05, + "loss": 0.0966, + "step": 1250 + }, + { + "epoch": 0.07896221094190638, + "grad_norm": 0.08244244009256363, + "learning_rate": 4.919782145194318e-05, + "loss": 0.1893, + "step": 1260 + }, + { + "epoch": 0.07958889515573103, + "grad_norm": 0.22240492701530457, + "learning_rate": 4.918726647104769e-05, + "loss": 0.1542, + "step": 1270 + }, + { + "epoch": 0.08021557936955569, + "grad_norm": 0.508705198764801, + "learning_rate": 4.9176711490152206e-05, + "loss": 0.179, + "step": 1280 + }, + { + "epoch": 0.08084226358338034, + "grad_norm": 4.623276710510254, + "learning_rate": 4.916615650925672e-05, + "loss": 0.3569, + "step": 1290 + }, + { + "epoch": 0.08146894779720498, + "grad_norm": 2.1783127784729004, + "learning_rate": 4.915560152836123e-05, + "loss": 0.1525, + "step": 1300 + }, + { + "epoch": 0.08209563201102964, + "grad_norm": 0.3979020118713379, + "learning_rate": 4.914504654746575e-05, + "loss": 0.0309, + "step": 1310 + }, + { + "epoch": 0.08272231622485429, + "grad_norm": 0.14067301154136658, + "learning_rate": 4.9134491566570266e-05, + "loss": 0.1543, + "step": 1320 + }, + { + "epoch": 0.08334900043867895, + "grad_norm": 0.15461118519306183, + "learning_rate": 4.912393658567478e-05, + "loss": 0.0617, + "step": 1330 + }, + { + "epoch": 0.0839756846525036, + "grad_norm": 0.0635291114449501, + "learning_rate": 4.911338160477929e-05, + "loss": 0.1736, + "step": 1340 + }, + { + "epoch": 0.08460236886632826, + "grad_norm": 0.07697498798370361, + "learning_rate": 4.9102826623883816e-05, + "loss": 0.0851, + "step": 1350 + }, + { + "epoch": 0.08522905308015291, + "grad_norm": 3.9091408252716064, + "learning_rate": 4.909227164298833e-05, + "loss": 0.2063, + "step": 1360 + }, + { + "epoch": 0.08585573729397757, + "grad_norm": 0.49504515528678894, + "learning_rate": 4.908171666209284e-05, + "loss": 0.0456, + "step": 1370 + }, + { + "epoch": 0.08648242150780222, + "grad_norm": 3.6886634826660156, + "learning_rate": 4.907116168119736e-05, + "loss": 0.1798, + "step": 1380 + }, + { + "epoch": 0.08710910572162688, + "grad_norm": 24.821048736572266, + "learning_rate": 4.9060606700301876e-05, + "loss": 0.0989, + "step": 1390 + }, + { + "epoch": 0.08773578993545153, + "grad_norm": 10.899468421936035, + "learning_rate": 4.905005171940639e-05, + "loss": 0.4166, + "step": 1400 + }, + { + "epoch": 0.08836247414927619, + "grad_norm": 2.6948788166046143, + "learning_rate": 4.90394967385109e-05, + "loss": 0.3788, + "step": 1410 + }, + { + "epoch": 0.08898915836310084, + "grad_norm": 2.010342836380005, + "learning_rate": 4.902894175761542e-05, + "loss": 0.1184, + "step": 1420 + }, + { + "epoch": 0.08961584257692548, + "grad_norm": 0.03161928057670593, + "learning_rate": 4.9018386776719936e-05, + "loss": 0.0987, + "step": 1430 + }, + { + "epoch": 0.09024252679075014, + "grad_norm": 11.630290031433105, + "learning_rate": 4.900783179582445e-05, + "loss": 0.1251, + "step": 1440 + }, + { + "epoch": 0.09086921100457479, + "grad_norm": 0.29562464356422424, + "learning_rate": 4.899727681492897e-05, + "loss": 0.1273, + "step": 1450 + }, + { + "epoch": 0.09149589521839945, + "grad_norm": 55.65250015258789, + "learning_rate": 4.8986721834033486e-05, + "loss": 0.1763, + "step": 1460 + }, + { + "epoch": 0.0921225794322241, + "grad_norm": 2.1838998794555664, + "learning_rate": 4.8976166853138e-05, + "loss": 0.33, + "step": 1470 + }, + { + "epoch": 0.09274926364604875, + "grad_norm": 7.990715980529785, + "learning_rate": 4.896561187224251e-05, + "loss": 0.1245, + "step": 1480 + }, + { + "epoch": 0.09337594785987341, + "grad_norm": 8.314751625061035, + "learning_rate": 4.895505689134703e-05, + "loss": 0.093, + "step": 1490 + }, + { + "epoch": 0.09400263207369806, + "grad_norm": 2.740541458129883, + "learning_rate": 4.8944501910451546e-05, + "loss": 0.325, + "step": 1500 + }, + { + "epoch": 0.09462931628752272, + "grad_norm": 0.10673114657402039, + "learning_rate": 4.8933946929556055e-05, + "loss": 0.228, + "step": 1510 + }, + { + "epoch": 0.09525600050134737, + "grad_norm": 0.07358872145414352, + "learning_rate": 4.892339194866057e-05, + "loss": 0.134, + "step": 1520 + }, + { + "epoch": 0.09588268471517203, + "grad_norm": 0.673576295375824, + "learning_rate": 4.891283696776509e-05, + "loss": 0.0626, + "step": 1530 + }, + { + "epoch": 0.09650936892899668, + "grad_norm": 9.495621681213379, + "learning_rate": 4.8902281986869605e-05, + "loss": 0.3845, + "step": 1540 + }, + { + "epoch": 0.09713605314282134, + "grad_norm": 4.396236419677734, + "learning_rate": 4.889172700597412e-05, + "loss": 0.1065, + "step": 1550 + }, + { + "epoch": 0.09776273735664599, + "grad_norm": 0.25836917757987976, + "learning_rate": 4.888117202507864e-05, + "loss": 0.2009, + "step": 1560 + }, + { + "epoch": 0.09838942157047063, + "grad_norm": 0.12051882594823837, + "learning_rate": 4.8870617044183155e-05, + "loss": 0.0977, + "step": 1570 + }, + { + "epoch": 0.09901610578429529, + "grad_norm": 1.9034405946731567, + "learning_rate": 4.886006206328767e-05, + "loss": 0.1978, + "step": 1580 + }, + { + "epoch": 0.09964278999811994, + "grad_norm": 0.4317615032196045, + "learning_rate": 4.884950708239218e-05, + "loss": 0.1936, + "step": 1590 + }, + { + "epoch": 0.1002694742119446, + "grad_norm": 1.0015511512756348, + "learning_rate": 4.88389521014967e-05, + "loss": 0.1216, + "step": 1600 + }, + { + "epoch": 0.10089615842576925, + "grad_norm": 0.307364284992218, + "learning_rate": 4.8828397120601215e-05, + "loss": 0.0249, + "step": 1610 + }, + { + "epoch": 0.10152284263959391, + "grad_norm": 0.7230989336967468, + "learning_rate": 4.8817842139705725e-05, + "loss": 0.1124, + "step": 1620 + }, + { + "epoch": 0.10214952685341856, + "grad_norm": 3.9494760036468506, + "learning_rate": 4.880728715881024e-05, + "loss": 0.1477, + "step": 1630 + }, + { + "epoch": 0.10277621106724322, + "grad_norm": 0.13938945531845093, + "learning_rate": 4.879673217791476e-05, + "loss": 0.2507, + "step": 1640 + }, + { + "epoch": 0.10340289528106787, + "grad_norm": 3.76981520652771, + "learning_rate": 4.8786177197019275e-05, + "loss": 0.2671, + "step": 1650 + }, + { + "epoch": 0.10402957949489253, + "grad_norm": 0.21893472969532013, + "learning_rate": 4.877562221612379e-05, + "loss": 0.1446, + "step": 1660 + }, + { + "epoch": 0.10465626370871718, + "grad_norm": 0.17878833413124084, + "learning_rate": 4.876506723522831e-05, + "loss": 0.1263, + "step": 1670 + }, + { + "epoch": 0.10528294792254184, + "grad_norm": 0.6086212396621704, + "learning_rate": 4.8754512254332825e-05, + "loss": 0.1327, + "step": 1680 + }, + { + "epoch": 0.10590963213636649, + "grad_norm": 0.3757482171058655, + "learning_rate": 4.8743957273437335e-05, + "loss": 0.1065, + "step": 1690 + }, + { + "epoch": 0.10653631635019113, + "grad_norm": 0.045173462480306625, + "learning_rate": 4.873340229254185e-05, + "loss": 0.1215, + "step": 1700 + }, + { + "epoch": 0.10716300056401579, + "grad_norm": 0.026341581717133522, + "learning_rate": 4.872284731164637e-05, + "loss": 0.1255, + "step": 1710 + }, + { + "epoch": 0.10778968477784044, + "grad_norm": 0.036156751215457916, + "learning_rate": 4.8712292330750885e-05, + "loss": 0.2899, + "step": 1720 + }, + { + "epoch": 0.1084163689916651, + "grad_norm": 22.635560989379883, + "learning_rate": 4.8701737349855395e-05, + "loss": 0.2425, + "step": 1730 + }, + { + "epoch": 0.10904305320548975, + "grad_norm": 7.215483665466309, + "learning_rate": 4.869118236895991e-05, + "loss": 0.1708, + "step": 1740 + }, + { + "epoch": 0.1096697374193144, + "grad_norm": 8.375259399414062, + "learning_rate": 4.8680627388064435e-05, + "loss": 0.0651, + "step": 1750 + }, + { + "epoch": 0.11029642163313906, + "grad_norm": 0.029143275693058968, + "learning_rate": 4.8670072407168945e-05, + "loss": 0.0235, + "step": 1760 + }, + { + "epoch": 0.11092310584696372, + "grad_norm": 0.640413224697113, + "learning_rate": 4.865951742627346e-05, + "loss": 0.2324, + "step": 1770 + }, + { + "epoch": 0.11154979006078837, + "grad_norm": 5.40239143371582, + "learning_rate": 4.864896244537798e-05, + "loss": 0.1828, + "step": 1780 + }, + { + "epoch": 0.11217647427461303, + "grad_norm": 6.553516864776611, + "learning_rate": 4.8638407464482495e-05, + "loss": 0.0174, + "step": 1790 + }, + { + "epoch": 0.11280315848843768, + "grad_norm": 5.948601722717285, + "learning_rate": 4.8627852483587004e-05, + "loss": 0.2896, + "step": 1800 + }, + { + "epoch": 0.11342984270226233, + "grad_norm": 0.044085729867219925, + "learning_rate": 4.861729750269152e-05, + "loss": 0.1238, + "step": 1810 + }, + { + "epoch": 0.11405652691608699, + "grad_norm": 1.21904718875885, + "learning_rate": 4.860674252179604e-05, + "loss": 0.2042, + "step": 1820 + }, + { + "epoch": 0.11468321112991164, + "grad_norm": 0.3443450927734375, + "learning_rate": 4.859618754090055e-05, + "loss": 0.0531, + "step": 1830 + }, + { + "epoch": 0.11530989534373629, + "grad_norm": 5.81721305847168, + "learning_rate": 4.858563256000507e-05, + "loss": 0.3022, + "step": 1840 + }, + { + "epoch": 0.11593657955756094, + "grad_norm": 1.6238272190093994, + "learning_rate": 4.857507757910959e-05, + "loss": 0.2342, + "step": 1850 + }, + { + "epoch": 0.1165632637713856, + "grad_norm": 1.0162500143051147, + "learning_rate": 4.8564522598214104e-05, + "loss": 0.0913, + "step": 1860 + }, + { + "epoch": 0.11718994798521025, + "grad_norm": 0.029920578002929688, + "learning_rate": 4.8553967617318614e-05, + "loss": 0.167, + "step": 1870 + }, + { + "epoch": 0.1178166321990349, + "grad_norm": 0.19740080833435059, + "learning_rate": 4.854341263642313e-05, + "loss": 0.2157, + "step": 1880 + }, + { + "epoch": 0.11844331641285956, + "grad_norm": 7.18571662902832, + "learning_rate": 4.853285765552765e-05, + "loss": 0.2966, + "step": 1890 + }, + { + "epoch": 0.11907000062668421, + "grad_norm": 0.09219271689653397, + "learning_rate": 4.852230267463216e-05, + "loss": 0.1399, + "step": 1900 + }, + { + "epoch": 0.11969668484050887, + "grad_norm": 0.7403557896614075, + "learning_rate": 4.8511747693736674e-05, + "loss": 0.1223, + "step": 1910 + }, + { + "epoch": 0.12032336905433352, + "grad_norm": 19.517911911010742, + "learning_rate": 4.850119271284119e-05, + "loss": 0.2518, + "step": 1920 + }, + { + "epoch": 0.12095005326815818, + "grad_norm": 0.033364780247211456, + "learning_rate": 4.849063773194571e-05, + "loss": 0.2251, + "step": 1930 + }, + { + "epoch": 0.12157673748198283, + "grad_norm": 0.16062411665916443, + "learning_rate": 4.8480082751050224e-05, + "loss": 0.1562, + "step": 1940 + }, + { + "epoch": 0.12220342169580749, + "grad_norm": 0.25009897351264954, + "learning_rate": 4.846952777015474e-05, + "loss": 0.0093, + "step": 1950 + }, + { + "epoch": 0.12283010590963214, + "grad_norm": 0.11893218010663986, + "learning_rate": 4.845897278925926e-05, + "loss": 0.0805, + "step": 1960 + }, + { + "epoch": 0.12345679012345678, + "grad_norm": 0.02064370922744274, + "learning_rate": 4.8448417808363774e-05, + "loss": 0.2079, + "step": 1970 + }, + { + "epoch": 0.12408347433728144, + "grad_norm": 0.1334376484155655, + "learning_rate": 4.8437862827468284e-05, + "loss": 0.139, + "step": 1980 + }, + { + "epoch": 0.12471015855110609, + "grad_norm": 0.07230320572853088, + "learning_rate": 4.84273078465728e-05, + "loss": 0.1551, + "step": 1990 + }, + { + "epoch": 0.12533684276493076, + "grad_norm": 0.08805450797080994, + "learning_rate": 4.841675286567732e-05, + "loss": 0.0033, + "step": 2000 + }, + { + "epoch": 0.1259635269787554, + "grad_norm": 0.07734368741512299, + "learning_rate": 4.840619788478183e-05, + "loss": 0.3391, + "step": 2010 + }, + { + "epoch": 0.12659021119258007, + "grad_norm": 0.12036556750535965, + "learning_rate": 4.8395642903886344e-05, + "loss": 0.1158, + "step": 2020 + }, + { + "epoch": 0.1272168954064047, + "grad_norm": 6.732293605804443, + "learning_rate": 4.838508792299086e-05, + "loss": 0.1857, + "step": 2030 + }, + { + "epoch": 0.12784357962022935, + "grad_norm": 4.408974647521973, + "learning_rate": 4.837453294209538e-05, + "loss": 0.0729, + "step": 2040 + }, + { + "epoch": 0.12847026383405402, + "grad_norm": 3.5390381813049316, + "learning_rate": 4.8363977961199894e-05, + "loss": 0.1274, + "step": 2050 + }, + { + "epoch": 0.12909694804787866, + "grad_norm": 0.04466002434492111, + "learning_rate": 4.835342298030441e-05, + "loss": 0.0609, + "step": 2060 + }, + { + "epoch": 0.12972363226170333, + "grad_norm": 24.187576293945312, + "learning_rate": 4.834286799940893e-05, + "loss": 0.028, + "step": 2070 + }, + { + "epoch": 0.13035031647552797, + "grad_norm": 0.023793770000338554, + "learning_rate": 4.833231301851344e-05, + "loss": 0.0028, + "step": 2080 + }, + { + "epoch": 0.13097700068935264, + "grad_norm": 0.20558828115463257, + "learning_rate": 4.8321758037617953e-05, + "loss": 0.2252, + "step": 2090 + }, + { + "epoch": 0.13160368490317728, + "grad_norm": 0.06289856880903244, + "learning_rate": 4.831120305672247e-05, + "loss": 0.0928, + "step": 2100 + }, + { + "epoch": 0.13223036911700195, + "grad_norm": 7.317686080932617, + "learning_rate": 4.830064807582699e-05, + "loss": 0.1462, + "step": 2110 + }, + { + "epoch": 0.1328570533308266, + "grad_norm": 2.1458468437194824, + "learning_rate": 4.8290093094931497e-05, + "loss": 0.1463, + "step": 2120 + }, + { + "epoch": 0.13348373754465126, + "grad_norm": 4.286557197570801, + "learning_rate": 4.827953811403601e-05, + "loss": 0.1568, + "step": 2130 + }, + { + "epoch": 0.1341104217584759, + "grad_norm": 8.06277084350586, + "learning_rate": 4.826898313314053e-05, + "loss": 0.0988, + "step": 2140 + }, + { + "epoch": 0.13473710597230057, + "grad_norm": 37.442718505859375, + "learning_rate": 4.8258428152245047e-05, + "loss": 0.0563, + "step": 2150 + }, + { + "epoch": 0.1353637901861252, + "grad_norm": 1.2893574237823486, + "learning_rate": 4.824787317134956e-05, + "loss": 0.053, + "step": 2160 + }, + { + "epoch": 0.13599047439994988, + "grad_norm": 28.35834503173828, + "learning_rate": 4.823731819045408e-05, + "loss": 0.0935, + "step": 2170 + }, + { + "epoch": 0.13661715861377452, + "grad_norm": 0.25138330459594727, + "learning_rate": 4.8226763209558597e-05, + "loss": 0.0325, + "step": 2180 + }, + { + "epoch": 0.13724384282759916, + "grad_norm": 1.9327008724212646, + "learning_rate": 4.8216208228663106e-05, + "loss": 0.0734, + "step": 2190 + }, + { + "epoch": 0.13787052704142383, + "grad_norm": 0.01644737273454666, + "learning_rate": 4.820565324776762e-05, + "loss": 0.1206, + "step": 2200 + }, + { + "epoch": 0.13849721125524847, + "grad_norm": 0.38374465703964233, + "learning_rate": 4.819509826687214e-05, + "loss": 0.1869, + "step": 2210 + }, + { + "epoch": 0.13912389546907314, + "grad_norm": 3.8984410762786865, + "learning_rate": 4.818454328597665e-05, + "loss": 0.1447, + "step": 2220 + }, + { + "epoch": 0.13975057968289778, + "grad_norm": 0.06196863204240799, + "learning_rate": 4.8173988305081166e-05, + "loss": 0.2484, + "step": 2230 + }, + { + "epoch": 0.14037726389672245, + "grad_norm": 0.36186841130256653, + "learning_rate": 4.816343332418568e-05, + "loss": 0.068, + "step": 2240 + }, + { + "epoch": 0.1410039481105471, + "grad_norm": 0.07987497001886368, + "learning_rate": 4.8152878343290206e-05, + "loss": 0.2028, + "step": 2250 + }, + { + "epoch": 0.14163063232437176, + "grad_norm": 6.794961929321289, + "learning_rate": 4.8142323362394716e-05, + "loss": 0.2791, + "step": 2260 + }, + { + "epoch": 0.1422573165381964, + "grad_norm": 0.14792947471141815, + "learning_rate": 4.813176838149923e-05, + "loss": 0.097, + "step": 2270 + }, + { + "epoch": 0.14288400075202107, + "grad_norm": 2.2044332027435303, + "learning_rate": 4.812121340060375e-05, + "loss": 0.2176, + "step": 2280 + }, + { + "epoch": 0.1435106849658457, + "grad_norm": 1.7671102285385132, + "learning_rate": 4.811065841970826e-05, + "loss": 0.1282, + "step": 2290 + }, + { + "epoch": 0.14413736917967038, + "grad_norm": 1.2018122673034668, + "learning_rate": 4.8100103438812776e-05, + "loss": 0.0809, + "step": 2300 + }, + { + "epoch": 0.14476405339349502, + "grad_norm": 0.08096177130937576, + "learning_rate": 4.808954845791729e-05, + "loss": 0.1052, + "step": 2310 + }, + { + "epoch": 0.14539073760731966, + "grad_norm": 0.1800854206085205, + "learning_rate": 4.807899347702181e-05, + "loss": 0.3427, + "step": 2320 + }, + { + "epoch": 0.14601742182114433, + "grad_norm": 0.6755953431129456, + "learning_rate": 4.806843849612632e-05, + "loss": 0.1775, + "step": 2330 + }, + { + "epoch": 0.14664410603496897, + "grad_norm": 0.213535875082016, + "learning_rate": 4.805788351523084e-05, + "loss": 0.0122, + "step": 2340 + }, + { + "epoch": 0.14727079024879364, + "grad_norm": 0.19489499926567078, + "learning_rate": 4.804732853433536e-05, + "loss": 0.1821, + "step": 2350 + }, + { + "epoch": 0.14789747446261828, + "grad_norm": 0.05088840797543526, + "learning_rate": 4.8036773553439876e-05, + "loss": 0.2758, + "step": 2360 + }, + { + "epoch": 0.14852415867644295, + "grad_norm": 3.9418184757232666, + "learning_rate": 4.8026218572544386e-05, + "loss": 0.1707, + "step": 2370 + }, + { + "epoch": 0.1491508428902676, + "grad_norm": 0.08057057112455368, + "learning_rate": 4.80156635916489e-05, + "loss": 0.0479, + "step": 2380 + }, + { + "epoch": 0.14977752710409226, + "grad_norm": 0.035881925374269485, + "learning_rate": 4.800510861075342e-05, + "loss": 0.0087, + "step": 2390 + }, + { + "epoch": 0.1504042113179169, + "grad_norm": 0.07139906287193298, + "learning_rate": 4.799455362985793e-05, + "loss": 0.1374, + "step": 2400 + }, + { + "epoch": 0.15103089553174157, + "grad_norm": 1.764007329940796, + "learning_rate": 4.7983998648962446e-05, + "loss": 0.3789, + "step": 2410 + }, + { + "epoch": 0.1516575797455662, + "grad_norm": 0.17716698348522186, + "learning_rate": 4.797344366806696e-05, + "loss": 0.1987, + "step": 2420 + }, + { + "epoch": 0.15228426395939088, + "grad_norm": 0.5136746764183044, + "learning_rate": 4.796288868717148e-05, + "loss": 0.1136, + "step": 2430 + }, + { + "epoch": 0.15291094817321552, + "grad_norm": 8.765165328979492, + "learning_rate": 4.7952333706275996e-05, + "loss": 0.2001, + "step": 2440 + }, + { + "epoch": 0.15353763238704016, + "grad_norm": 6.890345573425293, + "learning_rate": 4.794177872538051e-05, + "loss": 0.1941, + "step": 2450 + }, + { + "epoch": 0.15416431660086483, + "grad_norm": 2.7363126277923584, + "learning_rate": 4.793122374448503e-05, + "loss": 0.1513, + "step": 2460 + }, + { + "epoch": 0.15479100081468947, + "grad_norm": 0.43579980731010437, + "learning_rate": 4.792066876358954e-05, + "loss": 0.1283, + "step": 2470 + }, + { + "epoch": 0.15541768502851414, + "grad_norm": 10.213183403015137, + "learning_rate": 4.7910113782694055e-05, + "loss": 0.211, + "step": 2480 + }, + { + "epoch": 0.15604436924233878, + "grad_norm": 2.5786595344543457, + "learning_rate": 4.789955880179857e-05, + "loss": 0.1093, + "step": 2490 + }, + { + "epoch": 0.15667105345616344, + "grad_norm": 8.773494720458984, + "learning_rate": 4.788900382090309e-05, + "loss": 0.0228, + "step": 2500 + }, + { + "epoch": 0.15729773766998809, + "grad_norm": 1.877724528312683, + "learning_rate": 4.78784488400076e-05, + "loss": 0.0768, + "step": 2510 + }, + { + "epoch": 0.15792442188381275, + "grad_norm": 0.2219085693359375, + "learning_rate": 4.7867893859112115e-05, + "loss": 0.1902, + "step": 2520 + }, + { + "epoch": 0.1585511060976374, + "grad_norm": 2.0254573822021484, + "learning_rate": 4.785733887821663e-05, + "loss": 0.2101, + "step": 2530 + }, + { + "epoch": 0.15917779031146206, + "grad_norm": 0.1042730063199997, + "learning_rate": 4.784678389732115e-05, + "loss": 0.1561, + "step": 2540 + }, + { + "epoch": 0.1598044745252867, + "grad_norm": 3.1619834899902344, + "learning_rate": 4.7836228916425665e-05, + "loss": 0.2101, + "step": 2550 + }, + { + "epoch": 0.16043115873911137, + "grad_norm": 0.6285260319709778, + "learning_rate": 4.782567393553018e-05, + "loss": 0.0834, + "step": 2560 + }, + { + "epoch": 0.16105784295293601, + "grad_norm": 0.13654287159442902, + "learning_rate": 4.78151189546347e-05, + "loss": 0.0928, + "step": 2570 + }, + { + "epoch": 0.16168452716676068, + "grad_norm": 0.3744922876358032, + "learning_rate": 4.780456397373921e-05, + "loss": 0.2121, + "step": 2580 + }, + { + "epoch": 0.16231121138058532, + "grad_norm": 7.18287992477417, + "learning_rate": 4.7794008992843725e-05, + "loss": 0.1356, + "step": 2590 + }, + { + "epoch": 0.16293789559440996, + "grad_norm": 0.1572078913450241, + "learning_rate": 4.778345401194824e-05, + "loss": 0.1453, + "step": 2600 + }, + { + "epoch": 0.16356457980823463, + "grad_norm": 0.1542951464653015, + "learning_rate": 4.777289903105275e-05, + "loss": 0.1251, + "step": 2610 + }, + { + "epoch": 0.16419126402205927, + "grad_norm": 3.625025749206543, + "learning_rate": 4.776234405015727e-05, + "loss": 0.3419, + "step": 2620 + }, + { + "epoch": 0.16481794823588394, + "grad_norm": 0.07648801058530807, + "learning_rate": 4.7751789069261785e-05, + "loss": 0.0818, + "step": 2630 + }, + { + "epoch": 0.16544463244970858, + "grad_norm": 0.20059090852737427, + "learning_rate": 4.77412340883663e-05, + "loss": 0.19, + "step": 2640 + }, + { + "epoch": 0.16607131666353325, + "grad_norm": 0.1197114810347557, + "learning_rate": 4.773067910747082e-05, + "loss": 0.0858, + "step": 2650 + }, + { + "epoch": 0.1666980008773579, + "grad_norm": 0.030985107645392418, + "learning_rate": 4.7720124126575335e-05, + "loss": 0.0555, + "step": 2660 + }, + { + "epoch": 0.16732468509118256, + "grad_norm": 0.05402218550443649, + "learning_rate": 4.770956914567985e-05, + "loss": 0.013, + "step": 2670 + }, + { + "epoch": 0.1679513693050072, + "grad_norm": 0.8109678626060486, + "learning_rate": 4.769901416478436e-05, + "loss": 0.097, + "step": 2680 + }, + { + "epoch": 0.16857805351883187, + "grad_norm": 7.592169284820557, + "learning_rate": 4.768845918388888e-05, + "loss": 0.1593, + "step": 2690 + }, + { + "epoch": 0.1692047377326565, + "grad_norm": 0.12266740947961807, + "learning_rate": 4.7677904202993395e-05, + "loss": 0.3517, + "step": 2700 + }, + { + "epoch": 0.16983142194648118, + "grad_norm": 0.22772647440433502, + "learning_rate": 4.766734922209791e-05, + "loss": 0.2029, + "step": 2710 + }, + { + "epoch": 0.17045810616030582, + "grad_norm": 0.8687103986740112, + "learning_rate": 4.765679424120242e-05, + "loss": 0.0864, + "step": 2720 + }, + { + "epoch": 0.17108479037413046, + "grad_norm": 0.15315799415111542, + "learning_rate": 4.764623926030694e-05, + "loss": 0.1457, + "step": 2730 + }, + { + "epoch": 0.17171147458795513, + "grad_norm": 2.896604537963867, + "learning_rate": 4.7635684279411454e-05, + "loss": 0.1904, + "step": 2740 + }, + { + "epoch": 0.17233815880177977, + "grad_norm": 0.11487535387277603, + "learning_rate": 4.762512929851598e-05, + "loss": 0.2531, + "step": 2750 + }, + { + "epoch": 0.17296484301560444, + "grad_norm": 5.1275811195373535, + "learning_rate": 4.761457431762049e-05, + "loss": 0.166, + "step": 2760 + }, + { + "epoch": 0.17359152722942908, + "grad_norm": 0.2095639705657959, + "learning_rate": 4.7604019336725004e-05, + "loss": 0.0942, + "step": 2770 + }, + { + "epoch": 0.17421821144325375, + "grad_norm": 7.469160556793213, + "learning_rate": 4.759346435582952e-05, + "loss": 0.2922, + "step": 2780 + }, + { + "epoch": 0.1748448956570784, + "grad_norm": 1.80148184299469, + "learning_rate": 4.758290937493403e-05, + "loss": 0.1071, + "step": 2790 + }, + { + "epoch": 0.17547157987090306, + "grad_norm": 0.3815973699092865, + "learning_rate": 4.757235439403855e-05, + "loss": 0.1167, + "step": 2800 + }, + { + "epoch": 0.1760982640847277, + "grad_norm": 0.01410369761288166, + "learning_rate": 4.7561799413143064e-05, + "loss": 0.0313, + "step": 2810 + }, + { + "epoch": 0.17672494829855237, + "grad_norm": 0.0505596324801445, + "learning_rate": 4.755124443224758e-05, + "loss": 0.212, + "step": 2820 + }, + { + "epoch": 0.177351632512377, + "grad_norm": 5.202013969421387, + "learning_rate": 4.754068945135209e-05, + "loss": 0.1225, + "step": 2830 + }, + { + "epoch": 0.17797831672620168, + "grad_norm": 0.5114884972572327, + "learning_rate": 4.7530134470456614e-05, + "loss": 0.0803, + "step": 2840 + }, + { + "epoch": 0.17860500094002632, + "grad_norm": 0.32649967074394226, + "learning_rate": 4.751957948956113e-05, + "loss": 0.0959, + "step": 2850 + }, + { + "epoch": 0.17923168515385096, + "grad_norm": 0.49903982877731323, + "learning_rate": 4.750902450866564e-05, + "loss": 0.1916, + "step": 2860 + }, + { + "epoch": 0.17985836936767563, + "grad_norm": 0.5530174970626831, + "learning_rate": 4.749846952777016e-05, + "loss": 0.0615, + "step": 2870 + }, + { + "epoch": 0.18048505358150027, + "grad_norm": 0.010926044546067715, + "learning_rate": 4.7487914546874674e-05, + "loss": 0.0331, + "step": 2880 + }, + { + "epoch": 0.18111173779532494, + "grad_norm": 8.002918243408203, + "learning_rate": 4.747735956597919e-05, + "loss": 0.4187, + "step": 2890 + }, + { + "epoch": 0.18173842200914958, + "grad_norm": 0.23336580395698547, + "learning_rate": 4.74668045850837e-05, + "loss": 0.0829, + "step": 2900 + }, + { + "epoch": 0.18236510622297425, + "grad_norm": 2.910646915435791, + "learning_rate": 4.745624960418822e-05, + "loss": 0.2485, + "step": 2910 + }, + { + "epoch": 0.1829917904367989, + "grad_norm": 0.33291372656822205, + "learning_rate": 4.7445694623292734e-05, + "loss": 0.1436, + "step": 2920 + }, + { + "epoch": 0.18361847465062356, + "grad_norm": 8.978804588317871, + "learning_rate": 4.743513964239725e-05, + "loss": 0.0809, + "step": 2930 + }, + { + "epoch": 0.1842451588644482, + "grad_norm": 39.01954650878906, + "learning_rate": 4.742458466150177e-05, + "loss": 0.1548, + "step": 2940 + }, + { + "epoch": 0.18487184307827287, + "grad_norm": 0.8386328816413879, + "learning_rate": 4.7414029680606284e-05, + "loss": 0.2184, + "step": 2950 + }, + { + "epoch": 0.1854985272920975, + "grad_norm": 0.12659718096256256, + "learning_rate": 4.74034746997108e-05, + "loss": 0.1347, + "step": 2960 + }, + { + "epoch": 0.18612521150592218, + "grad_norm": 0.6178060173988342, + "learning_rate": 4.739291971881531e-05, + "loss": 0.1945, + "step": 2970 + }, + { + "epoch": 0.18675189571974682, + "grad_norm": 0.10851160436868668, + "learning_rate": 4.738236473791983e-05, + "loss": 0.1198, + "step": 2980 + }, + { + "epoch": 0.18737857993357146, + "grad_norm": 2.062321186065674, + "learning_rate": 4.7371809757024344e-05, + "loss": 0.3923, + "step": 2990 + }, + { + "epoch": 0.18800526414739613, + "grad_norm": 0.0955008938908577, + "learning_rate": 4.7361254776128853e-05, + "loss": 0.1186, + "step": 3000 + }, + { + "epoch": 0.18863194836122077, + "grad_norm": 0.11655368655920029, + "learning_rate": 4.735069979523337e-05, + "loss": 0.1031, + "step": 3010 + }, + { + "epoch": 0.18925863257504544, + "grad_norm": 0.2267540544271469, + "learning_rate": 4.734014481433789e-05, + "loss": 0.1104, + "step": 3020 + }, + { + "epoch": 0.18988531678887008, + "grad_norm": 12.633221626281738, + "learning_rate": 4.7329589833442403e-05, + "loss": 0.2682, + "step": 3030 + }, + { + "epoch": 0.19051200100269475, + "grad_norm": 0.10257907211780548, + "learning_rate": 4.731903485254692e-05, + "loss": 0.1305, + "step": 3040 + }, + { + "epoch": 0.1911386852165194, + "grad_norm": 0.056987181305885315, + "learning_rate": 4.730847987165144e-05, + "loss": 0.0214, + "step": 3050 + }, + { + "epoch": 0.19176536943034406, + "grad_norm": 0.05389223247766495, + "learning_rate": 4.7297924890755953e-05, + "loss": 0.0817, + "step": 3060 + }, + { + "epoch": 0.1923920536441687, + "grad_norm": 0.11531835794448853, + "learning_rate": 4.728736990986046e-05, + "loss": 0.2344, + "step": 3070 + }, + { + "epoch": 0.19301873785799337, + "grad_norm": 0.5801142454147339, + "learning_rate": 4.727681492896498e-05, + "loss": 0.1039, + "step": 3080 + }, + { + "epoch": 0.193645422071818, + "grad_norm": 2.6055166721343994, + "learning_rate": 4.7266259948069497e-05, + "loss": 0.2305, + "step": 3090 + }, + { + "epoch": 0.19427210628564268, + "grad_norm": 0.3381422758102417, + "learning_rate": 4.725570496717401e-05, + "loss": 0.2068, + "step": 3100 + }, + { + "epoch": 0.19489879049946732, + "grad_norm": 4.314819812774658, + "learning_rate": 4.724514998627852e-05, + "loss": 0.0849, + "step": 3110 + }, + { + "epoch": 0.19552547471329199, + "grad_norm": 0.05561206117272377, + "learning_rate": 4.723459500538304e-05, + "loss": 0.1866, + "step": 3120 + }, + { + "epoch": 0.19615215892711663, + "grad_norm": 0.33232080936431885, + "learning_rate": 4.7224040024487556e-05, + "loss": 0.2515, + "step": 3130 + }, + { + "epoch": 0.19677884314094127, + "grad_norm": 0.579521656036377, + "learning_rate": 4.721348504359207e-05, + "loss": 0.1305, + "step": 3140 + }, + { + "epoch": 0.19740552735476594, + "grad_norm": 0.43451932072639465, + "learning_rate": 4.720293006269659e-05, + "loss": 0.1847, + "step": 3150 + }, + { + "epoch": 0.19803221156859058, + "grad_norm": 1.7295702695846558, + "learning_rate": 4.7192375081801106e-05, + "loss": 0.042, + "step": 3160 + }, + { + "epoch": 0.19865889578241525, + "grad_norm": 0.11605259031057358, + "learning_rate": 4.718182010090562e-05, + "loss": 0.1818, + "step": 3170 + }, + { + "epoch": 0.1992855799962399, + "grad_norm": 3.375051975250244, + "learning_rate": 4.717126512001013e-05, + "loss": 0.0832, + "step": 3180 + }, + { + "epoch": 0.19991226421006456, + "grad_norm": 15.903059005737305, + "learning_rate": 4.716071013911465e-05, + "loss": 0.4062, + "step": 3190 + }, + { + "epoch": 0.2005389484238892, + "grad_norm": 7.629429817199707, + "learning_rate": 4.7150155158219166e-05, + "loss": 0.3307, + "step": 3200 + }, + { + "epoch": 0.20116563263771386, + "grad_norm": 0.11025646328926086, + "learning_rate": 4.713960017732368e-05, + "loss": 0.0341, + "step": 3210 + }, + { + "epoch": 0.2017923168515385, + "grad_norm": 5.843116283416748, + "learning_rate": 4.712904519642819e-05, + "loss": 0.1977, + "step": 3220 + }, + { + "epoch": 0.20241900106536317, + "grad_norm": 5.388326168060303, + "learning_rate": 4.711849021553271e-05, + "loss": 0.1191, + "step": 3230 + }, + { + "epoch": 0.20304568527918782, + "grad_norm": 14.07459831237793, + "learning_rate": 4.7107935234637226e-05, + "loss": 0.1881, + "step": 3240 + }, + { + "epoch": 0.20367236949301248, + "grad_norm": 1.0998096466064453, + "learning_rate": 4.709738025374174e-05, + "loss": 0.1228, + "step": 3250 + }, + { + "epoch": 0.20429905370683712, + "grad_norm": 1.7342157363891602, + "learning_rate": 4.708682527284626e-05, + "loss": 0.0381, + "step": 3260 + }, + { + "epoch": 0.20492573792066177, + "grad_norm": 1.3697205781936646, + "learning_rate": 4.7076270291950776e-05, + "loss": 0.2799, + "step": 3270 + }, + { + "epoch": 0.20555242213448643, + "grad_norm": 0.14480531215667725, + "learning_rate": 4.706571531105529e-05, + "loss": 0.236, + "step": 3280 + }, + { + "epoch": 0.20617910634831108, + "grad_norm": 11.9490385055542, + "learning_rate": 4.70551603301598e-05, + "loss": 0.2193, + "step": 3290 + }, + { + "epoch": 0.20680579056213574, + "grad_norm": 0.29927733540534973, + "learning_rate": 4.704460534926432e-05, + "loss": 0.1346, + "step": 3300 + }, + { + "epoch": 0.20743247477596038, + "grad_norm": 1.87830650806427, + "learning_rate": 4.7034050368368836e-05, + "loss": 0.1282, + "step": 3310 + }, + { + "epoch": 0.20805915898978505, + "grad_norm": 0.37590470910072327, + "learning_rate": 4.7023495387473346e-05, + "loss": 0.0127, + "step": 3320 + }, + { + "epoch": 0.2086858432036097, + "grad_norm": 0.24762612581253052, + "learning_rate": 4.701294040657786e-05, + "loss": 0.3277, + "step": 3330 + }, + { + "epoch": 0.20931252741743436, + "grad_norm": 3.689312219619751, + "learning_rate": 4.7002385425682386e-05, + "loss": 0.1627, + "step": 3340 + }, + { + "epoch": 0.209939211631259, + "grad_norm": 8.164970397949219, + "learning_rate": 4.69918304447869e-05, + "loss": 0.2183, + "step": 3350 + }, + { + "epoch": 0.21056589584508367, + "grad_norm": 5.536794662475586, + "learning_rate": 4.698127546389141e-05, + "loss": 0.2664, + "step": 3360 + }, + { + "epoch": 0.2111925800589083, + "grad_norm": 0.11749021708965302, + "learning_rate": 4.697072048299593e-05, + "loss": 0.0704, + "step": 3370 + }, + { + "epoch": 0.21181926427273298, + "grad_norm": 0.28687575459480286, + "learning_rate": 4.6960165502100446e-05, + "loss": 0.1234, + "step": 3380 + }, + { + "epoch": 0.21244594848655762, + "grad_norm": 4.869693756103516, + "learning_rate": 4.6949610521204955e-05, + "loss": 0.0407, + "step": 3390 + }, + { + "epoch": 0.21307263270038226, + "grad_norm": 0.13574935495853424, + "learning_rate": 4.693905554030947e-05, + "loss": 0.0768, + "step": 3400 + }, + { + "epoch": 0.21369931691420693, + "grad_norm": 0.0712619423866272, + "learning_rate": 4.692850055941399e-05, + "loss": 0.1368, + "step": 3410 + }, + { + "epoch": 0.21432600112803157, + "grad_norm": 0.12375117838382721, + "learning_rate": 4.6917945578518505e-05, + "loss": 0.1302, + "step": 3420 + }, + { + "epoch": 0.21495268534185624, + "grad_norm": 0.2046600729227066, + "learning_rate": 4.690739059762302e-05, + "loss": 0.067, + "step": 3430 + }, + { + "epoch": 0.21557936955568088, + "grad_norm": 0.017954392358660698, + "learning_rate": 4.689683561672754e-05, + "loss": 0.0414, + "step": 3440 + }, + { + "epoch": 0.21620605376950555, + "grad_norm": 0.08368971198797226, + "learning_rate": 4.6886280635832055e-05, + "loss": 0.2528, + "step": 3450 + }, + { + "epoch": 0.2168327379833302, + "grad_norm": 1.2165157794952393, + "learning_rate": 4.6875725654936565e-05, + "loss": 0.0628, + "step": 3460 + }, + { + "epoch": 0.21745942219715486, + "grad_norm": 0.062159277498722076, + "learning_rate": 4.686517067404108e-05, + "loss": 0.1981, + "step": 3470 + }, + { + "epoch": 0.2180861064109795, + "grad_norm": 0.049053970724344254, + "learning_rate": 4.68546156931456e-05, + "loss": 0.0574, + "step": 3480 + }, + { + "epoch": 0.21871279062480417, + "grad_norm": 0.01489468477666378, + "learning_rate": 4.6844060712250115e-05, + "loss": 0.0837, + "step": 3490 + }, + { + "epoch": 0.2193394748386288, + "grad_norm": 0.02858012728393078, + "learning_rate": 4.6833505731354625e-05, + "loss": 0.0839, + "step": 3500 + }, + { + "epoch": 0.21996615905245348, + "grad_norm": 10.893243789672852, + "learning_rate": 4.682295075045914e-05, + "loss": 0.4084, + "step": 3510 + }, + { + "epoch": 0.22059284326627812, + "grad_norm": 0.19011631608009338, + "learning_rate": 4.681239576956366e-05, + "loss": 0.0307, + "step": 3520 + }, + { + "epoch": 0.2212195274801028, + "grad_norm": 3.5512607097625732, + "learning_rate": 4.6801840788668175e-05, + "loss": 0.1811, + "step": 3530 + }, + { + "epoch": 0.22184621169392743, + "grad_norm": 0.42964065074920654, + "learning_rate": 4.679128580777269e-05, + "loss": 0.2602, + "step": 3540 + }, + { + "epoch": 0.22247289590775207, + "grad_norm": 0.08508183062076569, + "learning_rate": 4.678073082687721e-05, + "loss": 0.0838, + "step": 3550 + }, + { + "epoch": 0.22309958012157674, + "grad_norm": 0.020195091143250465, + "learning_rate": 4.6770175845981725e-05, + "loss": 0.1333, + "step": 3560 + }, + { + "epoch": 0.22372626433540138, + "grad_norm": 0.02404479682445526, + "learning_rate": 4.6759620865086235e-05, + "loss": 0.0295, + "step": 3570 + }, + { + "epoch": 0.22435294854922605, + "grad_norm": 8.078899383544922, + "learning_rate": 4.674906588419075e-05, + "loss": 0.2297, + "step": 3580 + }, + { + "epoch": 0.2249796327630507, + "grad_norm": 3.0931432247161865, + "learning_rate": 4.673851090329527e-05, + "loss": 0.0381, + "step": 3590 + }, + { + "epoch": 0.22560631697687536, + "grad_norm": 0.7269296646118164, + "learning_rate": 4.6727955922399785e-05, + "loss": 0.099, + "step": 3600 + }, + { + "epoch": 0.2262330011907, + "grad_norm": 0.09635982662439346, + "learning_rate": 4.6717400941504295e-05, + "loss": 0.1602, + "step": 3610 + }, + { + "epoch": 0.22685968540452467, + "grad_norm": 0.36591339111328125, + "learning_rate": 4.670684596060881e-05, + "loss": 0.1373, + "step": 3620 + }, + { + "epoch": 0.2274863696183493, + "grad_norm": 2.797891616821289, + "learning_rate": 4.669629097971333e-05, + "loss": 0.2, + "step": 3630 + }, + { + "epoch": 0.22811305383217398, + "grad_norm": 0.13196179270744324, + "learning_rate": 4.6685735998817845e-05, + "loss": 0.1875, + "step": 3640 + }, + { + "epoch": 0.22873973804599862, + "grad_norm": 0.13646236062049866, + "learning_rate": 4.667518101792236e-05, + "loss": 0.3006, + "step": 3650 + }, + { + "epoch": 0.2293664222598233, + "grad_norm": 0.4018650949001312, + "learning_rate": 4.666462603702688e-05, + "loss": 0.4341, + "step": 3660 + }, + { + "epoch": 0.22999310647364793, + "grad_norm": 0.5656597018241882, + "learning_rate": 4.6654071056131395e-05, + "loss": 0.0513, + "step": 3670 + }, + { + "epoch": 0.23061979068747257, + "grad_norm": 10.083354949951172, + "learning_rate": 4.6643516075235904e-05, + "loss": 0.2332, + "step": 3680 + }, + { + "epoch": 0.23124647490129724, + "grad_norm": 0.1959206759929657, + "learning_rate": 4.663296109434042e-05, + "loss": 0.139, + "step": 3690 + }, + { + "epoch": 0.23187315911512188, + "grad_norm": 0.418293297290802, + "learning_rate": 4.662240611344494e-05, + "loss": 0.1942, + "step": 3700 + }, + { + "epoch": 0.23249984332894655, + "grad_norm": 1.2529011964797974, + "learning_rate": 4.661185113254945e-05, + "loss": 0.0641, + "step": 3710 + }, + { + "epoch": 0.2331265275427712, + "grad_norm": 41.292633056640625, + "learning_rate": 4.6601296151653964e-05, + "loss": 0.0563, + "step": 3720 + }, + { + "epoch": 0.23375321175659586, + "grad_norm": 0.023516474291682243, + "learning_rate": 4.659074117075848e-05, + "loss": 0.0929, + "step": 3730 + }, + { + "epoch": 0.2343798959704205, + "grad_norm": 3.448504686355591, + "learning_rate": 4.6580186189863e-05, + "loss": 0.1342, + "step": 3740 + }, + { + "epoch": 0.23500658018424517, + "grad_norm": 0.05701204016804695, + "learning_rate": 4.6569631208967514e-05, + "loss": 0.1256, + "step": 3750 + }, + { + "epoch": 0.2356332643980698, + "grad_norm": 0.11099798232316971, + "learning_rate": 4.655907622807203e-05, + "loss": 0.1523, + "step": 3760 + }, + { + "epoch": 0.23625994861189448, + "grad_norm": 0.04083950072526932, + "learning_rate": 4.654852124717655e-05, + "loss": 0.0609, + "step": 3770 + }, + { + "epoch": 0.23688663282571912, + "grad_norm": 6.568485260009766, + "learning_rate": 4.653796626628106e-05, + "loss": 0.1807, + "step": 3780 + }, + { + "epoch": 0.2375133170395438, + "grad_norm": 9.865739822387695, + "learning_rate": 4.6527411285385574e-05, + "loss": 0.1791, + "step": 3790 + }, + { + "epoch": 0.23814000125336843, + "grad_norm": 0.05502455309033394, + "learning_rate": 4.651685630449009e-05, + "loss": 0.0791, + "step": 3800 + }, + { + "epoch": 0.23876668546719307, + "grad_norm": 1.563883662223816, + "learning_rate": 4.650630132359461e-05, + "loss": 0.2054, + "step": 3810 + }, + { + "epoch": 0.23939336968101774, + "grad_norm": 0.3699815571308136, + "learning_rate": 4.649574634269912e-05, + "loss": 0.1275, + "step": 3820 + }, + { + "epoch": 0.24002005389484238, + "grad_norm": 0.03253927826881409, + "learning_rate": 4.6485191361803634e-05, + "loss": 0.077, + "step": 3830 + }, + { + "epoch": 0.24064673810866705, + "grad_norm": 0.44651317596435547, + "learning_rate": 4.647463638090816e-05, + "loss": 0.1526, + "step": 3840 + }, + { + "epoch": 0.2412734223224917, + "grad_norm": 0.019008852541446686, + "learning_rate": 4.646408140001267e-05, + "loss": 0.0415, + "step": 3850 + }, + { + "epoch": 0.24190010653631636, + "grad_norm": 14.093941688537598, + "learning_rate": 4.6453526419117184e-05, + "loss": 0.287, + "step": 3860 + }, + { + "epoch": 0.242526790750141, + "grad_norm": 2.9186899662017822, + "learning_rate": 4.64429714382217e-05, + "loss": 0.3775, + "step": 3870 + }, + { + "epoch": 0.24315347496396567, + "grad_norm": 8.830793380737305, + "learning_rate": 4.643241645732622e-05, + "loss": 0.0785, + "step": 3880 + }, + { + "epoch": 0.2437801591777903, + "grad_norm": 6.0299072265625, + "learning_rate": 4.642186147643073e-05, + "loss": 0.2825, + "step": 3890 + }, + { + "epoch": 0.24440684339161498, + "grad_norm": 0.060153331607580185, + "learning_rate": 4.6411306495535244e-05, + "loss": 0.1284, + "step": 3900 + }, + { + "epoch": 0.24503352760543962, + "grad_norm": 0.03562921658158302, + "learning_rate": 4.640075151463976e-05, + "loss": 0.0256, + "step": 3910 + }, + { + "epoch": 0.24566021181926428, + "grad_norm": 0.1369025558233261, + "learning_rate": 4.639019653374427e-05, + "loss": 0.0423, + "step": 3920 + }, + { + "epoch": 0.24628689603308893, + "grad_norm": 0.3345741331577301, + "learning_rate": 4.6379641552848794e-05, + "loss": 0.1194, + "step": 3930 + }, + { + "epoch": 0.24691358024691357, + "grad_norm": 2.964259386062622, + "learning_rate": 4.636908657195331e-05, + "loss": 0.1352, + "step": 3940 + }, + { + "epoch": 0.24754026446073824, + "grad_norm": 0.02697465941309929, + "learning_rate": 4.635853159105783e-05, + "loss": 0.0331, + "step": 3950 + }, + { + "epoch": 0.24816694867456288, + "grad_norm": 0.025438886135816574, + "learning_rate": 4.634797661016234e-05, + "loss": 0.2382, + "step": 3960 + }, + { + "epoch": 0.24879363288838754, + "grad_norm": 0.09332406520843506, + "learning_rate": 4.6337421629266853e-05, + "loss": 0.3281, + "step": 3970 + }, + { + "epoch": 0.24942031710221219, + "grad_norm": 0.11696890741586685, + "learning_rate": 4.632686664837137e-05, + "loss": 0.152, + "step": 3980 + }, + { + "epoch": 0.25004700131603685, + "grad_norm": 2.8147506713867188, + "learning_rate": 4.631631166747589e-05, + "loss": 0.1781, + "step": 3990 + }, + { + "epoch": 0.2506736855298615, + "grad_norm": 0.088865727186203, + "learning_rate": 4.63057566865804e-05, + "loss": 0.0816, + "step": 4000 + }, + { + "epoch": 0.25130036974368614, + "grad_norm": 0.10434025526046753, + "learning_rate": 4.629520170568491e-05, + "loss": 0.0756, + "step": 4010 + }, + { + "epoch": 0.2519270539575108, + "grad_norm": 8.183455467224121, + "learning_rate": 4.628464672478943e-05, + "loss": 0.1846, + "step": 4020 + }, + { + "epoch": 0.2525537381713355, + "grad_norm": 0.05632643774151802, + "learning_rate": 4.6274091743893947e-05, + "loss": 0.0299, + "step": 4030 + }, + { + "epoch": 0.25318042238516014, + "grad_norm": 2.964418649673462, + "learning_rate": 4.626353676299846e-05, + "loss": 0.4379, + "step": 4040 + }, + { + "epoch": 0.25380710659898476, + "grad_norm": 0.20549556612968445, + "learning_rate": 4.625298178210298e-05, + "loss": 0.2554, + "step": 4050 + }, + { + "epoch": 0.2544337908128094, + "grad_norm": 0.5683102011680603, + "learning_rate": 4.6242426801207497e-05, + "loss": 0.1996, + "step": 4060 + }, + { + "epoch": 0.2550604750266341, + "grad_norm": 0.22006919980049133, + "learning_rate": 4.6231871820312006e-05, + "loss": 0.1015, + "step": 4070 + }, + { + "epoch": 0.2556871592404587, + "grad_norm": 0.06379158049821854, + "learning_rate": 4.622131683941652e-05, + "loss": 0.1321, + "step": 4080 + }, + { + "epoch": 0.2563138434542834, + "grad_norm": 0.1231253445148468, + "learning_rate": 4.621076185852104e-05, + "loss": 0.0246, + "step": 4090 + }, + { + "epoch": 0.25694052766810804, + "grad_norm": 0.11641805619001389, + "learning_rate": 4.620020687762555e-05, + "loss": 0.1296, + "step": 4100 + }, + { + "epoch": 0.2575672118819327, + "grad_norm": 8.17453384399414, + "learning_rate": 4.6189651896730066e-05, + "loss": 0.1271, + "step": 4110 + }, + { + "epoch": 0.2581938960957573, + "grad_norm": 2.913849115371704, + "learning_rate": 4.617909691583458e-05, + "loss": 0.2079, + "step": 4120 + }, + { + "epoch": 0.258820580309582, + "grad_norm": 5.707304954528809, + "learning_rate": 4.61685419349391e-05, + "loss": 0.2832, + "step": 4130 + }, + { + "epoch": 0.25944726452340666, + "grad_norm": 5.583265781402588, + "learning_rate": 4.6157986954043616e-05, + "loss": 0.1184, + "step": 4140 + }, + { + "epoch": 0.26007394873723133, + "grad_norm": 0.47731733322143555, + "learning_rate": 4.614743197314813e-05, + "loss": 0.2857, + "step": 4150 + }, + { + "epoch": 0.26070063295105594, + "grad_norm": 0.46668675541877747, + "learning_rate": 4.613687699225265e-05, + "loss": 0.1019, + "step": 4160 + }, + { + "epoch": 0.2613273171648806, + "grad_norm": 3.6754000186920166, + "learning_rate": 4.612632201135716e-05, + "loss": 0.2119, + "step": 4170 + }, + { + "epoch": 0.2619540013787053, + "grad_norm": 0.11781708151102066, + "learning_rate": 4.6115767030461676e-05, + "loss": 0.059, + "step": 4180 + }, + { + "epoch": 0.26258068559252995, + "grad_norm": 0.6701149344444275, + "learning_rate": 4.610521204956619e-05, + "loss": 0.0225, + "step": 4190 + }, + { + "epoch": 0.26320736980635456, + "grad_norm": 7.709531307220459, + "learning_rate": 4.609465706867071e-05, + "loss": 0.0726, + "step": 4200 + }, + { + "epoch": 0.26383405402017923, + "grad_norm": 0.12035151571035385, + "learning_rate": 4.608410208777522e-05, + "loss": 0.1888, + "step": 4210 + }, + { + "epoch": 0.2644607382340039, + "grad_norm": 8.956652641296387, + "learning_rate": 4.6073547106879736e-05, + "loss": 0.2796, + "step": 4220 + }, + { + "epoch": 0.2650874224478285, + "grad_norm": 0.8151219487190247, + "learning_rate": 4.606299212598425e-05, + "loss": 0.0759, + "step": 4230 + }, + { + "epoch": 0.2657141066616532, + "grad_norm": 0.19752751290798187, + "learning_rate": 4.605243714508877e-05, + "loss": 0.1748, + "step": 4240 + }, + { + "epoch": 0.26634079087547785, + "grad_norm": 8.772632598876953, + "learning_rate": 4.6041882164193286e-05, + "loss": 0.1709, + "step": 4250 + }, + { + "epoch": 0.2669674750893025, + "grad_norm": 0.10757267475128174, + "learning_rate": 4.60313271832978e-05, + "loss": 0.2218, + "step": 4260 + }, + { + "epoch": 0.26759415930312713, + "grad_norm": 0.11040544509887695, + "learning_rate": 4.602077220240232e-05, + "loss": 0.1871, + "step": 4270 + }, + { + "epoch": 0.2682208435169518, + "grad_norm": 0.30749446153640747, + "learning_rate": 4.601021722150683e-05, + "loss": 0.0953, + "step": 4280 + }, + { + "epoch": 0.26884752773077647, + "grad_norm": 0.0919603630900383, + "learning_rate": 4.5999662240611346e-05, + "loss": 0.2144, + "step": 4290 + }, + { + "epoch": 0.26947421194460114, + "grad_norm": 0.063777856528759, + "learning_rate": 4.598910725971586e-05, + "loss": 0.0812, + "step": 4300 + }, + { + "epoch": 0.27010089615842575, + "grad_norm": 5.391152858734131, + "learning_rate": 4.597855227882037e-05, + "loss": 0.2445, + "step": 4310 + }, + { + "epoch": 0.2707275803722504, + "grad_norm": 1.2878516912460327, + "learning_rate": 4.596799729792489e-05, + "loss": 0.0912, + "step": 4320 + }, + { + "epoch": 0.2713542645860751, + "grad_norm": 24.52849006652832, + "learning_rate": 4.5957442317029405e-05, + "loss": 0.042, + "step": 4330 + }, + { + "epoch": 0.27198094879989976, + "grad_norm": 0.16191601753234863, + "learning_rate": 4.594688733613393e-05, + "loss": 0.1879, + "step": 4340 + }, + { + "epoch": 0.27260763301372437, + "grad_norm": 0.04222509637475014, + "learning_rate": 4.593633235523844e-05, + "loss": 0.0576, + "step": 4350 + }, + { + "epoch": 0.27323431722754904, + "grad_norm": 0.04624179005622864, + "learning_rate": 4.5925777374342955e-05, + "loss": 0.1089, + "step": 4360 + }, + { + "epoch": 0.2738610014413737, + "grad_norm": 3.643953323364258, + "learning_rate": 4.591522239344747e-05, + "loss": 0.2434, + "step": 4370 + }, + { + "epoch": 0.2744876856551983, + "grad_norm": 3.4364843368530273, + "learning_rate": 4.590466741255199e-05, + "loss": 0.2215, + "step": 4380 + }, + { + "epoch": 0.275114369869023, + "grad_norm": 1.3025727272033691, + "learning_rate": 4.58941124316565e-05, + "loss": 0.1839, + "step": 4390 + }, + { + "epoch": 0.27574105408284766, + "grad_norm": 15.097777366638184, + "learning_rate": 4.5883557450761015e-05, + "loss": 0.1754, + "step": 4400 + }, + { + "epoch": 0.2763677382966723, + "grad_norm": 0.12557531893253326, + "learning_rate": 4.587300246986553e-05, + "loss": 0.136, + "step": 4410 + }, + { + "epoch": 0.27699442251049694, + "grad_norm": 0.080543152987957, + "learning_rate": 4.586244748897004e-05, + "loss": 0.009, + "step": 4420 + }, + { + "epoch": 0.2776211067243216, + "grad_norm": 0.035216640681028366, + "learning_rate": 4.5851892508074565e-05, + "loss": 0.0359, + "step": 4430 + }, + { + "epoch": 0.2782477909381463, + "grad_norm": 0.03987397626042366, + "learning_rate": 4.584133752717908e-05, + "loss": 0.1096, + "step": 4440 + }, + { + "epoch": 0.27887447515197095, + "grad_norm": 0.11862228810787201, + "learning_rate": 4.58307825462836e-05, + "loss": 0.0751, + "step": 4450 + }, + { + "epoch": 0.27950115936579556, + "grad_norm": 172.26731872558594, + "learning_rate": 4.582022756538811e-05, + "loss": 0.1038, + "step": 4460 + }, + { + "epoch": 0.28012784357962023, + "grad_norm": 2.120897054672241, + "learning_rate": 4.5809672584492625e-05, + "loss": 0.2261, + "step": 4470 + }, + { + "epoch": 0.2807545277934449, + "grad_norm": 14.573572158813477, + "learning_rate": 4.579911760359714e-05, + "loss": 0.2673, + "step": 4480 + }, + { + "epoch": 0.2813812120072695, + "grad_norm": 1.8395880460739136, + "learning_rate": 4.578856262270165e-05, + "loss": 0.115, + "step": 4490 + }, + { + "epoch": 0.2820078962210942, + "grad_norm": 3.6968624591827393, + "learning_rate": 4.577800764180617e-05, + "loss": 0.1004, + "step": 4500 + }, + { + "epoch": 0.28263458043491885, + "grad_norm": 3.105196237564087, + "learning_rate": 4.5767452660910685e-05, + "loss": 0.1143, + "step": 4510 + }, + { + "epoch": 0.2832612646487435, + "grad_norm": 5.869255542755127, + "learning_rate": 4.57568976800152e-05, + "loss": 0.1985, + "step": 4520 + }, + { + "epoch": 0.28388794886256813, + "grad_norm": 5.56795597076416, + "learning_rate": 4.574634269911972e-05, + "loss": 0.0755, + "step": 4530 + }, + { + "epoch": 0.2845146330763928, + "grad_norm": 2.6562743186950684, + "learning_rate": 4.5735787718224235e-05, + "loss": 0.212, + "step": 4540 + }, + { + "epoch": 0.28514131729021747, + "grad_norm": 7.3283867835998535, + "learning_rate": 4.572523273732875e-05, + "loss": 0.1854, + "step": 4550 + }, + { + "epoch": 0.28576800150404214, + "grad_norm": 0.06201328709721565, + "learning_rate": 4.571467775643326e-05, + "loss": 0.1102, + "step": 4560 + }, + { + "epoch": 0.28639468571786675, + "grad_norm": 0.10211961716413498, + "learning_rate": 4.570412277553778e-05, + "loss": 0.0591, + "step": 4570 + }, + { + "epoch": 0.2870213699316914, + "grad_norm": 0.08829760551452637, + "learning_rate": 4.5693567794642295e-05, + "loss": 0.0468, + "step": 4580 + }, + { + "epoch": 0.2876480541455161, + "grad_norm": 6.4518141746521, + "learning_rate": 4.568301281374681e-05, + "loss": 0.2678, + "step": 4590 + }, + { + "epoch": 0.28827473835934075, + "grad_norm": 0.13497485220432281, + "learning_rate": 4.567245783285132e-05, + "loss": 0.0539, + "step": 4600 + }, + { + "epoch": 0.28890142257316537, + "grad_norm": 0.0946924239397049, + "learning_rate": 4.566190285195584e-05, + "loss": 0.1492, + "step": 4610 + }, + { + "epoch": 0.28952810678699004, + "grad_norm": 0.4621901512145996, + "learning_rate": 4.5651347871060354e-05, + "loss": 0.0721, + "step": 4620 + }, + { + "epoch": 0.2901547910008147, + "grad_norm": 4.470915794372559, + "learning_rate": 4.564079289016487e-05, + "loss": 0.1717, + "step": 4630 + }, + { + "epoch": 0.2907814752146393, + "grad_norm": 0.09481313824653625, + "learning_rate": 4.563023790926939e-05, + "loss": 0.1039, + "step": 4640 + }, + { + "epoch": 0.291408159428464, + "grad_norm": 0.095061294734478, + "learning_rate": 4.5619682928373904e-05, + "loss": 0.3072, + "step": 4650 + }, + { + "epoch": 0.29203484364228866, + "grad_norm": 5.283687114715576, + "learning_rate": 4.560912794747842e-05, + "loss": 0.0254, + "step": 4660 + }, + { + "epoch": 0.2926615278561133, + "grad_norm": 0.07655642926692963, + "learning_rate": 4.559857296658293e-05, + "loss": 0.1093, + "step": 4670 + }, + { + "epoch": 0.29328821206993794, + "grad_norm": 0.12662218511104584, + "learning_rate": 4.558801798568745e-05, + "loss": 0.0054, + "step": 4680 + }, + { + "epoch": 0.2939148962837626, + "grad_norm": 6.806703567504883, + "learning_rate": 4.5577463004791964e-05, + "loss": 0.1459, + "step": 4690 + }, + { + "epoch": 0.2945415804975873, + "grad_norm": 0.09386972337961197, + "learning_rate": 4.5566908023896474e-05, + "loss": 0.1345, + "step": 4700 + }, + { + "epoch": 0.29516826471141194, + "grad_norm": 3.9050133228302, + "learning_rate": 4.555635304300099e-05, + "loss": 0.2161, + "step": 4710 + }, + { + "epoch": 0.29579494892523656, + "grad_norm": 0.464335560798645, + "learning_rate": 4.554579806210551e-05, + "loss": 0.0422, + "step": 4720 + }, + { + "epoch": 0.2964216331390612, + "grad_norm": 0.06394585967063904, + "learning_rate": 4.5535243081210024e-05, + "loss": 0.1675, + "step": 4730 + }, + { + "epoch": 0.2970483173528859, + "grad_norm": 13.138270378112793, + "learning_rate": 4.552468810031454e-05, + "loss": 0.296, + "step": 4740 + }, + { + "epoch": 0.29767500156671056, + "grad_norm": 6.973242282867432, + "learning_rate": 4.551413311941906e-05, + "loss": 0.0738, + "step": 4750 + }, + { + "epoch": 0.2983016857805352, + "grad_norm": 0.09183468669652939, + "learning_rate": 4.5503578138523574e-05, + "loss": 0.0267, + "step": 4760 + }, + { + "epoch": 0.29892836999435984, + "grad_norm": 0.07803864032030106, + "learning_rate": 4.549302315762809e-05, + "loss": 0.2194, + "step": 4770 + }, + { + "epoch": 0.2995550542081845, + "grad_norm": 0.13608647882938385, + "learning_rate": 4.54824681767326e-05, + "loss": 0.2405, + "step": 4780 + }, + { + "epoch": 0.3001817384220091, + "grad_norm": 0.2874913811683655, + "learning_rate": 4.547191319583712e-05, + "loss": 0.0569, + "step": 4790 + }, + { + "epoch": 0.3008084226358338, + "grad_norm": 0.12187890708446503, + "learning_rate": 4.5461358214941634e-05, + "loss": 0.1173, + "step": 4800 + }, + { + "epoch": 0.30143510684965846, + "grad_norm": 0.8910410404205322, + "learning_rate": 4.5450803234046144e-05, + "loss": 0.0923, + "step": 4810 + }, + { + "epoch": 0.30206179106348313, + "grad_norm": 3.576404571533203, + "learning_rate": 4.544024825315066e-05, + "loss": 0.1787, + "step": 4820 + }, + { + "epoch": 0.30268847527730774, + "grad_norm": 5.950135707855225, + "learning_rate": 4.5429693272255184e-05, + "loss": 0.2014, + "step": 4830 + }, + { + "epoch": 0.3033151594911324, + "grad_norm": 0.23504792153835297, + "learning_rate": 4.54191382913597e-05, + "loss": 0.0568, + "step": 4840 + }, + { + "epoch": 0.3039418437049571, + "grad_norm": 0.1148870512843132, + "learning_rate": 4.540858331046421e-05, + "loss": 0.193, + "step": 4850 + }, + { + "epoch": 0.30456852791878175, + "grad_norm": 2.612504482269287, + "learning_rate": 4.539802832956873e-05, + "loss": 0.1086, + "step": 4860 + }, + { + "epoch": 0.30519521213260636, + "grad_norm": 0.021087897941470146, + "learning_rate": 4.5387473348673244e-05, + "loss": 0.0463, + "step": 4870 + }, + { + "epoch": 0.30582189634643103, + "grad_norm": 6.417503356933594, + "learning_rate": 4.5376918367777754e-05, + "loss": 0.1688, + "step": 4880 + }, + { + "epoch": 0.3064485805602557, + "grad_norm": 0.1166616752743721, + "learning_rate": 4.536636338688227e-05, + "loss": 0.1142, + "step": 4890 + }, + { + "epoch": 0.3070752647740803, + "grad_norm": 0.11154291033744812, + "learning_rate": 4.535580840598679e-05, + "loss": 0.1412, + "step": 4900 + }, + { + "epoch": 0.307701948987905, + "grad_norm": 0.1466520130634308, + "learning_rate": 4.5345253425091303e-05, + "loss": 0.0189, + "step": 4910 + }, + { + "epoch": 0.30832863320172965, + "grad_norm": 0.09126675128936768, + "learning_rate": 4.533469844419581e-05, + "loss": 0.0135, + "step": 4920 + }, + { + "epoch": 0.3089553174155543, + "grad_norm": 0.06963896751403809, + "learning_rate": 4.532414346330034e-05, + "loss": 0.3541, + "step": 4930 + }, + { + "epoch": 0.30958200162937893, + "grad_norm": 0.28274479508399963, + "learning_rate": 4.5313588482404853e-05, + "loss": 0.1919, + "step": 4940 + }, + { + "epoch": 0.3102086858432036, + "grad_norm": 1.2750145196914673, + "learning_rate": 4.530303350150936e-05, + "loss": 0.0554, + "step": 4950 + }, + { + "epoch": 0.31083537005702827, + "grad_norm": 0.06011820584535599, + "learning_rate": 4.529247852061388e-05, + "loss": 0.0602, + "step": 4960 + }, + { + "epoch": 0.31146205427085294, + "grad_norm": 1.432435154914856, + "learning_rate": 4.52819235397184e-05, + "loss": 0.011, + "step": 4970 + }, + { + "epoch": 0.31208873848467755, + "grad_norm": 0.08396012336015701, + "learning_rate": 4.527136855882291e-05, + "loss": 0.0512, + "step": 4980 + }, + { + "epoch": 0.3127154226985022, + "grad_norm": 0.01751044951379299, + "learning_rate": 4.526081357792742e-05, + "loss": 0.0981, + "step": 4990 + }, + { + "epoch": 0.3133421069123269, + "grad_norm": 0.02498529851436615, + "learning_rate": 4.525025859703194e-05, + "loss": 0.1803, + "step": 5000 + }, + { + "epoch": 0.31396879112615156, + "grad_norm": 3.77972674369812, + "learning_rate": 4.5239703616136456e-05, + "loss": 0.2917, + "step": 5010 + }, + { + "epoch": 0.31459547533997617, + "grad_norm": 0.40921223163604736, + "learning_rate": 4.522914863524097e-05, + "loss": 0.1298, + "step": 5020 + }, + { + "epoch": 0.31522215955380084, + "grad_norm": 11.053905487060547, + "learning_rate": 4.521859365434549e-05, + "loss": 0.1119, + "step": 5030 + }, + { + "epoch": 0.3158488437676255, + "grad_norm": 12.93051815032959, + "learning_rate": 4.5208038673450006e-05, + "loss": 0.0669, + "step": 5040 + }, + { + "epoch": 0.3164755279814501, + "grad_norm": 0.1682542860507965, + "learning_rate": 4.519748369255452e-05, + "loss": 0.0718, + "step": 5050 + }, + { + "epoch": 0.3171022121952748, + "grad_norm": 6.569287300109863, + "learning_rate": 4.518692871165903e-05, + "loss": 0.1773, + "step": 5060 + }, + { + "epoch": 0.31772889640909946, + "grad_norm": 0.24212561547756195, + "learning_rate": 4.517637373076355e-05, + "loss": 0.0564, + "step": 5070 + }, + { + "epoch": 0.31835558062292413, + "grad_norm": 0.04047981649637222, + "learning_rate": 4.5165818749868066e-05, + "loss": 0.1893, + "step": 5080 + }, + { + "epoch": 0.31898226483674874, + "grad_norm": 0.6782110333442688, + "learning_rate": 4.5155263768972576e-05, + "loss": 0.2226, + "step": 5090 + }, + { + "epoch": 0.3196089490505734, + "grad_norm": 0.14049291610717773, + "learning_rate": 4.514470878807709e-05, + "loss": 0.091, + "step": 5100 + }, + { + "epoch": 0.3202356332643981, + "grad_norm": 0.1849367469549179, + "learning_rate": 4.513415380718161e-05, + "loss": 0.1393, + "step": 5110 + }, + { + "epoch": 0.32086231747822275, + "grad_norm": 0.1305677443742752, + "learning_rate": 4.5123598826286126e-05, + "loss": 0.221, + "step": 5120 + }, + { + "epoch": 0.32148900169204736, + "grad_norm": 0.4029105603694916, + "learning_rate": 4.511304384539064e-05, + "loss": 0.0189, + "step": 5130 + }, + { + "epoch": 0.32211568590587203, + "grad_norm": 0.062030546367168427, + "learning_rate": 4.510248886449516e-05, + "loss": 0.0842, + "step": 5140 + }, + { + "epoch": 0.3227423701196967, + "grad_norm": 9.721570014953613, + "learning_rate": 4.5091933883599676e-05, + "loss": 0.2981, + "step": 5150 + }, + { + "epoch": 0.32336905433352137, + "grad_norm": 0.10169248282909393, + "learning_rate": 4.5081378902704186e-05, + "loss": 0.1636, + "step": 5160 + }, + { + "epoch": 0.323995738547346, + "grad_norm": 0.22179928421974182, + "learning_rate": 4.50708239218087e-05, + "loss": 0.1367, + "step": 5170 + }, + { + "epoch": 0.32462242276117065, + "grad_norm": 5.547093868255615, + "learning_rate": 4.506026894091322e-05, + "loss": 0.1028, + "step": 5180 + }, + { + "epoch": 0.3252491069749953, + "grad_norm": 8.620623588562012, + "learning_rate": 4.5049713960017736e-05, + "loss": 0.1962, + "step": 5190 + }, + { + "epoch": 0.32587579118881993, + "grad_norm": 3.4595415592193604, + "learning_rate": 4.5039158979122246e-05, + "loss": 0.19, + "step": 5200 + }, + { + "epoch": 0.3265024754026446, + "grad_norm": 0.08240117877721786, + "learning_rate": 4.502860399822676e-05, + "loss": 0.0581, + "step": 5210 + }, + { + "epoch": 0.32712915961646927, + "grad_norm": 0.07359209656715393, + "learning_rate": 4.501804901733128e-05, + "loss": 0.0856, + "step": 5220 + }, + { + "epoch": 0.32775584383029394, + "grad_norm": 1.3321644067764282, + "learning_rate": 4.5007494036435796e-05, + "loss": 0.1497, + "step": 5230 + }, + { + "epoch": 0.32838252804411855, + "grad_norm": 3.477701425552368, + "learning_rate": 4.499693905554031e-05, + "loss": 0.2044, + "step": 5240 + }, + { + "epoch": 0.3290092122579432, + "grad_norm": 6.64050817489624, + "learning_rate": 4.498638407464483e-05, + "loss": 0.3428, + "step": 5250 + }, + { + "epoch": 0.3296358964717679, + "grad_norm": 0.25289374589920044, + "learning_rate": 4.4975829093749346e-05, + "loss": 0.1021, + "step": 5260 + }, + { + "epoch": 0.33026258068559255, + "grad_norm": 0.14394524693489075, + "learning_rate": 4.4965274112853856e-05, + "loss": 0.1675, + "step": 5270 + }, + { + "epoch": 0.33088926489941717, + "grad_norm": 3.0389928817749023, + "learning_rate": 4.495471913195837e-05, + "loss": 0.1389, + "step": 5280 + }, + { + "epoch": 0.33151594911324184, + "grad_norm": 4.999429702758789, + "learning_rate": 4.494416415106289e-05, + "loss": 0.1257, + "step": 5290 + }, + { + "epoch": 0.3321426333270665, + "grad_norm": 9.429051399230957, + "learning_rate": 4.4933609170167405e-05, + "loss": 0.1603, + "step": 5300 + }, + { + "epoch": 0.3327693175408911, + "grad_norm": 3.5542385578155518, + "learning_rate": 4.4923054189271915e-05, + "loss": 0.1632, + "step": 5310 + }, + { + "epoch": 0.3333960017547158, + "grad_norm": 0.5139914155006409, + "learning_rate": 4.491249920837643e-05, + "loss": 0.063, + "step": 5320 + }, + { + "epoch": 0.33402268596854046, + "grad_norm": 0.7108550071716309, + "learning_rate": 4.4901944227480955e-05, + "loss": 0.0941, + "step": 5330 + }, + { + "epoch": 0.3346493701823651, + "grad_norm": 0.048141974955797195, + "learning_rate": 4.4891389246585465e-05, + "loss": 0.1396, + "step": 5340 + }, + { + "epoch": 0.33527605439618974, + "grad_norm": 2.3768675327301025, + "learning_rate": 4.488083426568998e-05, + "loss": 0.1904, + "step": 5350 + }, + { + "epoch": 0.3359027386100144, + "grad_norm": 0.14059720933437347, + "learning_rate": 4.48702792847945e-05, + "loss": 0.1807, + "step": 5360 + }, + { + "epoch": 0.3365294228238391, + "grad_norm": 0.13699670135974884, + "learning_rate": 4.4859724303899015e-05, + "loss": 0.1932, + "step": 5370 + }, + { + "epoch": 0.33715610703766374, + "grad_norm": 0.14927171170711517, + "learning_rate": 4.4849169323003525e-05, + "loss": 0.146, + "step": 5380 + }, + { + "epoch": 0.33778279125148836, + "grad_norm": 1.7202868461608887, + "learning_rate": 4.483861434210804e-05, + "loss": 0.2793, + "step": 5390 + }, + { + "epoch": 0.338409475465313, + "grad_norm": 0.2133670300245285, + "learning_rate": 4.482805936121256e-05, + "loss": 0.0996, + "step": 5400 + }, + { + "epoch": 0.3390361596791377, + "grad_norm": 0.10931286215782166, + "learning_rate": 4.481750438031707e-05, + "loss": 0.1403, + "step": 5410 + }, + { + "epoch": 0.33966284389296236, + "grad_norm": 0.11824043095111847, + "learning_rate": 4.4806949399421585e-05, + "loss": 0.1866, + "step": 5420 + }, + { + "epoch": 0.340289528106787, + "grad_norm": 3.111504554748535, + "learning_rate": 4.479639441852611e-05, + "loss": 0.1339, + "step": 5430 + }, + { + "epoch": 0.34091621232061164, + "grad_norm": 0.15324468910694122, + "learning_rate": 4.4785839437630625e-05, + "loss": 0.1496, + "step": 5440 + }, + { + "epoch": 0.3415428965344363, + "grad_norm": 15.096182823181152, + "learning_rate": 4.4775284456735135e-05, + "loss": 0.2691, + "step": 5450 + }, + { + "epoch": 0.3421695807482609, + "grad_norm": 6.626861095428467, + "learning_rate": 4.476472947583965e-05, + "loss": 0.2139, + "step": 5460 + }, + { + "epoch": 0.3427962649620856, + "grad_norm": 0.3588835895061493, + "learning_rate": 4.475417449494417e-05, + "loss": 0.0899, + "step": 5470 + }, + { + "epoch": 0.34342294917591026, + "grad_norm": 3.0588839054107666, + "learning_rate": 4.474361951404868e-05, + "loss": 0.1258, + "step": 5480 + }, + { + "epoch": 0.34404963338973493, + "grad_norm": 0.17878539860248566, + "learning_rate": 4.4733064533153195e-05, + "loss": 0.1772, + "step": 5490 + }, + { + "epoch": 0.34467631760355955, + "grad_norm": 1.422548532485962, + "learning_rate": 4.472250955225771e-05, + "loss": 0.1118, + "step": 5500 + }, + { + "epoch": 0.3453030018173842, + "grad_norm": 0.09782126545906067, + "learning_rate": 4.471195457136223e-05, + "loss": 0.0883, + "step": 5510 + }, + { + "epoch": 0.3459296860312089, + "grad_norm": 1.7103350162506104, + "learning_rate": 4.4701399590466745e-05, + "loss": 0.0905, + "step": 5520 + }, + { + "epoch": 0.34655637024503355, + "grad_norm": 0.24084354937076569, + "learning_rate": 4.469084460957126e-05, + "loss": 0.157, + "step": 5530 + }, + { + "epoch": 0.34718305445885816, + "grad_norm": 0.39323166012763977, + "learning_rate": 4.468028962867578e-05, + "loss": 0.165, + "step": 5540 + }, + { + "epoch": 0.34780973867268283, + "grad_norm": 4.8996124267578125, + "learning_rate": 4.466973464778029e-05, + "loss": 0.1703, + "step": 5550 + }, + { + "epoch": 0.3484364228865075, + "grad_norm": 0.06810665875673294, + "learning_rate": 4.4659179666884805e-05, + "loss": 0.1135, + "step": 5560 + }, + { + "epoch": 0.3490631071003321, + "grad_norm": 0.4743961989879608, + "learning_rate": 4.464862468598932e-05, + "loss": 0.1448, + "step": 5570 + }, + { + "epoch": 0.3496897913141568, + "grad_norm": 1.2920023202896118, + "learning_rate": 4.463806970509384e-05, + "loss": 0.1119, + "step": 5580 + }, + { + "epoch": 0.35031647552798145, + "grad_norm": 3.457261323928833, + "learning_rate": 4.462751472419835e-05, + "loss": 0.1757, + "step": 5590 + }, + { + "epoch": 0.3509431597418061, + "grad_norm": 3.696157932281494, + "learning_rate": 4.4616959743302864e-05, + "loss": 0.186, + "step": 5600 + }, + { + "epoch": 0.35156984395563073, + "grad_norm": 0.24386389553546906, + "learning_rate": 4.460640476240738e-05, + "loss": 0.1244, + "step": 5610 + }, + { + "epoch": 0.3521965281694554, + "grad_norm": 26.246967315673828, + "learning_rate": 4.45958497815119e-05, + "loss": 0.1493, + "step": 5620 + }, + { + "epoch": 0.35282321238328007, + "grad_norm": 0.29406628012657166, + "learning_rate": 4.4585294800616414e-05, + "loss": 0.1353, + "step": 5630 + }, + { + "epoch": 0.35344989659710474, + "grad_norm": 0.42619481682777405, + "learning_rate": 4.457473981972093e-05, + "loss": 0.1239, + "step": 5640 + }, + { + "epoch": 0.35407658081092935, + "grad_norm": 0.22796322405338287, + "learning_rate": 4.456418483882545e-05, + "loss": 0.1861, + "step": 5650 + }, + { + "epoch": 0.354703265024754, + "grad_norm": 0.15700025856494904, + "learning_rate": 4.455362985792996e-05, + "loss": 0.1362, + "step": 5660 + }, + { + "epoch": 0.3553299492385787, + "grad_norm": 0.25178059935569763, + "learning_rate": 4.4543074877034474e-05, + "loss": 0.1262, + "step": 5670 + }, + { + "epoch": 0.35595663345240336, + "grad_norm": 0.06967005878686905, + "learning_rate": 4.453251989613899e-05, + "loss": 0.1283, + "step": 5680 + }, + { + "epoch": 0.356583317666228, + "grad_norm": 0.48487424850463867, + "learning_rate": 4.452196491524351e-05, + "loss": 0.1818, + "step": 5690 + }, + { + "epoch": 0.35721000188005264, + "grad_norm": 1.9316906929016113, + "learning_rate": 4.451140993434802e-05, + "loss": 0.0954, + "step": 5700 + }, + { + "epoch": 0.3578366860938773, + "grad_norm": 0.09588257223367691, + "learning_rate": 4.4500854953452534e-05, + "loss": 0.0795, + "step": 5710 + }, + { + "epoch": 0.3584633703077019, + "grad_norm": 1.1819512844085693, + "learning_rate": 4.449029997255705e-05, + "loss": 0.2532, + "step": 5720 + }, + { + "epoch": 0.3590900545215266, + "grad_norm": 0.40093863010406494, + "learning_rate": 4.447974499166157e-05, + "loss": 0.0591, + "step": 5730 + }, + { + "epoch": 0.35971673873535126, + "grad_norm": 9.683305740356445, + "learning_rate": 4.4469190010766084e-05, + "loss": 0.0656, + "step": 5740 + }, + { + "epoch": 0.36034342294917593, + "grad_norm": 0.07515076547861099, + "learning_rate": 4.44586350298706e-05, + "loss": 0.1718, + "step": 5750 + }, + { + "epoch": 0.36097010716300054, + "grad_norm": 0.0691744014620781, + "learning_rate": 4.444808004897512e-05, + "loss": 0.2273, + "step": 5760 + }, + { + "epoch": 0.3615967913768252, + "grad_norm": 8.280632019042969, + "learning_rate": 4.443752506807963e-05, + "loss": 0.1762, + "step": 5770 + }, + { + "epoch": 0.3622234755906499, + "grad_norm": 0.13394978642463684, + "learning_rate": 4.4426970087184144e-05, + "loss": 0.168, + "step": 5780 + }, + { + "epoch": 0.36285015980447455, + "grad_norm": 0.17555440962314606, + "learning_rate": 4.441641510628866e-05, + "loss": 0.085, + "step": 5790 + }, + { + "epoch": 0.36347684401829916, + "grad_norm": 13.172547340393066, + "learning_rate": 4.440586012539317e-05, + "loss": 0.2534, + "step": 5800 + }, + { + "epoch": 0.36410352823212383, + "grad_norm": 0.8912064433097839, + "learning_rate": 4.439530514449769e-05, + "loss": 0.087, + "step": 5810 + }, + { + "epoch": 0.3647302124459485, + "grad_norm": 0.07865786552429199, + "learning_rate": 4.4384750163602204e-05, + "loss": 0.0783, + "step": 5820 + }, + { + "epoch": 0.36535689665977317, + "grad_norm": 7.192616939544678, + "learning_rate": 4.437419518270673e-05, + "loss": 0.0805, + "step": 5830 + }, + { + "epoch": 0.3659835808735978, + "grad_norm": 6.647584915161133, + "learning_rate": 4.436364020181124e-05, + "loss": 0.3214, + "step": 5840 + }, + { + "epoch": 0.36661026508742245, + "grad_norm": 3.0235965251922607, + "learning_rate": 4.4353085220915754e-05, + "loss": 0.2507, + "step": 5850 + }, + { + "epoch": 0.3672369493012471, + "grad_norm": 0.1376418173313141, + "learning_rate": 4.434253024002027e-05, + "loss": 0.0591, + "step": 5860 + }, + { + "epoch": 0.36786363351507173, + "grad_norm": 0.08546671271324158, + "learning_rate": 4.433197525912478e-05, + "loss": 0.0486, + "step": 5870 + }, + { + "epoch": 0.3684903177288964, + "grad_norm": 3.4542911052703857, + "learning_rate": 4.43214202782293e-05, + "loss": 0.1363, + "step": 5880 + }, + { + "epoch": 0.36911700194272107, + "grad_norm": 0.18823327124118805, + "learning_rate": 4.431086529733381e-05, + "loss": 0.0364, + "step": 5890 + }, + { + "epoch": 0.36974368615654574, + "grad_norm": 0.26859134435653687, + "learning_rate": 4.430031031643833e-05, + "loss": 0.0906, + "step": 5900 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 67.65911102294922, + "learning_rate": 4.428975533554284e-05, + "loss": 0.0909, + "step": 5910 + }, + { + "epoch": 0.370997054584195, + "grad_norm": 15.827759742736816, + "learning_rate": 4.427920035464736e-05, + "loss": 0.1871, + "step": 5920 + }, + { + "epoch": 0.3716237387980197, + "grad_norm": 4.99617862701416, + "learning_rate": 4.426864537375188e-05, + "loss": 0.155, + "step": 5930 + }, + { + "epoch": 0.37225042301184436, + "grad_norm": 8.309243202209473, + "learning_rate": 4.425809039285639e-05, + "loss": 0.4023, + "step": 5940 + }, + { + "epoch": 0.37287710722566897, + "grad_norm": 0.054077792912721634, + "learning_rate": 4.4247535411960906e-05, + "loss": 0.1171, + "step": 5950 + }, + { + "epoch": 0.37350379143949364, + "grad_norm": 7.583477020263672, + "learning_rate": 4.423698043106542e-05, + "loss": 0.1549, + "step": 5960 + }, + { + "epoch": 0.3741304756533183, + "grad_norm": 0.06153470277786255, + "learning_rate": 4.422642545016994e-05, + "loss": 0.0672, + "step": 5970 + }, + { + "epoch": 0.3747571598671429, + "grad_norm": 0.42908042669296265, + "learning_rate": 4.421587046927445e-05, + "loss": 0.081, + "step": 5980 + }, + { + "epoch": 0.3753838440809676, + "grad_norm": 0.088747039437294, + "learning_rate": 4.4205315488378966e-05, + "loss": 0.1062, + "step": 5990 + }, + { + "epoch": 0.37601052829479226, + "grad_norm": 0.06233467161655426, + "learning_rate": 4.419476050748348e-05, + "loss": 0.0545, + "step": 6000 + }, + { + "epoch": 0.3766372125086169, + "grad_norm": 0.06656718254089355, + "learning_rate": 4.4184205526588e-05, + "loss": 0.1114, + "step": 6010 + }, + { + "epoch": 0.37726389672244154, + "grad_norm": 0.16433706879615784, + "learning_rate": 4.4173650545692516e-05, + "loss": 0.0839, + "step": 6020 + }, + { + "epoch": 0.3778905809362662, + "grad_norm": 0.40656861662864685, + "learning_rate": 4.416309556479703e-05, + "loss": 0.0803, + "step": 6030 + }, + { + "epoch": 0.3785172651500909, + "grad_norm": 0.031963735818862915, + "learning_rate": 4.415254058390155e-05, + "loss": 0.09, + "step": 6040 + }, + { + "epoch": 0.37914394936391554, + "grad_norm": 0.03380054607987404, + "learning_rate": 4.414198560300606e-05, + "loss": 0.307, + "step": 6050 + }, + { + "epoch": 0.37977063357774016, + "grad_norm": 0.06074713170528412, + "learning_rate": 4.4131430622110576e-05, + "loss": 0.0836, + "step": 6060 + }, + { + "epoch": 0.3803973177915648, + "grad_norm": 0.25206536054611206, + "learning_rate": 4.412087564121509e-05, + "loss": 0.0627, + "step": 6070 + }, + { + "epoch": 0.3810240020053895, + "grad_norm": 0.6263360381126404, + "learning_rate": 4.411032066031961e-05, + "loss": 0.0528, + "step": 6080 + }, + { + "epoch": 0.38165068621921416, + "grad_norm": 0.07583242654800415, + "learning_rate": 4.409976567942412e-05, + "loss": 0.4302, + "step": 6090 + }, + { + "epoch": 0.3822773704330388, + "grad_norm": 0.07200941443443298, + "learning_rate": 4.4089210698528636e-05, + "loss": 0.0395, + "step": 6100 + }, + { + "epoch": 0.38290405464686345, + "grad_norm": 9.109925270080566, + "learning_rate": 4.407865571763315e-05, + "loss": 0.2408, + "step": 6110 + }, + { + "epoch": 0.3835307388606881, + "grad_norm": 48.827877044677734, + "learning_rate": 4.406810073673767e-05, + "loss": 0.1589, + "step": 6120 + }, + { + "epoch": 0.3841574230745127, + "grad_norm": 3.1407594680786133, + "learning_rate": 4.4057545755842186e-05, + "loss": 0.1474, + "step": 6130 + }, + { + "epoch": 0.3847841072883374, + "grad_norm": 0.10355928540229797, + "learning_rate": 4.40469907749467e-05, + "loss": 0.0277, + "step": 6140 + }, + { + "epoch": 0.38541079150216206, + "grad_norm": 0.45885977149009705, + "learning_rate": 4.403643579405122e-05, + "loss": 0.1817, + "step": 6150 + }, + { + "epoch": 0.38603747571598673, + "grad_norm": 0.1281880885362625, + "learning_rate": 4.402588081315573e-05, + "loss": 0.1287, + "step": 6160 + }, + { + "epoch": 0.38666415992981135, + "grad_norm": 0.21207277476787567, + "learning_rate": 4.4015325832260246e-05, + "loss": 0.1862, + "step": 6170 + }, + { + "epoch": 0.387290844143636, + "grad_norm": 6.804543495178223, + "learning_rate": 4.400477085136476e-05, + "loss": 0.2877, + "step": 6180 + }, + { + "epoch": 0.3879175283574607, + "grad_norm": 0.1996593177318573, + "learning_rate": 4.399421587046927e-05, + "loss": 0.1129, + "step": 6190 + }, + { + "epoch": 0.38854421257128535, + "grad_norm": 0.15772663056850433, + "learning_rate": 4.398366088957379e-05, + "loss": 0.1105, + "step": 6200 + }, + { + "epoch": 0.38917089678510997, + "grad_norm": 0.14258632063865662, + "learning_rate": 4.3973105908678306e-05, + "loss": 0.1717, + "step": 6210 + }, + { + "epoch": 0.38979758099893463, + "grad_norm": 3.9509053230285645, + "learning_rate": 4.396255092778282e-05, + "loss": 0.176, + "step": 6220 + }, + { + "epoch": 0.3904242652127593, + "grad_norm": 0.16813886165618896, + "learning_rate": 4.395199594688734e-05, + "loss": 0.0789, + "step": 6230 + }, + { + "epoch": 0.39105094942658397, + "grad_norm": 0.6697870492935181, + "learning_rate": 4.3941440965991855e-05, + "loss": 0.0794, + "step": 6240 + }, + { + "epoch": 0.3916776336404086, + "grad_norm": 2.156053066253662, + "learning_rate": 4.393088598509637e-05, + "loss": 0.207, + "step": 6250 + }, + { + "epoch": 0.39230431785423325, + "grad_norm": 0.9125531911849976, + "learning_rate": 4.392033100420088e-05, + "loss": 0.2005, + "step": 6260 + }, + { + "epoch": 0.3929310020680579, + "grad_norm": 1.7858306169509888, + "learning_rate": 4.39097760233054e-05, + "loss": 0.126, + "step": 6270 + }, + { + "epoch": 0.39355768628188253, + "grad_norm": 4.1872687339782715, + "learning_rate": 4.3899221042409915e-05, + "loss": 0.2317, + "step": 6280 + }, + { + "epoch": 0.3941843704957072, + "grad_norm": 0.26265501976013184, + "learning_rate": 4.388866606151443e-05, + "loss": 0.144, + "step": 6290 + }, + { + "epoch": 0.3948110547095319, + "grad_norm": 0.15700949728488922, + "learning_rate": 4.387811108061894e-05, + "loss": 0.1866, + "step": 6300 + }, + { + "epoch": 0.39543773892335654, + "grad_norm": 3.4927055835723877, + "learning_rate": 4.386755609972346e-05, + "loss": 0.1554, + "step": 6310 + }, + { + "epoch": 0.39606442313718115, + "grad_norm": 0.31977131962776184, + "learning_rate": 4.3857001118827975e-05, + "loss": 0.249, + "step": 6320 + }, + { + "epoch": 0.3966911073510058, + "grad_norm": 0.11661989986896515, + "learning_rate": 4.384644613793249e-05, + "loss": 0.1728, + "step": 6330 + }, + { + "epoch": 0.3973177915648305, + "grad_norm": 3.7508764266967773, + "learning_rate": 4.383589115703701e-05, + "loss": 0.3801, + "step": 6340 + }, + { + "epoch": 0.39794447577865516, + "grad_norm": 0.44953954219818115, + "learning_rate": 4.3825336176141525e-05, + "loss": 0.0824, + "step": 6350 + }, + { + "epoch": 0.3985711599924798, + "grad_norm": 0.4894031286239624, + "learning_rate": 4.381478119524604e-05, + "loss": 0.1609, + "step": 6360 + }, + { + "epoch": 0.39919784420630444, + "grad_norm": 0.49157753586769104, + "learning_rate": 4.380422621435055e-05, + "loss": 0.135, + "step": 6370 + }, + { + "epoch": 0.3998245284201291, + "grad_norm": 0.10516194254159927, + "learning_rate": 4.379367123345507e-05, + "loss": 0.1034, + "step": 6380 + }, + { + "epoch": 0.4004512126339537, + "grad_norm": 0.24221737682819366, + "learning_rate": 4.3783116252559585e-05, + "loss": 0.1595, + "step": 6390 + }, + { + "epoch": 0.4010778968477784, + "grad_norm": 0.21182815730571747, + "learning_rate": 4.37725612716641e-05, + "loss": 0.0767, + "step": 6400 + }, + { + "epoch": 0.40170458106160306, + "grad_norm": 0.17457489669322968, + "learning_rate": 4.376200629076861e-05, + "loss": 0.1248, + "step": 6410 + }, + { + "epoch": 0.40233126527542773, + "grad_norm": 3.25508975982666, + "learning_rate": 4.3751451309873135e-05, + "loss": 0.1442, + "step": 6420 + }, + { + "epoch": 0.40295794948925234, + "grad_norm": 0.06830067932605743, + "learning_rate": 4.374089632897765e-05, + "loss": 0.1064, + "step": 6430 + }, + { + "epoch": 0.403584633703077, + "grad_norm": 4.027711868286133, + "learning_rate": 4.373034134808216e-05, + "loss": 0.2826, + "step": 6440 + }, + { + "epoch": 0.4042113179169017, + "grad_norm": 6.067227840423584, + "learning_rate": 4.371978636718668e-05, + "loss": 0.1604, + "step": 6450 + }, + { + "epoch": 0.40483800213072635, + "grad_norm": 0.2599928081035614, + "learning_rate": 4.3709231386291195e-05, + "loss": 0.3021, + "step": 6460 + }, + { + "epoch": 0.40546468634455096, + "grad_norm": 0.2688082456588745, + "learning_rate": 4.369867640539571e-05, + "loss": 0.2037, + "step": 6470 + }, + { + "epoch": 0.40609137055837563, + "grad_norm": 0.13320320844650269, + "learning_rate": 4.368812142450022e-05, + "loss": 0.0776, + "step": 6480 + }, + { + "epoch": 0.4067180547722003, + "grad_norm": 0.8527864813804626, + "learning_rate": 4.367756644360474e-05, + "loss": 0.1801, + "step": 6490 + }, + { + "epoch": 0.40734473898602497, + "grad_norm": 14.777264595031738, + "learning_rate": 4.3667011462709255e-05, + "loss": 0.117, + "step": 6500 + }, + { + "epoch": 0.4079714231998496, + "grad_norm": 0.06564256548881531, + "learning_rate": 4.365645648181377e-05, + "loss": 0.1407, + "step": 6510 + }, + { + "epoch": 0.40859810741367425, + "grad_norm": 93.907470703125, + "learning_rate": 4.364590150091829e-05, + "loss": 0.1947, + "step": 6520 + }, + { + "epoch": 0.4092247916274989, + "grad_norm": 0.08441044390201569, + "learning_rate": 4.3635346520022804e-05, + "loss": 0.0147, + "step": 6530 + }, + { + "epoch": 0.40985147584132353, + "grad_norm": 6.032024383544922, + "learning_rate": 4.362479153912732e-05, + "loss": 0.1261, + "step": 6540 + }, + { + "epoch": 0.4104781600551482, + "grad_norm": 0.7132280468940735, + "learning_rate": 4.361423655823183e-05, + "loss": 0.1141, + "step": 6550 + }, + { + "epoch": 0.41110484426897287, + "grad_norm": 0.052206575870513916, + "learning_rate": 4.360368157733635e-05, + "loss": 0.0464, + "step": 6560 + }, + { + "epoch": 0.41173152848279754, + "grad_norm": 0.04777556285262108, + "learning_rate": 4.3593126596440864e-05, + "loss": 0.0635, + "step": 6570 + }, + { + "epoch": 0.41235821269662215, + "grad_norm": 0.02872728556394577, + "learning_rate": 4.3582571615545374e-05, + "loss": 0.0768, + "step": 6580 + }, + { + "epoch": 0.4129848969104468, + "grad_norm": 0.03338625654578209, + "learning_rate": 4.357201663464989e-05, + "loss": 0.1111, + "step": 6590 + }, + { + "epoch": 0.4136115811242715, + "grad_norm": 0.46986547112464905, + "learning_rate": 4.356146165375441e-05, + "loss": 0.1058, + "step": 6600 + }, + { + "epoch": 0.41423826533809616, + "grad_norm": 0.296297162771225, + "learning_rate": 4.3550906672858924e-05, + "loss": 0.1546, + "step": 6610 + }, + { + "epoch": 0.41486494955192077, + "grad_norm": 4.170858383178711, + "learning_rate": 4.354035169196344e-05, + "loss": 0.1412, + "step": 6620 + }, + { + "epoch": 0.41549163376574544, + "grad_norm": 0.05129874125123024, + "learning_rate": 4.352979671106796e-05, + "loss": 0.0897, + "step": 6630 + }, + { + "epoch": 0.4161183179795701, + "grad_norm": 3.248608112335205, + "learning_rate": 4.3519241730172474e-05, + "loss": 0.2496, + "step": 6640 + }, + { + "epoch": 0.4167450021933948, + "grad_norm": 4.730010509490967, + "learning_rate": 4.3508686749276984e-05, + "loss": 0.0941, + "step": 6650 + }, + { + "epoch": 0.4173716864072194, + "grad_norm": 0.11405862867832184, + "learning_rate": 4.34981317683815e-05, + "loss": 0.0985, + "step": 6660 + }, + { + "epoch": 0.41799837062104406, + "grad_norm": 0.29627180099487305, + "learning_rate": 4.348757678748602e-05, + "loss": 0.1299, + "step": 6670 + }, + { + "epoch": 0.4186250548348687, + "grad_norm": 0.11819622665643692, + "learning_rate": 4.3477021806590534e-05, + "loss": 0.2367, + "step": 6680 + }, + { + "epoch": 0.41925173904869334, + "grad_norm": 7.079653739929199, + "learning_rate": 4.3466466825695044e-05, + "loss": 0.0979, + "step": 6690 + }, + { + "epoch": 0.419878423262518, + "grad_norm": 0.5191217660903931, + "learning_rate": 4.345591184479956e-05, + "loss": 0.1518, + "step": 6700 + }, + { + "epoch": 0.4205051074763427, + "grad_norm": 3.152400493621826, + "learning_rate": 4.344535686390408e-05, + "loss": 0.252, + "step": 6710 + }, + { + "epoch": 0.42113179169016735, + "grad_norm": 1.489100456237793, + "learning_rate": 4.3434801883008594e-05, + "loss": 0.2649, + "step": 6720 + }, + { + "epoch": 0.42175847590399196, + "grad_norm": 0.1367146521806717, + "learning_rate": 4.342424690211311e-05, + "loss": 0.0611, + "step": 6730 + }, + { + "epoch": 0.4223851601178166, + "grad_norm": 0.22044914960861206, + "learning_rate": 4.341369192121763e-05, + "loss": 0.1254, + "step": 6740 + }, + { + "epoch": 0.4230118443316413, + "grad_norm": 0.045484308153390884, + "learning_rate": 4.3403136940322144e-05, + "loss": 0.0632, + "step": 6750 + }, + { + "epoch": 0.42363852854546596, + "grad_norm": 0.32636168599128723, + "learning_rate": 4.3392581959426654e-05, + "loss": 0.0894, + "step": 6760 + }, + { + "epoch": 0.4242652127592906, + "grad_norm": 0.35098814964294434, + "learning_rate": 4.338202697853117e-05, + "loss": 0.3407, + "step": 6770 + }, + { + "epoch": 0.42489189697311525, + "grad_norm": 0.25456687808036804, + "learning_rate": 4.337147199763569e-05, + "loss": 0.1445, + "step": 6780 + }, + { + "epoch": 0.4255185811869399, + "grad_norm": 0.39973339438438416, + "learning_rate": 4.33609170167402e-05, + "loss": 0.204, + "step": 6790 + }, + { + "epoch": 0.42614526540076453, + "grad_norm": 0.36315247416496277, + "learning_rate": 4.3350362035844713e-05, + "loss": 0.2102, + "step": 6800 + }, + { + "epoch": 0.4267719496145892, + "grad_norm": 0.31502068042755127, + "learning_rate": 4.333980705494923e-05, + "loss": 0.1505, + "step": 6810 + }, + { + "epoch": 0.42739863382841387, + "grad_norm": 0.1232970803976059, + "learning_rate": 4.332925207405375e-05, + "loss": 0.2021, + "step": 6820 + }, + { + "epoch": 0.42802531804223853, + "grad_norm": 3.6902544498443604, + "learning_rate": 4.331869709315826e-05, + "loss": 0.1584, + "step": 6830 + }, + { + "epoch": 0.42865200225606315, + "grad_norm": 0.17084215581417084, + "learning_rate": 4.330814211226278e-05, + "loss": 0.0913, + "step": 6840 + }, + { + "epoch": 0.4292786864698878, + "grad_norm": 0.18105857074260712, + "learning_rate": 4.32975871313673e-05, + "loss": 0.1581, + "step": 6850 + }, + { + "epoch": 0.4299053706837125, + "grad_norm": 1.236863613128662, + "learning_rate": 4.328703215047181e-05, + "loss": 0.1888, + "step": 6860 + }, + { + "epoch": 0.43053205489753715, + "grad_norm": 0.0806477889418602, + "learning_rate": 4.327647716957632e-05, + "loss": 0.0882, + "step": 6870 + }, + { + "epoch": 0.43115873911136177, + "grad_norm": 0.1351771354675293, + "learning_rate": 4.326592218868084e-05, + "loss": 0.0103, + "step": 6880 + }, + { + "epoch": 0.43178542332518643, + "grad_norm": 0.04418093338608742, + "learning_rate": 4.3255367207785356e-05, + "loss": 0.1063, + "step": 6890 + }, + { + "epoch": 0.4324121075390111, + "grad_norm": 0.06696956604719162, + "learning_rate": 4.3244812226889866e-05, + "loss": 0.1186, + "step": 6900 + }, + { + "epoch": 0.43303879175283577, + "grad_norm": 2.718379020690918, + "learning_rate": 4.323425724599438e-05, + "loss": 0.2461, + "step": 6910 + }, + { + "epoch": 0.4336654759666604, + "grad_norm": 3.3854384422302246, + "learning_rate": 4.3223702265098906e-05, + "loss": 0.2563, + "step": 6920 + }, + { + "epoch": 0.43429216018048505, + "grad_norm": 6.008489608764648, + "learning_rate": 4.321314728420342e-05, + "loss": 0.1759, + "step": 6930 + }, + { + "epoch": 0.4349188443943097, + "grad_norm": 0.43417903780937195, + "learning_rate": 4.320259230330793e-05, + "loss": 0.0919, + "step": 6940 + }, + { + "epoch": 0.43554552860813434, + "grad_norm": 10.172123908996582, + "learning_rate": 4.319203732241245e-05, + "loss": 0.2124, + "step": 6950 + }, + { + "epoch": 0.436172212821959, + "grad_norm": 0.26110365986824036, + "learning_rate": 4.3181482341516966e-05, + "loss": 0.2002, + "step": 6960 + }, + { + "epoch": 0.4367988970357837, + "grad_norm": 3.4817702770233154, + "learning_rate": 4.3170927360621476e-05, + "loss": 0.1704, + "step": 6970 + }, + { + "epoch": 0.43742558124960834, + "grad_norm": 2.7794384956359863, + "learning_rate": 4.316037237972599e-05, + "loss": 0.1826, + "step": 6980 + }, + { + "epoch": 0.43805226546343295, + "grad_norm": 1.7215393781661987, + "learning_rate": 4.314981739883051e-05, + "loss": 0.1402, + "step": 6990 + }, + { + "epoch": 0.4386789496772576, + "grad_norm": 0.0389961376786232, + "learning_rate": 4.3139262417935026e-05, + "loss": 0.0581, + "step": 7000 + }, + { + "epoch": 0.4393056338910823, + "grad_norm": 0.11511804163455963, + "learning_rate": 4.312870743703954e-05, + "loss": 0.1763, + "step": 7010 + }, + { + "epoch": 0.43993231810490696, + "grad_norm": 0.4011329412460327, + "learning_rate": 4.311815245614406e-05, + "loss": 0.012, + "step": 7020 + }, + { + "epoch": 0.4405590023187316, + "grad_norm": 8.781197547912598, + "learning_rate": 4.3107597475248576e-05, + "loss": 0.084, + "step": 7030 + }, + { + "epoch": 0.44118568653255624, + "grad_norm": 1.0521060228347778, + "learning_rate": 4.3097042494353086e-05, + "loss": 0.2094, + "step": 7040 + }, + { + "epoch": 0.4418123707463809, + "grad_norm": 4.4334330558776855, + "learning_rate": 4.30864875134576e-05, + "loss": 0.1565, + "step": 7050 + }, + { + "epoch": 0.4424390549602056, + "grad_norm": 3.078625440597534, + "learning_rate": 4.307593253256212e-05, + "loss": 0.1848, + "step": 7060 + }, + { + "epoch": 0.4430657391740302, + "grad_norm": 0.9323499202728271, + "learning_rate": 4.3065377551666636e-05, + "loss": 0.1174, + "step": 7070 + }, + { + "epoch": 0.44369242338785486, + "grad_norm": 0.04099570959806442, + "learning_rate": 4.3054822570771146e-05, + "loss": 0.2271, + "step": 7080 + }, + { + "epoch": 0.44431910760167953, + "grad_norm": 4.120002746582031, + "learning_rate": 4.304426758987566e-05, + "loss": 0.0977, + "step": 7090 + }, + { + "epoch": 0.44494579181550414, + "grad_norm": 0.0420520082116127, + "learning_rate": 4.303371260898018e-05, + "loss": 0.0655, + "step": 7100 + }, + { + "epoch": 0.4455724760293288, + "grad_norm": 0.03804453834891319, + "learning_rate": 4.3023157628084696e-05, + "loss": 0.0494, + "step": 7110 + }, + { + "epoch": 0.4461991602431535, + "grad_norm": 3.8197927474975586, + "learning_rate": 4.301260264718921e-05, + "loss": 0.1237, + "step": 7120 + }, + { + "epoch": 0.44682584445697815, + "grad_norm": 3.5626463890075684, + "learning_rate": 4.300204766629373e-05, + "loss": 0.1843, + "step": 7130 + }, + { + "epoch": 0.44745252867080276, + "grad_norm": 0.12504714727401733, + "learning_rate": 4.2991492685398246e-05, + "loss": 0.122, + "step": 7140 + }, + { + "epoch": 0.44807921288462743, + "grad_norm": 2.9400346279144287, + "learning_rate": 4.2980937704502756e-05, + "loss": 0.2058, + "step": 7150 + }, + { + "epoch": 0.4487058970984521, + "grad_norm": 0.2937048375606537, + "learning_rate": 4.297038272360727e-05, + "loss": 0.2501, + "step": 7160 + }, + { + "epoch": 0.44933258131227677, + "grad_norm": 0.18238890171051025, + "learning_rate": 4.295982774271179e-05, + "loss": 0.0663, + "step": 7170 + }, + { + "epoch": 0.4499592655261014, + "grad_norm": 0.12388616055250168, + "learning_rate": 4.29492727618163e-05, + "loss": 0.0381, + "step": 7180 + }, + { + "epoch": 0.45058594973992605, + "grad_norm": 5.601403713226318, + "learning_rate": 4.2938717780920815e-05, + "loss": 0.2703, + "step": 7190 + }, + { + "epoch": 0.4512126339537507, + "grad_norm": 0.20709013938903809, + "learning_rate": 4.292816280002533e-05, + "loss": 0.0724, + "step": 7200 + }, + { + "epoch": 0.45183931816757533, + "grad_norm": 3.6804709434509277, + "learning_rate": 4.291760781912985e-05, + "loss": 0.1287, + "step": 7210 + }, + { + "epoch": 0.4524660023814, + "grad_norm": 6.19010066986084, + "learning_rate": 4.2907052838234365e-05, + "loss": 0.3109, + "step": 7220 + }, + { + "epoch": 0.45309268659522467, + "grad_norm": 0.1047079935669899, + "learning_rate": 4.289649785733888e-05, + "loss": 0.1165, + "step": 7230 + }, + { + "epoch": 0.45371937080904934, + "grad_norm": 2.2670769691467285, + "learning_rate": 4.28859428764434e-05, + "loss": 0.0588, + "step": 7240 + }, + { + "epoch": 0.45434605502287395, + "grad_norm": 39.598915100097656, + "learning_rate": 4.2875387895547915e-05, + "loss": 0.3843, + "step": 7250 + }, + { + "epoch": 0.4549727392366986, + "grad_norm": 0.05779840797185898, + "learning_rate": 4.2864832914652425e-05, + "loss": 0.0862, + "step": 7260 + }, + { + "epoch": 0.4555994234505233, + "grad_norm": 0.09832815825939178, + "learning_rate": 4.285427793375694e-05, + "loss": 0.2489, + "step": 7270 + }, + { + "epoch": 0.45622610766434796, + "grad_norm": 0.26292064785957336, + "learning_rate": 4.284372295286146e-05, + "loss": 0.2071, + "step": 7280 + }, + { + "epoch": 0.45685279187817257, + "grad_norm": 5.3300557136535645, + "learning_rate": 4.283316797196597e-05, + "loss": 0.1721, + "step": 7290 + }, + { + "epoch": 0.45747947609199724, + "grad_norm": 0.23609036207199097, + "learning_rate": 4.2822612991070485e-05, + "loss": 0.0499, + "step": 7300 + }, + { + "epoch": 0.4581061603058219, + "grad_norm": 1.547780990600586, + "learning_rate": 4.2812058010175e-05, + "loss": 0.0725, + "step": 7310 + }, + { + "epoch": 0.4587328445196466, + "grad_norm": 2.538231611251831, + "learning_rate": 4.280150302927952e-05, + "loss": 0.2541, + "step": 7320 + }, + { + "epoch": 0.4593595287334712, + "grad_norm": 0.7122913002967834, + "learning_rate": 4.2790948048384035e-05, + "loss": 0.0331, + "step": 7330 + }, + { + "epoch": 0.45998621294729586, + "grad_norm": 0.13575519621372223, + "learning_rate": 4.278039306748855e-05, + "loss": 0.1148, + "step": 7340 + }, + { + "epoch": 0.4606128971611205, + "grad_norm": 0.37606069445610046, + "learning_rate": 4.276983808659307e-05, + "loss": 0.169, + "step": 7350 + }, + { + "epoch": 0.46123958137494514, + "grad_norm": 0.06538957357406616, + "learning_rate": 4.275928310569758e-05, + "loss": 0.1728, + "step": 7360 + }, + { + "epoch": 0.4618662655887698, + "grad_norm": 0.12389545887708664, + "learning_rate": 4.2748728124802095e-05, + "loss": 0.1081, + "step": 7370 + }, + { + "epoch": 0.4624929498025945, + "grad_norm": 0.15639159083366394, + "learning_rate": 4.273817314390661e-05, + "loss": 0.1994, + "step": 7380 + }, + { + "epoch": 0.46311963401641915, + "grad_norm": 2.5816802978515625, + "learning_rate": 4.272761816301113e-05, + "loss": 0.1382, + "step": 7390 + }, + { + "epoch": 0.46374631823024376, + "grad_norm": 0.1728868931531906, + "learning_rate": 4.271706318211564e-05, + "loss": 0.061, + "step": 7400 + }, + { + "epoch": 0.46437300244406843, + "grad_norm": 0.4691903293132782, + "learning_rate": 4.2706508201220155e-05, + "loss": 0.0113, + "step": 7410 + }, + { + "epoch": 0.4649996866578931, + "grad_norm": 0.23932453989982605, + "learning_rate": 4.269595322032468e-05, + "loss": 0.2199, + "step": 7420 + }, + { + "epoch": 0.46562637087171777, + "grad_norm": 0.6389980912208557, + "learning_rate": 4.268539823942919e-05, + "loss": 0.1017, + "step": 7430 + }, + { + "epoch": 0.4662530550855424, + "grad_norm": 2.4998295307159424, + "learning_rate": 4.2674843258533705e-05, + "loss": 0.222, + "step": 7440 + }, + { + "epoch": 0.46687973929936705, + "grad_norm": 0.17659592628479004, + "learning_rate": 4.266428827763822e-05, + "loss": 0.213, + "step": 7450 + }, + { + "epoch": 0.4675064235131917, + "grad_norm": 0.48938411474227905, + "learning_rate": 4.265373329674274e-05, + "loss": 0.1021, + "step": 7460 + }, + { + "epoch": 0.46813310772701633, + "grad_norm": 3.5176033973693848, + "learning_rate": 4.264317831584725e-05, + "loss": 0.2315, + "step": 7470 + }, + { + "epoch": 0.468759791940841, + "grad_norm": 3.836408853530884, + "learning_rate": 4.2632623334951764e-05, + "loss": 0.2891, + "step": 7480 + }, + { + "epoch": 0.46938647615466567, + "grad_norm": 0.2693319618701935, + "learning_rate": 4.262206835405628e-05, + "loss": 0.1225, + "step": 7490 + }, + { + "epoch": 0.47001316036849033, + "grad_norm": 0.3623167872428894, + "learning_rate": 4.261151337316079e-05, + "loss": 0.0797, + "step": 7500 + }, + { + "epoch": 0.47063984458231495, + "grad_norm": 0.27036330103874207, + "learning_rate": 4.2600958392265314e-05, + "loss": 0.1628, + "step": 7510 + }, + { + "epoch": 0.4712665287961396, + "grad_norm": 0.13844487071037292, + "learning_rate": 4.259040341136983e-05, + "loss": 0.1731, + "step": 7520 + }, + { + "epoch": 0.4718932130099643, + "grad_norm": 0.19326917827129364, + "learning_rate": 4.257984843047435e-05, + "loss": 0.0219, + "step": 7530 + }, + { + "epoch": 0.47251989722378895, + "grad_norm": 0.03843018412590027, + "learning_rate": 4.256929344957886e-05, + "loss": 0.0881, + "step": 7540 + }, + { + "epoch": 0.47314658143761357, + "grad_norm": 1.3060014247894287, + "learning_rate": 4.2558738468683374e-05, + "loss": 0.2043, + "step": 7550 + }, + { + "epoch": 0.47377326565143824, + "grad_norm": 0.21384786069393158, + "learning_rate": 4.254818348778789e-05, + "loss": 0.3474, + "step": 7560 + }, + { + "epoch": 0.4743999498652629, + "grad_norm": 1.6711466312408447, + "learning_rate": 4.25376285068924e-05, + "loss": 0.171, + "step": 7570 + }, + { + "epoch": 0.4750266340790876, + "grad_norm": 0.1102273091673851, + "learning_rate": 4.252707352599692e-05, + "loss": 0.0721, + "step": 7580 + }, + { + "epoch": 0.4756533182929122, + "grad_norm": 0.3417176902294159, + "learning_rate": 4.2516518545101434e-05, + "loss": 0.1601, + "step": 7590 + }, + { + "epoch": 0.47628000250673685, + "grad_norm": 0.049307115375995636, + "learning_rate": 4.250596356420595e-05, + "loss": 0.0476, + "step": 7600 + }, + { + "epoch": 0.4769066867205615, + "grad_norm": 0.051750484853982925, + "learning_rate": 4.249540858331047e-05, + "loss": 0.0123, + "step": 7610 + }, + { + "epoch": 0.47753337093438614, + "grad_norm": 0.17750048637390137, + "learning_rate": 4.2484853602414984e-05, + "loss": 0.2373, + "step": 7620 + }, + { + "epoch": 0.4781600551482108, + "grad_norm": 0.0933203399181366, + "learning_rate": 4.24742986215195e-05, + "loss": 0.0497, + "step": 7630 + }, + { + "epoch": 0.4787867393620355, + "grad_norm": 0.035495299845933914, + "learning_rate": 4.246374364062402e-05, + "loss": 0.0878, + "step": 7640 + }, + { + "epoch": 0.47941342357586014, + "grad_norm": 0.04313305392861366, + "learning_rate": 4.245318865972853e-05, + "loss": 0.1121, + "step": 7650 + }, + { + "epoch": 0.48004010778968476, + "grad_norm": 0.3072851002216339, + "learning_rate": 4.2442633678833044e-05, + "loss": 0.3325, + "step": 7660 + }, + { + "epoch": 0.4806667920035094, + "grad_norm": 0.527976393699646, + "learning_rate": 4.243207869793756e-05, + "loss": 0.1979, + "step": 7670 + }, + { + "epoch": 0.4812934762173341, + "grad_norm": 0.2459559291601181, + "learning_rate": 4.242152371704207e-05, + "loss": 0.1636, + "step": 7680 + }, + { + "epoch": 0.48192016043115876, + "grad_norm": 0.31813257932662964, + "learning_rate": 4.241096873614659e-05, + "loss": 0.1305, + "step": 7690 + }, + { + "epoch": 0.4825468446449834, + "grad_norm": 2.2913360595703125, + "learning_rate": 4.2400413755251104e-05, + "loss": 0.1391, + "step": 7700 + }, + { + "epoch": 0.48317352885880804, + "grad_norm": 0.3729984760284424, + "learning_rate": 4.238985877435562e-05, + "loss": 0.045, + "step": 7710 + }, + { + "epoch": 0.4838002130726327, + "grad_norm": 3.880664587020874, + "learning_rate": 4.237930379346014e-05, + "loss": 0.1644, + "step": 7720 + }, + { + "epoch": 0.4844268972864574, + "grad_norm": 0.22369036078453064, + "learning_rate": 4.2368748812564654e-05, + "loss": 0.1288, + "step": 7730 + }, + { + "epoch": 0.485053581500282, + "grad_norm": 13.893917083740234, + "learning_rate": 4.235819383166917e-05, + "loss": 0.1569, + "step": 7740 + }, + { + "epoch": 0.48568026571410666, + "grad_norm": 4.144253730773926, + "learning_rate": 4.234763885077368e-05, + "loss": 0.0984, + "step": 7750 + }, + { + "epoch": 0.48630694992793133, + "grad_norm": 0.7604568004608154, + "learning_rate": 4.23370838698782e-05, + "loss": 0.1548, + "step": 7760 + }, + { + "epoch": 0.48693363414175594, + "grad_norm": 2.1528584957122803, + "learning_rate": 4.232652888898271e-05, + "loss": 0.1772, + "step": 7770 + }, + { + "epoch": 0.4875603183555806, + "grad_norm": 3.7687759399414062, + "learning_rate": 4.231597390808723e-05, + "loss": 0.2759, + "step": 7780 + }, + { + "epoch": 0.4881870025694053, + "grad_norm": 0.08065836876630783, + "learning_rate": 4.230541892719174e-05, + "loss": 0.1073, + "step": 7790 + }, + { + "epoch": 0.48881368678322995, + "grad_norm": 0.07277223467826843, + "learning_rate": 4.2294863946296257e-05, + "loss": 0.1238, + "step": 7800 + }, + { + "epoch": 0.48944037099705456, + "grad_norm": 0.6644709706306458, + "learning_rate": 4.228430896540077e-05, + "loss": 0.0536, + "step": 7810 + }, + { + "epoch": 0.49006705521087923, + "grad_norm": 3.4870352745056152, + "learning_rate": 4.227375398450529e-05, + "loss": 0.1726, + "step": 7820 + }, + { + "epoch": 0.4906937394247039, + "grad_norm": 0.24318358302116394, + "learning_rate": 4.2263199003609807e-05, + "loss": 0.042, + "step": 7830 + }, + { + "epoch": 0.49132042363852857, + "grad_norm": 0.18962280452251434, + "learning_rate": 4.225264402271432e-05, + "loss": 0.2307, + "step": 7840 + }, + { + "epoch": 0.4919471078523532, + "grad_norm": 0.16015306115150452, + "learning_rate": 4.224208904181884e-05, + "loss": 0.1665, + "step": 7850 + }, + { + "epoch": 0.49257379206617785, + "grad_norm": 0.22559410333633423, + "learning_rate": 4.223153406092335e-05, + "loss": 0.117, + "step": 7860 + }, + { + "epoch": 0.4932004762800025, + "grad_norm": 5.2421793937683105, + "learning_rate": 4.2220979080027866e-05, + "loss": 0.1174, + "step": 7870 + }, + { + "epoch": 0.49382716049382713, + "grad_norm": 2.1046204566955566, + "learning_rate": 4.221042409913238e-05, + "loss": 0.0429, + "step": 7880 + }, + { + "epoch": 0.4944538447076518, + "grad_norm": 0.19564154744148254, + "learning_rate": 4.219986911823689e-05, + "loss": 0.1396, + "step": 7890 + }, + { + "epoch": 0.49508052892147647, + "grad_norm": 4.799094200134277, + "learning_rate": 4.218931413734141e-05, + "loss": 0.1839, + "step": 7900 + }, + { + "epoch": 0.49570721313530114, + "grad_norm": 0.08568393439054489, + "learning_rate": 4.2178759156445926e-05, + "loss": 0.082, + "step": 7910 + }, + { + "epoch": 0.49633389734912575, + "grad_norm": 2.0710394382476807, + "learning_rate": 4.216820417555045e-05, + "loss": 0.187, + "step": 7920 + }, + { + "epoch": 0.4969605815629504, + "grad_norm": 5.502629280090332, + "learning_rate": 4.215764919465496e-05, + "loss": 0.2082, + "step": 7930 + }, + { + "epoch": 0.4975872657767751, + "grad_norm": 1.7179450988769531, + "learning_rate": 4.2147094213759476e-05, + "loss": 0.1262, + "step": 7940 + }, + { + "epoch": 0.49821394999059976, + "grad_norm": 0.49480941891670227, + "learning_rate": 4.213653923286399e-05, + "loss": 0.1315, + "step": 7950 + }, + { + "epoch": 0.49884063420442437, + "grad_norm": 0.3045163154602051, + "learning_rate": 4.21259842519685e-05, + "loss": 0.032, + "step": 7960 + }, + { + "epoch": 0.49946731841824904, + "grad_norm": 0.22712041437625885, + "learning_rate": 4.211542927107302e-05, + "loss": 0.1132, + "step": 7970 + }, + { + "epoch": 0.5000940026320737, + "grad_norm": 0.0418301597237587, + "learning_rate": 4.2104874290177536e-05, + "loss": 0.1, + "step": 7980 + }, + { + "epoch": 0.5007206868458983, + "grad_norm": 0.1894550770521164, + "learning_rate": 4.209431930928205e-05, + "loss": 0.0973, + "step": 7990 + }, + { + "epoch": 0.501347371059723, + "grad_norm": 0.19991464912891388, + "learning_rate": 4.208376432838656e-05, + "loss": 0.058, + "step": 8000 + }, + { + "epoch": 0.5019740552735477, + "grad_norm": 0.02192305587232113, + "learning_rate": 4.2073209347491086e-05, + "loss": 0.1147, + "step": 8010 + }, + { + "epoch": 0.5026007394873723, + "grad_norm": 0.163367360830307, + "learning_rate": 4.20626543665956e-05, + "loss": 0.112, + "step": 8020 + }, + { + "epoch": 0.503227423701197, + "grad_norm": 0.17916612327098846, + "learning_rate": 4.205209938570012e-05, + "loss": 0.1368, + "step": 8030 + }, + { + "epoch": 0.5038541079150216, + "grad_norm": 2.2590155601501465, + "learning_rate": 4.204154440480463e-05, + "loss": 0.1517, + "step": 8040 + }, + { + "epoch": 0.5044807921288462, + "grad_norm": 0.16151118278503418, + "learning_rate": 4.2030989423909146e-05, + "loss": 0.145, + "step": 8050 + }, + { + "epoch": 0.505107476342671, + "grad_norm": 2.493932008743286, + "learning_rate": 4.202043444301366e-05, + "loss": 0.2022, + "step": 8060 + }, + { + "epoch": 0.5057341605564956, + "grad_norm": 6.127912998199463, + "learning_rate": 4.200987946211817e-05, + "loss": 0.1948, + "step": 8070 + }, + { + "epoch": 0.5063608447703203, + "grad_norm": 2.1237828731536865, + "learning_rate": 4.199932448122269e-05, + "loss": 0.1675, + "step": 8080 + }, + { + "epoch": 0.5069875289841449, + "grad_norm": 2.187483549118042, + "learning_rate": 4.1988769500327206e-05, + "loss": 0.1937, + "step": 8090 + }, + { + "epoch": 0.5076142131979695, + "grad_norm": 0.5250789523124695, + "learning_rate": 4.197821451943172e-05, + "loss": 0.182, + "step": 8100 + }, + { + "epoch": 0.5082408974117942, + "grad_norm": 0.0238255113363266, + "learning_rate": 4.196765953853624e-05, + "loss": 0.0511, + "step": 8110 + }, + { + "epoch": 0.5088675816256188, + "grad_norm": 0.04943560063838959, + "learning_rate": 4.1957104557640756e-05, + "loss": 0.1251, + "step": 8120 + }, + { + "epoch": 0.5094942658394435, + "grad_norm": 0.1290494054555893, + "learning_rate": 4.194654957674527e-05, + "loss": 0.1943, + "step": 8130 + }, + { + "epoch": 0.5101209500532682, + "grad_norm": 0.400713175535202, + "learning_rate": 4.193599459584978e-05, + "loss": 0.2568, + "step": 8140 + }, + { + "epoch": 0.5107476342670928, + "grad_norm": 0.07351066917181015, + "learning_rate": 4.19254396149543e-05, + "loss": 0.0159, + "step": 8150 + }, + { + "epoch": 0.5113743184809174, + "grad_norm": 3.4980196952819824, + "learning_rate": 4.1914884634058815e-05, + "loss": 0.2556, + "step": 8160 + }, + { + "epoch": 0.5120010026947421, + "grad_norm": 0.1484570950269699, + "learning_rate": 4.190432965316333e-05, + "loss": 0.0522, + "step": 8170 + }, + { + "epoch": 0.5126276869085667, + "grad_norm": 0.07907593250274658, + "learning_rate": 4.189377467226784e-05, + "loss": 0.0977, + "step": 8180 + }, + { + "epoch": 0.5132543711223915, + "grad_norm": 0.056584157049655914, + "learning_rate": 4.188321969137236e-05, + "loss": 0.1622, + "step": 8190 + }, + { + "epoch": 0.5138810553362161, + "grad_norm": 0.1531023234128952, + "learning_rate": 4.1872664710476875e-05, + "loss": 0.0139, + "step": 8200 + }, + { + "epoch": 0.5145077395500407, + "grad_norm": 0.12977644801139832, + "learning_rate": 4.186210972958139e-05, + "loss": 0.1056, + "step": 8210 + }, + { + "epoch": 0.5151344237638654, + "grad_norm": 2.399658679962158, + "learning_rate": 4.185155474868591e-05, + "loss": 0.2637, + "step": 8220 + }, + { + "epoch": 0.51576110797769, + "grad_norm": 0.1842947155237198, + "learning_rate": 4.1840999767790425e-05, + "loss": 0.0058, + "step": 8230 + }, + { + "epoch": 0.5163877921915146, + "grad_norm": 0.04608267545700073, + "learning_rate": 4.183044478689494e-05, + "loss": 0.0202, + "step": 8240 + }, + { + "epoch": 0.5170144764053394, + "grad_norm": 0.1500760167837143, + "learning_rate": 4.181988980599945e-05, + "loss": 0.1144, + "step": 8250 + }, + { + "epoch": 0.517641160619164, + "grad_norm": 3.4322853088378906, + "learning_rate": 4.180933482510397e-05, + "loss": 0.1559, + "step": 8260 + }, + { + "epoch": 0.5182678448329887, + "grad_norm": 0.09054571390151978, + "learning_rate": 4.1798779844208485e-05, + "loss": 0.1007, + "step": 8270 + }, + { + "epoch": 0.5188945290468133, + "grad_norm": 0.19511841237545013, + "learning_rate": 4.1788224863312995e-05, + "loss": 0.0458, + "step": 8280 + }, + { + "epoch": 0.5195212132606379, + "grad_norm": 0.062319837510585785, + "learning_rate": 4.177766988241751e-05, + "loss": 0.0829, + "step": 8290 + }, + { + "epoch": 0.5201478974744627, + "grad_norm": 0.17525716125965118, + "learning_rate": 4.176711490152203e-05, + "loss": 0.1351, + "step": 8300 + }, + { + "epoch": 0.5207745816882873, + "grad_norm": 0.09585020691156387, + "learning_rate": 4.1756559920626545e-05, + "loss": 0.0419, + "step": 8310 + }, + { + "epoch": 0.5214012659021119, + "grad_norm": 0.04737556353211403, + "learning_rate": 4.174600493973106e-05, + "loss": 0.0599, + "step": 8320 + }, + { + "epoch": 0.5220279501159366, + "grad_norm": 0.3379499912261963, + "learning_rate": 4.173544995883558e-05, + "loss": 0.1293, + "step": 8330 + }, + { + "epoch": 0.5226546343297612, + "grad_norm": 0.03692694380879402, + "learning_rate": 4.1724894977940095e-05, + "loss": 0.0598, + "step": 8340 + }, + { + "epoch": 0.5232813185435858, + "grad_norm": 0.20106247067451477, + "learning_rate": 4.1714339997044605e-05, + "loss": 0.0969, + "step": 8350 + }, + { + "epoch": 0.5239080027574106, + "grad_norm": 0.07053105533123016, + "learning_rate": 4.170378501614912e-05, + "loss": 0.2857, + "step": 8360 + }, + { + "epoch": 0.5245346869712352, + "grad_norm": 0.15336304903030396, + "learning_rate": 4.169323003525364e-05, + "loss": 0.1083, + "step": 8370 + }, + { + "epoch": 0.5251613711850599, + "grad_norm": 0.30457621812820435, + "learning_rate": 4.1682675054358155e-05, + "loss": 0.0545, + "step": 8380 + }, + { + "epoch": 0.5257880553988845, + "grad_norm": 0.03971443325281143, + "learning_rate": 4.1672120073462664e-05, + "loss": 0.0083, + "step": 8390 + }, + { + "epoch": 0.5264147396127091, + "grad_norm": 0.09428033977746964, + "learning_rate": 4.166156509256718e-05, + "loss": 0.1767, + "step": 8400 + }, + { + "epoch": 0.5270414238265339, + "grad_norm": 0.18962548673152924, + "learning_rate": 4.16510101116717e-05, + "loss": 0.1163, + "step": 8410 + }, + { + "epoch": 0.5276681080403585, + "grad_norm": 10.36578369140625, + "learning_rate": 4.1640455130776214e-05, + "loss": 0.2371, + "step": 8420 + }, + { + "epoch": 0.5282947922541831, + "grad_norm": 331.3830261230469, + "learning_rate": 4.162990014988073e-05, + "loss": 0.102, + "step": 8430 + }, + { + "epoch": 0.5289214764680078, + "grad_norm": 0.3419227600097656, + "learning_rate": 4.161934516898525e-05, + "loss": 0.0567, + "step": 8440 + }, + { + "epoch": 0.5295481606818324, + "grad_norm": 0.0799892470240593, + "learning_rate": 4.1608790188089764e-05, + "loss": 0.0113, + "step": 8450 + }, + { + "epoch": 0.530174844895657, + "grad_norm": 0.36934471130371094, + "learning_rate": 4.1598235207194274e-05, + "loss": 0.2659, + "step": 8460 + }, + { + "epoch": 0.5308015291094818, + "grad_norm": 0.4492112398147583, + "learning_rate": 4.158768022629879e-05, + "loss": 0.127, + "step": 8470 + }, + { + "epoch": 0.5314282133233064, + "grad_norm": 0.19504284858703613, + "learning_rate": 4.157712524540331e-05, + "loss": 0.1399, + "step": 8480 + }, + { + "epoch": 0.5320548975371311, + "grad_norm": 0.7225850224494934, + "learning_rate": 4.1566570264507824e-05, + "loss": 0.0433, + "step": 8490 + }, + { + "epoch": 0.5326815817509557, + "grad_norm": 11.716758728027344, + "learning_rate": 4.1556015283612334e-05, + "loss": 0.2159, + "step": 8500 + }, + { + "epoch": 0.5333082659647803, + "grad_norm": 2.664440393447876, + "learning_rate": 4.154546030271686e-05, + "loss": 0.1738, + "step": 8510 + }, + { + "epoch": 0.533934950178605, + "grad_norm": 2.3211724758148193, + "learning_rate": 4.1534905321821374e-05, + "loss": 0.1761, + "step": 8520 + }, + { + "epoch": 0.5345616343924297, + "grad_norm": 0.3916930854320526, + "learning_rate": 4.1524350340925884e-05, + "loss": 0.1548, + "step": 8530 + }, + { + "epoch": 0.5351883186062543, + "grad_norm": 0.0378059558570385, + "learning_rate": 4.15137953600304e-05, + "loss": 0.1087, + "step": 8540 + }, + { + "epoch": 0.535815002820079, + "grad_norm": 4.720902919769287, + "learning_rate": 4.150324037913492e-05, + "loss": 0.2359, + "step": 8550 + }, + { + "epoch": 0.5364416870339036, + "grad_norm": 0.04153915494680405, + "learning_rate": 4.1492685398239434e-05, + "loss": 0.1439, + "step": 8560 + }, + { + "epoch": 0.5370683712477282, + "grad_norm": 0.03464338183403015, + "learning_rate": 4.1482130417343944e-05, + "loss": 0.1784, + "step": 8570 + }, + { + "epoch": 0.5376950554615529, + "grad_norm": 0.17074334621429443, + "learning_rate": 4.147157543644846e-05, + "loss": 0.1967, + "step": 8580 + }, + { + "epoch": 0.5383217396753776, + "grad_norm": 0.5356113314628601, + "learning_rate": 4.146102045555298e-05, + "loss": 0.2086, + "step": 8590 + }, + { + "epoch": 0.5389484238892023, + "grad_norm": 0.6513091921806335, + "learning_rate": 4.1450465474657494e-05, + "loss": 0.1085, + "step": 8600 + }, + { + "epoch": 0.5395751081030269, + "grad_norm": 0.09826638549566269, + "learning_rate": 4.143991049376201e-05, + "loss": 0.073, + "step": 8610 + }, + { + "epoch": 0.5402017923168515, + "grad_norm": 3.7576205730438232, + "learning_rate": 4.142935551286653e-05, + "loss": 0.1715, + "step": 8620 + }, + { + "epoch": 0.5408284765306762, + "grad_norm": 2.084686279296875, + "learning_rate": 4.1418800531971044e-05, + "loss": 0.1808, + "step": 8630 + }, + { + "epoch": 0.5414551607445008, + "grad_norm": 6.913938999176025, + "learning_rate": 4.1408245551075554e-05, + "loss": 0.0754, + "step": 8640 + }, + { + "epoch": 0.5420818449583255, + "grad_norm": 0.18210375308990479, + "learning_rate": 4.139769057018007e-05, + "loss": 0.0596, + "step": 8650 + }, + { + "epoch": 0.5427085291721502, + "grad_norm": 0.15195395052433014, + "learning_rate": 4.138713558928459e-05, + "loss": 0.1203, + "step": 8660 + }, + { + "epoch": 0.5433352133859748, + "grad_norm": 2.44944429397583, + "learning_rate": 4.13765806083891e-05, + "loss": 0.1983, + "step": 8670 + }, + { + "epoch": 0.5439618975997995, + "grad_norm": 0.9037182331085205, + "learning_rate": 4.1366025627493613e-05, + "loss": 0.0473, + "step": 8680 + }, + { + "epoch": 0.5445885818136241, + "grad_norm": 0.0421769879758358, + "learning_rate": 4.135547064659813e-05, + "loss": 0.2814, + "step": 8690 + }, + { + "epoch": 0.5452152660274487, + "grad_norm": 0.23261401057243347, + "learning_rate": 4.134491566570265e-05, + "loss": 0.1457, + "step": 8700 + }, + { + "epoch": 0.5458419502412735, + "grad_norm": 0.0654134452342987, + "learning_rate": 4.1334360684807163e-05, + "loss": 0.026, + "step": 8710 + }, + { + "epoch": 0.5464686344550981, + "grad_norm": 4.39208459854126, + "learning_rate": 4.132380570391168e-05, + "loss": 0.0946, + "step": 8720 + }, + { + "epoch": 0.5470953186689227, + "grad_norm": 0.5686179399490356, + "learning_rate": 4.13132507230162e-05, + "loss": 0.1669, + "step": 8730 + }, + { + "epoch": 0.5477220028827474, + "grad_norm": 4.334964275360107, + "learning_rate": 4.1302695742120707e-05, + "loss": 0.1729, + "step": 8740 + }, + { + "epoch": 0.548348687096572, + "grad_norm": 0.15098807215690613, + "learning_rate": 4.129214076122522e-05, + "loss": 0.1916, + "step": 8750 + }, + { + "epoch": 0.5489753713103966, + "grad_norm": 0.24327068030834198, + "learning_rate": 4.128158578032974e-05, + "loss": 0.1069, + "step": 8760 + }, + { + "epoch": 0.5496020555242214, + "grad_norm": 0.160555899143219, + "learning_rate": 4.1271030799434257e-05, + "loss": 0.1482, + "step": 8770 + }, + { + "epoch": 0.550228739738046, + "grad_norm": 0.2681258022785187, + "learning_rate": 4.1260475818538766e-05, + "loss": 0.0383, + "step": 8780 + }, + { + "epoch": 0.5508554239518707, + "grad_norm": 0.02957313507795334, + "learning_rate": 4.124992083764328e-05, + "loss": 0.006, + "step": 8790 + }, + { + "epoch": 0.5514821081656953, + "grad_norm": 0.49831515550613403, + "learning_rate": 4.12393658567478e-05, + "loss": 0.1116, + "step": 8800 + }, + { + "epoch": 0.5521087923795199, + "grad_norm": 0.022528450936079025, + "learning_rate": 4.1228810875852316e-05, + "loss": 0.0708, + "step": 8810 + }, + { + "epoch": 0.5527354765933447, + "grad_norm": 0.4816558063030243, + "learning_rate": 4.121825589495683e-05, + "loss": 0.2754, + "step": 8820 + }, + { + "epoch": 0.5533621608071693, + "grad_norm": 0.4593820571899414, + "learning_rate": 4.120770091406135e-05, + "loss": 0.1499, + "step": 8830 + }, + { + "epoch": 0.5539888450209939, + "grad_norm": 0.5131998062133789, + "learning_rate": 4.1197145933165866e-05, + "loss": 0.1313, + "step": 8840 + }, + { + "epoch": 0.5546155292348186, + "grad_norm": 0.5724000930786133, + "learning_rate": 4.1186590952270376e-05, + "loss": 0.2599, + "step": 8850 + }, + { + "epoch": 0.5552422134486432, + "grad_norm": 0.21304863691329956, + "learning_rate": 4.117603597137489e-05, + "loss": 0.2477, + "step": 8860 + }, + { + "epoch": 0.5558688976624678, + "grad_norm": 0.1989622563123703, + "learning_rate": 4.116548099047941e-05, + "loss": 0.1082, + "step": 8870 + }, + { + "epoch": 0.5564955818762926, + "grad_norm": 0.16896386444568634, + "learning_rate": 4.1154926009583926e-05, + "loss": 0.1285, + "step": 8880 + }, + { + "epoch": 0.5571222660901172, + "grad_norm": 0.305745005607605, + "learning_rate": 4.1144371028688436e-05, + "loss": 0.2389, + "step": 8890 + }, + { + "epoch": 0.5577489503039419, + "grad_norm": 0.23722957074642181, + "learning_rate": 4.113381604779295e-05, + "loss": 0.2969, + "step": 8900 + }, + { + "epoch": 0.5583756345177665, + "grad_norm": 0.3046024739742279, + "learning_rate": 4.1123261066897476e-05, + "loss": 0.1207, + "step": 8910 + }, + { + "epoch": 0.5590023187315911, + "grad_norm": 5.19449520111084, + "learning_rate": 4.1112706086001986e-05, + "loss": 0.1542, + "step": 8920 + }, + { + "epoch": 0.5596290029454158, + "grad_norm": 0.25576645135879517, + "learning_rate": 4.11021511051065e-05, + "loss": 0.0896, + "step": 8930 + }, + { + "epoch": 0.5602556871592405, + "grad_norm": 0.04636983945965767, + "learning_rate": 4.109159612421102e-05, + "loss": 0.0247, + "step": 8940 + }, + { + "epoch": 0.5608823713730651, + "grad_norm": 1.0605615377426147, + "learning_rate": 4.1081041143315536e-05, + "loss": 0.2397, + "step": 8950 + }, + { + "epoch": 0.5615090555868898, + "grad_norm": 0.077987901866436, + "learning_rate": 4.1070486162420046e-05, + "loss": 0.0849, + "step": 8960 + }, + { + "epoch": 0.5621357398007144, + "grad_norm": 0.4862576425075531, + "learning_rate": 4.105993118152456e-05, + "loss": 0.2118, + "step": 8970 + }, + { + "epoch": 0.562762424014539, + "grad_norm": 0.3730028569698334, + "learning_rate": 4.104937620062908e-05, + "loss": 0.2261, + "step": 8980 + }, + { + "epoch": 0.5633891082283637, + "grad_norm": 0.4581931233406067, + "learning_rate": 4.103882121973359e-05, + "loss": 0.1878, + "step": 8990 + }, + { + "epoch": 0.5640157924421884, + "grad_norm": 0.2025977373123169, + "learning_rate": 4.1028266238838106e-05, + "loss": 0.2524, + "step": 9000 + }, + { + "epoch": 0.5646424766560131, + "grad_norm": 0.7682130336761475, + "learning_rate": 4.101771125794263e-05, + "loss": 0.1354, + "step": 9010 + }, + { + "epoch": 0.5652691608698377, + "grad_norm": 0.149651437997818, + "learning_rate": 4.1007156277047146e-05, + "loss": 0.0785, + "step": 9020 + }, + { + "epoch": 0.5658958450836623, + "grad_norm": 0.07458806782960892, + "learning_rate": 4.0996601296151656e-05, + "loss": 0.0196, + "step": 9030 + }, + { + "epoch": 0.566522529297487, + "grad_norm": 44.61407470703125, + "learning_rate": 4.098604631525617e-05, + "loss": 0.0744, + "step": 9040 + }, + { + "epoch": 0.5671492135113116, + "grad_norm": 0.7599819898605347, + "learning_rate": 4.097549133436069e-05, + "loss": 0.1664, + "step": 9050 + }, + { + "epoch": 0.5677758977251363, + "grad_norm": 0.8320948481559753, + "learning_rate": 4.09649363534652e-05, + "loss": 0.2156, + "step": 9060 + }, + { + "epoch": 0.568402581938961, + "grad_norm": 0.09615202993154526, + "learning_rate": 4.0954381372569715e-05, + "loss": 0.0353, + "step": 9070 + }, + { + "epoch": 0.5690292661527856, + "grad_norm": 3.353734254837036, + "learning_rate": 4.094382639167423e-05, + "loss": 0.1515, + "step": 9080 + }, + { + "epoch": 0.5696559503666103, + "grad_norm": 0.161566361784935, + "learning_rate": 4.093327141077875e-05, + "loss": 0.2015, + "step": 9090 + }, + { + "epoch": 0.5702826345804349, + "grad_norm": 0.31871262192726135, + "learning_rate": 4.0922716429883265e-05, + "loss": 0.1474, + "step": 9100 + }, + { + "epoch": 0.5709093187942595, + "grad_norm": 2.3990204334259033, + "learning_rate": 4.091216144898778e-05, + "loss": 0.2111, + "step": 9110 + }, + { + "epoch": 0.5715360030080843, + "grad_norm": 0.35437873005867004, + "learning_rate": 4.09016064680923e-05, + "loss": 0.1354, + "step": 9120 + }, + { + "epoch": 0.5721626872219089, + "grad_norm": 0.26824936270713806, + "learning_rate": 4.089105148719681e-05, + "loss": 0.2922, + "step": 9130 + }, + { + "epoch": 0.5727893714357335, + "grad_norm": 0.1944427490234375, + "learning_rate": 4.0880496506301325e-05, + "loss": 0.0781, + "step": 9140 + }, + { + "epoch": 0.5734160556495582, + "grad_norm": 0.08825784921646118, + "learning_rate": 4.086994152540584e-05, + "loss": 0.1337, + "step": 9150 + }, + { + "epoch": 0.5740427398633828, + "grad_norm": 0.08061042428016663, + "learning_rate": 4.085938654451036e-05, + "loss": 0.0975, + "step": 9160 + }, + { + "epoch": 0.5746694240772074, + "grad_norm": 0.2497250884771347, + "learning_rate": 4.084883156361487e-05, + "loss": 0.0555, + "step": 9170 + }, + { + "epoch": 0.5752961082910322, + "grad_norm": 0.11670225113630295, + "learning_rate": 4.0838276582719385e-05, + "loss": 0.0195, + "step": 9180 + }, + { + "epoch": 0.5759227925048568, + "grad_norm": 0.09469518810510635, + "learning_rate": 4.08277216018239e-05, + "loss": 0.1508, + "step": 9190 + }, + { + "epoch": 0.5765494767186815, + "grad_norm": 0.04566322639584541, + "learning_rate": 4.081716662092842e-05, + "loss": 0.1271, + "step": 9200 + }, + { + "epoch": 0.5771761609325061, + "grad_norm": 0.6333693861961365, + "learning_rate": 4.0806611640032935e-05, + "loss": 0.2478, + "step": 9210 + }, + { + "epoch": 0.5778028451463307, + "grad_norm": 0.05764749273657799, + "learning_rate": 4.079605665913745e-05, + "loss": 0.0117, + "step": 9220 + }, + { + "epoch": 0.5784295293601555, + "grad_norm": 0.6019806265830994, + "learning_rate": 4.078550167824197e-05, + "loss": 0.1013, + "step": 9230 + }, + { + "epoch": 0.5790562135739801, + "grad_norm": 0.11212746053934097, + "learning_rate": 4.077494669734648e-05, + "loss": 0.0529, + "step": 9240 + }, + { + "epoch": 0.5796828977878047, + "grad_norm": 0.045619022101163864, + "learning_rate": 4.0764391716450995e-05, + "loss": 0.2901, + "step": 9250 + }, + { + "epoch": 0.5803095820016294, + "grad_norm": 0.20387081801891327, + "learning_rate": 4.075383673555551e-05, + "loss": 0.3294, + "step": 9260 + }, + { + "epoch": 0.580936266215454, + "grad_norm": 0.2026294320821762, + "learning_rate": 4.074328175466003e-05, + "loss": 0.0859, + "step": 9270 + }, + { + "epoch": 0.5815629504292786, + "grad_norm": 0.07742750644683838, + "learning_rate": 4.073272677376454e-05, + "loss": 0.0185, + "step": 9280 + }, + { + "epoch": 0.5821896346431034, + "grad_norm": 0.06963720172643661, + "learning_rate": 4.0722171792869055e-05, + "loss": 0.2457, + "step": 9290 + }, + { + "epoch": 0.582816318856928, + "grad_norm": 0.0938970223069191, + "learning_rate": 4.071161681197357e-05, + "loss": 0.0966, + "step": 9300 + }, + { + "epoch": 0.5834430030707527, + "grad_norm": 0.07957907766103745, + "learning_rate": 4.070106183107809e-05, + "loss": 0.0115, + "step": 9310 + }, + { + "epoch": 0.5840696872845773, + "grad_norm": 0.06466685980558395, + "learning_rate": 4.0690506850182605e-05, + "loss": 0.1959, + "step": 9320 + }, + { + "epoch": 0.5846963714984019, + "grad_norm": 8.07848834991455, + "learning_rate": 4.067995186928712e-05, + "loss": 0.1698, + "step": 9330 + }, + { + "epoch": 0.5853230557122266, + "grad_norm": 3.3286468982696533, + "learning_rate": 4.066939688839164e-05, + "loss": 0.1606, + "step": 9340 + }, + { + "epoch": 0.5859497399260513, + "grad_norm": 0.11040914058685303, + "learning_rate": 4.065884190749615e-05, + "loss": 0.1459, + "step": 9350 + }, + { + "epoch": 0.5865764241398759, + "grad_norm": 0.13724346458911896, + "learning_rate": 4.0648286926600664e-05, + "loss": 0.0829, + "step": 9360 + }, + { + "epoch": 0.5872031083537006, + "grad_norm": 0.22262322902679443, + "learning_rate": 4.063773194570518e-05, + "loss": 0.1914, + "step": 9370 + }, + { + "epoch": 0.5878297925675252, + "grad_norm": 0.16995050013065338, + "learning_rate": 4.062717696480969e-05, + "loss": 0.2103, + "step": 9380 + }, + { + "epoch": 0.5884564767813498, + "grad_norm": 0.22897568345069885, + "learning_rate": 4.061662198391421e-05, + "loss": 0.1384, + "step": 9390 + }, + { + "epoch": 0.5890831609951745, + "grad_norm": 3.172271490097046, + "learning_rate": 4.0606067003018724e-05, + "loss": 0.0875, + "step": 9400 + }, + { + "epoch": 0.5897098452089992, + "grad_norm": 0.12132357060909271, + "learning_rate": 4.059551202212325e-05, + "loss": 0.1005, + "step": 9410 + }, + { + "epoch": 0.5903365294228239, + "grad_norm": 0.09609333425760269, + "learning_rate": 4.058495704122776e-05, + "loss": 0.046, + "step": 9420 + }, + { + "epoch": 0.5909632136366485, + "grad_norm": 3.4071130752563477, + "learning_rate": 4.0574402060332274e-05, + "loss": 0.1476, + "step": 9430 + }, + { + "epoch": 0.5915898978504731, + "grad_norm": 0.08280141651630402, + "learning_rate": 4.056384707943679e-05, + "loss": 0.1378, + "step": 9440 + }, + { + "epoch": 0.5922165820642978, + "grad_norm": 0.13492263853549957, + "learning_rate": 4.05532920985413e-05, + "loss": 0.2137, + "step": 9450 + }, + { + "epoch": 0.5928432662781224, + "grad_norm": 2.974963665008545, + "learning_rate": 4.054273711764582e-05, + "loss": 0.183, + "step": 9460 + }, + { + "epoch": 0.5934699504919471, + "grad_norm": 2.765462636947632, + "learning_rate": 4.0532182136750334e-05, + "loss": 0.168, + "step": 9470 + }, + { + "epoch": 0.5940966347057718, + "grad_norm": 0.14917774498462677, + "learning_rate": 4.052162715585485e-05, + "loss": 0.1352, + "step": 9480 + }, + { + "epoch": 0.5947233189195964, + "grad_norm": 0.1324753761291504, + "learning_rate": 4.051107217495936e-05, + "loss": 0.0498, + "step": 9490 + }, + { + "epoch": 0.5953500031334211, + "grad_norm": 3.4256207942962646, + "learning_rate": 4.050051719406388e-05, + "loss": 0.3867, + "step": 9500 + }, + { + "epoch": 0.5959766873472457, + "grad_norm": 2.0906643867492676, + "learning_rate": 4.04899622131684e-05, + "loss": 0.295, + "step": 9510 + }, + { + "epoch": 0.5966033715610704, + "grad_norm": 0.2582828104496002, + "learning_rate": 4.047940723227291e-05, + "loss": 0.2008, + "step": 9520 + }, + { + "epoch": 0.5972300557748951, + "grad_norm": 0.3075207769870758, + "learning_rate": 4.046885225137743e-05, + "loss": 0.0913, + "step": 9530 + }, + { + "epoch": 0.5978567399887197, + "grad_norm": 0.5495119690895081, + "learning_rate": 4.0458297270481944e-05, + "loss": 0.115, + "step": 9540 + }, + { + "epoch": 0.5984834242025443, + "grad_norm": 0.28728142380714417, + "learning_rate": 4.044774228958646e-05, + "loss": 0.1403, + "step": 9550 + }, + { + "epoch": 0.599110108416369, + "grad_norm": 0.25113600492477417, + "learning_rate": 4.043718730869097e-05, + "loss": 0.0978, + "step": 9560 + }, + { + "epoch": 0.5997367926301936, + "grad_norm": 11.35430908203125, + "learning_rate": 4.042663232779549e-05, + "loss": 0.2455, + "step": 9570 + }, + { + "epoch": 0.6003634768440183, + "grad_norm": 0.2805491089820862, + "learning_rate": 4.0416077346900004e-05, + "loss": 0.1732, + "step": 9580 + }, + { + "epoch": 0.600990161057843, + "grad_norm": 0.5016766786575317, + "learning_rate": 4.0405522366004514e-05, + "loss": 0.1413, + "step": 9590 + }, + { + "epoch": 0.6016168452716676, + "grad_norm": 0.1248757615685463, + "learning_rate": 4.039496738510904e-05, + "loss": 0.2073, + "step": 9600 + }, + { + "epoch": 0.6022435294854923, + "grad_norm": 0.18444593250751495, + "learning_rate": 4.0384412404213554e-05, + "loss": 0.1955, + "step": 9610 + }, + { + "epoch": 0.6028702136993169, + "grad_norm": 1.08712899684906, + "learning_rate": 4.037385742331807e-05, + "loss": 0.1148, + "step": 9620 + }, + { + "epoch": 0.6034968979131415, + "grad_norm": 0.23765408992767334, + "learning_rate": 4.036330244242258e-05, + "loss": 0.0364, + "step": 9630 + }, + { + "epoch": 0.6041235821269663, + "grad_norm": 0.44067704677581787, + "learning_rate": 4.03527474615271e-05, + "loss": 0.2301, + "step": 9640 + }, + { + "epoch": 0.6047502663407909, + "grad_norm": 0.13342992961406708, + "learning_rate": 4.0342192480631613e-05, + "loss": 0.0981, + "step": 9650 + }, + { + "epoch": 0.6053769505546155, + "grad_norm": 14.491141319274902, + "learning_rate": 4.033163749973613e-05, + "loss": 0.2648, + "step": 9660 + }, + { + "epoch": 0.6060036347684402, + "grad_norm": 0.32491153478622437, + "learning_rate": 4.032108251884064e-05, + "loss": 0.058, + "step": 9670 + }, + { + "epoch": 0.6066303189822648, + "grad_norm": 0.49822214245796204, + "learning_rate": 4.0310527537945157e-05, + "loss": 0.1284, + "step": 9680 + }, + { + "epoch": 0.6072570031960894, + "grad_norm": 0.06606928259134293, + "learning_rate": 4.029997255704967e-05, + "loss": 0.1202, + "step": 9690 + }, + { + "epoch": 0.6078836874099142, + "grad_norm": 0.10034257173538208, + "learning_rate": 4.028941757615419e-05, + "loss": 0.1395, + "step": 9700 + }, + { + "epoch": 0.6085103716237388, + "grad_norm": 3.284310817718506, + "learning_rate": 4.0278862595258707e-05, + "loss": 0.2535, + "step": 9710 + }, + { + "epoch": 0.6091370558375635, + "grad_norm": 3.463360548019409, + "learning_rate": 4.026830761436322e-05, + "loss": 0.1223, + "step": 9720 + }, + { + "epoch": 0.6097637400513881, + "grad_norm": 0.18776389956474304, + "learning_rate": 4.025775263346774e-05, + "loss": 0.0571, + "step": 9730 + }, + { + "epoch": 0.6103904242652127, + "grad_norm": 0.12362643331289291, + "learning_rate": 4.024719765257225e-05, + "loss": 0.0691, + "step": 9740 + }, + { + "epoch": 0.6110171084790375, + "grad_norm": 6.846742630004883, + "learning_rate": 4.0236642671676766e-05, + "loss": 0.1198, + "step": 9750 + }, + { + "epoch": 0.6116437926928621, + "grad_norm": 3.5688319206237793, + "learning_rate": 4.022608769078128e-05, + "loss": 0.2297, + "step": 9760 + }, + { + "epoch": 0.6122704769066867, + "grad_norm": 8.02388858795166, + "learning_rate": 4.021553270988579e-05, + "loss": 0.1796, + "step": 9770 + }, + { + "epoch": 0.6128971611205114, + "grad_norm": 0.3712387681007385, + "learning_rate": 4.020497772899031e-05, + "loss": 0.2178, + "step": 9780 + }, + { + "epoch": 0.613523845334336, + "grad_norm": 0.2477940171957016, + "learning_rate": 4.0194422748094826e-05, + "loss": 0.0652, + "step": 9790 + }, + { + "epoch": 0.6141505295481606, + "grad_norm": 4.8273210525512695, + "learning_rate": 4.018386776719934e-05, + "loss": 0.1627, + "step": 9800 + }, + { + "epoch": 0.6147772137619854, + "grad_norm": 0.5402454137802124, + "learning_rate": 4.017331278630386e-05, + "loss": 0.1057, + "step": 9810 + }, + { + "epoch": 0.61540389797581, + "grad_norm": 0.2719340920448303, + "learning_rate": 4.0162757805408376e-05, + "loss": 0.2223, + "step": 9820 + }, + { + "epoch": 0.6160305821896347, + "grad_norm": 0.45198434591293335, + "learning_rate": 4.015220282451289e-05, + "loss": 0.1297, + "step": 9830 + }, + { + "epoch": 0.6166572664034593, + "grad_norm": 2.249263048171997, + "learning_rate": 4.01416478436174e-05, + "loss": 0.3277, + "step": 9840 + }, + { + "epoch": 0.6172839506172839, + "grad_norm": 2.3682870864868164, + "learning_rate": 4.013109286272192e-05, + "loss": 0.0852, + "step": 9850 + }, + { + "epoch": 0.6179106348311086, + "grad_norm": 0.9312769770622253, + "learning_rate": 4.0120537881826436e-05, + "loss": 0.1365, + "step": 9860 + }, + { + "epoch": 0.6185373190449333, + "grad_norm": 0.24179698526859283, + "learning_rate": 4.010998290093095e-05, + "loss": 0.1263, + "step": 9870 + }, + { + "epoch": 0.6191640032587579, + "grad_norm": 0.24173693358898163, + "learning_rate": 4.009942792003546e-05, + "loss": 0.1996, + "step": 9880 + }, + { + "epoch": 0.6197906874725826, + "grad_norm": 0.07061693072319031, + "learning_rate": 4.008887293913998e-05, + "loss": 0.2337, + "step": 9890 + }, + { + "epoch": 0.6204173716864072, + "grad_norm": 0.650595486164093, + "learning_rate": 4.0078317958244496e-05, + "loss": 0.1252, + "step": 9900 + }, + { + "epoch": 0.6210440559002319, + "grad_norm": 0.09584268927574158, + "learning_rate": 4.006776297734901e-05, + "loss": 0.0382, + "step": 9910 + }, + { + "epoch": 0.6216707401140565, + "grad_norm": 61.076053619384766, + "learning_rate": 4.005720799645353e-05, + "loss": 0.0504, + "step": 9920 + }, + { + "epoch": 0.6222974243278812, + "grad_norm": 0.37995097041130066, + "learning_rate": 4.0046653015558046e-05, + "loss": 0.0523, + "step": 9930 + }, + { + "epoch": 0.6229241085417059, + "grad_norm": 15.864818572998047, + "learning_rate": 4.003609803466256e-05, + "loss": 0.2305, + "step": 9940 + }, + { + "epoch": 0.6235507927555305, + "grad_norm": 31.124530792236328, + "learning_rate": 4.002554305376707e-05, + "loss": 0.0783, + "step": 9950 + }, + { + "epoch": 0.6241774769693551, + "grad_norm": 3.1052074432373047, + "learning_rate": 4.001498807287159e-05, + "loss": 0.0864, + "step": 9960 + }, + { + "epoch": 0.6248041611831798, + "grad_norm": 0.4385007619857788, + "learning_rate": 4.0004433091976106e-05, + "loss": 0.323, + "step": 9970 + }, + { + "epoch": 0.6254308453970044, + "grad_norm": 2.410914659500122, + "learning_rate": 3.9993878111080615e-05, + "loss": 0.0486, + "step": 9980 + }, + { + "epoch": 0.626057529610829, + "grad_norm": 0.31448647379875183, + "learning_rate": 3.998332313018513e-05, + "loss": 0.1045, + "step": 9990 + }, + { + "epoch": 0.6266842138246538, + "grad_norm": 0.04953588917851448, + "learning_rate": 3.9972768149289656e-05, + "loss": 0.0343, + "step": 10000 + }, + { + "epoch": 0.6273108980384784, + "grad_norm": 0.04356538504362106, + "learning_rate": 3.996221316839417e-05, + "loss": 0.2221, + "step": 10010 + }, + { + "epoch": 0.6279375822523031, + "grad_norm": 0.3184496760368347, + "learning_rate": 3.995165818749868e-05, + "loss": 0.2251, + "step": 10020 + }, + { + "epoch": 0.6285642664661277, + "grad_norm": 0.6410167217254639, + "learning_rate": 3.99411032066032e-05, + "loss": 0.0652, + "step": 10030 + }, + { + "epoch": 0.6291909506799523, + "grad_norm": 3.2516913414001465, + "learning_rate": 3.9930548225707715e-05, + "loss": 0.1548, + "step": 10040 + }, + { + "epoch": 0.6298176348937771, + "grad_norm": 0.24519649147987366, + "learning_rate": 3.991999324481223e-05, + "loss": 0.1789, + "step": 10050 + }, + { + "epoch": 0.6304443191076017, + "grad_norm": 0.1177930012345314, + "learning_rate": 3.990943826391674e-05, + "loss": 0.1067, + "step": 10060 + }, + { + "epoch": 0.6310710033214263, + "grad_norm": 0.08309656381607056, + "learning_rate": 3.989888328302126e-05, + "loss": 0.0873, + "step": 10070 + }, + { + "epoch": 0.631697687535251, + "grad_norm": 0.3515457212924957, + "learning_rate": 3.9888328302125775e-05, + "loss": 0.3244, + "step": 10080 + }, + { + "epoch": 0.6323243717490756, + "grad_norm": 0.4474419355392456, + "learning_rate": 3.9877773321230285e-05, + "loss": 0.1005, + "step": 10090 + }, + { + "epoch": 0.6329510559629002, + "grad_norm": 0.13745348155498505, + "learning_rate": 3.986721834033481e-05, + "loss": 0.2018, + "step": 10100 + }, + { + "epoch": 0.633577740176725, + "grad_norm": 0.32032960653305054, + "learning_rate": 3.9856663359439325e-05, + "loss": 0.069, + "step": 10110 + }, + { + "epoch": 0.6342044243905496, + "grad_norm": 2.037172317504883, + "learning_rate": 3.984610837854384e-05, + "loss": 0.0672, + "step": 10120 + }, + { + "epoch": 0.6348311086043743, + "grad_norm": 5.471995830535889, + "learning_rate": 3.983555339764835e-05, + "loss": 0.2724, + "step": 10130 + }, + { + "epoch": 0.6354577928181989, + "grad_norm": 0.14681348204612732, + "learning_rate": 3.982499841675287e-05, + "loss": 0.1718, + "step": 10140 + }, + { + "epoch": 0.6360844770320235, + "grad_norm": 1.499589443206787, + "learning_rate": 3.9814443435857385e-05, + "loss": 0.1107, + "step": 10150 + }, + { + "epoch": 0.6367111612458483, + "grad_norm": 17.15564727783203, + "learning_rate": 3.9803888454961895e-05, + "loss": 0.1474, + "step": 10160 + }, + { + "epoch": 0.6373378454596729, + "grad_norm": 2.377847909927368, + "learning_rate": 3.979333347406641e-05, + "loss": 0.0473, + "step": 10170 + }, + { + "epoch": 0.6379645296734975, + "grad_norm": 1.3150869607925415, + "learning_rate": 3.978277849317093e-05, + "loss": 0.2218, + "step": 10180 + }, + { + "epoch": 0.6385912138873222, + "grad_norm": 0.0968065857887268, + "learning_rate": 3.9772223512275445e-05, + "loss": 0.1458, + "step": 10190 + }, + { + "epoch": 0.6392178981011468, + "grad_norm": 0.0983181819319725, + "learning_rate": 3.976166853137996e-05, + "loss": 0.0453, + "step": 10200 + }, + { + "epoch": 0.6398445823149714, + "grad_norm": 0.128788560628891, + "learning_rate": 3.975111355048448e-05, + "loss": 0.077, + "step": 10210 + }, + { + "epoch": 0.6404712665287962, + "grad_norm": 0.09855977445840836, + "learning_rate": 3.9740558569588995e-05, + "loss": 0.1964, + "step": 10220 + }, + { + "epoch": 0.6410979507426208, + "grad_norm": 0.07261627167463303, + "learning_rate": 3.9730003588693505e-05, + "loss": 0.117, + "step": 10230 + }, + { + "epoch": 0.6417246349564455, + "grad_norm": 0.1142692044377327, + "learning_rate": 3.971944860779802e-05, + "loss": 0.0995, + "step": 10240 + }, + { + "epoch": 0.6423513191702701, + "grad_norm": 0.12659485638141632, + "learning_rate": 3.970889362690254e-05, + "loss": 0.0944, + "step": 10250 + }, + { + "epoch": 0.6429780033840947, + "grad_norm": 7.10390567779541, + "learning_rate": 3.9698338646007055e-05, + "loss": 0.3312, + "step": 10260 + }, + { + "epoch": 0.6436046875979194, + "grad_norm": 0.6107752919197083, + "learning_rate": 3.9687783665111564e-05, + "loss": 0.0876, + "step": 10270 + }, + { + "epoch": 0.6442313718117441, + "grad_norm": 0.16075240075588226, + "learning_rate": 3.967722868421608e-05, + "loss": 0.0315, + "step": 10280 + }, + { + "epoch": 0.6448580560255687, + "grad_norm": 0.17103694379329681, + "learning_rate": 3.96666737033206e-05, + "loss": 0.1716, + "step": 10290 + }, + { + "epoch": 0.6454847402393934, + "grad_norm": 0.08142802864313126, + "learning_rate": 3.9656118722425114e-05, + "loss": 0.1019, + "step": 10300 + }, + { + "epoch": 0.646111424453218, + "grad_norm": 0.07293316721916199, + "learning_rate": 3.964556374152963e-05, + "loss": 0.1578, + "step": 10310 + }, + { + "epoch": 0.6467381086670427, + "grad_norm": 0.09897245466709137, + "learning_rate": 3.963500876063415e-05, + "loss": 0.1362, + "step": 10320 + }, + { + "epoch": 0.6473647928808673, + "grad_norm": 3.195725202560425, + "learning_rate": 3.9624453779738664e-05, + "loss": 0.073, + "step": 10330 + }, + { + "epoch": 0.647991477094692, + "grad_norm": 2.7055459022521973, + "learning_rate": 3.9613898798843174e-05, + "loss": 0.3671, + "step": 10340 + }, + { + "epoch": 0.6486181613085167, + "grad_norm": 0.9218514561653137, + "learning_rate": 3.960334381794769e-05, + "loss": 0.0901, + "step": 10350 + }, + { + "epoch": 0.6492448455223413, + "grad_norm": 0.10140139609575272, + "learning_rate": 3.959278883705221e-05, + "loss": 0.1005, + "step": 10360 + }, + { + "epoch": 0.6498715297361659, + "grad_norm": 0.12973734736442566, + "learning_rate": 3.958223385615672e-05, + "loss": 0.098, + "step": 10370 + }, + { + "epoch": 0.6504982139499906, + "grad_norm": 0.13121546804904938, + "learning_rate": 3.9571678875261234e-05, + "loss": 0.0977, + "step": 10380 + }, + { + "epoch": 0.6511248981638152, + "grad_norm": 0.043394092470407486, + "learning_rate": 3.956112389436575e-05, + "loss": 0.2903, + "step": 10390 + }, + { + "epoch": 0.6517515823776399, + "grad_norm": 0.1369188278913498, + "learning_rate": 3.955056891347027e-05, + "loss": 0.0737, + "step": 10400 + }, + { + "epoch": 0.6523782665914646, + "grad_norm": 0.2751493752002716, + "learning_rate": 3.9540013932574784e-05, + "loss": 0.0196, + "step": 10410 + }, + { + "epoch": 0.6530049508052892, + "grad_norm": 11.32926082611084, + "learning_rate": 3.95294589516793e-05, + "loss": 0.1617, + "step": 10420 + }, + { + "epoch": 0.6536316350191139, + "grad_norm": 0.5413795113563538, + "learning_rate": 3.951890397078382e-05, + "loss": 0.1212, + "step": 10430 + }, + { + "epoch": 0.6542583192329385, + "grad_norm": 0.045053768903017044, + "learning_rate": 3.950834898988833e-05, + "loss": 0.2104, + "step": 10440 + }, + { + "epoch": 0.6548850034467631, + "grad_norm": 3.709371328353882, + "learning_rate": 3.9497794008992844e-05, + "loss": 0.1815, + "step": 10450 + }, + { + "epoch": 0.6555116876605879, + "grad_norm": 0.1551133543252945, + "learning_rate": 3.948723902809736e-05, + "loss": 0.2463, + "step": 10460 + }, + { + "epoch": 0.6561383718744125, + "grad_norm": 0.2704763114452362, + "learning_rate": 3.947668404720188e-05, + "loss": 0.2314, + "step": 10470 + }, + { + "epoch": 0.6567650560882371, + "grad_norm": 0.2972564399242401, + "learning_rate": 3.946612906630639e-05, + "loss": 0.2011, + "step": 10480 + }, + { + "epoch": 0.6573917403020618, + "grad_norm": 3.8430252075195312, + "learning_rate": 3.9455574085410904e-05, + "loss": 0.1605, + "step": 10490 + }, + { + "epoch": 0.6580184245158864, + "grad_norm": 2.99223256111145, + "learning_rate": 3.944501910451543e-05, + "loss": 0.2343, + "step": 10500 + }, + { + "epoch": 0.658645108729711, + "grad_norm": 2.406914710998535, + "learning_rate": 3.9434464123619944e-05, + "loss": 0.3027, + "step": 10510 + }, + { + "epoch": 0.6592717929435358, + "grad_norm": 5.0974249839782715, + "learning_rate": 3.9423909142724454e-05, + "loss": 0.2039, + "step": 10520 + }, + { + "epoch": 0.6598984771573604, + "grad_norm": 0.17414742708206177, + "learning_rate": 3.941335416182897e-05, + "loss": 0.0915, + "step": 10530 + }, + { + "epoch": 0.6605251613711851, + "grad_norm": 2.815800189971924, + "learning_rate": 3.940279918093349e-05, + "loss": 0.1515, + "step": 10540 + }, + { + "epoch": 0.6611518455850097, + "grad_norm": 0.3265558183193207, + "learning_rate": 3.9392244200038e-05, + "loss": 0.0477, + "step": 10550 + }, + { + "epoch": 0.6617785297988343, + "grad_norm": 0.16620047390460968, + "learning_rate": 3.9381689219142513e-05, + "loss": 0.1954, + "step": 10560 + }, + { + "epoch": 0.6624052140126591, + "grad_norm": 3.4621222019195557, + "learning_rate": 3.937113423824703e-05, + "loss": 0.2202, + "step": 10570 + }, + { + "epoch": 0.6630318982264837, + "grad_norm": 0.23589849472045898, + "learning_rate": 3.936057925735155e-05, + "loss": 0.1349, + "step": 10580 + }, + { + "epoch": 0.6636585824403083, + "grad_norm": 0.573025643825531, + "learning_rate": 3.935002427645606e-05, + "loss": 0.1222, + "step": 10590 + }, + { + "epoch": 0.664285266654133, + "grad_norm": 0.22413015365600586, + "learning_rate": 3.933946929556058e-05, + "loss": 0.1157, + "step": 10600 + }, + { + "epoch": 0.6649119508679576, + "grad_norm": 0.8842704892158508, + "learning_rate": 3.93289143146651e-05, + "loss": 0.0963, + "step": 10610 + }, + { + "epoch": 0.6655386350817822, + "grad_norm": 2.976203680038452, + "learning_rate": 3.931835933376961e-05, + "loss": 0.1808, + "step": 10620 + }, + { + "epoch": 0.666165319295607, + "grad_norm": 0.048292968422174454, + "learning_rate": 3.930780435287412e-05, + "loss": 0.0867, + "step": 10630 + }, + { + "epoch": 0.6667920035094316, + "grad_norm": 0.40534451603889465, + "learning_rate": 3.929724937197864e-05, + "loss": 0.2062, + "step": 10640 + }, + { + "epoch": 0.6674186877232563, + "grad_norm": 0.3869112432003021, + "learning_rate": 3.9286694391083157e-05, + "loss": 0.0908, + "step": 10650 + }, + { + "epoch": 0.6680453719370809, + "grad_norm": 3.531752824783325, + "learning_rate": 3.9276139410187666e-05, + "loss": 0.197, + "step": 10660 + }, + { + "epoch": 0.6686720561509055, + "grad_norm": 1.0175777673721313, + "learning_rate": 3.926558442929218e-05, + "loss": 0.1265, + "step": 10670 + }, + { + "epoch": 0.6692987403647302, + "grad_norm": 55.90322494506836, + "learning_rate": 3.92550294483967e-05, + "loss": 0.0909, + "step": 10680 + }, + { + "epoch": 0.6699254245785549, + "grad_norm": 0.06375054270029068, + "learning_rate": 3.9244474467501216e-05, + "loss": 0.3127, + "step": 10690 + }, + { + "epoch": 0.6705521087923795, + "grad_norm": 1.1217492818832397, + "learning_rate": 3.923391948660573e-05, + "loss": 0.0276, + "step": 10700 + }, + { + "epoch": 0.6711787930062042, + "grad_norm": 0.06201806664466858, + "learning_rate": 3.922336450571025e-05, + "loss": 0.009, + "step": 10710 + }, + { + "epoch": 0.6718054772200288, + "grad_norm": 28.970144271850586, + "learning_rate": 3.9212809524814766e-05, + "loss": 0.016, + "step": 10720 + }, + { + "epoch": 0.6724321614338534, + "grad_norm": 0.061413247138261795, + "learning_rate": 3.9202254543919276e-05, + "loss": 0.2635, + "step": 10730 + }, + { + "epoch": 0.6730588456476782, + "grad_norm": 10.785362243652344, + "learning_rate": 3.919169956302379e-05, + "loss": 0.325, + "step": 10740 + }, + { + "epoch": 0.6736855298615028, + "grad_norm": 0.165009006857872, + "learning_rate": 3.918114458212831e-05, + "loss": 0.2889, + "step": 10750 + }, + { + "epoch": 0.6743122140753275, + "grad_norm": 0.1995621770620346, + "learning_rate": 3.917058960123282e-05, + "loss": 0.0757, + "step": 10760 + }, + { + "epoch": 0.6749388982891521, + "grad_norm": 12.344490051269531, + "learning_rate": 3.9160034620337336e-05, + "loss": 0.0832, + "step": 10770 + }, + { + "epoch": 0.6755655825029767, + "grad_norm": 0.12043475359678268, + "learning_rate": 3.914947963944185e-05, + "loss": 0.1071, + "step": 10780 + }, + { + "epoch": 0.6761922667168014, + "grad_norm": 3.2827420234680176, + "learning_rate": 3.913892465854637e-05, + "loss": 0.2466, + "step": 10790 + }, + { + "epoch": 0.676818950930626, + "grad_norm": 2.662658929824829, + "learning_rate": 3.9128369677650886e-05, + "loss": 0.2005, + "step": 10800 + }, + { + "epoch": 0.6774456351444507, + "grad_norm": 3.157073974609375, + "learning_rate": 3.91178146967554e-05, + "loss": 0.3099, + "step": 10810 + }, + { + "epoch": 0.6780723193582754, + "grad_norm": 0.13340094685554504, + "learning_rate": 3.910725971585992e-05, + "loss": 0.0161, + "step": 10820 + }, + { + "epoch": 0.6786990035721, + "grad_norm": 0.18897636234760284, + "learning_rate": 3.909670473496443e-05, + "loss": 0.189, + "step": 10830 + }, + { + "epoch": 0.6793256877859247, + "grad_norm": 0.26442772150039673, + "learning_rate": 3.9086149754068946e-05, + "loss": 0.2095, + "step": 10840 + }, + { + "epoch": 0.6799523719997493, + "grad_norm": 2.244516134262085, + "learning_rate": 3.907559477317346e-05, + "loss": 0.21, + "step": 10850 + }, + { + "epoch": 0.680579056213574, + "grad_norm": 54.9011116027832, + "learning_rate": 3.906503979227798e-05, + "loss": 0.0742, + "step": 10860 + }, + { + "epoch": 0.6812057404273987, + "grad_norm": 0.1011706218123436, + "learning_rate": 3.905448481138249e-05, + "loss": 0.0987, + "step": 10870 + }, + { + "epoch": 0.6818324246412233, + "grad_norm": 1.3590539693832397, + "learning_rate": 3.9043929830487006e-05, + "loss": 0.0191, + "step": 10880 + }, + { + "epoch": 0.6824591088550479, + "grad_norm": 0.1384275257587433, + "learning_rate": 3.903337484959152e-05, + "loss": 0.1315, + "step": 10890 + }, + { + "epoch": 0.6830857930688726, + "grad_norm": 0.08217030763626099, + "learning_rate": 3.902281986869604e-05, + "loss": 0.1368, + "step": 10900 + }, + { + "epoch": 0.6837124772826972, + "grad_norm": 0.11884766072034836, + "learning_rate": 3.9012264887800556e-05, + "loss": 0.1184, + "step": 10910 + }, + { + "epoch": 0.6843391614965219, + "grad_norm": 0.09507952630519867, + "learning_rate": 3.900170990690507e-05, + "loss": 0.1407, + "step": 10920 + }, + { + "epoch": 0.6849658457103466, + "grad_norm": 0.34754639863967896, + "learning_rate": 3.899115492600959e-05, + "loss": 0.0828, + "step": 10930 + }, + { + "epoch": 0.6855925299241712, + "grad_norm": 0.1255376785993576, + "learning_rate": 3.89805999451141e-05, + "loss": 0.1745, + "step": 10940 + }, + { + "epoch": 0.6862192141379959, + "grad_norm": 3.3651392459869385, + "learning_rate": 3.8970044964218615e-05, + "loss": 0.2798, + "step": 10950 + }, + { + "epoch": 0.6868458983518205, + "grad_norm": 0.08753059059381485, + "learning_rate": 3.895948998332313e-05, + "loss": 0.0903, + "step": 10960 + }, + { + "epoch": 0.6874725825656451, + "grad_norm": 0.11063142120838165, + "learning_rate": 3.894893500242765e-05, + "loss": 0.1449, + "step": 10970 + }, + { + "epoch": 0.6880992667794699, + "grad_norm": 0.11355997622013092, + "learning_rate": 3.893838002153216e-05, + "loss": 0.1304, + "step": 10980 + }, + { + "epoch": 0.6887259509932945, + "grad_norm": 0.0895422026515007, + "learning_rate": 3.8927825040636675e-05, + "loss": 0.0438, + "step": 10990 + }, + { + "epoch": 0.6893526352071191, + "grad_norm": 0.05675465241074562, + "learning_rate": 3.89172700597412e-05, + "loss": 0.0408, + "step": 11000 + }, + { + "epoch": 0.6899793194209438, + "grad_norm": 0.7115016579627991, + "learning_rate": 3.890671507884571e-05, + "loss": 0.1766, + "step": 11010 + }, + { + "epoch": 0.6906060036347684, + "grad_norm": 0.09861718118190765, + "learning_rate": 3.8896160097950225e-05, + "loss": 0.1257, + "step": 11020 + }, + { + "epoch": 0.691232687848593, + "grad_norm": 0.9432891607284546, + "learning_rate": 3.888560511705474e-05, + "loss": 0.2082, + "step": 11030 + }, + { + "epoch": 0.6918593720624178, + "grad_norm": 0.3203168213367462, + "learning_rate": 3.887505013615926e-05, + "loss": 0.1755, + "step": 11040 + }, + { + "epoch": 0.6924860562762424, + "grad_norm": 0.22065997123718262, + "learning_rate": 3.886449515526377e-05, + "loss": 0.1434, + "step": 11050 + }, + { + "epoch": 0.6931127404900671, + "grad_norm": 0.18533888459205627, + "learning_rate": 3.8853940174368285e-05, + "loss": 0.036, + "step": 11060 + }, + { + "epoch": 0.6937394247038917, + "grad_norm": 0.09828941524028778, + "learning_rate": 3.88433851934728e-05, + "loss": 0.0293, + "step": 11070 + }, + { + "epoch": 0.6943661089177163, + "grad_norm": 16.122100830078125, + "learning_rate": 3.883283021257731e-05, + "loss": 0.1103, + "step": 11080 + }, + { + "epoch": 0.694992793131541, + "grad_norm": 0.05723113566637039, + "learning_rate": 3.8822275231681835e-05, + "loss": 0.1173, + "step": 11090 + }, + { + "epoch": 0.6956194773453657, + "grad_norm": 8.0460205078125, + "learning_rate": 3.881172025078635e-05, + "loss": 0.1969, + "step": 11100 + }, + { + "epoch": 0.6962461615591903, + "grad_norm": 0.07896614819765091, + "learning_rate": 3.880116526989087e-05, + "loss": 0.0467, + "step": 11110 + }, + { + "epoch": 0.696872845773015, + "grad_norm": 5.321878433227539, + "learning_rate": 3.879061028899538e-05, + "loss": 0.1794, + "step": 11120 + }, + { + "epoch": 0.6974995299868396, + "grad_norm": 0.0769563838839531, + "learning_rate": 3.8780055308099895e-05, + "loss": 0.0785, + "step": 11130 + }, + { + "epoch": 0.6981262142006642, + "grad_norm": 2.8381550312042236, + "learning_rate": 3.876950032720441e-05, + "loss": 0.0109, + "step": 11140 + }, + { + "epoch": 0.698752898414489, + "grad_norm": 0.10958783328533173, + "learning_rate": 3.875894534630892e-05, + "loss": 0.1323, + "step": 11150 + }, + { + "epoch": 0.6993795826283136, + "grad_norm": 0.050287846475839615, + "learning_rate": 3.874839036541344e-05, + "loss": 0.1248, + "step": 11160 + }, + { + "epoch": 0.7000062668421383, + "grad_norm": 0.07455500215291977, + "learning_rate": 3.8737835384517955e-05, + "loss": 0.1604, + "step": 11170 + }, + { + "epoch": 0.7006329510559629, + "grad_norm": 0.1684502214193344, + "learning_rate": 3.872728040362247e-05, + "loss": 0.308, + "step": 11180 + }, + { + "epoch": 0.7012596352697875, + "grad_norm": 0.10313006490468979, + "learning_rate": 3.871672542272699e-05, + "loss": 0.1469, + "step": 11190 + }, + { + "epoch": 0.7018863194836122, + "grad_norm": 0.11166546493768692, + "learning_rate": 3.8706170441831505e-05, + "loss": 0.2037, + "step": 11200 + }, + { + "epoch": 0.7025130036974369, + "grad_norm": 3.201416015625, + "learning_rate": 3.869561546093602e-05, + "loss": 0.1579, + "step": 11210 + }, + { + "epoch": 0.7031396879112615, + "grad_norm": 0.16494852304458618, + "learning_rate": 3.868506048004053e-05, + "loss": 0.1543, + "step": 11220 + }, + { + "epoch": 0.7037663721250862, + "grad_norm": 3.2456631660461426, + "learning_rate": 3.867450549914505e-05, + "loss": 0.1165, + "step": 11230 + }, + { + "epoch": 0.7043930563389108, + "grad_norm": 1.6267149448394775, + "learning_rate": 3.8663950518249564e-05, + "loss": 0.1047, + "step": 11240 + }, + { + "epoch": 0.7050197405527355, + "grad_norm": 0.4653446078300476, + "learning_rate": 3.865339553735408e-05, + "loss": 0.09, + "step": 11250 + }, + { + "epoch": 0.7056464247665601, + "grad_norm": 6.371344089508057, + "learning_rate": 3.864284055645859e-05, + "loss": 0.3287, + "step": 11260 + }, + { + "epoch": 0.7062731089803848, + "grad_norm": 0.11281926184892654, + "learning_rate": 3.863228557556311e-05, + "loss": 0.0415, + "step": 11270 + }, + { + "epoch": 0.7068997931942095, + "grad_norm": 0.0942821130156517, + "learning_rate": 3.8621730594667624e-05, + "loss": 0.0532, + "step": 11280 + }, + { + "epoch": 0.7075264774080341, + "grad_norm": 0.2059144377708435, + "learning_rate": 3.861117561377214e-05, + "loss": 0.2404, + "step": 11290 + }, + { + "epoch": 0.7081531616218587, + "grad_norm": 0.08037558943033218, + "learning_rate": 3.860062063287666e-05, + "loss": 0.1778, + "step": 11300 + }, + { + "epoch": 0.7087798458356834, + "grad_norm": 9.817173957824707, + "learning_rate": 3.8590065651981174e-05, + "loss": 0.202, + "step": 11310 + }, + { + "epoch": 0.709406530049508, + "grad_norm": 0.15982958674430847, + "learning_rate": 3.857951067108569e-05, + "loss": 0.2136, + "step": 11320 + }, + { + "epoch": 0.7100332142633327, + "grad_norm": 0.2970063388347626, + "learning_rate": 3.85689556901902e-05, + "loss": 0.1994, + "step": 11330 + }, + { + "epoch": 0.7106598984771574, + "grad_norm": 0.29430776834487915, + "learning_rate": 3.855840070929472e-05, + "loss": 0.2018, + "step": 11340 + }, + { + "epoch": 0.711286582690982, + "grad_norm": 0.15369923412799835, + "learning_rate": 3.8547845728399234e-05, + "loss": 0.0427, + "step": 11350 + }, + { + "epoch": 0.7119132669048067, + "grad_norm": 0.20897263288497925, + "learning_rate": 3.853729074750375e-05, + "loss": 0.2163, + "step": 11360 + }, + { + "epoch": 0.7125399511186313, + "grad_norm": 9.025155067443848, + "learning_rate": 3.852673576660826e-05, + "loss": 0.2577, + "step": 11370 + }, + { + "epoch": 0.713166635332456, + "grad_norm": 11.082793235778809, + "learning_rate": 3.851618078571278e-05, + "loss": 0.3279, + "step": 11380 + }, + { + "epoch": 0.7137933195462807, + "grad_norm": 7.146550178527832, + "learning_rate": 3.8505625804817294e-05, + "loss": 0.2959, + "step": 11390 + }, + { + "epoch": 0.7144200037601053, + "grad_norm": 0.25585150718688965, + "learning_rate": 3.849507082392181e-05, + "loss": 0.149, + "step": 11400 + }, + { + "epoch": 0.7150466879739299, + "grad_norm": 5.5770263671875, + "learning_rate": 3.848451584302633e-05, + "loss": 0.1444, + "step": 11410 + }, + { + "epoch": 0.7156733721877546, + "grad_norm": 8.812726020812988, + "learning_rate": 3.8473960862130844e-05, + "loss": 0.1645, + "step": 11420 + }, + { + "epoch": 0.7163000564015792, + "grad_norm": 0.081893190741539, + "learning_rate": 3.846340588123536e-05, + "loss": 0.0176, + "step": 11430 + }, + { + "epoch": 0.7169267406154038, + "grad_norm": 0.06406759470701218, + "learning_rate": 3.845285090033987e-05, + "loss": 0.1763, + "step": 11440 + }, + { + "epoch": 0.7175534248292286, + "grad_norm": 0.17364154756069183, + "learning_rate": 3.844229591944439e-05, + "loss": 0.1932, + "step": 11450 + }, + { + "epoch": 0.7181801090430532, + "grad_norm": 0.14099130034446716, + "learning_rate": 3.8431740938548904e-05, + "loss": 0.1635, + "step": 11460 + }, + { + "epoch": 0.7188067932568779, + "grad_norm": 0.25819849967956543, + "learning_rate": 3.8421185957653414e-05, + "loss": 0.0816, + "step": 11470 + }, + { + "epoch": 0.7194334774707025, + "grad_norm": 0.05951720103621483, + "learning_rate": 3.841063097675793e-05, + "loss": 0.0603, + "step": 11480 + }, + { + "epoch": 0.7200601616845271, + "grad_norm": 0.12080153822898865, + "learning_rate": 3.840007599586245e-05, + "loss": 0.2307, + "step": 11490 + }, + { + "epoch": 0.7206868458983519, + "grad_norm": 0.12720264494419098, + "learning_rate": 3.838952101496697e-05, + "loss": 0.1466, + "step": 11500 + }, + { + "epoch": 0.7213135301121765, + "grad_norm": 4.570608615875244, + "learning_rate": 3.837896603407148e-05, + "loss": 0.3491, + "step": 11510 + }, + { + "epoch": 0.7219402143260011, + "grad_norm": 1.9697740077972412, + "learning_rate": 3.8368411053176e-05, + "loss": 0.0256, + "step": 11520 + }, + { + "epoch": 0.7225668985398258, + "grad_norm": 1.388838291168213, + "learning_rate": 3.8357856072280513e-05, + "loss": 0.0135, + "step": 11530 + }, + { + "epoch": 0.7231935827536504, + "grad_norm": 0.06155795976519585, + "learning_rate": 3.834730109138502e-05, + "loss": 0.0743, + "step": 11540 + }, + { + "epoch": 0.723820266967475, + "grad_norm": 0.09652787446975708, + "learning_rate": 3.833674611048954e-05, + "loss": 0.3616, + "step": 11550 + }, + { + "epoch": 0.7244469511812998, + "grad_norm": 0.12950001657009125, + "learning_rate": 3.832619112959406e-05, + "loss": 0.1245, + "step": 11560 + }, + { + "epoch": 0.7250736353951244, + "grad_norm": 2.212172031402588, + "learning_rate": 3.831563614869857e-05, + "loss": 0.2461, + "step": 11570 + }, + { + "epoch": 0.7257003196089491, + "grad_norm": 0.1823960393667221, + "learning_rate": 3.830508116780308e-05, + "loss": 0.182, + "step": 11580 + }, + { + "epoch": 0.7263270038227737, + "grad_norm": 0.16792289912700653, + "learning_rate": 3.8294526186907607e-05, + "loss": 0.0991, + "step": 11590 + }, + { + "epoch": 0.7269536880365983, + "grad_norm": 8.197361946105957, + "learning_rate": 3.828397120601212e-05, + "loss": 0.2588, + "step": 11600 + }, + { + "epoch": 0.727580372250423, + "grad_norm": 0.7492303252220154, + "learning_rate": 3.827341622511663e-05, + "loss": 0.0935, + "step": 11610 + }, + { + "epoch": 0.7282070564642477, + "grad_norm": 0.12691091001033783, + "learning_rate": 3.826286124422115e-05, + "loss": 0.1913, + "step": 11620 + }, + { + "epoch": 0.7288337406780723, + "grad_norm": 0.3421546220779419, + "learning_rate": 3.8252306263325666e-05, + "loss": 0.1469, + "step": 11630 + }, + { + "epoch": 0.729460424891897, + "grad_norm": 0.10125494748353958, + "learning_rate": 3.824175128243018e-05, + "loss": 0.0964, + "step": 11640 + }, + { + "epoch": 0.7300871091057216, + "grad_norm": 0.06703011691570282, + "learning_rate": 3.823119630153469e-05, + "loss": 0.0761, + "step": 11650 + }, + { + "epoch": 0.7307137933195463, + "grad_norm": 3.4559688568115234, + "learning_rate": 3.822064132063921e-05, + "loss": 0.1696, + "step": 11660 + }, + { + "epoch": 0.731340477533371, + "grad_norm": 0.1432192325592041, + "learning_rate": 3.8210086339743726e-05, + "loss": 0.1375, + "step": 11670 + }, + { + "epoch": 0.7319671617471956, + "grad_norm": 1.798836588859558, + "learning_rate": 3.819953135884824e-05, + "loss": 0.0559, + "step": 11680 + }, + { + "epoch": 0.7325938459610203, + "grad_norm": 1.9000601768493652, + "learning_rate": 3.818897637795276e-05, + "loss": 0.0461, + "step": 11690 + }, + { + "epoch": 0.7332205301748449, + "grad_norm": 0.08201713860034943, + "learning_rate": 3.8178421397057276e-05, + "loss": 0.0311, + "step": 11700 + }, + { + "epoch": 0.7338472143886695, + "grad_norm": 0.09136554598808289, + "learning_rate": 3.816786641616179e-05, + "loss": 0.1495, + "step": 11710 + }, + { + "epoch": 0.7344738986024942, + "grad_norm": 0.08365706354379654, + "learning_rate": 3.81573114352663e-05, + "loss": 0.3126, + "step": 11720 + }, + { + "epoch": 0.7351005828163188, + "grad_norm": 0.06393969058990479, + "learning_rate": 3.814675645437082e-05, + "loss": 0.1961, + "step": 11730 + }, + { + "epoch": 0.7357272670301435, + "grad_norm": 0.34764620661735535, + "learning_rate": 3.8136201473475336e-05, + "loss": 0.0926, + "step": 11740 + }, + { + "epoch": 0.7363539512439682, + "grad_norm": 14.139609336853027, + "learning_rate": 3.812564649257985e-05, + "loss": 0.1091, + "step": 11750 + }, + { + "epoch": 0.7369806354577928, + "grad_norm": 0.06565447896718979, + "learning_rate": 3.811509151168436e-05, + "loss": 0.2524, + "step": 11760 + }, + { + "epoch": 0.7376073196716175, + "grad_norm": 0.4182433784008026, + "learning_rate": 3.810453653078888e-05, + "loss": 0.2786, + "step": 11770 + }, + { + "epoch": 0.7382340038854421, + "grad_norm": 5.026593208312988, + "learning_rate": 3.8093981549893396e-05, + "loss": 0.1597, + "step": 11780 + }, + { + "epoch": 0.7388606880992667, + "grad_norm": 0.27505579590797424, + "learning_rate": 3.808342656899791e-05, + "loss": 0.0146, + "step": 11790 + }, + { + "epoch": 0.7394873723130915, + "grad_norm": 5.809450626373291, + "learning_rate": 3.807287158810243e-05, + "loss": 0.1832, + "step": 11800 + }, + { + "epoch": 0.7401140565269161, + "grad_norm": 6.0504679679870605, + "learning_rate": 3.8062316607206946e-05, + "loss": 0.1652, + "step": 11810 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.35781294107437134, + "learning_rate": 3.805176162631146e-05, + "loss": 0.1793, + "step": 11820 + }, + { + "epoch": 0.7413674249545654, + "grad_norm": 0.09163035452365875, + "learning_rate": 3.804120664541597e-05, + "loss": 0.1139, + "step": 11830 + }, + { + "epoch": 0.74199410916839, + "grad_norm": 68.03575897216797, + "learning_rate": 3.803065166452049e-05, + "loss": 0.085, + "step": 11840 + }, + { + "epoch": 0.7426207933822147, + "grad_norm": 0.11376772820949554, + "learning_rate": 3.8020096683625006e-05, + "loss": 0.0952, + "step": 11850 + }, + { + "epoch": 0.7432474775960394, + "grad_norm": 2.6789710521698, + "learning_rate": 3.8009541702729516e-05, + "loss": 0.1616, + "step": 11860 + }, + { + "epoch": 0.743874161809864, + "grad_norm": 0.31956803798675537, + "learning_rate": 3.799898672183403e-05, + "loss": 0.1641, + "step": 11870 + }, + { + "epoch": 0.7445008460236887, + "grad_norm": 0.1959105134010315, + "learning_rate": 3.798843174093855e-05, + "loss": 0.1341, + "step": 11880 + }, + { + "epoch": 0.7451275302375133, + "grad_norm": 2.517446517944336, + "learning_rate": 3.7977876760043065e-05, + "loss": 0.1314, + "step": 11890 + }, + { + "epoch": 0.7457542144513379, + "grad_norm": 0.07533478736877441, + "learning_rate": 3.796732177914758e-05, + "loss": 0.1315, + "step": 11900 + }, + { + "epoch": 0.7463808986651627, + "grad_norm": 0.7237932682037354, + "learning_rate": 3.79567667982521e-05, + "loss": 0.1167, + "step": 11910 + }, + { + "epoch": 0.7470075828789873, + "grad_norm": 6.23661994934082, + "learning_rate": 3.7946211817356615e-05, + "loss": 0.1283, + "step": 11920 + }, + { + "epoch": 0.7476342670928119, + "grad_norm": 3.1848607063293457, + "learning_rate": 3.7935656836461125e-05, + "loss": 0.1034, + "step": 11930 + }, + { + "epoch": 0.7482609513066366, + "grad_norm": 2.650261878967285, + "learning_rate": 3.792510185556564e-05, + "loss": 0.1176, + "step": 11940 + }, + { + "epoch": 0.7488876355204612, + "grad_norm": 0.06356266140937805, + "learning_rate": 3.791454687467016e-05, + "loss": 0.1833, + "step": 11950 + }, + { + "epoch": 0.7495143197342858, + "grad_norm": 2.3670597076416016, + "learning_rate": 3.7903991893774675e-05, + "loss": 0.2214, + "step": 11960 + }, + { + "epoch": 0.7501410039481106, + "grad_norm": 0.07301408797502518, + "learning_rate": 3.7893436912879185e-05, + "loss": 0.308, + "step": 11970 + }, + { + "epoch": 0.7507676881619352, + "grad_norm": 13.731446266174316, + "learning_rate": 3.78828819319837e-05, + "loss": 0.2203, + "step": 11980 + }, + { + "epoch": 0.7513943723757599, + "grad_norm": 2.0269956588745117, + "learning_rate": 3.787232695108822e-05, + "loss": 0.2015, + "step": 11990 + }, + { + "epoch": 0.7520210565895845, + "grad_norm": 0.23687097430229187, + "learning_rate": 3.7861771970192735e-05, + "loss": 0.0128, + "step": 12000 + }, + { + "epoch": 0.7526477408034091, + "grad_norm": 0.08494888991117477, + "learning_rate": 3.785121698929725e-05, + "loss": 0.0982, + "step": 12010 + }, + { + "epoch": 0.7532744250172339, + "grad_norm": 0.0715617686510086, + "learning_rate": 3.784066200840177e-05, + "loss": 0.0946, + "step": 12020 + }, + { + "epoch": 0.7539011092310585, + "grad_norm": 0.3059249520301819, + "learning_rate": 3.7830107027506285e-05, + "loss": 0.2947, + "step": 12030 + }, + { + "epoch": 0.7545277934448831, + "grad_norm": 0.4487674832344055, + "learning_rate": 3.7819552046610795e-05, + "loss": 0.1457, + "step": 12040 + }, + { + "epoch": 0.7551544776587078, + "grad_norm": 28.634532928466797, + "learning_rate": 3.780899706571531e-05, + "loss": 0.0408, + "step": 12050 + }, + { + "epoch": 0.7557811618725324, + "grad_norm": 3.7225265502929688, + "learning_rate": 3.779844208481983e-05, + "loss": 0.2168, + "step": 12060 + }, + { + "epoch": 0.7564078460863571, + "grad_norm": 0.8148830533027649, + "learning_rate": 3.778788710392434e-05, + "loss": 0.1995, + "step": 12070 + }, + { + "epoch": 0.7570345303001818, + "grad_norm": 0.11411413550376892, + "learning_rate": 3.7777332123028855e-05, + "loss": 0.1426, + "step": 12080 + }, + { + "epoch": 0.7576612145140064, + "grad_norm": 0.48122769594192505, + "learning_rate": 3.776677714213338e-05, + "loss": 0.1247, + "step": 12090 + }, + { + "epoch": 0.7582878987278311, + "grad_norm": 0.4169093370437622, + "learning_rate": 3.7756222161237895e-05, + "loss": 0.1047, + "step": 12100 + }, + { + "epoch": 0.7589145829416557, + "grad_norm": 0.06136476248502731, + "learning_rate": 3.7745667180342405e-05, + "loss": 0.0406, + "step": 12110 + }, + { + "epoch": 0.7595412671554803, + "grad_norm": 16.241806030273438, + "learning_rate": 3.773511219944692e-05, + "loss": 0.1553, + "step": 12120 + }, + { + "epoch": 0.760167951369305, + "grad_norm": 0.404888778924942, + "learning_rate": 3.772455721855144e-05, + "loss": 0.0967, + "step": 12130 + }, + { + "epoch": 0.7607946355831297, + "grad_norm": 0.5528884530067444, + "learning_rate": 3.7714002237655955e-05, + "loss": 0.0098, + "step": 12140 + }, + { + "epoch": 0.7614213197969543, + "grad_norm": 2.322392225265503, + "learning_rate": 3.7703447256760465e-05, + "loss": 0.1899, + "step": 12150 + }, + { + "epoch": 0.762048004010779, + "grad_norm": 0.03181945160031319, + "learning_rate": 3.769289227586498e-05, + "loss": 0.1481, + "step": 12160 + }, + { + "epoch": 0.7626746882246036, + "grad_norm": 2.7455575466156006, + "learning_rate": 3.76823372949695e-05, + "loss": 0.4789, + "step": 12170 + }, + { + "epoch": 0.7633013724384283, + "grad_norm": 0.22958771884441376, + "learning_rate": 3.7671782314074014e-05, + "loss": 0.0377, + "step": 12180 + }, + { + "epoch": 0.7639280566522529, + "grad_norm": 0.05740810185670853, + "learning_rate": 3.766122733317853e-05, + "loss": 0.1482, + "step": 12190 + }, + { + "epoch": 0.7645547408660776, + "grad_norm": 0.37747031450271606, + "learning_rate": 3.765067235228305e-05, + "loss": 0.0657, + "step": 12200 + }, + { + "epoch": 0.7651814250799023, + "grad_norm": 0.056075967848300934, + "learning_rate": 3.7640117371387564e-05, + "loss": 0.0356, + "step": 12210 + }, + { + "epoch": 0.7658081092937269, + "grad_norm": 5.565336227416992, + "learning_rate": 3.7629562390492074e-05, + "loss": 0.1372, + "step": 12220 + }, + { + "epoch": 0.7664347935075515, + "grad_norm": 0.4846305847167969, + "learning_rate": 3.761900740959659e-05, + "loss": 0.0483, + "step": 12230 + }, + { + "epoch": 0.7670614777213762, + "grad_norm": 0.037832580506801605, + "learning_rate": 3.760845242870111e-05, + "loss": 0.1366, + "step": 12240 + }, + { + "epoch": 0.7676881619352008, + "grad_norm": 4.363157272338867, + "learning_rate": 3.759789744780562e-05, + "loss": 0.1575, + "step": 12250 + }, + { + "epoch": 0.7683148461490255, + "grad_norm": 0.21504797041416168, + "learning_rate": 3.7587342466910134e-05, + "loss": 0.2839, + "step": 12260 + }, + { + "epoch": 0.7689415303628502, + "grad_norm": 0.05504310503602028, + "learning_rate": 3.757678748601465e-05, + "loss": 0.0095, + "step": 12270 + }, + { + "epoch": 0.7695682145766748, + "grad_norm": 0.08995038270950317, + "learning_rate": 3.756623250511917e-05, + "loss": 0.1848, + "step": 12280 + }, + { + "epoch": 0.7701948987904995, + "grad_norm": 2.675936222076416, + "learning_rate": 3.7555677524223684e-05, + "loss": 0.049, + "step": 12290 + }, + { + "epoch": 0.7708215830043241, + "grad_norm": 0.15427015721797943, + "learning_rate": 3.75451225433282e-05, + "loss": 0.1101, + "step": 12300 + }, + { + "epoch": 0.7714482672181487, + "grad_norm": 3.6860060691833496, + "learning_rate": 3.753456756243272e-05, + "loss": 0.235, + "step": 12310 + }, + { + "epoch": 0.7720749514319735, + "grad_norm": 1.6558562517166138, + "learning_rate": 3.752401258153723e-05, + "loss": 0.0861, + "step": 12320 + }, + { + "epoch": 0.7727016356457981, + "grad_norm": 0.12261590361595154, + "learning_rate": 3.7513457600641744e-05, + "loss": 0.1569, + "step": 12330 + }, + { + "epoch": 0.7733283198596227, + "grad_norm": 5.633775234222412, + "learning_rate": 3.750290261974626e-05, + "loss": 0.1584, + "step": 12340 + }, + { + "epoch": 0.7739550040734474, + "grad_norm": 0.30017462372779846, + "learning_rate": 3.749234763885078e-05, + "loss": 0.1295, + "step": 12350 + }, + { + "epoch": 0.774581688287272, + "grad_norm": 2.7208046913146973, + "learning_rate": 3.748179265795529e-05, + "loss": 0.3396, + "step": 12360 + }, + { + "epoch": 0.7752083725010966, + "grad_norm": 2.70462965965271, + "learning_rate": 3.7471237677059804e-05, + "loss": 0.2365, + "step": 12370 + }, + { + "epoch": 0.7758350567149214, + "grad_norm": 0.3144044876098633, + "learning_rate": 3.746068269616432e-05, + "loss": 0.1355, + "step": 12380 + }, + { + "epoch": 0.776461740928746, + "grad_norm": 0.20189404487609863, + "learning_rate": 3.745012771526884e-05, + "loss": 0.0728, + "step": 12390 + }, + { + "epoch": 0.7770884251425707, + "grad_norm": 0.15650856494903564, + "learning_rate": 3.7439572734373354e-05, + "loss": 0.1454, + "step": 12400 + }, + { + "epoch": 0.7777151093563953, + "grad_norm": 0.17184951901435852, + "learning_rate": 3.742901775347787e-05, + "loss": 0.1337, + "step": 12410 + }, + { + "epoch": 0.7783417935702199, + "grad_norm": 0.21894590556621552, + "learning_rate": 3.741846277258239e-05, + "loss": 0.2187, + "step": 12420 + }, + { + "epoch": 0.7789684777840447, + "grad_norm": 0.22736823558807373, + "learning_rate": 3.74079077916869e-05, + "loss": 0.1133, + "step": 12430 + }, + { + "epoch": 0.7795951619978693, + "grad_norm": 13.00343132019043, + "learning_rate": 3.7397352810791414e-05, + "loss": 0.0324, + "step": 12440 + }, + { + "epoch": 0.7802218462116939, + "grad_norm": 11.851655006408691, + "learning_rate": 3.738679782989593e-05, + "loss": 0.1891, + "step": 12450 + }, + { + "epoch": 0.7808485304255186, + "grad_norm": 36.18693161010742, + "learning_rate": 3.737624284900044e-05, + "loss": 0.3295, + "step": 12460 + }, + { + "epoch": 0.7814752146393432, + "grad_norm": 0.1305285394191742, + "learning_rate": 3.736568786810496e-05, + "loss": 0.1744, + "step": 12470 + }, + { + "epoch": 0.7821018988531679, + "grad_norm": 0.8213036060333252, + "learning_rate": 3.735513288720947e-05, + "loss": 0.2016, + "step": 12480 + }, + { + "epoch": 0.7827285830669926, + "grad_norm": 3.187350034713745, + "learning_rate": 3.734457790631399e-05, + "loss": 0.1567, + "step": 12490 + }, + { + "epoch": 0.7833552672808172, + "grad_norm": 0.19200462102890015, + "learning_rate": 3.733402292541851e-05, + "loss": 0.0711, + "step": 12500 + }, + { + "epoch": 0.7839819514946419, + "grad_norm": 0.19528482854366302, + "learning_rate": 3.732346794452302e-05, + "loss": 0.1273, + "step": 12510 + }, + { + "epoch": 0.7846086357084665, + "grad_norm": 0.19674736261367798, + "learning_rate": 3.731291296362754e-05, + "loss": 0.281, + "step": 12520 + }, + { + "epoch": 0.7852353199222911, + "grad_norm": 0.2751675844192505, + "learning_rate": 3.7302357982732057e-05, + "loss": 0.1607, + "step": 12530 + }, + { + "epoch": 0.7858620041361158, + "grad_norm": 0.3155404031276703, + "learning_rate": 3.7291803001836566e-05, + "loss": 0.1767, + "step": 12540 + }, + { + "epoch": 0.7864886883499405, + "grad_norm": 0.41322872042655945, + "learning_rate": 3.728124802094108e-05, + "loss": 0.1663, + "step": 12550 + }, + { + "epoch": 0.7871153725637651, + "grad_norm": 0.31705963611602783, + "learning_rate": 3.72706930400456e-05, + "loss": 0.0903, + "step": 12560 + }, + { + "epoch": 0.7877420567775898, + "grad_norm": 0.1984623223543167, + "learning_rate": 3.726013805915011e-05, + "loss": 0.0583, + "step": 12570 + }, + { + "epoch": 0.7883687409914144, + "grad_norm": 0.13793553411960602, + "learning_rate": 3.7249583078254626e-05, + "loss": 0.1318, + "step": 12580 + }, + { + "epoch": 0.7889954252052391, + "grad_norm": 5.596744060516357, + "learning_rate": 3.723902809735915e-05, + "loss": 0.0879, + "step": 12590 + }, + { + "epoch": 0.7896221094190637, + "grad_norm": 0.12211534380912781, + "learning_rate": 3.7228473116463666e-05, + "loss": 0.0783, + "step": 12600 + }, + { + "epoch": 0.7902487936328884, + "grad_norm": 3.553812026977539, + "learning_rate": 3.7217918135568176e-05, + "loss": 0.1936, + "step": 12610 + }, + { + "epoch": 0.7908754778467131, + "grad_norm": 0.12860919535160065, + "learning_rate": 3.720736315467269e-05, + "loss": 0.0552, + "step": 12620 + }, + { + "epoch": 0.7915021620605377, + "grad_norm": 3.718738079071045, + "learning_rate": 3.719680817377721e-05, + "loss": 0.2397, + "step": 12630 + }, + { + "epoch": 0.7921288462743623, + "grad_norm": 0.7922436594963074, + "learning_rate": 3.718625319288172e-05, + "loss": 0.0915, + "step": 12640 + }, + { + "epoch": 0.792755530488187, + "grad_norm": 0.11940032988786697, + "learning_rate": 3.7175698211986236e-05, + "loss": 0.1205, + "step": 12650 + }, + { + "epoch": 0.7933822147020116, + "grad_norm": 0.10362546145915985, + "learning_rate": 3.716514323109075e-05, + "loss": 0.0583, + "step": 12660 + }, + { + "epoch": 0.7940088989158363, + "grad_norm": 3.585110902786255, + "learning_rate": 3.715458825019527e-05, + "loss": 0.0795, + "step": 12670 + }, + { + "epoch": 0.794635583129661, + "grad_norm": 0.06763280928134918, + "learning_rate": 3.7144033269299786e-05, + "loss": 0.2659, + "step": 12680 + }, + { + "epoch": 0.7952622673434856, + "grad_norm": 0.1489865779876709, + "learning_rate": 3.71334782884043e-05, + "loss": 0.0992, + "step": 12690 + }, + { + "epoch": 0.7958889515573103, + "grad_norm": 3.47560977935791, + "learning_rate": 3.712292330750882e-05, + "loss": 0.1278, + "step": 12700 + }, + { + "epoch": 0.7965156357711349, + "grad_norm": 3.3073196411132812, + "learning_rate": 3.711236832661333e-05, + "loss": 0.1825, + "step": 12710 + }, + { + "epoch": 0.7971423199849595, + "grad_norm": 0.14257051050662994, + "learning_rate": 3.7101813345717846e-05, + "loss": 0.0773, + "step": 12720 + }, + { + "epoch": 0.7977690041987843, + "grad_norm": 0.48788848519325256, + "learning_rate": 3.709125836482236e-05, + "loss": 0.2049, + "step": 12730 + }, + { + "epoch": 0.7983956884126089, + "grad_norm": 3.0784692764282227, + "learning_rate": 3.708070338392688e-05, + "loss": 0.2214, + "step": 12740 + }, + { + "epoch": 0.7990223726264335, + "grad_norm": 0.27575093507766724, + "learning_rate": 3.707014840303139e-05, + "loss": 0.1934, + "step": 12750 + }, + { + "epoch": 0.7996490568402582, + "grad_norm": 0.2657164931297302, + "learning_rate": 3.7059593422135906e-05, + "loss": 0.134, + "step": 12760 + }, + { + "epoch": 0.8002757410540828, + "grad_norm": 0.9468491673469543, + "learning_rate": 3.704903844124042e-05, + "loss": 0.0772, + "step": 12770 + }, + { + "epoch": 0.8009024252679074, + "grad_norm": 0.15965603291988373, + "learning_rate": 3.703848346034494e-05, + "loss": 0.1017, + "step": 12780 + }, + { + "epoch": 0.8015291094817322, + "grad_norm": 0.19264058768749237, + "learning_rate": 3.7027928479449456e-05, + "loss": 0.1909, + "step": 12790 + }, + { + "epoch": 0.8021557936955568, + "grad_norm": 13.75501537322998, + "learning_rate": 3.701737349855397e-05, + "loss": 0.0429, + "step": 12800 + }, + { + "epoch": 0.8027824779093815, + "grad_norm": 0.10453546047210693, + "learning_rate": 3.700681851765849e-05, + "loss": 0.2287, + "step": 12810 + }, + { + "epoch": 0.8034091621232061, + "grad_norm": 3.390267848968506, + "learning_rate": 3.6996263536763e-05, + "loss": 0.2462, + "step": 12820 + }, + { + "epoch": 0.8040358463370307, + "grad_norm": 3.221275568008423, + "learning_rate": 3.6985708555867515e-05, + "loss": 0.1346, + "step": 12830 + }, + { + "epoch": 0.8046625305508555, + "grad_norm": 3.498892307281494, + "learning_rate": 3.697515357497203e-05, + "loss": 0.1513, + "step": 12840 + }, + { + "epoch": 0.8052892147646801, + "grad_norm": 6.34757661819458, + "learning_rate": 3.696459859407654e-05, + "loss": 0.1866, + "step": 12850 + }, + { + "epoch": 0.8059158989785047, + "grad_norm": 2.874657392501831, + "learning_rate": 3.695404361318106e-05, + "loss": 0.1909, + "step": 12860 + }, + { + "epoch": 0.8065425831923294, + "grad_norm": 1.1204040050506592, + "learning_rate": 3.6943488632285575e-05, + "loss": 0.0936, + "step": 12870 + }, + { + "epoch": 0.807169267406154, + "grad_norm": 0.156513050198555, + "learning_rate": 3.693293365139009e-05, + "loss": 0.0151, + "step": 12880 + }, + { + "epoch": 0.8077959516199787, + "grad_norm": 3.5642528533935547, + "learning_rate": 3.692237867049461e-05, + "loss": 0.1636, + "step": 12890 + }, + { + "epoch": 0.8084226358338034, + "grad_norm": 27.339061737060547, + "learning_rate": 3.6911823689599125e-05, + "loss": 0.0421, + "step": 12900 + }, + { + "epoch": 0.809049320047628, + "grad_norm": 0.1077108234167099, + "learning_rate": 3.690126870870364e-05, + "loss": 0.0325, + "step": 12910 + }, + { + "epoch": 0.8096760042614527, + "grad_norm": 7.458876132965088, + "learning_rate": 3.689071372780816e-05, + "loss": 0.122, + "step": 12920 + }, + { + "epoch": 0.8103026884752773, + "grad_norm": 0.0756208524107933, + "learning_rate": 3.688015874691267e-05, + "loss": 0.0061, + "step": 12930 + }, + { + "epoch": 0.8109293726891019, + "grad_norm": 0.06975262612104416, + "learning_rate": 3.6869603766017185e-05, + "loss": 0.19, + "step": 12940 + }, + { + "epoch": 0.8115560569029266, + "grad_norm": 0.07189065963029861, + "learning_rate": 3.68590487851217e-05, + "loss": 0.22, + "step": 12950 + }, + { + "epoch": 0.8121827411167513, + "grad_norm": 12.745660781860352, + "learning_rate": 3.684849380422621e-05, + "loss": 0.315, + "step": 12960 + }, + { + "epoch": 0.8128094253305759, + "grad_norm": 1.2097328901290894, + "learning_rate": 3.683793882333073e-05, + "loss": 0.1202, + "step": 12970 + }, + { + "epoch": 0.8134361095444006, + "grad_norm": 0.16218720376491547, + "learning_rate": 3.6827383842435245e-05, + "loss": 0.0863, + "step": 12980 + }, + { + "epoch": 0.8140627937582252, + "grad_norm": 0.09889566898345947, + "learning_rate": 3.681682886153976e-05, + "loss": 0.0592, + "step": 12990 + }, + { + "epoch": 0.8146894779720499, + "grad_norm": 0.6426899433135986, + "learning_rate": 3.680627388064428e-05, + "loss": 0.1792, + "step": 13000 + }, + { + "epoch": 0.8153161621858745, + "grad_norm": 12.053089141845703, + "learning_rate": 3.6795718899748795e-05, + "loss": 0.1879, + "step": 13010 + }, + { + "epoch": 0.8159428463996992, + "grad_norm": 0.1389077603816986, + "learning_rate": 3.678516391885331e-05, + "loss": 0.2683, + "step": 13020 + }, + { + "epoch": 0.8165695306135239, + "grad_norm": 0.29173538088798523, + "learning_rate": 3.677460893795782e-05, + "loss": 0.124, + "step": 13030 + }, + { + "epoch": 0.8171962148273485, + "grad_norm": 0.14425741136074066, + "learning_rate": 3.676405395706234e-05, + "loss": 0.1329, + "step": 13040 + }, + { + "epoch": 0.8178228990411731, + "grad_norm": 5.297956943511963, + "learning_rate": 3.6753498976166855e-05, + "loss": 0.1597, + "step": 13050 + }, + { + "epoch": 0.8184495832549978, + "grad_norm": 0.12852726876735687, + "learning_rate": 3.674294399527137e-05, + "loss": 0.0384, + "step": 13060 + }, + { + "epoch": 0.8190762674688224, + "grad_norm": 0.08532652258872986, + "learning_rate": 3.673238901437588e-05, + "loss": 0.1388, + "step": 13070 + }, + { + "epoch": 0.8197029516826471, + "grad_norm": 3.585350275039673, + "learning_rate": 3.67218340334804e-05, + "loss": 0.1586, + "step": 13080 + }, + { + "epoch": 0.8203296358964718, + "grad_norm": 67.14936065673828, + "learning_rate": 3.671127905258492e-05, + "loss": 0.2627, + "step": 13090 + }, + { + "epoch": 0.8209563201102964, + "grad_norm": 0.07265251874923706, + "learning_rate": 3.670072407168943e-05, + "loss": 0.0716, + "step": 13100 + }, + { + "epoch": 0.8215830043241211, + "grad_norm": 3.681473731994629, + "learning_rate": 3.669016909079395e-05, + "loss": 0.2819, + "step": 13110 + }, + { + "epoch": 0.8222096885379457, + "grad_norm": 3.0709450244903564, + "learning_rate": 3.6679614109898464e-05, + "loss": 0.2773, + "step": 13120 + }, + { + "epoch": 0.8228363727517704, + "grad_norm": 0.2906370460987091, + "learning_rate": 3.666905912900298e-05, + "loss": 0.1433, + "step": 13130 + }, + { + "epoch": 0.8234630569655951, + "grad_norm": 0.34257593750953674, + "learning_rate": 3.665850414810749e-05, + "loss": 0.0968, + "step": 13140 + }, + { + "epoch": 0.8240897411794197, + "grad_norm": 1.3226487636566162, + "learning_rate": 3.664794916721201e-05, + "loss": 0.0643, + "step": 13150 + }, + { + "epoch": 0.8247164253932443, + "grad_norm": 0.18398089706897736, + "learning_rate": 3.6637394186316524e-05, + "loss": 0.3557, + "step": 13160 + }, + { + "epoch": 0.825343109607069, + "grad_norm": 0.13415154814720154, + "learning_rate": 3.6626839205421034e-05, + "loss": 0.0636, + "step": 13170 + }, + { + "epoch": 0.8259697938208936, + "grad_norm": 3.044423818588257, + "learning_rate": 3.661628422452556e-05, + "loss": 0.233, + "step": 13180 + }, + { + "epoch": 0.8265964780347183, + "grad_norm": 0.11876463890075684, + "learning_rate": 3.6605729243630074e-05, + "loss": 0.0823, + "step": 13190 + }, + { + "epoch": 0.827223162248543, + "grad_norm": 0.46708282828330994, + "learning_rate": 3.659517426273459e-05, + "loss": 0.0813, + "step": 13200 + }, + { + "epoch": 0.8278498464623676, + "grad_norm": 3.5009450912475586, + "learning_rate": 3.65846192818391e-05, + "loss": 0.2123, + "step": 13210 + }, + { + "epoch": 0.8284765306761923, + "grad_norm": 2.2133095264434814, + "learning_rate": 3.657406430094362e-05, + "loss": 0.0367, + "step": 13220 + }, + { + "epoch": 0.8291032148900169, + "grad_norm": 0.09307032078504562, + "learning_rate": 3.6563509320048134e-05, + "loss": 0.0462, + "step": 13230 + }, + { + "epoch": 0.8297298991038415, + "grad_norm": 0.2503710687160492, + "learning_rate": 3.6552954339152644e-05, + "loss": 0.2158, + "step": 13240 + }, + { + "epoch": 0.8303565833176663, + "grad_norm": 0.09940878301858902, + "learning_rate": 3.654239935825716e-05, + "loss": 0.0863, + "step": 13250 + }, + { + "epoch": 0.8309832675314909, + "grad_norm": 0.10309536755084991, + "learning_rate": 3.653184437736168e-05, + "loss": 0.1642, + "step": 13260 + }, + { + "epoch": 0.8316099517453155, + "grad_norm": 0.09203781932592392, + "learning_rate": 3.6521289396466194e-05, + "loss": 0.0441, + "step": 13270 + }, + { + "epoch": 0.8322366359591402, + "grad_norm": 0.08066492527723312, + "learning_rate": 3.651073441557071e-05, + "loss": 0.0075, + "step": 13280 + }, + { + "epoch": 0.8328633201729648, + "grad_norm": 0.08222663402557373, + "learning_rate": 3.650017943467523e-05, + "loss": 0.2368, + "step": 13290 + }, + { + "epoch": 0.8334900043867896, + "grad_norm": 0.062067195773124695, + "learning_rate": 3.6489624453779744e-05, + "loss": 0.0664, + "step": 13300 + }, + { + "epoch": 0.8341166886006142, + "grad_norm": 0.07864390313625336, + "learning_rate": 3.647906947288426e-05, + "loss": 0.1451, + "step": 13310 + }, + { + "epoch": 0.8347433728144388, + "grad_norm": 0.19397737085819244, + "learning_rate": 3.646851449198877e-05, + "loss": 0.1125, + "step": 13320 + }, + { + "epoch": 0.8353700570282635, + "grad_norm": 0.06277598440647125, + "learning_rate": 3.645795951109329e-05, + "loss": 0.1009, + "step": 13330 + }, + { + "epoch": 0.8359967412420881, + "grad_norm": 0.06773136556148529, + "learning_rate": 3.6447404530197804e-05, + "loss": 0.1635, + "step": 13340 + }, + { + "epoch": 0.8366234254559127, + "grad_norm": 0.11217823624610901, + "learning_rate": 3.6436849549302314e-05, + "loss": 0.0174, + "step": 13350 + }, + { + "epoch": 0.8372501096697375, + "grad_norm": 3.266810417175293, + "learning_rate": 3.642629456840683e-05, + "loss": 0.1214, + "step": 13360 + }, + { + "epoch": 0.8378767938835621, + "grad_norm": 0.14396525919437408, + "learning_rate": 3.641573958751135e-05, + "loss": 0.1602, + "step": 13370 + }, + { + "epoch": 0.8385034780973867, + "grad_norm": 0.25807830691337585, + "learning_rate": 3.6405184606615864e-05, + "loss": 0.2454, + "step": 13380 + }, + { + "epoch": 0.8391301623112114, + "grad_norm": 4.361166477203369, + "learning_rate": 3.639462962572038e-05, + "loss": 0.1265, + "step": 13390 + }, + { + "epoch": 0.839756846525036, + "grad_norm": 0.07289406657218933, + "learning_rate": 3.63840746448249e-05, + "loss": 0.0458, + "step": 13400 + }, + { + "epoch": 0.8403835307388607, + "grad_norm": 1.979023814201355, + "learning_rate": 3.6373519663929413e-05, + "loss": 0.03, + "step": 13410 + }, + { + "epoch": 0.8410102149526854, + "grad_norm": 0.047339342534542084, + "learning_rate": 3.636296468303392e-05, + "loss": 0.0726, + "step": 13420 + }, + { + "epoch": 0.84163689916651, + "grad_norm": 0.05762796103954315, + "learning_rate": 3.635240970213844e-05, + "loss": 0.1629, + "step": 13430 + }, + { + "epoch": 0.8422635833803347, + "grad_norm": 0.08475717902183533, + "learning_rate": 3.634185472124296e-05, + "loss": 0.1693, + "step": 13440 + }, + { + "epoch": 0.8428902675941593, + "grad_norm": 19.972970962524414, + "learning_rate": 3.633129974034747e-05, + "loss": 0.0781, + "step": 13450 + }, + { + "epoch": 0.8435169518079839, + "grad_norm": 6.1189985275268555, + "learning_rate": 3.632074475945198e-05, + "loss": 0.0217, + "step": 13460 + }, + { + "epoch": 0.8441436360218086, + "grad_norm": 0.04247862473130226, + "learning_rate": 3.63101897785565e-05, + "loss": 0.3026, + "step": 13470 + }, + { + "epoch": 0.8447703202356333, + "grad_norm": 10.373451232910156, + "learning_rate": 3.6299634797661017e-05, + "loss": 0.2465, + "step": 13480 + }, + { + "epoch": 0.8453970044494579, + "grad_norm": 0.16475176811218262, + "learning_rate": 3.628907981676553e-05, + "loss": 0.1451, + "step": 13490 + }, + { + "epoch": 0.8460236886632826, + "grad_norm": 0.053600408136844635, + "learning_rate": 3.627852483587005e-05, + "loss": 0.0733, + "step": 13500 + }, + { + "epoch": 0.8466503728771072, + "grad_norm": 0.06090028956532478, + "learning_rate": 3.6267969854974566e-05, + "loss": 0.2185, + "step": 13510 + }, + { + "epoch": 0.8472770570909319, + "grad_norm": 22.72749137878418, + "learning_rate": 3.625741487407908e-05, + "loss": 0.1009, + "step": 13520 + }, + { + "epoch": 0.8479037413047565, + "grad_norm": 3.8991730213165283, + "learning_rate": 3.624685989318359e-05, + "loss": 0.1963, + "step": 13530 + }, + { + "epoch": 0.8485304255185812, + "grad_norm": 0.0632360652089119, + "learning_rate": 3.623630491228811e-05, + "loss": 0.1906, + "step": 13540 + }, + { + "epoch": 0.8491571097324059, + "grad_norm": 10.589201927185059, + "learning_rate": 3.6225749931392626e-05, + "loss": 0.1771, + "step": 13550 + }, + { + "epoch": 0.8497837939462305, + "grad_norm": 0.08310031145811081, + "learning_rate": 3.6215194950497136e-05, + "loss": 0.1889, + "step": 13560 + }, + { + "epoch": 0.8504104781600551, + "grad_norm": 0.14444205164909363, + "learning_rate": 3.620463996960165e-05, + "loss": 0.1354, + "step": 13570 + }, + { + "epoch": 0.8510371623738798, + "grad_norm": 1.6947962045669556, + "learning_rate": 3.619408498870617e-05, + "loss": 0.1037, + "step": 13580 + }, + { + "epoch": 0.8516638465877044, + "grad_norm": 2.187366485595703, + "learning_rate": 3.618353000781069e-05, + "loss": 0.1027, + "step": 13590 + }, + { + "epoch": 0.8522905308015291, + "grad_norm": 8.265816688537598, + "learning_rate": 3.61729750269152e-05, + "loss": 0.0939, + "step": 13600 + }, + { + "epoch": 0.8529172150153538, + "grad_norm": 11.722594261169434, + "learning_rate": 3.616242004601972e-05, + "loss": 0.0769, + "step": 13610 + }, + { + "epoch": 0.8535438992291784, + "grad_norm": 0.04671710357069969, + "learning_rate": 3.6151865065124236e-05, + "loss": 0.0638, + "step": 13620 + }, + { + "epoch": 0.8541705834430031, + "grad_norm": 14.175790786743164, + "learning_rate": 3.6141310084228746e-05, + "loss": 0.2084, + "step": 13630 + }, + { + "epoch": 0.8547972676568277, + "grad_norm": 2.082923412322998, + "learning_rate": 3.613075510333326e-05, + "loss": 0.1194, + "step": 13640 + }, + { + "epoch": 0.8554239518706523, + "grad_norm": 0.38090240955352783, + "learning_rate": 3.612020012243778e-05, + "loss": 0.2383, + "step": 13650 + }, + { + "epoch": 0.8560506360844771, + "grad_norm": 5.69649076461792, + "learning_rate": 3.6109645141542296e-05, + "loss": 0.0534, + "step": 13660 + }, + { + "epoch": 0.8566773202983017, + "grad_norm": 0.22433985769748688, + "learning_rate": 3.6099090160646806e-05, + "loss": 0.2355, + "step": 13670 + }, + { + "epoch": 0.8573040045121263, + "grad_norm": 9.8493013381958, + "learning_rate": 3.608853517975133e-05, + "loss": 0.0305, + "step": 13680 + }, + { + "epoch": 0.857930688725951, + "grad_norm": 0.031805284321308136, + "learning_rate": 3.6077980198855846e-05, + "loss": 0.0292, + "step": 13690 + }, + { + "epoch": 0.8585573729397756, + "grad_norm": 0.20661821961402893, + "learning_rate": 3.6067425217960356e-05, + "loss": 0.0627, + "step": 13700 + }, + { + "epoch": 0.8591840571536004, + "grad_norm": 0.8476600050926208, + "learning_rate": 3.605687023706487e-05, + "loss": 0.0846, + "step": 13710 + }, + { + "epoch": 0.859810741367425, + "grad_norm": 4.03338098526001, + "learning_rate": 3.604631525616939e-05, + "loss": 0.2247, + "step": 13720 + }, + { + "epoch": 0.8604374255812496, + "grad_norm": 0.04489768296480179, + "learning_rate": 3.6035760275273906e-05, + "loss": 0.238, + "step": 13730 + }, + { + "epoch": 0.8610641097950743, + "grad_norm": 1.559618353843689, + "learning_rate": 3.6025205294378416e-05, + "loss": 0.0871, + "step": 13740 + }, + { + "epoch": 0.8616907940088989, + "grad_norm": 0.12418634444475174, + "learning_rate": 3.601465031348293e-05, + "loss": 0.1338, + "step": 13750 + }, + { + "epoch": 0.8623174782227235, + "grad_norm": 3.787989854812622, + "learning_rate": 3.600409533258745e-05, + "loss": 0.1617, + "step": 13760 + }, + { + "epoch": 0.8629441624365483, + "grad_norm": 2.1943676471710205, + "learning_rate": 3.5993540351691966e-05, + "loss": 0.0946, + "step": 13770 + }, + { + "epoch": 0.8635708466503729, + "grad_norm": 0.0763244777917862, + "learning_rate": 3.598298537079648e-05, + "loss": 0.1639, + "step": 13780 + }, + { + "epoch": 0.8641975308641975, + "grad_norm": 0.08010061830282211, + "learning_rate": 3.5972430389901e-05, + "loss": 0.0546, + "step": 13790 + }, + { + "epoch": 0.8648242150780222, + "grad_norm": 0.07747522741556168, + "learning_rate": 3.5961875409005515e-05, + "loss": 0.0335, + "step": 13800 + }, + { + "epoch": 0.8654508992918468, + "grad_norm": 4.978739261627197, + "learning_rate": 3.5951320428110025e-05, + "loss": 0.2478, + "step": 13810 + }, + { + "epoch": 0.8660775835056715, + "grad_norm": 0.06660338491201401, + "learning_rate": 3.594076544721454e-05, + "loss": 0.11, + "step": 13820 + }, + { + "epoch": 0.8667042677194962, + "grad_norm": 0.0601072795689106, + "learning_rate": 3.593021046631906e-05, + "loss": 0.0817, + "step": 13830 + }, + { + "epoch": 0.8673309519333208, + "grad_norm": 0.0625920370221138, + "learning_rate": 3.5919655485423575e-05, + "loss": 0.1403, + "step": 13840 + }, + { + "epoch": 0.8679576361471455, + "grad_norm": 3.843177556991577, + "learning_rate": 3.5909100504528085e-05, + "loss": 0.1269, + "step": 13850 + }, + { + "epoch": 0.8685843203609701, + "grad_norm": 3.897458791732788, + "learning_rate": 3.58985455236326e-05, + "loss": 0.1432, + "step": 13860 + }, + { + "epoch": 0.8692110045747947, + "grad_norm": 1.523318886756897, + "learning_rate": 3.588799054273712e-05, + "loss": 0.1433, + "step": 13870 + }, + { + "epoch": 0.8698376887886194, + "grad_norm": 0.05766427889466286, + "learning_rate": 3.5877435561841635e-05, + "loss": 0.0306, + "step": 13880 + }, + { + "epoch": 0.8704643730024441, + "grad_norm": 1.394804835319519, + "learning_rate": 3.586688058094615e-05, + "loss": 0.0264, + "step": 13890 + }, + { + "epoch": 0.8710910572162687, + "grad_norm": 0.05665925145149231, + "learning_rate": 3.585632560005067e-05, + "loss": 0.1586, + "step": 13900 + }, + { + "epoch": 0.8717177414300934, + "grad_norm": 0.06234462931752205, + "learning_rate": 3.5845770619155185e-05, + "loss": 0.1324, + "step": 13910 + }, + { + "epoch": 0.872344425643918, + "grad_norm": 0.08005021512508392, + "learning_rate": 3.5835215638259695e-05, + "loss": 0.1808, + "step": 13920 + }, + { + "epoch": 0.8729711098577427, + "grad_norm": 0.11479296535253525, + "learning_rate": 3.582466065736421e-05, + "loss": 0.1577, + "step": 13930 + }, + { + "epoch": 0.8735977940715673, + "grad_norm": 0.12691444158554077, + "learning_rate": 3.581410567646873e-05, + "loss": 0.099, + "step": 13940 + }, + { + "epoch": 0.874224478285392, + "grad_norm": 0.1263216882944107, + "learning_rate": 3.580355069557324e-05, + "loss": 0.0946, + "step": 13950 + }, + { + "epoch": 0.8748511624992167, + "grad_norm": 20.947620391845703, + "learning_rate": 3.5792995714677755e-05, + "loss": 0.2432, + "step": 13960 + }, + { + "epoch": 0.8754778467130413, + "grad_norm": 0.9176009893417358, + "learning_rate": 3.578244073378227e-05, + "loss": 0.1903, + "step": 13970 + }, + { + "epoch": 0.8761045309268659, + "grad_norm": 7.211978435516357, + "learning_rate": 3.577188575288679e-05, + "loss": 0.1683, + "step": 13980 + }, + { + "epoch": 0.8767312151406906, + "grad_norm": 0.17308202385902405, + "learning_rate": 3.5761330771991305e-05, + "loss": 0.041, + "step": 13990 + }, + { + "epoch": 0.8773578993545152, + "grad_norm": 0.30790331959724426, + "learning_rate": 3.575077579109582e-05, + "loss": 0.2975, + "step": 14000 + }, + { + "epoch": 0.8779845835683399, + "grad_norm": 0.08187547326087952, + "learning_rate": 3.574022081020034e-05, + "loss": 0.2002, + "step": 14010 + }, + { + "epoch": 0.8786112677821646, + "grad_norm": 0.38137057423591614, + "learning_rate": 3.572966582930485e-05, + "loss": 0.037, + "step": 14020 + }, + { + "epoch": 0.8792379519959892, + "grad_norm": 0.10382096469402313, + "learning_rate": 3.5719110848409365e-05, + "loss": 0.1482, + "step": 14030 + }, + { + "epoch": 0.8798646362098139, + "grad_norm": 0.13245761394500732, + "learning_rate": 3.570855586751388e-05, + "loss": 0.1223, + "step": 14040 + }, + { + "epoch": 0.8804913204236385, + "grad_norm": 0.06675305962562561, + "learning_rate": 3.56980008866184e-05, + "loss": 0.0169, + "step": 14050 + }, + { + "epoch": 0.8811180046374631, + "grad_norm": 0.0645369216799736, + "learning_rate": 3.568744590572291e-05, + "loss": 0.0948, + "step": 14060 + }, + { + "epoch": 0.8817446888512879, + "grad_norm": 0.06949496269226074, + "learning_rate": 3.5676890924827424e-05, + "loss": 0.0758, + "step": 14070 + }, + { + "epoch": 0.8823713730651125, + "grad_norm": 0.13635949790477753, + "learning_rate": 3.566633594393195e-05, + "loss": 0.2345, + "step": 14080 + }, + { + "epoch": 0.8829980572789371, + "grad_norm": 19.03074073791504, + "learning_rate": 3.565578096303646e-05, + "loss": 0.0708, + "step": 14090 + }, + { + "epoch": 0.8836247414927618, + "grad_norm": 0.17024590075016022, + "learning_rate": 3.5645225982140974e-05, + "loss": 0.1785, + "step": 14100 + }, + { + "epoch": 0.8842514257065864, + "grad_norm": 0.13776808977127075, + "learning_rate": 3.563467100124549e-05, + "loss": 0.0187, + "step": 14110 + }, + { + "epoch": 0.8848781099204112, + "grad_norm": 0.1499529480934143, + "learning_rate": 3.562411602035001e-05, + "loss": 0.1989, + "step": 14120 + }, + { + "epoch": 0.8855047941342358, + "grad_norm": 3.0967087745666504, + "learning_rate": 3.561356103945452e-05, + "loss": 0.2515, + "step": 14130 + }, + { + "epoch": 0.8861314783480604, + "grad_norm": 0.36061203479766846, + "learning_rate": 3.5603006058559034e-05, + "loss": 0.0717, + "step": 14140 + }, + { + "epoch": 0.8867581625618851, + "grad_norm": 0.13822808861732483, + "learning_rate": 3.559245107766355e-05, + "loss": 0.1449, + "step": 14150 + }, + { + "epoch": 0.8873848467757097, + "grad_norm": 0.12512069940567017, + "learning_rate": 3.558189609676807e-05, + "loss": 0.277, + "step": 14160 + }, + { + "epoch": 0.8880115309895343, + "grad_norm": 0.12213852256536484, + "learning_rate": 3.557134111587258e-05, + "loss": 0.0714, + "step": 14170 + }, + { + "epoch": 0.8886382152033591, + "grad_norm": 6.950976848602295, + "learning_rate": 3.55607861349771e-05, + "loss": 0.0562, + "step": 14180 + }, + { + "epoch": 0.8892648994171837, + "grad_norm": 3.2038068771362305, + "learning_rate": 3.555023115408162e-05, + "loss": 0.1579, + "step": 14190 + }, + { + "epoch": 0.8898915836310083, + "grad_norm": 0.07791581749916077, + "learning_rate": 3.553967617318613e-05, + "loss": 0.0488, + "step": 14200 + }, + { + "epoch": 0.890518267844833, + "grad_norm": 0.6226568222045898, + "learning_rate": 3.5529121192290644e-05, + "loss": 0.128, + "step": 14210 + }, + { + "epoch": 0.8911449520586576, + "grad_norm": 3.773237466812134, + "learning_rate": 3.551856621139516e-05, + "loss": 0.2278, + "step": 14220 + }, + { + "epoch": 0.8917716362724823, + "grad_norm": 3.38862943649292, + "learning_rate": 3.550801123049968e-05, + "loss": 0.2659, + "step": 14230 + }, + { + "epoch": 0.892398320486307, + "grad_norm": 2.387253522872925, + "learning_rate": 3.549745624960419e-05, + "loss": 0.1546, + "step": 14240 + }, + { + "epoch": 0.8930250047001316, + "grad_norm": 0.2753084599971771, + "learning_rate": 3.5486901268708704e-05, + "loss": 0.2394, + "step": 14250 + }, + { + "epoch": 0.8936516889139563, + "grad_norm": 0.22734327614307404, + "learning_rate": 3.547634628781322e-05, + "loss": 0.0995, + "step": 14260 + }, + { + "epoch": 0.8942783731277809, + "grad_norm": 0.2785840928554535, + "learning_rate": 3.546579130691774e-05, + "loss": 0.2001, + "step": 14270 + }, + { + "epoch": 0.8949050573416055, + "grad_norm": 12.311131477355957, + "learning_rate": 3.5455236326022254e-05, + "loss": 0.0849, + "step": 14280 + }, + { + "epoch": 0.8955317415554302, + "grad_norm": 0.11820970475673676, + "learning_rate": 3.544468134512677e-05, + "loss": 0.0703, + "step": 14290 + }, + { + "epoch": 0.8961584257692549, + "grad_norm": 0.13383793830871582, + "learning_rate": 3.543412636423129e-05, + "loss": 0.1594, + "step": 14300 + }, + { + "epoch": 0.8967851099830795, + "grad_norm": 0.1279149353504181, + "learning_rate": 3.54235713833358e-05, + "loss": 0.1361, + "step": 14310 + }, + { + "epoch": 0.8974117941969042, + "grad_norm": 0.11653663963079453, + "learning_rate": 3.5413016402440314e-05, + "loss": 0.2208, + "step": 14320 + }, + { + "epoch": 0.8980384784107288, + "grad_norm": 6.753573894500732, + "learning_rate": 3.540246142154483e-05, + "loss": 0.3227, + "step": 14330 + }, + { + "epoch": 0.8986651626245535, + "grad_norm": 0.18352077901363373, + "learning_rate": 3.539190644064934e-05, + "loss": 0.1691, + "step": 14340 + }, + { + "epoch": 0.8992918468383782, + "grad_norm": 0.22739467024803162, + "learning_rate": 3.538135145975386e-05, + "loss": 0.1916, + "step": 14350 + }, + { + "epoch": 0.8999185310522028, + "grad_norm": 3.4141712188720703, + "learning_rate": 3.5370796478858373e-05, + "loss": 0.0496, + "step": 14360 + }, + { + "epoch": 0.9005452152660275, + "grad_norm": 4.359954357147217, + "learning_rate": 3.536024149796289e-05, + "loss": 0.3205, + "step": 14370 + }, + { + "epoch": 0.9011718994798521, + "grad_norm": 0.2058669477701187, + "learning_rate": 3.534968651706741e-05, + "loss": 0.0168, + "step": 14380 + }, + { + "epoch": 0.9017985836936767, + "grad_norm": 0.16354796290397644, + "learning_rate": 3.533913153617192e-05, + "loss": 0.1409, + "step": 14390 + }, + { + "epoch": 0.9024252679075014, + "grad_norm": 0.184243842959404, + "learning_rate": 3.532857655527644e-05, + "loss": 0.2108, + "step": 14400 + }, + { + "epoch": 0.903051952121326, + "grad_norm": 0.19970452785491943, + "learning_rate": 3.531802157438095e-05, + "loss": 0.0982, + "step": 14410 + }, + { + "epoch": 0.9036786363351507, + "grad_norm": 5.539926052093506, + "learning_rate": 3.5307466593485467e-05, + "loss": 0.0542, + "step": 14420 + }, + { + "epoch": 0.9043053205489754, + "grad_norm": 0.11390367895364761, + "learning_rate": 3.529691161258998e-05, + "loss": 0.1596, + "step": 14430 + }, + { + "epoch": 0.9049320047628, + "grad_norm": 3.1585936546325684, + "learning_rate": 3.52863566316945e-05, + "loss": 0.1152, + "step": 14440 + }, + { + "epoch": 0.9055586889766247, + "grad_norm": 0.20199960470199585, + "learning_rate": 3.527580165079901e-05, + "loss": 0.297, + "step": 14450 + }, + { + "epoch": 0.9061853731904493, + "grad_norm": 0.20266208052635193, + "learning_rate": 3.5265246669903526e-05, + "loss": 0.0973, + "step": 14460 + }, + { + "epoch": 0.906812057404274, + "grad_norm": 0.13221389055252075, + "learning_rate": 3.525469168900804e-05, + "loss": 0.0184, + "step": 14470 + }, + { + "epoch": 0.9074387416180987, + "grad_norm": 0.10203318297863007, + "learning_rate": 3.524413670811256e-05, + "loss": 0.0098, + "step": 14480 + }, + { + "epoch": 0.9080654258319233, + "grad_norm": 0.12804029881954193, + "learning_rate": 3.5233581727217076e-05, + "loss": 0.0036, + "step": 14490 + }, + { + "epoch": 0.9086921100457479, + "grad_norm": 0.0833655372262001, + "learning_rate": 3.522302674632159e-05, + "loss": 0.2275, + "step": 14500 + }, + { + "epoch": 0.9093187942595726, + "grad_norm": 60.491127014160156, + "learning_rate": 3.521247176542611e-05, + "loss": 0.2654, + "step": 14510 + }, + { + "epoch": 0.9099454784733972, + "grad_norm": 4.894341468811035, + "learning_rate": 3.520191678453062e-05, + "loss": 0.021, + "step": 14520 + }, + { + "epoch": 0.910572162687222, + "grad_norm": 3.7257537841796875, + "learning_rate": 3.5191361803635136e-05, + "loss": 0.2712, + "step": 14530 + }, + { + "epoch": 0.9111988469010466, + "grad_norm": 25.76340675354004, + "learning_rate": 3.518080682273965e-05, + "loss": 0.1857, + "step": 14540 + }, + { + "epoch": 0.9118255311148712, + "grad_norm": 3.0573976039886475, + "learning_rate": 3.517025184184417e-05, + "loss": 0.1953, + "step": 14550 + }, + { + "epoch": 0.9124522153286959, + "grad_norm": 3.152639389038086, + "learning_rate": 3.515969686094868e-05, + "loss": 0.2077, + "step": 14560 + }, + { + "epoch": 0.9130788995425205, + "grad_norm": 0.7926061153411865, + "learning_rate": 3.5149141880053196e-05, + "loss": 0.2346, + "step": 14570 + }, + { + "epoch": 0.9137055837563451, + "grad_norm": 0.23289482295513153, + "learning_rate": 3.513858689915772e-05, + "loss": 0.1212, + "step": 14580 + }, + { + "epoch": 0.9143322679701699, + "grad_norm": 0.28908389806747437, + "learning_rate": 3.512803191826223e-05, + "loss": 0.1442, + "step": 14590 + }, + { + "epoch": 0.9149589521839945, + "grad_norm": 0.2696382701396942, + "learning_rate": 3.5117476937366746e-05, + "loss": 0.1659, + "step": 14600 + }, + { + "epoch": 0.9155856363978191, + "grad_norm": 0.15111982822418213, + "learning_rate": 3.510692195647126e-05, + "loss": 0.2219, + "step": 14610 + }, + { + "epoch": 0.9162123206116438, + "grad_norm": 36.52037048339844, + "learning_rate": 3.509636697557578e-05, + "loss": 0.1229, + "step": 14620 + }, + { + "epoch": 0.9168390048254684, + "grad_norm": 0.141217902302742, + "learning_rate": 3.508581199468029e-05, + "loss": 0.0352, + "step": 14630 + }, + { + "epoch": 0.9174656890392932, + "grad_norm": 0.11265815049409866, + "learning_rate": 3.5075257013784806e-05, + "loss": 0.1197, + "step": 14640 + }, + { + "epoch": 0.9180923732531178, + "grad_norm": 6.607141971588135, + "learning_rate": 3.506470203288932e-05, + "loss": 0.2776, + "step": 14650 + }, + { + "epoch": 0.9187190574669424, + "grad_norm": 0.35055074095726013, + "learning_rate": 3.505414705199383e-05, + "loss": 0.2109, + "step": 14660 + }, + { + "epoch": 0.9193457416807671, + "grad_norm": 0.12104617059230804, + "learning_rate": 3.504359207109835e-05, + "loss": 0.1882, + "step": 14670 + }, + { + "epoch": 0.9199724258945917, + "grad_norm": 0.1288590133190155, + "learning_rate": 3.503303709020287e-05, + "loss": 0.1052, + "step": 14680 + }, + { + "epoch": 0.9205991101084163, + "grad_norm": 0.472810834646225, + "learning_rate": 3.502248210930739e-05, + "loss": 0.1659, + "step": 14690 + }, + { + "epoch": 0.921225794322241, + "grad_norm": 0.09575945138931274, + "learning_rate": 3.50119271284119e-05, + "loss": 0.0759, + "step": 14700 + }, + { + "epoch": 0.9218524785360657, + "grad_norm": 5.256383419036865, + "learning_rate": 3.5001372147516416e-05, + "loss": 0.4578, + "step": 14710 + }, + { + "epoch": 0.9224791627498903, + "grad_norm": 3.0405874252319336, + "learning_rate": 3.499081716662093e-05, + "loss": 0.2224, + "step": 14720 + }, + { + "epoch": 0.923105846963715, + "grad_norm": 185.64772033691406, + "learning_rate": 3.498026218572544e-05, + "loss": 0.0856, + "step": 14730 + }, + { + "epoch": 0.9237325311775396, + "grad_norm": 0.2516983449459076, + "learning_rate": 3.496970720482996e-05, + "loss": 0.1241, + "step": 14740 + }, + { + "epoch": 0.9243592153913643, + "grad_norm": 3.004882574081421, + "learning_rate": 3.4959152223934475e-05, + "loss": 0.0792, + "step": 14750 + }, + { + "epoch": 0.924985899605189, + "grad_norm": 0.12979790568351746, + "learning_rate": 3.494859724303899e-05, + "loss": 0.2953, + "step": 14760 + }, + { + "epoch": 0.9256125838190136, + "grad_norm": 0.11266639828681946, + "learning_rate": 3.493804226214351e-05, + "loss": 0.1118, + "step": 14770 + }, + { + "epoch": 0.9262392680328383, + "grad_norm": 0.18054163455963135, + "learning_rate": 3.4927487281248025e-05, + "loss": 0.1235, + "step": 14780 + }, + { + "epoch": 0.9268659522466629, + "grad_norm": 0.07736846059560776, + "learning_rate": 3.491693230035254e-05, + "loss": 0.0064, + "step": 14790 + }, + { + "epoch": 0.9274926364604875, + "grad_norm": 0.08575043082237244, + "learning_rate": 3.490637731945705e-05, + "loss": 0.1972, + "step": 14800 + }, + { + "epoch": 0.9281193206743122, + "grad_norm": 2.6329586505889893, + "learning_rate": 3.489582233856157e-05, + "loss": 0.1617, + "step": 14810 + }, + { + "epoch": 0.9287460048881369, + "grad_norm": 3.1636767387390137, + "learning_rate": 3.4885267357666085e-05, + "loss": 0.1465, + "step": 14820 + }, + { + "epoch": 0.9293726891019615, + "grad_norm": 1.366560697555542, + "learning_rate": 3.48747123767706e-05, + "loss": 0.2379, + "step": 14830 + }, + { + "epoch": 0.9299993733157862, + "grad_norm": 1.945754051208496, + "learning_rate": 3.486415739587511e-05, + "loss": 0.1275, + "step": 14840 + }, + { + "epoch": 0.9306260575296108, + "grad_norm": 3.5652472972869873, + "learning_rate": 3.485360241497963e-05, + "loss": 0.1644, + "step": 14850 + }, + { + "epoch": 0.9312527417434355, + "grad_norm": 0.10736887902021408, + "learning_rate": 3.4843047434084145e-05, + "loss": 0.0855, + "step": 14860 + }, + { + "epoch": 0.9318794259572601, + "grad_norm": 0.14352063834667206, + "learning_rate": 3.483249245318866e-05, + "loss": 0.1366, + "step": 14870 + }, + { + "epoch": 0.9325061101710848, + "grad_norm": 0.2069852501153946, + "learning_rate": 3.482193747229318e-05, + "loss": 0.131, + "step": 14880 + }, + { + "epoch": 0.9331327943849095, + "grad_norm": 2.4952032566070557, + "learning_rate": 3.4811382491397695e-05, + "loss": 0.043, + "step": 14890 + }, + { + "epoch": 0.9337594785987341, + "grad_norm": 0.050907865166664124, + "learning_rate": 3.480082751050221e-05, + "loss": 0.1149, + "step": 14900 + }, + { + "epoch": 0.9343861628125587, + "grad_norm": 0.11749767512083054, + "learning_rate": 3.479027252960672e-05, + "loss": 0.0684, + "step": 14910 + }, + { + "epoch": 0.9350128470263834, + "grad_norm": 94.1059341430664, + "learning_rate": 3.477971754871124e-05, + "loss": 0.2177, + "step": 14920 + }, + { + "epoch": 0.935639531240208, + "grad_norm": 0.7639022469520569, + "learning_rate": 3.4769162567815755e-05, + "loss": 0.3833, + "step": 14930 + }, + { + "epoch": 0.9362662154540327, + "grad_norm": 5.576292514801025, + "learning_rate": 3.475860758692027e-05, + "loss": 0.3064, + "step": 14940 + }, + { + "epoch": 0.9368928996678574, + "grad_norm": 0.25809335708618164, + "learning_rate": 3.474805260602478e-05, + "loss": 0.0185, + "step": 14950 + }, + { + "epoch": 0.937519583881682, + "grad_norm": 0.10612110793590546, + "learning_rate": 3.47374976251293e-05, + "loss": 0.0613, + "step": 14960 + }, + { + "epoch": 0.9381462680955067, + "grad_norm": 0.11804734915494919, + "learning_rate": 3.4726942644233815e-05, + "loss": 0.0727, + "step": 14970 + }, + { + "epoch": 0.9387729523093313, + "grad_norm": 0.1256975531578064, + "learning_rate": 3.471638766333833e-05, + "loss": 0.1307, + "step": 14980 + }, + { + "epoch": 0.939399636523156, + "grad_norm": 1.1834015846252441, + "learning_rate": 3.470583268244285e-05, + "loss": 0.1811, + "step": 14990 + }, + { + "epoch": 0.9400263207369807, + "grad_norm": 0.09870567917823792, + "learning_rate": 3.4695277701547365e-05, + "loss": 0.1701, + "step": 15000 + }, + { + "epoch": 0.9406530049508053, + "grad_norm": 0.08171737194061279, + "learning_rate": 3.468472272065188e-05, + "loss": 0.1085, + "step": 15010 + }, + { + "epoch": 0.9412796891646299, + "grad_norm": 0.08438348025083542, + "learning_rate": 3.467416773975639e-05, + "loss": 0.021, + "step": 15020 + }, + { + "epoch": 0.9419063733784546, + "grad_norm": 19.801450729370117, + "learning_rate": 3.466361275886091e-05, + "loss": 0.0466, + "step": 15030 + }, + { + "epoch": 0.9425330575922792, + "grad_norm": 4.311177730560303, + "learning_rate": 3.4653057777965424e-05, + "loss": 0.1786, + "step": 15040 + }, + { + "epoch": 0.943159741806104, + "grad_norm": 3.4445955753326416, + "learning_rate": 3.4642502797069934e-05, + "loss": 0.177, + "step": 15050 + }, + { + "epoch": 0.9437864260199286, + "grad_norm": 0.06916206330060959, + "learning_rate": 3.463194781617445e-05, + "loss": 0.0445, + "step": 15060 + }, + { + "epoch": 0.9444131102337532, + "grad_norm": 0.06181073561310768, + "learning_rate": 3.462139283527897e-05, + "loss": 0.1036, + "step": 15070 + }, + { + "epoch": 0.9450397944475779, + "grad_norm": 0.6313286423683167, + "learning_rate": 3.461083785438349e-05, + "loss": 0.1272, + "step": 15080 + }, + { + "epoch": 0.9456664786614025, + "grad_norm": 0.22247269749641418, + "learning_rate": 3.4600282873488e-05, + "loss": 0.195, + "step": 15090 + }, + { + "epoch": 0.9462931628752271, + "grad_norm": 0.09834704548120499, + "learning_rate": 3.458972789259252e-05, + "loss": 0.0369, + "step": 15100 + }, + { + "epoch": 0.9469198470890519, + "grad_norm": 41.505699157714844, + "learning_rate": 3.4579172911697034e-05, + "loss": 0.2066, + "step": 15110 + }, + { + "epoch": 0.9475465313028765, + "grad_norm": 0.06724511831998825, + "learning_rate": 3.4568617930801544e-05, + "loss": 0.0907, + "step": 15120 + }, + { + "epoch": 0.9481732155167011, + "grad_norm": 0.7390651702880859, + "learning_rate": 3.455806294990606e-05, + "loss": 0.1586, + "step": 15130 + }, + { + "epoch": 0.9487998997305258, + "grad_norm": 1.8994289636611938, + "learning_rate": 3.454750796901058e-05, + "loss": 0.146, + "step": 15140 + }, + { + "epoch": 0.9494265839443504, + "grad_norm": 0.05240581929683685, + "learning_rate": 3.4536952988115094e-05, + "loss": 0.0999, + "step": 15150 + }, + { + "epoch": 0.9500532681581751, + "grad_norm": 0.12572909891605377, + "learning_rate": 3.4526398007219604e-05, + "loss": 0.2249, + "step": 15160 + }, + { + "epoch": 0.9506799523719998, + "grad_norm": 6.992523670196533, + "learning_rate": 3.451584302632413e-05, + "loss": 0.2247, + "step": 15170 + }, + { + "epoch": 0.9513066365858244, + "grad_norm": 0.2796679437160492, + "learning_rate": 3.4505288045428644e-05, + "loss": 0.0354, + "step": 15180 + }, + { + "epoch": 0.9519333207996491, + "grad_norm": 0.16293998062610626, + "learning_rate": 3.4494733064533154e-05, + "loss": 0.122, + "step": 15190 + }, + { + "epoch": 0.9525600050134737, + "grad_norm": 0.2495272159576416, + "learning_rate": 3.448417808363767e-05, + "loss": 0.0535, + "step": 15200 + }, + { + "epoch": 0.9531866892272983, + "grad_norm": 59.95820617675781, + "learning_rate": 3.447362310274219e-05, + "loss": 0.1418, + "step": 15210 + }, + { + "epoch": 0.953813373441123, + "grad_norm": 2.4325592517852783, + "learning_rate": 3.4463068121846704e-05, + "loss": 0.0899, + "step": 15220 + }, + { + "epoch": 0.9544400576549477, + "grad_norm": 0.05558519810438156, + "learning_rate": 3.4452513140951214e-05, + "loss": 0.0207, + "step": 15230 + }, + { + "epoch": 0.9550667418687723, + "grad_norm": 0.07892566174268723, + "learning_rate": 3.444195816005573e-05, + "loss": 0.151, + "step": 15240 + }, + { + "epoch": 0.955693426082597, + "grad_norm": 0.07319292426109314, + "learning_rate": 3.443140317916025e-05, + "loss": 0.0545, + "step": 15250 + }, + { + "epoch": 0.9563201102964216, + "grad_norm": 0.5627008676528931, + "learning_rate": 3.442084819826476e-05, + "loss": 0.3364, + "step": 15260 + }, + { + "epoch": 0.9569467945102463, + "grad_norm": 0.2820579409599304, + "learning_rate": 3.441029321736928e-05, + "loss": 0.0516, + "step": 15270 + }, + { + "epoch": 0.957573478724071, + "grad_norm": 2.9985768795013428, + "learning_rate": 3.43997382364738e-05, + "loss": 0.2052, + "step": 15280 + }, + { + "epoch": 0.9582001629378956, + "grad_norm": 0.43202292919158936, + "learning_rate": 3.4389183255578314e-05, + "loss": 0.1178, + "step": 15290 + }, + { + "epoch": 0.9588268471517203, + "grad_norm": 0.08718843758106232, + "learning_rate": 3.4378628274682823e-05, + "loss": 0.161, + "step": 15300 + }, + { + "epoch": 0.9594535313655449, + "grad_norm": 0.2119925618171692, + "learning_rate": 3.436807329378734e-05, + "loss": 0.0193, + "step": 15310 + }, + { + "epoch": 0.9600802155793695, + "grad_norm": 0.25086766481399536, + "learning_rate": 3.435751831289186e-05, + "loss": 0.0914, + "step": 15320 + }, + { + "epoch": 0.9607068997931942, + "grad_norm": 0.19801168143749237, + "learning_rate": 3.434696333199637e-05, + "loss": 0.0216, + "step": 15330 + }, + { + "epoch": 0.9613335840070188, + "grad_norm": 1.4686822891235352, + "learning_rate": 3.433640835110088e-05, + "loss": 0.1907, + "step": 15340 + }, + { + "epoch": 0.9619602682208435, + "grad_norm": 7.100295066833496, + "learning_rate": 3.43258533702054e-05, + "loss": 0.2471, + "step": 15350 + }, + { + "epoch": 0.9625869524346682, + "grad_norm": 1.9733787775039673, + "learning_rate": 3.4315298389309917e-05, + "loss": 0.2039, + "step": 15360 + }, + { + "epoch": 0.9632136366484928, + "grad_norm": 0.7774206399917603, + "learning_rate": 3.430474340841443e-05, + "loss": 0.033, + "step": 15370 + }, + { + "epoch": 0.9638403208623175, + "grad_norm": 0.05901852250099182, + "learning_rate": 3.429418842751895e-05, + "loss": 0.1349, + "step": 15380 + }, + { + "epoch": 0.9644670050761421, + "grad_norm": 3.6407742500305176, + "learning_rate": 3.4283633446623466e-05, + "loss": 0.1869, + "step": 15390 + }, + { + "epoch": 0.9650936892899667, + "grad_norm": 6.53506326675415, + "learning_rate": 3.427307846572798e-05, + "loss": 0.2079, + "step": 15400 + }, + { + "epoch": 0.9657203735037915, + "grad_norm": 0.1483003944158554, + "learning_rate": 3.426252348483249e-05, + "loss": 0.0777, + "step": 15410 + }, + { + "epoch": 0.9663470577176161, + "grad_norm": 0.16284331679344177, + "learning_rate": 3.425196850393701e-05, + "loss": 0.1663, + "step": 15420 + }, + { + "epoch": 0.9669737419314407, + "grad_norm": 0.08627568185329437, + "learning_rate": 3.4241413523041526e-05, + "loss": 0.0315, + "step": 15430 + }, + { + "epoch": 0.9676004261452654, + "grad_norm": 0.3580785393714905, + "learning_rate": 3.4230858542146036e-05, + "loss": 0.0236, + "step": 15440 + }, + { + "epoch": 0.96822711035909, + "grad_norm": 2.2506258487701416, + "learning_rate": 3.422030356125055e-05, + "loss": 0.1656, + "step": 15450 + }, + { + "epoch": 0.9688537945729148, + "grad_norm": 0.042580630630254745, + "learning_rate": 3.420974858035507e-05, + "loss": 0.0181, + "step": 15460 + }, + { + "epoch": 0.9694804787867394, + "grad_norm": 4.1472344398498535, + "learning_rate": 3.4199193599459586e-05, + "loss": 0.1191, + "step": 15470 + }, + { + "epoch": 0.970107163000564, + "grad_norm": 0.04683535918593407, + "learning_rate": 3.41886386185641e-05, + "loss": 0.0648, + "step": 15480 + }, + { + "epoch": 0.9707338472143887, + "grad_norm": 0.19521360099315643, + "learning_rate": 3.417808363766862e-05, + "loss": 0.0304, + "step": 15490 + }, + { + "epoch": 0.9713605314282133, + "grad_norm": 0.04040757939219475, + "learning_rate": 3.4167528656773136e-05, + "loss": 0.0192, + "step": 15500 + }, + { + "epoch": 0.9719872156420379, + "grad_norm": 0.8133098483085632, + "learning_rate": 3.4156973675877646e-05, + "loss": 0.2299, + "step": 15510 + }, + { + "epoch": 0.9726138998558627, + "grad_norm": 1.3788111209869385, + "learning_rate": 3.414641869498216e-05, + "loss": 0.0283, + "step": 15520 + }, + { + "epoch": 0.9732405840696873, + "grad_norm": 0.045829709619283676, + "learning_rate": 3.413586371408668e-05, + "loss": 0.1146, + "step": 15530 + }, + { + "epoch": 0.9738672682835119, + "grad_norm": 0.03582155704498291, + "learning_rate": 3.4125308733191196e-05, + "loss": 0.0558, + "step": 15540 + }, + { + "epoch": 0.9744939524973366, + "grad_norm": 0.06874176859855652, + "learning_rate": 3.4114753752295706e-05, + "loss": 0.3368, + "step": 15550 + }, + { + "epoch": 0.9751206367111612, + "grad_norm": 0.11948882043361664, + "learning_rate": 3.410419877140022e-05, + "loss": 0.134, + "step": 15560 + }, + { + "epoch": 0.975747320924986, + "grad_norm": 3.4153361320495605, + "learning_rate": 3.409364379050474e-05, + "loss": 0.2266, + "step": 15570 + }, + { + "epoch": 0.9763740051388106, + "grad_norm": 0.24882833659648895, + "learning_rate": 3.4083088809609256e-05, + "loss": 0.0638, + "step": 15580 + }, + { + "epoch": 0.9770006893526352, + "grad_norm": 2.382737874984741, + "learning_rate": 3.407253382871377e-05, + "loss": 0.1365, + "step": 15590 + }, + { + "epoch": 0.9776273735664599, + "grad_norm": 0.11060065776109695, + "learning_rate": 3.406197884781829e-05, + "loss": 0.1752, + "step": 15600 + }, + { + "epoch": 0.9782540577802845, + "grad_norm": 0.1280880719423294, + "learning_rate": 3.4051423866922806e-05, + "loss": 0.0645, + "step": 15610 + }, + { + "epoch": 0.9788807419941091, + "grad_norm": 4.298983097076416, + "learning_rate": 3.4040868886027316e-05, + "loss": 0.1975, + "step": 15620 + }, + { + "epoch": 0.9795074262079339, + "grad_norm": 0.4864870607852936, + "learning_rate": 3.403031390513183e-05, + "loss": 0.0377, + "step": 15630 + }, + { + "epoch": 0.9801341104217585, + "grad_norm": 0.08115755021572113, + "learning_rate": 3.401975892423635e-05, + "loss": 0.0884, + "step": 15640 + }, + { + "epoch": 0.9807607946355831, + "grad_norm": 0.27335405349731445, + "learning_rate": 3.400920394334086e-05, + "loss": 0.3032, + "step": 15650 + }, + { + "epoch": 0.9813874788494078, + "grad_norm": 0.18419836461544037, + "learning_rate": 3.3998648962445375e-05, + "loss": 0.0266, + "step": 15660 + }, + { + "epoch": 0.9820141630632324, + "grad_norm": 0.09136651456356049, + "learning_rate": 3.39880939815499e-05, + "loss": 0.0558, + "step": 15670 + }, + { + "epoch": 0.9826408472770571, + "grad_norm": 0.06150569021701813, + "learning_rate": 3.3977539000654416e-05, + "loss": 0.2052, + "step": 15680 + }, + { + "epoch": 0.9832675314908818, + "grad_norm": 0.3006777763366699, + "learning_rate": 3.3966984019758925e-05, + "loss": 0.004, + "step": 15690 + }, + { + "epoch": 0.9838942157047064, + "grad_norm": 0.06138232722878456, + "learning_rate": 3.395642903886344e-05, + "loss": 0.1408, + "step": 15700 + }, + { + "epoch": 0.9845208999185311, + "grad_norm": 5.341544151306152, + "learning_rate": 3.394587405796796e-05, + "loss": 0.1527, + "step": 15710 + }, + { + "epoch": 0.9851475841323557, + "grad_norm": 1.9161176681518555, + "learning_rate": 3.393531907707247e-05, + "loss": 0.1775, + "step": 15720 + }, + { + "epoch": 0.9857742683461803, + "grad_norm": 0.08122947812080383, + "learning_rate": 3.3924764096176985e-05, + "loss": 0.111, + "step": 15730 + }, + { + "epoch": 0.986400952560005, + "grad_norm": 0.06884922832250595, + "learning_rate": 3.39142091152815e-05, + "loss": 0.0213, + "step": 15740 + }, + { + "epoch": 0.9870276367738297, + "grad_norm": 0.06372876465320587, + "learning_rate": 3.390365413438602e-05, + "loss": 0.0811, + "step": 15750 + }, + { + "epoch": 0.9876543209876543, + "grad_norm": 0.07776562124490738, + "learning_rate": 3.3893099153490535e-05, + "loss": 0.0972, + "step": 15760 + }, + { + "epoch": 0.988281005201479, + "grad_norm": 3.338428258895874, + "learning_rate": 3.388254417259505e-05, + "loss": 0.1325, + "step": 15770 + }, + { + "epoch": 0.9889076894153036, + "grad_norm": 17.43355941772461, + "learning_rate": 3.387198919169957e-05, + "loss": 0.1241, + "step": 15780 + }, + { + "epoch": 0.9895343736291283, + "grad_norm": 0.08450421690940857, + "learning_rate": 3.3861434210804085e-05, + "loss": 0.1008, + "step": 15790 + }, + { + "epoch": 0.9901610578429529, + "grad_norm": 1.0543551445007324, + "learning_rate": 3.3850879229908595e-05, + "loss": 0.1727, + "step": 15800 + }, + { + "epoch": 0.9907877420567776, + "grad_norm": 0.10355456173419952, + "learning_rate": 3.384032424901311e-05, + "loss": 0.1414, + "step": 15810 + }, + { + "epoch": 0.9914144262706023, + "grad_norm": 0.32224273681640625, + "learning_rate": 3.382976926811763e-05, + "loss": 0.2139, + "step": 15820 + }, + { + "epoch": 0.9920411104844269, + "grad_norm": 1.6528878211975098, + "learning_rate": 3.381921428722214e-05, + "loss": 0.0422, + "step": 15830 + }, + { + "epoch": 0.9926677946982515, + "grad_norm": 0.08288286626338959, + "learning_rate": 3.3808659306326655e-05, + "loss": 0.1074, + "step": 15840 + }, + { + "epoch": 0.9932944789120762, + "grad_norm": 0.5336832404136658, + "learning_rate": 3.379810432543117e-05, + "loss": 0.0239, + "step": 15850 + }, + { + "epoch": 0.9939211631259008, + "grad_norm": 0.07496319711208344, + "learning_rate": 3.378754934453569e-05, + "loss": 0.2356, + "step": 15860 + }, + { + "epoch": 0.9945478473397256, + "grad_norm": 20.032400131225586, + "learning_rate": 3.3776994363640205e-05, + "loss": 0.3109, + "step": 15870 + }, + { + "epoch": 0.9951745315535502, + "grad_norm": 0.8633710145950317, + "learning_rate": 3.376643938274472e-05, + "loss": 0.1438, + "step": 15880 + }, + { + "epoch": 0.9958012157673748, + "grad_norm": 3.0096535682678223, + "learning_rate": 3.375588440184924e-05, + "loss": 0.3369, + "step": 15890 + }, + { + "epoch": 0.9964278999811995, + "grad_norm": 0.24672941863536835, + "learning_rate": 3.374532942095375e-05, + "loss": 0.0294, + "step": 15900 + }, + { + "epoch": 0.9970545841950241, + "grad_norm": 0.13079603016376495, + "learning_rate": 3.3734774440058265e-05, + "loss": 0.1003, + "step": 15910 + }, + { + "epoch": 0.9976812684088487, + "grad_norm": 0.10342691093683243, + "learning_rate": 3.372421945916278e-05, + "loss": 0.0169, + "step": 15920 + }, + { + "epoch": 0.9983079526226735, + "grad_norm": 3.7852447032928467, + "learning_rate": 3.37136644782673e-05, + "loss": 0.248, + "step": 15930 + }, + { + "epoch": 0.9989346368364981, + "grad_norm": 0.4074271023273468, + "learning_rate": 3.370310949737181e-05, + "loss": 0.1473, + "step": 15940 + }, + { + "epoch": 0.9995613210503227, + "grad_norm": 0.073146291077137, + "learning_rate": 3.3692554516476324e-05, + "loss": 0.0179, + "step": 15950 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.9645621181262729, + "eval_f1": 0.9639957278227224, + "eval_loss": 0.1336221545934677, + "eval_precision": 0.9636563068483206, + "eval_recall": 0.9645621181262729, + "eval_runtime": 288.363, + "eval_samples_per_second": 110.676, + "eval_steps_per_second": 13.837, + "step": 15957 + }, + { + "epoch": 1.0001880052641474, + "grad_norm": 1.1847440004348755, + "learning_rate": 3.368199953558084e-05, + "loss": 0.0828, + "step": 15960 + }, + { + "epoch": 1.0008146894779721, + "grad_norm": 0.06779574602842331, + "learning_rate": 3.367144455468536e-05, + "loss": 0.1321, + "step": 15970 + }, + { + "epoch": 1.0014413736917966, + "grad_norm": 0.06015975400805473, + "learning_rate": 3.3660889573789874e-05, + "loss": 0.0055, + "step": 15980 + }, + { + "epoch": 1.0020680579056214, + "grad_norm": 0.04776463657617569, + "learning_rate": 3.365033459289439e-05, + "loss": 0.1423, + "step": 15990 + }, + { + "epoch": 1.002694742119446, + "grad_norm": 3.770029306411743, + "learning_rate": 3.363977961199891e-05, + "loss": 0.1954, + "step": 16000 + }, + { + "epoch": 1.0033214263332706, + "grad_norm": 1.7021197080612183, + "learning_rate": 3.362922463110342e-05, + "loss": 0.1411, + "step": 16010 + }, + { + "epoch": 1.0039481105470953, + "grad_norm": 0.05614056438207626, + "learning_rate": 3.3618669650207934e-05, + "loss": 0.1156, + "step": 16020 + }, + { + "epoch": 1.00457479476092, + "grad_norm": 0.10399208217859268, + "learning_rate": 3.360811466931245e-05, + "loss": 0.1231, + "step": 16030 + }, + { + "epoch": 1.0052014789747445, + "grad_norm": 0.07346727699041367, + "learning_rate": 3.359755968841696e-05, + "loss": 0.0193, + "step": 16040 + }, + { + "epoch": 1.0058281631885693, + "grad_norm": 0.5521314740180969, + "learning_rate": 3.358700470752148e-05, + "loss": 0.0412, + "step": 16050 + }, + { + "epoch": 1.006454847402394, + "grad_norm": 0.34978896379470825, + "learning_rate": 3.3576449726625994e-05, + "loss": 0.0532, + "step": 16060 + }, + { + "epoch": 1.0070815316162185, + "grad_norm": 0.03547803685069084, + "learning_rate": 3.356589474573051e-05, + "loss": 0.1295, + "step": 16070 + }, + { + "epoch": 1.0077082158300432, + "grad_norm": 0.036940619349479675, + "learning_rate": 3.355533976483503e-05, + "loss": 0.0885, + "step": 16080 + }, + { + "epoch": 1.008334900043868, + "grad_norm": 0.6497445106506348, + "learning_rate": 3.3544784783939544e-05, + "loss": 0.1189, + "step": 16090 + }, + { + "epoch": 1.0089615842576924, + "grad_norm": 6.425511360168457, + "learning_rate": 3.353422980304406e-05, + "loss": 0.2203, + "step": 16100 + }, + { + "epoch": 1.0095882684715172, + "grad_norm": 0.05597083643078804, + "learning_rate": 3.352367482214857e-05, + "loss": 0.1882, + "step": 16110 + }, + { + "epoch": 1.010214952685342, + "grad_norm": 4.08495569229126, + "learning_rate": 3.351311984125309e-05, + "loss": 0.1379, + "step": 16120 + }, + { + "epoch": 1.0108416368991666, + "grad_norm": 4.696819305419922, + "learning_rate": 3.3502564860357604e-05, + "loss": 0.2121, + "step": 16130 + }, + { + "epoch": 1.0114683211129911, + "grad_norm": 0.11512627452611923, + "learning_rate": 3.349200987946212e-05, + "loss": 0.0728, + "step": 16140 + }, + { + "epoch": 1.0120950053268158, + "grad_norm": 2.806684970855713, + "learning_rate": 3.348145489856663e-05, + "loss": 0.0728, + "step": 16150 + }, + { + "epoch": 1.0127216895406406, + "grad_norm": 0.11947732418775558, + "learning_rate": 3.347089991767115e-05, + "loss": 0.0752, + "step": 16160 + }, + { + "epoch": 1.013348373754465, + "grad_norm": 0.5134645700454712, + "learning_rate": 3.346034493677567e-05, + "loss": 0.1281, + "step": 16170 + }, + { + "epoch": 1.0139750579682898, + "grad_norm": 0.08177413046360016, + "learning_rate": 3.344978995588019e-05, + "loss": 0.2551, + "step": 16180 + }, + { + "epoch": 1.0146017421821145, + "grad_norm": 2.666290760040283, + "learning_rate": 3.34392349749847e-05, + "loss": 0.0988, + "step": 16190 + }, + { + "epoch": 1.015228426395939, + "grad_norm": 0.20499251782894135, + "learning_rate": 3.3428679994089214e-05, + "loss": 0.1529, + "step": 16200 + }, + { + "epoch": 1.0158551106097637, + "grad_norm": 0.056084733456373215, + "learning_rate": 3.341812501319373e-05, + "loss": 0.1688, + "step": 16210 + }, + { + "epoch": 1.0164817948235885, + "grad_norm": 0.2051260620355606, + "learning_rate": 3.340757003229824e-05, + "loss": 0.1783, + "step": 16220 + }, + { + "epoch": 1.017108479037413, + "grad_norm": 0.20934149622917175, + "learning_rate": 3.339701505140276e-05, + "loss": 0.0797, + "step": 16230 + }, + { + "epoch": 1.0177351632512377, + "grad_norm": 8.540879249572754, + "learning_rate": 3.3386460070507273e-05, + "loss": 0.1156, + "step": 16240 + }, + { + "epoch": 1.0183618474650624, + "grad_norm": 0.07769911736249924, + "learning_rate": 3.337590508961179e-05, + "loss": 0.2469, + "step": 16250 + }, + { + "epoch": 1.018988531678887, + "grad_norm": 0.10938192158937454, + "learning_rate": 3.336535010871631e-05, + "loss": 0.1786, + "step": 16260 + }, + { + "epoch": 1.0196152158927116, + "grad_norm": 0.061687126755714417, + "learning_rate": 3.335479512782082e-05, + "loss": 0.0405, + "step": 16270 + }, + { + "epoch": 1.0202419001065364, + "grad_norm": 0.0524788424372673, + "learning_rate": 3.334424014692534e-05, + "loss": 0.0525, + "step": 16280 + }, + { + "epoch": 1.0208685843203609, + "grad_norm": 1.4585634469985962, + "learning_rate": 3.333368516602985e-05, + "loss": 0.1014, + "step": 16290 + }, + { + "epoch": 1.0214952685341856, + "grad_norm": 0.05513199046254158, + "learning_rate": 3.3323130185134367e-05, + "loss": 0.1112, + "step": 16300 + }, + { + "epoch": 1.0221219527480103, + "grad_norm": 0.04087262228131294, + "learning_rate": 3.331257520423888e-05, + "loss": 0.0553, + "step": 16310 + }, + { + "epoch": 1.0227486369618348, + "grad_norm": 0.03982210531830788, + "learning_rate": 3.33020202233434e-05, + "loss": 0.1507, + "step": 16320 + }, + { + "epoch": 1.0233753211756595, + "grad_norm": 0.3110230267047882, + "learning_rate": 3.329146524244791e-05, + "loss": 0.1162, + "step": 16330 + }, + { + "epoch": 1.0240020053894843, + "grad_norm": 0.05139019340276718, + "learning_rate": 3.3280910261552426e-05, + "loss": 0.1322, + "step": 16340 + }, + { + "epoch": 1.024628689603309, + "grad_norm": 4.573475360870361, + "learning_rate": 3.327035528065694e-05, + "loss": 0.2322, + "step": 16350 + }, + { + "epoch": 1.0252553738171335, + "grad_norm": 0.8857044577598572, + "learning_rate": 3.325980029976146e-05, + "loss": 0.0564, + "step": 16360 + }, + { + "epoch": 1.0258820580309582, + "grad_norm": 15.947443962097168, + "learning_rate": 3.3249245318865976e-05, + "loss": 0.111, + "step": 16370 + }, + { + "epoch": 1.026508742244783, + "grad_norm": 0.10292017459869385, + "learning_rate": 3.323869033797049e-05, + "loss": 0.0134, + "step": 16380 + }, + { + "epoch": 1.0271354264586074, + "grad_norm": 0.04638553783297539, + "learning_rate": 3.322813535707501e-05, + "loss": 0.0824, + "step": 16390 + }, + { + "epoch": 1.0277621106724322, + "grad_norm": 0.07538238167762756, + "learning_rate": 3.321758037617952e-05, + "loss": 0.0706, + "step": 16400 + }, + { + "epoch": 1.028388794886257, + "grad_norm": 22.8479061126709, + "learning_rate": 3.3207025395284036e-05, + "loss": 0.1156, + "step": 16410 + }, + { + "epoch": 1.0290154791000814, + "grad_norm": 0.6563345789909363, + "learning_rate": 3.319647041438855e-05, + "loss": 0.064, + "step": 16420 + }, + { + "epoch": 1.0296421633139061, + "grad_norm": 1.6031055450439453, + "learning_rate": 3.318591543349306e-05, + "loss": 0.1772, + "step": 16430 + }, + { + "epoch": 1.0302688475277308, + "grad_norm": 9.018767356872559, + "learning_rate": 3.317536045259758e-05, + "loss": 0.1231, + "step": 16440 + }, + { + "epoch": 1.0308955317415553, + "grad_norm": 0.10965459048748016, + "learning_rate": 3.3164805471702096e-05, + "loss": 0.2208, + "step": 16450 + }, + { + "epoch": 1.03152221595538, + "grad_norm": 1.207930326461792, + "learning_rate": 3.315425049080661e-05, + "loss": 0.0507, + "step": 16460 + }, + { + "epoch": 1.0321489001692048, + "grad_norm": 5.185263633728027, + "learning_rate": 3.314369550991113e-05, + "loss": 0.1507, + "step": 16470 + }, + { + "epoch": 1.0327755843830293, + "grad_norm": 0.04256342723965645, + "learning_rate": 3.3133140529015646e-05, + "loss": 0.2055, + "step": 16480 + }, + { + "epoch": 1.033402268596854, + "grad_norm": 0.7863683104515076, + "learning_rate": 3.312258554812016e-05, + "loss": 0.2199, + "step": 16490 + }, + { + "epoch": 1.0340289528106787, + "grad_norm": 0.0994349867105484, + "learning_rate": 3.311203056722467e-05, + "loss": 0.0236, + "step": 16500 + }, + { + "epoch": 1.0346556370245032, + "grad_norm": 0.051592059433460236, + "learning_rate": 3.310147558632919e-05, + "loss": 0.0409, + "step": 16510 + }, + { + "epoch": 1.035282321238328, + "grad_norm": 0.2096475213766098, + "learning_rate": 3.3090920605433706e-05, + "loss": 0.0062, + "step": 16520 + }, + { + "epoch": 1.0359090054521527, + "grad_norm": 0.027175744995474815, + "learning_rate": 3.308036562453822e-05, + "loss": 0.028, + "step": 16530 + }, + { + "epoch": 1.0365356896659774, + "grad_norm": 0.5925734043121338, + "learning_rate": 3.306981064364273e-05, + "loss": 0.0372, + "step": 16540 + }, + { + "epoch": 1.037162373879802, + "grad_norm": 0.03400746360421181, + "learning_rate": 3.305925566274725e-05, + "loss": 0.0195, + "step": 16550 + }, + { + "epoch": 1.0377890580936266, + "grad_norm": 0.03515337407588959, + "learning_rate": 3.3048700681851766e-05, + "loss": 0.1732, + "step": 16560 + }, + { + "epoch": 1.0384157423074514, + "grad_norm": 17.625837326049805, + "learning_rate": 3.303814570095628e-05, + "loss": 0.1175, + "step": 16570 + }, + { + "epoch": 1.0390424265212759, + "grad_norm": 0.02709149569272995, + "learning_rate": 3.30275907200608e-05, + "loss": 0.0583, + "step": 16580 + }, + { + "epoch": 1.0396691107351006, + "grad_norm": 3.532773971557617, + "learning_rate": 3.3017035739165316e-05, + "loss": 0.2537, + "step": 16590 + }, + { + "epoch": 1.0402957949489253, + "grad_norm": 3.3194799423217773, + "learning_rate": 3.300648075826983e-05, + "loss": 0.2859, + "step": 16600 + }, + { + "epoch": 1.0409224791627498, + "grad_norm": 0.24675169587135315, + "learning_rate": 3.299592577737434e-05, + "loss": 0.0852, + "step": 16610 + }, + { + "epoch": 1.0415491633765745, + "grad_norm": 3.3506369590759277, + "learning_rate": 3.298537079647886e-05, + "loss": 0.1883, + "step": 16620 + }, + { + "epoch": 1.0421758475903993, + "grad_norm": 0.6482851505279541, + "learning_rate": 3.2974815815583375e-05, + "loss": 0.0887, + "step": 16630 + }, + { + "epoch": 1.0428025318042238, + "grad_norm": 0.10803534835577011, + "learning_rate": 3.296426083468789e-05, + "loss": 0.0358, + "step": 16640 + }, + { + "epoch": 1.0434292160180485, + "grad_norm": 0.07975927740335464, + "learning_rate": 3.29537058537924e-05, + "loss": 0.211, + "step": 16650 + }, + { + "epoch": 1.0440559002318732, + "grad_norm": 0.0813465490937233, + "learning_rate": 3.294315087289692e-05, + "loss": 0.0042, + "step": 16660 + }, + { + "epoch": 1.0446825844456977, + "grad_norm": 0.09860538691282272, + "learning_rate": 3.293259589200144e-05, + "loss": 0.0623, + "step": 16670 + }, + { + "epoch": 1.0453092686595225, + "grad_norm": 0.06293389201164246, + "learning_rate": 3.292204091110595e-05, + "loss": 0.0985, + "step": 16680 + }, + { + "epoch": 1.0459359528733472, + "grad_norm": 0.148453950881958, + "learning_rate": 3.291148593021047e-05, + "loss": 0.048, + "step": 16690 + }, + { + "epoch": 1.0465626370871717, + "grad_norm": 0.07422909885644913, + "learning_rate": 3.2900930949314985e-05, + "loss": 0.167, + "step": 16700 + }, + { + "epoch": 1.0471893213009964, + "grad_norm": 0.07262309640645981, + "learning_rate": 3.28903759684195e-05, + "loss": 0.0609, + "step": 16710 + }, + { + "epoch": 1.0478160055148211, + "grad_norm": 0.14245441555976868, + "learning_rate": 3.287982098752401e-05, + "loss": 0.0112, + "step": 16720 + }, + { + "epoch": 1.0484426897286458, + "grad_norm": 0.3390648663043976, + "learning_rate": 3.286926600662853e-05, + "loss": 0.1155, + "step": 16730 + }, + { + "epoch": 1.0490693739424704, + "grad_norm": 0.23542775213718414, + "learning_rate": 3.2858711025733045e-05, + "loss": 0.0935, + "step": 16740 + }, + { + "epoch": 1.049696058156295, + "grad_norm": 21.997127532958984, + "learning_rate": 3.2848156044837555e-05, + "loss": 0.0458, + "step": 16750 + }, + { + "epoch": 1.0503227423701198, + "grad_norm": 0.0299488827586174, + "learning_rate": 3.283760106394208e-05, + "loss": 0.2047, + "step": 16760 + }, + { + "epoch": 1.0509494265839443, + "grad_norm": 0.046134356409311295, + "learning_rate": 3.2827046083046595e-05, + "loss": 0.1051, + "step": 16770 + }, + { + "epoch": 1.051576110797769, + "grad_norm": 2.58500075340271, + "learning_rate": 3.281649110215111e-05, + "loss": 0.0448, + "step": 16780 + }, + { + "epoch": 1.0522027950115938, + "grad_norm": 2.5090854167938232, + "learning_rate": 3.280593612125562e-05, + "loss": 0.1437, + "step": 16790 + }, + { + "epoch": 1.0528294792254183, + "grad_norm": 0.15016499161720276, + "learning_rate": 3.279538114036014e-05, + "loss": 0.0185, + "step": 16800 + }, + { + "epoch": 1.053456163439243, + "grad_norm": 0.15622135996818542, + "learning_rate": 3.2784826159464655e-05, + "loss": 0.2281, + "step": 16810 + }, + { + "epoch": 1.0540828476530677, + "grad_norm": 0.16764254868030548, + "learning_rate": 3.2774271178569165e-05, + "loss": 0.0489, + "step": 16820 + }, + { + "epoch": 1.0547095318668922, + "grad_norm": 13.331740379333496, + "learning_rate": 3.276371619767368e-05, + "loss": 0.2832, + "step": 16830 + }, + { + "epoch": 1.055336216080717, + "grad_norm": 0.08065997809171677, + "learning_rate": 3.27531612167782e-05, + "loss": 0.1134, + "step": 16840 + }, + { + "epoch": 1.0559629002945417, + "grad_norm": 1.8846511840820312, + "learning_rate": 3.2742606235882715e-05, + "loss": 0.0899, + "step": 16850 + }, + { + "epoch": 1.0565895845083662, + "grad_norm": 0.03695497661828995, + "learning_rate": 3.273205125498723e-05, + "loss": 0.122, + "step": 16860 + }, + { + "epoch": 1.0572162687221909, + "grad_norm": 1.0204274654388428, + "learning_rate": 3.272149627409175e-05, + "loss": 0.0522, + "step": 16870 + }, + { + "epoch": 1.0578429529360156, + "grad_norm": 0.9126099944114685, + "learning_rate": 3.2710941293196265e-05, + "loss": 0.0771, + "step": 16880 + }, + { + "epoch": 1.05846963714984, + "grad_norm": 20.570974349975586, + "learning_rate": 3.2700386312300774e-05, + "loss": 0.0991, + "step": 16890 + }, + { + "epoch": 1.0590963213636648, + "grad_norm": 30.219226837158203, + "learning_rate": 3.268983133140529e-05, + "loss": 0.2042, + "step": 16900 + }, + { + "epoch": 1.0597230055774896, + "grad_norm": 0.02713117003440857, + "learning_rate": 3.267927635050981e-05, + "loss": 0.0576, + "step": 16910 + }, + { + "epoch": 1.060349689791314, + "grad_norm": 15.484114646911621, + "learning_rate": 3.2668721369614324e-05, + "loss": 0.0664, + "step": 16920 + }, + { + "epoch": 1.0609763740051388, + "grad_norm": 0.24145345389842987, + "learning_rate": 3.2658166388718834e-05, + "loss": 0.1132, + "step": 16930 + }, + { + "epoch": 1.0616030582189635, + "grad_norm": 10.560787200927734, + "learning_rate": 3.264761140782335e-05, + "loss": 0.1444, + "step": 16940 + }, + { + "epoch": 1.062229742432788, + "grad_norm": 0.014589796774089336, + "learning_rate": 3.263705642692787e-05, + "loss": 0.1334, + "step": 16950 + }, + { + "epoch": 1.0628564266466127, + "grad_norm": 3.2847418785095215, + "learning_rate": 3.2626501446032384e-05, + "loss": 0.1642, + "step": 16960 + }, + { + "epoch": 1.0634831108604375, + "grad_norm": 0.014080416411161423, + "learning_rate": 3.26159464651369e-05, + "loss": 0.068, + "step": 16970 + }, + { + "epoch": 1.0641097950742622, + "grad_norm": 1.794739842414856, + "learning_rate": 3.260539148424142e-05, + "loss": 0.1811, + "step": 16980 + }, + { + "epoch": 1.0647364792880867, + "grad_norm": 0.10375296324491501, + "learning_rate": 3.2594836503345934e-05, + "loss": 0.0504, + "step": 16990 + }, + { + "epoch": 1.0653631635019114, + "grad_norm": 0.10705555230379105, + "learning_rate": 3.2584281522450444e-05, + "loss": 0.1058, + "step": 17000 + }, + { + "epoch": 1.0659898477157361, + "grad_norm": 0.012350371107459068, + "learning_rate": 3.257372654155496e-05, + "loss": 0.1455, + "step": 17010 + }, + { + "epoch": 1.0666165319295606, + "grad_norm": 3.1003317832946777, + "learning_rate": 3.256317156065948e-05, + "loss": 0.1223, + "step": 17020 + }, + { + "epoch": 1.0672432161433854, + "grad_norm": 7.691588878631592, + "learning_rate": 3.2552616579763994e-05, + "loss": 0.1669, + "step": 17030 + }, + { + "epoch": 1.06786990035721, + "grad_norm": 0.1589193344116211, + "learning_rate": 3.2542061598868504e-05, + "loss": 0.0303, + "step": 17040 + }, + { + "epoch": 1.0684965845710346, + "grad_norm": 6.517906665802002, + "learning_rate": 3.253150661797302e-05, + "loss": 0.1478, + "step": 17050 + }, + { + "epoch": 1.0691232687848593, + "grad_norm": 2.8963265419006348, + "learning_rate": 3.252095163707754e-05, + "loss": 0.1978, + "step": 17060 + }, + { + "epoch": 1.069749952998684, + "grad_norm": 0.056512974202632904, + "learning_rate": 3.2510396656182054e-05, + "loss": 0.0276, + "step": 17070 + }, + { + "epoch": 1.0703766372125085, + "grad_norm": 0.2997797131538391, + "learning_rate": 3.249984167528657e-05, + "loss": 0.0334, + "step": 17080 + }, + { + "epoch": 1.0710033214263333, + "grad_norm": 0.012841691263020039, + "learning_rate": 3.248928669439109e-05, + "loss": 0.0108, + "step": 17090 + }, + { + "epoch": 1.071630005640158, + "grad_norm": 0.11793465912342072, + "learning_rate": 3.2478731713495604e-05, + "loss": 0.1028, + "step": 17100 + }, + { + "epoch": 1.0722566898539825, + "grad_norm": 0.20693303644657135, + "learning_rate": 3.2468176732600114e-05, + "loss": 0.2159, + "step": 17110 + }, + { + "epoch": 1.0728833740678072, + "grad_norm": 0.15515050292015076, + "learning_rate": 3.245762175170463e-05, + "loss": 0.0425, + "step": 17120 + }, + { + "epoch": 1.073510058281632, + "grad_norm": 3.174474000930786, + "learning_rate": 3.244706677080915e-05, + "loss": 0.0634, + "step": 17130 + }, + { + "epoch": 1.0741367424954564, + "grad_norm": 0.009767886251211166, + "learning_rate": 3.243651178991366e-05, + "loss": 0.004, + "step": 17140 + }, + { + "epoch": 1.0747634267092812, + "grad_norm": 0.013338018208742142, + "learning_rate": 3.2425956809018173e-05, + "loss": 0.0869, + "step": 17150 + }, + { + "epoch": 1.0753901109231059, + "grad_norm": 0.009402146562933922, + "learning_rate": 3.241540182812269e-05, + "loss": 0.0134, + "step": 17160 + }, + { + "epoch": 1.0760167951369306, + "grad_norm": 4.5529375076293945, + "learning_rate": 3.2404846847227214e-05, + "loss": 0.3214, + "step": 17170 + }, + { + "epoch": 1.076643479350755, + "grad_norm": 0.06630541384220123, + "learning_rate": 3.2394291866331723e-05, + "loss": 0.1297, + "step": 17180 + }, + { + "epoch": 1.0772701635645798, + "grad_norm": 0.06347658485174179, + "learning_rate": 3.238373688543624e-05, + "loss": 0.007, + "step": 17190 + }, + { + "epoch": 1.0778968477784046, + "grad_norm": 0.060882568359375, + "learning_rate": 3.237318190454076e-05, + "loss": 0.0988, + "step": 17200 + }, + { + "epoch": 1.078523531992229, + "grad_norm": 0.44615989923477173, + "learning_rate": 3.236262692364527e-05, + "loss": 0.1414, + "step": 17210 + }, + { + "epoch": 1.0791502162060538, + "grad_norm": 0.9382216334342957, + "learning_rate": 3.235207194274978e-05, + "loss": 0.1092, + "step": 17220 + }, + { + "epoch": 1.0797769004198785, + "grad_norm": 3.53702974319458, + "learning_rate": 3.23415169618543e-05, + "loss": 0.0298, + "step": 17230 + }, + { + "epoch": 1.080403584633703, + "grad_norm": 0.23474286496639252, + "learning_rate": 3.2330961980958817e-05, + "loss": 0.0526, + "step": 17240 + }, + { + "epoch": 1.0810302688475277, + "grad_norm": 0.4255073666572571, + "learning_rate": 3.2320407000063326e-05, + "loss": 0.1293, + "step": 17250 + }, + { + "epoch": 1.0816569530613525, + "grad_norm": 9.479437828063965, + "learning_rate": 3.230985201916785e-05, + "loss": 0.2246, + "step": 17260 + }, + { + "epoch": 1.082283637275177, + "grad_norm": 5.882570743560791, + "learning_rate": 3.2299297038272367e-05, + "loss": 0.1457, + "step": 17270 + }, + { + "epoch": 1.0829103214890017, + "grad_norm": 0.03978152573108673, + "learning_rate": 3.2288742057376876e-05, + "loss": 0.0228, + "step": 17280 + }, + { + "epoch": 1.0835370057028264, + "grad_norm": 0.022685598582029343, + "learning_rate": 3.227818707648139e-05, + "loss": 0.0577, + "step": 17290 + }, + { + "epoch": 1.084163689916651, + "grad_norm": 0.03258649632334709, + "learning_rate": 3.226763209558591e-05, + "loss": 0.116, + "step": 17300 + }, + { + "epoch": 1.0847903741304756, + "grad_norm": 3.8251702785491943, + "learning_rate": 3.2257077114690426e-05, + "loss": 0.166, + "step": 17310 + }, + { + "epoch": 1.0854170583443004, + "grad_norm": 4.577685832977295, + "learning_rate": 3.2246522133794936e-05, + "loss": 0.0057, + "step": 17320 + }, + { + "epoch": 1.0860437425581249, + "grad_norm": 0.073435477912426, + "learning_rate": 3.223596715289945e-05, + "loss": 0.0874, + "step": 17330 + }, + { + "epoch": 1.0866704267719496, + "grad_norm": 3.804044723510742, + "learning_rate": 3.222541217200397e-05, + "loss": 0.1952, + "step": 17340 + }, + { + "epoch": 1.0872971109857743, + "grad_norm": 0.16419921815395355, + "learning_rate": 3.2214857191108486e-05, + "loss": 0.1542, + "step": 17350 + }, + { + "epoch": 1.087923795199599, + "grad_norm": 0.1198701560497284, + "learning_rate": 3.2204302210213e-05, + "loss": 0.0191, + "step": 17360 + }, + { + "epoch": 1.0885504794134235, + "grad_norm": 0.06243205815553665, + "learning_rate": 3.219374722931752e-05, + "loss": 0.0052, + "step": 17370 + }, + { + "epoch": 1.0891771636272483, + "grad_norm": 0.048908885568380356, + "learning_rate": 3.2183192248422036e-05, + "loss": 0.1644, + "step": 17380 + }, + { + "epoch": 1.089803847841073, + "grad_norm": 4.926666259765625, + "learning_rate": 3.2172637267526546e-05, + "loss": 0.1288, + "step": 17390 + }, + { + "epoch": 1.0904305320548975, + "grad_norm": 0.06150239333510399, + "learning_rate": 3.216208228663106e-05, + "loss": 0.0992, + "step": 17400 + }, + { + "epoch": 1.0910572162687222, + "grad_norm": 0.06462900340557098, + "learning_rate": 3.215152730573558e-05, + "loss": 0.0036, + "step": 17410 + }, + { + "epoch": 1.091683900482547, + "grad_norm": 0.4827132523059845, + "learning_rate": 3.2140972324840096e-05, + "loss": 0.0116, + "step": 17420 + }, + { + "epoch": 1.0923105846963714, + "grad_norm": 4.205441474914551, + "learning_rate": 3.2130417343944606e-05, + "loss": 0.0806, + "step": 17430 + }, + { + "epoch": 1.0929372689101962, + "grad_norm": 0.048327527940273285, + "learning_rate": 3.211986236304912e-05, + "loss": 0.3297, + "step": 17440 + }, + { + "epoch": 1.0935639531240209, + "grad_norm": 3.5729031562805176, + "learning_rate": 3.210930738215364e-05, + "loss": 0.1418, + "step": 17450 + }, + { + "epoch": 1.0941906373378454, + "grad_norm": 0.17036008834838867, + "learning_rate": 3.2098752401258156e-05, + "loss": 0.2521, + "step": 17460 + }, + { + "epoch": 1.09481732155167, + "grad_norm": 0.14171738922595978, + "learning_rate": 3.208819742036267e-05, + "loss": 0.0681, + "step": 17470 + }, + { + "epoch": 1.0954440057654948, + "grad_norm": 3.630122184753418, + "learning_rate": 3.207764243946719e-05, + "loss": 0.1545, + "step": 17480 + }, + { + "epoch": 1.0960706899793193, + "grad_norm": 6.238872528076172, + "learning_rate": 3.2067087458571706e-05, + "loss": 0.0847, + "step": 17490 + }, + { + "epoch": 1.096697374193144, + "grad_norm": 3.537527561187744, + "learning_rate": 3.2056532477676216e-05, + "loss": 0.1196, + "step": 17500 + }, + { + "epoch": 1.0973240584069688, + "grad_norm": 0.525282084941864, + "learning_rate": 3.204597749678073e-05, + "loss": 0.1227, + "step": 17510 + }, + { + "epoch": 1.0979507426207933, + "grad_norm": 0.16825850307941437, + "learning_rate": 3.203542251588525e-05, + "loss": 0.0205, + "step": 17520 + }, + { + "epoch": 1.098577426834618, + "grad_norm": 33.61454391479492, + "learning_rate": 3.202486753498976e-05, + "loss": 0.1907, + "step": 17530 + }, + { + "epoch": 1.0992041110484427, + "grad_norm": 1.6062792539596558, + "learning_rate": 3.2014312554094275e-05, + "loss": 0.0417, + "step": 17540 + }, + { + "epoch": 1.0998307952622675, + "grad_norm": 0.08073863387107849, + "learning_rate": 3.200375757319879e-05, + "loss": 0.0344, + "step": 17550 + }, + { + "epoch": 1.100457479476092, + "grad_norm": 0.09482072293758392, + "learning_rate": 3.199320259230331e-05, + "loss": 0.1362, + "step": 17560 + }, + { + "epoch": 1.1010841636899167, + "grad_norm": 12.88849925994873, + "learning_rate": 3.1982647611407825e-05, + "loss": 0.0964, + "step": 17570 + }, + { + "epoch": 1.1017108479037414, + "grad_norm": 4.788585186004639, + "learning_rate": 3.197209263051234e-05, + "loss": 0.1735, + "step": 17580 + }, + { + "epoch": 1.102337532117566, + "grad_norm": 0.06468937546014786, + "learning_rate": 3.196153764961686e-05, + "loss": 0.1058, + "step": 17590 + }, + { + "epoch": 1.1029642163313906, + "grad_norm": 0.057759225368499756, + "learning_rate": 3.195098266872137e-05, + "loss": 0.0705, + "step": 17600 + }, + { + "epoch": 1.1035909005452154, + "grad_norm": 0.7674767374992371, + "learning_rate": 3.1940427687825885e-05, + "loss": 0.1252, + "step": 17610 + }, + { + "epoch": 1.1042175847590399, + "grad_norm": 0.2729140818119049, + "learning_rate": 3.19298727069304e-05, + "loss": 0.1003, + "step": 17620 + }, + { + "epoch": 1.1048442689728646, + "grad_norm": 0.08846903592348099, + "learning_rate": 3.191931772603492e-05, + "loss": 0.1048, + "step": 17630 + }, + { + "epoch": 1.1054709531866893, + "grad_norm": 0.16635166108608246, + "learning_rate": 3.190876274513943e-05, + "loss": 0.0924, + "step": 17640 + }, + { + "epoch": 1.1060976374005138, + "grad_norm": 0.08288455754518509, + "learning_rate": 3.1898207764243945e-05, + "loss": 0.1139, + "step": 17650 + }, + { + "epoch": 1.1067243216143385, + "grad_norm": 22.163753509521484, + "learning_rate": 3.188765278334846e-05, + "loss": 0.1695, + "step": 17660 + }, + { + "epoch": 1.1073510058281633, + "grad_norm": 4.949541091918945, + "learning_rate": 3.187709780245298e-05, + "loss": 0.0646, + "step": 17670 + }, + { + "epoch": 1.1079776900419878, + "grad_norm": 0.12406647950410843, + "learning_rate": 3.1866542821557495e-05, + "loss": 0.1347, + "step": 17680 + }, + { + "epoch": 1.1086043742558125, + "grad_norm": 0.058781981468200684, + "learning_rate": 3.185598784066201e-05, + "loss": 0.0371, + "step": 17690 + }, + { + "epoch": 1.1092310584696372, + "grad_norm": 0.07266169041395187, + "learning_rate": 3.184543285976653e-05, + "loss": 0.0737, + "step": 17700 + }, + { + "epoch": 1.1098577426834617, + "grad_norm": 3.22371244430542, + "learning_rate": 3.183487787887104e-05, + "loss": 0.1593, + "step": 17710 + }, + { + "epoch": 1.1104844268972864, + "grad_norm": 0.045021601021289825, + "learning_rate": 3.1824322897975555e-05, + "loss": 0.0134, + "step": 17720 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.021073054522275925, + "learning_rate": 3.181376791708007e-05, + "loss": 0.0462, + "step": 17730 + }, + { + "epoch": 1.1117377953249357, + "grad_norm": 0.6529107689857483, + "learning_rate": 3.180321293618458e-05, + "loss": 0.0337, + "step": 17740 + }, + { + "epoch": 1.1123644795387604, + "grad_norm": 12.296225547790527, + "learning_rate": 3.17926579552891e-05, + "loss": 0.1708, + "step": 17750 + }, + { + "epoch": 1.112991163752585, + "grad_norm": 0.07589639723300934, + "learning_rate": 3.178210297439362e-05, + "loss": 0.1422, + "step": 17760 + }, + { + "epoch": 1.1136178479664096, + "grad_norm": 4.349477767944336, + "learning_rate": 3.177154799349814e-05, + "loss": 0.2205, + "step": 17770 + }, + { + "epoch": 1.1142445321802343, + "grad_norm": 0.06528756022453308, + "learning_rate": 3.176099301260265e-05, + "loss": 0.1047, + "step": 17780 + }, + { + "epoch": 1.114871216394059, + "grad_norm": 12.607084274291992, + "learning_rate": 3.1750438031707165e-05, + "loss": 0.1682, + "step": 17790 + }, + { + "epoch": 1.1154979006078838, + "grad_norm": 6.911495208740234, + "learning_rate": 3.173988305081168e-05, + "loss": 0.2568, + "step": 17800 + }, + { + "epoch": 1.1161245848217083, + "grad_norm": 9.881560325622559, + "learning_rate": 3.17293280699162e-05, + "loss": 0.117, + "step": 17810 + }, + { + "epoch": 1.116751269035533, + "grad_norm": 0.3809300363063812, + "learning_rate": 3.171877308902071e-05, + "loss": 0.1061, + "step": 17820 + }, + { + "epoch": 1.1173779532493577, + "grad_norm": 0.023420821875333786, + "learning_rate": 3.1708218108125224e-05, + "loss": 0.0837, + "step": 17830 + }, + { + "epoch": 1.1180046374631822, + "grad_norm": 2.3442838191986084, + "learning_rate": 3.169766312722974e-05, + "loss": 0.186, + "step": 17840 + }, + { + "epoch": 1.118631321677007, + "grad_norm": 1.4598783254623413, + "learning_rate": 3.168710814633426e-05, + "loss": 0.0194, + "step": 17850 + }, + { + "epoch": 1.1192580058908317, + "grad_norm": 0.017658187076449394, + "learning_rate": 3.1676553165438774e-05, + "loss": 0.1068, + "step": 17860 + }, + { + "epoch": 1.1198846901046562, + "grad_norm": 0.015302395448088646, + "learning_rate": 3.166599818454329e-05, + "loss": 0.0331, + "step": 17870 + }, + { + "epoch": 1.120511374318481, + "grad_norm": 0.8182141184806824, + "learning_rate": 3.165544320364781e-05, + "loss": 0.2699, + "step": 17880 + }, + { + "epoch": 1.1211380585323056, + "grad_norm": 0.027354877442121506, + "learning_rate": 3.164488822275232e-05, + "loss": 0.0664, + "step": 17890 + }, + { + "epoch": 1.1217647427461301, + "grad_norm": 0.11983570456504822, + "learning_rate": 3.1634333241856834e-05, + "loss": 0.0471, + "step": 17900 + }, + { + "epoch": 1.1223914269599549, + "grad_norm": 0.33365902304649353, + "learning_rate": 3.162377826096135e-05, + "loss": 0.0432, + "step": 17910 + }, + { + "epoch": 1.1230181111737796, + "grad_norm": 3.265024423599243, + "learning_rate": 3.161322328006586e-05, + "loss": 0.3213, + "step": 17920 + }, + { + "epoch": 1.123644795387604, + "grad_norm": 4.623913288116455, + "learning_rate": 3.160266829917038e-05, + "loss": 0.2395, + "step": 17930 + }, + { + "epoch": 1.1242714796014288, + "grad_norm": 0.09318087249994278, + "learning_rate": 3.1592113318274894e-05, + "loss": 0.039, + "step": 17940 + }, + { + "epoch": 1.1248981638152535, + "grad_norm": 0.1346893608570099, + "learning_rate": 3.158155833737941e-05, + "loss": 0.0821, + "step": 17950 + }, + { + "epoch": 1.125524848029078, + "grad_norm": 2.6381444931030273, + "learning_rate": 3.157100335648393e-05, + "loss": 0.0319, + "step": 17960 + }, + { + "epoch": 1.1261515322429028, + "grad_norm": 8.567200660705566, + "learning_rate": 3.1560448375588444e-05, + "loss": 0.044, + "step": 17970 + }, + { + "epoch": 1.1267782164567275, + "grad_norm": 0.02193138189613819, + "learning_rate": 3.154989339469296e-05, + "loss": 0.0932, + "step": 17980 + }, + { + "epoch": 1.1274049006705522, + "grad_norm": 1.1436899900436401, + "learning_rate": 3.153933841379747e-05, + "loss": 0.0233, + "step": 17990 + }, + { + "epoch": 1.1280315848843767, + "grad_norm": 1.2188142538070679, + "learning_rate": 3.152878343290199e-05, + "loss": 0.2734, + "step": 18000 + }, + { + "epoch": 1.1286582690982014, + "grad_norm": 14.177412033081055, + "learning_rate": 3.1518228452006504e-05, + "loss": 0.0656, + "step": 18010 + }, + { + "epoch": 1.1292849533120262, + "grad_norm": 0.12195923179388046, + "learning_rate": 3.150767347111102e-05, + "loss": 0.1494, + "step": 18020 + }, + { + "epoch": 1.1299116375258507, + "grad_norm": 5.3001389503479, + "learning_rate": 3.149711849021553e-05, + "loss": 0.0758, + "step": 18030 + }, + { + "epoch": 1.1305383217396754, + "grad_norm": 0.7896440625190735, + "learning_rate": 3.148656350932005e-05, + "loss": 0.0049, + "step": 18040 + }, + { + "epoch": 1.1311650059535001, + "grad_norm": 0.3458056151866913, + "learning_rate": 3.1476008528424564e-05, + "loss": 0.2096, + "step": 18050 + }, + { + "epoch": 1.1317916901673246, + "grad_norm": 0.04001154750585556, + "learning_rate": 3.146545354752908e-05, + "loss": 0.0157, + "step": 18060 + }, + { + "epoch": 1.1324183743811493, + "grad_norm": 0.3892626166343689, + "learning_rate": 3.14548985666336e-05, + "loss": 0.1478, + "step": 18070 + }, + { + "epoch": 1.133045058594974, + "grad_norm": 1.3462265729904175, + "learning_rate": 3.1444343585738114e-05, + "loss": 0.0465, + "step": 18080 + }, + { + "epoch": 1.1336717428087986, + "grad_norm": 0.8281564116477966, + "learning_rate": 3.143378860484263e-05, + "loss": 0.1297, + "step": 18090 + }, + { + "epoch": 1.1342984270226233, + "grad_norm": 0.05272606387734413, + "learning_rate": 3.142323362394714e-05, + "loss": 0.1197, + "step": 18100 + }, + { + "epoch": 1.134925111236448, + "grad_norm": 0.06631779670715332, + "learning_rate": 3.141267864305166e-05, + "loss": 0.1193, + "step": 18110 + }, + { + "epoch": 1.1355517954502725, + "grad_norm": 0.7379500865936279, + "learning_rate": 3.1402123662156173e-05, + "loss": 0.1209, + "step": 18120 + }, + { + "epoch": 1.1361784796640972, + "grad_norm": 0.08505575358867645, + "learning_rate": 3.139156868126068e-05, + "loss": 0.1933, + "step": 18130 + }, + { + "epoch": 1.136805163877922, + "grad_norm": 0.052564263343811035, + "learning_rate": 3.13810137003652e-05, + "loss": 0.0058, + "step": 18140 + }, + { + "epoch": 1.1374318480917465, + "grad_norm": 0.10592754930257797, + "learning_rate": 3.137045871946972e-05, + "loss": 0.2424, + "step": 18150 + }, + { + "epoch": 1.1380585323055712, + "grad_norm": 0.08464646339416504, + "learning_rate": 3.135990373857424e-05, + "loss": 0.0235, + "step": 18160 + }, + { + "epoch": 1.138685216519396, + "grad_norm": 0.025046486407518387, + "learning_rate": 3.134934875767875e-05, + "loss": 0.0385, + "step": 18170 + }, + { + "epoch": 1.1393119007332206, + "grad_norm": 0.019679777324199677, + "learning_rate": 3.1338793776783267e-05, + "loss": 0.0631, + "step": 18180 + }, + { + "epoch": 1.1399385849470451, + "grad_norm": 0.4192022383213043, + "learning_rate": 3.132823879588778e-05, + "loss": 0.1764, + "step": 18190 + }, + { + "epoch": 1.1405652691608699, + "grad_norm": 0.09451547265052795, + "learning_rate": 3.13176838149923e-05, + "loss": 0.1082, + "step": 18200 + }, + { + "epoch": 1.1411919533746944, + "grad_norm": 0.034728892147541046, + "learning_rate": 3.130712883409681e-05, + "loss": 0.1421, + "step": 18210 + }, + { + "epoch": 1.141818637588519, + "grad_norm": 2.024574041366577, + "learning_rate": 3.1296573853201326e-05, + "loss": 0.0126, + "step": 18220 + }, + { + "epoch": 1.1424453218023438, + "grad_norm": 11.830857276916504, + "learning_rate": 3.128601887230584e-05, + "loss": 0.1805, + "step": 18230 + }, + { + "epoch": 1.1430720060161685, + "grad_norm": 8.058704376220703, + "learning_rate": 3.127546389141035e-05, + "loss": 0.1096, + "step": 18240 + }, + { + "epoch": 1.143698690229993, + "grad_norm": 18.651052474975586, + "learning_rate": 3.126490891051487e-05, + "loss": 0.2482, + "step": 18250 + }, + { + "epoch": 1.1443253744438178, + "grad_norm": 0.1312084197998047, + "learning_rate": 3.125435392961939e-05, + "loss": 0.0713, + "step": 18260 + }, + { + "epoch": 1.1449520586576425, + "grad_norm": 5.29236364364624, + "learning_rate": 3.124379894872391e-05, + "loss": 0.159, + "step": 18270 + }, + { + "epoch": 1.145578742871467, + "grad_norm": 3.630553960800171, + "learning_rate": 3.123324396782842e-05, + "loss": 0.1035, + "step": 18280 + }, + { + "epoch": 1.1462054270852917, + "grad_norm": 0.05378236994147301, + "learning_rate": 3.1222688986932936e-05, + "loss": 0.0035, + "step": 18290 + }, + { + "epoch": 1.1468321112991164, + "grad_norm": 0.08268694579601288, + "learning_rate": 3.121213400603745e-05, + "loss": 0.2101, + "step": 18300 + }, + { + "epoch": 1.147458795512941, + "grad_norm": 0.09726406633853912, + "learning_rate": 3.120157902514196e-05, + "loss": 0.1929, + "step": 18310 + }, + { + "epoch": 1.1480854797267657, + "grad_norm": 0.07776237279176712, + "learning_rate": 3.119102404424648e-05, + "loss": 0.074, + "step": 18320 + }, + { + "epoch": 1.1487121639405904, + "grad_norm": 0.05973588675260544, + "learning_rate": 3.1180469063350996e-05, + "loss": 0.0628, + "step": 18330 + }, + { + "epoch": 1.149338848154415, + "grad_norm": 0.030430462211370468, + "learning_rate": 3.116991408245551e-05, + "loss": 0.0241, + "step": 18340 + }, + { + "epoch": 1.1499655323682396, + "grad_norm": 6.213319301605225, + "learning_rate": 3.115935910156003e-05, + "loss": 0.0966, + "step": 18350 + }, + { + "epoch": 1.1505922165820643, + "grad_norm": 0.06798451393842697, + "learning_rate": 3.1148804120664546e-05, + "loss": 0.3239, + "step": 18360 + }, + { + "epoch": 1.151218900795889, + "grad_norm": 0.03651350736618042, + "learning_rate": 3.113824913976906e-05, + "loss": 0.1506, + "step": 18370 + }, + { + "epoch": 1.1518455850097136, + "grad_norm": 10.000849723815918, + "learning_rate": 3.112769415887357e-05, + "loss": 0.1306, + "step": 18380 + }, + { + "epoch": 1.1524722692235383, + "grad_norm": 0.05033571273088455, + "learning_rate": 3.111713917797809e-05, + "loss": 0.148, + "step": 18390 + }, + { + "epoch": 1.1530989534373628, + "grad_norm": 0.07155164331197739, + "learning_rate": 3.1106584197082606e-05, + "loss": 0.1108, + "step": 18400 + }, + { + "epoch": 1.1537256376511875, + "grad_norm": 17.460355758666992, + "learning_rate": 3.109602921618712e-05, + "loss": 0.2014, + "step": 18410 + }, + { + "epoch": 1.1543523218650122, + "grad_norm": 0.22391580045223236, + "learning_rate": 3.108547423529163e-05, + "loss": 0.0475, + "step": 18420 + }, + { + "epoch": 1.154979006078837, + "grad_norm": 0.08404719829559326, + "learning_rate": 3.107491925439615e-05, + "loss": 0.0335, + "step": 18430 + }, + { + "epoch": 1.1556056902926615, + "grad_norm": 4.082967758178711, + "learning_rate": 3.1064364273500666e-05, + "loss": 0.0251, + "step": 18440 + }, + { + "epoch": 1.1562323745064862, + "grad_norm": 0.021450785920023918, + "learning_rate": 3.105380929260518e-05, + "loss": 0.1285, + "step": 18450 + }, + { + "epoch": 1.156859058720311, + "grad_norm": 0.05551962926983833, + "learning_rate": 3.10432543117097e-05, + "loss": 0.0565, + "step": 18460 + }, + { + "epoch": 1.1574857429341354, + "grad_norm": 0.031009657308459282, + "learning_rate": 3.1032699330814216e-05, + "loss": 0.1844, + "step": 18470 + }, + { + "epoch": 1.1581124271479601, + "grad_norm": 0.03998725861310959, + "learning_rate": 3.102214434991873e-05, + "loss": 0.0639, + "step": 18480 + }, + { + "epoch": 1.1587391113617849, + "grad_norm": 6.586679458618164, + "learning_rate": 3.101158936902324e-05, + "loss": 0.1592, + "step": 18490 + }, + { + "epoch": 1.1593657955756094, + "grad_norm": 9.556639671325684, + "learning_rate": 3.100103438812776e-05, + "loss": 0.0797, + "step": 18500 + }, + { + "epoch": 1.159992479789434, + "grad_norm": 0.06767041236162186, + "learning_rate": 3.0990479407232275e-05, + "loss": 0.0086, + "step": 18510 + }, + { + "epoch": 1.1606191640032588, + "grad_norm": 0.03151436150074005, + "learning_rate": 3.0979924426336785e-05, + "loss": 0.0038, + "step": 18520 + }, + { + "epoch": 1.1612458482170833, + "grad_norm": 0.08840905874967575, + "learning_rate": 3.09693694454413e-05, + "loss": 0.0063, + "step": 18530 + }, + { + "epoch": 1.161872532430908, + "grad_norm": 0.03952464088797569, + "learning_rate": 3.095881446454582e-05, + "loss": 0.1326, + "step": 18540 + }, + { + "epoch": 1.1624992166447328, + "grad_norm": 0.06938064843416214, + "learning_rate": 3.0948259483650335e-05, + "loss": 0.1138, + "step": 18550 + }, + { + "epoch": 1.1631259008585575, + "grad_norm": 4.21276330947876, + "learning_rate": 3.093770450275485e-05, + "loss": 0.1467, + "step": 18560 + }, + { + "epoch": 1.163752585072382, + "grad_norm": 2.3090898990631104, + "learning_rate": 3.092714952185937e-05, + "loss": 0.1913, + "step": 18570 + }, + { + "epoch": 1.1643792692862067, + "grad_norm": 8.786662101745605, + "learning_rate": 3.0916594540963885e-05, + "loss": 0.127, + "step": 18580 + }, + { + "epoch": 1.1650059535000312, + "grad_norm": 0.15764133632183075, + "learning_rate": 3.09060395600684e-05, + "loss": 0.0588, + "step": 18590 + }, + { + "epoch": 1.165632637713856, + "grad_norm": 0.08247308433055878, + "learning_rate": 3.089548457917291e-05, + "loss": 0.0592, + "step": 18600 + }, + { + "epoch": 1.1662593219276807, + "grad_norm": 0.4140074849128723, + "learning_rate": 3.088492959827743e-05, + "loss": 0.0054, + "step": 18610 + }, + { + "epoch": 1.1668860061415054, + "grad_norm": 0.03695308417081833, + "learning_rate": 3.0874374617381945e-05, + "loss": 0.1292, + "step": 18620 + }, + { + "epoch": 1.16751269035533, + "grad_norm": 9.783849716186523, + "learning_rate": 3.0863819636486455e-05, + "loss": 0.2868, + "step": 18630 + }, + { + "epoch": 1.1681393745691546, + "grad_norm": 0.05571041628718376, + "learning_rate": 3.085326465559097e-05, + "loss": 0.1062, + "step": 18640 + }, + { + "epoch": 1.1687660587829793, + "grad_norm": 0.5784240961074829, + "learning_rate": 3.084270967469549e-05, + "loss": 0.0648, + "step": 18650 + }, + { + "epoch": 1.1693927429968038, + "grad_norm": 7.899927139282227, + "learning_rate": 3.083215469380001e-05, + "loss": 0.1259, + "step": 18660 + }, + { + "epoch": 1.1700194272106286, + "grad_norm": 1.123893141746521, + "learning_rate": 3.082159971290452e-05, + "loss": 0.1433, + "step": 18670 + }, + { + "epoch": 1.1706461114244533, + "grad_norm": 1.1794332265853882, + "learning_rate": 3.081104473200904e-05, + "loss": 0.223, + "step": 18680 + }, + { + "epoch": 1.1712727956382778, + "grad_norm": 0.06373187899589539, + "learning_rate": 3.0800489751113555e-05, + "loss": 0.0366, + "step": 18690 + }, + { + "epoch": 1.1718994798521025, + "grad_norm": 0.5772104263305664, + "learning_rate": 3.0789934770218065e-05, + "loss": 0.1949, + "step": 18700 + }, + { + "epoch": 1.1725261640659272, + "grad_norm": 3.3197052478790283, + "learning_rate": 3.077937978932258e-05, + "loss": 0.058, + "step": 18710 + }, + { + "epoch": 1.1731528482797517, + "grad_norm": 1.338387131690979, + "learning_rate": 3.07688248084271e-05, + "loss": 0.0886, + "step": 18720 + }, + { + "epoch": 1.1737795324935765, + "grad_norm": 0.7524656653404236, + "learning_rate": 3.0758269827531615e-05, + "loss": 0.0612, + "step": 18730 + }, + { + "epoch": 1.1744062167074012, + "grad_norm": 0.05872412398457527, + "learning_rate": 3.0747714846636125e-05, + "loss": 0.1389, + "step": 18740 + }, + { + "epoch": 1.1750329009212257, + "grad_norm": 0.036100588738918304, + "learning_rate": 3.073715986574064e-05, + "loss": 0.1411, + "step": 18750 + }, + { + "epoch": 1.1756595851350504, + "grad_norm": 0.06897371262311935, + "learning_rate": 3.0726604884845165e-05, + "loss": 0.0841, + "step": 18760 + }, + { + "epoch": 1.1762862693488751, + "grad_norm": 0.0995137169957161, + "learning_rate": 3.0716049903949674e-05, + "loss": 0.0393, + "step": 18770 + }, + { + "epoch": 1.1769129535626996, + "grad_norm": 0.04892328381538391, + "learning_rate": 3.070549492305419e-05, + "loss": 0.0086, + "step": 18780 + }, + { + "epoch": 1.1775396377765244, + "grad_norm": 0.08550518751144409, + "learning_rate": 3.069493994215871e-05, + "loss": 0.0755, + "step": 18790 + }, + { + "epoch": 1.178166321990349, + "grad_norm": 0.03711172938346863, + "learning_rate": 3.0684384961263224e-05, + "loss": 0.109, + "step": 18800 + }, + { + "epoch": 1.1787930062041738, + "grad_norm": 0.09902484714984894, + "learning_rate": 3.0673829980367734e-05, + "loss": 0.1456, + "step": 18810 + }, + { + "epoch": 1.1794196904179983, + "grad_norm": 2.862833261489868, + "learning_rate": 3.066327499947225e-05, + "loss": 0.1037, + "step": 18820 + }, + { + "epoch": 1.180046374631823, + "grad_norm": 0.29862895607948303, + "learning_rate": 3.065272001857677e-05, + "loss": 0.1776, + "step": 18830 + }, + { + "epoch": 1.1806730588456478, + "grad_norm": 2.7370612621307373, + "learning_rate": 3.064216503768128e-05, + "loss": 0.0548, + "step": 18840 + }, + { + "epoch": 1.1812997430594723, + "grad_norm": 2.9477670192718506, + "learning_rate": 3.06316100567858e-05, + "loss": 0.1779, + "step": 18850 + }, + { + "epoch": 1.181926427273297, + "grad_norm": 0.349155455827713, + "learning_rate": 3.062105507589032e-05, + "loss": 0.0444, + "step": 18860 + }, + { + "epoch": 1.1825531114871217, + "grad_norm": 0.0302012600004673, + "learning_rate": 3.0610500094994834e-05, + "loss": 0.0967, + "step": 18870 + }, + { + "epoch": 1.1831797957009462, + "grad_norm": 4.532477855682373, + "learning_rate": 3.0599945114099344e-05, + "loss": 0.335, + "step": 18880 + }, + { + "epoch": 1.183806479914771, + "grad_norm": 0.14545802772045135, + "learning_rate": 3.058939013320386e-05, + "loss": 0.1045, + "step": 18890 + }, + { + "epoch": 1.1844331641285957, + "grad_norm": 8.656229019165039, + "learning_rate": 3.057883515230838e-05, + "loss": 0.3195, + "step": 18900 + }, + { + "epoch": 1.1850598483424202, + "grad_norm": 8.323212623596191, + "learning_rate": 3.056828017141289e-05, + "loss": 0.0389, + "step": 18910 + }, + { + "epoch": 1.185686532556245, + "grad_norm": 0.051557403057813644, + "learning_rate": 3.0557725190517404e-05, + "loss": 0.0346, + "step": 18920 + }, + { + "epoch": 1.1863132167700696, + "grad_norm": 0.1489444226026535, + "learning_rate": 3.054717020962192e-05, + "loss": 0.0837, + "step": 18930 + }, + { + "epoch": 1.1869399009838941, + "grad_norm": 0.04929697886109352, + "learning_rate": 3.053661522872644e-05, + "loss": 0.1245, + "step": 18940 + }, + { + "epoch": 1.1875665851977188, + "grad_norm": 4.793705463409424, + "learning_rate": 3.0526060247830954e-05, + "loss": 0.0366, + "step": 18950 + }, + { + "epoch": 1.1881932694115436, + "grad_norm": 38.849334716796875, + "learning_rate": 3.0515505266935467e-05, + "loss": 0.2209, + "step": 18960 + }, + { + "epoch": 1.188819953625368, + "grad_norm": 12.780930519104004, + "learning_rate": 3.0504950286039984e-05, + "loss": 0.135, + "step": 18970 + }, + { + "epoch": 1.1894466378391928, + "grad_norm": 0.12625622749328613, + "learning_rate": 3.0494395305144497e-05, + "loss": 0.2881, + "step": 18980 + }, + { + "epoch": 1.1900733220530175, + "grad_norm": 0.12638124823570251, + "learning_rate": 3.0483840324249014e-05, + "loss": 0.0959, + "step": 18990 + }, + { + "epoch": 1.1907000062668422, + "grad_norm": 3.6278929710388184, + "learning_rate": 3.047328534335353e-05, + "loss": 0.0529, + "step": 19000 + }, + { + "epoch": 1.1913266904806668, + "grad_norm": 0.10436367988586426, + "learning_rate": 3.0462730362458047e-05, + "loss": 0.0325, + "step": 19010 + }, + { + "epoch": 1.1919533746944915, + "grad_norm": 0.0621584877371788, + "learning_rate": 3.045217538156256e-05, + "loss": 0.1517, + "step": 19020 + }, + { + "epoch": 1.192580058908316, + "grad_norm": 1.4065021276474, + "learning_rate": 3.0441620400667077e-05, + "loss": 0.0239, + "step": 19030 + }, + { + "epoch": 1.1932067431221407, + "grad_norm": 0.06358971446752548, + "learning_rate": 3.0431065419771594e-05, + "loss": 0.0019, + "step": 19040 + }, + { + "epoch": 1.1938334273359654, + "grad_norm": 0.11690735816955566, + "learning_rate": 3.042051043887611e-05, + "loss": 0.2751, + "step": 19050 + }, + { + "epoch": 1.1944601115497901, + "grad_norm": 0.029574111104011536, + "learning_rate": 3.040995545798062e-05, + "loss": 0.101, + "step": 19060 + }, + { + "epoch": 1.1950867957636147, + "grad_norm": 2.2352538108825684, + "learning_rate": 3.0399400477085137e-05, + "loss": 0.0831, + "step": 19070 + }, + { + "epoch": 1.1957134799774394, + "grad_norm": 0.24031168222427368, + "learning_rate": 3.0388845496189657e-05, + "loss": 0.1658, + "step": 19080 + }, + { + "epoch": 1.196340164191264, + "grad_norm": 3.322831392288208, + "learning_rate": 3.0378290515294167e-05, + "loss": 0.0086, + "step": 19090 + }, + { + "epoch": 1.1969668484050886, + "grad_norm": 0.19900059700012207, + "learning_rate": 3.0367735534398683e-05, + "loss": 0.082, + "step": 19100 + }, + { + "epoch": 1.1975935326189133, + "grad_norm": 1.1015654802322388, + "learning_rate": 3.03571805535032e-05, + "loss": 0.1187, + "step": 19110 + }, + { + "epoch": 1.198220216832738, + "grad_norm": 0.13343365490436554, + "learning_rate": 3.0346625572607717e-05, + "loss": 0.1798, + "step": 19120 + }, + { + "epoch": 1.1988469010465626, + "grad_norm": 0.04171142354607582, + "learning_rate": 3.033607059171223e-05, + "loss": 0.0758, + "step": 19130 + }, + { + "epoch": 1.1994735852603873, + "grad_norm": 0.08893130719661713, + "learning_rate": 3.0325515610816747e-05, + "loss": 0.0852, + "step": 19140 + }, + { + "epoch": 1.200100269474212, + "grad_norm": 7.023293495178223, + "learning_rate": 3.0314960629921263e-05, + "loss": 0.0773, + "step": 19150 + }, + { + "epoch": 1.2007269536880365, + "grad_norm": 0.7998108267784119, + "learning_rate": 3.0304405649025773e-05, + "loss": 0.1002, + "step": 19160 + }, + { + "epoch": 1.2013536379018612, + "grad_norm": 0.13294047117233276, + "learning_rate": 3.029385066813029e-05, + "loss": 0.2127, + "step": 19170 + }, + { + "epoch": 1.201980322115686, + "grad_norm": 0.12403099983930588, + "learning_rate": 3.028329568723481e-05, + "loss": 0.0417, + "step": 19180 + }, + { + "epoch": 1.2026070063295107, + "grad_norm": 0.17409439384937286, + "learning_rate": 3.0272740706339326e-05, + "loss": 0.208, + "step": 19190 + }, + { + "epoch": 1.2032336905433352, + "grad_norm": 0.4515335261821747, + "learning_rate": 3.0262185725443836e-05, + "loss": 0.0736, + "step": 19200 + }, + { + "epoch": 1.20386037475716, + "grad_norm": 1.3704180717468262, + "learning_rate": 3.0251630744548353e-05, + "loss": 0.1135, + "step": 19210 + }, + { + "epoch": 1.2044870589709844, + "grad_norm": 10.743453979492188, + "learning_rate": 3.024107576365287e-05, + "loss": 0.0932, + "step": 19220 + }, + { + "epoch": 1.2051137431848091, + "grad_norm": 0.02900216355919838, + "learning_rate": 3.0230520782757383e-05, + "loss": 0.182, + "step": 19230 + }, + { + "epoch": 1.2057404273986339, + "grad_norm": 0.03926697000861168, + "learning_rate": 3.02199658018619e-05, + "loss": 0.0693, + "step": 19240 + }, + { + "epoch": 1.2063671116124586, + "grad_norm": 0.3335837423801422, + "learning_rate": 3.0209410820966416e-05, + "loss": 0.1682, + "step": 19250 + }, + { + "epoch": 1.206993795826283, + "grad_norm": 0.15186485648155212, + "learning_rate": 3.0198855840070933e-05, + "loss": 0.0481, + "step": 19260 + }, + { + "epoch": 1.2076204800401078, + "grad_norm": 0.017995532602071762, + "learning_rate": 3.0188300859175446e-05, + "loss": 0.0787, + "step": 19270 + }, + { + "epoch": 1.2082471642539325, + "grad_norm": 0.07066605240106583, + "learning_rate": 3.0177745878279963e-05, + "loss": 0.0587, + "step": 19280 + }, + { + "epoch": 1.208873848467757, + "grad_norm": 2.0296781063079834, + "learning_rate": 3.016719089738448e-05, + "loss": 0.1208, + "step": 19290 + }, + { + "epoch": 1.2095005326815818, + "grad_norm": 0.06564827263355255, + "learning_rate": 3.015663591648899e-05, + "loss": 0.143, + "step": 19300 + }, + { + "epoch": 1.2101272168954065, + "grad_norm": 0.47882741689682007, + "learning_rate": 3.0146080935593506e-05, + "loss": 0.0057, + "step": 19310 + }, + { + "epoch": 1.210753901109231, + "grad_norm": 0.044205307960510254, + "learning_rate": 3.0135525954698023e-05, + "loss": 0.0397, + "step": 19320 + }, + { + "epoch": 1.2113805853230557, + "grad_norm": 0.08370712399482727, + "learning_rate": 3.0124970973802543e-05, + "loss": 0.1867, + "step": 19330 + }, + { + "epoch": 1.2120072695368804, + "grad_norm": 0.859338641166687, + "learning_rate": 3.0114415992907052e-05, + "loss": 0.056, + "step": 19340 + }, + { + "epoch": 1.212633953750705, + "grad_norm": 0.03434952720999718, + "learning_rate": 3.010386101201157e-05, + "loss": 0.0506, + "step": 19350 + }, + { + "epoch": 1.2132606379645297, + "grad_norm": 0.1044652909040451, + "learning_rate": 3.0093306031116086e-05, + "loss": 0.0492, + "step": 19360 + }, + { + "epoch": 1.2138873221783544, + "grad_norm": 0.012052880600094795, + "learning_rate": 3.00827510502206e-05, + "loss": 0.0333, + "step": 19370 + }, + { + "epoch": 1.214514006392179, + "grad_norm": 0.021363429725170135, + "learning_rate": 3.0072196069325116e-05, + "loss": 0.1359, + "step": 19380 + }, + { + "epoch": 1.2151406906060036, + "grad_norm": 0.07590274512767792, + "learning_rate": 3.0061641088429632e-05, + "loss": 0.1013, + "step": 19390 + }, + { + "epoch": 1.2157673748198283, + "grad_norm": 0.4024260640144348, + "learning_rate": 3.005108610753415e-05, + "loss": 0.0974, + "step": 19400 + }, + { + "epoch": 1.2163940590336528, + "grad_norm": 0.035482216626405716, + "learning_rate": 3.004053112663866e-05, + "loss": 0.0986, + "step": 19410 + }, + { + "epoch": 1.2170207432474776, + "grad_norm": 0.7663148045539856, + "learning_rate": 3.0029976145743176e-05, + "loss": 0.1216, + "step": 19420 + }, + { + "epoch": 1.2176474274613023, + "grad_norm": 0.33147960901260376, + "learning_rate": 3.0019421164847696e-05, + "loss": 0.0986, + "step": 19430 + }, + { + "epoch": 1.218274111675127, + "grad_norm": 0.25695696473121643, + "learning_rate": 3.0008866183952212e-05, + "loss": 0.1765, + "step": 19440 + }, + { + "epoch": 1.2189007958889515, + "grad_norm": 0.01739318296313286, + "learning_rate": 2.9998311203056722e-05, + "loss": 0.126, + "step": 19450 + }, + { + "epoch": 1.2195274801027762, + "grad_norm": 1.8818984031677246, + "learning_rate": 2.998775622216124e-05, + "loss": 0.0637, + "step": 19460 + }, + { + "epoch": 1.220154164316601, + "grad_norm": 0.01870718225836754, + "learning_rate": 2.9977201241265755e-05, + "loss": 0.094, + "step": 19470 + }, + { + "epoch": 1.2207808485304255, + "grad_norm": 0.12540331482887268, + "learning_rate": 2.996664626037027e-05, + "loss": 0.108, + "step": 19480 + }, + { + "epoch": 1.2214075327442502, + "grad_norm": 3.370209217071533, + "learning_rate": 2.9956091279474785e-05, + "loss": 0.1678, + "step": 19490 + }, + { + "epoch": 1.222034216958075, + "grad_norm": 3.6439049243927, + "learning_rate": 2.9945536298579302e-05, + "loss": 0.244, + "step": 19500 + }, + { + "epoch": 1.2226609011718994, + "grad_norm": 0.9234188199043274, + "learning_rate": 2.993498131768382e-05, + "loss": 0.0357, + "step": 19510 + }, + { + "epoch": 1.2232875853857241, + "grad_norm": 0.31463149189949036, + "learning_rate": 2.9924426336788332e-05, + "loss": 0.0155, + "step": 19520 + }, + { + "epoch": 1.2239142695995489, + "grad_norm": 0.22235389053821564, + "learning_rate": 2.991387135589285e-05, + "loss": 0.1766, + "step": 19530 + }, + { + "epoch": 1.2245409538133734, + "grad_norm": 0.1038447692990303, + "learning_rate": 2.9903316374997365e-05, + "loss": 0.0483, + "step": 19540 + }, + { + "epoch": 1.225167638027198, + "grad_norm": 0.10996226221323013, + "learning_rate": 2.9892761394101875e-05, + "loss": 0.0979, + "step": 19550 + }, + { + "epoch": 1.2257943222410228, + "grad_norm": 0.05760728940367699, + "learning_rate": 2.988220641320639e-05, + "loss": 0.1704, + "step": 19560 + }, + { + "epoch": 1.2264210064548473, + "grad_norm": 0.3645038604736328, + "learning_rate": 2.987165143231091e-05, + "loss": 0.1405, + "step": 19570 + }, + { + "epoch": 1.227047690668672, + "grad_norm": 3.712083101272583, + "learning_rate": 2.986109645141543e-05, + "loss": 0.0835, + "step": 19580 + }, + { + "epoch": 1.2276743748824968, + "grad_norm": 0.049852531403303146, + "learning_rate": 2.9850541470519938e-05, + "loss": 0.1232, + "step": 19590 + }, + { + "epoch": 1.2283010590963213, + "grad_norm": 0.0666368305683136, + "learning_rate": 2.9839986489624455e-05, + "loss": 0.1781, + "step": 19600 + }, + { + "epoch": 1.228927743310146, + "grad_norm": 0.15124481916427612, + "learning_rate": 2.982943150872897e-05, + "loss": 0.2627, + "step": 19610 + }, + { + "epoch": 1.2295544275239707, + "grad_norm": 0.1682569831609726, + "learning_rate": 2.9818876527833485e-05, + "loss": 0.0087, + "step": 19620 + }, + { + "epoch": 1.2301811117377954, + "grad_norm": 0.1204722449183464, + "learning_rate": 2.9808321546938e-05, + "loss": 0.013, + "step": 19630 + }, + { + "epoch": 1.23080779595162, + "grad_norm": 5.767759799957275, + "learning_rate": 2.9797766566042518e-05, + "loss": 0.25, + "step": 19640 + }, + { + "epoch": 1.2314344801654447, + "grad_norm": 0.25917211174964905, + "learning_rate": 2.9787211585147035e-05, + "loss": 0.1393, + "step": 19650 + }, + { + "epoch": 1.2320611643792694, + "grad_norm": 0.15553507208824158, + "learning_rate": 2.9776656604251545e-05, + "loss": 0.0623, + "step": 19660 + }, + { + "epoch": 1.2326878485930939, + "grad_norm": 3.4714977741241455, + "learning_rate": 2.9766101623356065e-05, + "loss": 0.157, + "step": 19670 + }, + { + "epoch": 1.2333145328069186, + "grad_norm": 0.20367176830768585, + "learning_rate": 2.975554664246058e-05, + "loss": 0.0247, + "step": 19680 + }, + { + "epoch": 1.2339412170207433, + "grad_norm": 0.09446282684803009, + "learning_rate": 2.974499166156509e-05, + "loss": 0.139, + "step": 19690 + }, + { + "epoch": 1.2345679012345678, + "grad_norm": 0.0783671885728836, + "learning_rate": 2.9734436680669608e-05, + "loss": 0.0595, + "step": 19700 + }, + { + "epoch": 1.2351945854483926, + "grad_norm": 0.07558191567659378, + "learning_rate": 2.9723881699774125e-05, + "loss": 0.0884, + "step": 19710 + }, + { + "epoch": 1.2358212696622173, + "grad_norm": 0.12064258009195328, + "learning_rate": 2.971332671887864e-05, + "loss": 0.0598, + "step": 19720 + }, + { + "epoch": 1.2364479538760418, + "grad_norm": 22.905738830566406, + "learning_rate": 2.9702771737983154e-05, + "loss": 0.0969, + "step": 19730 + }, + { + "epoch": 1.2370746380898665, + "grad_norm": 0.04529252275824547, + "learning_rate": 2.969221675708767e-05, + "loss": 0.0691, + "step": 19740 + }, + { + "epoch": 1.2377013223036912, + "grad_norm": 0.08773695677518845, + "learning_rate": 2.9681661776192188e-05, + "loss": 0.142, + "step": 19750 + }, + { + "epoch": 1.2383280065175157, + "grad_norm": 0.12956632673740387, + "learning_rate": 2.9671106795296698e-05, + "loss": 0.0385, + "step": 19760 + }, + { + "epoch": 1.2389546907313405, + "grad_norm": 0.10451506078243256, + "learning_rate": 2.9660551814401218e-05, + "loss": 0.1296, + "step": 19770 + }, + { + "epoch": 1.2395813749451652, + "grad_norm": 0.03981676697731018, + "learning_rate": 2.9649996833505734e-05, + "loss": 0.0055, + "step": 19780 + }, + { + "epoch": 1.2402080591589897, + "grad_norm": 5.25244665145874, + "learning_rate": 2.963944185261025e-05, + "loss": 0.011, + "step": 19790 + }, + { + "epoch": 1.2408347433728144, + "grad_norm": 0.01934889517724514, + "learning_rate": 2.962888687171476e-05, + "loss": 0.12, + "step": 19800 + }, + { + "epoch": 1.2414614275866391, + "grad_norm": 0.018547803163528442, + "learning_rate": 2.9618331890819277e-05, + "loss": 0.0381, + "step": 19810 + }, + { + "epoch": 1.2420881118004639, + "grad_norm": 0.03331906720995903, + "learning_rate": 2.9607776909923794e-05, + "loss": 0.1396, + "step": 19820 + }, + { + "epoch": 1.2427147960142884, + "grad_norm": 0.04414796829223633, + "learning_rate": 2.9597221929028314e-05, + "loss": 0.0023, + "step": 19830 + }, + { + "epoch": 1.243341480228113, + "grad_norm": 0.03142017871141434, + "learning_rate": 2.9586666948132824e-05, + "loss": 0.1793, + "step": 19840 + }, + { + "epoch": 1.2439681644419376, + "grad_norm": 0.02367275580763817, + "learning_rate": 2.957611196723734e-05, + "loss": 0.1257, + "step": 19850 + }, + { + "epoch": 1.2445948486557623, + "grad_norm": 24.037185668945312, + "learning_rate": 2.9565556986341857e-05, + "loss": 0.1726, + "step": 19860 + }, + { + "epoch": 1.245221532869587, + "grad_norm": 0.10329566150903702, + "learning_rate": 2.955500200544637e-05, + "loss": 0.1484, + "step": 19870 + }, + { + "epoch": 1.2458482170834118, + "grad_norm": 0.04350671544671059, + "learning_rate": 2.9544447024550887e-05, + "loss": 0.0351, + "step": 19880 + }, + { + "epoch": 1.2464749012972363, + "grad_norm": 10.928489685058594, + "learning_rate": 2.9533892043655404e-05, + "loss": 0.2075, + "step": 19890 + }, + { + "epoch": 1.247101585511061, + "grad_norm": 0.06471390277147293, + "learning_rate": 2.952333706275992e-05, + "loss": 0.0423, + "step": 19900 + }, + { + "epoch": 1.2477282697248857, + "grad_norm": 0.08085936307907104, + "learning_rate": 2.951278208186443e-05, + "loss": 0.1876, + "step": 19910 + }, + { + "epoch": 1.2483549539387102, + "grad_norm": 0.16564467549324036, + "learning_rate": 2.950222710096895e-05, + "loss": 0.161, + "step": 19920 + }, + { + "epoch": 1.248981638152535, + "grad_norm": 0.34986400604248047, + "learning_rate": 2.9491672120073467e-05, + "loss": 0.0752, + "step": 19930 + }, + { + "epoch": 1.2496083223663597, + "grad_norm": 0.02159041538834572, + "learning_rate": 2.9481117139177977e-05, + "loss": 0.1011, + "step": 19940 + }, + { + "epoch": 1.2502350065801844, + "grad_norm": 0.03885908052325249, + "learning_rate": 2.9470562158282494e-05, + "loss": 0.0151, + "step": 19950 + }, + { + "epoch": 1.2508616907940089, + "grad_norm": 0.02896152250468731, + "learning_rate": 2.946000717738701e-05, + "loss": 0.189, + "step": 19960 + }, + { + "epoch": 1.2514883750078336, + "grad_norm": 4.279455661773682, + "learning_rate": 2.9449452196491527e-05, + "loss": 0.1141, + "step": 19970 + }, + { + "epoch": 1.252115059221658, + "grad_norm": 0.07367860525846481, + "learning_rate": 2.943889721559604e-05, + "loss": 0.0584, + "step": 19980 + }, + { + "epoch": 1.2527417434354828, + "grad_norm": 1.4007365703582764, + "learning_rate": 2.9428342234700557e-05, + "loss": 0.1464, + "step": 19990 + }, + { + "epoch": 1.2533684276493076, + "grad_norm": 3.3747365474700928, + "learning_rate": 2.9417787253805074e-05, + "loss": 0.0838, + "step": 20000 + }, + { + "epoch": 1.2539951118631323, + "grad_norm": 3.90412974357605, + "learning_rate": 2.9407232272909583e-05, + "loss": 0.0122, + "step": 20010 + }, + { + "epoch": 1.2546217960769568, + "grad_norm": 0.6516067981719971, + "learning_rate": 2.9396677292014103e-05, + "loss": 0.1077, + "step": 20020 + }, + { + "epoch": 1.2552484802907815, + "grad_norm": 10.695253372192383, + "learning_rate": 2.938612231111862e-05, + "loss": 0.2566, + "step": 20030 + }, + { + "epoch": 1.255875164504606, + "grad_norm": 0.032031357288360596, + "learning_rate": 2.9375567330223137e-05, + "loss": 0.1136, + "step": 20040 + }, + { + "epoch": 1.2565018487184307, + "grad_norm": 0.04882671311497688, + "learning_rate": 2.9365012349327647e-05, + "loss": 0.0501, + "step": 20050 + }, + { + "epoch": 1.2571285329322555, + "grad_norm": 0.25139033794403076, + "learning_rate": 2.9354457368432163e-05, + "loss": 0.1784, + "step": 20060 + }, + { + "epoch": 1.2577552171460802, + "grad_norm": 1.5231380462646484, + "learning_rate": 2.934390238753668e-05, + "loss": 0.0535, + "step": 20070 + }, + { + "epoch": 1.2583819013599047, + "grad_norm": 5.682507514953613, + "learning_rate": 2.9333347406641193e-05, + "loss": 0.1149, + "step": 20080 + }, + { + "epoch": 1.2590085855737294, + "grad_norm": 5.259180545806885, + "learning_rate": 2.932279242574571e-05, + "loss": 0.0943, + "step": 20090 + }, + { + "epoch": 1.259635269787554, + "grad_norm": 0.047352973371744156, + "learning_rate": 2.9312237444850226e-05, + "loss": 0.0046, + "step": 20100 + }, + { + "epoch": 1.2602619540013786, + "grad_norm": 6.0193328857421875, + "learning_rate": 2.9301682463954743e-05, + "loss": 0.1647, + "step": 20110 + }, + { + "epoch": 1.2608886382152034, + "grad_norm": 14.669256210327148, + "learning_rate": 2.9291127483059256e-05, + "loss": 0.0057, + "step": 20120 + }, + { + "epoch": 1.261515322429028, + "grad_norm": 12.382940292358398, + "learning_rate": 2.9280572502163773e-05, + "loss": 0.0715, + "step": 20130 + }, + { + "epoch": 1.2621420066428526, + "grad_norm": 0.02463279478251934, + "learning_rate": 2.927001752126829e-05, + "loss": 0.0771, + "step": 20140 + }, + { + "epoch": 1.2627686908566773, + "grad_norm": 0.10457153618335724, + "learning_rate": 2.92594625403728e-05, + "loss": 0.1199, + "step": 20150 + }, + { + "epoch": 1.263395375070502, + "grad_norm": 8.55932903289795, + "learning_rate": 2.9248907559477316e-05, + "loss": 0.1022, + "step": 20160 + }, + { + "epoch": 1.2640220592843265, + "grad_norm": 0.043147992342710495, + "learning_rate": 2.9238352578581836e-05, + "loss": 0.0658, + "step": 20170 + }, + { + "epoch": 1.2646487434981513, + "grad_norm": 27.947301864624023, + "learning_rate": 2.9227797597686353e-05, + "loss": 0.0859, + "step": 20180 + }, + { + "epoch": 1.265275427711976, + "grad_norm": 0.02526325173676014, + "learning_rate": 2.9217242616790863e-05, + "loss": 0.0019, + "step": 20190 + }, + { + "epoch": 1.2659021119258007, + "grad_norm": 0.09650743752717972, + "learning_rate": 2.920668763589538e-05, + "loss": 0.0923, + "step": 20200 + }, + { + "epoch": 1.2665287961396252, + "grad_norm": 0.6603000164031982, + "learning_rate": 2.9196132654999896e-05, + "loss": 0.0079, + "step": 20210 + }, + { + "epoch": 1.26715548035345, + "grad_norm": 3.661068916320801, + "learning_rate": 2.9185577674104413e-05, + "loss": 0.167, + "step": 20220 + }, + { + "epoch": 1.2677821645672744, + "grad_norm": 0.8906214833259583, + "learning_rate": 2.9175022693208926e-05, + "loss": 0.1106, + "step": 20230 + }, + { + "epoch": 1.2684088487810992, + "grad_norm": 0.2638370990753174, + "learning_rate": 2.9164467712313443e-05, + "loss": 0.1843, + "step": 20240 + }, + { + "epoch": 1.2690355329949239, + "grad_norm": 3.85107684135437, + "learning_rate": 2.915391273141796e-05, + "loss": 0.1137, + "step": 20250 + }, + { + "epoch": 1.2696622172087486, + "grad_norm": 0.170636385679245, + "learning_rate": 2.914335775052247e-05, + "loss": 0.1177, + "step": 20260 + }, + { + "epoch": 1.2702889014225731, + "grad_norm": 0.25960955023765564, + "learning_rate": 2.913280276962699e-05, + "loss": 0.1968, + "step": 20270 + }, + { + "epoch": 1.2709155856363978, + "grad_norm": 4.221624851226807, + "learning_rate": 2.9122247788731506e-05, + "loss": 0.0598, + "step": 20280 + }, + { + "epoch": 1.2715422698502223, + "grad_norm": 1.4436137676239014, + "learning_rate": 2.9111692807836023e-05, + "loss": 0.0213, + "step": 20290 + }, + { + "epoch": 1.272168954064047, + "grad_norm": 0.39366188645362854, + "learning_rate": 2.9101137826940532e-05, + "loss": 0.1929, + "step": 20300 + }, + { + "epoch": 1.2727956382778718, + "grad_norm": 0.37669408321380615, + "learning_rate": 2.909058284604505e-05, + "loss": 0.0468, + "step": 20310 + }, + { + "epoch": 1.2734223224916965, + "grad_norm": 0.015775645151734352, + "learning_rate": 2.9080027865149566e-05, + "loss": 0.1132, + "step": 20320 + }, + { + "epoch": 1.274049006705521, + "grad_norm": 0.24999144673347473, + "learning_rate": 2.906947288425408e-05, + "loss": 0.1911, + "step": 20330 + }, + { + "epoch": 1.2746756909193457, + "grad_norm": 0.21520374715328217, + "learning_rate": 2.9058917903358596e-05, + "loss": 0.0826, + "step": 20340 + }, + { + "epoch": 1.2753023751331705, + "grad_norm": 2.923757314682007, + "learning_rate": 2.9048362922463112e-05, + "loss": 0.1644, + "step": 20350 + }, + { + "epoch": 1.275929059346995, + "grad_norm": 1.0977178812026978, + "learning_rate": 2.903780794156763e-05, + "loss": 0.0665, + "step": 20360 + }, + { + "epoch": 1.2765557435608197, + "grad_norm": 1.5767126083374023, + "learning_rate": 2.9027252960672142e-05, + "loss": 0.0443, + "step": 20370 + }, + { + "epoch": 1.2771824277746444, + "grad_norm": 1.1978888511657715, + "learning_rate": 2.901669797977666e-05, + "loss": 0.0876, + "step": 20380 + }, + { + "epoch": 1.2778091119884691, + "grad_norm": 0.14035820960998535, + "learning_rate": 2.9006142998881175e-05, + "loss": 0.1417, + "step": 20390 + }, + { + "epoch": 1.2784357962022936, + "grad_norm": 0.056992143392562866, + "learning_rate": 2.8995588017985685e-05, + "loss": 0.1431, + "step": 20400 + }, + { + "epoch": 1.2790624804161184, + "grad_norm": 0.09089019894599915, + "learning_rate": 2.8985033037090202e-05, + "loss": 0.0829, + "step": 20410 + }, + { + "epoch": 1.2796891646299429, + "grad_norm": 0.05624585598707199, + "learning_rate": 2.8974478056194722e-05, + "loss": 0.0377, + "step": 20420 + }, + { + "epoch": 1.2803158488437676, + "grad_norm": 27.855571746826172, + "learning_rate": 2.896392307529924e-05, + "loss": 0.3182, + "step": 20430 + }, + { + "epoch": 1.2809425330575923, + "grad_norm": 0.08453124761581421, + "learning_rate": 2.895336809440375e-05, + "loss": 0.1624, + "step": 20440 + }, + { + "epoch": 1.281569217271417, + "grad_norm": 0.27798300981521606, + "learning_rate": 2.8942813113508265e-05, + "loss": 0.0519, + "step": 20450 + }, + { + "epoch": 1.2821959014852415, + "grad_norm": 3.423630714416504, + "learning_rate": 2.8932258132612782e-05, + "loss": 0.0672, + "step": 20460 + }, + { + "epoch": 1.2828225856990663, + "grad_norm": 1.1507163047790527, + "learning_rate": 2.8921703151717295e-05, + "loss": 0.0965, + "step": 20470 + }, + { + "epoch": 1.2834492699128908, + "grad_norm": 0.03121231682598591, + "learning_rate": 2.8911148170821812e-05, + "loss": 0.007, + "step": 20480 + }, + { + "epoch": 1.2840759541267155, + "grad_norm": 0.4905284345149994, + "learning_rate": 2.890059318992633e-05, + "loss": 0.0541, + "step": 20490 + }, + { + "epoch": 1.2847026383405402, + "grad_norm": 0.020438535138964653, + "learning_rate": 2.8890038209030845e-05, + "loss": 0.002, + "step": 20500 + }, + { + "epoch": 1.285329322554365, + "grad_norm": 5.502569198608398, + "learning_rate": 2.887948322813536e-05, + "loss": 0.293, + "step": 20510 + }, + { + "epoch": 1.2859560067681894, + "grad_norm": 0.04871942475438118, + "learning_rate": 2.8868928247239875e-05, + "loss": 0.08, + "step": 20520 + }, + { + "epoch": 1.2865826909820142, + "grad_norm": 0.3004648685455322, + "learning_rate": 2.885837326634439e-05, + "loss": 0.0476, + "step": 20530 + }, + { + "epoch": 1.287209375195839, + "grad_norm": 6.2745041847229, + "learning_rate": 2.88478182854489e-05, + "loss": 0.169, + "step": 20540 + }, + { + "epoch": 1.2878360594096634, + "grad_norm": 0.2036275714635849, + "learning_rate": 2.8837263304553418e-05, + "loss": 0.0132, + "step": 20550 + }, + { + "epoch": 1.2884627436234881, + "grad_norm": 0.04330754280090332, + "learning_rate": 2.8826708323657935e-05, + "loss": 0.132, + "step": 20560 + }, + { + "epoch": 1.2890894278373128, + "grad_norm": 0.04521123319864273, + "learning_rate": 2.881615334276245e-05, + "loss": 0.0356, + "step": 20570 + }, + { + "epoch": 1.2897161120511376, + "grad_norm": 1.2175931930541992, + "learning_rate": 2.8805598361866965e-05, + "loss": 0.0142, + "step": 20580 + }, + { + "epoch": 1.290342796264962, + "grad_norm": 0.34771111607551575, + "learning_rate": 2.879504338097148e-05, + "loss": 0.0052, + "step": 20590 + }, + { + "epoch": 1.2909694804787868, + "grad_norm": 2.1885266304016113, + "learning_rate": 2.8784488400075998e-05, + "loss": 0.3319, + "step": 20600 + }, + { + "epoch": 1.2915961646926113, + "grad_norm": 0.10941804945468903, + "learning_rate": 2.877393341918051e-05, + "loss": 0.1175, + "step": 20610 + }, + { + "epoch": 1.292222848906436, + "grad_norm": 0.07854396849870682, + "learning_rate": 2.8763378438285028e-05, + "loss": 0.2803, + "step": 20620 + }, + { + "epoch": 1.2928495331202607, + "grad_norm": 0.07036060094833374, + "learning_rate": 2.8752823457389545e-05, + "loss": 0.0268, + "step": 20630 + }, + { + "epoch": 1.2934762173340855, + "grad_norm": 8.92553424835205, + "learning_rate": 2.874226847649406e-05, + "loss": 0.2271, + "step": 20640 + }, + { + "epoch": 1.29410290154791, + "grad_norm": 0.1353772133588791, + "learning_rate": 2.873171349559857e-05, + "loss": 0.0537, + "step": 20650 + }, + { + "epoch": 1.2947295857617347, + "grad_norm": 1.585835337638855, + "learning_rate": 2.8721158514703088e-05, + "loss": 0.1439, + "step": 20660 + }, + { + "epoch": 1.2953562699755592, + "grad_norm": 0.1559787541627884, + "learning_rate": 2.8710603533807608e-05, + "loss": 0.0652, + "step": 20670 + }, + { + "epoch": 1.295982954189384, + "grad_norm": 6.339298725128174, + "learning_rate": 2.8700048552912124e-05, + "loss": 0.1248, + "step": 20680 + }, + { + "epoch": 1.2966096384032086, + "grad_norm": 4.56957483291626, + "learning_rate": 2.8689493572016634e-05, + "loss": 0.0374, + "step": 20690 + }, + { + "epoch": 1.2972363226170334, + "grad_norm": 0.2088044285774231, + "learning_rate": 2.867893859112115e-05, + "loss": 0.0817, + "step": 20700 + }, + { + "epoch": 1.2978630068308579, + "grad_norm": 0.021302178502082825, + "learning_rate": 2.8668383610225668e-05, + "loss": 0.0559, + "step": 20710 + }, + { + "epoch": 1.2984896910446826, + "grad_norm": 0.028453296050429344, + "learning_rate": 2.865782862933018e-05, + "loss": 0.2479, + "step": 20720 + }, + { + "epoch": 1.2991163752585073, + "grad_norm": 0.03374246507883072, + "learning_rate": 2.8647273648434698e-05, + "loss": 0.1142, + "step": 20730 + }, + { + "epoch": 1.2997430594723318, + "grad_norm": 0.05600728467106819, + "learning_rate": 2.8636718667539214e-05, + "loss": 0.1438, + "step": 20740 + }, + { + "epoch": 1.3003697436861565, + "grad_norm": 0.056089840829372406, + "learning_rate": 2.862616368664373e-05, + "loss": 0.0818, + "step": 20750 + }, + { + "epoch": 1.3009964278999813, + "grad_norm": 0.04712797328829765, + "learning_rate": 2.8615608705748244e-05, + "loss": 0.1265, + "step": 20760 + }, + { + "epoch": 1.301623112113806, + "grad_norm": 0.1540490686893463, + "learning_rate": 2.860505372485276e-05, + "loss": 0.0057, + "step": 20770 + }, + { + "epoch": 1.3022497963276305, + "grad_norm": 4.8552565574646, + "learning_rate": 2.8594498743957277e-05, + "loss": 0.3901, + "step": 20780 + }, + { + "epoch": 1.3028764805414552, + "grad_norm": 0.19207793474197388, + "learning_rate": 2.8583943763061787e-05, + "loss": 0.0407, + "step": 20790 + }, + { + "epoch": 1.3035031647552797, + "grad_norm": 3.7182164192199707, + "learning_rate": 2.8573388782166304e-05, + "loss": 0.2068, + "step": 20800 + }, + { + "epoch": 1.3041298489691044, + "grad_norm": 0.1881396323442459, + "learning_rate": 2.856283380127082e-05, + "loss": 0.126, + "step": 20810 + }, + { + "epoch": 1.3047565331829292, + "grad_norm": 0.10974586009979248, + "learning_rate": 2.8552278820375337e-05, + "loss": 0.0115, + "step": 20820 + }, + { + "epoch": 1.305383217396754, + "grad_norm": 0.01864585094153881, + "learning_rate": 2.854172383947985e-05, + "loss": 0.0115, + "step": 20830 + }, + { + "epoch": 1.3060099016105784, + "grad_norm": 2.640251398086548, + "learning_rate": 2.8531168858584367e-05, + "loss": 0.1351, + "step": 20840 + }, + { + "epoch": 1.3066365858244031, + "grad_norm": 0.036722589284181595, + "learning_rate": 2.8520613877688884e-05, + "loss": 0.0089, + "step": 20850 + }, + { + "epoch": 1.3072632700382276, + "grad_norm": 0.7262831926345825, + "learning_rate": 2.8510058896793397e-05, + "loss": 0.1035, + "step": 20860 + }, + { + "epoch": 1.3078899542520523, + "grad_norm": 9.36829662322998, + "learning_rate": 2.8499503915897914e-05, + "loss": 0.2305, + "step": 20870 + }, + { + "epoch": 1.308516638465877, + "grad_norm": 0.041101399809122086, + "learning_rate": 2.848894893500243e-05, + "loss": 0.1464, + "step": 20880 + }, + { + "epoch": 1.3091433226797018, + "grad_norm": 0.08307519555091858, + "learning_rate": 2.8478393954106947e-05, + "loss": 0.191, + "step": 20890 + }, + { + "epoch": 1.3097700068935263, + "grad_norm": 0.17687416076660156, + "learning_rate": 2.8467838973211457e-05, + "loss": 0.2236, + "step": 20900 + }, + { + "epoch": 1.310396691107351, + "grad_norm": 0.32803231477737427, + "learning_rate": 2.8457283992315974e-05, + "loss": 0.1196, + "step": 20910 + }, + { + "epoch": 1.3110233753211755, + "grad_norm": 0.19486212730407715, + "learning_rate": 2.8446729011420494e-05, + "loss": 0.1044, + "step": 20920 + }, + { + "epoch": 1.3116500595350002, + "grad_norm": 0.38289082050323486, + "learning_rate": 2.8436174030525003e-05, + "loss": 0.1773, + "step": 20930 + }, + { + "epoch": 1.312276743748825, + "grad_norm": 0.06538600474596024, + "learning_rate": 2.842561904962952e-05, + "loss": 0.1115, + "step": 20940 + }, + { + "epoch": 1.3129034279626497, + "grad_norm": 0.19104982912540436, + "learning_rate": 2.8415064068734037e-05, + "loss": 0.0813, + "step": 20950 + }, + { + "epoch": 1.3135301121764742, + "grad_norm": 0.0975765660405159, + "learning_rate": 2.8404509087838553e-05, + "loss": 0.0691, + "step": 20960 + }, + { + "epoch": 1.314156796390299, + "grad_norm": 0.1305026262998581, + "learning_rate": 2.8393954106943067e-05, + "loss": 0.1324, + "step": 20970 + }, + { + "epoch": 1.3147834806041236, + "grad_norm": 0.05575420334935188, + "learning_rate": 2.8383399126047583e-05, + "loss": 0.0043, + "step": 20980 + }, + { + "epoch": 1.3154101648179481, + "grad_norm": 0.028829995542764664, + "learning_rate": 2.83728441451521e-05, + "loss": 0.0905, + "step": 20990 + }, + { + "epoch": 1.3160368490317729, + "grad_norm": 0.043897613883018494, + "learning_rate": 2.836228916425661e-05, + "loss": 0.0019, + "step": 21000 + }, + { + "epoch": 1.3166635332455976, + "grad_norm": 0.029821641743183136, + "learning_rate": 2.835173418336113e-05, + "loss": 0.2736, + "step": 21010 + }, + { + "epoch": 1.3172902174594223, + "grad_norm": 0.39863696694374084, + "learning_rate": 2.8341179202465647e-05, + "loss": 0.2053, + "step": 21020 + }, + { + "epoch": 1.3179169016732468, + "grad_norm": 2.2105536460876465, + "learning_rate": 2.8330624221570163e-05, + "loss": 0.1366, + "step": 21030 + }, + { + "epoch": 1.3185435858870715, + "grad_norm": 0.6516872048377991, + "learning_rate": 2.8320069240674673e-05, + "loss": 0.0166, + "step": 21040 + }, + { + "epoch": 1.319170270100896, + "grad_norm": 0.0926242247223854, + "learning_rate": 2.830951425977919e-05, + "loss": 0.0083, + "step": 21050 + }, + { + "epoch": 1.3197969543147208, + "grad_norm": 0.03947974368929863, + "learning_rate": 2.8298959278883706e-05, + "loss": 0.098, + "step": 21060 + }, + { + "epoch": 1.3204236385285455, + "grad_norm": 0.01785365492105484, + "learning_rate": 2.8288404297988223e-05, + "loss": 0.0004, + "step": 21070 + }, + { + "epoch": 1.3210503227423702, + "grad_norm": 0.11209467053413391, + "learning_rate": 2.8277849317092736e-05, + "loss": 0.1898, + "step": 21080 + }, + { + "epoch": 1.3216770069561947, + "grad_norm": 0.022274712100625038, + "learning_rate": 2.8267294336197253e-05, + "loss": 0.2389, + "step": 21090 + }, + { + "epoch": 1.3223036911700194, + "grad_norm": 0.02103334292769432, + "learning_rate": 2.825673935530177e-05, + "loss": 0.1084, + "step": 21100 + }, + { + "epoch": 1.322930375383844, + "grad_norm": 0.03857764974236488, + "learning_rate": 2.8246184374406283e-05, + "loss": 0.0906, + "step": 21110 + }, + { + "epoch": 1.3235570595976687, + "grad_norm": 0.04737214371562004, + "learning_rate": 2.82356293935108e-05, + "loss": 0.0023, + "step": 21120 + }, + { + "epoch": 1.3241837438114934, + "grad_norm": 9.90311050415039, + "learning_rate": 2.8225074412615316e-05, + "loss": 0.2539, + "step": 21130 + }, + { + "epoch": 1.3248104280253181, + "grad_norm": 0.11373288929462433, + "learning_rate": 2.8214519431719833e-05, + "loss": 0.0601, + "step": 21140 + }, + { + "epoch": 1.3254371122391426, + "grad_norm": 9.301064491271973, + "learning_rate": 2.8203964450824343e-05, + "loss": 0.1733, + "step": 21150 + }, + { + "epoch": 1.3260637964529673, + "grad_norm": 2.312943935394287, + "learning_rate": 2.819340946992886e-05, + "loss": 0.0314, + "step": 21160 + }, + { + "epoch": 1.326690480666792, + "grad_norm": 0.01871352083981037, + "learning_rate": 2.818285448903338e-05, + "loss": 0.0727, + "step": 21170 + }, + { + "epoch": 1.3273171648806166, + "grad_norm": 0.024634309113025665, + "learning_rate": 2.817229950813789e-05, + "loss": 0.0738, + "step": 21180 + }, + { + "epoch": 1.3279438490944413, + "grad_norm": 0.04853309318423271, + "learning_rate": 2.8161744527242406e-05, + "loss": 0.1316, + "step": 21190 + }, + { + "epoch": 1.328570533308266, + "grad_norm": 0.021960796788334846, + "learning_rate": 2.8151189546346923e-05, + "loss": 0.0776, + "step": 21200 + }, + { + "epoch": 1.3291972175220907, + "grad_norm": 0.014680769294500351, + "learning_rate": 2.814063456545144e-05, + "loss": 0.1248, + "step": 21210 + }, + { + "epoch": 1.3298239017359152, + "grad_norm": 0.012380057014524937, + "learning_rate": 2.8130079584555952e-05, + "loss": 0.0794, + "step": 21220 + }, + { + "epoch": 1.33045058594974, + "grad_norm": 0.014920385554432869, + "learning_rate": 2.811952460366047e-05, + "loss": 0.0538, + "step": 21230 + }, + { + "epoch": 1.3310772701635645, + "grad_norm": 4.893551349639893, + "learning_rate": 2.8108969622764986e-05, + "loss": 0.2239, + "step": 21240 + }, + { + "epoch": 1.3317039543773892, + "grad_norm": 0.04235278442502022, + "learning_rate": 2.8098414641869496e-05, + "loss": 0.0165, + "step": 21250 + }, + { + "epoch": 1.332330638591214, + "grad_norm": 0.85734623670578, + "learning_rate": 2.8087859660974016e-05, + "loss": 0.1398, + "step": 21260 + }, + { + "epoch": 1.3329573228050386, + "grad_norm": 0.03186863660812378, + "learning_rate": 2.8077304680078532e-05, + "loss": 0.0695, + "step": 21270 + }, + { + "epoch": 1.3335840070188631, + "grad_norm": 0.18006360530853271, + "learning_rate": 2.806674969918305e-05, + "loss": 0.1032, + "step": 21280 + }, + { + "epoch": 1.3342106912326879, + "grad_norm": 0.03093225508928299, + "learning_rate": 2.805619471828756e-05, + "loss": 0.0228, + "step": 21290 + }, + { + "epoch": 1.3348373754465124, + "grad_norm": 2.2488553524017334, + "learning_rate": 2.8045639737392076e-05, + "loss": 0.0423, + "step": 21300 + }, + { + "epoch": 1.335464059660337, + "grad_norm": 0.012684832327067852, + "learning_rate": 2.8035084756496592e-05, + "loss": 0.0628, + "step": 21310 + }, + { + "epoch": 1.3360907438741618, + "grad_norm": 0.013988979160785675, + "learning_rate": 2.8024529775601105e-05, + "loss": 0.0138, + "step": 21320 + }, + { + "epoch": 1.3367174280879865, + "grad_norm": 0.020398080348968506, + "learning_rate": 2.8013974794705622e-05, + "loss": 0.1118, + "step": 21330 + }, + { + "epoch": 1.337344112301811, + "grad_norm": 3.608513832092285, + "learning_rate": 2.800341981381014e-05, + "loss": 0.0097, + "step": 21340 + }, + { + "epoch": 1.3379707965156358, + "grad_norm": 0.012503975071012974, + "learning_rate": 2.7992864832914655e-05, + "loss": 0.013, + "step": 21350 + }, + { + "epoch": 1.3385974807294605, + "grad_norm": 0.14755772054195404, + "learning_rate": 2.798230985201917e-05, + "loss": 0.1064, + "step": 21360 + }, + { + "epoch": 1.339224164943285, + "grad_norm": 0.04783150553703308, + "learning_rate": 2.7971754871123685e-05, + "loss": 0.1094, + "step": 21370 + }, + { + "epoch": 1.3398508491571097, + "grad_norm": 0.027297774329781532, + "learning_rate": 2.7961199890228202e-05, + "loss": 0.0789, + "step": 21380 + }, + { + "epoch": 1.3404775333709344, + "grad_norm": 0.02567318268120289, + "learning_rate": 2.7950644909332712e-05, + "loss": 0.0552, + "step": 21390 + }, + { + "epoch": 1.3411042175847592, + "grad_norm": 10.832972526550293, + "learning_rate": 2.794008992843723e-05, + "loss": 0.2971, + "step": 21400 + }, + { + "epoch": 1.3417309017985837, + "grad_norm": 0.05646326020359993, + "learning_rate": 2.7929534947541745e-05, + "loss": 0.1354, + "step": 21410 + }, + { + "epoch": 1.3423575860124084, + "grad_norm": 0.16988039016723633, + "learning_rate": 2.7918979966646265e-05, + "loss": 0.0885, + "step": 21420 + }, + { + "epoch": 1.342984270226233, + "grad_norm": 0.1426088958978653, + "learning_rate": 2.7908424985750775e-05, + "loss": 0.0904, + "step": 21430 + }, + { + "epoch": 1.3436109544400576, + "grad_norm": 0.07782195508480072, + "learning_rate": 2.7897870004855292e-05, + "loss": 0.0178, + "step": 21440 + }, + { + "epoch": 1.3442376386538823, + "grad_norm": 0.051355279982089996, + "learning_rate": 2.788731502395981e-05, + "loss": 0.095, + "step": 21450 + }, + { + "epoch": 1.344864322867707, + "grad_norm": 3.4157044887542725, + "learning_rate": 2.7876760043064325e-05, + "loss": 0.124, + "step": 21460 + }, + { + "epoch": 1.3454910070815316, + "grad_norm": 5.504192352294922, + "learning_rate": 2.7866205062168838e-05, + "loss": 0.1536, + "step": 21470 + }, + { + "epoch": 1.3461176912953563, + "grad_norm": 0.0324571318924427, + "learning_rate": 2.7855650081273355e-05, + "loss": 0.2083, + "step": 21480 + }, + { + "epoch": 1.3467443755091808, + "grad_norm": 0.06137267127633095, + "learning_rate": 2.784509510037787e-05, + "loss": 0.0064, + "step": 21490 + }, + { + "epoch": 1.3473710597230055, + "grad_norm": 0.20257635414600372, + "learning_rate": 2.783454011948238e-05, + "loss": 0.2428, + "step": 21500 + }, + { + "epoch": 1.3479977439368303, + "grad_norm": 0.45073238015174866, + "learning_rate": 2.78239851385869e-05, + "loss": 0.1297, + "step": 21510 + }, + { + "epoch": 1.348624428150655, + "grad_norm": 0.13506528735160828, + "learning_rate": 2.7813430157691418e-05, + "loss": 0.1046, + "step": 21520 + }, + { + "epoch": 1.3492511123644795, + "grad_norm": 0.137327641248703, + "learning_rate": 2.7802875176795935e-05, + "loss": 0.0049, + "step": 21530 + }, + { + "epoch": 1.3498777965783042, + "grad_norm": 0.07374851405620575, + "learning_rate": 2.7792320195900445e-05, + "loss": 0.171, + "step": 21540 + }, + { + "epoch": 1.350504480792129, + "grad_norm": 0.07294797897338867, + "learning_rate": 2.778176521500496e-05, + "loss": 0.0991, + "step": 21550 + }, + { + "epoch": 1.3511311650059534, + "grad_norm": 0.6325371265411377, + "learning_rate": 2.7771210234109478e-05, + "loss": 0.0663, + "step": 21560 + }, + { + "epoch": 1.3517578492197782, + "grad_norm": 0.7016847133636475, + "learning_rate": 2.776065525321399e-05, + "loss": 0.2094, + "step": 21570 + }, + { + "epoch": 1.3523845334336029, + "grad_norm": 0.05865400284528732, + "learning_rate": 2.7750100272318508e-05, + "loss": 0.1343, + "step": 21580 + }, + { + "epoch": 1.3530112176474276, + "grad_norm": 0.05986565724015236, + "learning_rate": 2.7739545291423025e-05, + "loss": 0.0565, + "step": 21590 + }, + { + "epoch": 1.353637901861252, + "grad_norm": 3.4504475593566895, + "learning_rate": 2.772899031052754e-05, + "loss": 0.0961, + "step": 21600 + }, + { + "epoch": 1.3542645860750768, + "grad_norm": 0.04019607976078987, + "learning_rate": 2.7718435329632054e-05, + "loss": 0.0988, + "step": 21610 + }, + { + "epoch": 1.3548912702889013, + "grad_norm": 0.042770903557538986, + "learning_rate": 2.770788034873657e-05, + "loss": 0.1854, + "step": 21620 + }, + { + "epoch": 1.355517954502726, + "grad_norm": 0.040806010365486145, + "learning_rate": 2.7697325367841088e-05, + "loss": 0.1581, + "step": 21630 + }, + { + "epoch": 1.3561446387165508, + "grad_norm": 0.1587454378604889, + "learning_rate": 2.7686770386945598e-05, + "loss": 0.2592, + "step": 21640 + }, + { + "epoch": 1.3567713229303755, + "grad_norm": 3.8640308380126953, + "learning_rate": 2.7676215406050114e-05, + "loss": 0.2352, + "step": 21650 + }, + { + "epoch": 1.3573980071442, + "grad_norm": 6.425248622894287, + "learning_rate": 2.766566042515463e-05, + "loss": 0.1215, + "step": 21660 + }, + { + "epoch": 1.3580246913580247, + "grad_norm": 0.16422656178474426, + "learning_rate": 2.765510544425915e-05, + "loss": 0.1265, + "step": 21670 + }, + { + "epoch": 1.3586513755718492, + "grad_norm": 1.1562215089797974, + "learning_rate": 2.764455046336366e-05, + "loss": 0.1549, + "step": 21680 + }, + { + "epoch": 1.359278059785674, + "grad_norm": 1.9711147546768188, + "learning_rate": 2.7633995482468178e-05, + "loss": 0.1053, + "step": 21690 + }, + { + "epoch": 1.3599047439994987, + "grad_norm": 1.314103603363037, + "learning_rate": 2.7623440501572694e-05, + "loss": 0.0416, + "step": 21700 + }, + { + "epoch": 1.3605314282133234, + "grad_norm": 0.0646156594157219, + "learning_rate": 2.7612885520677207e-05, + "loss": 0.0671, + "step": 21710 + }, + { + "epoch": 1.361158112427148, + "grad_norm": 0.2056860476732254, + "learning_rate": 2.7602330539781724e-05, + "loss": 0.1479, + "step": 21720 + }, + { + "epoch": 1.3617847966409726, + "grad_norm": 0.06871373951435089, + "learning_rate": 2.759177555888624e-05, + "loss": 0.0234, + "step": 21730 + }, + { + "epoch": 1.3624114808547971, + "grad_norm": 0.07943173497915268, + "learning_rate": 2.7581220577990757e-05, + "loss": 0.2848, + "step": 21740 + }, + { + "epoch": 1.3630381650686219, + "grad_norm": 0.18097421526908875, + "learning_rate": 2.7570665597095267e-05, + "loss": 0.1584, + "step": 21750 + }, + { + "epoch": 1.3636648492824466, + "grad_norm": 0.20642918348312378, + "learning_rate": 2.7560110616199787e-05, + "loss": 0.0104, + "step": 21760 + }, + { + "epoch": 1.3642915334962713, + "grad_norm": 0.0687187910079956, + "learning_rate": 2.7549555635304304e-05, + "loss": 0.2375, + "step": 21770 + }, + { + "epoch": 1.3649182177100958, + "grad_norm": 0.3212626874446869, + "learning_rate": 2.7539000654408814e-05, + "loss": 0.0807, + "step": 21780 + }, + { + "epoch": 1.3655449019239205, + "grad_norm": 0.07216355204582214, + "learning_rate": 2.752844567351333e-05, + "loss": 0.0046, + "step": 21790 + }, + { + "epoch": 1.3661715861377453, + "grad_norm": 2.168639898300171, + "learning_rate": 2.7517890692617847e-05, + "loss": 0.1583, + "step": 21800 + }, + { + "epoch": 1.3667982703515698, + "grad_norm": 0.12190362066030502, + "learning_rate": 2.7507335711722364e-05, + "loss": 0.0594, + "step": 21810 + }, + { + "epoch": 1.3674249545653945, + "grad_norm": 0.09011233597993851, + "learning_rate": 2.7496780730826877e-05, + "loss": 0.1266, + "step": 21820 + }, + { + "epoch": 1.3680516387792192, + "grad_norm": 0.18184854090213776, + "learning_rate": 2.7486225749931394e-05, + "loss": 0.0919, + "step": 21830 + }, + { + "epoch": 1.368678322993044, + "grad_norm": 0.058258481323719025, + "learning_rate": 2.747567076903591e-05, + "loss": 0.0573, + "step": 21840 + }, + { + "epoch": 1.3693050072068684, + "grad_norm": 4.3048577308654785, + "learning_rate": 2.7465115788140427e-05, + "loss": 0.0845, + "step": 21850 + }, + { + "epoch": 1.3699316914206932, + "grad_norm": 0.04196920618414879, + "learning_rate": 2.745456080724494e-05, + "loss": 0.1047, + "step": 21860 + }, + { + "epoch": 1.3705583756345177, + "grad_norm": 0.07941604405641556, + "learning_rate": 2.7444005826349457e-05, + "loss": 0.0029, + "step": 21870 + }, + { + "epoch": 1.3711850598483424, + "grad_norm": 9.327536582946777, + "learning_rate": 2.7433450845453974e-05, + "loss": 0.2039, + "step": 21880 + }, + { + "epoch": 1.371811744062167, + "grad_norm": 0.03211827203631401, + "learning_rate": 2.7422895864558483e-05, + "loss": 0.002, + "step": 21890 + }, + { + "epoch": 1.3724384282759918, + "grad_norm": 0.4241490960121155, + "learning_rate": 2.7412340883663e-05, + "loss": 0.0386, + "step": 21900 + }, + { + "epoch": 1.3730651124898163, + "grad_norm": 0.02552582323551178, + "learning_rate": 2.7401785902767517e-05, + "loss": 0.0797, + "step": 21910 + }, + { + "epoch": 1.373691796703641, + "grad_norm": 0.08526312559843063, + "learning_rate": 2.7391230921872037e-05, + "loss": 0.1707, + "step": 21920 + }, + { + "epoch": 1.3743184809174656, + "grad_norm": 0.07207024097442627, + "learning_rate": 2.7380675940976547e-05, + "loss": 0.0661, + "step": 21930 + }, + { + "epoch": 1.3749451651312903, + "grad_norm": 0.10189100354909897, + "learning_rate": 2.7370120960081063e-05, + "loss": 0.2263, + "step": 21940 + }, + { + "epoch": 1.375571849345115, + "grad_norm": 20.908044815063477, + "learning_rate": 2.735956597918558e-05, + "loss": 0.13, + "step": 21950 + }, + { + "epoch": 1.3761985335589397, + "grad_norm": 0.1741807460784912, + "learning_rate": 2.7349010998290093e-05, + "loss": 0.1285, + "step": 21960 + }, + { + "epoch": 1.3768252177727642, + "grad_norm": 0.03784070163965225, + "learning_rate": 2.733845601739461e-05, + "loss": 0.0372, + "step": 21970 + }, + { + "epoch": 1.377451901986589, + "grad_norm": 0.1834995001554489, + "learning_rate": 2.7327901036499127e-05, + "loss": 0.0897, + "step": 21980 + }, + { + "epoch": 1.3780785862004137, + "grad_norm": 0.14013051986694336, + "learning_rate": 2.7317346055603643e-05, + "loss": 0.0053, + "step": 21990 + }, + { + "epoch": 1.3787052704142382, + "grad_norm": 0.028027813881635666, + "learning_rate": 2.7306791074708153e-05, + "loss": 0.0186, + "step": 22000 + }, + { + "epoch": 1.379331954628063, + "grad_norm": 12.102706909179688, + "learning_rate": 2.7296236093812673e-05, + "loss": 0.2363, + "step": 22010 + }, + { + "epoch": 1.3799586388418876, + "grad_norm": 0.07020573318004608, + "learning_rate": 2.728568111291719e-05, + "loss": 0.1667, + "step": 22020 + }, + { + "epoch": 1.3805853230557124, + "grad_norm": 3.281811237335205, + "learning_rate": 2.72751261320217e-05, + "loss": 0.1397, + "step": 22030 + }, + { + "epoch": 1.3812120072695369, + "grad_norm": 1.4566924571990967, + "learning_rate": 2.7264571151126216e-05, + "loss": 0.2678, + "step": 22040 + }, + { + "epoch": 1.3818386914833616, + "grad_norm": 5.477617263793945, + "learning_rate": 2.7254016170230733e-05, + "loss": 0.2475, + "step": 22050 + }, + { + "epoch": 1.382465375697186, + "grad_norm": 6.840038776397705, + "learning_rate": 2.724346118933525e-05, + "loss": 0.1136, + "step": 22060 + }, + { + "epoch": 1.3830920599110108, + "grad_norm": 0.04201101139187813, + "learning_rate": 2.7232906208439763e-05, + "loss": 0.039, + "step": 22070 + }, + { + "epoch": 1.3837187441248355, + "grad_norm": 0.6134777665138245, + "learning_rate": 2.722235122754428e-05, + "loss": 0.0134, + "step": 22080 + }, + { + "epoch": 1.3843454283386603, + "grad_norm": 0.3734954595565796, + "learning_rate": 2.7211796246648796e-05, + "loss": 0.1143, + "step": 22090 + }, + { + "epoch": 1.3849721125524848, + "grad_norm": 5.686514854431152, + "learning_rate": 2.720124126575331e-05, + "loss": 0.3968, + "step": 22100 + }, + { + "epoch": 1.3855987967663095, + "grad_norm": 0.06281892955303192, + "learning_rate": 2.7190686284857826e-05, + "loss": 0.1116, + "step": 22110 + }, + { + "epoch": 1.386225480980134, + "grad_norm": 0.1443566530942917, + "learning_rate": 2.7180131303962343e-05, + "loss": 0.1349, + "step": 22120 + }, + { + "epoch": 1.3868521651939587, + "grad_norm": 2.338820695877075, + "learning_rate": 2.716957632306686e-05, + "loss": 0.0705, + "step": 22130 + }, + { + "epoch": 1.3874788494077834, + "grad_norm": 0.23876045644283295, + "learning_rate": 2.715902134217137e-05, + "loss": 0.131, + "step": 22140 + }, + { + "epoch": 1.3881055336216082, + "grad_norm": 0.12372996658086777, + "learning_rate": 2.7148466361275886e-05, + "loss": 0.0628, + "step": 22150 + }, + { + "epoch": 1.3887322178354327, + "grad_norm": 0.6853922605514526, + "learning_rate": 2.7137911380380403e-05, + "loss": 0.0708, + "step": 22160 + }, + { + "epoch": 1.3893589020492574, + "grad_norm": 0.05106097832322121, + "learning_rate": 2.7127356399484916e-05, + "loss": 0.085, + "step": 22170 + }, + { + "epoch": 1.389985586263082, + "grad_norm": 7.483982086181641, + "learning_rate": 2.7116801418589432e-05, + "loss": 0.0481, + "step": 22180 + }, + { + "epoch": 1.3906122704769066, + "grad_norm": 3.056711196899414, + "learning_rate": 2.710624643769395e-05, + "loss": 0.2775, + "step": 22190 + }, + { + "epoch": 1.3912389546907313, + "grad_norm": 0.42939886450767517, + "learning_rate": 2.7095691456798466e-05, + "loss": 0.0791, + "step": 22200 + }, + { + "epoch": 1.391865638904556, + "grad_norm": 0.26278284192085266, + "learning_rate": 2.708513647590298e-05, + "loss": 0.0883, + "step": 22210 + }, + { + "epoch": 1.3924923231183808, + "grad_norm": 0.16981381177902222, + "learning_rate": 2.7074581495007496e-05, + "loss": 0.0157, + "step": 22220 + }, + { + "epoch": 1.3931190073322053, + "grad_norm": 13.162467002868652, + "learning_rate": 2.7064026514112012e-05, + "loss": 0.0918, + "step": 22230 + }, + { + "epoch": 1.39374569154603, + "grad_norm": 0.16399233043193817, + "learning_rate": 2.705347153321653e-05, + "loss": 0.1045, + "step": 22240 + }, + { + "epoch": 1.3943723757598545, + "grad_norm": 0.020432839170098305, + "learning_rate": 2.704291655232104e-05, + "loss": 0.029, + "step": 22250 + }, + { + "epoch": 1.3949990599736792, + "grad_norm": 0.023673556745052338, + "learning_rate": 2.703236157142556e-05, + "loss": 0.1443, + "step": 22260 + }, + { + "epoch": 1.395625744187504, + "grad_norm": 5.3616204261779785, + "learning_rate": 2.7021806590530076e-05, + "loss": 0.1986, + "step": 22270 + }, + { + "epoch": 1.3962524284013287, + "grad_norm": 5.972326278686523, + "learning_rate": 2.7011251609634585e-05, + "loss": 0.1403, + "step": 22280 + }, + { + "epoch": 1.3968791126151532, + "grad_norm": 1.81098210811615, + "learning_rate": 2.7000696628739102e-05, + "loss": 0.0171, + "step": 22290 + }, + { + "epoch": 1.397505796828978, + "grad_norm": 2.2598025798797607, + "learning_rate": 2.699014164784362e-05, + "loss": 0.0315, + "step": 22300 + }, + { + "epoch": 1.3981324810428024, + "grad_norm": 0.057428207248449326, + "learning_rate": 2.6979586666948135e-05, + "loss": 0.0492, + "step": 22310 + }, + { + "epoch": 1.3987591652566271, + "grad_norm": 11.328914642333984, + "learning_rate": 2.696903168605265e-05, + "loss": 0.0546, + "step": 22320 + }, + { + "epoch": 1.3993858494704519, + "grad_norm": 0.03397885710000992, + "learning_rate": 2.6958476705157165e-05, + "loss": 0.1241, + "step": 22330 + }, + { + "epoch": 1.4000125336842766, + "grad_norm": 0.1802971065044403, + "learning_rate": 2.6947921724261682e-05, + "loss": 0.2208, + "step": 22340 + }, + { + "epoch": 1.400639217898101, + "grad_norm": 0.26217713952064514, + "learning_rate": 2.6937366743366195e-05, + "loss": 0.0595, + "step": 22350 + }, + { + "epoch": 1.4012659021119258, + "grad_norm": 0.5535191893577576, + "learning_rate": 2.6926811762470712e-05, + "loss": 0.0478, + "step": 22360 + }, + { + "epoch": 1.4018925863257505, + "grad_norm": 4.9853596687316895, + "learning_rate": 2.691625678157523e-05, + "loss": 0.1245, + "step": 22370 + }, + { + "epoch": 1.402519270539575, + "grad_norm": 1.5813056230545044, + "learning_rate": 2.6905701800679745e-05, + "loss": 0.0386, + "step": 22380 + }, + { + "epoch": 1.4031459547533998, + "grad_norm": 0.043805863708257675, + "learning_rate": 2.6895146819784255e-05, + "loss": 0.1035, + "step": 22390 + }, + { + "epoch": 1.4037726389672245, + "grad_norm": 0.27779996395111084, + "learning_rate": 2.688459183888877e-05, + "loss": 0.129, + "step": 22400 + }, + { + "epoch": 1.404399323181049, + "grad_norm": 0.3968289792537689, + "learning_rate": 2.6874036857993288e-05, + "loss": 0.1455, + "step": 22410 + }, + { + "epoch": 1.4050260073948737, + "grad_norm": 0.04641241207718849, + "learning_rate": 2.68634818770978e-05, + "loss": 0.1024, + "step": 22420 + }, + { + "epoch": 1.4056526916086984, + "grad_norm": 0.04604057967662811, + "learning_rate": 2.6852926896202318e-05, + "loss": 0.0392, + "step": 22430 + }, + { + "epoch": 1.406279375822523, + "grad_norm": 0.15959686040878296, + "learning_rate": 2.6842371915306835e-05, + "loss": 0.2027, + "step": 22440 + }, + { + "epoch": 1.4069060600363477, + "grad_norm": 0.08660081773996353, + "learning_rate": 2.683181693441135e-05, + "loss": 0.1929, + "step": 22450 + }, + { + "epoch": 1.4075327442501724, + "grad_norm": 0.03353703022003174, + "learning_rate": 2.6821261953515865e-05, + "loss": 0.0416, + "step": 22460 + }, + { + "epoch": 1.408159428463997, + "grad_norm": 0.5888323187828064, + "learning_rate": 2.681070697262038e-05, + "loss": 0.0217, + "step": 22470 + }, + { + "epoch": 1.4087861126778216, + "grad_norm": 0.2682591676712036, + "learning_rate": 2.6800151991724898e-05, + "loss": 0.0112, + "step": 22480 + }, + { + "epoch": 1.4094127968916463, + "grad_norm": 0.022292733192443848, + "learning_rate": 2.6789597010829408e-05, + "loss": 0.1059, + "step": 22490 + }, + { + "epoch": 1.4100394811054708, + "grad_norm": 0.020648401230573654, + "learning_rate": 2.6779042029933925e-05, + "loss": 0.0329, + "step": 22500 + }, + { + "epoch": 1.4106661653192956, + "grad_norm": 0.025373414158821106, + "learning_rate": 2.6768487049038445e-05, + "loss": 0.0957, + "step": 22510 + }, + { + "epoch": 1.4112928495331203, + "grad_norm": 0.016700396314263344, + "learning_rate": 2.675793206814296e-05, + "loss": 0.0694, + "step": 22520 + }, + { + "epoch": 1.411919533746945, + "grad_norm": 2.5584840774536133, + "learning_rate": 2.674737708724747e-05, + "loss": 0.0216, + "step": 22530 + }, + { + "epoch": 1.4125462179607695, + "grad_norm": 0.023159755393862724, + "learning_rate": 2.6736822106351988e-05, + "loss": 0.1943, + "step": 22540 + }, + { + "epoch": 1.4131729021745942, + "grad_norm": 0.09470424056053162, + "learning_rate": 2.6726267125456504e-05, + "loss": 0.0548, + "step": 22550 + }, + { + "epoch": 1.4137995863884187, + "grad_norm": 0.02677854523062706, + "learning_rate": 2.6715712144561018e-05, + "loss": 0.1369, + "step": 22560 + }, + { + "epoch": 1.4144262706022435, + "grad_norm": 0.03269219398498535, + "learning_rate": 2.6705157163665534e-05, + "loss": 0.0458, + "step": 22570 + }, + { + "epoch": 1.4150529548160682, + "grad_norm": 2.119086742401123, + "learning_rate": 2.669460218277005e-05, + "loss": 0.0234, + "step": 22580 + }, + { + "epoch": 1.415679639029893, + "grad_norm": 0.02962580882012844, + "learning_rate": 2.6684047201874568e-05, + "loss": 0.1117, + "step": 22590 + }, + { + "epoch": 1.4163063232437174, + "grad_norm": 0.0268696341663599, + "learning_rate": 2.667349222097908e-05, + "loss": 0.0093, + "step": 22600 + }, + { + "epoch": 1.4169330074575421, + "grad_norm": 0.020729854702949524, + "learning_rate": 2.6662937240083598e-05, + "loss": 0.1382, + "step": 22610 + }, + { + "epoch": 1.4175596916713669, + "grad_norm": 9.315117835998535, + "learning_rate": 2.6652382259188114e-05, + "loss": 0.2076, + "step": 22620 + }, + { + "epoch": 1.4181863758851914, + "grad_norm": 0.24118176102638245, + "learning_rate": 2.6641827278292624e-05, + "loss": 0.0406, + "step": 22630 + }, + { + "epoch": 1.418813060099016, + "grad_norm": 6.347595691680908, + "learning_rate": 2.663127229739714e-05, + "loss": 0.0539, + "step": 22640 + }, + { + "epoch": 1.4194397443128408, + "grad_norm": 0.05370037630200386, + "learning_rate": 2.6620717316501657e-05, + "loss": 0.1254, + "step": 22650 + }, + { + "epoch": 1.4200664285266655, + "grad_norm": 0.020342102274298668, + "learning_rate": 2.6610162335606174e-05, + "loss": 0.0645, + "step": 22660 + }, + { + "epoch": 1.42069311274049, + "grad_norm": 0.10898140072822571, + "learning_rate": 2.6599607354710687e-05, + "loss": 0.1335, + "step": 22670 + }, + { + "epoch": 1.4213197969543148, + "grad_norm": 0.1842038780450821, + "learning_rate": 2.6589052373815204e-05, + "loss": 0.1147, + "step": 22680 + }, + { + "epoch": 1.4219464811681393, + "grad_norm": 1.9294404983520508, + "learning_rate": 2.657849739291972e-05, + "loss": 0.0036, + "step": 22690 + }, + { + "epoch": 1.422573165381964, + "grad_norm": 0.10110870003700256, + "learning_rate": 2.6567942412024237e-05, + "loss": 0.1219, + "step": 22700 + }, + { + "epoch": 1.4231998495957887, + "grad_norm": 0.06406152248382568, + "learning_rate": 2.655738743112875e-05, + "loss": 0.092, + "step": 22710 + }, + { + "epoch": 1.4238265338096134, + "grad_norm": 0.2047063410282135, + "learning_rate": 2.6546832450233267e-05, + "loss": 0.115, + "step": 22720 + }, + { + "epoch": 1.424453218023438, + "grad_norm": 0.02261444926261902, + "learning_rate": 2.6536277469337784e-05, + "loss": 0.0672, + "step": 22730 + }, + { + "epoch": 1.4250799022372627, + "grad_norm": 0.10477691888809204, + "learning_rate": 2.6525722488442294e-05, + "loss": 0.1383, + "step": 22740 + }, + { + "epoch": 1.4257065864510872, + "grad_norm": 4.8798747062683105, + "learning_rate": 2.651516750754681e-05, + "loss": 0.1659, + "step": 22750 + }, + { + "epoch": 1.426333270664912, + "grad_norm": 0.061641011387109756, + "learning_rate": 2.650461252665133e-05, + "loss": 0.1158, + "step": 22760 + }, + { + "epoch": 1.4269599548787366, + "grad_norm": 0.09502819925546646, + "learning_rate": 2.6494057545755847e-05, + "loss": 0.1218, + "step": 22770 + }, + { + "epoch": 1.4275866390925613, + "grad_norm": 0.12448341399431229, + "learning_rate": 2.6483502564860357e-05, + "loss": 0.1604, + "step": 22780 + }, + { + "epoch": 1.4282133233063858, + "grad_norm": 0.3269909620285034, + "learning_rate": 2.6472947583964874e-05, + "loss": 0.152, + "step": 22790 + }, + { + "epoch": 1.4288400075202106, + "grad_norm": 0.2900196611881256, + "learning_rate": 2.646239260306939e-05, + "loss": 0.0525, + "step": 22800 + }, + { + "epoch": 1.4294666917340353, + "grad_norm": 0.027310775592923164, + "learning_rate": 2.6451837622173904e-05, + "loss": 0.0276, + "step": 22810 + }, + { + "epoch": 1.4300933759478598, + "grad_norm": 0.02277687005698681, + "learning_rate": 2.644128264127842e-05, + "loss": 0.0277, + "step": 22820 + }, + { + "epoch": 1.4307200601616845, + "grad_norm": 0.021468304097652435, + "learning_rate": 2.6430727660382937e-05, + "loss": 0.1039, + "step": 22830 + }, + { + "epoch": 1.4313467443755092, + "grad_norm": 1.4674038887023926, + "learning_rate": 2.6420172679487453e-05, + "loss": 0.0692, + "step": 22840 + }, + { + "epoch": 1.431973428589334, + "grad_norm": 0.0249196607619524, + "learning_rate": 2.6409617698591967e-05, + "loss": 0.1458, + "step": 22850 + }, + { + "epoch": 1.4326001128031585, + "grad_norm": 0.018479831516742706, + "learning_rate": 2.6399062717696483e-05, + "loss": 0.0304, + "step": 22860 + }, + { + "epoch": 1.4332267970169832, + "grad_norm": 0.020352143794298172, + "learning_rate": 2.6388507736801e-05, + "loss": 0.0045, + "step": 22870 + }, + { + "epoch": 1.4338534812308077, + "grad_norm": 0.026012783870100975, + "learning_rate": 2.637795275590551e-05, + "loss": 0.1779, + "step": 22880 + }, + { + "epoch": 1.4344801654446324, + "grad_norm": 0.030733680352568626, + "learning_rate": 2.6367397775010027e-05, + "loss": 0.0091, + "step": 22890 + }, + { + "epoch": 1.4351068496584571, + "grad_norm": 0.16905340552330017, + "learning_rate": 2.6356842794114543e-05, + "loss": 0.1538, + "step": 22900 + }, + { + "epoch": 1.4357335338722819, + "grad_norm": 0.07733534276485443, + "learning_rate": 2.6346287813219063e-05, + "loss": 0.0962, + "step": 22910 + }, + { + "epoch": 1.4363602180861064, + "grad_norm": 0.12916617095470428, + "learning_rate": 2.6335732832323573e-05, + "loss": 0.0904, + "step": 22920 + }, + { + "epoch": 1.436986902299931, + "grad_norm": 0.05547713115811348, + "learning_rate": 2.632517785142809e-05, + "loss": 0.0416, + "step": 22930 + }, + { + "epoch": 1.4376135865137556, + "grad_norm": 4.29659366607666, + "learning_rate": 2.6314622870532606e-05, + "loss": 0.1677, + "step": 22940 + }, + { + "epoch": 1.4382402707275803, + "grad_norm": 0.09266739338636398, + "learning_rate": 2.630406788963712e-05, + "loss": 0.1797, + "step": 22950 + }, + { + "epoch": 1.438866954941405, + "grad_norm": 0.14676466584205627, + "learning_rate": 2.6293512908741636e-05, + "loss": 0.1898, + "step": 22960 + }, + { + "epoch": 1.4394936391552298, + "grad_norm": 13.134486198425293, + "learning_rate": 2.6282957927846153e-05, + "loss": 0.2278, + "step": 22970 + }, + { + "epoch": 1.4401203233690543, + "grad_norm": 0.40627726912498474, + "learning_rate": 2.627240294695067e-05, + "loss": 0.107, + "step": 22980 + }, + { + "epoch": 1.440747007582879, + "grad_norm": 0.21462000906467438, + "learning_rate": 2.626184796605518e-05, + "loss": 0.0617, + "step": 22990 + }, + { + "epoch": 1.4413736917967037, + "grad_norm": 0.14315448701381683, + "learning_rate": 2.6251292985159696e-05, + "loss": 0.2208, + "step": 23000 + }, + { + "epoch": 1.4420003760105282, + "grad_norm": 0.1582183539867401, + "learning_rate": 2.6240738004264216e-05, + "loss": 0.0407, + "step": 23010 + }, + { + "epoch": 1.442627060224353, + "grad_norm": 3.420944929122925, + "learning_rate": 2.6230183023368726e-05, + "loss": 0.2166, + "step": 23020 + }, + { + "epoch": 1.4432537444381777, + "grad_norm": 0.08734721690416336, + "learning_rate": 2.6219628042473243e-05, + "loss": 0.0895, + "step": 23030 + }, + { + "epoch": 1.4438804286520024, + "grad_norm": 3.684218406677246, + "learning_rate": 2.620907306157776e-05, + "loss": 0.1275, + "step": 23040 + }, + { + "epoch": 1.444507112865827, + "grad_norm": 57.07538604736328, + "learning_rate": 2.6198518080682276e-05, + "loss": 0.0456, + "step": 23050 + }, + { + "epoch": 1.4451337970796516, + "grad_norm": 0.13820041716098785, + "learning_rate": 2.618796309978679e-05, + "loss": 0.0564, + "step": 23060 + }, + { + "epoch": 1.4457604812934761, + "grad_norm": 0.377420037984848, + "learning_rate": 2.6177408118891306e-05, + "loss": 0.3262, + "step": 23070 + }, + { + "epoch": 1.4463871655073008, + "grad_norm": 14.740819931030273, + "learning_rate": 2.6166853137995823e-05, + "loss": 0.1122, + "step": 23080 + }, + { + "epoch": 1.4470138497211256, + "grad_norm": 0.07182967662811279, + "learning_rate": 2.615629815710034e-05, + "loss": 0.0342, + "step": 23090 + }, + { + "epoch": 1.4476405339349503, + "grad_norm": 0.36316248774528503, + "learning_rate": 2.6145743176204853e-05, + "loss": 0.2078, + "step": 23100 + }, + { + "epoch": 1.4482672181487748, + "grad_norm": 0.03609143942594528, + "learning_rate": 2.613518819530937e-05, + "loss": 0.0505, + "step": 23110 + }, + { + "epoch": 1.4488939023625995, + "grad_norm": 0.12309886515140533, + "learning_rate": 2.6124633214413886e-05, + "loss": 0.0456, + "step": 23120 + }, + { + "epoch": 1.449520586576424, + "grad_norm": 0.06193877011537552, + "learning_rate": 2.6114078233518396e-05, + "loss": 0.0057, + "step": 23130 + }, + { + "epoch": 1.4501472707902487, + "grad_norm": 0.04505753889679909, + "learning_rate": 2.6103523252622912e-05, + "loss": 0.2234, + "step": 23140 + }, + { + "epoch": 1.4507739550040735, + "grad_norm": 0.039813585579395294, + "learning_rate": 2.609296827172743e-05, + "loss": 0.137, + "step": 23150 + }, + { + "epoch": 1.4514006392178982, + "grad_norm": 0.10670791566371918, + "learning_rate": 2.608241329083195e-05, + "loss": 0.114, + "step": 23160 + }, + { + "epoch": 1.4520273234317227, + "grad_norm": 0.05462603643536568, + "learning_rate": 2.607185830993646e-05, + "loss": 0.1053, + "step": 23170 + }, + { + "epoch": 1.4526540076455474, + "grad_norm": 0.12284897267818451, + "learning_rate": 2.6061303329040976e-05, + "loss": 0.0603, + "step": 23180 + }, + { + "epoch": 1.4532806918593721, + "grad_norm": 0.1386725902557373, + "learning_rate": 2.6050748348145492e-05, + "loss": 0.0096, + "step": 23190 + }, + { + "epoch": 1.4539073760731966, + "grad_norm": 18.32283592224121, + "learning_rate": 2.6040193367250005e-05, + "loss": 0.1813, + "step": 23200 + }, + { + "epoch": 1.4545340602870214, + "grad_norm": 0.04442450776696205, + "learning_rate": 2.6029638386354522e-05, + "loss": 0.0997, + "step": 23210 + }, + { + "epoch": 1.455160744500846, + "grad_norm": 0.032071322202682495, + "learning_rate": 2.601908340545904e-05, + "loss": 0.0718, + "step": 23220 + }, + { + "epoch": 1.4557874287146706, + "grad_norm": 0.02288047969341278, + "learning_rate": 2.6008528424563555e-05, + "loss": 0.003, + "step": 23230 + }, + { + "epoch": 1.4564141129284953, + "grad_norm": 6.418118000030518, + "learning_rate": 2.5997973443668065e-05, + "loss": 0.0605, + "step": 23240 + }, + { + "epoch": 1.45704079714232, + "grad_norm": 0.02514999732375145, + "learning_rate": 2.5987418462772582e-05, + "loss": 0.1446, + "step": 23250 + }, + { + "epoch": 1.4576674813561445, + "grad_norm": 14.600542068481445, + "learning_rate": 2.5976863481877102e-05, + "loss": 0.0825, + "step": 23260 + }, + { + "epoch": 1.4582941655699693, + "grad_norm": 0.05681881681084633, + "learning_rate": 2.5966308500981612e-05, + "loss": 0.0676, + "step": 23270 + }, + { + "epoch": 1.458920849783794, + "grad_norm": 0.17089250683784485, + "learning_rate": 2.595575352008613e-05, + "loss": 0.1669, + "step": 23280 + }, + { + "epoch": 1.4595475339976187, + "grad_norm": 0.09021216630935669, + "learning_rate": 2.5945198539190645e-05, + "loss": 0.1839, + "step": 23290 + }, + { + "epoch": 1.4601742182114432, + "grad_norm": 0.03588192164897919, + "learning_rate": 2.5934643558295162e-05, + "loss": 0.0296, + "step": 23300 + }, + { + "epoch": 1.460800902425268, + "grad_norm": 0.7508852481842041, + "learning_rate": 2.5924088577399675e-05, + "loss": 0.1099, + "step": 23310 + }, + { + "epoch": 1.4614275866390924, + "grad_norm": 2.826711416244507, + "learning_rate": 2.5913533596504192e-05, + "loss": 0.1877, + "step": 23320 + }, + { + "epoch": 1.4620542708529172, + "grad_norm": 2.2574098110198975, + "learning_rate": 2.590297861560871e-05, + "loss": 0.1415, + "step": 23330 + }, + { + "epoch": 1.462680955066742, + "grad_norm": 2.780092239379883, + "learning_rate": 2.5892423634713218e-05, + "loss": 0.0736, + "step": 23340 + }, + { + "epoch": 1.4633076392805666, + "grad_norm": 0.1354340761899948, + "learning_rate": 2.588186865381774e-05, + "loss": 0.2455, + "step": 23350 + }, + { + "epoch": 1.4639343234943911, + "grad_norm": 0.03604722395539284, + "learning_rate": 2.5871313672922255e-05, + "loss": 0.0966, + "step": 23360 + }, + { + "epoch": 1.4645610077082158, + "grad_norm": 0.07439220696687698, + "learning_rate": 2.586075869202677e-05, + "loss": 0.1228, + "step": 23370 + }, + { + "epoch": 1.4651876919220403, + "grad_norm": 19.66285514831543, + "learning_rate": 2.585020371113128e-05, + "loss": 0.1963, + "step": 23380 + }, + { + "epoch": 1.465814376135865, + "grad_norm": 10.755349159240723, + "learning_rate": 2.5839648730235798e-05, + "loss": 0.1596, + "step": 23390 + }, + { + "epoch": 1.4664410603496898, + "grad_norm": 0.3825264871120453, + "learning_rate": 2.5829093749340315e-05, + "loss": 0.0476, + "step": 23400 + }, + { + "epoch": 1.4670677445635145, + "grad_norm": 0.985248863697052, + "learning_rate": 2.5818538768444828e-05, + "loss": 0.1206, + "step": 23410 + }, + { + "epoch": 1.467694428777339, + "grad_norm": 0.5918242335319519, + "learning_rate": 2.5807983787549345e-05, + "loss": 0.0314, + "step": 23420 + }, + { + "epoch": 1.4683211129911637, + "grad_norm": 0.038249701261520386, + "learning_rate": 2.579742880665386e-05, + "loss": 0.0369, + "step": 23430 + }, + { + "epoch": 1.4689477972049885, + "grad_norm": 0.04049490764737129, + "learning_rate": 2.5786873825758378e-05, + "loss": 0.1652, + "step": 23440 + }, + { + "epoch": 1.469574481418813, + "grad_norm": 0.027155594900250435, + "learning_rate": 2.577631884486289e-05, + "loss": 0.1038, + "step": 23450 + }, + { + "epoch": 1.4702011656326377, + "grad_norm": 1.8956443071365356, + "learning_rate": 2.5765763863967408e-05, + "loss": 0.135, + "step": 23460 + }, + { + "epoch": 1.4708278498464624, + "grad_norm": 0.02950974553823471, + "learning_rate": 2.5755208883071925e-05, + "loss": 0.2385, + "step": 23470 + }, + { + "epoch": 1.4714545340602871, + "grad_norm": 0.09720432013273239, + "learning_rate": 2.574465390217644e-05, + "loss": 0.0588, + "step": 23480 + }, + { + "epoch": 1.4720812182741116, + "grad_norm": 0.08742736279964447, + "learning_rate": 2.573409892128095e-05, + "loss": 0.0941, + "step": 23490 + }, + { + "epoch": 1.4727079024879364, + "grad_norm": 4.164999485015869, + "learning_rate": 2.5723543940385468e-05, + "loss": 0.2624, + "step": 23500 + }, + { + "epoch": 1.4733345867017609, + "grad_norm": 0.3092673420906067, + "learning_rate": 2.5712988959489988e-05, + "loss": 0.0784, + "step": 23510 + }, + { + "epoch": 1.4739612709155856, + "grad_norm": 0.16006037592887878, + "learning_rate": 2.5702433978594498e-05, + "loss": 0.0763, + "step": 23520 + }, + { + "epoch": 1.4745879551294103, + "grad_norm": 7.904965400695801, + "learning_rate": 2.5691878997699014e-05, + "loss": 0.2243, + "step": 23530 + }, + { + "epoch": 1.475214639343235, + "grad_norm": 7.310710430145264, + "learning_rate": 2.568132401680353e-05, + "loss": 0.1202, + "step": 23540 + }, + { + "epoch": 1.4758413235570595, + "grad_norm": 3.766096830368042, + "learning_rate": 2.5670769035908048e-05, + "loss": 0.1949, + "step": 23550 + }, + { + "epoch": 1.4764680077708843, + "grad_norm": 0.040846891701221466, + "learning_rate": 2.566021405501256e-05, + "loss": 0.1089, + "step": 23560 + }, + { + "epoch": 1.4770946919847088, + "grad_norm": 8.319687843322754, + "learning_rate": 2.5649659074117078e-05, + "loss": 0.1005, + "step": 23570 + }, + { + "epoch": 1.4777213761985335, + "grad_norm": 0.08896784484386444, + "learning_rate": 2.5639104093221594e-05, + "loss": 0.0395, + "step": 23580 + }, + { + "epoch": 1.4783480604123582, + "grad_norm": 0.05673737823963165, + "learning_rate": 2.5628549112326104e-05, + "loss": 0.0738, + "step": 23590 + }, + { + "epoch": 1.478974744626183, + "grad_norm": 0.14433351159095764, + "learning_rate": 2.5617994131430624e-05, + "loss": 0.111, + "step": 23600 + }, + { + "epoch": 1.4796014288400074, + "grad_norm": 0.09921615570783615, + "learning_rate": 2.560743915053514e-05, + "loss": 0.2772, + "step": 23610 + }, + { + "epoch": 1.4802281130538322, + "grad_norm": 0.5152047276496887, + "learning_rate": 2.5596884169639657e-05, + "loss": 0.0656, + "step": 23620 + }, + { + "epoch": 1.480854797267657, + "grad_norm": 0.6339969038963318, + "learning_rate": 2.5586329188744167e-05, + "loss": 0.0307, + "step": 23630 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 0.10018188506364822, + "learning_rate": 2.5575774207848684e-05, + "loss": 0.2019, + "step": 23640 + }, + { + "epoch": 1.4821081656953061, + "grad_norm": 11.523775100708008, + "learning_rate": 2.55652192269532e-05, + "loss": 0.0213, + "step": 23650 + }, + { + "epoch": 1.4827348499091308, + "grad_norm": 0.04751148447394371, + "learning_rate": 2.5554664246057714e-05, + "loss": 0.1503, + "step": 23660 + }, + { + "epoch": 1.4833615341229556, + "grad_norm": 0.05950698256492615, + "learning_rate": 2.554410926516223e-05, + "loss": 0.0822, + "step": 23670 + }, + { + "epoch": 1.48398821833678, + "grad_norm": 2.1161811351776123, + "learning_rate": 2.5533554284266747e-05, + "loss": 0.1776, + "step": 23680 + }, + { + "epoch": 1.4846149025506048, + "grad_norm": 0.06839194893836975, + "learning_rate": 2.5522999303371264e-05, + "loss": 0.0641, + "step": 23690 + }, + { + "epoch": 1.4852415867644293, + "grad_norm": 0.21065391600131989, + "learning_rate": 2.5512444322475777e-05, + "loss": 0.0117, + "step": 23700 + }, + { + "epoch": 1.485868270978254, + "grad_norm": 9.681075096130371, + "learning_rate": 2.5501889341580294e-05, + "loss": 0.1, + "step": 23710 + }, + { + "epoch": 1.4864949551920787, + "grad_norm": 0.11779667437076569, + "learning_rate": 2.549133436068481e-05, + "loss": 0.0378, + "step": 23720 + }, + { + "epoch": 1.4871216394059035, + "grad_norm": 1.3199436664581299, + "learning_rate": 2.548077937978932e-05, + "loss": 0.1533, + "step": 23730 + }, + { + "epoch": 1.487748323619728, + "grad_norm": 0.03990645334124565, + "learning_rate": 2.5470224398893837e-05, + "loss": 0.0279, + "step": 23740 + }, + { + "epoch": 1.4883750078335527, + "grad_norm": 0.021615929901599884, + "learning_rate": 2.5459669417998357e-05, + "loss": 0.1285, + "step": 23750 + }, + { + "epoch": 1.4890016920473772, + "grad_norm": 0.013920389115810394, + "learning_rate": 2.5449114437102874e-05, + "loss": 0.2084, + "step": 23760 + }, + { + "epoch": 1.489628376261202, + "grad_norm": 0.08621327579021454, + "learning_rate": 2.5438559456207383e-05, + "loss": 0.1109, + "step": 23770 + }, + { + "epoch": 1.4902550604750266, + "grad_norm": 0.018220432102680206, + "learning_rate": 2.54280044753119e-05, + "loss": 0.1331, + "step": 23780 + }, + { + "epoch": 1.4908817446888514, + "grad_norm": 1.2734665870666504, + "learning_rate": 2.5417449494416417e-05, + "loss": 0.0118, + "step": 23790 + }, + { + "epoch": 1.4915084289026759, + "grad_norm": 22.070463180541992, + "learning_rate": 2.540689451352093e-05, + "loss": 0.0705, + "step": 23800 + }, + { + "epoch": 1.4921351131165006, + "grad_norm": 228.2330780029297, + "learning_rate": 2.5396339532625447e-05, + "loss": 0.0961, + "step": 23810 + }, + { + "epoch": 1.4927617973303253, + "grad_norm": 6.635834217071533, + "learning_rate": 2.5385784551729963e-05, + "loss": 0.2087, + "step": 23820 + }, + { + "epoch": 1.4933884815441498, + "grad_norm": 0.03844684734940529, + "learning_rate": 2.537522957083448e-05, + "loss": 0.1419, + "step": 23830 + }, + { + "epoch": 1.4940151657579746, + "grad_norm": 0.1389332115650177, + "learning_rate": 2.536467458993899e-05, + "loss": 0.2215, + "step": 23840 + }, + { + "epoch": 1.4946418499717993, + "grad_norm": 0.015771828591823578, + "learning_rate": 2.535411960904351e-05, + "loss": 0.0782, + "step": 23850 + }, + { + "epoch": 1.495268534185624, + "grad_norm": 0.6827368140220642, + "learning_rate": 2.5343564628148027e-05, + "loss": 0.0727, + "step": 23860 + }, + { + "epoch": 1.4958952183994485, + "grad_norm": 0.11785943806171417, + "learning_rate": 2.5333009647252543e-05, + "loss": 0.0739, + "step": 23870 + }, + { + "epoch": 1.4965219026132732, + "grad_norm": 0.11551948636770248, + "learning_rate": 2.5322454666357053e-05, + "loss": 0.1316, + "step": 23880 + }, + { + "epoch": 1.4971485868270977, + "grad_norm": 9.339971542358398, + "learning_rate": 2.531189968546157e-05, + "loss": 0.0852, + "step": 23890 + }, + { + "epoch": 1.4977752710409225, + "grad_norm": 0.8613000512123108, + "learning_rate": 2.5301344704566086e-05, + "loss": 0.094, + "step": 23900 + }, + { + "epoch": 1.4984019552547472, + "grad_norm": 0.05517628416419029, + "learning_rate": 2.52907897236706e-05, + "loss": 0.2456, + "step": 23910 + }, + { + "epoch": 1.499028639468572, + "grad_norm": 0.04880961403250694, + "learning_rate": 2.5280234742775116e-05, + "loss": 0.1071, + "step": 23920 + }, + { + "epoch": 1.4996553236823964, + "grad_norm": 0.04997780919075012, + "learning_rate": 2.5269679761879633e-05, + "loss": 0.0426, + "step": 23930 + }, + { + "epoch": 1.5002820078962211, + "grad_norm": 0.047217972576618195, + "learning_rate": 2.525912478098415e-05, + "loss": 0.1243, + "step": 23940 + }, + { + "epoch": 1.5009086921100456, + "grad_norm": 0.09410211443901062, + "learning_rate": 2.5248569800088663e-05, + "loss": 0.0575, + "step": 23950 + }, + { + "epoch": 1.5015353763238704, + "grad_norm": 3.0829415321350098, + "learning_rate": 2.523801481919318e-05, + "loss": 0.1165, + "step": 23960 + }, + { + "epoch": 1.502162060537695, + "grad_norm": 0.029031164944171906, + "learning_rate": 2.5227459838297696e-05, + "loss": 0.1641, + "step": 23970 + }, + { + "epoch": 1.5027887447515198, + "grad_norm": 0.09065140038728714, + "learning_rate": 2.5216904857402206e-05, + "loss": 0.0072, + "step": 23980 + }, + { + "epoch": 1.5034154289653445, + "grad_norm": 0.08739957213401794, + "learning_rate": 2.5206349876506723e-05, + "loss": 0.1343, + "step": 23990 + }, + { + "epoch": 1.504042113179169, + "grad_norm": 0.4279542863368988, + "learning_rate": 2.5195794895611243e-05, + "loss": 0.0366, + "step": 24000 + }, + { + "epoch": 1.5046687973929935, + "grad_norm": 0.09616062790155411, + "learning_rate": 2.518523991471576e-05, + "loss": 0.0371, + "step": 24010 + }, + { + "epoch": 1.5052954816068183, + "grad_norm": 0.1089567318558693, + "learning_rate": 2.517468493382027e-05, + "loss": 0.2015, + "step": 24020 + }, + { + "epoch": 1.505922165820643, + "grad_norm": 0.04283067211508751, + "learning_rate": 2.5164129952924786e-05, + "loss": 0.1507, + "step": 24030 + }, + { + "epoch": 1.5065488500344677, + "grad_norm": 0.09019152820110321, + "learning_rate": 2.5153574972029303e-05, + "loss": 0.0534, + "step": 24040 + }, + { + "epoch": 1.5071755342482924, + "grad_norm": 0.11618775874376297, + "learning_rate": 2.5143019991133816e-05, + "loss": 0.1746, + "step": 24050 + }, + { + "epoch": 1.507802218462117, + "grad_norm": 1.8899295330047607, + "learning_rate": 2.5132465010238332e-05, + "loss": 0.0871, + "step": 24060 + }, + { + "epoch": 1.5084289026759414, + "grad_norm": 0.05963942036032677, + "learning_rate": 2.512191002934285e-05, + "loss": 0.073, + "step": 24070 + }, + { + "epoch": 1.5090555868897662, + "grad_norm": 12.7268705368042, + "learning_rate": 2.5111355048447366e-05, + "loss": 0.128, + "step": 24080 + }, + { + "epoch": 1.5096822711035909, + "grad_norm": 0.5341861844062805, + "learning_rate": 2.5100800067551876e-05, + "loss": 0.0791, + "step": 24090 + }, + { + "epoch": 1.5103089553174156, + "grad_norm": 2.8714306354522705, + "learning_rate": 2.5090245086656396e-05, + "loss": 0.0371, + "step": 24100 + }, + { + "epoch": 1.5109356395312403, + "grad_norm": 0.1821633130311966, + "learning_rate": 2.5079690105760912e-05, + "loss": 0.0986, + "step": 24110 + }, + { + "epoch": 1.5115623237450648, + "grad_norm": 1.7185479402542114, + "learning_rate": 2.5069135124865422e-05, + "loss": 0.139, + "step": 24120 + }, + { + "epoch": 1.5121890079588896, + "grad_norm": 4.6903977394104, + "learning_rate": 2.505858014396994e-05, + "loss": 0.1248, + "step": 24130 + }, + { + "epoch": 1.512815692172714, + "grad_norm": 9.343801498413086, + "learning_rate": 2.5048025163074456e-05, + "loss": 0.1528, + "step": 24140 + }, + { + "epoch": 1.5134423763865388, + "grad_norm": 0.07593511790037155, + "learning_rate": 2.5037470182178972e-05, + "loss": 0.0746, + "step": 24150 + }, + { + "epoch": 1.5140690606003635, + "grad_norm": 0.07884012162685394, + "learning_rate": 2.5026915201283485e-05, + "loss": 0.1444, + "step": 24160 + }, + { + "epoch": 1.5146957448141882, + "grad_norm": 1.0196356773376465, + "learning_rate": 2.5016360220388002e-05, + "loss": 0.014, + "step": 24170 + }, + { + "epoch": 1.5153224290280127, + "grad_norm": 3.953526735305786, + "learning_rate": 2.500580523949252e-05, + "loss": 0.1165, + "step": 24180 + }, + { + "epoch": 1.5159491132418375, + "grad_norm": 0.19802182912826538, + "learning_rate": 2.4995250258597032e-05, + "loss": 0.0583, + "step": 24190 + }, + { + "epoch": 1.516575797455662, + "grad_norm": 0.1346462517976761, + "learning_rate": 2.498469527770155e-05, + "loss": 0.0169, + "step": 24200 + }, + { + "epoch": 1.5172024816694867, + "grad_norm": 0.08564218133687973, + "learning_rate": 2.4974140296806065e-05, + "loss": 0.0021, + "step": 24210 + }, + { + "epoch": 1.5178291658833114, + "grad_norm": 3.5072386264801025, + "learning_rate": 2.496358531591058e-05, + "loss": 0.0853, + "step": 24220 + }, + { + "epoch": 1.5184558500971361, + "grad_norm": 0.09622887521982193, + "learning_rate": 2.4953030335015095e-05, + "loss": 0.1146, + "step": 24230 + }, + { + "epoch": 1.5190825343109609, + "grad_norm": 0.17273660004138947, + "learning_rate": 2.494247535411961e-05, + "loss": 0.0363, + "step": 24240 + }, + { + "epoch": 1.5197092185247854, + "grad_norm": 0.019315486773848534, + "learning_rate": 2.493192037322413e-05, + "loss": 0.1146, + "step": 24250 + }, + { + "epoch": 1.5203359027386099, + "grad_norm": 0.017586050555109978, + "learning_rate": 2.4921365392328642e-05, + "loss": 0.1751, + "step": 24260 + }, + { + "epoch": 1.5209625869524346, + "grad_norm": 5.868565559387207, + "learning_rate": 2.4910810411433155e-05, + "loss": 0.1345, + "step": 24270 + }, + { + "epoch": 1.5215892711662593, + "grad_norm": 0.08116251975297928, + "learning_rate": 2.490025543053767e-05, + "loss": 0.309, + "step": 24280 + }, + { + "epoch": 1.522215955380084, + "grad_norm": 2.3880562782287598, + "learning_rate": 2.4889700449642185e-05, + "loss": 0.2768, + "step": 24290 + }, + { + "epoch": 1.5228426395939088, + "grad_norm": 0.20012100040912628, + "learning_rate": 2.4879145468746705e-05, + "loss": 0.0256, + "step": 24300 + }, + { + "epoch": 1.5234693238077333, + "grad_norm": 0.08985351771116257, + "learning_rate": 2.4868590487851218e-05, + "loss": 0.0712, + "step": 24310 + }, + { + "epoch": 1.524096008021558, + "grad_norm": 0.023399904370307922, + "learning_rate": 2.4858035506955735e-05, + "loss": 0.0897, + "step": 24320 + }, + { + "epoch": 1.5247226922353825, + "grad_norm": 0.08582006394863129, + "learning_rate": 2.4847480526060248e-05, + "loss": 0.0974, + "step": 24330 + }, + { + "epoch": 1.5253493764492072, + "grad_norm": 0.03750643879175186, + "learning_rate": 2.483692554516476e-05, + "loss": 0.1763, + "step": 24340 + }, + { + "epoch": 1.525976060663032, + "grad_norm": 0.04496735706925392, + "learning_rate": 2.482637056426928e-05, + "loss": 0.0854, + "step": 24350 + }, + { + "epoch": 1.5266027448768567, + "grad_norm": 0.06347126513719559, + "learning_rate": 2.4815815583373795e-05, + "loss": 0.0259, + "step": 24360 + }, + { + "epoch": 1.5272294290906812, + "grad_norm": 0.05704139918088913, + "learning_rate": 2.480526060247831e-05, + "loss": 0.1453, + "step": 24370 + }, + { + "epoch": 1.5278561133045059, + "grad_norm": 0.04704032838344574, + "learning_rate": 2.4794705621582825e-05, + "loss": 0.0647, + "step": 24380 + }, + { + "epoch": 1.5284827975183304, + "grad_norm": 0.05256137251853943, + "learning_rate": 2.478415064068734e-05, + "loss": 0.0646, + "step": 24390 + }, + { + "epoch": 1.529109481732155, + "grad_norm": 0.05079631507396698, + "learning_rate": 2.4773595659791858e-05, + "loss": 0.1603, + "step": 24400 + }, + { + "epoch": 1.5297361659459798, + "grad_norm": 0.0261594969779253, + "learning_rate": 2.4763040678896375e-05, + "loss": 0.0042, + "step": 24410 + }, + { + "epoch": 1.5303628501598046, + "grad_norm": 7.628668785095215, + "learning_rate": 2.4752485698000888e-05, + "loss": 0.2903, + "step": 24420 + }, + { + "epoch": 1.5309895343736293, + "grad_norm": 3.294372081756592, + "learning_rate": 2.47419307171054e-05, + "loss": 0.2368, + "step": 24430 + }, + { + "epoch": 1.5316162185874538, + "grad_norm": 4.897356986999512, + "learning_rate": 2.4731375736209918e-05, + "loss": 0.1527, + "step": 24440 + }, + { + "epoch": 1.5322429028012783, + "grad_norm": 0.15121100842952728, + "learning_rate": 2.4720820755314434e-05, + "loss": 0.0644, + "step": 24450 + }, + { + "epoch": 1.532869587015103, + "grad_norm": 14.861132621765137, + "learning_rate": 2.471026577441895e-05, + "loss": 0.1316, + "step": 24460 + }, + { + "epoch": 1.5334962712289277, + "grad_norm": 0.042586345225572586, + "learning_rate": 2.4699710793523464e-05, + "loss": 0.286, + "step": 24470 + }, + { + "epoch": 1.5341229554427525, + "grad_norm": 1.1130905151367188, + "learning_rate": 2.468915581262798e-05, + "loss": 0.1371, + "step": 24480 + }, + { + "epoch": 1.5347496396565772, + "grad_norm": 0.2588573396205902, + "learning_rate": 2.4678600831732494e-05, + "loss": 0.0332, + "step": 24490 + }, + { + "epoch": 1.5353763238704017, + "grad_norm": 0.22446982562541962, + "learning_rate": 2.466804585083701e-05, + "loss": 0.0147, + "step": 24500 + }, + { + "epoch": 1.5360030080842264, + "grad_norm": 0.251001238822937, + "learning_rate": 2.4657490869941528e-05, + "loss": 0.0458, + "step": 24510 + }, + { + "epoch": 1.536629692298051, + "grad_norm": 4.180436134338379, + "learning_rate": 2.464693588904604e-05, + "loss": 0.1391, + "step": 24520 + }, + { + "epoch": 1.5372563765118756, + "grad_norm": 0.04928579553961754, + "learning_rate": 2.4636380908150557e-05, + "loss": 0.1121, + "step": 24530 + }, + { + "epoch": 1.5378830607257004, + "grad_norm": 0.12214578688144684, + "learning_rate": 2.462582592725507e-05, + "loss": 0.2212, + "step": 24540 + }, + { + "epoch": 1.538509744939525, + "grad_norm": 0.13402099907398224, + "learning_rate": 2.461527094635959e-05, + "loss": 0.1176, + "step": 24550 + }, + { + "epoch": 1.5391364291533496, + "grad_norm": 0.04098990187048912, + "learning_rate": 2.4604715965464104e-05, + "loss": 0.0895, + "step": 24560 + }, + { + "epoch": 1.5397631133671743, + "grad_norm": 0.22888420522212982, + "learning_rate": 2.4594160984568617e-05, + "loss": 0.0536, + "step": 24570 + }, + { + "epoch": 1.5403897975809988, + "grad_norm": 0.09464392066001892, + "learning_rate": 2.4583606003673134e-05, + "loss": 0.0432, + "step": 24580 + }, + { + "epoch": 1.5410164817948235, + "grad_norm": 0.012952485121786594, + "learning_rate": 2.457305102277765e-05, + "loss": 0.0356, + "step": 24590 + }, + { + "epoch": 1.5416431660086483, + "grad_norm": 2.1808762550354004, + "learning_rate": 2.4562496041882167e-05, + "loss": 0.2454, + "step": 24600 + }, + { + "epoch": 1.542269850222473, + "grad_norm": 0.05670209601521492, + "learning_rate": 2.455194106098668e-05, + "loss": 0.0498, + "step": 24610 + }, + { + "epoch": 1.5428965344362977, + "grad_norm": 0.48406851291656494, + "learning_rate": 2.4541386080091197e-05, + "loss": 0.0974, + "step": 24620 + }, + { + "epoch": 1.5435232186501222, + "grad_norm": 0.687736988067627, + "learning_rate": 2.453083109919571e-05, + "loss": 0.027, + "step": 24630 + }, + { + "epoch": 1.5441499028639467, + "grad_norm": 0.5006890892982483, + "learning_rate": 2.4520276118300227e-05, + "loss": 0.093, + "step": 24640 + }, + { + "epoch": 1.5447765870777714, + "grad_norm": 0.007649289909750223, + "learning_rate": 2.4509721137404744e-05, + "loss": 0.0057, + "step": 24650 + }, + { + "epoch": 1.5454032712915962, + "grad_norm": 0.055010851472616196, + "learning_rate": 2.4499166156509257e-05, + "loss": 0.108, + "step": 24660 + }, + { + "epoch": 1.5460299555054209, + "grad_norm": 0.07023721933364868, + "learning_rate": 2.4488611175613774e-05, + "loss": 0.0254, + "step": 24670 + }, + { + "epoch": 1.5466566397192456, + "grad_norm": 0.15422725677490234, + "learning_rate": 2.4478056194718287e-05, + "loss": 0.1481, + "step": 24680 + }, + { + "epoch": 1.54728332393307, + "grad_norm": 0.0953349694609642, + "learning_rate": 2.4467501213822804e-05, + "loss": 0.0028, + "step": 24690 + }, + { + "epoch": 1.5479100081468948, + "grad_norm": 0.03705024719238281, + "learning_rate": 2.445694623292732e-05, + "loss": 0.187, + "step": 24700 + }, + { + "epoch": 1.5485366923607193, + "grad_norm": 0.224687397480011, + "learning_rate": 2.4446391252031837e-05, + "loss": 0.1034, + "step": 24710 + }, + { + "epoch": 1.549163376574544, + "grad_norm": 0.056175027042627335, + "learning_rate": 2.443583627113635e-05, + "loss": 0.0122, + "step": 24720 + }, + { + "epoch": 1.5497900607883688, + "grad_norm": 3.002753496170044, + "learning_rate": 2.4425281290240863e-05, + "loss": 0.1492, + "step": 24730 + }, + { + "epoch": 1.5504167450021935, + "grad_norm": 0.20835314691066742, + "learning_rate": 2.441472630934538e-05, + "loss": 0.0698, + "step": 24740 + }, + { + "epoch": 1.551043429216018, + "grad_norm": 40.73422622680664, + "learning_rate": 2.4404171328449897e-05, + "loss": 0.2096, + "step": 24750 + }, + { + "epoch": 1.5516701134298427, + "grad_norm": 0.05524049699306488, + "learning_rate": 2.4393616347554413e-05, + "loss": 0.0075, + "step": 24760 + }, + { + "epoch": 1.5522967976436672, + "grad_norm": 0.08031153678894043, + "learning_rate": 2.4383061366658927e-05, + "loss": 0.2378, + "step": 24770 + }, + { + "epoch": 1.552923481857492, + "grad_norm": 0.09899525344371796, + "learning_rate": 2.4372506385763443e-05, + "loss": 0.1061, + "step": 24780 + }, + { + "epoch": 1.5535501660713167, + "grad_norm": 0.06336953490972519, + "learning_rate": 2.4361951404867957e-05, + "loss": 0.0193, + "step": 24790 + }, + { + "epoch": 1.5541768502851414, + "grad_norm": 0.03865973278880119, + "learning_rate": 2.4351396423972477e-05, + "loss": 0.2515, + "step": 24800 + }, + { + "epoch": 1.5548035344989661, + "grad_norm": 0.33897021412849426, + "learning_rate": 2.434084144307699e-05, + "loss": 0.0617, + "step": 24810 + }, + { + "epoch": 1.5554302187127906, + "grad_norm": 6.761913776397705, + "learning_rate": 2.4330286462181503e-05, + "loss": 0.1185, + "step": 24820 + }, + { + "epoch": 1.5560569029266151, + "grad_norm": 0.1353708952665329, + "learning_rate": 2.431973148128602e-05, + "loss": 0.048, + "step": 24830 + }, + { + "epoch": 1.5566835871404399, + "grad_norm": 0.38180655241012573, + "learning_rate": 2.4309176500390536e-05, + "loss": 0.098, + "step": 24840 + }, + { + "epoch": 1.5573102713542646, + "grad_norm": 0.05842465162277222, + "learning_rate": 2.4298621519495053e-05, + "loss": 0.1091, + "step": 24850 + }, + { + "epoch": 1.5579369555680893, + "grad_norm": 27.77541160583496, + "learning_rate": 2.4288066538599566e-05, + "loss": 0.1175, + "step": 24860 + }, + { + "epoch": 1.558563639781914, + "grad_norm": 0.08940772712230682, + "learning_rate": 2.4277511557704083e-05, + "loss": 0.0748, + "step": 24870 + }, + { + "epoch": 1.5591903239957385, + "grad_norm": 0.2604180872440338, + "learning_rate": 2.4266956576808596e-05, + "loss": 0.11, + "step": 24880 + }, + { + "epoch": 1.559817008209563, + "grad_norm": 0.12213179469108582, + "learning_rate": 2.4256401595913113e-05, + "loss": 0.0412, + "step": 24890 + }, + { + "epoch": 1.5604436924233878, + "grad_norm": 9.398280143737793, + "learning_rate": 2.424584661501763e-05, + "loss": 0.0677, + "step": 24900 + }, + { + "epoch": 1.5610703766372125, + "grad_norm": 10.272821426391602, + "learning_rate": 2.4235291634122143e-05, + "loss": 0.1476, + "step": 24910 + }, + { + "epoch": 1.5616970608510372, + "grad_norm": 0.024012258276343346, + "learning_rate": 2.422473665322666e-05, + "loss": 0.0965, + "step": 24920 + }, + { + "epoch": 1.562323745064862, + "grad_norm": 0.02501060627400875, + "learning_rate": 2.4214181672331173e-05, + "loss": 0.0118, + "step": 24930 + }, + { + "epoch": 1.5629504292786864, + "grad_norm": 5.121936321258545, + "learning_rate": 2.420362669143569e-05, + "loss": 0.1386, + "step": 24940 + }, + { + "epoch": 1.5635771134925112, + "grad_norm": 0.07374687492847443, + "learning_rate": 2.4193071710540206e-05, + "loss": 0.0618, + "step": 24950 + }, + { + "epoch": 1.5642037977063357, + "grad_norm": 0.05653063580393791, + "learning_rate": 2.418251672964472e-05, + "loss": 0.0351, + "step": 24960 + }, + { + "epoch": 1.5648304819201604, + "grad_norm": 89.24818420410156, + "learning_rate": 2.4171961748749236e-05, + "loss": 0.5215, + "step": 24970 + }, + { + "epoch": 1.5654571661339851, + "grad_norm": 7.567296504974365, + "learning_rate": 2.416140676785375e-05, + "loss": 0.244, + "step": 24980 + }, + { + "epoch": 1.5660838503478098, + "grad_norm": 0.25020354986190796, + "learning_rate": 2.4150851786958266e-05, + "loss": 0.008, + "step": 24990 + }, + { + "epoch": 1.5667105345616343, + "grad_norm": 0.6312108635902405, + "learning_rate": 2.4140296806062782e-05, + "loss": 0.0212, + "step": 25000 + }, + { + "epoch": 1.567337218775459, + "grad_norm": 0.08929482102394104, + "learning_rate": 2.41297418251673e-05, + "loss": 0.1959, + "step": 25010 + }, + { + "epoch": 1.5679639029892836, + "grad_norm": 4.561793327331543, + "learning_rate": 2.4119186844271812e-05, + "loss": 0.2022, + "step": 25020 + }, + { + "epoch": 1.5685905872031083, + "grad_norm": 0.09889715164899826, + "learning_rate": 2.410863186337633e-05, + "loss": 0.165, + "step": 25030 + }, + { + "epoch": 1.569217271416933, + "grad_norm": 0.21414883434772491, + "learning_rate": 2.4098076882480842e-05, + "loss": 0.152, + "step": 25040 + }, + { + "epoch": 1.5698439556307577, + "grad_norm": 0.10287700593471527, + "learning_rate": 2.408752190158536e-05, + "loss": 0.0582, + "step": 25050 + }, + { + "epoch": 1.5704706398445825, + "grad_norm": 0.013686907477676868, + "learning_rate": 2.4076966920689876e-05, + "loss": 0.0469, + "step": 25060 + }, + { + "epoch": 1.571097324058407, + "grad_norm": 2.523181915283203, + "learning_rate": 2.406641193979439e-05, + "loss": 0.0847, + "step": 25070 + }, + { + "epoch": 1.5717240082722315, + "grad_norm": 0.11900023370981216, + "learning_rate": 2.4055856958898906e-05, + "loss": 0.185, + "step": 25080 + }, + { + "epoch": 1.5723506924860562, + "grad_norm": 9.391716003417969, + "learning_rate": 2.4045301978003422e-05, + "loss": 0.1941, + "step": 25090 + }, + { + "epoch": 1.572977376699881, + "grad_norm": 0.12244875729084015, + "learning_rate": 2.403474699710794e-05, + "loss": 0.1257, + "step": 25100 + }, + { + "epoch": 1.5736040609137056, + "grad_norm": 6.436902046203613, + "learning_rate": 2.4024192016212452e-05, + "loss": 0.0995, + "step": 25110 + }, + { + "epoch": 1.5742307451275304, + "grad_norm": 10.673419952392578, + "learning_rate": 2.4013637035316965e-05, + "loss": 0.1573, + "step": 25120 + }, + { + "epoch": 1.5748574293413549, + "grad_norm": 0.02762596309185028, + "learning_rate": 2.4003082054421482e-05, + "loss": 0.0856, + "step": 25130 + }, + { + "epoch": 1.5754841135551796, + "grad_norm": 0.051068298518657684, + "learning_rate": 2.3992527073526e-05, + "loss": 0.0035, + "step": 25140 + }, + { + "epoch": 1.576110797769004, + "grad_norm": 0.03830769285559654, + "learning_rate": 2.3981972092630515e-05, + "loss": 0.065, + "step": 25150 + }, + { + "epoch": 1.5767374819828288, + "grad_norm": 5.371109962463379, + "learning_rate": 2.397141711173503e-05, + "loss": 0.1066, + "step": 25160 + }, + { + "epoch": 1.5773641661966535, + "grad_norm": 0.01236297283321619, + "learning_rate": 2.3960862130839545e-05, + "loss": 0.1319, + "step": 25170 + }, + { + "epoch": 1.5779908504104783, + "grad_norm": 8.42038345336914, + "learning_rate": 2.395030714994406e-05, + "loss": 0.1462, + "step": 25180 + }, + { + "epoch": 1.5786175346243028, + "grad_norm": 13.973833084106445, + "learning_rate": 2.3939752169048575e-05, + "loss": 0.0665, + "step": 25190 + }, + { + "epoch": 1.5792442188381275, + "grad_norm": 0.06125205382704735, + "learning_rate": 2.3929197188153092e-05, + "loss": 0.0698, + "step": 25200 + }, + { + "epoch": 1.579870903051952, + "grad_norm": 0.10730314999818802, + "learning_rate": 2.3918642207257605e-05, + "loss": 0.0668, + "step": 25210 + }, + { + "epoch": 1.5804975872657767, + "grad_norm": 10.271492958068848, + "learning_rate": 2.3908087226362122e-05, + "loss": 0.148, + "step": 25220 + }, + { + "epoch": 1.5811242714796014, + "grad_norm": 0.1615249067544937, + "learning_rate": 2.3897532245466635e-05, + "loss": 0.0969, + "step": 25230 + }, + { + "epoch": 1.5817509556934262, + "grad_norm": 0.05994172394275665, + "learning_rate": 2.388697726457115e-05, + "loss": 0.0318, + "step": 25240 + }, + { + "epoch": 1.5823776399072509, + "grad_norm": 0.10029243677854538, + "learning_rate": 2.3876422283675668e-05, + "loss": 0.1715, + "step": 25250 + }, + { + "epoch": 1.5830043241210754, + "grad_norm": 0.036729391664266586, + "learning_rate": 2.3865867302780185e-05, + "loss": 0.1219, + "step": 25260 + }, + { + "epoch": 1.5836310083349, + "grad_norm": 6.817121505737305, + "learning_rate": 2.3855312321884698e-05, + "loss": 0.0587, + "step": 25270 + }, + { + "epoch": 1.5842576925487246, + "grad_norm": 0.25154295563697815, + "learning_rate": 2.384475734098921e-05, + "loss": 0.2074, + "step": 25280 + }, + { + "epoch": 1.5848843767625493, + "grad_norm": 0.04093783348798752, + "learning_rate": 2.3834202360093728e-05, + "loss": 0.1101, + "step": 25290 + }, + { + "epoch": 1.585511060976374, + "grad_norm": 0.13500647246837616, + "learning_rate": 2.3823647379198245e-05, + "loss": 0.0332, + "step": 25300 + }, + { + "epoch": 1.5861377451901988, + "grad_norm": 0.03615393117070198, + "learning_rate": 2.381309239830276e-05, + "loss": 0.1278, + "step": 25310 + }, + { + "epoch": 1.5867644294040233, + "grad_norm": 0.051414694637060165, + "learning_rate": 2.3802537417407275e-05, + "loss": 0.1995, + "step": 25320 + }, + { + "epoch": 1.587391113617848, + "grad_norm": 1.4407033920288086, + "learning_rate": 2.379198243651179e-05, + "loss": 0.0081, + "step": 25330 + }, + { + "epoch": 1.5880177978316725, + "grad_norm": 9.254353523254395, + "learning_rate": 2.3781427455616308e-05, + "loss": 0.1209, + "step": 25340 + }, + { + "epoch": 1.5886444820454972, + "grad_norm": 0.05505969375371933, + "learning_rate": 2.377087247472082e-05, + "loss": 0.2014, + "step": 25350 + }, + { + "epoch": 1.589271166259322, + "grad_norm": 0.13277609646320343, + "learning_rate": 2.3760317493825338e-05, + "loss": 0.1068, + "step": 25360 + }, + { + "epoch": 1.5898978504731467, + "grad_norm": 0.5146042108535767, + "learning_rate": 2.374976251292985e-05, + "loss": 0.0216, + "step": 25370 + }, + { + "epoch": 1.5905245346869712, + "grad_norm": 0.35294288396835327, + "learning_rate": 2.3739207532034368e-05, + "loss": 0.0596, + "step": 25380 + }, + { + "epoch": 1.591151218900796, + "grad_norm": 28.5668888092041, + "learning_rate": 2.3728652551138884e-05, + "loss": 0.1636, + "step": 25390 + }, + { + "epoch": 1.5917779031146204, + "grad_norm": 0.3631909489631653, + "learning_rate": 2.37180975702434e-05, + "loss": 0.0676, + "step": 25400 + }, + { + "epoch": 1.5924045873284451, + "grad_norm": 0.10231323540210724, + "learning_rate": 2.3707542589347914e-05, + "loss": 0.1439, + "step": 25410 + }, + { + "epoch": 1.5930312715422699, + "grad_norm": 6.930258274078369, + "learning_rate": 2.369698760845243e-05, + "loss": 0.2736, + "step": 25420 + }, + { + "epoch": 1.5936579557560946, + "grad_norm": 4.582469940185547, + "learning_rate": 2.3686432627556944e-05, + "loss": 0.0806, + "step": 25430 + }, + { + "epoch": 1.5942846399699193, + "grad_norm": 0.033723387867212296, + "learning_rate": 2.367587764666146e-05, + "loss": 0.0098, + "step": 25440 + }, + { + "epoch": 1.5949113241837438, + "grad_norm": 1.0349847078323364, + "learning_rate": 2.3665322665765978e-05, + "loss": 0.0684, + "step": 25450 + }, + { + "epoch": 1.5955380083975683, + "grad_norm": 0.03423527255654335, + "learning_rate": 2.365476768487049e-05, + "loss": 0.1973, + "step": 25460 + }, + { + "epoch": 1.596164692611393, + "grad_norm": 8.22195053100586, + "learning_rate": 2.3644212703975008e-05, + "loss": 0.0391, + "step": 25470 + }, + { + "epoch": 1.5967913768252178, + "grad_norm": 1.6201304197311401, + "learning_rate": 2.363365772307952e-05, + "loss": 0.1338, + "step": 25480 + }, + { + "epoch": 1.5974180610390425, + "grad_norm": 1.193275809288025, + "learning_rate": 2.3623102742184037e-05, + "loss": 0.0354, + "step": 25490 + }, + { + "epoch": 1.5980447452528672, + "grad_norm": 0.09546542167663574, + "learning_rate": 2.3612547761288554e-05, + "loss": 0.1602, + "step": 25500 + }, + { + "epoch": 1.5986714294666917, + "grad_norm": 0.06019541993737221, + "learning_rate": 2.3601992780393067e-05, + "loss": 0.0713, + "step": 25510 + }, + { + "epoch": 1.5992981136805164, + "grad_norm": 0.02078704722225666, + "learning_rate": 2.3591437799497584e-05, + "loss": 0.012, + "step": 25520 + }, + { + "epoch": 1.599924797894341, + "grad_norm": 0.054111577570438385, + "learning_rate": 2.3580882818602097e-05, + "loss": 0.0506, + "step": 25530 + }, + { + "epoch": 1.6005514821081657, + "grad_norm": 0.15975138545036316, + "learning_rate": 2.3570327837706614e-05, + "loss": 0.1461, + "step": 25540 + }, + { + "epoch": 1.6011781663219904, + "grad_norm": 0.020044539123773575, + "learning_rate": 2.355977285681113e-05, + "loss": 0.0946, + "step": 25550 + }, + { + "epoch": 1.6018048505358151, + "grad_norm": 0.08575562387704849, + "learning_rate": 2.3549217875915647e-05, + "loss": 0.1476, + "step": 25560 + }, + { + "epoch": 1.6024315347496396, + "grad_norm": 0.009003080427646637, + "learning_rate": 2.353866289502016e-05, + "loss": 0.0111, + "step": 25570 + }, + { + "epoch": 1.6030582189634643, + "grad_norm": 7.464236736297607, + "learning_rate": 2.3528107914124674e-05, + "loss": 0.0868, + "step": 25580 + }, + { + "epoch": 1.6036849031772888, + "grad_norm": 23.07075309753418, + "learning_rate": 2.3517552933229194e-05, + "loss": 0.0932, + "step": 25590 + }, + { + "epoch": 1.6043115873911136, + "grad_norm": 0.007414340041577816, + "learning_rate": 2.3506997952333707e-05, + "loss": 0.0806, + "step": 25600 + }, + { + "epoch": 1.6049382716049383, + "grad_norm": 0.017645269632339478, + "learning_rate": 2.3496442971438224e-05, + "loss": 0.0524, + "step": 25610 + }, + { + "epoch": 1.605564955818763, + "grad_norm": 0.018196620047092438, + "learning_rate": 2.3485887990542737e-05, + "loss": 0.0024, + "step": 25620 + }, + { + "epoch": 1.6061916400325877, + "grad_norm": 1.105287790298462, + "learning_rate": 2.3475333009647254e-05, + "loss": 0.0663, + "step": 25630 + }, + { + "epoch": 1.6068183242464122, + "grad_norm": 0.007714731153100729, + "learning_rate": 2.346477802875177e-05, + "loss": 0.0115, + "step": 25640 + }, + { + "epoch": 1.6074450084602367, + "grad_norm": 7.8072075843811035, + "learning_rate": 2.3454223047856287e-05, + "loss": 0.1591, + "step": 25650 + }, + { + "epoch": 1.6080716926740615, + "grad_norm": 10.7738676071167, + "learning_rate": 2.34436680669608e-05, + "loss": 0.0779, + "step": 25660 + }, + { + "epoch": 1.6086983768878862, + "grad_norm": 0.04062759503722191, + "learning_rate": 2.3433113086065313e-05, + "loss": 0.0041, + "step": 25670 + }, + { + "epoch": 1.609325061101711, + "grad_norm": 0.18196269869804382, + "learning_rate": 2.342255810516983e-05, + "loss": 0.112, + "step": 25680 + }, + { + "epoch": 1.6099517453155356, + "grad_norm": 0.016388777643442154, + "learning_rate": 2.3412003124274347e-05, + "loss": 0.0033, + "step": 25690 + }, + { + "epoch": 1.6105784295293601, + "grad_norm": 0.022291744127869606, + "learning_rate": 2.3401448143378863e-05, + "loss": 0.0592, + "step": 25700 + }, + { + "epoch": 1.6112051137431846, + "grad_norm": 0.016995355486869812, + "learning_rate": 2.3390893162483377e-05, + "loss": 0.1401, + "step": 25710 + }, + { + "epoch": 1.6118317979570094, + "grad_norm": 0.19162079691886902, + "learning_rate": 2.3380338181587893e-05, + "loss": 0.2771, + "step": 25720 + }, + { + "epoch": 1.612458482170834, + "grad_norm": 0.0328703373670578, + "learning_rate": 2.3369783200692407e-05, + "loss": 0.011, + "step": 25730 + }, + { + "epoch": 1.6130851663846588, + "grad_norm": 0.04259718582034111, + "learning_rate": 2.3359228219796923e-05, + "loss": 0.1075, + "step": 25740 + }, + { + "epoch": 1.6137118505984835, + "grad_norm": 7.747127532958984, + "learning_rate": 2.334867323890144e-05, + "loss": 0.1878, + "step": 25750 + }, + { + "epoch": 1.614338534812308, + "grad_norm": 5.813741683959961, + "learning_rate": 2.3338118258005953e-05, + "loss": 0.0926, + "step": 25760 + }, + { + "epoch": 1.6149652190261328, + "grad_norm": 0.09612107276916504, + "learning_rate": 2.332756327711047e-05, + "loss": 0.2835, + "step": 25770 + }, + { + "epoch": 1.6155919032399573, + "grad_norm": 0.4905875325202942, + "learning_rate": 2.3317008296214983e-05, + "loss": 0.2207, + "step": 25780 + }, + { + "epoch": 1.616218587453782, + "grad_norm": 0.11081667244434357, + "learning_rate": 2.3306453315319503e-05, + "loss": 0.1188, + "step": 25790 + }, + { + "epoch": 1.6168452716676067, + "grad_norm": 0.1327042430639267, + "learning_rate": 2.3295898334424016e-05, + "loss": 0.0168, + "step": 25800 + }, + { + "epoch": 1.6174719558814314, + "grad_norm": 0.2520522177219391, + "learning_rate": 2.3285343353528533e-05, + "loss": 0.1639, + "step": 25810 + }, + { + "epoch": 1.618098640095256, + "grad_norm": 0.11177418380975723, + "learning_rate": 2.3274788372633046e-05, + "loss": 0.0033, + "step": 25820 + }, + { + "epoch": 1.6187253243090807, + "grad_norm": 5.912029266357422, + "learning_rate": 2.326423339173756e-05, + "loss": 0.2516, + "step": 25830 + }, + { + "epoch": 1.6193520085229052, + "grad_norm": 0.013550223782658577, + "learning_rate": 2.325367841084208e-05, + "loss": 0.0348, + "step": 25840 + }, + { + "epoch": 1.61997869273673, + "grad_norm": 2.5849907398223877, + "learning_rate": 2.3243123429946593e-05, + "loss": 0.0329, + "step": 25850 + }, + { + "epoch": 1.6206053769505546, + "grad_norm": 0.012683117762207985, + "learning_rate": 2.323256844905111e-05, + "loss": 0.0381, + "step": 25860 + }, + { + "epoch": 1.6212320611643793, + "grad_norm": 0.009466142393648624, + "learning_rate": 2.3222013468155623e-05, + "loss": 0.08, + "step": 25870 + }, + { + "epoch": 1.621858745378204, + "grad_norm": 2.259847402572632, + "learning_rate": 2.321145848726014e-05, + "loss": 0.0716, + "step": 25880 + }, + { + "epoch": 1.6224854295920286, + "grad_norm": 0.009451135993003845, + "learning_rate": 2.3200903506364656e-05, + "loss": 0.0905, + "step": 25890 + }, + { + "epoch": 1.623112113805853, + "grad_norm": 0.06618939340114594, + "learning_rate": 2.319034852546917e-05, + "loss": 0.1083, + "step": 25900 + }, + { + "epoch": 1.6237387980196778, + "grad_norm": 0.7064309120178223, + "learning_rate": 2.3179793544573686e-05, + "loss": 0.0481, + "step": 25910 + }, + { + "epoch": 1.6243654822335025, + "grad_norm": 0.02758204936981201, + "learning_rate": 2.31692385636782e-05, + "loss": 0.0686, + "step": 25920 + }, + { + "epoch": 1.6249921664473272, + "grad_norm": 0.40926969051361084, + "learning_rate": 2.3158683582782716e-05, + "loss": 0.0348, + "step": 25930 + }, + { + "epoch": 1.625618850661152, + "grad_norm": 0.03991227596998215, + "learning_rate": 2.3148128601887233e-05, + "loss": 0.1037, + "step": 25940 + }, + { + "epoch": 1.6262455348749765, + "grad_norm": 0.029415365308523178, + "learning_rate": 2.313757362099175e-05, + "loss": 0.1124, + "step": 25950 + }, + { + "epoch": 1.6268722190888012, + "grad_norm": 0.28045380115509033, + "learning_rate": 2.3127018640096262e-05, + "loss": 0.1201, + "step": 25960 + }, + { + "epoch": 1.6274989033026257, + "grad_norm": 0.016988355666399002, + "learning_rate": 2.3116463659200776e-05, + "loss": 0.0075, + "step": 25970 + }, + { + "epoch": 1.6281255875164504, + "grad_norm": 0.010528198443353176, + "learning_rate": 2.3105908678305292e-05, + "loss": 0.0829, + "step": 25980 + }, + { + "epoch": 1.6287522717302751, + "grad_norm": 0.007346003782004118, + "learning_rate": 2.309535369740981e-05, + "loss": 0.0659, + "step": 25990 + }, + { + "epoch": 1.6293789559440999, + "grad_norm": 0.12764669954776764, + "learning_rate": 2.3084798716514326e-05, + "loss": 0.1241, + "step": 26000 + }, + { + "epoch": 1.6300056401579244, + "grad_norm": 0.8179148435592651, + "learning_rate": 2.307424373561884e-05, + "loss": 0.0656, + "step": 26010 + }, + { + "epoch": 1.630632324371749, + "grad_norm": 0.22236420214176178, + "learning_rate": 2.3063688754723356e-05, + "loss": 0.0303, + "step": 26020 + }, + { + "epoch": 1.6312590085855736, + "grad_norm": 0.006591562181711197, + "learning_rate": 2.305313377382787e-05, + "loss": 0.0267, + "step": 26030 + }, + { + "epoch": 1.6318856927993983, + "grad_norm": 0.7650337219238281, + "learning_rate": 2.304257879293239e-05, + "loss": 0.0687, + "step": 26040 + }, + { + "epoch": 1.632512377013223, + "grad_norm": 0.09074480086565018, + "learning_rate": 2.3032023812036902e-05, + "loss": 0.193, + "step": 26050 + }, + { + "epoch": 1.6331390612270478, + "grad_norm": 0.035380203276872635, + "learning_rate": 2.3021468831141415e-05, + "loss": 0.1951, + "step": 26060 + }, + { + "epoch": 1.6337657454408725, + "grad_norm": 19.835845947265625, + "learning_rate": 2.3010913850245932e-05, + "loss": 0.0323, + "step": 26070 + }, + { + "epoch": 1.634392429654697, + "grad_norm": 0.12629912793636322, + "learning_rate": 2.3000358869350445e-05, + "loss": 0.1472, + "step": 26080 + }, + { + "epoch": 1.6350191138685215, + "grad_norm": 0.1876828670501709, + "learning_rate": 2.2989803888454965e-05, + "loss": 0.0306, + "step": 26090 + }, + { + "epoch": 1.6356457980823462, + "grad_norm": 1.093254804611206, + "learning_rate": 2.297924890755948e-05, + "loss": 0.0094, + "step": 26100 + }, + { + "epoch": 1.636272482296171, + "grad_norm": 0.009309528395533562, + "learning_rate": 2.2968693926663995e-05, + "loss": 0.0102, + "step": 26110 + }, + { + "epoch": 1.6368991665099957, + "grad_norm": 0.012318827211856842, + "learning_rate": 2.295813894576851e-05, + "loss": 0.1555, + "step": 26120 + }, + { + "epoch": 1.6375258507238204, + "grad_norm": 0.028864048421382904, + "learning_rate": 2.2947583964873022e-05, + "loss": 0.0332, + "step": 26130 + }, + { + "epoch": 1.638152534937645, + "grad_norm": 0.008642313070595264, + "learning_rate": 2.2937028983977542e-05, + "loss": 0.0884, + "step": 26140 + }, + { + "epoch": 1.6387792191514696, + "grad_norm": 10.992156028747559, + "learning_rate": 2.2926474003082055e-05, + "loss": 0.3043, + "step": 26150 + }, + { + "epoch": 1.6394059033652941, + "grad_norm": 0.035156864672899246, + "learning_rate": 2.2915919022186572e-05, + "loss": 0.0967, + "step": 26160 + }, + { + "epoch": 1.6400325875791189, + "grad_norm": 0.07959800213575363, + "learning_rate": 2.2905364041291085e-05, + "loss": 0.1759, + "step": 26170 + }, + { + "epoch": 1.6406592717929436, + "grad_norm": 1.4073405265808105, + "learning_rate": 2.28948090603956e-05, + "loss": 0.1282, + "step": 26180 + }, + { + "epoch": 1.6412859560067683, + "grad_norm": 0.11637117713689804, + "learning_rate": 2.2884254079500118e-05, + "loss": 0.0229, + "step": 26190 + }, + { + "epoch": 1.6419126402205928, + "grad_norm": 0.058900900185108185, + "learning_rate": 2.2873699098604635e-05, + "loss": 0.269, + "step": 26200 + }, + { + "epoch": 1.6425393244344175, + "grad_norm": 0.06711501628160477, + "learning_rate": 2.2863144117709148e-05, + "loss": 0.0458, + "step": 26210 + }, + { + "epoch": 1.643166008648242, + "grad_norm": 0.12759794294834137, + "learning_rate": 2.285258913681366e-05, + "loss": 0.1195, + "step": 26220 + }, + { + "epoch": 1.6437926928620668, + "grad_norm": 3.203655481338501, + "learning_rate": 2.2842034155918178e-05, + "loss": 0.0766, + "step": 26230 + }, + { + "epoch": 1.6444193770758915, + "grad_norm": 0.0073426892049610615, + "learning_rate": 2.2831479175022695e-05, + "loss": 0.0895, + "step": 26240 + }, + { + "epoch": 1.6450460612897162, + "grad_norm": 0.2755219042301178, + "learning_rate": 2.282092419412721e-05, + "loss": 0.1019, + "step": 26250 + }, + { + "epoch": 1.645672745503541, + "grad_norm": 0.01606954075396061, + "learning_rate": 2.2810369213231725e-05, + "loss": 0.0815, + "step": 26260 + }, + { + "epoch": 1.6462994297173654, + "grad_norm": 0.13661789894104004, + "learning_rate": 2.279981423233624e-05, + "loss": 0.0622, + "step": 26270 + }, + { + "epoch": 1.64692611393119, + "grad_norm": 0.4233393371105194, + "learning_rate": 2.2789259251440755e-05, + "loss": 0.0526, + "step": 26280 + }, + { + "epoch": 1.6475527981450147, + "grad_norm": 0.007650166749954224, + "learning_rate": 2.277870427054527e-05, + "loss": 0.0034, + "step": 26290 + }, + { + "epoch": 1.6481794823588394, + "grad_norm": 0.06980214267969131, + "learning_rate": 2.2768149289649788e-05, + "loss": 0.0229, + "step": 26300 + }, + { + "epoch": 1.648806166572664, + "grad_norm": 0.05770709365606308, + "learning_rate": 2.27575943087543e-05, + "loss": 0.045, + "step": 26310 + }, + { + "epoch": 1.6494328507864888, + "grad_norm": 0.006944793742150068, + "learning_rate": 2.2747039327858818e-05, + "loss": 0.0595, + "step": 26320 + }, + { + "epoch": 1.6500595350003133, + "grad_norm": 0.010900170542299747, + "learning_rate": 2.273648434696333e-05, + "loss": 0.3436, + "step": 26330 + }, + { + "epoch": 1.650686219214138, + "grad_norm": 6.457736968994141, + "learning_rate": 2.272592936606785e-05, + "loss": 0.1194, + "step": 26340 + }, + { + "epoch": 1.6513129034279626, + "grad_norm": 0.021840889006853104, + "learning_rate": 2.2715374385172364e-05, + "loss": 0.1054, + "step": 26350 + }, + { + "epoch": 1.6519395876417873, + "grad_norm": 0.07167918235063553, + "learning_rate": 2.2704819404276878e-05, + "loss": 0.1127, + "step": 26360 + }, + { + "epoch": 1.652566271855612, + "grad_norm": 4.3837432861328125, + "learning_rate": 2.2694264423381394e-05, + "loss": 0.1142, + "step": 26370 + }, + { + "epoch": 1.6531929560694367, + "grad_norm": 2.342277765274048, + "learning_rate": 2.2683709442485908e-05, + "loss": 0.1701, + "step": 26380 + }, + { + "epoch": 1.6538196402832612, + "grad_norm": 3.2521579265594482, + "learning_rate": 2.2673154461590428e-05, + "loss": 0.1781, + "step": 26390 + }, + { + "epoch": 1.654446324497086, + "grad_norm": 0.09720861911773682, + "learning_rate": 2.266259948069494e-05, + "loss": 0.0441, + "step": 26400 + }, + { + "epoch": 1.6550730087109105, + "grad_norm": 1.4399570226669312, + "learning_rate": 2.2652044499799458e-05, + "loss": 0.2398, + "step": 26410 + }, + { + "epoch": 1.6556996929247352, + "grad_norm": 3.2033612728118896, + "learning_rate": 2.264148951890397e-05, + "loss": 0.127, + "step": 26420 + }, + { + "epoch": 1.65632637713856, + "grad_norm": 0.40961018204689026, + "learning_rate": 2.2630934538008487e-05, + "loss": 0.0384, + "step": 26430 + }, + { + "epoch": 1.6569530613523846, + "grad_norm": 0.07372618466615677, + "learning_rate": 2.2620379557113004e-05, + "loss": 0.0118, + "step": 26440 + }, + { + "epoch": 1.6575797455662094, + "grad_norm": 0.006588101852685213, + "learning_rate": 2.2609824576217517e-05, + "loss": 0.0775, + "step": 26450 + }, + { + "epoch": 1.6582064297800339, + "grad_norm": 0.11311575025320053, + "learning_rate": 2.2599269595322034e-05, + "loss": 0.2475, + "step": 26460 + }, + { + "epoch": 1.6588331139938584, + "grad_norm": 0.10983302444219589, + "learning_rate": 2.2588714614426547e-05, + "loss": 0.0448, + "step": 26470 + }, + { + "epoch": 1.659459798207683, + "grad_norm": 0.3269750773906708, + "learning_rate": 2.2578159633531064e-05, + "loss": 0.1573, + "step": 26480 + }, + { + "epoch": 1.6600864824215078, + "grad_norm": 9.705367088317871, + "learning_rate": 2.256760465263558e-05, + "loss": 0.0499, + "step": 26490 + }, + { + "epoch": 1.6607131666353325, + "grad_norm": 0.08202777802944183, + "learning_rate": 2.2557049671740097e-05, + "loss": 0.0541, + "step": 26500 + }, + { + "epoch": 1.6613398508491573, + "grad_norm": 24.44474220275879, + "learning_rate": 2.254649469084461e-05, + "loss": 0.1385, + "step": 26510 + }, + { + "epoch": 1.6619665350629818, + "grad_norm": 0.00329552311450243, + "learning_rate": 2.2535939709949124e-05, + "loss": 0.0697, + "step": 26520 + }, + { + "epoch": 1.6625932192768063, + "grad_norm": 0.04606331139802933, + "learning_rate": 2.252538472905364e-05, + "loss": 0.186, + "step": 26530 + }, + { + "epoch": 1.663219903490631, + "grad_norm": 0.09263589978218079, + "learning_rate": 2.2514829748158157e-05, + "loss": 0.006, + "step": 26540 + }, + { + "epoch": 1.6638465877044557, + "grad_norm": 0.07792260497808456, + "learning_rate": 2.2504274767262674e-05, + "loss": 0.0679, + "step": 26550 + }, + { + "epoch": 1.6644732719182804, + "grad_norm": 0.6075377464294434, + "learning_rate": 2.2493719786367187e-05, + "loss": 0.0973, + "step": 26560 + }, + { + "epoch": 1.6650999561321052, + "grad_norm": 0.011001525446772575, + "learning_rate": 2.2483164805471704e-05, + "loss": 0.1077, + "step": 26570 + }, + { + "epoch": 1.6657266403459297, + "grad_norm": 3.2274794578552246, + "learning_rate": 2.2472609824576217e-05, + "loss": 0.1959, + "step": 26580 + }, + { + "epoch": 1.6663533245597544, + "grad_norm": 0.2066841572523117, + "learning_rate": 2.2462054843680734e-05, + "loss": 0.0543, + "step": 26590 + }, + { + "epoch": 1.6669800087735789, + "grad_norm": 6.34409761428833, + "learning_rate": 2.245149986278525e-05, + "loss": 0.0598, + "step": 26600 + }, + { + "epoch": 1.6676066929874036, + "grad_norm": 6.800547122955322, + "learning_rate": 2.2440944881889763e-05, + "loss": 0.1238, + "step": 26610 + }, + { + "epoch": 1.6682333772012283, + "grad_norm": 3.621145725250244, + "learning_rate": 2.243038990099428e-05, + "loss": 0.2065, + "step": 26620 + }, + { + "epoch": 1.668860061415053, + "grad_norm": 0.10276742279529572, + "learning_rate": 2.2419834920098793e-05, + "loss": 0.0092, + "step": 26630 + }, + { + "epoch": 1.6694867456288776, + "grad_norm": 0.07901106029748917, + "learning_rate": 2.2409279939203313e-05, + "loss": 0.0909, + "step": 26640 + }, + { + "epoch": 1.6701134298427023, + "grad_norm": 0.20364725589752197, + "learning_rate": 2.2398724958307827e-05, + "loss": 0.0745, + "step": 26650 + }, + { + "epoch": 1.6707401140565268, + "grad_norm": 0.005757195875048637, + "learning_rate": 2.2388169977412343e-05, + "loss": 0.0874, + "step": 26660 + }, + { + "epoch": 1.6713667982703515, + "grad_norm": 5.623529434204102, + "learning_rate": 2.2377614996516857e-05, + "loss": 0.0379, + "step": 26670 + }, + { + "epoch": 1.6719934824841762, + "grad_norm": 3.7896459102630615, + "learning_rate": 2.2367060015621373e-05, + "loss": 0.166, + "step": 26680 + }, + { + "epoch": 1.672620166698001, + "grad_norm": 0.09007629007101059, + "learning_rate": 2.235650503472589e-05, + "loss": 0.0028, + "step": 26690 + }, + { + "epoch": 1.6732468509118257, + "grad_norm": 0.03563718497753143, + "learning_rate": 2.2345950053830403e-05, + "loss": 0.0511, + "step": 26700 + }, + { + "epoch": 1.6738735351256502, + "grad_norm": 5.399358749389648, + "learning_rate": 2.233539507293492e-05, + "loss": 0.2271, + "step": 26710 + }, + { + "epoch": 1.6745002193394747, + "grad_norm": 8.996834754943848, + "learning_rate": 2.2324840092039433e-05, + "loss": 0.1034, + "step": 26720 + }, + { + "epoch": 1.6751269035532994, + "grad_norm": 0.01371771190315485, + "learning_rate": 2.231428511114395e-05, + "loss": 0.1111, + "step": 26730 + }, + { + "epoch": 1.6757535877671241, + "grad_norm": 0.4695885479450226, + "learning_rate": 2.2303730130248466e-05, + "loss": 0.0503, + "step": 26740 + }, + { + "epoch": 1.6763802719809489, + "grad_norm": 7.742033004760742, + "learning_rate": 2.229317514935298e-05, + "loss": 0.0578, + "step": 26750 + }, + { + "epoch": 1.6770069561947736, + "grad_norm": 0.6429186463356018, + "learning_rate": 2.2282620168457496e-05, + "loss": 0.1231, + "step": 26760 + }, + { + "epoch": 1.677633640408598, + "grad_norm": 0.336631178855896, + "learning_rate": 2.227206518756201e-05, + "loss": 0.0489, + "step": 26770 + }, + { + "epoch": 1.6782603246224228, + "grad_norm": 4.629275798797607, + "learning_rate": 2.2261510206666526e-05, + "loss": 0.2321, + "step": 26780 + }, + { + "epoch": 1.6788870088362473, + "grad_norm": 0.511132001876831, + "learning_rate": 2.2250955225771043e-05, + "loss": 0.0051, + "step": 26790 + }, + { + "epoch": 1.679513693050072, + "grad_norm": 0.4719582796096802, + "learning_rate": 2.224040024487556e-05, + "loss": 0.0036, + "step": 26800 + }, + { + "epoch": 1.6801403772638968, + "grad_norm": 3.879145622253418, + "learning_rate": 2.2229845263980073e-05, + "loss": 0.1711, + "step": 26810 + }, + { + "epoch": 1.6807670614777215, + "grad_norm": 0.018831124529242516, + "learning_rate": 2.221929028308459e-05, + "loss": 0.151, + "step": 26820 + }, + { + "epoch": 1.681393745691546, + "grad_norm": 0.2846979796886444, + "learning_rate": 2.2208735302189103e-05, + "loss": 0.0052, + "step": 26830 + }, + { + "epoch": 1.6820204299053707, + "grad_norm": 0.11228816211223602, + "learning_rate": 2.219818032129362e-05, + "loss": 0.1743, + "step": 26840 + }, + { + "epoch": 1.6826471141191952, + "grad_norm": 0.2110615074634552, + "learning_rate": 2.2187625340398136e-05, + "loss": 0.1674, + "step": 26850 + }, + { + "epoch": 1.68327379833302, + "grad_norm": 1.6484986543655396, + "learning_rate": 2.217707035950265e-05, + "loss": 0.1076, + "step": 26860 + }, + { + "epoch": 1.6839004825468447, + "grad_norm": 0.018023191019892693, + "learning_rate": 2.2166515378607166e-05, + "loss": 0.0119, + "step": 26870 + }, + { + "epoch": 1.6845271667606694, + "grad_norm": 3.573692560195923, + "learning_rate": 2.2155960397711683e-05, + "loss": 0.0043, + "step": 26880 + }, + { + "epoch": 1.685153850974494, + "grad_norm": 0.022061511874198914, + "learning_rate": 2.21454054168162e-05, + "loss": 0.149, + "step": 26890 + }, + { + "epoch": 1.6857805351883186, + "grad_norm": 0.05415266752243042, + "learning_rate": 2.2134850435920712e-05, + "loss": 0.0696, + "step": 26900 + }, + { + "epoch": 1.686407219402143, + "grad_norm": 0.31539714336395264, + "learning_rate": 2.2124295455025226e-05, + "loss": 0.11, + "step": 26910 + }, + { + "epoch": 1.6870339036159678, + "grad_norm": 0.010716657154262066, + "learning_rate": 2.2113740474129742e-05, + "loss": 0.1094, + "step": 26920 + }, + { + "epoch": 1.6876605878297926, + "grad_norm": 0.013285728171467781, + "learning_rate": 2.210318549323426e-05, + "loss": 0.1406, + "step": 26930 + }, + { + "epoch": 1.6882872720436173, + "grad_norm": 10.243504524230957, + "learning_rate": 2.2092630512338776e-05, + "loss": 0.2042, + "step": 26940 + }, + { + "epoch": 1.688913956257442, + "grad_norm": 0.20027025043964386, + "learning_rate": 2.208207553144329e-05, + "loss": 0.0606, + "step": 26950 + }, + { + "epoch": 1.6895406404712665, + "grad_norm": 0.3113136291503906, + "learning_rate": 2.2071520550547806e-05, + "loss": 0.0528, + "step": 26960 + }, + { + "epoch": 1.6901673246850912, + "grad_norm": 0.06819582730531693, + "learning_rate": 2.206096556965232e-05, + "loss": 0.0528, + "step": 26970 + }, + { + "epoch": 1.6907940088989157, + "grad_norm": 0.016161400824785233, + "learning_rate": 2.2050410588756835e-05, + "loss": 0.1354, + "step": 26980 + }, + { + "epoch": 1.6914206931127405, + "grad_norm": 0.014996406622231007, + "learning_rate": 2.2039855607861352e-05, + "loss": 0.054, + "step": 26990 + }, + { + "epoch": 1.6920473773265652, + "grad_norm": 1.241360068321228, + "learning_rate": 2.2029300626965865e-05, + "loss": 0.1431, + "step": 27000 + }, + { + "epoch": 1.69267406154039, + "grad_norm": 0.02852221392095089, + "learning_rate": 2.2018745646070382e-05, + "loss": 0.1485, + "step": 27010 + }, + { + "epoch": 1.6933007457542144, + "grad_norm": 2.456637382507324, + "learning_rate": 2.2008190665174895e-05, + "loss": 0.1193, + "step": 27020 + }, + { + "epoch": 1.6939274299680391, + "grad_norm": 10.817239761352539, + "learning_rate": 2.1997635684279412e-05, + "loss": 0.1513, + "step": 27030 + }, + { + "epoch": 1.6945541141818636, + "grad_norm": 0.029310384765267372, + "learning_rate": 2.198708070338393e-05, + "loss": 0.0419, + "step": 27040 + }, + { + "epoch": 1.6951807983956884, + "grad_norm": 0.3586648404598236, + "learning_rate": 2.1976525722488445e-05, + "loss": 0.0393, + "step": 27050 + }, + { + "epoch": 1.695807482609513, + "grad_norm": 0.38860127329826355, + "learning_rate": 2.196597074159296e-05, + "loss": 0.0953, + "step": 27060 + }, + { + "epoch": 1.6964341668233378, + "grad_norm": 0.016696346923708916, + "learning_rate": 2.1955415760697472e-05, + "loss": 0.0227, + "step": 27070 + }, + { + "epoch": 1.6970608510371625, + "grad_norm": 0.018799861893057823, + "learning_rate": 2.194486077980199e-05, + "loss": 0.1207, + "step": 27080 + }, + { + "epoch": 1.697687535250987, + "grad_norm": 0.029151970520615578, + "learning_rate": 2.1934305798906505e-05, + "loss": 0.2022, + "step": 27090 + }, + { + "epoch": 1.6983142194648115, + "grad_norm": 0.7271300554275513, + "learning_rate": 2.1923750818011022e-05, + "loss": 0.2016, + "step": 27100 + }, + { + "epoch": 1.6989409036786363, + "grad_norm": 0.06217074394226074, + "learning_rate": 2.1913195837115535e-05, + "loss": 0.0028, + "step": 27110 + }, + { + "epoch": 1.699567587892461, + "grad_norm": 0.047516193240880966, + "learning_rate": 2.190264085622005e-05, + "loss": 0.1477, + "step": 27120 + }, + { + "epoch": 1.7001942721062857, + "grad_norm": 0.05786709114909172, + "learning_rate": 2.189208587532457e-05, + "loss": 0.2008, + "step": 27130 + }, + { + "epoch": 1.7008209563201104, + "grad_norm": 1.7550990581512451, + "learning_rate": 2.188153089442908e-05, + "loss": 0.0092, + "step": 27140 + }, + { + "epoch": 1.701447640533935, + "grad_norm": 0.052369292825460434, + "learning_rate": 2.1870975913533598e-05, + "loss": 0.0517, + "step": 27150 + }, + { + "epoch": 1.7020743247477597, + "grad_norm": 3.845099925994873, + "learning_rate": 2.186042093263811e-05, + "loss": 0.1853, + "step": 27160 + }, + { + "epoch": 1.7027010089615842, + "grad_norm": 1.763410210609436, + "learning_rate": 2.1849865951742628e-05, + "loss": 0.1128, + "step": 27170 + }, + { + "epoch": 1.7033276931754089, + "grad_norm": 7.384634017944336, + "learning_rate": 2.1839310970847145e-05, + "loss": 0.1175, + "step": 27180 + }, + { + "epoch": 1.7039543773892336, + "grad_norm": 0.035043444484472275, + "learning_rate": 2.182875598995166e-05, + "loss": 0.1171, + "step": 27190 + }, + { + "epoch": 1.7045810616030583, + "grad_norm": 0.16392628848552704, + "learning_rate": 2.1818201009056175e-05, + "loss": 0.1787, + "step": 27200 + }, + { + "epoch": 1.7052077458168828, + "grad_norm": 4.580605506896973, + "learning_rate": 2.1807646028160688e-05, + "loss": 0.0822, + "step": 27210 + }, + { + "epoch": 1.7058344300307076, + "grad_norm": 0.17701353132724762, + "learning_rate": 2.1797091047265205e-05, + "loss": 0.0689, + "step": 27220 + }, + { + "epoch": 1.706461114244532, + "grad_norm": 0.10425002127885818, + "learning_rate": 2.178653606636972e-05, + "loss": 0.0782, + "step": 27230 + }, + { + "epoch": 1.7070877984583568, + "grad_norm": 0.0539705716073513, + "learning_rate": 2.1775981085474238e-05, + "loss": 0.0702, + "step": 27240 + }, + { + "epoch": 1.7077144826721815, + "grad_norm": 0.09829293191432953, + "learning_rate": 2.176542610457875e-05, + "loss": 0.0966, + "step": 27250 + }, + { + "epoch": 1.7083411668860062, + "grad_norm": 0.05815896391868591, + "learning_rate": 2.1754871123683268e-05, + "loss": 0.1604, + "step": 27260 + }, + { + "epoch": 1.708967851099831, + "grad_norm": 0.022052781656384468, + "learning_rate": 2.174431614278778e-05, + "loss": 0.0384, + "step": 27270 + }, + { + "epoch": 1.7095945353136555, + "grad_norm": 16.015851974487305, + "learning_rate": 2.1733761161892298e-05, + "loss": 0.0064, + "step": 27280 + }, + { + "epoch": 1.71022121952748, + "grad_norm": 0.05524298548698425, + "learning_rate": 2.1723206180996814e-05, + "loss": 0.0611, + "step": 27290 + }, + { + "epoch": 1.7108479037413047, + "grad_norm": 0.027206147089600563, + "learning_rate": 2.1712651200101328e-05, + "loss": 0.0416, + "step": 27300 + }, + { + "epoch": 1.7114745879551294, + "grad_norm": 0.024542508646845818, + "learning_rate": 2.1702096219205844e-05, + "loss": 0.1534, + "step": 27310 + }, + { + "epoch": 1.7121012721689541, + "grad_norm": 501.8453674316406, + "learning_rate": 2.1691541238310358e-05, + "loss": 0.1655, + "step": 27320 + }, + { + "epoch": 1.7127279563827789, + "grad_norm": 0.027051694691181183, + "learning_rate": 2.1680986257414874e-05, + "loss": 0.2394, + "step": 27330 + }, + { + "epoch": 1.7133546405966034, + "grad_norm": 0.17021749913692474, + "learning_rate": 2.167043127651939e-05, + "loss": 0.1643, + "step": 27340 + }, + { + "epoch": 1.7139813248104279, + "grad_norm": 0.1182183101773262, + "learning_rate": 2.1659876295623908e-05, + "loss": 0.1188, + "step": 27350 + }, + { + "epoch": 1.7146080090242526, + "grad_norm": 3.9816792011260986, + "learning_rate": 2.164932131472842e-05, + "loss": 0.2683, + "step": 27360 + }, + { + "epoch": 1.7152346932380773, + "grad_norm": 0.11562171578407288, + "learning_rate": 2.1638766333832934e-05, + "loss": 0.0492, + "step": 27370 + }, + { + "epoch": 1.715861377451902, + "grad_norm": 0.18632589280605316, + "learning_rate": 2.1628211352937454e-05, + "loss": 0.1284, + "step": 27380 + }, + { + "epoch": 1.7164880616657268, + "grad_norm": 0.08556509763002396, + "learning_rate": 2.1617656372041967e-05, + "loss": 0.02, + "step": 27390 + }, + { + "epoch": 1.7171147458795513, + "grad_norm": 12.92889404296875, + "learning_rate": 2.1607101391146484e-05, + "loss": 0.1641, + "step": 27400 + }, + { + "epoch": 1.717741430093376, + "grad_norm": 6.711617469787598, + "learning_rate": 2.1596546410250997e-05, + "loss": 0.086, + "step": 27410 + }, + { + "epoch": 1.7183681143072005, + "grad_norm": 0.4091886579990387, + "learning_rate": 2.1585991429355514e-05, + "loss": 0.3211, + "step": 27420 + }, + { + "epoch": 1.7189947985210252, + "grad_norm": 0.7776539921760559, + "learning_rate": 2.157543644846003e-05, + "loss": 0.1452, + "step": 27430 + }, + { + "epoch": 1.71962148273485, + "grad_norm": 0.05936276540160179, + "learning_rate": 2.1564881467564547e-05, + "loss": 0.0849, + "step": 27440 + }, + { + "epoch": 1.7202481669486747, + "grad_norm": 13.701384544372559, + "learning_rate": 2.155432648666906e-05, + "loss": 0.0801, + "step": 27450 + }, + { + "epoch": 1.7208748511624992, + "grad_norm": 5.974350929260254, + "learning_rate": 2.1543771505773574e-05, + "loss": 0.0845, + "step": 27460 + }, + { + "epoch": 1.7215015353763239, + "grad_norm": 0.09645674377679825, + "learning_rate": 2.153321652487809e-05, + "loss": 0.0393, + "step": 27470 + }, + { + "epoch": 1.7221282195901484, + "grad_norm": 0.05956216901540756, + "learning_rate": 2.1522661543982607e-05, + "loss": 0.1193, + "step": 27480 + }, + { + "epoch": 1.7227549038039731, + "grad_norm": 19.65474510192871, + "learning_rate": 2.1512106563087124e-05, + "loss": 0.0617, + "step": 27490 + }, + { + "epoch": 1.7233815880177978, + "grad_norm": 0.23389559984207153, + "learning_rate": 2.1501551582191637e-05, + "loss": 0.0382, + "step": 27500 + }, + { + "epoch": 1.7240082722316226, + "grad_norm": 0.03733931854367256, + "learning_rate": 2.1490996601296154e-05, + "loss": 0.0807, + "step": 27510 + }, + { + "epoch": 1.7246349564454473, + "grad_norm": 0.14345206320285797, + "learning_rate": 2.1480441620400667e-05, + "loss": 0.1723, + "step": 27520 + }, + { + "epoch": 1.7252616406592718, + "grad_norm": 0.28000232577323914, + "learning_rate": 2.1469886639505184e-05, + "loss": 0.014, + "step": 27530 + }, + { + "epoch": 1.7258883248730963, + "grad_norm": 0.3158402740955353, + "learning_rate": 2.14593316586097e-05, + "loss": 0.0806, + "step": 27540 + }, + { + "epoch": 1.726515009086921, + "grad_norm": 0.1986808329820633, + "learning_rate": 2.1448776677714213e-05, + "loss": 0.169, + "step": 27550 + }, + { + "epoch": 1.7271416933007457, + "grad_norm": 0.2472744733095169, + "learning_rate": 2.143822169681873e-05, + "loss": 0.1522, + "step": 27560 + }, + { + "epoch": 1.7277683775145705, + "grad_norm": 2.0546441078186035, + "learning_rate": 2.1427666715923243e-05, + "loss": 0.036, + "step": 27570 + }, + { + "epoch": 1.7283950617283952, + "grad_norm": 0.36932218074798584, + "learning_rate": 2.141711173502776e-05, + "loss": 0.056, + "step": 27580 + }, + { + "epoch": 1.7290217459422197, + "grad_norm": 0.0200295839458704, + "learning_rate": 2.1406556754132277e-05, + "loss": 0.0711, + "step": 27590 + }, + { + "epoch": 1.7296484301560444, + "grad_norm": 0.016135990619659424, + "learning_rate": 2.139600177323679e-05, + "loss": 0.0932, + "step": 27600 + }, + { + "epoch": 1.730275114369869, + "grad_norm": 0.15928195416927338, + "learning_rate": 2.1385446792341307e-05, + "loss": 0.1799, + "step": 27610 + }, + { + "epoch": 1.7309017985836936, + "grad_norm": 0.020300181582570076, + "learning_rate": 2.137489181144582e-05, + "loss": 0.2728, + "step": 27620 + }, + { + "epoch": 1.7315284827975184, + "grad_norm": 7.829452991485596, + "learning_rate": 2.136433683055034e-05, + "loss": 0.1462, + "step": 27630 + }, + { + "epoch": 1.732155167011343, + "grad_norm": 0.7304818034172058, + "learning_rate": 2.1353781849654853e-05, + "loss": 0.2374, + "step": 27640 + }, + { + "epoch": 1.7327818512251676, + "grad_norm": 0.22246751189231873, + "learning_rate": 2.134322686875937e-05, + "loss": 0.007, + "step": 27650 + }, + { + "epoch": 1.7334085354389923, + "grad_norm": 0.46738430857658386, + "learning_rate": 2.1332671887863883e-05, + "loss": 0.0589, + "step": 27660 + }, + { + "epoch": 1.7340352196528168, + "grad_norm": 2.720977783203125, + "learning_rate": 2.13221169069684e-05, + "loss": 0.1276, + "step": 27670 + }, + { + "epoch": 1.7346619038666415, + "grad_norm": 0.04329752177000046, + "learning_rate": 2.1311561926072916e-05, + "loss": 0.1775, + "step": 27680 + }, + { + "epoch": 1.7352885880804663, + "grad_norm": 0.5023499131202698, + "learning_rate": 2.130100694517743e-05, + "loss": 0.0093, + "step": 27690 + }, + { + "epoch": 1.735915272294291, + "grad_norm": 0.09492196887731552, + "learning_rate": 2.1290451964281946e-05, + "loss": 0.0215, + "step": 27700 + }, + { + "epoch": 1.7365419565081157, + "grad_norm": 0.034697309136390686, + "learning_rate": 2.127989698338646e-05, + "loss": 0.0845, + "step": 27710 + }, + { + "epoch": 1.7371686407219402, + "grad_norm": 0.04510561749339104, + "learning_rate": 2.1269342002490976e-05, + "loss": 0.1532, + "step": 27720 + }, + { + "epoch": 1.7377953249357647, + "grad_norm": 0.8914774656295776, + "learning_rate": 2.1258787021595493e-05, + "loss": 0.0917, + "step": 27730 + }, + { + "epoch": 1.7384220091495894, + "grad_norm": 0.5372912287712097, + "learning_rate": 2.124823204070001e-05, + "loss": 0.1138, + "step": 27740 + }, + { + "epoch": 1.7390486933634142, + "grad_norm": 1.636864185333252, + "learning_rate": 2.1237677059804523e-05, + "loss": 0.1581, + "step": 27750 + }, + { + "epoch": 1.739675377577239, + "grad_norm": 0.4151596128940582, + "learning_rate": 2.1227122078909036e-05, + "loss": 0.0094, + "step": 27760 + }, + { + "epoch": 1.7403020617910636, + "grad_norm": 0.06015501543879509, + "learning_rate": 2.1216567098013553e-05, + "loss": 0.2099, + "step": 27770 + }, + { + "epoch": 1.7409287460048881, + "grad_norm": 0.28171807527542114, + "learning_rate": 2.120601211711807e-05, + "loss": 0.0634, + "step": 27780 + }, + { + "epoch": 1.7415554302187128, + "grad_norm": 0.23028428852558136, + "learning_rate": 2.1195457136222586e-05, + "loss": 0.1892, + "step": 27790 + }, + { + "epoch": 1.7421821144325373, + "grad_norm": 0.060233842581510544, + "learning_rate": 2.11849021553271e-05, + "loss": 0.0961, + "step": 27800 + }, + { + "epoch": 1.742808798646362, + "grad_norm": 0.3065778315067291, + "learning_rate": 2.1174347174431616e-05, + "loss": 0.032, + "step": 27810 + }, + { + "epoch": 1.7434354828601868, + "grad_norm": 20.94129753112793, + "learning_rate": 2.116379219353613e-05, + "loss": 0.0605, + "step": 27820 + }, + { + "epoch": 1.7440621670740115, + "grad_norm": 6.300323009490967, + "learning_rate": 2.1153237212640646e-05, + "loss": 0.2072, + "step": 27830 + }, + { + "epoch": 1.744688851287836, + "grad_norm": 2.5606863498687744, + "learning_rate": 2.1142682231745162e-05, + "loss": 0.1082, + "step": 27840 + }, + { + "epoch": 1.7453155355016607, + "grad_norm": 0.05400048941373825, + "learning_rate": 2.1132127250849676e-05, + "loss": 0.1643, + "step": 27850 + }, + { + "epoch": 1.7459422197154852, + "grad_norm": 6.569550037384033, + "learning_rate": 2.1121572269954192e-05, + "loss": 0.0357, + "step": 27860 + }, + { + "epoch": 1.74656890392931, + "grad_norm": 10.0947265625, + "learning_rate": 2.1111017289058706e-05, + "loss": 0.0658, + "step": 27870 + }, + { + "epoch": 1.7471955881431347, + "grad_norm": 0.07155396044254303, + "learning_rate": 2.1100462308163226e-05, + "loss": 0.1062, + "step": 27880 + }, + { + "epoch": 1.7478222723569594, + "grad_norm": 0.055783960968256, + "learning_rate": 2.108990732726774e-05, + "loss": 0.0037, + "step": 27890 + }, + { + "epoch": 1.7484489565707841, + "grad_norm": 20.741167068481445, + "learning_rate": 2.1079352346372256e-05, + "loss": 0.0626, + "step": 27900 + }, + { + "epoch": 1.7490756407846086, + "grad_norm": 0.012712010182440281, + "learning_rate": 2.106879736547677e-05, + "loss": 0.1363, + "step": 27910 + }, + { + "epoch": 1.7497023249984331, + "grad_norm": 13.166600227355957, + "learning_rate": 2.1058242384581282e-05, + "loss": 0.2561, + "step": 27920 + }, + { + "epoch": 1.7503290092122579, + "grad_norm": 0.2196955382823944, + "learning_rate": 2.1047687403685802e-05, + "loss": 0.4069, + "step": 27930 + }, + { + "epoch": 1.7509556934260826, + "grad_norm": 1.294158935546875, + "learning_rate": 2.1037132422790315e-05, + "loss": 0.0529, + "step": 27940 + }, + { + "epoch": 1.7515823776399073, + "grad_norm": 0.22262226045131683, + "learning_rate": 2.1026577441894832e-05, + "loss": 0.0859, + "step": 27950 + }, + { + "epoch": 1.752209061853732, + "grad_norm": 0.1377834677696228, + "learning_rate": 2.1016022460999345e-05, + "loss": 0.2275, + "step": 27960 + }, + { + "epoch": 1.7528357460675565, + "grad_norm": 5.08927583694458, + "learning_rate": 2.1005467480103862e-05, + "loss": 0.1528, + "step": 27970 + }, + { + "epoch": 1.753462430281381, + "grad_norm": 4.58087158203125, + "learning_rate": 2.099491249920838e-05, + "loss": 0.101, + "step": 27980 + }, + { + "epoch": 1.7540891144952058, + "grad_norm": 0.10001590102910995, + "learning_rate": 2.0984357518312892e-05, + "loss": 0.0529, + "step": 27990 + }, + { + "epoch": 1.7547157987090305, + "grad_norm": 1.8981465101242065, + "learning_rate": 2.097380253741741e-05, + "loss": 0.08, + "step": 28000 + }, + { + "epoch": 1.7553424829228552, + "grad_norm": 0.06657546013593674, + "learning_rate": 2.0963247556521922e-05, + "loss": 0.0844, + "step": 28010 + }, + { + "epoch": 1.75596916713668, + "grad_norm": 0.08251860737800598, + "learning_rate": 2.095269257562644e-05, + "loss": 0.0366, + "step": 28020 + }, + { + "epoch": 1.7565958513505044, + "grad_norm": 0.30270349979400635, + "learning_rate": 2.0942137594730955e-05, + "loss": 0.1185, + "step": 28030 + }, + { + "epoch": 1.7572225355643292, + "grad_norm": 0.3146784007549286, + "learning_rate": 2.0931582613835472e-05, + "loss": 0.1438, + "step": 28040 + }, + { + "epoch": 1.7578492197781537, + "grad_norm": 0.2562068998813629, + "learning_rate": 2.0921027632939985e-05, + "loss": 0.0087, + "step": 28050 + }, + { + "epoch": 1.7584759039919784, + "grad_norm": 0.011984322220087051, + "learning_rate": 2.09104726520445e-05, + "loss": 0.0678, + "step": 28060 + }, + { + "epoch": 1.7591025882058031, + "grad_norm": 0.019979266449809074, + "learning_rate": 2.0899917671149015e-05, + "loss": 0.0312, + "step": 28070 + }, + { + "epoch": 1.7597292724196278, + "grad_norm": 0.5983748435974121, + "learning_rate": 2.088936269025353e-05, + "loss": 0.0317, + "step": 28080 + }, + { + "epoch": 1.7603559566334526, + "grad_norm": 0.33651646971702576, + "learning_rate": 2.0878807709358048e-05, + "loss": 0.088, + "step": 28090 + }, + { + "epoch": 1.760982640847277, + "grad_norm": 4.7786455154418945, + "learning_rate": 2.086825272846256e-05, + "loss": 0.1714, + "step": 28100 + }, + { + "epoch": 1.7616093250611016, + "grad_norm": 0.6487839221954346, + "learning_rate": 2.0857697747567078e-05, + "loss": 0.1049, + "step": 28110 + }, + { + "epoch": 1.7622360092749263, + "grad_norm": 11.593941688537598, + "learning_rate": 2.084714276667159e-05, + "loss": 0.1995, + "step": 28120 + }, + { + "epoch": 1.762862693488751, + "grad_norm": 0.08931588381528854, + "learning_rate": 2.083658778577611e-05, + "loss": 0.046, + "step": 28130 + }, + { + "epoch": 1.7634893777025757, + "grad_norm": 0.11976440995931625, + "learning_rate": 2.0826032804880625e-05, + "loss": 0.1334, + "step": 28140 + }, + { + "epoch": 1.7641160619164005, + "grad_norm": 0.256796658039093, + "learning_rate": 2.0815477823985138e-05, + "loss": 0.0965, + "step": 28150 + }, + { + "epoch": 1.764742746130225, + "grad_norm": 0.02123934030532837, + "learning_rate": 2.0804922843089655e-05, + "loss": 0.0506, + "step": 28160 + }, + { + "epoch": 1.7653694303440495, + "grad_norm": 0.1435829997062683, + "learning_rate": 2.0794367862194168e-05, + "loss": 0.0735, + "step": 28170 + }, + { + "epoch": 1.7659961145578742, + "grad_norm": 0.043175362050533295, + "learning_rate": 2.0783812881298688e-05, + "loss": 0.05, + "step": 28180 + }, + { + "epoch": 1.766622798771699, + "grad_norm": 0.02113891765475273, + "learning_rate": 2.07732579004032e-05, + "loss": 0.0836, + "step": 28190 + }, + { + "epoch": 1.7672494829855236, + "grad_norm": 0.014169618487358093, + "learning_rate": 2.0762702919507718e-05, + "loss": 0.284, + "step": 28200 + }, + { + "epoch": 1.7678761671993484, + "grad_norm": 0.4836975336074829, + "learning_rate": 2.075214793861223e-05, + "loss": 0.0589, + "step": 28210 + }, + { + "epoch": 1.7685028514131729, + "grad_norm": 0.10142585635185242, + "learning_rate": 2.0741592957716748e-05, + "loss": 0.0098, + "step": 28220 + }, + { + "epoch": 1.7691295356269976, + "grad_norm": 0.17484275996685028, + "learning_rate": 2.0731037976821264e-05, + "loss": 0.2536, + "step": 28230 + }, + { + "epoch": 1.769756219840822, + "grad_norm": 9.866804122924805, + "learning_rate": 2.0720482995925778e-05, + "loss": 0.1497, + "step": 28240 + }, + { + "epoch": 1.7703829040546468, + "grad_norm": 0.033019669353961945, + "learning_rate": 2.0709928015030294e-05, + "loss": 0.0064, + "step": 28250 + }, + { + "epoch": 1.7710095882684715, + "grad_norm": 0.03401433676481247, + "learning_rate": 2.0699373034134808e-05, + "loss": 0.045, + "step": 28260 + }, + { + "epoch": 1.7716362724822963, + "grad_norm": 0.028515907004475594, + "learning_rate": 2.0688818053239324e-05, + "loss": 0.0711, + "step": 28270 + }, + { + "epoch": 1.7722629566961208, + "grad_norm": 0.21594108641147614, + "learning_rate": 2.067826307234384e-05, + "loss": 0.0076, + "step": 28280 + }, + { + "epoch": 1.7728896409099455, + "grad_norm": 0.03385074809193611, + "learning_rate": 2.0667708091448358e-05, + "loss": 0.1374, + "step": 28290 + }, + { + "epoch": 1.77351632512377, + "grad_norm": 0.03485338017344475, + "learning_rate": 2.065715311055287e-05, + "loss": 0.1325, + "step": 28300 + }, + { + "epoch": 1.7741430093375947, + "grad_norm": 0.013920055702328682, + "learning_rate": 2.0646598129657384e-05, + "loss": 0.0993, + "step": 28310 + }, + { + "epoch": 1.7747696935514194, + "grad_norm": 3.2841951847076416, + "learning_rate": 2.06360431487619e-05, + "loss": 0.0898, + "step": 28320 + }, + { + "epoch": 1.7753963777652442, + "grad_norm": 0.015438011847436428, + "learning_rate": 2.0625488167866417e-05, + "loss": 0.003, + "step": 28330 + }, + { + "epoch": 1.776023061979069, + "grad_norm": 0.01668211817741394, + "learning_rate": 2.0614933186970934e-05, + "loss": 0.0454, + "step": 28340 + }, + { + "epoch": 1.7766497461928934, + "grad_norm": 0.31149980425834656, + "learning_rate": 2.0604378206075447e-05, + "loss": 0.247, + "step": 28350 + }, + { + "epoch": 1.777276430406718, + "grad_norm": 0.03061065450310707, + "learning_rate": 2.0593823225179964e-05, + "loss": 0.0469, + "step": 28360 + }, + { + "epoch": 1.7779031146205426, + "grad_norm": 0.28417980670928955, + "learning_rate": 2.0583268244284477e-05, + "loss": 0.0652, + "step": 28370 + }, + { + "epoch": 1.7785297988343673, + "grad_norm": 0.0512668751180172, + "learning_rate": 2.0572713263388994e-05, + "loss": 0.0497, + "step": 28380 + }, + { + "epoch": 1.779156483048192, + "grad_norm": 0.41348955035209656, + "learning_rate": 2.056215828249351e-05, + "loss": 0.0958, + "step": 28390 + }, + { + "epoch": 1.7797831672620168, + "grad_norm": 0.01674189791083336, + "learning_rate": 2.0551603301598024e-05, + "loss": 0.0602, + "step": 28400 + }, + { + "epoch": 1.7804098514758413, + "grad_norm": 0.3735155165195465, + "learning_rate": 2.054104832070254e-05, + "loss": 0.0747, + "step": 28410 + }, + { + "epoch": 1.781036535689666, + "grad_norm": 2.6698782444000244, + "learning_rate": 2.0530493339807054e-05, + "loss": 0.0906, + "step": 28420 + }, + { + "epoch": 1.7816632199034905, + "grad_norm": 0.012423294596374035, + "learning_rate": 2.0519938358911574e-05, + "loss": 0.1468, + "step": 28430 + }, + { + "epoch": 1.7822899041173152, + "grad_norm": 9.054516792297363, + "learning_rate": 2.0509383378016087e-05, + "loss": 0.1158, + "step": 28440 + }, + { + "epoch": 1.78291658833114, + "grad_norm": 0.5451701879501343, + "learning_rate": 2.0498828397120604e-05, + "loss": 0.0784, + "step": 28450 + }, + { + "epoch": 1.7835432725449647, + "grad_norm": 0.028169279918074608, + "learning_rate": 2.0488273416225117e-05, + "loss": 0.1311, + "step": 28460 + }, + { + "epoch": 1.7841699567587892, + "grad_norm": 0.05051959306001663, + "learning_rate": 2.0477718435329634e-05, + "loss": 0.1032, + "step": 28470 + }, + { + "epoch": 1.784796640972614, + "grad_norm": 4.341728210449219, + "learning_rate": 2.046716345443415e-05, + "loss": 0.2776, + "step": 28480 + }, + { + "epoch": 1.7854233251864384, + "grad_norm": 0.04859687015414238, + "learning_rate": 2.0456608473538663e-05, + "loss": 0.0333, + "step": 28490 + }, + { + "epoch": 1.7860500094002631, + "grad_norm": 0.02066011354327202, + "learning_rate": 2.044605349264318e-05, + "loss": 0.0276, + "step": 28500 + }, + { + "epoch": 1.7866766936140879, + "grad_norm": 0.0390385240316391, + "learning_rate": 2.0435498511747693e-05, + "loss": 0.0314, + "step": 28510 + }, + { + "epoch": 1.7873033778279126, + "grad_norm": 0.014242850244045258, + "learning_rate": 2.042494353085221e-05, + "loss": 0.073, + "step": 28520 + }, + { + "epoch": 1.7879300620417373, + "grad_norm": 0.02367878518998623, + "learning_rate": 2.0414388549956727e-05, + "loss": 0.1714, + "step": 28530 + }, + { + "epoch": 1.7885567462555618, + "grad_norm": 0.019421106204390526, + "learning_rate": 2.040383356906124e-05, + "loss": 0.0617, + "step": 28540 + }, + { + "epoch": 1.7891834304693863, + "grad_norm": 0.059307683259248734, + "learning_rate": 2.0393278588165757e-05, + "loss": 0.0595, + "step": 28550 + }, + { + "epoch": 1.789810114683211, + "grad_norm": 0.02156263403594494, + "learning_rate": 2.038272360727027e-05, + "loss": 0.0034, + "step": 28560 + }, + { + "epoch": 1.7904367988970358, + "grad_norm": 0.013104693964123726, + "learning_rate": 2.0372168626374787e-05, + "loss": 0.0526, + "step": 28570 + }, + { + "epoch": 1.7910634831108605, + "grad_norm": 0.12851563096046448, + "learning_rate": 2.0361613645479303e-05, + "loss": 0.0343, + "step": 28580 + }, + { + "epoch": 1.7916901673246852, + "grad_norm": 0.008035060949623585, + "learning_rate": 2.035105866458382e-05, + "loss": 0.0237, + "step": 28590 + }, + { + "epoch": 1.7923168515385097, + "grad_norm": 0.013913143426179886, + "learning_rate": 2.0340503683688333e-05, + "loss": 0.1845, + "step": 28600 + }, + { + "epoch": 1.7929435357523344, + "grad_norm": 0.7216804623603821, + "learning_rate": 2.0329948702792846e-05, + "loss": 0.1203, + "step": 28610 + }, + { + "epoch": 1.793570219966159, + "grad_norm": 0.04569417983293533, + "learning_rate": 2.0319393721897363e-05, + "loss": 0.0288, + "step": 28620 + }, + { + "epoch": 1.7941969041799837, + "grad_norm": 0.02900240570306778, + "learning_rate": 2.030883874100188e-05, + "loss": 0.0895, + "step": 28630 + }, + { + "epoch": 1.7948235883938084, + "grad_norm": 0.0236115250736475, + "learning_rate": 2.0298283760106396e-05, + "loss": 0.0794, + "step": 28640 + }, + { + "epoch": 1.7954502726076331, + "grad_norm": 0.03295353055000305, + "learning_rate": 2.028772877921091e-05, + "loss": 0.148, + "step": 28650 + }, + { + "epoch": 1.7960769568214576, + "grad_norm": 0.03861570730805397, + "learning_rate": 2.0277173798315426e-05, + "loss": 0.1315, + "step": 28660 + }, + { + "epoch": 1.7967036410352824, + "grad_norm": 0.03682591766119003, + "learning_rate": 2.026661881741994e-05, + "loss": 0.1394, + "step": 28670 + }, + { + "epoch": 1.7973303252491069, + "grad_norm": 0.04726806655526161, + "learning_rate": 2.025606383652446e-05, + "loss": 0.0256, + "step": 28680 + }, + { + "epoch": 1.7979570094629316, + "grad_norm": 0.476746141910553, + "learning_rate": 2.0245508855628973e-05, + "loss": 0.009, + "step": 28690 + }, + { + "epoch": 1.7985836936767563, + "grad_norm": 0.4977225959300995, + "learning_rate": 2.0234953874733486e-05, + "loss": 0.1839, + "step": 28700 + }, + { + "epoch": 1.799210377890581, + "grad_norm": 0.742252767086029, + "learning_rate": 2.0224398893838003e-05, + "loss": 0.1367, + "step": 28710 + }, + { + "epoch": 1.7998370621044057, + "grad_norm": 1.1202590465545654, + "learning_rate": 2.021384391294252e-05, + "loss": 0.0563, + "step": 28720 + }, + { + "epoch": 1.8004637463182303, + "grad_norm": 0.5046806335449219, + "learning_rate": 2.0203288932047036e-05, + "loss": 0.1431, + "step": 28730 + }, + { + "epoch": 1.8010904305320548, + "grad_norm": 0.024776900187134743, + "learning_rate": 2.019273395115155e-05, + "loss": 0.0273, + "step": 28740 + }, + { + "epoch": 1.8017171147458795, + "grad_norm": 0.03161754831671715, + "learning_rate": 2.0182178970256066e-05, + "loss": 0.2066, + "step": 28750 + }, + { + "epoch": 1.8023437989597042, + "grad_norm": 31.176733016967773, + "learning_rate": 2.017162398936058e-05, + "loss": 0.1162, + "step": 28760 + }, + { + "epoch": 1.802970483173529, + "grad_norm": 0.052438825368881226, + "learning_rate": 2.0161069008465096e-05, + "loss": 0.1372, + "step": 28770 + }, + { + "epoch": 1.8035971673873537, + "grad_norm": 0.04112754762172699, + "learning_rate": 2.0150514027569612e-05, + "loss": 0.0298, + "step": 28780 + }, + { + "epoch": 1.8042238516011782, + "grad_norm": 0.6142888069152832, + "learning_rate": 2.0139959046674126e-05, + "loss": 0.0805, + "step": 28790 + }, + { + "epoch": 1.8048505358150027, + "grad_norm": 0.06450961530208588, + "learning_rate": 2.0129404065778642e-05, + "loss": 0.0734, + "step": 28800 + }, + { + "epoch": 1.8054772200288274, + "grad_norm": 4.156648635864258, + "learning_rate": 2.0118849084883156e-05, + "loss": 0.0913, + "step": 28810 + }, + { + "epoch": 1.806103904242652, + "grad_norm": 0.024707753211259842, + "learning_rate": 2.0108294103987672e-05, + "loss": 0.0503, + "step": 28820 + }, + { + "epoch": 1.8067305884564768, + "grad_norm": 0.017378469929099083, + "learning_rate": 2.009773912309219e-05, + "loss": 0.0306, + "step": 28830 + }, + { + "epoch": 1.8073572726703016, + "grad_norm": 0.022257640957832336, + "learning_rate": 2.0087184142196706e-05, + "loss": 0.2535, + "step": 28840 + }, + { + "epoch": 1.807983956884126, + "grad_norm": 14.888116836547852, + "learning_rate": 2.007662916130122e-05, + "loss": 0.1329, + "step": 28850 + }, + { + "epoch": 1.8086106410979508, + "grad_norm": 0.7564731240272522, + "learning_rate": 2.0066074180405732e-05, + "loss": 0.0147, + "step": 28860 + }, + { + "epoch": 1.8092373253117753, + "grad_norm": 0.06251378357410431, + "learning_rate": 2.005551919951025e-05, + "loss": 0.2076, + "step": 28870 + }, + { + "epoch": 1.8098640095256, + "grad_norm": 0.03549632057547569, + "learning_rate": 2.0044964218614765e-05, + "loss": 0.064, + "step": 28880 + }, + { + "epoch": 1.8104906937394247, + "grad_norm": 9.972016334533691, + "learning_rate": 2.0034409237719282e-05, + "loss": 0.2049, + "step": 28890 + }, + { + "epoch": 1.8111173779532495, + "grad_norm": 0.5740152597427368, + "learning_rate": 2.0023854256823795e-05, + "loss": 0.0966, + "step": 28900 + }, + { + "epoch": 1.8117440621670742, + "grad_norm": 0.9303169250488281, + "learning_rate": 2.0013299275928312e-05, + "loss": 0.0953, + "step": 28910 + }, + { + "epoch": 1.8123707463808987, + "grad_norm": 0.337392657995224, + "learning_rate": 2.000274429503283e-05, + "loss": 0.118, + "step": 28920 + }, + { + "epoch": 1.8129974305947232, + "grad_norm": 0.04707547277212143, + "learning_rate": 1.9992189314137342e-05, + "loss": 0.1107, + "step": 28930 + }, + { + "epoch": 1.813624114808548, + "grad_norm": 0.08423315733671188, + "learning_rate": 1.998163433324186e-05, + "loss": 0.0055, + "step": 28940 + }, + { + "epoch": 1.8142507990223726, + "grad_norm": 0.021297458559274673, + "learning_rate": 1.9971079352346372e-05, + "loss": 0.0501, + "step": 28950 + }, + { + "epoch": 1.8148774832361974, + "grad_norm": 0.040586020797491074, + "learning_rate": 1.996052437145089e-05, + "loss": 0.0848, + "step": 28960 + }, + { + "epoch": 1.815504167450022, + "grad_norm": 5.234199047088623, + "learning_rate": 1.9949969390555405e-05, + "loss": 0.1686, + "step": 28970 + }, + { + "epoch": 1.8161308516638466, + "grad_norm": 0.32946547865867615, + "learning_rate": 1.9939414409659922e-05, + "loss": 0.0032, + "step": 28980 + }, + { + "epoch": 1.816757535877671, + "grad_norm": 0.26657867431640625, + "learning_rate": 1.9928859428764435e-05, + "loss": 0.2807, + "step": 28990 + }, + { + "epoch": 1.8173842200914958, + "grad_norm": 0.054265476763248444, + "learning_rate": 1.991830444786895e-05, + "loss": 0.0384, + "step": 29000 + }, + { + "epoch": 1.8180109043053205, + "grad_norm": 12.620076179504395, + "learning_rate": 1.9907749466973465e-05, + "loss": 0.17, + "step": 29010 + }, + { + "epoch": 1.8186375885191453, + "grad_norm": 0.30357375741004944, + "learning_rate": 1.989719448607798e-05, + "loss": 0.1859, + "step": 29020 + }, + { + "epoch": 1.81926427273297, + "grad_norm": 0.42700129747390747, + "learning_rate": 1.9886639505182498e-05, + "loss": 0.0094, + "step": 29030 + }, + { + "epoch": 1.8198909569467945, + "grad_norm": 0.030239397659897804, + "learning_rate": 1.987608452428701e-05, + "loss": 0.1341, + "step": 29040 + }, + { + "epoch": 1.8205176411606192, + "grad_norm": 0.03529753535985947, + "learning_rate": 1.9865529543391528e-05, + "loss": 0.0422, + "step": 29050 + }, + { + "epoch": 1.8211443253744437, + "grad_norm": 0.7819442749023438, + "learning_rate": 1.985497456249604e-05, + "loss": 0.1731, + "step": 29060 + }, + { + "epoch": 1.8217710095882684, + "grad_norm": 0.051512762904167175, + "learning_rate": 1.9844419581600558e-05, + "loss": 0.2309, + "step": 29070 + }, + { + "epoch": 1.8223976938020932, + "grad_norm": 6.448638439178467, + "learning_rate": 1.9833864600705075e-05, + "loss": 0.1451, + "step": 29080 + }, + { + "epoch": 1.8230243780159179, + "grad_norm": 0.08587915450334549, + "learning_rate": 1.9823309619809588e-05, + "loss": 0.0288, + "step": 29090 + }, + { + "epoch": 1.8236510622297424, + "grad_norm": 19.13962173461914, + "learning_rate": 1.9812754638914105e-05, + "loss": 0.1057, + "step": 29100 + }, + { + "epoch": 1.824277746443567, + "grad_norm": 0.018747830763459206, + "learning_rate": 1.9802199658018618e-05, + "loss": 0.0739, + "step": 29110 + }, + { + "epoch": 1.8249044306573916, + "grad_norm": 0.01629248820245266, + "learning_rate": 1.9791644677123135e-05, + "loss": 0.0735, + "step": 29120 + }, + { + "epoch": 1.8255311148712163, + "grad_norm": 15.566570281982422, + "learning_rate": 1.978108969622765e-05, + "loss": 0.0868, + "step": 29130 + }, + { + "epoch": 1.826157799085041, + "grad_norm": 0.03689432889223099, + "learning_rate": 1.9770534715332168e-05, + "loss": 0.1567, + "step": 29140 + }, + { + "epoch": 1.8267844832988658, + "grad_norm": 0.22734884917736053, + "learning_rate": 1.975997973443668e-05, + "loss": 0.023, + "step": 29150 + }, + { + "epoch": 1.8274111675126905, + "grad_norm": 0.0562271811068058, + "learning_rate": 1.9749424753541194e-05, + "loss": 0.1313, + "step": 29160 + }, + { + "epoch": 1.828037851726515, + "grad_norm": 0.5230720043182373, + "learning_rate": 1.9738869772645714e-05, + "loss": 0.0257, + "step": 29170 + }, + { + "epoch": 1.8286645359403395, + "grad_norm": 0.1260448396205902, + "learning_rate": 1.9728314791750228e-05, + "loss": 0.0753, + "step": 29180 + }, + { + "epoch": 1.8292912201541642, + "grad_norm": 19.483396530151367, + "learning_rate": 1.9717759810854744e-05, + "loss": 0.0821, + "step": 29190 + }, + { + "epoch": 1.829917904367989, + "grad_norm": 0.10309408605098724, + "learning_rate": 1.9707204829959258e-05, + "loss": 0.1264, + "step": 29200 + }, + { + "epoch": 1.8305445885818137, + "grad_norm": 8.662972450256348, + "learning_rate": 1.9696649849063774e-05, + "loss": 0.1339, + "step": 29210 + }, + { + "epoch": 1.8311712727956384, + "grad_norm": 0.4835667908191681, + "learning_rate": 1.968609486816829e-05, + "loss": 0.0097, + "step": 29220 + }, + { + "epoch": 1.831797957009463, + "grad_norm": 0.02930905483663082, + "learning_rate": 1.9675539887272804e-05, + "loss": 0.0038, + "step": 29230 + }, + { + "epoch": 1.8324246412232876, + "grad_norm": 0.10878875106573105, + "learning_rate": 1.966498490637732e-05, + "loss": 0.0659, + "step": 29240 + }, + { + "epoch": 1.8330513254371121, + "grad_norm": 0.05935356393456459, + "learning_rate": 1.9654429925481834e-05, + "loss": 0.1068, + "step": 29250 + }, + { + "epoch": 1.8336780096509369, + "grad_norm": 0.020778510719537735, + "learning_rate": 1.964387494458635e-05, + "loss": 0.1359, + "step": 29260 + }, + { + "epoch": 1.8343046938647616, + "grad_norm": 10.65028190612793, + "learning_rate": 1.9633319963690867e-05, + "loss": 0.1287, + "step": 29270 + }, + { + "epoch": 1.8349313780785863, + "grad_norm": 1.7528663873672485, + "learning_rate": 1.9622764982795384e-05, + "loss": 0.0258, + "step": 29280 + }, + { + "epoch": 1.8355580622924108, + "grad_norm": 0.010283700190484524, + "learning_rate": 1.9612210001899897e-05, + "loss": 0.0024, + "step": 29290 + }, + { + "epoch": 1.8361847465062355, + "grad_norm": 1.0101772546768188, + "learning_rate": 1.9601655021004414e-05, + "loss": 0.1069, + "step": 29300 + }, + { + "epoch": 1.83681143072006, + "grad_norm": 0.014128070324659348, + "learning_rate": 1.9591100040108927e-05, + "loss": 0.0097, + "step": 29310 + }, + { + "epoch": 1.8374381149338848, + "grad_norm": 0.009399221278727055, + "learning_rate": 1.9580545059213444e-05, + "loss": 0.1276, + "step": 29320 + }, + { + "epoch": 1.8380647991477095, + "grad_norm": 2.2243754863739014, + "learning_rate": 1.956999007831796e-05, + "loss": 0.1947, + "step": 29330 + }, + { + "epoch": 1.8386914833615342, + "grad_norm": 0.06796874105930328, + "learning_rate": 1.9559435097422474e-05, + "loss": 0.1281, + "step": 29340 + }, + { + "epoch": 1.839318167575359, + "grad_norm": 1.3128525018692017, + "learning_rate": 1.954888011652699e-05, + "loss": 0.0821, + "step": 29350 + }, + { + "epoch": 1.8399448517891834, + "grad_norm": 9.305420875549316, + "learning_rate": 1.9538325135631504e-05, + "loss": 0.5251, + "step": 29360 + }, + { + "epoch": 1.840571536003008, + "grad_norm": 0.2909277379512787, + "learning_rate": 1.952777015473602e-05, + "loss": 0.087, + "step": 29370 + }, + { + "epoch": 1.8411982202168327, + "grad_norm": 0.8060989379882812, + "learning_rate": 1.9517215173840537e-05, + "loss": 0.0592, + "step": 29380 + }, + { + "epoch": 1.8418249044306574, + "grad_norm": 0.1159672886133194, + "learning_rate": 1.950666019294505e-05, + "loss": 0.0417, + "step": 29390 + }, + { + "epoch": 1.842451588644482, + "grad_norm": 3.1500940322875977, + "learning_rate": 1.9496105212049567e-05, + "loss": 0.2034, + "step": 29400 + }, + { + "epoch": 1.8430782728583068, + "grad_norm": 0.062305234372615814, + "learning_rate": 1.948555023115408e-05, + "loss": 0.156, + "step": 29410 + }, + { + "epoch": 1.8437049570721313, + "grad_norm": 0.06603378802537918, + "learning_rate": 1.94749952502586e-05, + "loss": 0.2016, + "step": 29420 + }, + { + "epoch": 1.844331641285956, + "grad_norm": 0.5237300992012024, + "learning_rate": 1.9464440269363114e-05, + "loss": 0.0637, + "step": 29430 + }, + { + "epoch": 1.8449583254997806, + "grad_norm": 0.3289642632007599, + "learning_rate": 1.945388528846763e-05, + "loss": 0.102, + "step": 29440 + }, + { + "epoch": 1.8455850097136053, + "grad_norm": 4.283963203430176, + "learning_rate": 1.9443330307572143e-05, + "loss": 0.3358, + "step": 29450 + }, + { + "epoch": 1.84621169392743, + "grad_norm": 8.56247615814209, + "learning_rate": 1.943277532667666e-05, + "loss": 0.1617, + "step": 29460 + }, + { + "epoch": 1.8468383781412547, + "grad_norm": 0.1259661316871643, + "learning_rate": 1.9422220345781177e-05, + "loss": 0.1126, + "step": 29470 + }, + { + "epoch": 1.8474650623550792, + "grad_norm": 0.09990125149488449, + "learning_rate": 1.941166536488569e-05, + "loss": 0.0875, + "step": 29480 + }, + { + "epoch": 1.848091746568904, + "grad_norm": 0.11462511122226715, + "learning_rate": 1.9401110383990207e-05, + "loss": 0.1446, + "step": 29490 + }, + { + "epoch": 1.8487184307827285, + "grad_norm": 0.11472202092409134, + "learning_rate": 1.939055540309472e-05, + "loss": 0.088, + "step": 29500 + }, + { + "epoch": 1.8493451149965532, + "grad_norm": 1.1895668506622314, + "learning_rate": 1.9380000422199237e-05, + "loss": 0.036, + "step": 29510 + }, + { + "epoch": 1.849971799210378, + "grad_norm": 0.19908933341503143, + "learning_rate": 1.9369445441303753e-05, + "loss": 0.0983, + "step": 29520 + }, + { + "epoch": 1.8505984834242026, + "grad_norm": 1.47406005859375, + "learning_rate": 1.935889046040827e-05, + "loss": 0.0543, + "step": 29530 + }, + { + "epoch": 1.8512251676380274, + "grad_norm": 0.06669019162654877, + "learning_rate": 1.9348335479512783e-05, + "loss": 0.0428, + "step": 29540 + }, + { + "epoch": 1.8518518518518519, + "grad_norm": 0.05538703128695488, + "learning_rate": 1.9337780498617296e-05, + "loss": 0.1216, + "step": 29550 + }, + { + "epoch": 1.8524785360656764, + "grad_norm": 0.0620872937142849, + "learning_rate": 1.9327225517721813e-05, + "loss": 0.035, + "step": 29560 + }, + { + "epoch": 1.853105220279501, + "grad_norm": 4.431196212768555, + "learning_rate": 1.931667053682633e-05, + "loss": 0.1035, + "step": 29570 + }, + { + "epoch": 1.8537319044933258, + "grad_norm": 0.05511202663183212, + "learning_rate": 1.9306115555930846e-05, + "loss": 0.105, + "step": 29580 + }, + { + "epoch": 1.8543585887071505, + "grad_norm": 0.05063912644982338, + "learning_rate": 1.929556057503536e-05, + "loss": 0.1448, + "step": 29590 + }, + { + "epoch": 1.8549852729209753, + "grad_norm": 0.08396095782518387, + "learning_rate": 1.9285005594139876e-05, + "loss": 0.0884, + "step": 29600 + }, + { + "epoch": 1.8556119571347998, + "grad_norm": 0.22409754991531372, + "learning_rate": 1.927445061324439e-05, + "loss": 0.0645, + "step": 29610 + }, + { + "epoch": 1.8562386413486243, + "grad_norm": 0.05794045329093933, + "learning_rate": 1.9263895632348906e-05, + "loss": 0.0719, + "step": 29620 + }, + { + "epoch": 1.856865325562449, + "grad_norm": 0.15742503106594086, + "learning_rate": 1.9253340651453423e-05, + "loss": 0.3772, + "step": 29630 + }, + { + "epoch": 1.8574920097762737, + "grad_norm": 0.21620403230190277, + "learning_rate": 1.9242785670557936e-05, + "loss": 0.0839, + "step": 29640 + }, + { + "epoch": 1.8581186939900984, + "grad_norm": 0.09392804652452469, + "learning_rate": 1.9232230689662453e-05, + "loss": 0.0643, + "step": 29650 + }, + { + "epoch": 1.8587453782039232, + "grad_norm": 0.05274158716201782, + "learning_rate": 1.9221675708766966e-05, + "loss": 0.1715, + "step": 29660 + }, + { + "epoch": 1.8593720624177477, + "grad_norm": 0.04212874546647072, + "learning_rate": 1.9211120727871486e-05, + "loss": 0.0605, + "step": 29670 + }, + { + "epoch": 1.8599987466315724, + "grad_norm": 0.03563644737005234, + "learning_rate": 1.9200565746976e-05, + "loss": 0.1432, + "step": 29680 + }, + { + "epoch": 1.8606254308453969, + "grad_norm": 5.44032621383667, + "learning_rate": 1.9190010766080516e-05, + "loss": 0.1619, + "step": 29690 + }, + { + "epoch": 1.8612521150592216, + "grad_norm": 0.13287276029586792, + "learning_rate": 1.917945578518503e-05, + "loss": 0.1115, + "step": 29700 + }, + { + "epoch": 1.8618787992730463, + "grad_norm": 3.4286482334136963, + "learning_rate": 1.9168900804289542e-05, + "loss": 0.1127, + "step": 29710 + }, + { + "epoch": 1.862505483486871, + "grad_norm": 0.27782508730888367, + "learning_rate": 1.9158345823394063e-05, + "loss": 0.0366, + "step": 29720 + }, + { + "epoch": 1.8631321677006958, + "grad_norm": 0.06644205003976822, + "learning_rate": 1.9147790842498576e-05, + "loss": 0.0781, + "step": 29730 + }, + { + "epoch": 1.8637588519145203, + "grad_norm": 0.0695660188794136, + "learning_rate": 1.9137235861603092e-05, + "loss": 0.074, + "step": 29740 + }, + { + "epoch": 1.8643855361283448, + "grad_norm": 0.05306769534945488, + "learning_rate": 1.9126680880707606e-05, + "loss": 0.0964, + "step": 29750 + }, + { + "epoch": 1.8650122203421695, + "grad_norm": 0.053244441747665405, + "learning_rate": 1.9116125899812122e-05, + "loss": 0.0169, + "step": 29760 + }, + { + "epoch": 1.8656389045559942, + "grad_norm": 0.16310220956802368, + "learning_rate": 1.910557091891664e-05, + "loss": 0.105, + "step": 29770 + }, + { + "epoch": 1.866265588769819, + "grad_norm": 0.07683579623699188, + "learning_rate": 1.9095015938021152e-05, + "loss": 0.0391, + "step": 29780 + }, + { + "epoch": 1.8668922729836437, + "grad_norm": 0.33125782012939453, + "learning_rate": 1.908446095712567e-05, + "loss": 0.082, + "step": 29790 + }, + { + "epoch": 1.8675189571974682, + "grad_norm": 0.02789173647761345, + "learning_rate": 1.9073905976230182e-05, + "loss": 0.1035, + "step": 29800 + }, + { + "epoch": 1.8681456414112927, + "grad_norm": 0.03273582458496094, + "learning_rate": 1.90633509953347e-05, + "loss": 0.002, + "step": 29810 + }, + { + "epoch": 1.8687723256251174, + "grad_norm": 0.04846423864364624, + "learning_rate": 1.9052796014439215e-05, + "loss": 0.0543, + "step": 29820 + }, + { + "epoch": 1.8693990098389421, + "grad_norm": 0.04357713833451271, + "learning_rate": 1.9042241033543732e-05, + "loss": 0.0017, + "step": 29830 + }, + { + "epoch": 1.8700256940527669, + "grad_norm": 9.48665714263916, + "learning_rate": 1.9031686052648245e-05, + "loss": 0.2533, + "step": 29840 + }, + { + "epoch": 1.8706523782665916, + "grad_norm": 0.02235112152993679, + "learning_rate": 1.902113107175276e-05, + "loss": 0.1493, + "step": 29850 + }, + { + "epoch": 1.871279062480416, + "grad_norm": 7.93790864944458, + "learning_rate": 1.9010576090857275e-05, + "loss": 0.2031, + "step": 29860 + }, + { + "epoch": 1.8719057466942408, + "grad_norm": 0.4831400513648987, + "learning_rate": 1.9000021109961792e-05, + "loss": 0.0422, + "step": 29870 + }, + { + "epoch": 1.8725324309080653, + "grad_norm": 12.699409484863281, + "learning_rate": 1.898946612906631e-05, + "loss": 0.1498, + "step": 29880 + }, + { + "epoch": 1.87315911512189, + "grad_norm": 0.47280260920524597, + "learning_rate": 1.8978911148170822e-05, + "loss": 0.1014, + "step": 29890 + }, + { + "epoch": 1.8737857993357148, + "grad_norm": 0.07105378061532974, + "learning_rate": 1.896835616727534e-05, + "loss": 0.1349, + "step": 29900 + }, + { + "epoch": 1.8744124835495395, + "grad_norm": 7.203770160675049, + "learning_rate": 1.8957801186379852e-05, + "loss": 0.2386, + "step": 29910 + }, + { + "epoch": 1.875039167763364, + "grad_norm": 0.05521472170948982, + "learning_rate": 1.8947246205484372e-05, + "loss": 0.0533, + "step": 29920 + }, + { + "epoch": 1.8756658519771887, + "grad_norm": 1.2736027240753174, + "learning_rate": 1.8936691224588885e-05, + "loss": 0.107, + "step": 29930 + }, + { + "epoch": 1.8762925361910132, + "grad_norm": 0.05932888761162758, + "learning_rate": 1.89261362436934e-05, + "loss": 0.1205, + "step": 29940 + }, + { + "epoch": 1.876919220404838, + "grad_norm": 0.3483729064464569, + "learning_rate": 1.8915581262797915e-05, + "loss": 0.0029, + "step": 29950 + }, + { + "epoch": 1.8775459046186627, + "grad_norm": 0.05726746469736099, + "learning_rate": 1.8905026281902428e-05, + "loss": 0.0772, + "step": 29960 + }, + { + "epoch": 1.8781725888324874, + "grad_norm": 5.409941673278809, + "learning_rate": 1.8894471301006948e-05, + "loss": 0.1557, + "step": 29970 + }, + { + "epoch": 1.8787992730463121, + "grad_norm": 0.017120420932769775, + "learning_rate": 1.888391632011146e-05, + "loss": 0.1835, + "step": 29980 + }, + { + "epoch": 1.8794259572601366, + "grad_norm": 1.3128411769866943, + "learning_rate": 1.8873361339215978e-05, + "loss": 0.2665, + "step": 29990 + }, + { + "epoch": 1.8800526414739611, + "grad_norm": 0.05831111595034599, + "learning_rate": 1.886280635832049e-05, + "loss": 0.0641, + "step": 30000 + }, + { + "epoch": 1.8806793256877858, + "grad_norm": 0.04687171056866646, + "learning_rate": 1.8852251377425008e-05, + "loss": 0.0097, + "step": 30010 + }, + { + "epoch": 1.8813060099016106, + "grad_norm": 0.08850085735321045, + "learning_rate": 1.8841696396529525e-05, + "loss": 0.0698, + "step": 30020 + }, + { + "epoch": 1.8819326941154353, + "grad_norm": 3.7799575328826904, + "learning_rate": 1.8831141415634038e-05, + "loss": 0.0373, + "step": 30030 + }, + { + "epoch": 1.88255937832926, + "grad_norm": 0.03460925817489624, + "learning_rate": 1.8820586434738555e-05, + "loss": 0.0017, + "step": 30040 + }, + { + "epoch": 1.8831860625430845, + "grad_norm": 0.3564239740371704, + "learning_rate": 1.8810031453843068e-05, + "loss": 0.2192, + "step": 30050 + }, + { + "epoch": 1.8838127467569092, + "grad_norm": 0.014373106881976128, + "learning_rate": 1.8799476472947585e-05, + "loss": 0.0709, + "step": 30060 + }, + { + "epoch": 1.8844394309707337, + "grad_norm": 0.24445849657058716, + "learning_rate": 1.87889214920521e-05, + "loss": 0.0971, + "step": 30070 + }, + { + "epoch": 1.8850661151845585, + "grad_norm": 3.6444427967071533, + "learning_rate": 1.8778366511156618e-05, + "loss": 0.0798, + "step": 30080 + }, + { + "epoch": 1.8856927993983832, + "grad_norm": 3.2659192085266113, + "learning_rate": 1.876781153026113e-05, + "loss": 0.0547, + "step": 30090 + }, + { + "epoch": 1.886319483612208, + "grad_norm": 0.009950400330126286, + "learning_rate": 1.8757256549365644e-05, + "loss": 0.0039, + "step": 30100 + }, + { + "epoch": 1.8869461678260324, + "grad_norm": 0.030715815722942352, + "learning_rate": 1.874670156847016e-05, + "loss": 0.0666, + "step": 30110 + }, + { + "epoch": 1.8875728520398571, + "grad_norm": 0.03627600520849228, + "learning_rate": 1.8736146587574678e-05, + "loss": 0.0218, + "step": 30120 + }, + { + "epoch": 1.8881995362536816, + "grad_norm": 0.0974053218960762, + "learning_rate": 1.8725591606679194e-05, + "loss": 0.1929, + "step": 30130 + }, + { + "epoch": 1.8888262204675064, + "grad_norm": 6.635987281799316, + "learning_rate": 1.8715036625783708e-05, + "loss": 0.1065, + "step": 30140 + }, + { + "epoch": 1.889452904681331, + "grad_norm": 0.1497088223695755, + "learning_rate": 1.8704481644888224e-05, + "loss": 0.0606, + "step": 30150 + }, + { + "epoch": 1.8900795888951558, + "grad_norm": 0.03236452117562294, + "learning_rate": 1.8693926663992738e-05, + "loss": 0.1933, + "step": 30160 + }, + { + "epoch": 1.8907062731089805, + "grad_norm": 5.778759002685547, + "learning_rate": 1.8683371683097254e-05, + "loss": 0.0899, + "step": 30170 + }, + { + "epoch": 1.891332957322805, + "grad_norm": 0.8521557450294495, + "learning_rate": 1.867281670220177e-05, + "loss": 0.2154, + "step": 30180 + }, + { + "epoch": 1.8919596415366295, + "grad_norm": 0.08743500709533691, + "learning_rate": 1.8662261721306284e-05, + "loss": 0.0842, + "step": 30190 + }, + { + "epoch": 1.8925863257504543, + "grad_norm": 0.06743310391902924, + "learning_rate": 1.86517067404108e-05, + "loss": 0.0971, + "step": 30200 + }, + { + "epoch": 1.893213009964279, + "grad_norm": 2.5880138874053955, + "learning_rate": 1.8641151759515314e-05, + "loss": 0.0598, + "step": 30210 + }, + { + "epoch": 1.8938396941781037, + "grad_norm": 0.16789746284484863, + "learning_rate": 1.8630596778619834e-05, + "loss": 0.0475, + "step": 30220 + }, + { + "epoch": 1.8944663783919284, + "grad_norm": 0.9268659949302673, + "learning_rate": 1.8620041797724347e-05, + "loss": 0.041, + "step": 30230 + }, + { + "epoch": 1.895093062605753, + "grad_norm": 0.8711900115013123, + "learning_rate": 1.860948681682886e-05, + "loss": 0.1084, + "step": 30240 + }, + { + "epoch": 1.8957197468195777, + "grad_norm": 0.029752174392342567, + "learning_rate": 1.8598931835933377e-05, + "loss": 0.3856, + "step": 30250 + }, + { + "epoch": 1.8963464310334022, + "grad_norm": 0.6014415621757507, + "learning_rate": 1.8588376855037894e-05, + "loss": 0.1008, + "step": 30260 + }, + { + "epoch": 1.896973115247227, + "grad_norm": 1.2103004455566406, + "learning_rate": 1.857782187414241e-05, + "loss": 0.0331, + "step": 30270 + }, + { + "epoch": 1.8975997994610516, + "grad_norm": 0.16703811287879944, + "learning_rate": 1.8567266893246924e-05, + "loss": 0.0546, + "step": 30280 + }, + { + "epoch": 1.8982264836748763, + "grad_norm": 5.745260715484619, + "learning_rate": 1.855671191235144e-05, + "loss": 0.1985, + "step": 30290 + }, + { + "epoch": 1.8988531678887008, + "grad_norm": 0.5710949897766113, + "learning_rate": 1.8546156931455954e-05, + "loss": 0.0237, + "step": 30300 + }, + { + "epoch": 1.8994798521025256, + "grad_norm": 0.0267245564609766, + "learning_rate": 1.853560195056047e-05, + "loss": 0.0519, + "step": 30310 + }, + { + "epoch": 1.90010653631635, + "grad_norm": 3.8505678176879883, + "learning_rate": 1.8525046969664987e-05, + "loss": 0.1425, + "step": 30320 + }, + { + "epoch": 1.9007332205301748, + "grad_norm": 0.6326449513435364, + "learning_rate": 1.85144919887695e-05, + "loss": 0.0259, + "step": 30330 + }, + { + "epoch": 1.9013599047439995, + "grad_norm": 3.883139133453369, + "learning_rate": 1.8503937007874017e-05, + "loss": 0.1314, + "step": 30340 + }, + { + "epoch": 1.9019865889578242, + "grad_norm": 7.332738876342773, + "learning_rate": 1.849338202697853e-05, + "loss": 0.1249, + "step": 30350 + }, + { + "epoch": 1.902613273171649, + "grad_norm": 8.143255233764648, + "learning_rate": 1.8482827046083047e-05, + "loss": 0.2064, + "step": 30360 + }, + { + "epoch": 1.9032399573854735, + "grad_norm": 0.0913747027516365, + "learning_rate": 1.8472272065187564e-05, + "loss": 0.1017, + "step": 30370 + }, + { + "epoch": 1.903866641599298, + "grad_norm": 2.4039127826690674, + "learning_rate": 1.846171708429208e-05, + "loss": 0.1219, + "step": 30380 + }, + { + "epoch": 1.9044933258131227, + "grad_norm": 2.748976469039917, + "learning_rate": 1.8451162103396593e-05, + "loss": 0.0252, + "step": 30390 + }, + { + "epoch": 1.9051200100269474, + "grad_norm": 0.049904074519872665, + "learning_rate": 1.8440607122501107e-05, + "loss": 0.0039, + "step": 30400 + }, + { + "epoch": 1.9057466942407721, + "grad_norm": 0.015348660759627819, + "learning_rate": 1.8430052141605623e-05, + "loss": 0.0532, + "step": 30410 + }, + { + "epoch": 1.9063733784545969, + "grad_norm": 0.03833635151386261, + "learning_rate": 1.841949716071014e-05, + "loss": 0.1586, + "step": 30420 + }, + { + "epoch": 1.9070000626684214, + "grad_norm": 0.11076361685991287, + "learning_rate": 1.8408942179814657e-05, + "loss": 0.0743, + "step": 30430 + }, + { + "epoch": 1.9076267468822459, + "grad_norm": 2.92325758934021, + "learning_rate": 1.839838719891917e-05, + "loss": 0.0999, + "step": 30440 + }, + { + "epoch": 1.9082534310960706, + "grad_norm": 0.04999106749892235, + "learning_rate": 1.8387832218023687e-05, + "loss": 0.143, + "step": 30450 + }, + { + "epoch": 1.9088801153098953, + "grad_norm": 0.04987865686416626, + "learning_rate": 1.83772772371282e-05, + "loss": 0.0169, + "step": 30460 + }, + { + "epoch": 1.90950679952372, + "grad_norm": 0.12620976567268372, + "learning_rate": 1.836672225623272e-05, + "loss": 0.1632, + "step": 30470 + }, + { + "epoch": 1.9101334837375448, + "grad_norm": 0.0377831868827343, + "learning_rate": 1.8356167275337233e-05, + "loss": 0.0457, + "step": 30480 + }, + { + "epoch": 1.9107601679513693, + "grad_norm": 0.10486970096826553, + "learning_rate": 1.8345612294441746e-05, + "loss": 0.0926, + "step": 30490 + }, + { + "epoch": 1.911386852165194, + "grad_norm": 0.10072964429855347, + "learning_rate": 1.8335057313546263e-05, + "loss": 0.0815, + "step": 30500 + }, + { + "epoch": 1.9120135363790185, + "grad_norm": 0.10867179930210114, + "learning_rate": 1.832450233265078e-05, + "loss": 0.0788, + "step": 30510 + }, + { + "epoch": 1.9126402205928432, + "grad_norm": 0.022253375500440598, + "learning_rate": 1.8313947351755296e-05, + "loss": 0.076, + "step": 30520 + }, + { + "epoch": 1.913266904806668, + "grad_norm": 0.15047207474708557, + "learning_rate": 1.830339237085981e-05, + "loss": 0.0035, + "step": 30530 + }, + { + "epoch": 1.9138935890204927, + "grad_norm": 0.06832699477672577, + "learning_rate": 1.8292837389964326e-05, + "loss": 0.0788, + "step": 30540 + }, + { + "epoch": 1.9145202732343174, + "grad_norm": 7.129006385803223, + "learning_rate": 1.828228240906884e-05, + "loss": 0.1998, + "step": 30550 + }, + { + "epoch": 1.915146957448142, + "grad_norm": 0.014667839743196964, + "learning_rate": 1.8271727428173356e-05, + "loss": 0.0204, + "step": 30560 + }, + { + "epoch": 1.9157736416619664, + "grad_norm": 0.0635824203491211, + "learning_rate": 1.8261172447277873e-05, + "loss": 0.0038, + "step": 30570 + }, + { + "epoch": 1.9164003258757911, + "grad_norm": 0.14155304431915283, + "learning_rate": 1.8250617466382386e-05, + "loss": 0.1168, + "step": 30580 + }, + { + "epoch": 1.9170270100896158, + "grad_norm": 15.040778160095215, + "learning_rate": 1.8240062485486903e-05, + "loss": 0.147, + "step": 30590 + }, + { + "epoch": 1.9176536943034406, + "grad_norm": 0.07141759246587753, + "learning_rate": 1.8229507504591416e-05, + "loss": 0.0787, + "step": 30600 + }, + { + "epoch": 1.9182803785172653, + "grad_norm": 33.47121810913086, + "learning_rate": 1.8218952523695933e-05, + "loss": 0.0976, + "step": 30610 + }, + { + "epoch": 1.9189070627310898, + "grad_norm": 15.086108207702637, + "learning_rate": 1.820839754280045e-05, + "loss": 0.1981, + "step": 30620 + }, + { + "epoch": 1.9195337469449143, + "grad_norm": 1.0487416982650757, + "learning_rate": 1.8197842561904963e-05, + "loss": 0.0605, + "step": 30630 + }, + { + "epoch": 1.920160431158739, + "grad_norm": 0.14224369823932648, + "learning_rate": 1.818728758100948e-05, + "loss": 0.1471, + "step": 30640 + }, + { + "epoch": 1.9207871153725637, + "grad_norm": 2.3341376781463623, + "learning_rate": 1.8176732600113992e-05, + "loss": 0.0515, + "step": 30650 + }, + { + "epoch": 1.9214137995863885, + "grad_norm": 2.8665385246276855, + "learning_rate": 1.816617761921851e-05, + "loss": 0.0349, + "step": 30660 + }, + { + "epoch": 1.9220404838002132, + "grad_norm": 0.08486692607402802, + "learning_rate": 1.8155622638323026e-05, + "loss": 0.0768, + "step": 30670 + }, + { + "epoch": 1.9226671680140377, + "grad_norm": 5.3784332275390625, + "learning_rate": 1.8145067657427542e-05, + "loss": 0.1577, + "step": 30680 + }, + { + "epoch": 1.9232938522278624, + "grad_norm": 0.010095058009028435, + "learning_rate": 1.8134512676532056e-05, + "loss": 0.0529, + "step": 30690 + }, + { + "epoch": 1.923920536441687, + "grad_norm": 0.10281635820865631, + "learning_rate": 1.8123957695636572e-05, + "loss": 0.1099, + "step": 30700 + }, + { + "epoch": 1.9245472206555116, + "grad_norm": 7.003781318664551, + "learning_rate": 1.8113402714741086e-05, + "loss": 0.162, + "step": 30710 + }, + { + "epoch": 1.9251739048693364, + "grad_norm": 0.09196191281080246, + "learning_rate": 1.8102847733845602e-05, + "loss": 0.0531, + "step": 30720 + }, + { + "epoch": 1.925800589083161, + "grad_norm": 0.5653356909751892, + "learning_rate": 1.809229275295012e-05, + "loss": 0.1701, + "step": 30730 + }, + { + "epoch": 1.9264272732969856, + "grad_norm": 0.48378825187683105, + "learning_rate": 1.8081737772054632e-05, + "loss": 0.1326, + "step": 30740 + }, + { + "epoch": 1.9270539575108103, + "grad_norm": 0.2154448926448822, + "learning_rate": 1.807118279115915e-05, + "loss": 0.1484, + "step": 30750 + }, + { + "epoch": 1.9276806417246348, + "grad_norm": 0.0348966158926487, + "learning_rate": 1.8060627810263665e-05, + "loss": 0.0638, + "step": 30760 + }, + { + "epoch": 1.9283073259384595, + "grad_norm": 0.03106912225484848, + "learning_rate": 1.8050072829368182e-05, + "loss": 0.002, + "step": 30770 + }, + { + "epoch": 1.9289340101522843, + "grad_norm": 0.030033491551876068, + "learning_rate": 1.8039517848472695e-05, + "loss": 0.1182, + "step": 30780 + }, + { + "epoch": 1.929560694366109, + "grad_norm": 0.008149535395205021, + "learning_rate": 1.802896286757721e-05, + "loss": 0.1093, + "step": 30790 + }, + { + "epoch": 1.9301873785799337, + "grad_norm": 0.029186800122261047, + "learning_rate": 1.8018407886681725e-05, + "loss": 0.0311, + "step": 30800 + }, + { + "epoch": 1.9308140627937582, + "grad_norm": 0.24340951442718506, + "learning_rate": 1.8007852905786242e-05, + "loss": 0.0338, + "step": 30810 + }, + { + "epoch": 1.9314407470075827, + "grad_norm": 0.02873971126973629, + "learning_rate": 1.799729792489076e-05, + "loss": 0.0698, + "step": 30820 + }, + { + "epoch": 1.9320674312214074, + "grad_norm": 0.01757604442536831, + "learning_rate": 1.7986742943995272e-05, + "loss": 0.0096, + "step": 30830 + }, + { + "epoch": 1.9326941154352322, + "grad_norm": 0.012573912739753723, + "learning_rate": 1.797618796309979e-05, + "loss": 0.1414, + "step": 30840 + }, + { + "epoch": 1.933320799649057, + "grad_norm": 7.6997480392456055, + "learning_rate": 1.7965632982204302e-05, + "loss": 0.1177, + "step": 30850 + }, + { + "epoch": 1.9339474838628816, + "grad_norm": 0.007678247056901455, + "learning_rate": 1.795507800130882e-05, + "loss": 0.045, + "step": 30860 + }, + { + "epoch": 1.9345741680767061, + "grad_norm": 0.006309192162007093, + "learning_rate": 1.7944523020413335e-05, + "loss": 0.0119, + "step": 30870 + }, + { + "epoch": 1.9352008522905308, + "grad_norm": 0.0472506545484066, + "learning_rate": 1.793396803951785e-05, + "loss": 0.1314, + "step": 30880 + }, + { + "epoch": 1.9358275365043554, + "grad_norm": 0.019679518416523933, + "learning_rate": 1.7923413058622365e-05, + "loss": 0.0072, + "step": 30890 + }, + { + "epoch": 1.93645422071818, + "grad_norm": 0.025182457640767097, + "learning_rate": 1.7912858077726878e-05, + "loss": 0.0436, + "step": 30900 + }, + { + "epoch": 1.9370809049320048, + "grad_norm": 20.712900161743164, + "learning_rate": 1.7902303096831395e-05, + "loss": 0.235, + "step": 30910 + }, + { + "epoch": 1.9377075891458295, + "grad_norm": 0.10453003644943237, + "learning_rate": 1.789174811593591e-05, + "loss": 0.1638, + "step": 30920 + }, + { + "epoch": 1.938334273359654, + "grad_norm": 0.261699914932251, + "learning_rate": 1.7881193135040428e-05, + "loss": 0.0507, + "step": 30930 + }, + { + "epoch": 1.9389609575734787, + "grad_norm": 0.3301703631877899, + "learning_rate": 1.787063815414494e-05, + "loss": 0.1241, + "step": 30940 + }, + { + "epoch": 1.9395876417873033, + "grad_norm": 0.11412771791219711, + "learning_rate": 1.7860083173249455e-05, + "loss": 0.0123, + "step": 30950 + }, + { + "epoch": 1.940214326001128, + "grad_norm": 0.10817641019821167, + "learning_rate": 1.7849528192353975e-05, + "loss": 0.2673, + "step": 30960 + }, + { + "epoch": 1.9408410102149527, + "grad_norm": 3.031972646713257, + "learning_rate": 1.7838973211458488e-05, + "loss": 0.0864, + "step": 30970 + }, + { + "epoch": 1.9414676944287774, + "grad_norm": 0.1671517789363861, + "learning_rate": 1.7828418230563005e-05, + "loss": 0.1065, + "step": 30980 + }, + { + "epoch": 1.9420943786426021, + "grad_norm": 0.06491947174072266, + "learning_rate": 1.7817863249667518e-05, + "loss": 0.046, + "step": 30990 + }, + { + "epoch": 1.9427210628564267, + "grad_norm": 0.005951672792434692, + "learning_rate": 1.7807308268772035e-05, + "loss": 0.0031, + "step": 31000 + }, + { + "epoch": 1.9433477470702512, + "grad_norm": 0.060672201216220856, + "learning_rate": 1.779675328787655e-05, + "loss": 0.124, + "step": 31010 + }, + { + "epoch": 1.9439744312840759, + "grad_norm": 0.004647328983992338, + "learning_rate": 1.7786198306981065e-05, + "loss": 0.0021, + "step": 31020 + }, + { + "epoch": 1.9446011154979006, + "grad_norm": 2.4108574390411377, + "learning_rate": 1.777564332608558e-05, + "loss": 0.0126, + "step": 31030 + }, + { + "epoch": 1.9452277997117253, + "grad_norm": 0.024586204439401627, + "learning_rate": 1.7765088345190094e-05, + "loss": 0.0686, + "step": 31040 + }, + { + "epoch": 1.94585448392555, + "grad_norm": 3.447274923324585, + "learning_rate": 1.775453336429461e-05, + "loss": 0.0141, + "step": 31050 + }, + { + "epoch": 1.9464811681393746, + "grad_norm": 0.02935156226158142, + "learning_rate": 1.7743978383399128e-05, + "loss": 0.0688, + "step": 31060 + }, + { + "epoch": 1.9471078523531993, + "grad_norm": 0.04828619956970215, + "learning_rate": 1.7733423402503644e-05, + "loss": 0.0683, + "step": 31070 + }, + { + "epoch": 1.9477345365670238, + "grad_norm": 0.02683708630502224, + "learning_rate": 1.7722868421608158e-05, + "loss": 0.1011, + "step": 31080 + }, + { + "epoch": 1.9483612207808485, + "grad_norm": 0.03948148712515831, + "learning_rate": 1.7712313440712674e-05, + "loss": 0.1203, + "step": 31090 + }, + { + "epoch": 1.9489879049946732, + "grad_norm": 0.0707787424325943, + "learning_rate": 1.7701758459817188e-05, + "loss": 0.1169, + "step": 31100 + }, + { + "epoch": 1.949614589208498, + "grad_norm": 0.21310770511627197, + "learning_rate": 1.7691203478921704e-05, + "loss": 0.0081, + "step": 31110 + }, + { + "epoch": 1.9502412734223225, + "grad_norm": 0.798648476600647, + "learning_rate": 1.768064849802622e-05, + "loss": 0.0931, + "step": 31120 + }, + { + "epoch": 1.9508679576361472, + "grad_norm": 0.0065465401858091354, + "learning_rate": 1.7670093517130734e-05, + "loss": 0.2703, + "step": 31130 + }, + { + "epoch": 1.9514946418499717, + "grad_norm": 0.060641299933195114, + "learning_rate": 1.765953853623525e-05, + "loss": 0.088, + "step": 31140 + }, + { + "epoch": 1.9521213260637964, + "grad_norm": 64.52513122558594, + "learning_rate": 1.7648983555339764e-05, + "loss": 0.1919, + "step": 31150 + }, + { + "epoch": 1.9527480102776211, + "grad_norm": 0.5025072693824768, + "learning_rate": 1.763842857444428e-05, + "loss": 0.1127, + "step": 31160 + }, + { + "epoch": 1.9533746944914459, + "grad_norm": 0.23550131916999817, + "learning_rate": 1.7627873593548797e-05, + "loss": 0.0814, + "step": 31170 + }, + { + "epoch": 1.9540013787052706, + "grad_norm": 0.006434514187276363, + "learning_rate": 1.761731861265331e-05, + "loss": 0.0089, + "step": 31180 + }, + { + "epoch": 1.954628062919095, + "grad_norm": 0.3607669174671173, + "learning_rate": 1.7606763631757827e-05, + "loss": 0.0767, + "step": 31190 + }, + { + "epoch": 1.9552547471329196, + "grad_norm": 0.012200615368783474, + "learning_rate": 1.759620865086234e-05, + "loss": 0.007, + "step": 31200 + }, + { + "epoch": 1.9558814313467443, + "grad_norm": 0.04041421413421631, + "learning_rate": 1.758565366996686e-05, + "loss": 0.0545, + "step": 31210 + }, + { + "epoch": 1.956508115560569, + "grad_norm": 0.015096294693648815, + "learning_rate": 1.7575098689071374e-05, + "loss": 0.0678, + "step": 31220 + }, + { + "epoch": 1.9571347997743938, + "grad_norm": 0.08720281720161438, + "learning_rate": 1.756454370817589e-05, + "loss": 0.1194, + "step": 31230 + }, + { + "epoch": 1.9577614839882185, + "grad_norm": 0.013159573078155518, + "learning_rate": 1.7553988727280404e-05, + "loss": 0.0914, + "step": 31240 + }, + { + "epoch": 1.958388168202043, + "grad_norm": 4.2139458656311035, + "learning_rate": 1.7543433746384917e-05, + "loss": 0.2414, + "step": 31250 + }, + { + "epoch": 1.9590148524158675, + "grad_norm": 0.3159102499485016, + "learning_rate": 1.7532878765489437e-05, + "loss": 0.0105, + "step": 31260 + }, + { + "epoch": 1.9596415366296922, + "grad_norm": 0.03543320670723915, + "learning_rate": 1.752232378459395e-05, + "loss": 0.076, + "step": 31270 + }, + { + "epoch": 1.960268220843517, + "grad_norm": 0.19333113729953766, + "learning_rate": 1.7511768803698467e-05, + "loss": 0.036, + "step": 31280 + }, + { + "epoch": 1.9608949050573417, + "grad_norm": 0.017696138471364975, + "learning_rate": 1.750121382280298e-05, + "loss": 0.0557, + "step": 31290 + }, + { + "epoch": 1.9615215892711664, + "grad_norm": 0.19619330763816833, + "learning_rate": 1.7490658841907497e-05, + "loss": 0.0709, + "step": 31300 + }, + { + "epoch": 1.9621482734849909, + "grad_norm": 2.6561007499694824, + "learning_rate": 1.7480103861012014e-05, + "loss": 0.0741, + "step": 31310 + }, + { + "epoch": 1.9627749576988156, + "grad_norm": 0.20458722114562988, + "learning_rate": 1.746954888011653e-05, + "loss": 0.1059, + "step": 31320 + }, + { + "epoch": 1.96340164191264, + "grad_norm": 0.01259886845946312, + "learning_rate": 1.7458993899221043e-05, + "loss": 0.2263, + "step": 31330 + }, + { + "epoch": 1.9640283261264648, + "grad_norm": 0.17994363605976105, + "learning_rate": 1.7448438918325557e-05, + "loss": 0.044, + "step": 31340 + }, + { + "epoch": 1.9646550103402896, + "grad_norm": 0.0707567110657692, + "learning_rate": 1.7437883937430073e-05, + "loss": 0.0222, + "step": 31350 + }, + { + "epoch": 1.9652816945541143, + "grad_norm": 0.019019365310668945, + "learning_rate": 1.742732895653459e-05, + "loss": 0.0041, + "step": 31360 + }, + { + "epoch": 1.965908378767939, + "grad_norm": 0.030839603394269943, + "learning_rate": 1.7416773975639107e-05, + "loss": 0.1864, + "step": 31370 + }, + { + "epoch": 1.9665350629817635, + "grad_norm": 0.028285950422286987, + "learning_rate": 1.740621899474362e-05, + "loss": 0.0039, + "step": 31380 + }, + { + "epoch": 1.967161747195588, + "grad_norm": 0.055145930498838425, + "learning_rate": 1.7395664013848137e-05, + "loss": 0.0156, + "step": 31390 + }, + { + "epoch": 1.9677884314094127, + "grad_norm": 0.01977921463549137, + "learning_rate": 1.738510903295265e-05, + "loss": 0.1479, + "step": 31400 + }, + { + "epoch": 1.9684151156232375, + "grad_norm": 0.034482572227716446, + "learning_rate": 1.7374554052057167e-05, + "loss": 0.0572, + "step": 31410 + }, + { + "epoch": 1.9690417998370622, + "grad_norm": 6.365627288818359, + "learning_rate": 1.7363999071161683e-05, + "loss": 0.1508, + "step": 31420 + }, + { + "epoch": 1.969668484050887, + "grad_norm": 0.18992501497268677, + "learning_rate": 1.7353444090266196e-05, + "loss": 0.1212, + "step": 31430 + }, + { + "epoch": 1.9702951682647114, + "grad_norm": 95.36531066894531, + "learning_rate": 1.7342889109370713e-05, + "loss": 0.0771, + "step": 31440 + }, + { + "epoch": 1.970921852478536, + "grad_norm": 0.13923226296901703, + "learning_rate": 1.7332334128475226e-05, + "loss": 0.074, + "step": 31450 + }, + { + "epoch": 1.9715485366923606, + "grad_norm": 0.07367946207523346, + "learning_rate": 1.7321779147579746e-05, + "loss": 0.244, + "step": 31460 + }, + { + "epoch": 1.9721752209061854, + "grad_norm": 0.11902827024459839, + "learning_rate": 1.731122416668426e-05, + "loss": 0.1009, + "step": 31470 + }, + { + "epoch": 1.97280190512001, + "grad_norm": 4.948719501495361, + "learning_rate": 1.7300669185788776e-05, + "loss": 0.0484, + "step": 31480 + }, + { + "epoch": 1.9734285893338348, + "grad_norm": 6.367574691772461, + "learning_rate": 1.729011420489329e-05, + "loss": 0.0909, + "step": 31490 + }, + { + "epoch": 1.9740552735476593, + "grad_norm": 0.18085089325904846, + "learning_rate": 1.7279559223997803e-05, + "loss": 0.1165, + "step": 31500 + }, + { + "epoch": 1.974681957761484, + "grad_norm": 1.338959813117981, + "learning_rate": 1.7269004243102323e-05, + "loss": 0.044, + "step": 31510 + }, + { + "epoch": 1.9753086419753085, + "grad_norm": 0.16457167267799377, + "learning_rate": 1.7258449262206836e-05, + "loss": 0.1149, + "step": 31520 + }, + { + "epoch": 1.9759353261891333, + "grad_norm": 0.05906875804066658, + "learning_rate": 1.7247894281311353e-05, + "loss": 0.0059, + "step": 31530 + }, + { + "epoch": 1.976562010402958, + "grad_norm": 0.09920290857553482, + "learning_rate": 1.7237339300415866e-05, + "loss": 0.3544, + "step": 31540 + }, + { + "epoch": 1.9771886946167827, + "grad_norm": 0.11022662371397018, + "learning_rate": 1.7226784319520383e-05, + "loss": 0.1279, + "step": 31550 + }, + { + "epoch": 1.9778153788306072, + "grad_norm": 0.21044260263442993, + "learning_rate": 1.72162293386249e-05, + "loss": 0.2122, + "step": 31560 + }, + { + "epoch": 1.978442063044432, + "grad_norm": 3.5601449012756348, + "learning_rate": 1.7205674357729413e-05, + "loss": 0.0426, + "step": 31570 + }, + { + "epoch": 1.9790687472582564, + "grad_norm": 0.15041138231754303, + "learning_rate": 1.719511937683393e-05, + "loss": 0.0905, + "step": 31580 + }, + { + "epoch": 1.9796954314720812, + "grad_norm": 0.2093459963798523, + "learning_rate": 1.7184564395938443e-05, + "loss": 0.151, + "step": 31590 + }, + { + "epoch": 1.9803221156859059, + "grad_norm": 0.0759732648730278, + "learning_rate": 1.717400941504296e-05, + "loss": 0.0065, + "step": 31600 + }, + { + "epoch": 1.9809487998997306, + "grad_norm": 6.305356502532959, + "learning_rate": 1.7163454434147476e-05, + "loss": 0.2713, + "step": 31610 + }, + { + "epoch": 1.9815754841135553, + "grad_norm": 0.15519599616527557, + "learning_rate": 1.7152899453251992e-05, + "loss": 0.0993, + "step": 31620 + }, + { + "epoch": 1.9822021683273798, + "grad_norm": 0.022300608456134796, + "learning_rate": 1.7142344472356506e-05, + "loss": 0.1047, + "step": 31630 + }, + { + "epoch": 1.9828288525412043, + "grad_norm": 0.06441289931535721, + "learning_rate": 1.713178949146102e-05, + "loss": 0.0446, + "step": 31640 + }, + { + "epoch": 1.983455536755029, + "grad_norm": 0.13338226079940796, + "learning_rate": 1.7121234510565536e-05, + "loss": 0.0501, + "step": 31650 + }, + { + "epoch": 1.9840822209688538, + "grad_norm": 0.06936602294445038, + "learning_rate": 1.7110679529670052e-05, + "loss": 0.0919, + "step": 31660 + }, + { + "epoch": 1.9847089051826785, + "grad_norm": 0.03395620360970497, + "learning_rate": 1.710012454877457e-05, + "loss": 0.1159, + "step": 31670 + }, + { + "epoch": 1.9853355893965032, + "grad_norm": 3.857844591140747, + "learning_rate": 1.7089569567879082e-05, + "loss": 0.0358, + "step": 31680 + }, + { + "epoch": 1.9859622736103277, + "grad_norm": 12.544042587280273, + "learning_rate": 1.70790145869836e-05, + "loss": 0.1129, + "step": 31690 + }, + { + "epoch": 1.9865889578241525, + "grad_norm": 0.047209981828927994, + "learning_rate": 1.7068459606088112e-05, + "loss": 0.0967, + "step": 31700 + }, + { + "epoch": 1.987215642037977, + "grad_norm": 0.01187932025641203, + "learning_rate": 1.7057904625192632e-05, + "loss": 0.0957, + "step": 31710 + }, + { + "epoch": 1.9878423262518017, + "grad_norm": 0.009124066680669785, + "learning_rate": 1.7047349644297145e-05, + "loss": 0.0752, + "step": 31720 + }, + { + "epoch": 1.9884690104656264, + "grad_norm": 0.06783958524465561, + "learning_rate": 1.703679466340166e-05, + "loss": 0.0324, + "step": 31730 + }, + { + "epoch": 1.9890956946794511, + "grad_norm": 4.436092376708984, + "learning_rate": 1.7026239682506175e-05, + "loss": 0.0974, + "step": 31740 + }, + { + "epoch": 1.9897223788932756, + "grad_norm": 8.211563110351562, + "learning_rate": 1.701568470161069e-05, + "loss": 0.1157, + "step": 31750 + }, + { + "epoch": 1.9903490631071004, + "grad_norm": 6.582380294799805, + "learning_rate": 1.700512972071521e-05, + "loss": 0.2133, + "step": 31760 + }, + { + "epoch": 1.9909757473209249, + "grad_norm": 0.029442772269248962, + "learning_rate": 1.6994574739819722e-05, + "loss": 0.0751, + "step": 31770 + }, + { + "epoch": 1.9916024315347496, + "grad_norm": 10.132109642028809, + "learning_rate": 1.698401975892424e-05, + "loss": 0.1387, + "step": 31780 + }, + { + "epoch": 1.9922291157485743, + "grad_norm": 7.6260576248168945, + "learning_rate": 1.6973464778028752e-05, + "loss": 0.0754, + "step": 31790 + }, + { + "epoch": 1.992855799962399, + "grad_norm": 0.12112826108932495, + "learning_rate": 1.696290979713327e-05, + "loss": 0.0355, + "step": 31800 + }, + { + "epoch": 1.9934824841762238, + "grad_norm": 0.12398527562618256, + "learning_rate": 1.6952354816237785e-05, + "loss": 0.1311, + "step": 31810 + }, + { + "epoch": 1.9941091683900483, + "grad_norm": 0.023617800325155258, + "learning_rate": 1.69417998353423e-05, + "loss": 0.0997, + "step": 31820 + }, + { + "epoch": 1.9947358526038728, + "grad_norm": 0.919659435749054, + "learning_rate": 1.6931244854446815e-05, + "loss": 0.0071, + "step": 31830 + }, + { + "epoch": 1.9953625368176975, + "grad_norm": 0.2467774897813797, + "learning_rate": 1.6920689873551328e-05, + "loss": 0.0057, + "step": 31840 + }, + { + "epoch": 1.9959892210315222, + "grad_norm": 7.434739112854004, + "learning_rate": 1.6910134892655845e-05, + "loss": 0.1482, + "step": 31850 + }, + { + "epoch": 1.996615905245347, + "grad_norm": 0.011252244003117085, + "learning_rate": 1.689957991176036e-05, + "loss": 0.1319, + "step": 31860 + }, + { + "epoch": 1.9972425894591717, + "grad_norm": 0.03588128834962845, + "learning_rate": 1.6889024930864875e-05, + "loss": 0.0033, + "step": 31870 + }, + { + "epoch": 1.9978692736729962, + "grad_norm": 0.03404240682721138, + "learning_rate": 1.687846994996939e-05, + "loss": 0.1253, + "step": 31880 + }, + { + "epoch": 1.9984959578868209, + "grad_norm": 5.999427318572998, + "learning_rate": 1.6867914969073905e-05, + "loss": 0.2037, + "step": 31890 + }, + { + "epoch": 1.9991226421006454, + "grad_norm": 0.19319628179073334, + "learning_rate": 1.685735998817842e-05, + "loss": 0.0178, + "step": 31900 + }, + { + "epoch": 1.99974932631447, + "grad_norm": 8.734156608581543, + "learning_rate": 1.6846805007282938e-05, + "loss": 0.1435, + "step": 31910 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.9659094469685101, + "eval_f1": 0.9651640718639931, + "eval_loss": 0.1303279846906662, + "eval_precision": 0.9648435825911362, + "eval_recall": 0.9659094469685101, + "eval_runtime": 288.0737, + "eval_samples_per_second": 110.788, + "eval_steps_per_second": 13.851, + "step": 31914 + }, + { + "epoch": 2.000376010528295, + "grad_norm": 0.5314622521400452, + "learning_rate": 1.6836250026387455e-05, + "loss": 0.0033, + "step": 31920 + }, + { + "epoch": 2.0010026947421196, + "grad_norm": 0.09729882329702377, + "learning_rate": 1.6825695045491968e-05, + "loss": 0.0044, + "step": 31930 + }, + { + "epoch": 2.0016293789559443, + "grad_norm": 0.02791699394583702, + "learning_rate": 1.6815140064596485e-05, + "loss": 0.0358, + "step": 31940 + }, + { + "epoch": 2.0022560631697686, + "grad_norm": 9.531925201416016, + "learning_rate": 1.6804585083700998e-05, + "loss": 0.0777, + "step": 31950 + }, + { + "epoch": 2.0028827473835933, + "grad_norm": 0.03661125525832176, + "learning_rate": 1.6794030102805515e-05, + "loss": 0.1605, + "step": 31960 + }, + { + "epoch": 2.003509431597418, + "grad_norm": 0.055404722690582275, + "learning_rate": 1.678347512191003e-05, + "loss": 0.1094, + "step": 31970 + }, + { + "epoch": 2.0041361158112427, + "grad_norm": 18.206174850463867, + "learning_rate": 1.6772920141014544e-05, + "loss": 0.0729, + "step": 31980 + }, + { + "epoch": 2.0047628000250675, + "grad_norm": 0.02194293774664402, + "learning_rate": 1.676236516011906e-05, + "loss": 0.0643, + "step": 31990 + }, + { + "epoch": 2.005389484238892, + "grad_norm": 0.046822380274534225, + "learning_rate": 1.6751810179223574e-05, + "loss": 0.0398, + "step": 32000 + }, + { + "epoch": 2.0060161684527165, + "grad_norm": 0.021439900621771812, + "learning_rate": 1.6741255198328094e-05, + "loss": 0.0673, + "step": 32010 + }, + { + "epoch": 2.006642852666541, + "grad_norm": 0.1626669466495514, + "learning_rate": 1.6730700217432608e-05, + "loss": 0.158, + "step": 32020 + }, + { + "epoch": 2.007269536880366, + "grad_norm": 0.24504497647285461, + "learning_rate": 1.672014523653712e-05, + "loss": 0.0311, + "step": 32030 + }, + { + "epoch": 2.0078962210941906, + "grad_norm": 0.02000507339835167, + "learning_rate": 1.6709590255641638e-05, + "loss": 0.0054, + "step": 32040 + }, + { + "epoch": 2.0085229053080154, + "grad_norm": 1.3739451169967651, + "learning_rate": 1.6699035274746154e-05, + "loss": 0.0444, + "step": 32050 + }, + { + "epoch": 2.00914958952184, + "grad_norm": 0.028412913903594017, + "learning_rate": 1.668848029385067e-05, + "loss": 0.0455, + "step": 32060 + }, + { + "epoch": 2.009776273735665, + "grad_norm": 0.13429409265518188, + "learning_rate": 1.6677925312955184e-05, + "loss": 0.0704, + "step": 32070 + }, + { + "epoch": 2.010402957949489, + "grad_norm": 0.25238388776779175, + "learning_rate": 1.66673703320597e-05, + "loss": 0.0777, + "step": 32080 + }, + { + "epoch": 2.011029642163314, + "grad_norm": 0.03479380160570145, + "learning_rate": 1.6656815351164214e-05, + "loss": 0.025, + "step": 32090 + }, + { + "epoch": 2.0116563263771385, + "grad_norm": 0.01721784472465515, + "learning_rate": 1.664626037026873e-05, + "loss": 0.033, + "step": 32100 + }, + { + "epoch": 2.0122830105909633, + "grad_norm": 0.024397028610110283, + "learning_rate": 1.6635705389373247e-05, + "loss": 0.1333, + "step": 32110 + }, + { + "epoch": 2.012909694804788, + "grad_norm": 0.009649792686104774, + "learning_rate": 1.662515040847776e-05, + "loss": 0.0016, + "step": 32120 + }, + { + "epoch": 2.0135363790186127, + "grad_norm": 0.037589482963085175, + "learning_rate": 1.6614595427582277e-05, + "loss": 0.0221, + "step": 32130 + }, + { + "epoch": 2.014163063232437, + "grad_norm": 0.04780086502432823, + "learning_rate": 1.660404044668679e-05, + "loss": 0.1299, + "step": 32140 + }, + { + "epoch": 2.0147897474462617, + "grad_norm": 14.039616584777832, + "learning_rate": 1.6593485465791307e-05, + "loss": 0.0447, + "step": 32150 + }, + { + "epoch": 2.0154164316600864, + "grad_norm": 0.18234844505786896, + "learning_rate": 1.6582930484895824e-05, + "loss": 0.1608, + "step": 32160 + }, + { + "epoch": 2.016043115873911, + "grad_norm": 44.759552001953125, + "learning_rate": 1.657237550400034e-05, + "loss": 0.1685, + "step": 32170 + }, + { + "epoch": 2.016669800087736, + "grad_norm": 6.447089672088623, + "learning_rate": 1.6561820523104854e-05, + "loss": 0.0545, + "step": 32180 + }, + { + "epoch": 2.0172964843015606, + "grad_norm": 8.790861129760742, + "learning_rate": 1.6551265542209367e-05, + "loss": 0.0656, + "step": 32190 + }, + { + "epoch": 2.017923168515385, + "grad_norm": 0.033416662365198135, + "learning_rate": 1.6540710561313884e-05, + "loss": 0.0715, + "step": 32200 + }, + { + "epoch": 2.0185498527292096, + "grad_norm": 0.03852026164531708, + "learning_rate": 1.65301555804184e-05, + "loss": 0.0336, + "step": 32210 + }, + { + "epoch": 2.0191765369430343, + "grad_norm": 0.06000113487243652, + "learning_rate": 1.6519600599522917e-05, + "loss": 0.1005, + "step": 32220 + }, + { + "epoch": 2.019803221156859, + "grad_norm": 0.05467413365840912, + "learning_rate": 1.650904561862743e-05, + "loss": 0.0024, + "step": 32230 + }, + { + "epoch": 2.020429905370684, + "grad_norm": 0.5257253646850586, + "learning_rate": 1.6498490637731947e-05, + "loss": 0.1164, + "step": 32240 + }, + { + "epoch": 2.0210565895845085, + "grad_norm": 0.007564585190266371, + "learning_rate": 1.648793565683646e-05, + "loss": 0.0011, + "step": 32250 + }, + { + "epoch": 2.0216832737983332, + "grad_norm": 0.01452536229044199, + "learning_rate": 1.6477380675940977e-05, + "loss": 0.1252, + "step": 32260 + }, + { + "epoch": 2.0223099580121575, + "grad_norm": 0.05928044021129608, + "learning_rate": 1.6466825695045493e-05, + "loss": 0.1359, + "step": 32270 + }, + { + "epoch": 2.0229366422259822, + "grad_norm": 0.030591899529099464, + "learning_rate": 1.6456270714150007e-05, + "loss": 0.1113, + "step": 32280 + }, + { + "epoch": 2.023563326439807, + "grad_norm": 0.016560450196266174, + "learning_rate": 1.6445715733254523e-05, + "loss": 0.0031, + "step": 32290 + }, + { + "epoch": 2.0241900106536317, + "grad_norm": 0.04848318174481392, + "learning_rate": 1.643516075235904e-05, + "loss": 0.1171, + "step": 32300 + }, + { + "epoch": 2.0248166948674564, + "grad_norm": 0.062350522726774216, + "learning_rate": 1.6424605771463557e-05, + "loss": 0.0333, + "step": 32310 + }, + { + "epoch": 2.025443379081281, + "grad_norm": 0.1351897269487381, + "learning_rate": 1.641405079056807e-05, + "loss": 0.0522, + "step": 32320 + }, + { + "epoch": 2.0260700632951054, + "grad_norm": 0.12988221645355225, + "learning_rate": 1.6403495809672587e-05, + "loss": 0.0363, + "step": 32330 + }, + { + "epoch": 2.02669674750893, + "grad_norm": 14.142264366149902, + "learning_rate": 1.63929408287771e-05, + "loss": 0.1323, + "step": 32340 + }, + { + "epoch": 2.027323431722755, + "grad_norm": 0.017919203266501427, + "learning_rate": 1.6382385847881617e-05, + "loss": 0.0035, + "step": 32350 + }, + { + "epoch": 2.0279501159365796, + "grad_norm": 0.04357292130589485, + "learning_rate": 1.6371830866986133e-05, + "loss": 0.0871, + "step": 32360 + }, + { + "epoch": 2.0285768001504043, + "grad_norm": 0.6641724705696106, + "learning_rate": 1.6361275886090646e-05, + "loss": 0.0236, + "step": 32370 + }, + { + "epoch": 2.029203484364229, + "grad_norm": 0.39285650849342346, + "learning_rate": 1.6350720905195163e-05, + "loss": 0.0043, + "step": 32380 + }, + { + "epoch": 2.0298301685780533, + "grad_norm": 0.14640651643276215, + "learning_rate": 1.6340165924299676e-05, + "loss": 0.0778, + "step": 32390 + }, + { + "epoch": 2.030456852791878, + "grad_norm": 0.006762874778360128, + "learning_rate": 1.6329610943404193e-05, + "loss": 0.1194, + "step": 32400 + }, + { + "epoch": 2.0310835370057028, + "grad_norm": 0.7191759943962097, + "learning_rate": 1.631905596250871e-05, + "loss": 0.0866, + "step": 32410 + }, + { + "epoch": 2.0317102212195275, + "grad_norm": 0.007037244271486998, + "learning_rate": 1.6308500981613223e-05, + "loss": 0.1469, + "step": 32420 + }, + { + "epoch": 2.032336905433352, + "grad_norm": 0.46012216806411743, + "learning_rate": 1.629794600071774e-05, + "loss": 0.0055, + "step": 32430 + }, + { + "epoch": 2.032963589647177, + "grad_norm": 10.74429702758789, + "learning_rate": 1.6287391019822253e-05, + "loss": 0.1375, + "step": 32440 + }, + { + "epoch": 2.0335902738610017, + "grad_norm": 0.01474764384329319, + "learning_rate": 1.627683603892677e-05, + "loss": 0.0703, + "step": 32450 + }, + { + "epoch": 2.034216958074826, + "grad_norm": 0.51497882604599, + "learning_rate": 1.6266281058031286e-05, + "loss": 0.0588, + "step": 32460 + }, + { + "epoch": 2.0348436422886507, + "grad_norm": 5.204861164093018, + "learning_rate": 1.6255726077135803e-05, + "loss": 0.0556, + "step": 32470 + }, + { + "epoch": 2.0354703265024754, + "grad_norm": 0.11711680889129639, + "learning_rate": 1.6245171096240316e-05, + "loss": 0.0082, + "step": 32480 + }, + { + "epoch": 2.0360970107163, + "grad_norm": 0.026732532307505608, + "learning_rate": 1.623461611534483e-05, + "loss": 0.0287, + "step": 32490 + }, + { + "epoch": 2.036723694930125, + "grad_norm": 0.21844738721847534, + "learning_rate": 1.6224061134449346e-05, + "loss": 0.1675, + "step": 32500 + }, + { + "epoch": 2.0373503791439496, + "grad_norm": 0.10609474033117294, + "learning_rate": 1.6213506153553863e-05, + "loss": 0.0282, + "step": 32510 + }, + { + "epoch": 2.037977063357774, + "grad_norm": 0.40876927971839905, + "learning_rate": 1.620295117265838e-05, + "loss": 0.0028, + "step": 32520 + }, + { + "epoch": 2.0386037475715986, + "grad_norm": 0.03981143981218338, + "learning_rate": 1.6192396191762893e-05, + "loss": 0.0257, + "step": 32530 + }, + { + "epoch": 2.0392304317854233, + "grad_norm": 0.20432570576667786, + "learning_rate": 1.618184121086741e-05, + "loss": 0.1235, + "step": 32540 + }, + { + "epoch": 2.039857115999248, + "grad_norm": 0.09158885478973389, + "learning_rate": 1.6171286229971926e-05, + "loss": 0.0567, + "step": 32550 + }, + { + "epoch": 2.0404838002130727, + "grad_norm": 0.09947940707206726, + "learning_rate": 1.6160731249076442e-05, + "loss": 0.1098, + "step": 32560 + }, + { + "epoch": 2.0411104844268975, + "grad_norm": 0.004586049355566502, + "learning_rate": 1.6150176268180956e-05, + "loss": 0.0011, + "step": 32570 + }, + { + "epoch": 2.0417371686407217, + "grad_norm": 0.02537638321518898, + "learning_rate": 1.613962128728547e-05, + "loss": 0.0755, + "step": 32580 + }, + { + "epoch": 2.0423638528545465, + "grad_norm": 3.6461362838745117, + "learning_rate": 1.6129066306389986e-05, + "loss": 0.0686, + "step": 32590 + }, + { + "epoch": 2.042990537068371, + "grad_norm": 0.021860415115952492, + "learning_rate": 1.6118511325494502e-05, + "loss": 0.0257, + "step": 32600 + }, + { + "epoch": 2.043617221282196, + "grad_norm": 0.004400935955345631, + "learning_rate": 1.610795634459902e-05, + "loss": 0.1238, + "step": 32610 + }, + { + "epoch": 2.0442439054960206, + "grad_norm": 0.009431498125195503, + "learning_rate": 1.6097401363703532e-05, + "loss": 0.0238, + "step": 32620 + }, + { + "epoch": 2.0448705897098454, + "grad_norm": 0.026406893506646156, + "learning_rate": 1.608684638280805e-05, + "loss": 0.1584, + "step": 32630 + }, + { + "epoch": 2.0454972739236696, + "grad_norm": 0.01052199024707079, + "learning_rate": 1.6076291401912562e-05, + "loss": 0.0757, + "step": 32640 + }, + { + "epoch": 2.0461239581374944, + "grad_norm": 0.012562397867441177, + "learning_rate": 1.606573642101708e-05, + "loss": 0.0015, + "step": 32650 + }, + { + "epoch": 2.046750642351319, + "grad_norm": 6.99421501159668, + "learning_rate": 1.6055181440121595e-05, + "loss": 0.1914, + "step": 32660 + }, + { + "epoch": 2.047377326565144, + "grad_norm": 0.0524737723171711, + "learning_rate": 1.604462645922611e-05, + "loss": 0.0706, + "step": 32670 + }, + { + "epoch": 2.0480040107789685, + "grad_norm": 0.019907524809241295, + "learning_rate": 1.6034071478330625e-05, + "loss": 0.0021, + "step": 32680 + }, + { + "epoch": 2.0486306949927933, + "grad_norm": 0.44236546754837036, + "learning_rate": 1.602351649743514e-05, + "loss": 0.1079, + "step": 32690 + }, + { + "epoch": 2.049257379206618, + "grad_norm": 0.02534150891005993, + "learning_rate": 1.6012961516539655e-05, + "loss": 0.0519, + "step": 32700 + }, + { + "epoch": 2.0498840634204423, + "grad_norm": 0.19184216856956482, + "learning_rate": 1.6002406535644172e-05, + "loss": 0.0912, + "step": 32710 + }, + { + "epoch": 2.050510747634267, + "grad_norm": 0.3994498550891876, + "learning_rate": 1.599185155474869e-05, + "loss": 0.0373, + "step": 32720 + }, + { + "epoch": 2.0511374318480917, + "grad_norm": 0.004602975212037563, + "learning_rate": 1.5981296573853202e-05, + "loss": 0.0745, + "step": 32730 + }, + { + "epoch": 2.0517641160619164, + "grad_norm": 0.057611722499132156, + "learning_rate": 1.5970741592957715e-05, + "loss": 0.0013, + "step": 32740 + }, + { + "epoch": 2.052390800275741, + "grad_norm": 0.004064835608005524, + "learning_rate": 1.5960186612062232e-05, + "loss": 0.0731, + "step": 32750 + }, + { + "epoch": 2.053017484489566, + "grad_norm": 0.04619257152080536, + "learning_rate": 1.594963163116675e-05, + "loss": 0.0281, + "step": 32760 + }, + { + "epoch": 2.05364416870339, + "grad_norm": 0.8780233263969421, + "learning_rate": 1.5939076650271265e-05, + "loss": 0.1676, + "step": 32770 + }, + { + "epoch": 2.054270852917215, + "grad_norm": 4.63099479675293, + "learning_rate": 1.592852166937578e-05, + "loss": 0.0789, + "step": 32780 + }, + { + "epoch": 2.0548975371310396, + "grad_norm": 0.012038362212479115, + "learning_rate": 1.5917966688480295e-05, + "loss": 0.0009, + "step": 32790 + }, + { + "epoch": 2.0555242213448643, + "grad_norm": 0.006544423755258322, + "learning_rate": 1.590741170758481e-05, + "loss": 0.0943, + "step": 32800 + }, + { + "epoch": 2.056150905558689, + "grad_norm": 0.023845335468649864, + "learning_rate": 1.5896856726689325e-05, + "loss": 0.0077, + "step": 32810 + }, + { + "epoch": 2.056777589772514, + "grad_norm": 21.74514389038086, + "learning_rate": 1.588630174579384e-05, + "loss": 0.0326, + "step": 32820 + }, + { + "epoch": 2.057404273986338, + "grad_norm": 0.04374400153756142, + "learning_rate": 1.5875746764898355e-05, + "loss": 0.3699, + "step": 32830 + }, + { + "epoch": 2.058030958200163, + "grad_norm": 0.160582035779953, + "learning_rate": 1.586519178400287e-05, + "loss": 0.1196, + "step": 32840 + }, + { + "epoch": 2.0586576424139875, + "grad_norm": 0.36026671528816223, + "learning_rate": 1.5854636803107388e-05, + "loss": 0.0051, + "step": 32850 + }, + { + "epoch": 2.0592843266278122, + "grad_norm": 0.07786229997873306, + "learning_rate": 1.5844081822211905e-05, + "loss": 0.0447, + "step": 32860 + }, + { + "epoch": 2.059911010841637, + "grad_norm": 0.37609755992889404, + "learning_rate": 1.5833526841316418e-05, + "loss": 0.0557, + "step": 32870 + }, + { + "epoch": 2.0605376950554617, + "grad_norm": 3.595386028289795, + "learning_rate": 1.582297186042093e-05, + "loss": 0.1036, + "step": 32880 + }, + { + "epoch": 2.0611643792692864, + "grad_norm": 0.2091987282037735, + "learning_rate": 1.5812416879525448e-05, + "loss": 0.1075, + "step": 32890 + }, + { + "epoch": 2.0617910634831107, + "grad_norm": 0.09946735948324203, + "learning_rate": 1.5801861898629965e-05, + "loss": 0.0028, + "step": 32900 + }, + { + "epoch": 2.0624177476969354, + "grad_norm": 0.05795929208397865, + "learning_rate": 1.579130691773448e-05, + "loss": 0.0835, + "step": 32910 + }, + { + "epoch": 2.06304443191076, + "grad_norm": 0.017441829666495323, + "learning_rate": 1.5780751936838994e-05, + "loss": 0.128, + "step": 32920 + }, + { + "epoch": 2.063671116124585, + "grad_norm": 0.10752920806407928, + "learning_rate": 1.577019695594351e-05, + "loss": 0.042, + "step": 32930 + }, + { + "epoch": 2.0642978003384096, + "grad_norm": 0.06938701122999191, + "learning_rate": 1.5759641975048024e-05, + "loss": 0.0017, + "step": 32940 + }, + { + "epoch": 2.0649244845522343, + "grad_norm": 0.4177788496017456, + "learning_rate": 1.574908699415254e-05, + "loss": 0.0966, + "step": 32950 + }, + { + "epoch": 2.0655511687660586, + "grad_norm": 0.1389980912208557, + "learning_rate": 1.5738532013257058e-05, + "loss": 0.0348, + "step": 32960 + }, + { + "epoch": 2.0661778529798833, + "grad_norm": 0.0164639949798584, + "learning_rate": 1.572797703236157e-05, + "loss": 0.0025, + "step": 32970 + }, + { + "epoch": 2.066804537193708, + "grad_norm": 0.043135009706020355, + "learning_rate": 1.5717422051466088e-05, + "loss": 0.0307, + "step": 32980 + }, + { + "epoch": 2.0674312214075328, + "grad_norm": 0.011760778725147247, + "learning_rate": 1.57068670705706e-05, + "loss": 0.0655, + "step": 32990 + }, + { + "epoch": 2.0680579056213575, + "grad_norm": 5.448158264160156, + "learning_rate": 1.569631208967512e-05, + "loss": 0.0796, + "step": 33000 + }, + { + "epoch": 2.068684589835182, + "grad_norm": 1.601997971534729, + "learning_rate": 1.5685757108779634e-05, + "loss": 0.0354, + "step": 33010 + }, + { + "epoch": 2.0693112740490065, + "grad_norm": 0.006840604357421398, + "learning_rate": 1.567520212788415e-05, + "loss": 0.1773, + "step": 33020 + }, + { + "epoch": 2.0699379582628312, + "grad_norm": 11.032891273498535, + "learning_rate": 1.5664647146988664e-05, + "loss": 0.1046, + "step": 33030 + }, + { + "epoch": 2.070564642476656, + "grad_norm": 0.19698861241340637, + "learning_rate": 1.5654092166093177e-05, + "loss": 0.2586, + "step": 33040 + }, + { + "epoch": 2.0711913266904807, + "grad_norm": 0.009995955042541027, + "learning_rate": 1.5643537185197697e-05, + "loss": 0.1307, + "step": 33050 + }, + { + "epoch": 2.0718180109043054, + "grad_norm": 0.4533624053001404, + "learning_rate": 1.563298220430221e-05, + "loss": 0.057, + "step": 33060 + }, + { + "epoch": 2.07244469511813, + "grad_norm": 0.14136387407779694, + "learning_rate": 1.5622427223406727e-05, + "loss": 0.1027, + "step": 33070 + }, + { + "epoch": 2.073071379331955, + "grad_norm": 0.011321029625833035, + "learning_rate": 1.561187224251124e-05, + "loss": 0.0377, + "step": 33080 + }, + { + "epoch": 2.073698063545779, + "grad_norm": 0.09532028436660767, + "learning_rate": 1.5601317261615757e-05, + "loss": 0.0028, + "step": 33090 + }, + { + "epoch": 2.074324747759604, + "grad_norm": 0.013551967218518257, + "learning_rate": 1.5590762280720274e-05, + "loss": 0.3631, + "step": 33100 + }, + { + "epoch": 2.0749514319734286, + "grad_norm": 0.023686746135354042, + "learning_rate": 1.558020729982479e-05, + "loss": 0.0351, + "step": 33110 + }, + { + "epoch": 2.0755781161872533, + "grad_norm": 0.0980924665927887, + "learning_rate": 1.5569652318929304e-05, + "loss": 0.0031, + "step": 33120 + }, + { + "epoch": 2.076204800401078, + "grad_norm": 0.024400917813181877, + "learning_rate": 1.5559097338033817e-05, + "loss": 0.0048, + "step": 33130 + }, + { + "epoch": 2.0768314846149027, + "grad_norm": 0.01752445660531521, + "learning_rate": 1.5548542357138334e-05, + "loss": 0.0031, + "step": 33140 + }, + { + "epoch": 2.077458168828727, + "grad_norm": 11.409141540527344, + "learning_rate": 1.553798737624285e-05, + "loss": 0.1458, + "step": 33150 + }, + { + "epoch": 2.0780848530425517, + "grad_norm": 0.10126788914203644, + "learning_rate": 1.5527432395347367e-05, + "loss": 0.0606, + "step": 33160 + }, + { + "epoch": 2.0787115372563765, + "grad_norm": 0.03489801287651062, + "learning_rate": 1.551687741445188e-05, + "loss": 0.004, + "step": 33170 + }, + { + "epoch": 2.079338221470201, + "grad_norm": 0.07026004046201706, + "learning_rate": 1.5506322433556397e-05, + "loss": 0.1357, + "step": 33180 + }, + { + "epoch": 2.079964905684026, + "grad_norm": 0.2072756290435791, + "learning_rate": 1.549576745266091e-05, + "loss": 0.0958, + "step": 33190 + }, + { + "epoch": 2.0805915898978506, + "grad_norm": 9.07013988494873, + "learning_rate": 1.5485212471765427e-05, + "loss": 0.2482, + "step": 33200 + }, + { + "epoch": 2.081218274111675, + "grad_norm": 4.005678653717041, + "learning_rate": 1.5474657490869943e-05, + "loss": 0.2936, + "step": 33210 + }, + { + "epoch": 2.0818449583254997, + "grad_norm": 0.03268539160490036, + "learning_rate": 1.5464102509974457e-05, + "loss": 0.1032, + "step": 33220 + }, + { + "epoch": 2.0824716425393244, + "grad_norm": 0.9119426608085632, + "learning_rate": 1.5453547529078973e-05, + "loss": 0.0428, + "step": 33230 + }, + { + "epoch": 2.083098326753149, + "grad_norm": 0.14467692375183105, + "learning_rate": 1.5442992548183487e-05, + "loss": 0.0198, + "step": 33240 + }, + { + "epoch": 2.083725010966974, + "grad_norm": 0.19485610723495483, + "learning_rate": 1.5432437567288007e-05, + "loss": 0.0031, + "step": 33250 + }, + { + "epoch": 2.0843516951807985, + "grad_norm": 0.09321003407239914, + "learning_rate": 1.542188258639252e-05, + "loss": 0.1007, + "step": 33260 + }, + { + "epoch": 2.084978379394623, + "grad_norm": 0.01489013247191906, + "learning_rate": 1.5411327605497033e-05, + "loss": 0.0854, + "step": 33270 + }, + { + "epoch": 2.0856050636084476, + "grad_norm": 0.022184649482369423, + "learning_rate": 1.540077262460155e-05, + "loss": 0.0329, + "step": 33280 + }, + { + "epoch": 2.0862317478222723, + "grad_norm": 0.942670464515686, + "learning_rate": 1.5390217643706063e-05, + "loss": 0.1323, + "step": 33290 + }, + { + "epoch": 2.086858432036097, + "grad_norm": 1.3526219129562378, + "learning_rate": 1.5379662662810583e-05, + "loss": 0.1154, + "step": 33300 + }, + { + "epoch": 2.0874851162499217, + "grad_norm": 0.050650764256715775, + "learning_rate": 1.5369107681915096e-05, + "loss": 0.0038, + "step": 33310 + }, + { + "epoch": 2.0881118004637464, + "grad_norm": 0.15342023968696594, + "learning_rate": 1.5358552701019613e-05, + "loss": 0.0759, + "step": 33320 + }, + { + "epoch": 2.088738484677571, + "grad_norm": 0.021919699385762215, + "learning_rate": 1.5347997720124126e-05, + "loss": 0.1006, + "step": 33330 + }, + { + "epoch": 2.0893651688913955, + "grad_norm": 53.63493728637695, + "learning_rate": 1.5337442739228643e-05, + "loss": 0.1111, + "step": 33340 + }, + { + "epoch": 2.08999185310522, + "grad_norm": 4.8673810958862305, + "learning_rate": 1.532688775833316e-05, + "loss": 0.0968, + "step": 33350 + }, + { + "epoch": 2.090618537319045, + "grad_norm": 0.17812536656856537, + "learning_rate": 1.5316332777437673e-05, + "loss": 0.0357, + "step": 33360 + }, + { + "epoch": 2.0912452215328696, + "grad_norm": 0.13300246000289917, + "learning_rate": 1.530577779654219e-05, + "loss": 0.0692, + "step": 33370 + }, + { + "epoch": 2.0918719057466943, + "grad_norm": 0.08312509953975677, + "learning_rate": 1.5295222815646703e-05, + "loss": 0.0561, + "step": 33380 + }, + { + "epoch": 2.092498589960519, + "grad_norm": 0.06152992323040962, + "learning_rate": 1.528466783475122e-05, + "loss": 0.0272, + "step": 33390 + }, + { + "epoch": 2.0931252741743434, + "grad_norm": 0.13552305102348328, + "learning_rate": 1.5274112853855736e-05, + "loss": 0.1758, + "step": 33400 + }, + { + "epoch": 2.093751958388168, + "grad_norm": 0.19440507888793945, + "learning_rate": 1.5263557872960253e-05, + "loss": 0.0442, + "step": 33410 + }, + { + "epoch": 2.094378642601993, + "grad_norm": 21.867822647094727, + "learning_rate": 1.5253002892064766e-05, + "loss": 0.0412, + "step": 33420 + }, + { + "epoch": 2.0950053268158175, + "grad_norm": 9.543660163879395, + "learning_rate": 1.5242447911169281e-05, + "loss": 0.0348, + "step": 33430 + }, + { + "epoch": 2.0956320110296422, + "grad_norm": 0.027133401483297348, + "learning_rate": 1.5231892930273798e-05, + "loss": 0.0583, + "step": 33440 + }, + { + "epoch": 2.096258695243467, + "grad_norm": 0.3416101038455963, + "learning_rate": 1.5221337949378311e-05, + "loss": 0.0022, + "step": 33450 + }, + { + "epoch": 2.0968853794572917, + "grad_norm": 8.35288143157959, + "learning_rate": 1.521078296848283e-05, + "loss": 0.0551, + "step": 33460 + }, + { + "epoch": 2.097512063671116, + "grad_norm": 0.008955095894634724, + "learning_rate": 1.5200227987587343e-05, + "loss": 0.0178, + "step": 33470 + }, + { + "epoch": 2.0981387478849407, + "grad_norm": 6.15145206451416, + "learning_rate": 1.518967300669186e-05, + "loss": 0.1334, + "step": 33480 + }, + { + "epoch": 2.0987654320987654, + "grad_norm": 0.016260214149951935, + "learning_rate": 1.5179118025796374e-05, + "loss": 0.026, + "step": 33490 + }, + { + "epoch": 2.09939211631259, + "grad_norm": 0.5407869219779968, + "learning_rate": 1.5168563044900887e-05, + "loss": 0.0199, + "step": 33500 + }, + { + "epoch": 2.100018800526415, + "grad_norm": 0.01398592721670866, + "learning_rate": 1.5158008064005406e-05, + "loss": 0.0227, + "step": 33510 + }, + { + "epoch": 2.1006454847402396, + "grad_norm": 0.0240841805934906, + "learning_rate": 1.5147453083109919e-05, + "loss": 0.2086, + "step": 33520 + }, + { + "epoch": 2.101272168954064, + "grad_norm": 4.998007297515869, + "learning_rate": 1.5136898102214436e-05, + "loss": 0.0781, + "step": 33530 + }, + { + "epoch": 2.1018988531678886, + "grad_norm": 1.8973308801651, + "learning_rate": 1.512634312131895e-05, + "loss": 0.1497, + "step": 33540 + }, + { + "epoch": 2.1025255373817133, + "grad_norm": 0.041549600660800934, + "learning_rate": 1.5115788140423467e-05, + "loss": 0.0448, + "step": 33550 + }, + { + "epoch": 2.103152221595538, + "grad_norm": 1.6395976543426514, + "learning_rate": 1.5105233159527982e-05, + "loss": 0.0626, + "step": 33560 + }, + { + "epoch": 2.1037789058093628, + "grad_norm": 0.01420541387051344, + "learning_rate": 1.5094678178632499e-05, + "loss": 0.04, + "step": 33570 + }, + { + "epoch": 2.1044055900231875, + "grad_norm": 5.16315221786499, + "learning_rate": 1.5084123197737012e-05, + "loss": 0.145, + "step": 33580 + }, + { + "epoch": 2.105032274237012, + "grad_norm": 0.022046558558940887, + "learning_rate": 1.5073568216841527e-05, + "loss": 0.0271, + "step": 33590 + }, + { + "epoch": 2.1056589584508365, + "grad_norm": 0.05051284655928612, + "learning_rate": 1.5063013235946044e-05, + "loss": 0.0417, + "step": 33600 + }, + { + "epoch": 2.1062856426646612, + "grad_norm": 0.23544947803020477, + "learning_rate": 1.5052458255050559e-05, + "loss": 0.0307, + "step": 33610 + }, + { + "epoch": 2.106912326878486, + "grad_norm": 0.01835622452199459, + "learning_rate": 1.5041903274155075e-05, + "loss": 0.0437, + "step": 33620 + }, + { + "epoch": 2.1075390110923107, + "grad_norm": 0.07070131599903107, + "learning_rate": 1.503134829325959e-05, + "loss": 0.056, + "step": 33630 + }, + { + "epoch": 2.1081656953061354, + "grad_norm": 0.02240445651113987, + "learning_rate": 1.5020793312364107e-05, + "loss": 0.1872, + "step": 33640 + }, + { + "epoch": 2.1087923795199597, + "grad_norm": 0.16485477983951569, + "learning_rate": 1.501023833146862e-05, + "loss": 0.0741, + "step": 33650 + }, + { + "epoch": 2.1094190637337844, + "grad_norm": 6.441537857055664, + "learning_rate": 1.4999683350573135e-05, + "loss": 0.091, + "step": 33660 + }, + { + "epoch": 2.110045747947609, + "grad_norm": 7.90993070602417, + "learning_rate": 1.4989128369677652e-05, + "loss": 0.037, + "step": 33670 + }, + { + "epoch": 2.110672432161434, + "grad_norm": 0.027580685913562775, + "learning_rate": 1.4978573388782167e-05, + "loss": 0.0665, + "step": 33680 + }, + { + "epoch": 2.1112991163752586, + "grad_norm": 0.021920882165431976, + "learning_rate": 1.4968018407886683e-05, + "loss": 0.0015, + "step": 33690 + }, + { + "epoch": 2.1119258005890833, + "grad_norm": 0.0395321287214756, + "learning_rate": 1.4957463426991197e-05, + "loss": 0.1043, + "step": 33700 + }, + { + "epoch": 2.112552484802908, + "grad_norm": 0.015341048128902912, + "learning_rate": 1.4946908446095715e-05, + "loss": 0.0992, + "step": 33710 + }, + { + "epoch": 2.1131791690167323, + "grad_norm": 0.032132405787706375, + "learning_rate": 1.4936353465200228e-05, + "loss": 0.0482, + "step": 33720 + }, + { + "epoch": 2.113805853230557, + "grad_norm": 0.016636233776807785, + "learning_rate": 1.4925798484304745e-05, + "loss": 0.099, + "step": 33730 + }, + { + "epoch": 2.1144325374443818, + "grad_norm": 0.021241918206214905, + "learning_rate": 1.491524350340926e-05, + "loss": 0.0018, + "step": 33740 + }, + { + "epoch": 2.1150592216582065, + "grad_norm": 0.01900082267820835, + "learning_rate": 1.4904688522513773e-05, + "loss": 0.1138, + "step": 33750 + }, + { + "epoch": 2.115685905872031, + "grad_norm": 10.862110137939453, + "learning_rate": 1.4894133541618292e-05, + "loss": 0.0826, + "step": 33760 + }, + { + "epoch": 2.116312590085856, + "grad_norm": 0.825947642326355, + "learning_rate": 1.4883578560722805e-05, + "loss": 0.0336, + "step": 33770 + }, + { + "epoch": 2.11693927429968, + "grad_norm": 35.69391632080078, + "learning_rate": 1.4873023579827321e-05, + "loss": 0.0781, + "step": 33780 + }, + { + "epoch": 2.117565958513505, + "grad_norm": 0.02140156924724579, + "learning_rate": 1.4862468598931836e-05, + "loss": 0.0309, + "step": 33790 + }, + { + "epoch": 2.1181926427273297, + "grad_norm": 0.17433586716651917, + "learning_rate": 1.4851913618036353e-05, + "loss": 0.001, + "step": 33800 + }, + { + "epoch": 2.1188193269411544, + "grad_norm": 0.011882697232067585, + "learning_rate": 1.4841358637140868e-05, + "loss": 0.0749, + "step": 33810 + }, + { + "epoch": 2.119446011154979, + "grad_norm": 7.203928470611572, + "learning_rate": 1.4830803656245381e-05, + "loss": 0.171, + "step": 33820 + }, + { + "epoch": 2.120072695368804, + "grad_norm": 0.020683668553829193, + "learning_rate": 1.4820248675349898e-05, + "loss": 0.0225, + "step": 33830 + }, + { + "epoch": 2.120699379582628, + "grad_norm": 4.708274841308594, + "learning_rate": 1.4809693694454413e-05, + "loss": 0.1266, + "step": 33840 + }, + { + "epoch": 2.121326063796453, + "grad_norm": 0.2545260190963745, + "learning_rate": 1.479913871355893e-05, + "loss": 0.0061, + "step": 33850 + }, + { + "epoch": 2.1219527480102776, + "grad_norm": 0.06264783442020416, + "learning_rate": 1.4788583732663445e-05, + "loss": 0.0348, + "step": 33860 + }, + { + "epoch": 2.1225794322241023, + "grad_norm": 0.04944666475057602, + "learning_rate": 1.4778028751767961e-05, + "loss": 0.0564, + "step": 33870 + }, + { + "epoch": 2.123206116437927, + "grad_norm": 0.05640007555484772, + "learning_rate": 1.4767473770872476e-05, + "loss": 0.1237, + "step": 33880 + }, + { + "epoch": 2.1238328006517517, + "grad_norm": 0.031840428709983826, + "learning_rate": 1.475691878997699e-05, + "loss": 0.062, + "step": 33890 + }, + { + "epoch": 2.124459484865576, + "grad_norm": 0.7252791523933411, + "learning_rate": 1.4746363809081506e-05, + "loss": 0.0555, + "step": 33900 + }, + { + "epoch": 2.1250861690794007, + "grad_norm": 0.09943141043186188, + "learning_rate": 1.4735808828186021e-05, + "loss": 0.1402, + "step": 33910 + }, + { + "epoch": 2.1257128532932255, + "grad_norm": 0.040643591433763504, + "learning_rate": 1.4725253847290538e-05, + "loss": 0.002, + "step": 33920 + }, + { + "epoch": 2.12633953750705, + "grad_norm": 0.06584443897008896, + "learning_rate": 1.4714698866395053e-05, + "loss": 0.0877, + "step": 33930 + }, + { + "epoch": 2.126966221720875, + "grad_norm": 0.03503456711769104, + "learning_rate": 1.470414388549957e-05, + "loss": 0.0065, + "step": 33940 + }, + { + "epoch": 2.1275929059346996, + "grad_norm": 14.166348457336426, + "learning_rate": 1.4693588904604083e-05, + "loss": 0.143, + "step": 33950 + }, + { + "epoch": 2.1282195901485244, + "grad_norm": 35.32097625732422, + "learning_rate": 1.4683033923708601e-05, + "loss": 0.1444, + "step": 33960 + }, + { + "epoch": 2.1288462743623486, + "grad_norm": 0.06533791869878769, + "learning_rate": 1.4672478942813114e-05, + "loss": 0.0792, + "step": 33970 + }, + { + "epoch": 2.1294729585761734, + "grad_norm": 0.1732708066701889, + "learning_rate": 1.4661923961917629e-05, + "loss": 0.1347, + "step": 33980 + }, + { + "epoch": 2.130099642789998, + "grad_norm": 11.223489761352539, + "learning_rate": 1.4651368981022146e-05, + "loss": 0.0402, + "step": 33990 + }, + { + "epoch": 2.130726327003823, + "grad_norm": 0.30682894587516785, + "learning_rate": 1.4640814000126659e-05, + "loss": 0.0591, + "step": 34000 + }, + { + "epoch": 2.1313530112176475, + "grad_norm": 0.0331999771296978, + "learning_rate": 1.4630259019231177e-05, + "loss": 0.0475, + "step": 34010 + }, + { + "epoch": 2.1319796954314723, + "grad_norm": 16.945070266723633, + "learning_rate": 1.461970403833569e-05, + "loss": 0.0869, + "step": 34020 + }, + { + "epoch": 2.1326063796452965, + "grad_norm": 0.034137096256017685, + "learning_rate": 1.4609149057440207e-05, + "loss": 0.1776, + "step": 34030 + }, + { + "epoch": 2.1332330638591213, + "grad_norm": 2.4567253589630127, + "learning_rate": 1.4598594076544722e-05, + "loss": 0.0038, + "step": 34040 + }, + { + "epoch": 2.133859748072946, + "grad_norm": 0.06618314236402512, + "learning_rate": 1.4588039095649237e-05, + "loss": 0.009, + "step": 34050 + }, + { + "epoch": 2.1344864322867707, + "grad_norm": 0.010522538796067238, + "learning_rate": 1.4577484114753754e-05, + "loss": 0.1303, + "step": 34060 + }, + { + "epoch": 2.1351131165005954, + "grad_norm": 17.482473373413086, + "learning_rate": 1.4566929133858267e-05, + "loss": 0.1436, + "step": 34070 + }, + { + "epoch": 2.13573980071442, + "grad_norm": 0.016001539304852486, + "learning_rate": 1.4556374152962784e-05, + "loss": 0.0007, + "step": 34080 + }, + { + "epoch": 2.136366484928245, + "grad_norm": 0.04239490255713463, + "learning_rate": 1.4545819172067299e-05, + "loss": 0.0855, + "step": 34090 + }, + { + "epoch": 2.136993169142069, + "grad_norm": 0.5522505640983582, + "learning_rate": 1.4535264191171815e-05, + "loss": 0.0036, + "step": 34100 + }, + { + "epoch": 2.137619853355894, + "grad_norm": 0.01569908857345581, + "learning_rate": 1.452470921027633e-05, + "loss": 0.0346, + "step": 34110 + }, + { + "epoch": 2.1382465375697186, + "grad_norm": 1.0340025424957275, + "learning_rate": 1.4514154229380847e-05, + "loss": 0.2189, + "step": 34120 + }, + { + "epoch": 2.1388732217835433, + "grad_norm": 0.019147315993905067, + "learning_rate": 1.4503599248485362e-05, + "loss": 0.0875, + "step": 34130 + }, + { + "epoch": 2.139499905997368, + "grad_norm": 0.047466620802879333, + "learning_rate": 1.4493044267589875e-05, + "loss": 0.0955, + "step": 34140 + }, + { + "epoch": 2.140126590211193, + "grad_norm": 0.020558997988700867, + "learning_rate": 1.4482489286694392e-05, + "loss": 0.1108, + "step": 34150 + }, + { + "epoch": 2.140753274425017, + "grad_norm": 0.02180035598576069, + "learning_rate": 1.4471934305798907e-05, + "loss": 0.0826, + "step": 34160 + }, + { + "epoch": 2.141379958638842, + "grad_norm": 0.024248367175459862, + "learning_rate": 1.4461379324903423e-05, + "loss": 0.0022, + "step": 34170 + }, + { + "epoch": 2.1420066428526665, + "grad_norm": 0.026557935401797295, + "learning_rate": 1.4450824344007938e-05, + "loss": 0.0256, + "step": 34180 + }, + { + "epoch": 2.1426333270664912, + "grad_norm": 0.013800345361232758, + "learning_rate": 1.4440269363112455e-05, + "loss": 0.1043, + "step": 34190 + }, + { + "epoch": 2.143260011280316, + "grad_norm": 0.0492568239569664, + "learning_rate": 1.4429714382216968e-05, + "loss": 0.0024, + "step": 34200 + }, + { + "epoch": 2.1438866954941407, + "grad_norm": 0.08210790902376175, + "learning_rate": 1.4419159401321483e-05, + "loss": 0.0627, + "step": 34210 + }, + { + "epoch": 2.144513379707965, + "grad_norm": 2.664041042327881, + "learning_rate": 1.4408604420426e-05, + "loss": 0.1104, + "step": 34220 + }, + { + "epoch": 2.1451400639217897, + "grad_norm": 0.6660194396972656, + "learning_rate": 1.4398049439530515e-05, + "loss": 0.0381, + "step": 34230 + }, + { + "epoch": 2.1457667481356144, + "grad_norm": 0.07248642295598984, + "learning_rate": 1.4387494458635032e-05, + "loss": 0.0531, + "step": 34240 + }, + { + "epoch": 2.146393432349439, + "grad_norm": 1.3321788311004639, + "learning_rate": 1.4376939477739545e-05, + "loss": 0.0295, + "step": 34250 + }, + { + "epoch": 2.147020116563264, + "grad_norm": 24.10909080505371, + "learning_rate": 1.4366384496844063e-05, + "loss": 0.0952, + "step": 34260 + }, + { + "epoch": 2.1476468007770886, + "grad_norm": 0.015267537906765938, + "learning_rate": 1.4355829515948576e-05, + "loss": 0.0007, + "step": 34270 + }, + { + "epoch": 2.148273484990913, + "grad_norm": 0.04566666856408119, + "learning_rate": 1.4345274535053091e-05, + "loss": 0.0016, + "step": 34280 + }, + { + "epoch": 2.1489001692047376, + "grad_norm": 0.007647485937923193, + "learning_rate": 1.4334719554157608e-05, + "loss": 0.0042, + "step": 34290 + }, + { + "epoch": 2.1495268534185623, + "grad_norm": 0.4660249352455139, + "learning_rate": 1.4324164573262123e-05, + "loss": 0.0145, + "step": 34300 + }, + { + "epoch": 2.150153537632387, + "grad_norm": 0.008947279304265976, + "learning_rate": 1.431360959236664e-05, + "loss": 0.0371, + "step": 34310 + }, + { + "epoch": 2.1507802218462118, + "grad_norm": 0.012975979596376419, + "learning_rate": 1.4303054611471153e-05, + "loss": 0.1095, + "step": 34320 + }, + { + "epoch": 2.1514069060600365, + "grad_norm": 0.011418702080845833, + "learning_rate": 1.429249963057567e-05, + "loss": 0.2002, + "step": 34330 + }, + { + "epoch": 2.152033590273861, + "grad_norm": 0.35795190930366516, + "learning_rate": 1.4281944649680184e-05, + "loss": 0.0012, + "step": 34340 + }, + { + "epoch": 2.1526602744876855, + "grad_norm": 0.06372092664241791, + "learning_rate": 1.4271389668784701e-05, + "loss": 0.0485, + "step": 34350 + }, + { + "epoch": 2.15328695870151, + "grad_norm": 0.06595498323440552, + "learning_rate": 1.4260834687889216e-05, + "loss": 0.0305, + "step": 34360 + }, + { + "epoch": 2.153913642915335, + "grad_norm": 0.010245050303637981, + "learning_rate": 1.425027970699373e-05, + "loss": 0.0007, + "step": 34370 + }, + { + "epoch": 2.1545403271291597, + "grad_norm": 0.04346427321434021, + "learning_rate": 1.4239724726098248e-05, + "loss": 0.0013, + "step": 34380 + }, + { + "epoch": 2.1551670113429844, + "grad_norm": 0.012852686457335949, + "learning_rate": 1.4229169745202761e-05, + "loss": 0.1434, + "step": 34390 + }, + { + "epoch": 2.155793695556809, + "grad_norm": 0.056614551693201065, + "learning_rate": 1.4218614764307278e-05, + "loss": 0.0815, + "step": 34400 + }, + { + "epoch": 2.1564203797706334, + "grad_norm": 0.054879337549209595, + "learning_rate": 1.4208059783411793e-05, + "loss": 0.1237, + "step": 34410 + }, + { + "epoch": 2.157047063984458, + "grad_norm": 0.010255315341055393, + "learning_rate": 1.419750480251631e-05, + "loss": 0.0008, + "step": 34420 + }, + { + "epoch": 2.157673748198283, + "grad_norm": 0.009069728665053844, + "learning_rate": 1.4186949821620824e-05, + "loss": 0.1256, + "step": 34430 + }, + { + "epoch": 2.1583004324121076, + "grad_norm": 0.047382794320583344, + "learning_rate": 1.4176394840725337e-05, + "loss": 0.0012, + "step": 34440 + }, + { + "epoch": 2.1589271166259323, + "grad_norm": 0.12266150861978531, + "learning_rate": 1.4165839859829854e-05, + "loss": 0.0589, + "step": 34450 + }, + { + "epoch": 2.159553800839757, + "grad_norm": 0.04495285078883171, + "learning_rate": 1.4155284878934369e-05, + "loss": 0.1503, + "step": 34460 + }, + { + "epoch": 2.1601804850535817, + "grad_norm": 0.0057646301575005054, + "learning_rate": 1.4144729898038886e-05, + "loss": 0.0806, + "step": 34470 + }, + { + "epoch": 2.160807169267406, + "grad_norm": 0.48330438137054443, + "learning_rate": 1.41341749171434e-05, + "loss": 0.0041, + "step": 34480 + }, + { + "epoch": 2.1614338534812307, + "grad_norm": 0.10639398545026779, + "learning_rate": 1.4123619936247917e-05, + "loss": 0.0579, + "step": 34490 + }, + { + "epoch": 2.1620605376950555, + "grad_norm": 0.01347813569009304, + "learning_rate": 1.411306495535243e-05, + "loss": 0.1687, + "step": 34500 + }, + { + "epoch": 2.16268722190888, + "grad_norm": 0.029893875122070312, + "learning_rate": 1.4102509974456946e-05, + "loss": 0.001, + "step": 34510 + }, + { + "epoch": 2.163313906122705, + "grad_norm": 0.16543273627758026, + "learning_rate": 1.4091954993561462e-05, + "loss": 0.0327, + "step": 34520 + }, + { + "epoch": 2.163940590336529, + "grad_norm": 7.4403815269470215, + "learning_rate": 1.4081400012665977e-05, + "loss": 0.2158, + "step": 34530 + }, + { + "epoch": 2.164567274550354, + "grad_norm": 0.01477138139307499, + "learning_rate": 1.4070845031770494e-05, + "loss": 0.042, + "step": 34540 + }, + { + "epoch": 2.1651939587641786, + "grad_norm": 0.01747424341738224, + "learning_rate": 1.4060290050875009e-05, + "loss": 0.0589, + "step": 34550 + }, + { + "epoch": 2.1658206429780034, + "grad_norm": 0.01443378534168005, + "learning_rate": 1.4049735069979525e-05, + "loss": 0.1629, + "step": 34560 + }, + { + "epoch": 2.166447327191828, + "grad_norm": 0.06920278072357178, + "learning_rate": 1.4039180089084039e-05, + "loss": 0.0235, + "step": 34570 + }, + { + "epoch": 2.167074011405653, + "grad_norm": 0.01657126098871231, + "learning_rate": 1.4028625108188555e-05, + "loss": 0.1257, + "step": 34580 + }, + { + "epoch": 2.1677006956194775, + "grad_norm": 0.009466479532420635, + "learning_rate": 1.401807012729307e-05, + "loss": 0.0129, + "step": 34590 + }, + { + "epoch": 2.168327379833302, + "grad_norm": 0.4203064739704132, + "learning_rate": 1.4007515146397585e-05, + "loss": 0.0021, + "step": 34600 + }, + { + "epoch": 2.1689540640471265, + "grad_norm": 0.03957168757915497, + "learning_rate": 1.3996960165502102e-05, + "loss": 0.0192, + "step": 34610 + }, + { + "epoch": 2.1695807482609513, + "grad_norm": 0.006051171105355024, + "learning_rate": 1.3986405184606615e-05, + "loss": 0.0307, + "step": 34620 + }, + { + "epoch": 2.170207432474776, + "grad_norm": 0.02826111391186714, + "learning_rate": 1.3975850203711133e-05, + "loss": 0.0751, + "step": 34630 + }, + { + "epoch": 2.1708341166886007, + "grad_norm": 0.007988009601831436, + "learning_rate": 1.3965295222815647e-05, + "loss": 0.0098, + "step": 34640 + }, + { + "epoch": 2.1714608009024254, + "grad_norm": 0.0047339689917862415, + "learning_rate": 1.3954740241920163e-05, + "loss": 0.05, + "step": 34650 + }, + { + "epoch": 2.1720874851162497, + "grad_norm": 0.04309682920575142, + "learning_rate": 1.3944185261024678e-05, + "loss": 0.0921, + "step": 34660 + }, + { + "epoch": 2.1727141693300744, + "grad_norm": 0.020489204674959183, + "learning_rate": 1.3933630280129192e-05, + "loss": 0.0826, + "step": 34670 + }, + { + "epoch": 2.173340853543899, + "grad_norm": 0.011994308792054653, + "learning_rate": 1.392307529923371e-05, + "loss": 0.0534, + "step": 34680 + }, + { + "epoch": 2.173967537757724, + "grad_norm": 0.0060248589143157005, + "learning_rate": 1.3912520318338223e-05, + "loss": 0.1518, + "step": 34690 + }, + { + "epoch": 2.1745942219715486, + "grad_norm": 2.8406407833099365, + "learning_rate": 1.390196533744274e-05, + "loss": 0.1138, + "step": 34700 + }, + { + "epoch": 2.1752209061853733, + "grad_norm": 0.010623175650835037, + "learning_rate": 1.3891410356547255e-05, + "loss": 0.0644, + "step": 34710 + }, + { + "epoch": 2.175847590399198, + "grad_norm": 24.322790145874023, + "learning_rate": 1.3880855375651771e-05, + "loss": 0.0389, + "step": 34720 + }, + { + "epoch": 2.1764742746130223, + "grad_norm": 0.01590089127421379, + "learning_rate": 1.3870300394756286e-05, + "loss": 0.0032, + "step": 34730 + }, + { + "epoch": 2.177100958826847, + "grad_norm": 0.015220998786389828, + "learning_rate": 1.3859745413860803e-05, + "loss": 0.0071, + "step": 34740 + }, + { + "epoch": 2.177727643040672, + "grad_norm": 0.01890176720917225, + "learning_rate": 1.3849190432965316e-05, + "loss": 0.002, + "step": 34750 + }, + { + "epoch": 2.1783543272544965, + "grad_norm": 0.08504509925842285, + "learning_rate": 1.3838635452069831e-05, + "loss": 0.0022, + "step": 34760 + }, + { + "epoch": 2.1789810114683212, + "grad_norm": 0.007461244240403175, + "learning_rate": 1.3828080471174348e-05, + "loss": 0.0841, + "step": 34770 + }, + { + "epoch": 2.179607695682146, + "grad_norm": 0.19550983607769012, + "learning_rate": 1.3817525490278863e-05, + "loss": 0.122, + "step": 34780 + }, + { + "epoch": 2.1802343798959702, + "grad_norm": 0.07259407639503479, + "learning_rate": 1.380697050938338e-05, + "loss": 0.0708, + "step": 34790 + }, + { + "epoch": 2.180861064109795, + "grad_norm": 0.02365555614233017, + "learning_rate": 1.3796415528487895e-05, + "loss": 0.0666, + "step": 34800 + }, + { + "epoch": 2.1814877483236197, + "grad_norm": 16.392606735229492, + "learning_rate": 1.3785860547592411e-05, + "loss": 0.0461, + "step": 34810 + }, + { + "epoch": 2.1821144325374444, + "grad_norm": 19.294736862182617, + "learning_rate": 1.3775305566696924e-05, + "loss": 0.0895, + "step": 34820 + }, + { + "epoch": 2.182741116751269, + "grad_norm": 0.03826337680220604, + "learning_rate": 1.376475058580144e-05, + "loss": 0.0013, + "step": 34830 + }, + { + "epoch": 2.183367800965094, + "grad_norm": 0.016204219311475754, + "learning_rate": 1.3754195604905956e-05, + "loss": 0.0407, + "step": 34840 + }, + { + "epoch": 2.183994485178918, + "grad_norm": 0.017499983310699463, + "learning_rate": 1.3743640624010471e-05, + "loss": 0.0585, + "step": 34850 + }, + { + "epoch": 2.184621169392743, + "grad_norm": 0.13106019794940948, + "learning_rate": 1.3733085643114988e-05, + "loss": 0.1402, + "step": 34860 + }, + { + "epoch": 2.1852478536065676, + "grad_norm": 0.005508528556674719, + "learning_rate": 1.3722530662219501e-05, + "loss": 0.0515, + "step": 34870 + }, + { + "epoch": 2.1858745378203923, + "grad_norm": 0.009193802252411842, + "learning_rate": 1.371197568132402e-05, + "loss": 0.0038, + "step": 34880 + }, + { + "epoch": 2.186501222034217, + "grad_norm": 0.035130925476551056, + "learning_rate": 1.3701420700428533e-05, + "loss": 0.0014, + "step": 34890 + }, + { + "epoch": 2.1871279062480418, + "grad_norm": 0.006802499294281006, + "learning_rate": 1.3690865719533047e-05, + "loss": 0.0543, + "step": 34900 + }, + { + "epoch": 2.187754590461866, + "grad_norm": 0.0060197836719453335, + "learning_rate": 1.3680310738637564e-05, + "loss": 0.0665, + "step": 34910 + }, + { + "epoch": 2.1883812746756908, + "grad_norm": 0.02188221924006939, + "learning_rate": 1.3669755757742077e-05, + "loss": 0.0215, + "step": 34920 + }, + { + "epoch": 2.1890079588895155, + "grad_norm": 0.14490945637226105, + "learning_rate": 1.3659200776846596e-05, + "loss": 0.169, + "step": 34930 + }, + { + "epoch": 2.18963464310334, + "grad_norm": 0.03352600708603859, + "learning_rate": 1.3648645795951109e-05, + "loss": 0.0012, + "step": 34940 + }, + { + "epoch": 2.190261327317165, + "grad_norm": 0.1054094210267067, + "learning_rate": 1.3638090815055626e-05, + "loss": 0.0891, + "step": 34950 + }, + { + "epoch": 2.1908880115309897, + "grad_norm": 0.0154159776866436, + "learning_rate": 1.362753583416014e-05, + "loss": 0.1988, + "step": 34960 + }, + { + "epoch": 2.1915146957448144, + "grad_norm": 0.010679114609956741, + "learning_rate": 1.3616980853264657e-05, + "loss": 0.1249, + "step": 34970 + }, + { + "epoch": 2.1921413799586387, + "grad_norm": 0.11450926214456558, + "learning_rate": 1.3606425872369172e-05, + "loss": 0.0023, + "step": 34980 + }, + { + "epoch": 2.1927680641724634, + "grad_norm": 15.463875770568848, + "learning_rate": 1.3595870891473685e-05, + "loss": 0.2882, + "step": 34990 + }, + { + "epoch": 2.193394748386288, + "grad_norm": 5.67316198348999, + "learning_rate": 1.3585315910578202e-05, + "loss": 0.0922, + "step": 35000 + }, + { + "epoch": 2.194021432600113, + "grad_norm": 0.02195173315703869, + "learning_rate": 1.3574760929682717e-05, + "loss": 0.1378, + "step": 35010 + }, + { + "epoch": 2.1946481168139376, + "grad_norm": 0.22539135813713074, + "learning_rate": 1.3564205948787234e-05, + "loss": 0.0881, + "step": 35020 + }, + { + "epoch": 2.1952748010277623, + "grad_norm": 0.022844787687063217, + "learning_rate": 1.3553650967891749e-05, + "loss": 0.0723, + "step": 35030 + }, + { + "epoch": 2.1959014852415866, + "grad_norm": 0.058049276471138, + "learning_rate": 1.3543095986996265e-05, + "loss": 0.0071, + "step": 35040 + }, + { + "epoch": 2.1965281694554113, + "grad_norm": 0.035429976880550385, + "learning_rate": 1.353254100610078e-05, + "loss": 0.1019, + "step": 35050 + }, + { + "epoch": 2.197154853669236, + "grad_norm": 3.9574286937713623, + "learning_rate": 1.3521986025205294e-05, + "loss": 0.154, + "step": 35060 + }, + { + "epoch": 2.1977815378830607, + "grad_norm": 0.02437640354037285, + "learning_rate": 1.351143104430981e-05, + "loss": 0.1157, + "step": 35070 + }, + { + "epoch": 2.1984082220968855, + "grad_norm": 0.3660740256309509, + "learning_rate": 1.3500876063414325e-05, + "loss": 0.082, + "step": 35080 + }, + { + "epoch": 2.19903490631071, + "grad_norm": 22.220230102539062, + "learning_rate": 1.3490321082518842e-05, + "loss": 0.0268, + "step": 35090 + }, + { + "epoch": 2.199661590524535, + "grad_norm": 405.9232482910156, + "learning_rate": 1.3479766101623357e-05, + "loss": 0.1237, + "step": 35100 + }, + { + "epoch": 2.200288274738359, + "grad_norm": 0.029608746990561485, + "learning_rate": 1.3469211120727873e-05, + "loss": 0.1838, + "step": 35110 + }, + { + "epoch": 2.200914958952184, + "grad_norm": 0.038986582309007645, + "learning_rate": 1.3458656139832387e-05, + "loss": 0.0051, + "step": 35120 + }, + { + "epoch": 2.2015416431660086, + "grad_norm": 0.04025193676352501, + "learning_rate": 1.3448101158936902e-05, + "loss": 0.0287, + "step": 35130 + }, + { + "epoch": 2.2021683273798334, + "grad_norm": 0.25155943632125854, + "learning_rate": 1.3437546178041418e-05, + "loss": 0.0335, + "step": 35140 + }, + { + "epoch": 2.202795011593658, + "grad_norm": 0.10960265249013901, + "learning_rate": 1.3426991197145933e-05, + "loss": 0.003, + "step": 35150 + }, + { + "epoch": 2.203421695807483, + "grad_norm": 4.629335403442383, + "learning_rate": 1.341643621625045e-05, + "loss": 0.0887, + "step": 35160 + }, + { + "epoch": 2.204048380021307, + "grad_norm": 0.01714794896543026, + "learning_rate": 1.3405881235354963e-05, + "loss": 0.0731, + "step": 35170 + }, + { + "epoch": 2.204675064235132, + "grad_norm": 0.04572465270757675, + "learning_rate": 1.3395326254459482e-05, + "loss": 0.1256, + "step": 35180 + }, + { + "epoch": 2.2053017484489565, + "grad_norm": 1.0245270729064941, + "learning_rate": 1.3384771273563995e-05, + "loss": 0.0712, + "step": 35190 + }, + { + "epoch": 2.2059284326627813, + "grad_norm": 0.020694352686405182, + "learning_rate": 1.3374216292668511e-05, + "loss": 0.0021, + "step": 35200 + }, + { + "epoch": 2.206555116876606, + "grad_norm": 0.025215351954102516, + "learning_rate": 1.3363661311773026e-05, + "loss": 0.0019, + "step": 35210 + }, + { + "epoch": 2.2071818010904307, + "grad_norm": 0.019033333286643028, + "learning_rate": 1.3353106330877541e-05, + "loss": 0.0031, + "step": 35220 + }, + { + "epoch": 2.207808485304255, + "grad_norm": 2.6878879070281982, + "learning_rate": 1.3342551349982058e-05, + "loss": 0.0039, + "step": 35230 + }, + { + "epoch": 2.2084351695180797, + "grad_norm": 0.020640341565012932, + "learning_rate": 1.3331996369086571e-05, + "loss": 0.0841, + "step": 35240 + }, + { + "epoch": 2.2090618537319044, + "grad_norm": 0.016223404556512833, + "learning_rate": 1.332144138819109e-05, + "loss": 0.0448, + "step": 35250 + }, + { + "epoch": 2.209688537945729, + "grad_norm": 0.005702883470803499, + "learning_rate": 1.3310886407295603e-05, + "loss": 0.0283, + "step": 35260 + }, + { + "epoch": 2.210315222159554, + "grad_norm": 13.36315631866455, + "learning_rate": 1.330033142640012e-05, + "loss": 0.121, + "step": 35270 + }, + { + "epoch": 2.2109419063733786, + "grad_norm": 0.007761610671877861, + "learning_rate": 1.3289776445504634e-05, + "loss": 0.0029, + "step": 35280 + }, + { + "epoch": 2.211568590587203, + "grad_norm": 0.05677232891321182, + "learning_rate": 1.3279221464609148e-05, + "loss": 0.1566, + "step": 35290 + }, + { + "epoch": 2.2121952748010276, + "grad_norm": 0.01307602971792221, + "learning_rate": 1.3268666483713666e-05, + "loss": 0.0427, + "step": 35300 + }, + { + "epoch": 2.2128219590148523, + "grad_norm": 0.021906176581978798, + "learning_rate": 1.325811150281818e-05, + "loss": 0.0512, + "step": 35310 + }, + { + "epoch": 2.213448643228677, + "grad_norm": 0.008934159763157368, + "learning_rate": 1.3247556521922696e-05, + "loss": 0.0585, + "step": 35320 + }, + { + "epoch": 2.214075327442502, + "grad_norm": 0.05048960819840431, + "learning_rate": 1.3237001541027211e-05, + "loss": 0.1755, + "step": 35330 + }, + { + "epoch": 2.2147020116563265, + "grad_norm": 0.03687850758433342, + "learning_rate": 1.3226446560131728e-05, + "loss": 0.1197, + "step": 35340 + }, + { + "epoch": 2.2153286958701512, + "grad_norm": 0.07645319402217865, + "learning_rate": 1.3215891579236243e-05, + "loss": 0.0426, + "step": 35350 + }, + { + "epoch": 2.2159553800839755, + "grad_norm": 0.07259127497673035, + "learning_rate": 1.320533659834076e-05, + "loss": 0.0037, + "step": 35360 + }, + { + "epoch": 2.2165820642978002, + "grad_norm": 0.034717291593551636, + "learning_rate": 1.3194781617445272e-05, + "loss": 0.0022, + "step": 35370 + }, + { + "epoch": 2.217208748511625, + "grad_norm": 0.19604039192199707, + "learning_rate": 1.3184226636549787e-05, + "loss": 0.0881, + "step": 35380 + }, + { + "epoch": 2.2178354327254497, + "grad_norm": 0.08138164132833481, + "learning_rate": 1.3173671655654304e-05, + "loss": 0.0602, + "step": 35390 + }, + { + "epoch": 2.2184621169392744, + "grad_norm": 0.012671887874603271, + "learning_rate": 1.3163116674758819e-05, + "loss": 0.0523, + "step": 35400 + }, + { + "epoch": 2.219088801153099, + "grad_norm": 0.08288644999265671, + "learning_rate": 1.3152561693863336e-05, + "loss": 0.0015, + "step": 35410 + }, + { + "epoch": 2.2197154853669234, + "grad_norm": 0.5564950108528137, + "learning_rate": 1.3142006712967849e-05, + "loss": 0.1163, + "step": 35420 + }, + { + "epoch": 2.220342169580748, + "grad_norm": 0.03301873803138733, + "learning_rate": 1.3131451732072367e-05, + "loss": 0.0419, + "step": 35430 + }, + { + "epoch": 2.220968853794573, + "grad_norm": 0.021805711090564728, + "learning_rate": 1.312089675117688e-05, + "loss": 0.0042, + "step": 35440 + }, + { + "epoch": 2.2215955380083976, + "grad_norm": 0.07749617099761963, + "learning_rate": 1.3110341770281396e-05, + "loss": 0.0013, + "step": 35450 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.007691920269280672, + "learning_rate": 1.3099786789385912e-05, + "loss": 0.0307, + "step": 35460 + }, + { + "epoch": 2.222848906436047, + "grad_norm": 0.006902730092406273, + "learning_rate": 1.3089231808490427e-05, + "loss": 0.0014, + "step": 35470 + }, + { + "epoch": 2.2234755906498713, + "grad_norm": 0.005276334937661886, + "learning_rate": 1.3078676827594944e-05, + "loss": 0.1014, + "step": 35480 + }, + { + "epoch": 2.224102274863696, + "grad_norm": 0.010618511587381363, + "learning_rate": 1.3068121846699457e-05, + "loss": 0.001, + "step": 35490 + }, + { + "epoch": 2.2247289590775208, + "grad_norm": 0.005693093407899141, + "learning_rate": 1.3057566865803975e-05, + "loss": 0.1915, + "step": 35500 + }, + { + "epoch": 2.2253556432913455, + "grad_norm": 0.014188366010785103, + "learning_rate": 1.3047011884908489e-05, + "loss": 0.0016, + "step": 35510 + }, + { + "epoch": 2.22598232750517, + "grad_norm": 0.008730873465538025, + "learning_rate": 1.3036456904013004e-05, + "loss": 0.0006, + "step": 35520 + }, + { + "epoch": 2.226609011718995, + "grad_norm": 0.007341964635998011, + "learning_rate": 1.302590192311752e-05, + "loss": 0.0014, + "step": 35530 + }, + { + "epoch": 2.2272356959328192, + "grad_norm": 0.00883073452860117, + "learning_rate": 1.3015346942222034e-05, + "loss": 0.0698, + "step": 35540 + }, + { + "epoch": 2.227862380146644, + "grad_norm": 0.12473952025175095, + "learning_rate": 1.3004791961326552e-05, + "loss": 0.267, + "step": 35550 + }, + { + "epoch": 2.2284890643604687, + "grad_norm": 8.776956558227539, + "learning_rate": 1.2994236980431065e-05, + "loss": 0.1964, + "step": 35560 + }, + { + "epoch": 2.2291157485742934, + "grad_norm": 0.05307682231068611, + "learning_rate": 1.2983681999535582e-05, + "loss": 0.0166, + "step": 35570 + }, + { + "epoch": 2.229742432788118, + "grad_norm": 0.13624274730682373, + "learning_rate": 1.2973127018640097e-05, + "loss": 0.0555, + "step": 35580 + }, + { + "epoch": 2.230369117001943, + "grad_norm": 0.45357584953308105, + "learning_rate": 1.2962572037744613e-05, + "loss": 0.0374, + "step": 35590 + }, + { + "epoch": 2.2309958012157676, + "grad_norm": 0.016800403594970703, + "learning_rate": 1.2952017056849128e-05, + "loss": 0.0024, + "step": 35600 + }, + { + "epoch": 2.231622485429592, + "grad_norm": 0.010358024388551712, + "learning_rate": 1.2941462075953642e-05, + "loss": 0.002, + "step": 35610 + }, + { + "epoch": 2.2322491696434166, + "grad_norm": 0.20369577407836914, + "learning_rate": 1.2930907095058158e-05, + "loss": 0.1755, + "step": 35620 + }, + { + "epoch": 2.2328758538572413, + "grad_norm": 0.012203558348119259, + "learning_rate": 1.2920352114162673e-05, + "loss": 0.1723, + "step": 35630 + }, + { + "epoch": 2.233502538071066, + "grad_norm": 0.026246270164847374, + "learning_rate": 1.290979713326719e-05, + "loss": 0.176, + "step": 35640 + }, + { + "epoch": 2.2341292222848907, + "grad_norm": 0.014504744671285152, + "learning_rate": 1.2899242152371705e-05, + "loss": 0.0751, + "step": 35650 + }, + { + "epoch": 2.2347559064987155, + "grad_norm": 0.010934627614915371, + "learning_rate": 1.2888687171476222e-05, + "loss": 0.0406, + "step": 35660 + }, + { + "epoch": 2.2353825907125398, + "grad_norm": 0.009612313471734524, + "learning_rate": 1.2878132190580735e-05, + "loss": 0.0758, + "step": 35670 + }, + { + "epoch": 2.2360092749263645, + "grad_norm": 0.18484322726726532, + "learning_rate": 1.286757720968525e-05, + "loss": 0.043, + "step": 35680 + }, + { + "epoch": 2.236635959140189, + "grad_norm": 0.17664383351802826, + "learning_rate": 1.2857022228789766e-05, + "loss": 0.0918, + "step": 35690 + }, + { + "epoch": 2.237262643354014, + "grad_norm": 0.05666724219918251, + "learning_rate": 1.2846467247894281e-05, + "loss": 0.0608, + "step": 35700 + }, + { + "epoch": 2.2378893275678386, + "grad_norm": 0.1784476339817047, + "learning_rate": 1.2835912266998798e-05, + "loss": 0.2159, + "step": 35710 + }, + { + "epoch": 2.2385160117816634, + "grad_norm": 0.6217782497406006, + "learning_rate": 1.2825357286103313e-05, + "loss": 0.0619, + "step": 35720 + }, + { + "epoch": 2.239142695995488, + "grad_norm": 0.1981956660747528, + "learning_rate": 1.281480230520783e-05, + "loss": 0.0399, + "step": 35730 + }, + { + "epoch": 2.2397693802093124, + "grad_norm": 0.08834876865148544, + "learning_rate": 1.2804247324312343e-05, + "loss": 0.0421, + "step": 35740 + }, + { + "epoch": 2.240396064423137, + "grad_norm": 5.194336414337158, + "learning_rate": 1.2793692343416861e-05, + "loss": 0.0668, + "step": 35750 + }, + { + "epoch": 2.241022748636962, + "grad_norm": 0.005984729155898094, + "learning_rate": 1.2783137362521374e-05, + "loss": 0.0331, + "step": 35760 + }, + { + "epoch": 2.2416494328507865, + "grad_norm": 0.056345418095588684, + "learning_rate": 1.277258238162589e-05, + "loss": 0.1728, + "step": 35770 + }, + { + "epoch": 2.2422761170646113, + "grad_norm": 0.10154221206903458, + "learning_rate": 1.2762027400730406e-05, + "loss": 0.0279, + "step": 35780 + }, + { + "epoch": 2.242902801278436, + "grad_norm": 0.15458165109157562, + "learning_rate": 1.275147241983492e-05, + "loss": 0.0027, + "step": 35790 + }, + { + "epoch": 2.2435294854922603, + "grad_norm": 7.0228271484375, + "learning_rate": 1.2740917438939438e-05, + "loss": 0.1917, + "step": 35800 + }, + { + "epoch": 2.244156169706085, + "grad_norm": 0.019665498286485672, + "learning_rate": 1.2730362458043951e-05, + "loss": 0.0937, + "step": 35810 + }, + { + "epoch": 2.2447828539199097, + "grad_norm": 0.11862511932849884, + "learning_rate": 1.2719807477148468e-05, + "loss": 0.0025, + "step": 35820 + }, + { + "epoch": 2.2454095381337345, + "grad_norm": 0.0292394757270813, + "learning_rate": 1.2709252496252983e-05, + "loss": 0.0008, + "step": 35830 + }, + { + "epoch": 2.246036222347559, + "grad_norm": 0.5504111051559448, + "learning_rate": 1.2698697515357496e-05, + "loss": 0.1048, + "step": 35840 + }, + { + "epoch": 2.246662906561384, + "grad_norm": 5.584314823150635, + "learning_rate": 1.2688142534462014e-05, + "loss": 0.2176, + "step": 35850 + }, + { + "epoch": 2.247289590775208, + "grad_norm": 0.19631190598011017, + "learning_rate": 1.2677587553566527e-05, + "loss": 0.1025, + "step": 35860 + }, + { + "epoch": 2.247916274989033, + "grad_norm": 0.12277902662754059, + "learning_rate": 1.2667032572671044e-05, + "loss": 0.0933, + "step": 35870 + }, + { + "epoch": 2.2485429592028576, + "grad_norm": 0.18572159111499786, + "learning_rate": 1.2656477591775559e-05, + "loss": 0.0701, + "step": 35880 + }, + { + "epoch": 2.2491696434166824, + "grad_norm": 0.09474160522222519, + "learning_rate": 1.2645922610880076e-05, + "loss": 0.0376, + "step": 35890 + }, + { + "epoch": 2.249796327630507, + "grad_norm": 0.10830563306808472, + "learning_rate": 1.263536762998459e-05, + "loss": 0.0533, + "step": 35900 + }, + { + "epoch": 2.250423011844332, + "grad_norm": 0.0841013714671135, + "learning_rate": 1.2624812649089104e-05, + "loss": 0.0689, + "step": 35910 + }, + { + "epoch": 2.251049696058156, + "grad_norm": 0.005682028364390135, + "learning_rate": 1.2614257668193622e-05, + "loss": 0.0077, + "step": 35920 + }, + { + "epoch": 2.251676380271981, + "grad_norm": 5.086600303649902, + "learning_rate": 1.2603702687298136e-05, + "loss": 0.1592, + "step": 35930 + }, + { + "epoch": 2.2523030644858055, + "grad_norm": 0.01670365035533905, + "learning_rate": 1.2593147706402652e-05, + "loss": 0.0045, + "step": 35940 + }, + { + "epoch": 2.2529297486996303, + "grad_norm": 3.9989254474639893, + "learning_rate": 1.2582592725507167e-05, + "loss": 0.0359, + "step": 35950 + }, + { + "epoch": 2.253556432913455, + "grad_norm": 0.015506722964346409, + "learning_rate": 1.2572037744611684e-05, + "loss": 0.0014, + "step": 35960 + }, + { + "epoch": 2.2541831171272797, + "grad_norm": 0.11642977595329285, + "learning_rate": 1.2561482763716199e-05, + "loss": 0.0708, + "step": 35970 + }, + { + "epoch": 2.2548098013411044, + "grad_norm": 0.35174962878227234, + "learning_rate": 1.2550927782820715e-05, + "loss": 0.002, + "step": 35980 + }, + { + "epoch": 2.2554364855549287, + "grad_norm": 0.6952834129333496, + "learning_rate": 1.2540372801925229e-05, + "loss": 0.0513, + "step": 35990 + }, + { + "epoch": 2.2560631697687534, + "grad_norm": 0.007165560964494944, + "learning_rate": 1.2529817821029744e-05, + "loss": 0.1342, + "step": 36000 + }, + { + "epoch": 2.256689853982578, + "grad_norm": 0.006440062541514635, + "learning_rate": 1.251926284013426e-05, + "loss": 0.043, + "step": 36010 + }, + { + "epoch": 2.257316538196403, + "grad_norm": 0.013440284878015518, + "learning_rate": 1.2508707859238775e-05, + "loss": 0.0136, + "step": 36020 + }, + { + "epoch": 2.2579432224102276, + "grad_norm": 0.5947048664093018, + "learning_rate": 1.249815287834329e-05, + "loss": 0.082, + "step": 36030 + }, + { + "epoch": 2.2585699066240523, + "grad_norm": 0.6158246994018555, + "learning_rate": 1.2487597897447805e-05, + "loss": 0.084, + "step": 36040 + }, + { + "epoch": 2.2591965908378766, + "grad_norm": 0.008292706683278084, + "learning_rate": 1.2477042916552322e-05, + "loss": 0.0303, + "step": 36050 + }, + { + "epoch": 2.2598232750517013, + "grad_norm": 0.2527766823768616, + "learning_rate": 1.2466487935656837e-05, + "loss": 0.0815, + "step": 36060 + }, + { + "epoch": 2.260449959265526, + "grad_norm": 1.2495615482330322, + "learning_rate": 1.2455932954761353e-05, + "loss": 0.0471, + "step": 36070 + }, + { + "epoch": 2.261076643479351, + "grad_norm": 0.5814064145088196, + "learning_rate": 1.2445377973865868e-05, + "loss": 0.2085, + "step": 36080 + }, + { + "epoch": 2.2617033276931755, + "grad_norm": 0.005435166414827108, + "learning_rate": 1.2434822992970383e-05, + "loss": 0.0408, + "step": 36090 + }, + { + "epoch": 2.2623300119070002, + "grad_norm": 0.0057927523739635944, + "learning_rate": 1.24242680120749e-05, + "loss": 0.07, + "step": 36100 + }, + { + "epoch": 2.262956696120825, + "grad_norm": 0.06232810765504837, + "learning_rate": 1.2413713031179413e-05, + "loss": 0.0007, + "step": 36110 + }, + { + "epoch": 2.2635833803346492, + "grad_norm": 5.673398017883301, + "learning_rate": 1.240315805028393e-05, + "loss": 0.1493, + "step": 36120 + }, + { + "epoch": 2.264210064548474, + "grad_norm": 0.03220819681882858, + "learning_rate": 1.2392603069388445e-05, + "loss": 0.0644, + "step": 36130 + }, + { + "epoch": 2.2648367487622987, + "grad_norm": 0.004986956249922514, + "learning_rate": 1.238204808849296e-05, + "loss": 0.0758, + "step": 36140 + }, + { + "epoch": 2.2654634329761234, + "grad_norm": 3.61641263961792, + "learning_rate": 1.2371493107597476e-05, + "loss": 0.0821, + "step": 36150 + }, + { + "epoch": 2.266090117189948, + "grad_norm": 0.0038919805083423853, + "learning_rate": 1.2360938126701991e-05, + "loss": 0.0038, + "step": 36160 + }, + { + "epoch": 2.2667168014037724, + "grad_norm": 0.14399461448192596, + "learning_rate": 1.2350383145806508e-05, + "loss": 0.2721, + "step": 36170 + }, + { + "epoch": 2.267343485617597, + "grad_norm": 0.1366736739873886, + "learning_rate": 1.2339828164911021e-05, + "loss": 0.1095, + "step": 36180 + }, + { + "epoch": 2.267970169831422, + "grad_norm": 0.020060712471604347, + "learning_rate": 1.2329273184015536e-05, + "loss": 0.0027, + "step": 36190 + }, + { + "epoch": 2.2685968540452466, + "grad_norm": 0.09323687851428986, + "learning_rate": 1.2318718203120053e-05, + "loss": 0.067, + "step": 36200 + }, + { + "epoch": 2.2692235382590713, + "grad_norm": 0.34588101506233215, + "learning_rate": 1.2308163222224568e-05, + "loss": 0.077, + "step": 36210 + }, + { + "epoch": 2.269850222472896, + "grad_norm": 0.16162216663360596, + "learning_rate": 1.2297608241329085e-05, + "loss": 0.0386, + "step": 36220 + }, + { + "epoch": 2.2704769066867208, + "grad_norm": 0.35178256034851074, + "learning_rate": 1.22870532604336e-05, + "loss": 0.004, + "step": 36230 + }, + { + "epoch": 2.271103590900545, + "grad_norm": 0.08593368530273438, + "learning_rate": 1.2276498279538114e-05, + "loss": 0.0616, + "step": 36240 + }, + { + "epoch": 2.2717302751143698, + "grad_norm": 0.6215272545814514, + "learning_rate": 1.2265943298642631e-05, + "loss": 0.0053, + "step": 36250 + }, + { + "epoch": 2.2723569593281945, + "grad_norm": 0.3504977524280548, + "learning_rate": 1.2255388317747144e-05, + "loss": 0.0525, + "step": 36260 + }, + { + "epoch": 2.272983643542019, + "grad_norm": 3.6675524711608887, + "learning_rate": 1.2244833336851661e-05, + "loss": 0.1653, + "step": 36270 + }, + { + "epoch": 2.273610327755844, + "grad_norm": 0.012513170018792152, + "learning_rate": 1.2234278355956176e-05, + "loss": 0.0449, + "step": 36280 + }, + { + "epoch": 2.2742370119696687, + "grad_norm": 0.009431272745132446, + "learning_rate": 1.2223723375060691e-05, + "loss": 0.0756, + "step": 36290 + }, + { + "epoch": 2.274863696183493, + "grad_norm": 3.7812867164611816, + "learning_rate": 1.2213168394165208e-05, + "loss": 0.1541, + "step": 36300 + }, + { + "epoch": 2.2754903803973177, + "grad_norm": 0.033403124660253525, + "learning_rate": 1.2202613413269723e-05, + "loss": 0.0012, + "step": 36310 + }, + { + "epoch": 2.2761170646111424, + "grad_norm": 0.046253349632024765, + "learning_rate": 1.219205843237424e-05, + "loss": 0.0517, + "step": 36320 + }, + { + "epoch": 2.276743748824967, + "grad_norm": 0.2164696902036667, + "learning_rate": 1.2181503451478754e-05, + "loss": 0.0435, + "step": 36330 + }, + { + "epoch": 2.277370433038792, + "grad_norm": 0.24268116056919098, + "learning_rate": 1.2170948470583269e-05, + "loss": 0.0321, + "step": 36340 + }, + { + "epoch": 2.2779971172526166, + "grad_norm": 0.017050549387931824, + "learning_rate": 1.2160393489687784e-05, + "loss": 0.2641, + "step": 36350 + }, + { + "epoch": 2.2786238014664413, + "grad_norm": 8.202871322631836, + "learning_rate": 1.2149838508792299e-05, + "loss": 0.1438, + "step": 36360 + }, + { + "epoch": 2.2792504856802656, + "grad_norm": 0.2853762209415436, + "learning_rate": 1.2139283527896816e-05, + "loss": 0.0668, + "step": 36370 + }, + { + "epoch": 2.2798771698940903, + "grad_norm": 0.03244467452168465, + "learning_rate": 1.212872854700133e-05, + "loss": 0.0191, + "step": 36380 + }, + { + "epoch": 2.280503854107915, + "grad_norm": 0.03982846438884735, + "learning_rate": 1.2118173566105846e-05, + "loss": 0.1769, + "step": 36390 + }, + { + "epoch": 2.2811305383217397, + "grad_norm": 0.10278554260730743, + "learning_rate": 1.2107618585210362e-05, + "loss": 0.0697, + "step": 36400 + }, + { + "epoch": 2.2817572225355645, + "grad_norm": 0.051736317574977875, + "learning_rate": 1.2097063604314877e-05, + "loss": 0.1525, + "step": 36410 + }, + { + "epoch": 2.2823839067493887, + "grad_norm": 0.10186845809221268, + "learning_rate": 1.2086508623419392e-05, + "loss": 0.0365, + "step": 36420 + }, + { + "epoch": 2.2830105909632135, + "grad_norm": 0.057203132659196854, + "learning_rate": 1.2075953642523907e-05, + "loss": 0.0055, + "step": 36430 + }, + { + "epoch": 2.283637275177038, + "grad_norm": 0.024866662919521332, + "learning_rate": 1.2065398661628422e-05, + "loss": 0.0431, + "step": 36440 + }, + { + "epoch": 2.284263959390863, + "grad_norm": 1.0920522212982178, + "learning_rate": 1.2054843680732939e-05, + "loss": 0.005, + "step": 36450 + }, + { + "epoch": 2.2848906436046876, + "grad_norm": 0.01022693607956171, + "learning_rate": 1.2044288699837454e-05, + "loss": 0.1027, + "step": 36460 + }, + { + "epoch": 2.2855173278185124, + "grad_norm": 0.356696754693985, + "learning_rate": 1.203373371894197e-05, + "loss": 0.0805, + "step": 36470 + }, + { + "epoch": 2.286144012032337, + "grad_norm": 0.008136897347867489, + "learning_rate": 1.2023178738046485e-05, + "loss": 0.0251, + "step": 36480 + }, + { + "epoch": 2.286770696246162, + "grad_norm": 0.017162851989269257, + "learning_rate": 1.2012623757151e-05, + "loss": 0.0505, + "step": 36490 + }, + { + "epoch": 2.287397380459986, + "grad_norm": 0.7460328936576843, + "learning_rate": 1.2002068776255515e-05, + "loss": 0.0285, + "step": 36500 + }, + { + "epoch": 2.288024064673811, + "grad_norm": 0.008685870096087456, + "learning_rate": 1.199151379536003e-05, + "loss": 0.0679, + "step": 36510 + }, + { + "epoch": 2.2886507488876355, + "grad_norm": 0.008240882307291031, + "learning_rate": 1.1980958814464547e-05, + "loss": 0.1384, + "step": 36520 + }, + { + "epoch": 2.2892774331014603, + "grad_norm": 0.008877246640622616, + "learning_rate": 1.1970403833569062e-05, + "loss": 0.1276, + "step": 36530 + }, + { + "epoch": 2.289904117315285, + "grad_norm": 0.0377153716981411, + "learning_rate": 1.1959848852673577e-05, + "loss": 0.0019, + "step": 36540 + }, + { + "epoch": 2.2905308015291093, + "grad_norm": 0.007972978055477142, + "learning_rate": 1.1949293871778093e-05, + "loss": 0.0022, + "step": 36550 + }, + { + "epoch": 2.291157485742934, + "grad_norm": 0.22672541439533234, + "learning_rate": 1.1938738890882608e-05, + "loss": 0.074, + "step": 36560 + }, + { + "epoch": 2.2917841699567587, + "grad_norm": 0.011422897689044476, + "learning_rate": 1.1928183909987123e-05, + "loss": 0.0549, + "step": 36570 + }, + { + "epoch": 2.2924108541705834, + "grad_norm": 3.5832436084747314, + "learning_rate": 1.1917628929091638e-05, + "loss": 0.0674, + "step": 36580 + }, + { + "epoch": 2.293037538384408, + "grad_norm": 0.006303721573203802, + "learning_rate": 1.1907073948196155e-05, + "loss": 0.0947, + "step": 36590 + }, + { + "epoch": 2.293664222598233, + "grad_norm": 0.005968243815004826, + "learning_rate": 1.189651896730067e-05, + "loss": 0.0781, + "step": 36600 + }, + { + "epoch": 2.2942909068120576, + "grad_norm": 0.005579926539212465, + "learning_rate": 1.1885963986405185e-05, + "loss": 0.0021, + "step": 36610 + }, + { + "epoch": 2.294917591025882, + "grad_norm": 0.006298782769590616, + "learning_rate": 1.1875409005509701e-05, + "loss": 0.1184, + "step": 36620 + }, + { + "epoch": 2.2955442752397066, + "grad_norm": 0.005551437381654978, + "learning_rate": 1.1864854024614216e-05, + "loss": 0.0034, + "step": 36630 + }, + { + "epoch": 2.2961709594535313, + "grad_norm": 0.3854537606239319, + "learning_rate": 1.1854299043718731e-05, + "loss": 0.0348, + "step": 36640 + }, + { + "epoch": 2.296797643667356, + "grad_norm": 6.585789203643799, + "learning_rate": 1.1843744062823246e-05, + "loss": 0.1863, + "step": 36650 + }, + { + "epoch": 2.297424327881181, + "grad_norm": 0.12899911403656006, + "learning_rate": 1.1833189081927761e-05, + "loss": 0.0734, + "step": 36660 + }, + { + "epoch": 2.2980510120950055, + "grad_norm": 0.006385709624737501, + "learning_rate": 1.1822634101032278e-05, + "loss": 0.0007, + "step": 36670 + }, + { + "epoch": 2.29867769630883, + "grad_norm": 0.1899183690547943, + "learning_rate": 1.1812079120136793e-05, + "loss": 0.0053, + "step": 36680 + }, + { + "epoch": 2.2993043805226545, + "grad_norm": 1.0379557609558105, + "learning_rate": 1.1801524139241308e-05, + "loss": 0.002, + "step": 36690 + }, + { + "epoch": 2.2999310647364792, + "grad_norm": 8.715649604797363, + "learning_rate": 1.1790969158345824e-05, + "loss": 0.1757, + "step": 36700 + }, + { + "epoch": 2.300557748950304, + "grad_norm": 0.014531532302498817, + "learning_rate": 1.178041417745034e-05, + "loss": 0.1319, + "step": 36710 + }, + { + "epoch": 2.3011844331641287, + "grad_norm": 0.00932616088539362, + "learning_rate": 1.1769859196554856e-05, + "loss": 0.0015, + "step": 36720 + }, + { + "epoch": 2.3018111173779534, + "grad_norm": 0.008489527739584446, + "learning_rate": 1.175930421565937e-05, + "loss": 0.0736, + "step": 36730 + }, + { + "epoch": 2.302437801591778, + "grad_norm": 0.04834457114338875, + "learning_rate": 1.1748749234763886e-05, + "loss": 0.0771, + "step": 36740 + }, + { + "epoch": 2.3030644858056024, + "grad_norm": 0.26478925347328186, + "learning_rate": 1.1738194253868401e-05, + "loss": 0.0638, + "step": 36750 + }, + { + "epoch": 2.303691170019427, + "grad_norm": 0.0104445219039917, + "learning_rate": 1.1727639272972916e-05, + "loss": 0.0027, + "step": 36760 + }, + { + "epoch": 2.304317854233252, + "grad_norm": 0.008162743411958218, + "learning_rate": 1.1717084292077433e-05, + "loss": 0.0778, + "step": 36770 + }, + { + "epoch": 2.3049445384470766, + "grad_norm": 0.2298077791929245, + "learning_rate": 1.1706529311181948e-05, + "loss": 0.0465, + "step": 36780 + }, + { + "epoch": 2.3055712226609013, + "grad_norm": 0.008726351894438267, + "learning_rate": 1.1695974330286462e-05, + "loss": 0.0401, + "step": 36790 + }, + { + "epoch": 2.3061979068747256, + "grad_norm": 0.24333196878433228, + "learning_rate": 1.1685419349390977e-05, + "loss": 0.2434, + "step": 36800 + }, + { + "epoch": 2.3068245910885503, + "grad_norm": 0.011418229900300503, + "learning_rate": 1.1674864368495492e-05, + "loss": 0.0016, + "step": 36810 + }, + { + "epoch": 2.307451275302375, + "grad_norm": 0.11549999564886093, + "learning_rate": 1.1664309387600009e-05, + "loss": 0.0032, + "step": 36820 + }, + { + "epoch": 2.3080779595161998, + "grad_norm": 5.21661376953125, + "learning_rate": 1.1653754406704524e-05, + "loss": 0.2179, + "step": 36830 + }, + { + "epoch": 2.3087046437300245, + "grad_norm": 0.016975464299321175, + "learning_rate": 1.164319942580904e-05, + "loss": 0.0251, + "step": 36840 + }, + { + "epoch": 2.309331327943849, + "grad_norm": 38.22617721557617, + "learning_rate": 1.1632644444913556e-05, + "loss": 0.052, + "step": 36850 + }, + { + "epoch": 2.309958012157674, + "grad_norm": 5.6047444343566895, + "learning_rate": 1.162208946401807e-05, + "loss": 0.0952, + "step": 36860 + }, + { + "epoch": 2.310584696371498, + "grad_norm": 0.04268406331539154, + "learning_rate": 1.1611534483122587e-05, + "loss": 0.1451, + "step": 36870 + }, + { + "epoch": 2.311211380585323, + "grad_norm": 0.7725669145584106, + "learning_rate": 1.16009795022271e-05, + "loss": 0.0431, + "step": 36880 + }, + { + "epoch": 2.3118380647991477, + "grad_norm": 0.016580281779170036, + "learning_rate": 1.1590424521331617e-05, + "loss": 0.033, + "step": 36890 + }, + { + "epoch": 2.3124647490129724, + "grad_norm": 6.017152309417725, + "learning_rate": 1.1579869540436132e-05, + "loss": 0.2377, + "step": 36900 + }, + { + "epoch": 2.313091433226797, + "grad_norm": 0.042723577469587326, + "learning_rate": 1.1569314559540647e-05, + "loss": 0.0019, + "step": 36910 + }, + { + "epoch": 2.313718117440622, + "grad_norm": 26.877918243408203, + "learning_rate": 1.1558759578645164e-05, + "loss": 0.0296, + "step": 36920 + }, + { + "epoch": 2.314344801654446, + "grad_norm": 0.01965474896132946, + "learning_rate": 1.1548204597749679e-05, + "loss": 0.0037, + "step": 36930 + }, + { + "epoch": 2.314971485868271, + "grad_norm": 12.622506141662598, + "learning_rate": 1.1537649616854195e-05, + "loss": 0.2077, + "step": 36940 + }, + { + "epoch": 2.3155981700820956, + "grad_norm": 0.08890029788017273, + "learning_rate": 1.152709463595871e-05, + "loss": 0.0894, + "step": 36950 + }, + { + "epoch": 2.3162248542959203, + "grad_norm": 0.08074666559696198, + "learning_rate": 1.1516539655063224e-05, + "loss": 0.0451, + "step": 36960 + }, + { + "epoch": 2.316851538509745, + "grad_norm": 6.843764305114746, + "learning_rate": 1.150598467416774e-05, + "loss": 0.0899, + "step": 36970 + }, + { + "epoch": 2.3174782227235697, + "grad_norm": 0.012340700253844261, + "learning_rate": 1.1495429693272255e-05, + "loss": 0.0809, + "step": 36980 + }, + { + "epoch": 2.3181049069373945, + "grad_norm": 0.07329646497964859, + "learning_rate": 1.1484874712376772e-05, + "loss": 0.145, + "step": 36990 + }, + { + "epoch": 2.3187315911512187, + "grad_norm": 0.22346799075603485, + "learning_rate": 1.1474319731481287e-05, + "loss": 0.0036, + "step": 37000 + }, + { + "epoch": 2.3193582753650435, + "grad_norm": 0.43645739555358887, + "learning_rate": 1.1463764750585802e-05, + "loss": 0.0983, + "step": 37010 + }, + { + "epoch": 2.319984959578868, + "grad_norm": 0.11260388046503067, + "learning_rate": 1.1453209769690318e-05, + "loss": 0.1085, + "step": 37020 + }, + { + "epoch": 2.320611643792693, + "grad_norm": 0.09059934318065643, + "learning_rate": 1.1442654788794833e-05, + "loss": 0.0394, + "step": 37030 + }, + { + "epoch": 2.3212383280065176, + "grad_norm": 3.420612335205078, + "learning_rate": 1.1432099807899348e-05, + "loss": 0.0855, + "step": 37040 + }, + { + "epoch": 2.3218650122203424, + "grad_norm": 0.07549533247947693, + "learning_rate": 1.1421544827003863e-05, + "loss": 0.0024, + "step": 37050 + }, + { + "epoch": 2.3224916964341666, + "grad_norm": 0.01121055893599987, + "learning_rate": 1.1410989846108378e-05, + "loss": 0.0828, + "step": 37060 + }, + { + "epoch": 2.3231183806479914, + "grad_norm": 0.03671666979789734, + "learning_rate": 1.1400434865212895e-05, + "loss": 0.0616, + "step": 37070 + }, + { + "epoch": 2.323745064861816, + "grad_norm": 0.01181852724403143, + "learning_rate": 1.138987988431741e-05, + "loss": 0.0378, + "step": 37080 + }, + { + "epoch": 2.324371749075641, + "grad_norm": 0.634685218334198, + "learning_rate": 1.1379324903421926e-05, + "loss": 0.2232, + "step": 37090 + }, + { + "epoch": 2.3249984332894655, + "grad_norm": 2.2556233406066895, + "learning_rate": 1.1368769922526441e-05, + "loss": 0.1033, + "step": 37100 + }, + { + "epoch": 2.3256251175032903, + "grad_norm": 0.02632150985300541, + "learning_rate": 1.1358214941630955e-05, + "loss": 0.0698, + "step": 37110 + }, + { + "epoch": 2.326251801717115, + "grad_norm": 8.871074676513672, + "learning_rate": 1.1347659960735471e-05, + "loss": 0.0906, + "step": 37120 + }, + { + "epoch": 2.3268784859309393, + "grad_norm": 1.0586740970611572, + "learning_rate": 1.1337104979839986e-05, + "loss": 0.0763, + "step": 37130 + }, + { + "epoch": 2.327505170144764, + "grad_norm": 0.023631436750292778, + "learning_rate": 1.1326549998944503e-05, + "loss": 0.0057, + "step": 37140 + }, + { + "epoch": 2.3281318543585887, + "grad_norm": 0.09110420942306519, + "learning_rate": 1.1315995018049018e-05, + "loss": 0.0792, + "step": 37150 + }, + { + "epoch": 2.3287585385724134, + "grad_norm": 0.1705956906080246, + "learning_rate": 1.1305440037153533e-05, + "loss": 0.0269, + "step": 37160 + }, + { + "epoch": 2.329385222786238, + "grad_norm": 0.04436657577753067, + "learning_rate": 1.129488505625805e-05, + "loss": 0.153, + "step": 37170 + }, + { + "epoch": 2.3300119070000624, + "grad_norm": 0.01578608714044094, + "learning_rate": 1.1284330075362564e-05, + "loss": 0.002, + "step": 37180 + }, + { + "epoch": 2.330638591213887, + "grad_norm": 0.011682651937007904, + "learning_rate": 1.127377509446708e-05, + "loss": 0.0338, + "step": 37190 + }, + { + "epoch": 2.331265275427712, + "grad_norm": 0.02722298726439476, + "learning_rate": 1.1263220113571594e-05, + "loss": 0.0246, + "step": 37200 + }, + { + "epoch": 2.3318919596415366, + "grad_norm": 0.03455738350749016, + "learning_rate": 1.125266513267611e-05, + "loss": 0.0013, + "step": 37210 + }, + { + "epoch": 2.3325186438553613, + "grad_norm": 0.6584683060646057, + "learning_rate": 1.1242110151780626e-05, + "loss": 0.3428, + "step": 37220 + }, + { + "epoch": 2.333145328069186, + "grad_norm": 0.017207475379109383, + "learning_rate": 1.1231555170885141e-05, + "loss": 0.0411, + "step": 37230 + }, + { + "epoch": 2.333772012283011, + "grad_norm": 4.051939964294434, + "learning_rate": 1.1221000189989658e-05, + "loss": 0.1354, + "step": 37240 + }, + { + "epoch": 2.334398696496835, + "grad_norm": 16.780792236328125, + "learning_rate": 1.1210445209094173e-05, + "loss": 0.0758, + "step": 37250 + }, + { + "epoch": 2.33502538071066, + "grad_norm": 0.6355787515640259, + "learning_rate": 1.1199890228198687e-05, + "loss": 0.1064, + "step": 37260 + }, + { + "epoch": 2.3356520649244845, + "grad_norm": 0.046288829296827316, + "learning_rate": 1.1189335247303202e-05, + "loss": 0.0298, + "step": 37270 + }, + { + "epoch": 2.3362787491383092, + "grad_norm": 5.317742347717285, + "learning_rate": 1.1178780266407717e-05, + "loss": 0.0783, + "step": 37280 + }, + { + "epoch": 2.336905433352134, + "grad_norm": 0.026349328458309174, + "learning_rate": 1.1168225285512234e-05, + "loss": 0.0038, + "step": 37290 + }, + { + "epoch": 2.3375321175659587, + "grad_norm": 0.026744646951556206, + "learning_rate": 1.1157670304616749e-05, + "loss": 0.0264, + "step": 37300 + }, + { + "epoch": 2.338158801779783, + "grad_norm": 0.013837196864187717, + "learning_rate": 1.1147115323721264e-05, + "loss": 0.0502, + "step": 37310 + }, + { + "epoch": 2.3387854859936077, + "grad_norm": 0.31136569380760193, + "learning_rate": 1.113656034282578e-05, + "loss": 0.0065, + "step": 37320 + }, + { + "epoch": 2.3394121702074324, + "grad_norm": 0.013896308839321136, + "learning_rate": 1.1126005361930296e-05, + "loss": 0.1444, + "step": 37330 + }, + { + "epoch": 2.340038854421257, + "grad_norm": 1.2215086221694946, + "learning_rate": 1.1115450381034812e-05, + "loss": 0.0053, + "step": 37340 + }, + { + "epoch": 2.340665538635082, + "grad_norm": 0.032671570777893066, + "learning_rate": 1.1104895400139325e-05, + "loss": 0.0256, + "step": 37350 + }, + { + "epoch": 2.3412922228489066, + "grad_norm": 21.453617095947266, + "learning_rate": 1.1094340419243842e-05, + "loss": 0.061, + "step": 37360 + }, + { + "epoch": 2.3419189070627313, + "grad_norm": 0.21888981759548187, + "learning_rate": 1.1083785438348357e-05, + "loss": 0.0257, + "step": 37370 + }, + { + "epoch": 2.3425455912765556, + "grad_norm": 5.909558296203613, + "learning_rate": 1.1073230457452872e-05, + "loss": 0.0862, + "step": 37380 + }, + { + "epoch": 2.3431722754903803, + "grad_norm": 0.038163937628269196, + "learning_rate": 1.1062675476557389e-05, + "loss": 0.0518, + "step": 37390 + }, + { + "epoch": 2.343798959704205, + "grad_norm": 0.045827217400074005, + "learning_rate": 1.1052120495661904e-05, + "loss": 0.1041, + "step": 37400 + }, + { + "epoch": 2.3444256439180298, + "grad_norm": 7.460287570953369, + "learning_rate": 1.1041565514766419e-05, + "loss": 0.028, + "step": 37410 + }, + { + "epoch": 2.3450523281318545, + "grad_norm": 0.032291725277900696, + "learning_rate": 1.1031010533870935e-05, + "loss": 0.0369, + "step": 37420 + }, + { + "epoch": 2.3456790123456788, + "grad_norm": 0.025432724505662918, + "learning_rate": 1.1020455552975449e-05, + "loss": 0.0819, + "step": 37430 + }, + { + "epoch": 2.3463056965595035, + "grad_norm": 0.009814858436584473, + "learning_rate": 1.1009900572079965e-05, + "loss": 0.1122, + "step": 37440 + }, + { + "epoch": 2.346932380773328, + "grad_norm": 0.10019972175359726, + "learning_rate": 1.099934559118448e-05, + "loss": 0.0015, + "step": 37450 + }, + { + "epoch": 2.347559064987153, + "grad_norm": 5.408548831939697, + "learning_rate": 1.0988790610288995e-05, + "loss": 0.1065, + "step": 37460 + }, + { + "epoch": 2.3481857492009777, + "grad_norm": 5.011567115783691, + "learning_rate": 1.0978235629393512e-05, + "loss": 0.1711, + "step": 37470 + }, + { + "epoch": 2.3488124334148024, + "grad_norm": 0.2545178234577179, + "learning_rate": 1.0967680648498027e-05, + "loss": 0.1407, + "step": 37480 + }, + { + "epoch": 2.349439117628627, + "grad_norm": 0.04436579346656799, + "learning_rate": 1.0957125667602543e-05, + "loss": 0.1503, + "step": 37490 + }, + { + "epoch": 2.3500658018424514, + "grad_norm": 18.1503849029541, + "learning_rate": 1.0946570686707057e-05, + "loss": 0.1198, + "step": 37500 + }, + { + "epoch": 2.350692486056276, + "grad_norm": 0.2500666379928589, + "learning_rate": 1.0936015705811573e-05, + "loss": 0.071, + "step": 37510 + }, + { + "epoch": 2.351319170270101, + "grad_norm": 0.2713271677494049, + "learning_rate": 1.0925460724916088e-05, + "loss": 0.1216, + "step": 37520 + }, + { + "epoch": 2.3519458544839256, + "grad_norm": 0.2982094883918762, + "learning_rate": 1.0914905744020603e-05, + "loss": 0.09, + "step": 37530 + }, + { + "epoch": 2.3525725386977503, + "grad_norm": 0.24116140604019165, + "learning_rate": 1.090435076312512e-05, + "loss": 0.0944, + "step": 37540 + }, + { + "epoch": 2.353199222911575, + "grad_norm": 0.02480962686240673, + "learning_rate": 1.0893795782229635e-05, + "loss": 0.0286, + "step": 37550 + }, + { + "epoch": 2.3538259071253993, + "grad_norm": 0.05994749814271927, + "learning_rate": 1.088324080133415e-05, + "loss": 0.1056, + "step": 37560 + }, + { + "epoch": 2.354452591339224, + "grad_norm": 4.796363830566406, + "learning_rate": 1.0872685820438666e-05, + "loss": 0.2076, + "step": 37570 + }, + { + "epoch": 2.3550792755530487, + "grad_norm": 0.0286555178463459, + "learning_rate": 1.086213083954318e-05, + "loss": 0.0233, + "step": 37580 + }, + { + "epoch": 2.3557059597668735, + "grad_norm": 0.052487812936306, + "learning_rate": 1.0851575858647696e-05, + "loss": 0.04, + "step": 37590 + }, + { + "epoch": 2.356332643980698, + "grad_norm": 0.03592820465564728, + "learning_rate": 1.0841020877752211e-05, + "loss": 0.0173, + "step": 37600 + }, + { + "epoch": 2.356959328194523, + "grad_norm": 0.015476588159799576, + "learning_rate": 1.0830465896856728e-05, + "loss": 0.0018, + "step": 37610 + }, + { + "epoch": 2.3575860124083476, + "grad_norm": 0.013065283186733723, + "learning_rate": 1.0819910915961243e-05, + "loss": 0.0705, + "step": 37620 + }, + { + "epoch": 2.358212696622172, + "grad_norm": 9.37665843963623, + "learning_rate": 1.0809355935065758e-05, + "loss": 0.1219, + "step": 37630 + }, + { + "epoch": 2.3588393808359966, + "grad_norm": 0.2045561969280243, + "learning_rate": 1.0798800954170275e-05, + "loss": 0.0633, + "step": 37640 + }, + { + "epoch": 2.3594660650498214, + "grad_norm": 0.18942023813724518, + "learning_rate": 1.078824597327479e-05, + "loss": 0.0611, + "step": 37650 + }, + { + "epoch": 2.360092749263646, + "grad_norm": 77.0622787475586, + "learning_rate": 1.0777690992379304e-05, + "loss": 0.1633, + "step": 37660 + }, + { + "epoch": 2.360719433477471, + "grad_norm": 0.24678970873355865, + "learning_rate": 1.076713601148382e-05, + "loss": 0.052, + "step": 37670 + }, + { + "epoch": 2.3613461176912955, + "grad_norm": 0.008172067813575268, + "learning_rate": 1.0756581030588334e-05, + "loss": 0.1067, + "step": 37680 + }, + { + "epoch": 2.36197280190512, + "grad_norm": 0.05612856149673462, + "learning_rate": 1.0746026049692851e-05, + "loss": 0.1683, + "step": 37690 + }, + { + "epoch": 2.3625994861189445, + "grad_norm": 0.016600340604782104, + "learning_rate": 1.0735471068797366e-05, + "loss": 0.0029, + "step": 37700 + }, + { + "epoch": 2.3632261703327693, + "grad_norm": 0.4090496897697449, + "learning_rate": 1.0724916087901881e-05, + "loss": 0.0012, + "step": 37710 + }, + { + "epoch": 2.363852854546594, + "grad_norm": 0.01410214975476265, + "learning_rate": 1.0714361107006398e-05, + "loss": 0.0022, + "step": 37720 + }, + { + "epoch": 2.3644795387604187, + "grad_norm": 0.045551199465990067, + "learning_rate": 1.0703806126110913e-05, + "loss": 0.0283, + "step": 37730 + }, + { + "epoch": 2.3651062229742434, + "grad_norm": 2.0473251342773438, + "learning_rate": 1.0693251145215427e-05, + "loss": 0.0508, + "step": 37740 + }, + { + "epoch": 2.365732907188068, + "grad_norm": 6.74009370803833, + "learning_rate": 1.0682696164319942e-05, + "loss": 0.0948, + "step": 37750 + }, + { + "epoch": 2.3663595914018924, + "grad_norm": 0.0833032876253128, + "learning_rate": 1.0672141183424459e-05, + "loss": 0.0332, + "step": 37760 + }, + { + "epoch": 2.366986275615717, + "grad_norm": 15.18896484375, + "learning_rate": 1.0661586202528974e-05, + "loss": 0.0505, + "step": 37770 + }, + { + "epoch": 2.367612959829542, + "grad_norm": 0.3920779526233673, + "learning_rate": 1.0651031221633489e-05, + "loss": 0.0363, + "step": 37780 + }, + { + "epoch": 2.3682396440433666, + "grad_norm": 0.008296458050608635, + "learning_rate": 1.0640476240738006e-05, + "loss": 0.0467, + "step": 37790 + }, + { + "epoch": 2.3688663282571913, + "grad_norm": 0.014929085038602352, + "learning_rate": 1.062992125984252e-05, + "loss": 0.1838, + "step": 37800 + }, + { + "epoch": 2.3694930124710156, + "grad_norm": 0.024601779878139496, + "learning_rate": 1.0619366278947036e-05, + "loss": 0.0027, + "step": 37810 + }, + { + "epoch": 2.3701196966848403, + "grad_norm": 0.1446009874343872, + "learning_rate": 1.060881129805155e-05, + "loss": 0.0521, + "step": 37820 + }, + { + "epoch": 2.370746380898665, + "grad_norm": 0.022335579618811607, + "learning_rate": 1.0598256317156065e-05, + "loss": 0.2932, + "step": 37830 + }, + { + "epoch": 2.37137306511249, + "grad_norm": 0.03476012498140335, + "learning_rate": 1.0587701336260582e-05, + "loss": 0.2129, + "step": 37840 + }, + { + "epoch": 2.3719997493263145, + "grad_norm": 0.10861321538686752, + "learning_rate": 1.0577146355365097e-05, + "loss": 0.0755, + "step": 37850 + }, + { + "epoch": 2.3726264335401392, + "grad_norm": 0.04184696078300476, + "learning_rate": 1.0566591374469614e-05, + "loss": 0.0031, + "step": 37860 + }, + { + "epoch": 2.373253117753964, + "grad_norm": 6.86499547958374, + "learning_rate": 1.0556036393574129e-05, + "loss": 0.2977, + "step": 37870 + }, + { + "epoch": 2.3738798019677883, + "grad_norm": 0.027534818276762962, + "learning_rate": 1.0545481412678644e-05, + "loss": 0.0055, + "step": 37880 + }, + { + "epoch": 2.374506486181613, + "grad_norm": 1.948462963104248, + "learning_rate": 1.0534926431783159e-05, + "loss": 0.0605, + "step": 37890 + }, + { + "epoch": 2.3751331703954377, + "grad_norm": 8.56440258026123, + "learning_rate": 1.0524371450887674e-05, + "loss": 0.0694, + "step": 37900 + }, + { + "epoch": 2.3757598546092624, + "grad_norm": 0.39405912160873413, + "learning_rate": 1.051381646999219e-05, + "loss": 0.0028, + "step": 37910 + }, + { + "epoch": 2.376386538823087, + "grad_norm": 0.13846786320209503, + "learning_rate": 1.0503261489096705e-05, + "loss": 0.0018, + "step": 37920 + }, + { + "epoch": 2.377013223036912, + "grad_norm": 0.024694940075278282, + "learning_rate": 1.049270650820122e-05, + "loss": 0.0351, + "step": 37930 + }, + { + "epoch": 2.377639907250736, + "grad_norm": 0.013253679499030113, + "learning_rate": 1.0482151527305737e-05, + "loss": 0.0972, + "step": 37940 + }, + { + "epoch": 2.378266591464561, + "grad_norm": 0.010091162286698818, + "learning_rate": 1.0471596546410252e-05, + "loss": 0.0472, + "step": 37950 + }, + { + "epoch": 2.3788932756783856, + "grad_norm": 0.0070129623636603355, + "learning_rate": 1.0461041565514768e-05, + "loss": 0.1027, + "step": 37960 + }, + { + "epoch": 2.3795199598922103, + "grad_norm": 0.36958637833595276, + "learning_rate": 1.0450486584619282e-05, + "loss": 0.1343, + "step": 37970 + }, + { + "epoch": 2.380146644106035, + "grad_norm": 24.933622360229492, + "learning_rate": 1.0439931603723797e-05, + "loss": 0.0473, + "step": 37980 + }, + { + "epoch": 2.3807733283198598, + "grad_norm": 0.015931887552142143, + "learning_rate": 1.0429376622828313e-05, + "loss": 0.0813, + "step": 37990 + }, + { + "epoch": 2.3814000125336845, + "grad_norm": 0.03066008910536766, + "learning_rate": 1.0418821641932828e-05, + "loss": 0.0206, + "step": 38000 + }, + { + "epoch": 2.3820266967475088, + "grad_norm": 0.011086874641478062, + "learning_rate": 1.0408266661037345e-05, + "loss": 0.0025, + "step": 38010 + }, + { + "epoch": 2.3826533809613335, + "grad_norm": 0.011415200307965279, + "learning_rate": 1.039771168014186e-05, + "loss": 0.051, + "step": 38020 + }, + { + "epoch": 2.3832800651751582, + "grad_norm": 0.017173422500491142, + "learning_rate": 1.0387156699246375e-05, + "loss": 0.1283, + "step": 38030 + }, + { + "epoch": 2.383906749388983, + "grad_norm": 0.03894858434796333, + "learning_rate": 1.0376601718350891e-05, + "loss": 0.0005, + "step": 38040 + }, + { + "epoch": 2.3845334336028077, + "grad_norm": 0.020368503406643867, + "learning_rate": 1.0366046737455405e-05, + "loss": 0.0711, + "step": 38050 + }, + { + "epoch": 2.385160117816632, + "grad_norm": 0.01607118360698223, + "learning_rate": 1.0355491756559921e-05, + "loss": 0.0114, + "step": 38060 + }, + { + "epoch": 2.3857868020304567, + "grad_norm": 14.362074851989746, + "learning_rate": 1.0344936775664436e-05, + "loss": 0.0455, + "step": 38070 + }, + { + "epoch": 2.3864134862442814, + "grad_norm": 0.18846359848976135, + "learning_rate": 1.0334381794768951e-05, + "loss": 0.0081, + "step": 38080 + }, + { + "epoch": 2.387040170458106, + "grad_norm": 0.020711679011583328, + "learning_rate": 1.0323826813873468e-05, + "loss": 0.1372, + "step": 38090 + }, + { + "epoch": 2.387666854671931, + "grad_norm": 0.01883024349808693, + "learning_rate": 1.0313271832977983e-05, + "loss": 0.0023, + "step": 38100 + }, + { + "epoch": 2.3882935388857556, + "grad_norm": 0.052873801440000534, + "learning_rate": 1.03027168520825e-05, + "loss": 0.0184, + "step": 38110 + }, + { + "epoch": 2.3889202230995803, + "grad_norm": 0.7084928750991821, + "learning_rate": 1.0292161871187013e-05, + "loss": 0.0062, + "step": 38120 + }, + { + "epoch": 2.389546907313405, + "grad_norm": 0.040277425199747086, + "learning_rate": 1.0281606890291528e-05, + "loss": 0.0184, + "step": 38130 + }, + { + "epoch": 2.3901735915272293, + "grad_norm": 12.49878978729248, + "learning_rate": 1.0271051909396044e-05, + "loss": 0.0084, + "step": 38140 + }, + { + "epoch": 2.390800275741054, + "grad_norm": 0.007965478114783764, + "learning_rate": 1.026049692850056e-05, + "loss": 0.0019, + "step": 38150 + }, + { + "epoch": 2.3914269599548788, + "grad_norm": 0.0063006095588207245, + "learning_rate": 1.0249941947605076e-05, + "loss": 0.0573, + "step": 38160 + }, + { + "epoch": 2.3920536441687035, + "grad_norm": 4.532165050506592, + "learning_rate": 1.0239386966709591e-05, + "loss": 0.0936, + "step": 38170 + }, + { + "epoch": 2.392680328382528, + "grad_norm": 0.012650924734771252, + "learning_rate": 1.0228831985814106e-05, + "loss": 0.0184, + "step": 38180 + }, + { + "epoch": 2.3933070125963525, + "grad_norm": 0.1698913872241974, + "learning_rate": 1.0218277004918623e-05, + "loss": 0.1913, + "step": 38190 + }, + { + "epoch": 2.393933696810177, + "grad_norm": 0.010228320956230164, + "learning_rate": 1.0207722024023136e-05, + "loss": 0.1132, + "step": 38200 + }, + { + "epoch": 2.394560381024002, + "grad_norm": 0.036675091832876205, + "learning_rate": 1.0197167043127652e-05, + "loss": 0.1673, + "step": 38210 + }, + { + "epoch": 2.3951870652378267, + "grad_norm": 0.07806365191936493, + "learning_rate": 1.0186612062232167e-05, + "loss": 0.1276, + "step": 38220 + }, + { + "epoch": 2.3958137494516514, + "grad_norm": 0.01533064991235733, + "learning_rate": 1.0176057081336682e-05, + "loss": 0.0448, + "step": 38230 + }, + { + "epoch": 2.396440433665476, + "grad_norm": 19.43166160583496, + "learning_rate": 1.0165502100441199e-05, + "loss": 0.1547, + "step": 38240 + }, + { + "epoch": 2.397067117879301, + "grad_norm": 0.04978703707456589, + "learning_rate": 1.0154947119545714e-05, + "loss": 0.0024, + "step": 38250 + }, + { + "epoch": 2.397693802093125, + "grad_norm": 0.014913588762283325, + "learning_rate": 1.014439213865023e-05, + "loss": 0.0507, + "step": 38260 + }, + { + "epoch": 2.39832048630695, + "grad_norm": 0.029882259666919708, + "learning_rate": 1.0133837157754746e-05, + "loss": 0.1431, + "step": 38270 + }, + { + "epoch": 2.3989471705207746, + "grad_norm": 0.27971410751342773, + "learning_rate": 1.012328217685926e-05, + "loss": 0.0017, + "step": 38280 + }, + { + "epoch": 2.3995738547345993, + "grad_norm": 0.0225802194327116, + "learning_rate": 1.0112727195963776e-05, + "loss": 0.0679, + "step": 38290 + }, + { + "epoch": 2.400200538948424, + "grad_norm": 0.11946109682321548, + "learning_rate": 1.010217221506829e-05, + "loss": 0.1072, + "step": 38300 + }, + { + "epoch": 2.4008272231622487, + "grad_norm": 0.018057910725474358, + "learning_rate": 1.0091617234172807e-05, + "loss": 0.0599, + "step": 38310 + }, + { + "epoch": 2.401453907376073, + "grad_norm": 0.16458840668201447, + "learning_rate": 1.0081062253277322e-05, + "loss": 0.0039, + "step": 38320 + }, + { + "epoch": 2.4020805915898977, + "grad_norm": 0.013043378479778767, + "learning_rate": 1.0070507272381837e-05, + "loss": 0.0631, + "step": 38330 + }, + { + "epoch": 2.4027072758037225, + "grad_norm": 0.08399718254804611, + "learning_rate": 1.0059952291486354e-05, + "loss": 0.065, + "step": 38340 + }, + { + "epoch": 2.403333960017547, + "grad_norm": 0.020437980070710182, + "learning_rate": 1.0049397310590869e-05, + "loss": 0.0777, + "step": 38350 + }, + { + "epoch": 2.403960644231372, + "grad_norm": 5.764674663543701, + "learning_rate": 1.0038842329695384e-05, + "loss": 0.0664, + "step": 38360 + }, + { + "epoch": 2.4045873284451966, + "grad_norm": 0.020989634096622467, + "learning_rate": 1.0028287348799899e-05, + "loss": 0.0679, + "step": 38370 + }, + { + "epoch": 2.4052140126590213, + "grad_norm": 0.12513042986392975, + "learning_rate": 1.0017732367904415e-05, + "loss": 0.0018, + "step": 38380 + }, + { + "epoch": 2.4058406968728456, + "grad_norm": 0.040014710277318954, + "learning_rate": 1.000717738700893e-05, + "loss": 0.0711, + "step": 38390 + }, + { + "epoch": 2.4064673810866704, + "grad_norm": 5.037256240844727, + "learning_rate": 9.996622406113445e-06, + "loss": 0.3477, + "step": 38400 + }, + { + "epoch": 2.407094065300495, + "grad_norm": 0.5470215678215027, + "learning_rate": 9.986067425217962e-06, + "loss": 0.0053, + "step": 38410 + }, + { + "epoch": 2.40772074951432, + "grad_norm": 0.16532324254512787, + "learning_rate": 9.975512444322477e-06, + "loss": 0.0031, + "step": 38420 + }, + { + "epoch": 2.4083474337281445, + "grad_norm": 0.011890468187630177, + "learning_rate": 9.964957463426992e-06, + "loss": 0.1645, + "step": 38430 + }, + { + "epoch": 2.408974117941969, + "grad_norm": 9.385170936584473, + "learning_rate": 9.954402482531507e-06, + "loss": 0.1329, + "step": 38440 + }, + { + "epoch": 2.4096008021557935, + "grad_norm": 0.021074773743748665, + "learning_rate": 9.943847501636022e-06, + "loss": 0.002, + "step": 38450 + }, + { + "epoch": 2.4102274863696183, + "grad_norm": 0.01659083366394043, + "learning_rate": 9.933292520740538e-06, + "loss": 0.0025, + "step": 38460 + }, + { + "epoch": 2.410854170583443, + "grad_norm": 0.01685764454305172, + "learning_rate": 9.922737539845053e-06, + "loss": 0.0032, + "step": 38470 + }, + { + "epoch": 2.4114808547972677, + "grad_norm": 0.007443627342581749, + "learning_rate": 9.912182558949568e-06, + "loss": 0.0022, + "step": 38480 + }, + { + "epoch": 2.4121075390110924, + "grad_norm": 0.13265888392925262, + "learning_rate": 9.901627578054085e-06, + "loss": 0.1902, + "step": 38490 + }, + { + "epoch": 2.412734223224917, + "grad_norm": 0.04250665754079819, + "learning_rate": 9.8910725971586e-06, + "loss": 0.0988, + "step": 38500 + }, + { + "epoch": 2.4133609074387414, + "grad_norm": 0.2933494448661804, + "learning_rate": 9.880517616263115e-06, + "loss": 0.0713, + "step": 38510 + }, + { + "epoch": 2.413987591652566, + "grad_norm": 0.030012181028723717, + "learning_rate": 9.86996263536763e-06, + "loss": 0.0635, + "step": 38520 + }, + { + "epoch": 2.414614275866391, + "grad_norm": 0.21046006679534912, + "learning_rate": 9.859407654472146e-06, + "loss": 0.0017, + "step": 38530 + }, + { + "epoch": 2.4152409600802156, + "grad_norm": 0.006909917574375868, + "learning_rate": 9.848852673576661e-06, + "loss": 0.2198, + "step": 38540 + }, + { + "epoch": 2.4158676442940403, + "grad_norm": 0.1460009217262268, + "learning_rate": 9.838297692681176e-06, + "loss": 0.0016, + "step": 38550 + }, + { + "epoch": 2.416494328507865, + "grad_norm": 0.49015122652053833, + "learning_rate": 9.827742711785693e-06, + "loss": 0.0041, + "step": 38560 + }, + { + "epoch": 2.4171210127216893, + "grad_norm": 0.0862099900841713, + "learning_rate": 9.817187730890208e-06, + "loss": 0.1908, + "step": 38570 + }, + { + "epoch": 2.417747696935514, + "grad_norm": 0.007962826639413834, + "learning_rate": 9.806632749994723e-06, + "loss": 0.0472, + "step": 38580 + }, + { + "epoch": 2.418374381149339, + "grad_norm": 0.023178890347480774, + "learning_rate": 9.796077769099238e-06, + "loss": 0.0047, + "step": 38590 + }, + { + "epoch": 2.4190010653631635, + "grad_norm": 0.013865641318261623, + "learning_rate": 9.785522788203753e-06, + "loss": 0.0388, + "step": 38600 + }, + { + "epoch": 2.4196277495769882, + "grad_norm": 0.03907917067408562, + "learning_rate": 9.77496780730827e-06, + "loss": 0.0341, + "step": 38610 + }, + { + "epoch": 2.420254433790813, + "grad_norm": 0.25156423449516296, + "learning_rate": 9.764412826412784e-06, + "loss": 0.0868, + "step": 38620 + }, + { + "epoch": 2.4208811180046377, + "grad_norm": 5.6584343910217285, + "learning_rate": 9.753857845517301e-06, + "loss": 0.1857, + "step": 38630 + }, + { + "epoch": 2.421507802218462, + "grad_norm": 6.962517261505127, + "learning_rate": 9.743302864621816e-06, + "loss": 0.0857, + "step": 38640 + }, + { + "epoch": 2.4221344864322867, + "grad_norm": 0.031737297773361206, + "learning_rate": 9.732747883726331e-06, + "loss": 0.1053, + "step": 38650 + }, + { + "epoch": 2.4227611706461114, + "grad_norm": 0.24770425260066986, + "learning_rate": 9.722192902830848e-06, + "loss": 0.0985, + "step": 38660 + }, + { + "epoch": 2.423387854859936, + "grad_norm": 0.041502561420202255, + "learning_rate": 9.71163792193536e-06, + "loss": 0.0038, + "step": 38670 + }, + { + "epoch": 2.424014539073761, + "grad_norm": 0.026886025443673134, + "learning_rate": 9.701082941039877e-06, + "loss": 0.0027, + "step": 38680 + }, + { + "epoch": 2.4246412232875856, + "grad_norm": 5.663731575012207, + "learning_rate": 9.690527960144392e-06, + "loss": 0.1705, + "step": 38690 + }, + { + "epoch": 2.42526790750141, + "grad_norm": 0.05266406387090683, + "learning_rate": 9.679972979248907e-06, + "loss": 0.0965, + "step": 38700 + }, + { + "epoch": 2.4258945917152346, + "grad_norm": 0.025294840335845947, + "learning_rate": 9.669417998353424e-06, + "loss": 0.0695, + "step": 38710 + }, + { + "epoch": 2.4265212759290593, + "grad_norm": 24.895811080932617, + "learning_rate": 9.658863017457939e-06, + "loss": 0.1557, + "step": 38720 + }, + { + "epoch": 2.427147960142884, + "grad_norm": 0.04512655735015869, + "learning_rate": 9.648308036562454e-06, + "loss": 0.0673, + "step": 38730 + }, + { + "epoch": 2.4277746443567088, + "grad_norm": 0.47210386395454407, + "learning_rate": 9.637753055666969e-06, + "loss": 0.1174, + "step": 38740 + }, + { + "epoch": 2.4284013285705335, + "grad_norm": 0.07949967682361603, + "learning_rate": 9.627198074771484e-06, + "loss": 0.0547, + "step": 38750 + }, + { + "epoch": 2.429028012784358, + "grad_norm": 0.11611676961183548, + "learning_rate": 9.616643093876e-06, + "loss": 0.1079, + "step": 38760 + }, + { + "epoch": 2.4296546969981825, + "grad_norm": 9.299555778503418, + "learning_rate": 9.606088112980515e-06, + "loss": 0.0926, + "step": 38770 + }, + { + "epoch": 2.430281381212007, + "grad_norm": 0.05925847962498665, + "learning_rate": 9.595533132085032e-06, + "loss": 0.215, + "step": 38780 + }, + { + "epoch": 2.430908065425832, + "grad_norm": 0.07853665202856064, + "learning_rate": 9.584978151189547e-06, + "loss": 0.0559, + "step": 38790 + }, + { + "epoch": 2.4315347496396567, + "grad_norm": 0.1322367936372757, + "learning_rate": 9.574423170294062e-06, + "loss": 0.0045, + "step": 38800 + }, + { + "epoch": 2.4321614338534814, + "grad_norm": 0.22832946479320526, + "learning_rate": 9.563868189398579e-06, + "loss": 0.1404, + "step": 38810 + }, + { + "epoch": 2.4327881180673057, + "grad_norm": 0.021371060982346535, + "learning_rate": 9.553313208503092e-06, + "loss": 0.1971, + "step": 38820 + }, + { + "epoch": 2.4334148022811304, + "grad_norm": 0.03888564184308052, + "learning_rate": 9.542758227607609e-06, + "loss": 0.059, + "step": 38830 + }, + { + "epoch": 2.434041486494955, + "grad_norm": 0.0451219379901886, + "learning_rate": 9.532203246712124e-06, + "loss": 0.094, + "step": 38840 + }, + { + "epoch": 2.43466817070878, + "grad_norm": 0.03573371469974518, + "learning_rate": 9.521648265816639e-06, + "loss": 0.0276, + "step": 38850 + }, + { + "epoch": 2.4352948549226046, + "grad_norm": 0.05684259161353111, + "learning_rate": 9.511093284921155e-06, + "loss": 0.0492, + "step": 38860 + }, + { + "epoch": 2.4359215391364293, + "grad_norm": 0.021530529484152794, + "learning_rate": 9.50053830402567e-06, + "loss": 0.0352, + "step": 38870 + }, + { + "epoch": 2.436548223350254, + "grad_norm": 6.627198219299316, + "learning_rate": 9.489983323130187e-06, + "loss": 0.2246, + "step": 38880 + }, + { + "epoch": 2.4371749075640783, + "grad_norm": 8.19845962524414, + "learning_rate": 9.479428342234702e-06, + "loss": 0.1038, + "step": 38890 + }, + { + "epoch": 2.437801591777903, + "grad_norm": 0.13801322877407074, + "learning_rate": 9.468873361339215e-06, + "loss": 0.0036, + "step": 38900 + }, + { + "epoch": 2.4384282759917277, + "grad_norm": 0.16967302560806274, + "learning_rate": 9.458318380443732e-06, + "loss": 0.0364, + "step": 38910 + }, + { + "epoch": 2.4390549602055525, + "grad_norm": 0.4350135922431946, + "learning_rate": 9.447763399548247e-06, + "loss": 0.028, + "step": 38920 + }, + { + "epoch": 2.439681644419377, + "grad_norm": 4.991820812225342, + "learning_rate": 9.437208418652763e-06, + "loss": 0.1386, + "step": 38930 + }, + { + "epoch": 2.440308328633202, + "grad_norm": 0.03626188263297081, + "learning_rate": 9.426653437757278e-06, + "loss": 0.0017, + "step": 38940 + }, + { + "epoch": 2.440935012847026, + "grad_norm": 0.006887183059006929, + "learning_rate": 9.416098456861793e-06, + "loss": 0.1327, + "step": 38950 + }, + { + "epoch": 2.441561697060851, + "grad_norm": 0.005281991325318813, + "learning_rate": 9.40554347596631e-06, + "loss": 0.004, + "step": 38960 + }, + { + "epoch": 2.4421883812746756, + "grad_norm": 0.00498551269993186, + "learning_rate": 9.394988495070825e-06, + "loss": 0.0388, + "step": 38970 + }, + { + "epoch": 2.4428150654885004, + "grad_norm": 0.017654692754149437, + "learning_rate": 9.38443351417534e-06, + "loss": 0.1044, + "step": 38980 + }, + { + "epoch": 2.443441749702325, + "grad_norm": 0.06669986993074417, + "learning_rate": 9.373878533279855e-06, + "loss": 0.1428, + "step": 38990 + }, + { + "epoch": 2.44406843391615, + "grad_norm": 0.19353161752223969, + "learning_rate": 9.36332355238437e-06, + "loss": 0.0032, + "step": 39000 + }, + { + "epoch": 2.4446951181299745, + "grad_norm": 0.19353748857975006, + "learning_rate": 9.352768571488886e-06, + "loss": 0.0608, + "step": 39010 + }, + { + "epoch": 2.445321802343799, + "grad_norm": 0.011640098877251148, + "learning_rate": 9.342213590593401e-06, + "loss": 0.021, + "step": 39020 + }, + { + "epoch": 2.4459484865576235, + "grad_norm": 0.052307840436697006, + "learning_rate": 9.331658609697918e-06, + "loss": 0.028, + "step": 39030 + }, + { + "epoch": 2.4465751707714483, + "grad_norm": 0.01141645573079586, + "learning_rate": 9.321103628802433e-06, + "loss": 0.051, + "step": 39040 + }, + { + "epoch": 2.447201854985273, + "grad_norm": 0.012516812421381474, + "learning_rate": 9.310548647906948e-06, + "loss": 0.0318, + "step": 39050 + }, + { + "epoch": 2.4478285391990977, + "grad_norm": 0.07261917740106583, + "learning_rate": 9.299993667011463e-06, + "loss": 0.0009, + "step": 39060 + }, + { + "epoch": 2.448455223412922, + "grad_norm": 0.07534591108560562, + "learning_rate": 9.289438686115978e-06, + "loss": 0.0406, + "step": 39070 + }, + { + "epoch": 2.4490819076267467, + "grad_norm": 0.012093325145542622, + "learning_rate": 9.278883705220494e-06, + "loss": 0.069, + "step": 39080 + }, + { + "epoch": 2.4497085918405714, + "grad_norm": 0.021784339100122452, + "learning_rate": 9.26832872432501e-06, + "loss": 0.1769, + "step": 39090 + }, + { + "epoch": 2.450335276054396, + "grad_norm": 0.08205260336399078, + "learning_rate": 9.257773743429524e-06, + "loss": 0.0058, + "step": 39100 + }, + { + "epoch": 2.450961960268221, + "grad_norm": 0.012557939626276493, + "learning_rate": 9.247218762534041e-06, + "loss": 0.0407, + "step": 39110 + }, + { + "epoch": 2.4515886444820456, + "grad_norm": 0.00688109640032053, + "learning_rate": 9.236663781638556e-06, + "loss": 0.0449, + "step": 39120 + }, + { + "epoch": 2.4522153286958703, + "grad_norm": 0.035959869623184204, + "learning_rate": 9.226108800743071e-06, + "loss": 0.0687, + "step": 39130 + }, + { + "epoch": 2.4528420129096946, + "grad_norm": 18.920211791992188, + "learning_rate": 9.215553819847586e-06, + "loss": 0.1367, + "step": 39140 + }, + { + "epoch": 2.4534686971235193, + "grad_norm": 0.011474884115159512, + "learning_rate": 9.2049988389521e-06, + "loss": 0.0033, + "step": 39150 + }, + { + "epoch": 2.454095381337344, + "grad_norm": 5.881065368652344, + "learning_rate": 9.194443858056617e-06, + "loss": 0.0946, + "step": 39160 + }, + { + "epoch": 2.454722065551169, + "grad_norm": 0.00592809310182929, + "learning_rate": 9.183888877161132e-06, + "loss": 0.0388, + "step": 39170 + }, + { + "epoch": 2.4553487497649935, + "grad_norm": 0.6982523202896118, + "learning_rate": 9.173333896265649e-06, + "loss": 0.1002, + "step": 39180 + }, + { + "epoch": 2.4559754339788182, + "grad_norm": 0.012783760204911232, + "learning_rate": 9.162778915370164e-06, + "loss": 0.0076, + "step": 39190 + }, + { + "epoch": 2.4566021181926425, + "grad_norm": 6.998777389526367, + "learning_rate": 9.152223934474679e-06, + "loss": 0.053, + "step": 39200 + }, + { + "epoch": 2.4572288024064672, + "grad_norm": 0.18721304833889008, + "learning_rate": 9.141668953579194e-06, + "loss": 0.0025, + "step": 39210 + }, + { + "epoch": 2.457855486620292, + "grad_norm": 0.012833227403461933, + "learning_rate": 9.131113972683709e-06, + "loss": 0.0976, + "step": 39220 + }, + { + "epoch": 2.4584821708341167, + "grad_norm": 0.014098461717367172, + "learning_rate": 9.120558991788226e-06, + "loss": 0.0631, + "step": 39230 + }, + { + "epoch": 2.4591088550479414, + "grad_norm": 0.7037414908409119, + "learning_rate": 9.11000401089274e-06, + "loss": 0.0014, + "step": 39240 + }, + { + "epoch": 2.459735539261766, + "grad_norm": 0.0042467243038117886, + "learning_rate": 9.099449029997255e-06, + "loss": 0.0062, + "step": 39250 + }, + { + "epoch": 2.460362223475591, + "grad_norm": 0.005058089271187782, + "learning_rate": 9.088894049101772e-06, + "loss": 0.0743, + "step": 39260 + }, + { + "epoch": 2.460988907689415, + "grad_norm": 0.009189349599182606, + "learning_rate": 9.078339068206287e-06, + "loss": 0.1397, + "step": 39270 + }, + { + "epoch": 2.46161559190324, + "grad_norm": 0.005269974935799837, + "learning_rate": 9.067784087310804e-06, + "loss": 0.0057, + "step": 39280 + }, + { + "epoch": 2.4622422761170646, + "grad_norm": 1.0407090187072754, + "learning_rate": 9.057229106415317e-06, + "loss": 0.043, + "step": 39290 + }, + { + "epoch": 2.4628689603308893, + "grad_norm": 0.6664018630981445, + "learning_rate": 9.046674125519834e-06, + "loss": 0.0912, + "step": 39300 + }, + { + "epoch": 2.463495644544714, + "grad_norm": 0.008333146572113037, + "learning_rate": 9.036119144624349e-06, + "loss": 0.315, + "step": 39310 + }, + { + "epoch": 2.4641223287585388, + "grad_norm": 6.8014445304870605, + "learning_rate": 9.025564163728864e-06, + "loss": 0.0057, + "step": 39320 + }, + { + "epoch": 2.464749012972363, + "grad_norm": 0.10319343209266663, + "learning_rate": 9.01500918283338e-06, + "loss": 0.0191, + "step": 39330 + }, + { + "epoch": 2.4653756971861878, + "grad_norm": 17.48542594909668, + "learning_rate": 9.004454201937895e-06, + "loss": 0.2083, + "step": 39340 + }, + { + "epoch": 2.4660023814000125, + "grad_norm": 0.022007020190358162, + "learning_rate": 8.99389922104241e-06, + "loss": 0.002, + "step": 39350 + }, + { + "epoch": 2.466629065613837, + "grad_norm": 0.08251410722732544, + "learning_rate": 8.983344240146927e-06, + "loss": 0.1012, + "step": 39360 + }, + { + "epoch": 2.467255749827662, + "grad_norm": 0.1723119467496872, + "learning_rate": 8.97278925925144e-06, + "loss": 0.104, + "step": 39370 + }, + { + "epoch": 2.4678824340414867, + "grad_norm": 0.06838217377662659, + "learning_rate": 8.962234278355957e-06, + "loss": 0.002, + "step": 39380 + }, + { + "epoch": 2.4685091182553114, + "grad_norm": 0.01145200990140438, + "learning_rate": 8.951679297460472e-06, + "loss": 0.0015, + "step": 39390 + }, + { + "epoch": 2.4691358024691357, + "grad_norm": 0.01762591302394867, + "learning_rate": 8.941124316564988e-06, + "loss": 0.0528, + "step": 39400 + }, + { + "epoch": 2.4697624866829604, + "grad_norm": 0.11748268455266953, + "learning_rate": 8.930569335669503e-06, + "loss": 0.0992, + "step": 39410 + }, + { + "epoch": 2.470389170896785, + "grad_norm": 0.04246789962053299, + "learning_rate": 8.920014354774018e-06, + "loss": 0.0537, + "step": 39420 + }, + { + "epoch": 2.47101585511061, + "grad_norm": 0.12296553701162338, + "learning_rate": 8.909459373878535e-06, + "loss": 0.0725, + "step": 39430 + }, + { + "epoch": 2.4716425393244346, + "grad_norm": 0.008064544759690762, + "learning_rate": 8.898904392983048e-06, + "loss": 0.04, + "step": 39440 + }, + { + "epoch": 2.472269223538259, + "grad_norm": 0.005667585413902998, + "learning_rate": 8.888349412087565e-06, + "loss": 0.0019, + "step": 39450 + }, + { + "epoch": 2.4728959077520836, + "grad_norm": 0.0050772554241120815, + "learning_rate": 8.87779443119208e-06, + "loss": 0.0014, + "step": 39460 + }, + { + "epoch": 2.4735225919659083, + "grad_norm": 0.010119447484612465, + "learning_rate": 8.867239450296595e-06, + "loss": 0.1343, + "step": 39470 + }, + { + "epoch": 2.474149276179733, + "grad_norm": 0.0813032016158104, + "learning_rate": 8.856684469401111e-06, + "loss": 0.0013, + "step": 39480 + }, + { + "epoch": 2.4747759603935577, + "grad_norm": 2.469439744949341, + "learning_rate": 8.846129488505626e-06, + "loss": 0.0497, + "step": 39490 + }, + { + "epoch": 2.4754026446073825, + "grad_norm": 0.027179885655641556, + "learning_rate": 8.835574507610141e-06, + "loss": 0.0521, + "step": 39500 + }, + { + "epoch": 2.476029328821207, + "grad_norm": 0.04990369826555252, + "learning_rate": 8.825019526714658e-06, + "loss": 0.1079, + "step": 39510 + }, + { + "epoch": 2.4766560130350315, + "grad_norm": 9.644336700439453, + "learning_rate": 8.814464545819171e-06, + "loss": 0.0785, + "step": 39520 + }, + { + "epoch": 2.477282697248856, + "grad_norm": 0.008019731380045414, + "learning_rate": 8.803909564923688e-06, + "loss": 0.0042, + "step": 39530 + }, + { + "epoch": 2.477909381462681, + "grad_norm": 0.008901724591851234, + "learning_rate": 8.793354584028203e-06, + "loss": 0.1083, + "step": 39540 + }, + { + "epoch": 2.4785360656765056, + "grad_norm": 0.05041956529021263, + "learning_rate": 8.78279960313272e-06, + "loss": 0.1845, + "step": 39550 + }, + { + "epoch": 2.4791627498903304, + "grad_norm": 0.06697653979063034, + "learning_rate": 8.772244622237234e-06, + "loss": 0.03, + "step": 39560 + }, + { + "epoch": 2.479789434104155, + "grad_norm": 0.031245127320289612, + "learning_rate": 8.76168964134175e-06, + "loss": 0.0018, + "step": 39570 + }, + { + "epoch": 2.4804161183179794, + "grad_norm": 0.024495404213666916, + "learning_rate": 8.751134660446266e-06, + "loss": 0.0303, + "step": 39580 + }, + { + "epoch": 2.481042802531804, + "grad_norm": 0.006389065179973841, + "learning_rate": 8.740579679550781e-06, + "loss": 0.0641, + "step": 39590 + }, + { + "epoch": 2.481669486745629, + "grad_norm": 0.29042214155197144, + "learning_rate": 8.730024698655296e-06, + "loss": 0.0874, + "step": 39600 + }, + { + "epoch": 2.4822961709594535, + "grad_norm": 0.007173601537942886, + "learning_rate": 8.719469717759811e-06, + "loss": 0.0814, + "step": 39610 + }, + { + "epoch": 2.4829228551732783, + "grad_norm": 40.7451286315918, + "learning_rate": 8.708914736864326e-06, + "loss": 0.0931, + "step": 39620 + }, + { + "epoch": 2.483549539387103, + "grad_norm": 0.007661072537302971, + "learning_rate": 8.698359755968842e-06, + "loss": 0.1018, + "step": 39630 + }, + { + "epoch": 2.4841762236009277, + "grad_norm": 0.0291671734303236, + "learning_rate": 8.687804775073357e-06, + "loss": 0.0021, + "step": 39640 + }, + { + "epoch": 2.484802907814752, + "grad_norm": 0.05833710730075836, + "learning_rate": 8.677249794177874e-06, + "loss": 0.0815, + "step": 39650 + }, + { + "epoch": 2.4854295920285767, + "grad_norm": 0.09396538883447647, + "learning_rate": 8.666694813282389e-06, + "loss": 0.1411, + "step": 39660 + }, + { + "epoch": 2.4860562762424014, + "grad_norm": 9.64147663116455, + "learning_rate": 8.656139832386904e-06, + "loss": 0.1394, + "step": 39670 + }, + { + "epoch": 2.486682960456226, + "grad_norm": 0.21708106994628906, + "learning_rate": 8.645584851491419e-06, + "loss": 0.0018, + "step": 39680 + }, + { + "epoch": 2.487309644670051, + "grad_norm": 6.932487964630127, + "learning_rate": 8.635029870595934e-06, + "loss": 0.2451, + "step": 39690 + }, + { + "epoch": 2.487936328883875, + "grad_norm": 0.5158403515815735, + "learning_rate": 8.62447488970045e-06, + "loss": 0.0026, + "step": 39700 + }, + { + "epoch": 2.4885630130977, + "grad_norm": 0.14011597633361816, + "learning_rate": 8.613919908804966e-06, + "loss": 0.0171, + "step": 39710 + }, + { + "epoch": 2.4891896973115246, + "grad_norm": 0.01039296854287386, + "learning_rate": 8.60336492790948e-06, + "loss": 0.0317, + "step": 39720 + }, + { + "epoch": 2.4898163815253493, + "grad_norm": 0.17363180220127106, + "learning_rate": 8.592809947013997e-06, + "loss": 0.0335, + "step": 39730 + }, + { + "epoch": 2.490443065739174, + "grad_norm": 0.46715372800827026, + "learning_rate": 8.582254966118512e-06, + "loss": 0.0054, + "step": 39740 + }, + { + "epoch": 2.491069749952999, + "grad_norm": 0.18222229182720184, + "learning_rate": 8.571699985223027e-06, + "loss": 0.1999, + "step": 39750 + }, + { + "epoch": 2.4916964341668235, + "grad_norm": 0.03781836852431297, + "learning_rate": 8.561145004327542e-06, + "loss": 0.0014, + "step": 39760 + }, + { + "epoch": 2.4923231183806482, + "grad_norm": 4.425987243652344, + "learning_rate": 8.550590023432057e-06, + "loss": 0.0785, + "step": 39770 + }, + { + "epoch": 2.4929498025944725, + "grad_norm": 0.0323716439306736, + "learning_rate": 8.540035042536574e-06, + "loss": 0.1397, + "step": 39780 + }, + { + "epoch": 2.4935764868082972, + "grad_norm": 0.3504541516304016, + "learning_rate": 8.529480061641089e-06, + "loss": 0.1509, + "step": 39790 + }, + { + "epoch": 2.494203171022122, + "grad_norm": 0.018971078097820282, + "learning_rate": 8.518925080745605e-06, + "loss": 0.0023, + "step": 39800 + }, + { + "epoch": 2.4948298552359467, + "grad_norm": 0.1483769714832306, + "learning_rate": 8.50837009985012e-06, + "loss": 0.0789, + "step": 39810 + }, + { + "epoch": 2.4954565394497714, + "grad_norm": 0.013152056373655796, + "learning_rate": 8.497815118954635e-06, + "loss": 0.0024, + "step": 39820 + }, + { + "epoch": 2.4960832236635957, + "grad_norm": 7.379671096801758, + "learning_rate": 8.48726013805915e-06, + "loss": 0.0829, + "step": 39830 + }, + { + "epoch": 2.4967099078774204, + "grad_norm": 0.007252856157720089, + "learning_rate": 8.476705157163665e-06, + "loss": 0.0338, + "step": 39840 + }, + { + "epoch": 2.497336592091245, + "grad_norm": 0.0384717620909214, + "learning_rate": 8.466150176268182e-06, + "loss": 0.0011, + "step": 39850 + }, + { + "epoch": 2.49796327630507, + "grad_norm": 0.05042622238397598, + "learning_rate": 8.455595195372697e-06, + "loss": 0.1155, + "step": 39860 + }, + { + "epoch": 2.4985899605188946, + "grad_norm": 0.009568829089403152, + "learning_rate": 8.445040214477212e-06, + "loss": 0.0009, + "step": 39870 + }, + { + "epoch": 2.4992166447327193, + "grad_norm": 0.05151621252298355, + "learning_rate": 8.434485233581728e-06, + "loss": 0.0021, + "step": 39880 + }, + { + "epoch": 2.499843328946544, + "grad_norm": 2.42625093460083, + "learning_rate": 8.423930252686243e-06, + "loss": 0.0025, + "step": 39890 + }, + { + "epoch": 2.5004700131603688, + "grad_norm": 0.03024793043732643, + "learning_rate": 8.41337527179076e-06, + "loss": 0.0068, + "step": 39900 + }, + { + "epoch": 2.501096697374193, + "grad_norm": 0.10120806097984314, + "learning_rate": 8.402820290895273e-06, + "loss": 0.1672, + "step": 39910 + }, + { + "epoch": 2.5017233815880178, + "grad_norm": 0.055157218128442764, + "learning_rate": 8.392265309999788e-06, + "loss": 0.0486, + "step": 39920 + }, + { + "epoch": 2.5023500658018425, + "grad_norm": 0.015098130330443382, + "learning_rate": 8.381710329104305e-06, + "loss": 0.1388, + "step": 39930 + }, + { + "epoch": 2.502976750015667, + "grad_norm": 0.17691518366336823, + "learning_rate": 8.37115534820882e-06, + "loss": 0.0009, + "step": 39940 + }, + { + "epoch": 2.5036034342294915, + "grad_norm": 0.35355162620544434, + "learning_rate": 8.360600367313336e-06, + "loss": 0.0522, + "step": 39950 + }, + { + "epoch": 2.504230118443316, + "grad_norm": 0.04962703213095665, + "learning_rate": 8.350045386417851e-06, + "loss": 0.0506, + "step": 39960 + }, + { + "epoch": 2.504856802657141, + "grad_norm": 0.08713936060667038, + "learning_rate": 8.339490405522366e-06, + "loss": 0.085, + "step": 39970 + }, + { + "epoch": 2.5054834868709657, + "grad_norm": 8.202126502990723, + "learning_rate": 8.328935424626883e-06, + "loss": 0.2781, + "step": 39980 + }, + { + "epoch": 2.5061101710847904, + "grad_norm": 15.094564437866211, + "learning_rate": 8.318380443731396e-06, + "loss": 0.2105, + "step": 39990 + }, + { + "epoch": 2.506736855298615, + "grad_norm": 0.005411601159721613, + "learning_rate": 8.307825462835913e-06, + "loss": 0.0096, + "step": 40000 + }, + { + "epoch": 2.50736353951244, + "grad_norm": 0.4384141266345978, + "learning_rate": 8.297270481940428e-06, + "loss": 0.039, + "step": 40010 + }, + { + "epoch": 2.5079902237262646, + "grad_norm": 10.535191535949707, + "learning_rate": 8.286715501044943e-06, + "loss": 0.0983, + "step": 40020 + }, + { + "epoch": 2.508616907940089, + "grad_norm": 17.734485626220703, + "learning_rate": 8.27616052014946e-06, + "loss": 0.0868, + "step": 40030 + }, + { + "epoch": 2.5092435921539136, + "grad_norm": 0.14247313141822815, + "learning_rate": 8.265605539253974e-06, + "loss": 0.0562, + "step": 40040 + }, + { + "epoch": 2.5098702763677383, + "grad_norm": 10.993691444396973, + "learning_rate": 8.255050558358491e-06, + "loss": 0.1285, + "step": 40050 + }, + { + "epoch": 2.510496960581563, + "grad_norm": 116.90253448486328, + "learning_rate": 8.244495577463004e-06, + "loss": 0.0778, + "step": 40060 + }, + { + "epoch": 2.5111236447953877, + "grad_norm": 0.01970071718096733, + "learning_rate": 8.233940596567521e-06, + "loss": 0.0018, + "step": 40070 + }, + { + "epoch": 2.511750329009212, + "grad_norm": 0.15241803228855133, + "learning_rate": 8.223385615672036e-06, + "loss": 0.0509, + "step": 40080 + }, + { + "epoch": 2.5123770132230367, + "grad_norm": 0.17657651007175446, + "learning_rate": 8.21283063477655e-06, + "loss": 0.1325, + "step": 40090 + }, + { + "epoch": 2.5130036974368615, + "grad_norm": 0.15709511935710907, + "learning_rate": 8.202275653881067e-06, + "loss": 0.0016, + "step": 40100 + }, + { + "epoch": 2.513630381650686, + "grad_norm": 0.06546179950237274, + "learning_rate": 8.191720672985582e-06, + "loss": 0.0021, + "step": 40110 + }, + { + "epoch": 2.514257065864511, + "grad_norm": 0.0067861187271773815, + "learning_rate": 8.181165692090097e-06, + "loss": 0.0012, + "step": 40120 + }, + { + "epoch": 2.5148837500783356, + "grad_norm": 0.006304751615971327, + "learning_rate": 8.170610711194614e-06, + "loss": 0.2247, + "step": 40130 + }, + { + "epoch": 2.5155104342921604, + "grad_norm": 0.013594291172921658, + "learning_rate": 8.160055730299127e-06, + "loss": 0.0031, + "step": 40140 + }, + { + "epoch": 2.516137118505985, + "grad_norm": 0.063226617872715, + "learning_rate": 8.149500749403644e-06, + "loss": 0.0957, + "step": 40150 + }, + { + "epoch": 2.5167638027198094, + "grad_norm": 5.06606912612915, + "learning_rate": 8.138945768508159e-06, + "loss": 0.2566, + "step": 40160 + }, + { + "epoch": 2.517390486933634, + "grad_norm": 0.040200620889663696, + "learning_rate": 8.128390787612674e-06, + "loss": 0.1106, + "step": 40170 + }, + { + "epoch": 2.518017171147459, + "grad_norm": 0.04336791858077049, + "learning_rate": 8.11783580671719e-06, + "loss": 0.0042, + "step": 40180 + }, + { + "epoch": 2.5186438553612835, + "grad_norm": 0.13072709739208221, + "learning_rate": 8.107280825821705e-06, + "loss": 0.0505, + "step": 40190 + }, + { + "epoch": 2.519270539575108, + "grad_norm": 0.13012777268886566, + "learning_rate": 8.096725844926222e-06, + "loss": 0.0073, + "step": 40200 + }, + { + "epoch": 2.5198972237889325, + "grad_norm": 0.03830237314105034, + "learning_rate": 8.086170864030737e-06, + "loss": 0.0673, + "step": 40210 + }, + { + "epoch": 2.5205239080027573, + "grad_norm": 0.032999083399772644, + "learning_rate": 8.075615883135252e-06, + "loss": 0.1633, + "step": 40220 + }, + { + "epoch": 2.521150592216582, + "grad_norm": 0.1449841558933258, + "learning_rate": 8.065060902239767e-06, + "loss": 0.0378, + "step": 40230 + }, + { + "epoch": 2.5217772764304067, + "grad_norm": 0.4678923785686493, + "learning_rate": 8.054505921344282e-06, + "loss": 0.0041, + "step": 40240 + }, + { + "epoch": 2.5224039606442314, + "grad_norm": 0.1612612009048462, + "learning_rate": 8.043950940448799e-06, + "loss": 0.0515, + "step": 40250 + }, + { + "epoch": 2.523030644858056, + "grad_norm": 0.06994244456291199, + "learning_rate": 8.033395959553314e-06, + "loss": 0.0494, + "step": 40260 + }, + { + "epoch": 2.523657329071881, + "grad_norm": 0.02159612812101841, + "learning_rate": 8.022840978657829e-06, + "loss": 0.0418, + "step": 40270 + }, + { + "epoch": 2.524284013285705, + "grad_norm": 4.538101673126221, + "learning_rate": 8.012285997762345e-06, + "loss": 0.0343, + "step": 40280 + }, + { + "epoch": 2.52491069749953, + "grad_norm": 0.0578109547495842, + "learning_rate": 8.00173101686686e-06, + "loss": 0.1129, + "step": 40290 + }, + { + "epoch": 2.5255373817133546, + "grad_norm": 6.311728477478027, + "learning_rate": 7.991176035971375e-06, + "loss": 0.0461, + "step": 40300 + }, + { + "epoch": 2.5261640659271793, + "grad_norm": 0.0051009259186685085, + "learning_rate": 7.98062105507589e-06, + "loss": 0.0041, + "step": 40310 + }, + { + "epoch": 2.526790750141004, + "grad_norm": 0.017606286332011223, + "learning_rate": 7.970066074180407e-06, + "loss": 0.0028, + "step": 40320 + }, + { + "epoch": 2.5274174343548284, + "grad_norm": 11.35566520690918, + "learning_rate": 7.959511093284922e-06, + "loss": 0.2051, + "step": 40330 + }, + { + "epoch": 2.528044118568653, + "grad_norm": 0.010604508221149445, + "learning_rate": 7.948956112389437e-06, + "loss": 0.0021, + "step": 40340 + }, + { + "epoch": 2.528670802782478, + "grad_norm": 0.005866426043212414, + "learning_rate": 7.938401131493953e-06, + "loss": 0.1692, + "step": 40350 + }, + { + "epoch": 2.5292974869963025, + "grad_norm": 5.990290641784668, + "learning_rate": 7.927846150598468e-06, + "loss": 0.0793, + "step": 40360 + }, + { + "epoch": 2.5299241712101272, + "grad_norm": 0.017411302775144577, + "learning_rate": 7.917291169702983e-06, + "loss": 0.0348, + "step": 40370 + }, + { + "epoch": 2.530550855423952, + "grad_norm": 4.7487969398498535, + "learning_rate": 7.906736188807498e-06, + "loss": 0.0432, + "step": 40380 + }, + { + "epoch": 2.5311775396377767, + "grad_norm": 0.01979987509548664, + "learning_rate": 7.896181207912013e-06, + "loss": 0.1079, + "step": 40390 + }, + { + "epoch": 2.5318042238516014, + "grad_norm": 17.053985595703125, + "learning_rate": 7.88562622701653e-06, + "loss": 0.074, + "step": 40400 + }, + { + "epoch": 2.5324309080654257, + "grad_norm": 0.03223740682005882, + "learning_rate": 7.875071246121045e-06, + "loss": 0.031, + "step": 40410 + }, + { + "epoch": 2.5330575922792504, + "grad_norm": 0.05917942896485329, + "learning_rate": 7.864516265225561e-06, + "loss": 0.1192, + "step": 40420 + }, + { + "epoch": 2.533684276493075, + "grad_norm": 0.008573943749070168, + "learning_rate": 7.853961284330076e-06, + "loss": 0.0051, + "step": 40430 + }, + { + "epoch": 2.5343109607069, + "grad_norm": 0.09303145110607147, + "learning_rate": 7.843406303434591e-06, + "loss": 0.1075, + "step": 40440 + }, + { + "epoch": 2.5349376449207246, + "grad_norm": 0.004650171380490065, + "learning_rate": 7.832851322539106e-06, + "loss": 0.0004, + "step": 40450 + }, + { + "epoch": 2.535564329134549, + "grad_norm": 1.0261204242706299, + "learning_rate": 7.822296341643621e-06, + "loss": 0.0653, + "step": 40460 + }, + { + "epoch": 2.5361910133483736, + "grad_norm": 0.06141495332121849, + "learning_rate": 7.811741360748138e-06, + "loss": 0.1281, + "step": 40470 + }, + { + "epoch": 2.5368176975621983, + "grad_norm": 0.12824884057044983, + "learning_rate": 7.801186379852653e-06, + "loss": 0.0246, + "step": 40480 + }, + { + "epoch": 2.537444381776023, + "grad_norm": 0.0036953743547201157, + "learning_rate": 7.790631398957168e-06, + "loss": 0.1411, + "step": 40490 + }, + { + "epoch": 2.5380710659898478, + "grad_norm": 0.012939559295773506, + "learning_rate": 7.780076418061684e-06, + "loss": 0.1519, + "step": 40500 + }, + { + "epoch": 2.5386977502036725, + "grad_norm": 0.12337151914834976, + "learning_rate": 7.7695214371662e-06, + "loss": 0.0698, + "step": 40510 + }, + { + "epoch": 2.5393244344174972, + "grad_norm": 0.19674107432365417, + "learning_rate": 7.758966456270714e-06, + "loss": 0.1287, + "step": 40520 + }, + { + "epoch": 2.539951118631322, + "grad_norm": 0.06755591183900833, + "learning_rate": 7.74841147537523e-06, + "loss": 0.0695, + "step": 40530 + }, + { + "epoch": 2.5405778028451462, + "grad_norm": 0.1544746607542038, + "learning_rate": 7.737856494479744e-06, + "loss": 0.0466, + "step": 40540 + }, + { + "epoch": 2.541204487058971, + "grad_norm": 0.03073558211326599, + "learning_rate": 7.727301513584261e-06, + "loss": 0.0462, + "step": 40550 + }, + { + "epoch": 2.5418311712727957, + "grad_norm": 10.895833015441895, + "learning_rate": 7.716746532688776e-06, + "loss": 0.069, + "step": 40560 + }, + { + "epoch": 2.5424578554866204, + "grad_norm": 0.006254229694604874, + "learning_rate": 7.706191551793292e-06, + "loss": 0.1509, + "step": 40570 + }, + { + "epoch": 2.5430845397004447, + "grad_norm": 0.10220787674188614, + "learning_rate": 7.695636570897807e-06, + "loss": 0.032, + "step": 40580 + }, + { + "epoch": 2.5437112239142694, + "grad_norm": 0.11436332762241364, + "learning_rate": 7.685081590002322e-06, + "loss": 0.1924, + "step": 40590 + }, + { + "epoch": 2.544337908128094, + "grad_norm": 0.015846652910113335, + "learning_rate": 7.674526609106839e-06, + "loss": 0.0413, + "step": 40600 + }, + { + "epoch": 2.544964592341919, + "grad_norm": 6.154116630554199, + "learning_rate": 7.663971628211352e-06, + "loss": 0.1555, + "step": 40610 + }, + { + "epoch": 2.5455912765557436, + "grad_norm": 0.022826502099633217, + "learning_rate": 7.653416647315869e-06, + "loss": 0.0445, + "step": 40620 + }, + { + "epoch": 2.5462179607695683, + "grad_norm": 1.5049920082092285, + "learning_rate": 7.642861666420384e-06, + "loss": 0.0827, + "step": 40630 + }, + { + "epoch": 2.546844644983393, + "grad_norm": 0.02650361694395542, + "learning_rate": 7.632306685524899e-06, + "loss": 0.0738, + "step": 40640 + }, + { + "epoch": 2.5474713291972177, + "grad_norm": 0.0813385620713234, + "learning_rate": 7.6217517046294155e-06, + "loss": 0.0428, + "step": 40650 + }, + { + "epoch": 2.548098013411042, + "grad_norm": 0.04428323358297348, + "learning_rate": 7.6111967237339305e-06, + "loss": 0.1311, + "step": 40660 + }, + { + "epoch": 2.5487246976248668, + "grad_norm": 0.15579140186309814, + "learning_rate": 7.600641742838446e-06, + "loss": 0.1653, + "step": 40670 + }, + { + "epoch": 2.5493513818386915, + "grad_norm": 11.184800148010254, + "learning_rate": 7.590086761942962e-06, + "loss": 0.061, + "step": 40680 + }, + { + "epoch": 2.549978066052516, + "grad_norm": 0.03695826232433319, + "learning_rate": 7.579531781047476e-06, + "loss": 0.0053, + "step": 40690 + }, + { + "epoch": 2.550604750266341, + "grad_norm": 0.011084161698818207, + "learning_rate": 7.568976800151992e-06, + "loss": 0.0534, + "step": 40700 + }, + { + "epoch": 2.551231434480165, + "grad_norm": 0.008540414273738861, + "learning_rate": 7.558421819256507e-06, + "loss": 0.034, + "step": 40710 + }, + { + "epoch": 2.55185811869399, + "grad_norm": 0.004470044281333685, + "learning_rate": 7.547866838361023e-06, + "loss": 0.0365, + "step": 40720 + }, + { + "epoch": 2.5524848029078147, + "grad_norm": 0.01315213367342949, + "learning_rate": 7.5373118574655386e-06, + "loss": 0.005, + "step": 40730 + }, + { + "epoch": 2.5531114871216394, + "grad_norm": 0.006134567316621542, + "learning_rate": 7.526756876570054e-06, + "loss": 0.0048, + "step": 40740 + }, + { + "epoch": 2.553738171335464, + "grad_norm": 0.006208804901689291, + "learning_rate": 7.516201895674569e-06, + "loss": 0.0122, + "step": 40750 + }, + { + "epoch": 2.554364855549289, + "grad_norm": 0.008568532764911652, + "learning_rate": 7.505646914779084e-06, + "loss": 0.0436, + "step": 40760 + }, + { + "epoch": 2.5549915397631136, + "grad_norm": 2.7557194232940674, + "learning_rate": 7.495091933883599e-06, + "loss": 0.0044, + "step": 40770 + }, + { + "epoch": 2.5556182239769383, + "grad_norm": 0.019581960514187813, + "learning_rate": 7.484536952988115e-06, + "loss": 0.0024, + "step": 40780 + }, + { + "epoch": 2.5562449081907626, + "grad_norm": 8.799368858337402, + "learning_rate": 7.473981972092631e-06, + "loss": 0.1515, + "step": 40790 + }, + { + "epoch": 2.5568715924045873, + "grad_norm": 0.006035898812115192, + "learning_rate": 7.463426991197147e-06, + "loss": 0.003, + "step": 40800 + }, + { + "epoch": 2.557498276618412, + "grad_norm": 0.008945782668888569, + "learning_rate": 7.452872010301662e-06, + "loss": 0.1117, + "step": 40810 + }, + { + "epoch": 2.5581249608322367, + "grad_norm": 51.71331787109375, + "learning_rate": 7.442317029406177e-06, + "loss": 0.0434, + "step": 40820 + }, + { + "epoch": 2.5587516450460615, + "grad_norm": 0.0057718148455023766, + "learning_rate": 7.431762048510693e-06, + "loss": 0.0409, + "step": 40830 + }, + { + "epoch": 2.5593783292598857, + "grad_norm": 0.006954421754926443, + "learning_rate": 7.421207067615207e-06, + "loss": 0.0488, + "step": 40840 + }, + { + "epoch": 2.5600050134737105, + "grad_norm": 0.009053668938577175, + "learning_rate": 7.410652086719723e-06, + "loss": 0.0025, + "step": 40850 + }, + { + "epoch": 2.560631697687535, + "grad_norm": 0.00380707997828722, + "learning_rate": 7.400097105824239e-06, + "loss": 0.1444, + "step": 40860 + }, + { + "epoch": 2.56125838190136, + "grad_norm": 0.022652175277471542, + "learning_rate": 7.389542124928754e-06, + "loss": 0.0867, + "step": 40870 + }, + { + "epoch": 2.5618850661151846, + "grad_norm": 0.3384392559528351, + "learning_rate": 7.37898714403327e-06, + "loss": 0.0178, + "step": 40880 + }, + { + "epoch": 2.5625117503290094, + "grad_norm": 0.004380661062896252, + "learning_rate": 7.3684321631377855e-06, + "loss": 0.0013, + "step": 40890 + }, + { + "epoch": 2.563138434542834, + "grad_norm": 0.0032064158003777266, + "learning_rate": 7.357877182242301e-06, + "loss": 0.002, + "step": 40900 + }, + { + "epoch": 2.5637651187566584, + "grad_norm": 0.055104441940784454, + "learning_rate": 7.347322201346816e-06, + "loss": 0.0609, + "step": 40910 + }, + { + "epoch": 2.564391802970483, + "grad_norm": 0.003831064561381936, + "learning_rate": 7.33676722045133e-06, + "loss": 0.1124, + "step": 40920 + }, + { + "epoch": 2.565018487184308, + "grad_norm": 0.0033136503770947456, + "learning_rate": 7.326212239555846e-06, + "loss": 0.0425, + "step": 40930 + }, + { + "epoch": 2.5656451713981325, + "grad_norm": 0.03835991770029068, + "learning_rate": 7.315657258660362e-06, + "loss": 0.0009, + "step": 40940 + }, + { + "epoch": 2.5662718556119573, + "grad_norm": 0.11937529593706131, + "learning_rate": 7.305102277764878e-06, + "loss": 0.1146, + "step": 40950 + }, + { + "epoch": 2.5668985398257815, + "grad_norm": 0.13023905456066132, + "learning_rate": 7.294547296869393e-06, + "loss": 0.127, + "step": 40960 + }, + { + "epoch": 2.5675252240396063, + "grad_norm": 0.021615734323859215, + "learning_rate": 7.2839923159739086e-06, + "loss": 0.0035, + "step": 40970 + }, + { + "epoch": 2.568151908253431, + "grad_norm": 0.03249719366431236, + "learning_rate": 7.273437335078424e-06, + "loss": 0.0023, + "step": 40980 + }, + { + "epoch": 2.5687785924672557, + "grad_norm": 0.007845253683626652, + "learning_rate": 7.26288235418294e-06, + "loss": 0.0853, + "step": 40990 + }, + { + "epoch": 2.5694052766810804, + "grad_norm": 0.8201372027397156, + "learning_rate": 7.252327373287454e-06, + "loss": 0.0428, + "step": 41000 + }, + { + "epoch": 2.570031960894905, + "grad_norm": 0.026965798810124397, + "learning_rate": 7.24177239239197e-06, + "loss": 0.0226, + "step": 41010 + }, + { + "epoch": 2.57065864510873, + "grad_norm": 0.39754635095596313, + "learning_rate": 7.231217411496485e-06, + "loss": 0.0014, + "step": 41020 + }, + { + "epoch": 2.5712853293225546, + "grad_norm": 0.15909859538078308, + "learning_rate": 7.220662430601001e-06, + "loss": 0.1904, + "step": 41030 + }, + { + "epoch": 2.571912013536379, + "grad_norm": 0.04498082771897316, + "learning_rate": 7.210107449705517e-06, + "loss": 0.0012, + "step": 41040 + }, + { + "epoch": 2.5725386977502036, + "grad_norm": 0.034305937588214874, + "learning_rate": 7.1995524688100324e-06, + "loss": 0.0518, + "step": 41050 + }, + { + "epoch": 2.5731653819640283, + "grad_norm": 17.956905364990234, + "learning_rate": 7.188997487914547e-06, + "loss": 0.0613, + "step": 41060 + }, + { + "epoch": 2.573792066177853, + "grad_norm": 0.006127170752733946, + "learning_rate": 7.178442507019062e-06, + "loss": 0.0619, + "step": 41070 + }, + { + "epoch": 2.574418750391678, + "grad_norm": 0.14973905682563782, + "learning_rate": 7.167887526123577e-06, + "loss": 0.1973, + "step": 41080 + }, + { + "epoch": 2.575045434605502, + "grad_norm": 5.818470001220703, + "learning_rate": 7.157332545228093e-06, + "loss": 0.0659, + "step": 41090 + }, + { + "epoch": 2.575672118819327, + "grad_norm": 6.122541427612305, + "learning_rate": 7.146777564332609e-06, + "loss": 0.0494, + "step": 41100 + }, + { + "epoch": 2.5762988030331515, + "grad_norm": 0.022952023893594742, + "learning_rate": 7.136222583437125e-06, + "loss": 0.0709, + "step": 41110 + }, + { + "epoch": 2.5769254872469762, + "grad_norm": 0.03813765570521355, + "learning_rate": 7.12566760254164e-06, + "loss": 0.1177, + "step": 41120 + }, + { + "epoch": 2.577552171460801, + "grad_norm": 0.07932894676923752, + "learning_rate": 7.1151126216461555e-06, + "loss": 0.0325, + "step": 41130 + }, + { + "epoch": 2.5781788556746257, + "grad_norm": 2.9672300815582275, + "learning_rate": 7.104557640750671e-06, + "loss": 0.0839, + "step": 41140 + }, + { + "epoch": 2.5788055398884504, + "grad_norm": 0.006282791495323181, + "learning_rate": 7.094002659855185e-06, + "loss": 0.0005, + "step": 41150 + }, + { + "epoch": 2.579432224102275, + "grad_norm": 0.04274289309978485, + "learning_rate": 7.083447678959701e-06, + "loss": 0.0583, + "step": 41160 + }, + { + "epoch": 2.5800589083160994, + "grad_norm": 0.06123344227671623, + "learning_rate": 7.072892698064216e-06, + "loss": 0.004, + "step": 41170 + }, + { + "epoch": 2.580685592529924, + "grad_norm": 2.8731517791748047, + "learning_rate": 7.062337717168732e-06, + "loss": 0.0143, + "step": 41180 + }, + { + "epoch": 2.581312276743749, + "grad_norm": 0.00873857643455267, + "learning_rate": 7.051782736273248e-06, + "loss": 0.1282, + "step": 41190 + }, + { + "epoch": 2.5819389609575736, + "grad_norm": 0.24595175683498383, + "learning_rate": 7.041227755377764e-06, + "loss": 0.0358, + "step": 41200 + }, + { + "epoch": 2.582565645171398, + "grad_norm": 0.508924126625061, + "learning_rate": 7.0306727744822785e-06, + "loss": 0.0816, + "step": 41210 + }, + { + "epoch": 2.5831923293852226, + "grad_norm": 0.05899544432759285, + "learning_rate": 7.020117793586794e-06, + "loss": 0.1588, + "step": 41220 + }, + { + "epoch": 2.5838190135990473, + "grad_norm": 9.842616081237793, + "learning_rate": 7.0095628126913085e-06, + "loss": 0.0661, + "step": 41230 + }, + { + "epoch": 2.584445697812872, + "grad_norm": 0.0917615070939064, + "learning_rate": 6.999007831795824e-06, + "loss": 0.0988, + "step": 41240 + }, + { + "epoch": 2.5850723820266968, + "grad_norm": 21.74089813232422, + "learning_rate": 6.98845285090034e-06, + "loss": 0.1004, + "step": 41250 + }, + { + "epoch": 2.5856990662405215, + "grad_norm": 0.03793327137827873, + "learning_rate": 6.977897870004856e-06, + "loss": 0.0648, + "step": 41260 + }, + { + "epoch": 2.586325750454346, + "grad_norm": 8.540681838989258, + "learning_rate": 6.967342889109371e-06, + "loss": 0.1421, + "step": 41270 + }, + { + "epoch": 2.586952434668171, + "grad_norm": 0.07502247393131256, + "learning_rate": 6.956787908213887e-06, + "loss": 0.0398, + "step": 41280 + }, + { + "epoch": 2.587579118881995, + "grad_norm": 0.7496595978736877, + "learning_rate": 6.9462329273184024e-06, + "loss": 0.0032, + "step": 41290 + }, + { + "epoch": 2.58820580309582, + "grad_norm": 0.8887155652046204, + "learning_rate": 6.935677946422918e-06, + "loss": 0.1033, + "step": 41300 + }, + { + "epoch": 2.5888324873096447, + "grad_norm": 0.09058801829814911, + "learning_rate": 6.925122965527432e-06, + "loss": 0.002, + "step": 41310 + }, + { + "epoch": 2.5894591715234694, + "grad_norm": 0.014632687903940678, + "learning_rate": 6.914567984631948e-06, + "loss": 0.0641, + "step": 41320 + }, + { + "epoch": 2.590085855737294, + "grad_norm": 0.035875823348760605, + "learning_rate": 6.904013003736463e-06, + "loss": 0.1075, + "step": 41330 + }, + { + "epoch": 2.5907125399511184, + "grad_norm": 0.645311713218689, + "learning_rate": 6.893458022840979e-06, + "loss": 0.0041, + "step": 41340 + }, + { + "epoch": 2.591339224164943, + "grad_norm": 6.724026203155518, + "learning_rate": 6.882903041945495e-06, + "loss": 0.1227, + "step": 41350 + }, + { + "epoch": 2.591965908378768, + "grad_norm": 0.04724414646625519, + "learning_rate": 6.8723480610500105e-06, + "loss": 0.2922, + "step": 41360 + }, + { + "epoch": 2.5925925925925926, + "grad_norm": 0.024513479322195053, + "learning_rate": 6.8617930801545255e-06, + "loss": 0.0545, + "step": 41370 + }, + { + "epoch": 2.5932192768064173, + "grad_norm": 0.8417285680770874, + "learning_rate": 6.85123809925904e-06, + "loss": 0.0518, + "step": 41380 + }, + { + "epoch": 2.593845961020242, + "grad_norm": 0.06562932580709457, + "learning_rate": 6.840683118363555e-06, + "loss": 0.0475, + "step": 41390 + }, + { + "epoch": 2.5944726452340667, + "grad_norm": 0.04768669232726097, + "learning_rate": 6.830128137468071e-06, + "loss": 0.1743, + "step": 41400 + }, + { + "epoch": 2.5950993294478915, + "grad_norm": 0.025528786703944206, + "learning_rate": 6.819573156572587e-06, + "loss": 0.0038, + "step": 41410 + }, + { + "epoch": 2.5957260136617157, + "grad_norm": 0.1445946991443634, + "learning_rate": 6.809018175677102e-06, + "loss": 0.0479, + "step": 41420 + }, + { + "epoch": 2.5963526978755405, + "grad_norm": 5.870827674865723, + "learning_rate": 6.798463194781618e-06, + "loss": 0.1458, + "step": 41430 + }, + { + "epoch": 2.596979382089365, + "grad_norm": 0.14257600903511047, + "learning_rate": 6.7879082138861336e-06, + "loss": 0.0583, + "step": 41440 + }, + { + "epoch": 2.59760606630319, + "grad_norm": 0.03526608645915985, + "learning_rate": 6.777353232990649e-06, + "loss": 0.0316, + "step": 41450 + }, + { + "epoch": 2.5982327505170146, + "grad_norm": 0.2761639654636383, + "learning_rate": 6.7667982520951635e-06, + "loss": 0.0428, + "step": 41460 + }, + { + "epoch": 2.598859434730839, + "grad_norm": 0.011171748861670494, + "learning_rate": 6.756243271199679e-06, + "loss": 0.0828, + "step": 41470 + }, + { + "epoch": 2.5994861189446636, + "grad_norm": 0.008683438412845135, + "learning_rate": 6.745688290304194e-06, + "loss": 0.0023, + "step": 41480 + }, + { + "epoch": 2.6001128031584884, + "grad_norm": 5.977514266967773, + "learning_rate": 6.73513330940871e-06, + "loss": 0.0436, + "step": 41490 + }, + { + "epoch": 2.600739487372313, + "grad_norm": 0.011247127316892147, + "learning_rate": 6.724578328513226e-06, + "loss": 0.0024, + "step": 41500 + }, + { + "epoch": 2.601366171586138, + "grad_norm": 0.168271005153656, + "learning_rate": 6.714023347617742e-06, + "loss": 0.0032, + "step": 41510 + }, + { + "epoch": 2.6019928557999625, + "grad_norm": 0.006584922783076763, + "learning_rate": 6.703468366722257e-06, + "loss": 0.0281, + "step": 41520 + }, + { + "epoch": 2.6026195400137873, + "grad_norm": 0.006742222234606743, + "learning_rate": 6.692913385826772e-06, + "loss": 0.0585, + "step": 41530 + }, + { + "epoch": 2.603246224227612, + "grad_norm": 9.171943664550781, + "learning_rate": 6.6823584049312865e-06, + "loss": 0.2837, + "step": 41540 + }, + { + "epoch": 2.6038729084414363, + "grad_norm": 0.11651241779327393, + "learning_rate": 6.671803424035802e-06, + "loss": 0.043, + "step": 41550 + }, + { + "epoch": 2.604499592655261, + "grad_norm": 0.008494174107909203, + "learning_rate": 6.661248443140318e-06, + "loss": 0.0693, + "step": 41560 + }, + { + "epoch": 2.6051262768690857, + "grad_norm": 0.08039513230323792, + "learning_rate": 6.650693462244834e-06, + "loss": 0.0345, + "step": 41570 + }, + { + "epoch": 2.6057529610829104, + "grad_norm": 0.01966715045273304, + "learning_rate": 6.640138481349349e-06, + "loss": 0.0016, + "step": 41580 + }, + { + "epoch": 2.6063796452967347, + "grad_norm": 0.10775923728942871, + "learning_rate": 6.629583500453865e-06, + "loss": 0.1824, + "step": 41590 + }, + { + "epoch": 2.6070063295105594, + "grad_norm": 0.11887072771787643, + "learning_rate": 6.6190285195583805e-06, + "loss": 0.0817, + "step": 41600 + }, + { + "epoch": 2.607633013724384, + "grad_norm": 0.031048081815242767, + "learning_rate": 6.608473538662896e-06, + "loss": 0.0473, + "step": 41610 + }, + { + "epoch": 2.608259697938209, + "grad_norm": 12.19013500213623, + "learning_rate": 6.59791855776741e-06, + "loss": 0.1918, + "step": 41620 + }, + { + "epoch": 2.6088863821520336, + "grad_norm": 0.7925893068313599, + "learning_rate": 6.587363576871925e-06, + "loss": 0.0302, + "step": 41630 + }, + { + "epoch": 2.6095130663658583, + "grad_norm": 0.013532374054193497, + "learning_rate": 6.576808595976441e-06, + "loss": 0.1553, + "step": 41640 + }, + { + "epoch": 2.610139750579683, + "grad_norm": 1.1819570064544678, + "learning_rate": 6.566253615080957e-06, + "loss": 0.031, + "step": 41650 + }, + { + "epoch": 2.610766434793508, + "grad_norm": 0.08105959743261337, + "learning_rate": 6.555698634185473e-06, + "loss": 0.0032, + "step": 41660 + }, + { + "epoch": 2.611393119007332, + "grad_norm": 0.47787657380104065, + "learning_rate": 6.545143653289989e-06, + "loss": 0.0621, + "step": 41670 + }, + { + "epoch": 2.612019803221157, + "grad_norm": 5.609531879425049, + "learning_rate": 6.5345886723945035e-06, + "loss": 0.0825, + "step": 41680 + }, + { + "epoch": 2.6126464874349815, + "grad_norm": 0.004527808167040348, + "learning_rate": 6.524033691499019e-06, + "loss": 0.1428, + "step": 41690 + }, + { + "epoch": 2.6132731716488062, + "grad_norm": 0.0466812439262867, + "learning_rate": 6.5134787106035335e-06, + "loss": 0.0012, + "step": 41700 + }, + { + "epoch": 2.613899855862631, + "grad_norm": 1.6810473203659058, + "learning_rate": 6.502923729708049e-06, + "loss": 0.0182, + "step": 41710 + }, + { + "epoch": 2.6145265400764552, + "grad_norm": 2.1184799671173096, + "learning_rate": 6.492368748812565e-06, + "loss": 0.0049, + "step": 41720 + }, + { + "epoch": 2.61515322429028, + "grad_norm": 0.01584853045642376, + "learning_rate": 6.48181376791708e-06, + "loss": 0.1132, + "step": 41730 + }, + { + "epoch": 2.6157799085041047, + "grad_norm": 0.7091934084892273, + "learning_rate": 6.471258787021596e-06, + "loss": 0.1384, + "step": 41740 + }, + { + "epoch": 2.6164065927179294, + "grad_norm": 0.013542457483708858, + "learning_rate": 6.460703806126112e-06, + "loss": 0.0336, + "step": 41750 + }, + { + "epoch": 2.617033276931754, + "grad_norm": 0.010229435749351978, + "learning_rate": 6.4501488252306274e-06, + "loss": 0.0331, + "step": 41760 + }, + { + "epoch": 2.617659961145579, + "grad_norm": 0.06294869631528854, + "learning_rate": 6.4395938443351416e-06, + "loss": 0.1182, + "step": 41770 + }, + { + "epoch": 2.6182866453594036, + "grad_norm": 0.12787538766860962, + "learning_rate": 6.429038863439657e-06, + "loss": 0.0817, + "step": 41780 + }, + { + "epoch": 2.6189133295732283, + "grad_norm": 0.7674179673194885, + "learning_rate": 6.418483882544172e-06, + "loss": 0.0175, + "step": 41790 + }, + { + "epoch": 2.6195400137870526, + "grad_norm": 13.270587921142578, + "learning_rate": 6.407928901648688e-06, + "loss": 0.1413, + "step": 41800 + }, + { + "epoch": 2.6201666980008773, + "grad_norm": 0.11268117278814316, + "learning_rate": 6.397373920753204e-06, + "loss": 0.0409, + "step": 41810 + }, + { + "epoch": 2.620793382214702, + "grad_norm": 0.040429167449474335, + "learning_rate": 6.38681893985772e-06, + "loss": 0.1457, + "step": 41820 + }, + { + "epoch": 2.6214200664285268, + "grad_norm": 3.8810534477233887, + "learning_rate": 6.376263958962235e-06, + "loss": 0.118, + "step": 41830 + }, + { + "epoch": 2.622046750642351, + "grad_norm": 0.08467940986156464, + "learning_rate": 6.3657089780667505e-06, + "loss": 0.0023, + "step": 41840 + }, + { + "epoch": 2.6226734348561758, + "grad_norm": 0.016723625361919403, + "learning_rate": 6.355153997171265e-06, + "loss": 0.1124, + "step": 41850 + }, + { + "epoch": 2.6233001190700005, + "grad_norm": 0.018312910571694374, + "learning_rate": 6.34459901627578e-06, + "loss": 0.1396, + "step": 41860 + }, + { + "epoch": 2.623926803283825, + "grad_norm": 0.4090491235256195, + "learning_rate": 6.334044035380296e-06, + "loss": 0.1224, + "step": 41870 + }, + { + "epoch": 2.62455348749765, + "grad_norm": 14.096756935119629, + "learning_rate": 6.323489054484812e-06, + "loss": 0.092, + "step": 41880 + }, + { + "epoch": 2.6251801717114747, + "grad_norm": 12.442021369934082, + "learning_rate": 6.312934073589327e-06, + "loss": 0.0976, + "step": 41890 + }, + { + "epoch": 2.6258068559252994, + "grad_norm": 10.28456974029541, + "learning_rate": 6.302379092693843e-06, + "loss": 0.0989, + "step": 41900 + }, + { + "epoch": 2.626433540139124, + "grad_norm": 0.009201687760651112, + "learning_rate": 6.2918241117983586e-06, + "loss": 0.041, + "step": 41910 + }, + { + "epoch": 2.6270602243529484, + "grad_norm": 0.019938096404075623, + "learning_rate": 6.281269130902874e-06, + "loss": 0.0052, + "step": 41920 + }, + { + "epoch": 2.627686908566773, + "grad_norm": 10.491694450378418, + "learning_rate": 6.2707141500073885e-06, + "loss": 0.1354, + "step": 41930 + }, + { + "epoch": 2.628313592780598, + "grad_norm": 0.3292548358440399, + "learning_rate": 6.2601591691119034e-06, + "loss": 0.0025, + "step": 41940 + }, + { + "epoch": 2.6289402769944226, + "grad_norm": 0.320330411195755, + "learning_rate": 6.249604188216419e-06, + "loss": 0.1281, + "step": 41950 + }, + { + "epoch": 2.6295669612082473, + "grad_norm": 0.015448945574462414, + "learning_rate": 6.239049207320935e-06, + "loss": 0.0047, + "step": 41960 + }, + { + "epoch": 2.6301936454220716, + "grad_norm": 0.2195909321308136, + "learning_rate": 6.228494226425451e-06, + "loss": 0.0077, + "step": 41970 + }, + { + "epoch": 2.6308203296358963, + "grad_norm": 0.3505687415599823, + "learning_rate": 6.217939245529966e-06, + "loss": 0.003, + "step": 41980 + }, + { + "epoch": 2.631447013849721, + "grad_norm": 0.20360609889030457, + "learning_rate": 6.207384264634481e-06, + "loss": 0.1084, + "step": 41990 + }, + { + "epoch": 2.6320736980635457, + "grad_norm": 0.02529897727072239, + "learning_rate": 6.196829283738997e-06, + "loss": 0.0012, + "step": 42000 + }, + { + "epoch": 2.6327003822773705, + "grad_norm": 0.02185901068150997, + "learning_rate": 6.186274302843512e-06, + "loss": 0.041, + "step": 42010 + }, + { + "epoch": 2.633327066491195, + "grad_norm": 0.006049241870641708, + "learning_rate": 6.175719321948028e-06, + "loss": 0.0067, + "step": 42020 + }, + { + "epoch": 2.63395375070502, + "grad_norm": 0.12334437668323517, + "learning_rate": 6.165164341052543e-06, + "loss": 0.0008, + "step": 42030 + }, + { + "epoch": 2.6345804349188446, + "grad_norm": 0.004662103019654751, + "learning_rate": 6.154609360157058e-06, + "loss": 0.003, + "step": 42040 + }, + { + "epoch": 2.635207119132669, + "grad_norm": 0.04426957294344902, + "learning_rate": 6.144054379261574e-06, + "loss": 0.1455, + "step": 42050 + }, + { + "epoch": 2.6358338033464936, + "grad_norm": 0.09200718253850937, + "learning_rate": 6.133499398366089e-06, + "loss": 0.0919, + "step": 42060 + }, + { + "epoch": 2.6364604875603184, + "grad_norm": 0.011606945656239986, + "learning_rate": 6.122944417470605e-06, + "loss": 0.2493, + "step": 42070 + }, + { + "epoch": 2.637087171774143, + "grad_norm": 0.00984701793640852, + "learning_rate": 6.1123894365751205e-06, + "loss": 0.0294, + "step": 42080 + }, + { + "epoch": 2.637713855987968, + "grad_norm": 0.2293330878019333, + "learning_rate": 6.1018344556796354e-06, + "loss": 0.0982, + "step": 42090 + }, + { + "epoch": 2.638340540201792, + "grad_norm": 13.595054626464844, + "learning_rate": 6.09127947478415e-06, + "loss": 0.1903, + "step": 42100 + }, + { + "epoch": 2.638967224415617, + "grad_norm": 1.0521003007888794, + "learning_rate": 6.080724493888666e-06, + "loss": 0.088, + "step": 42110 + }, + { + "epoch": 2.6395939086294415, + "grad_norm": 0.09486385434865952, + "learning_rate": 6.070169512993182e-06, + "loss": 0.0367, + "step": 42120 + }, + { + "epoch": 2.6402205928432663, + "grad_norm": 0.05776892229914665, + "learning_rate": 6.059614532097698e-06, + "loss": 0.0036, + "step": 42130 + }, + { + "epoch": 2.640847277057091, + "grad_norm": 0.19651919603347778, + "learning_rate": 6.049059551202212e-06, + "loss": 0.0629, + "step": 42140 + }, + { + "epoch": 2.6414739612709157, + "grad_norm": 0.419067919254303, + "learning_rate": 6.038504570306728e-06, + "loss": 0.0035, + "step": 42150 + }, + { + "epoch": 2.6421006454847404, + "grad_norm": 0.10673335194587708, + "learning_rate": 6.0279495894112435e-06, + "loss": 0.1531, + "step": 42160 + }, + { + "epoch": 2.642727329698565, + "grad_norm": 0.013691593892872334, + "learning_rate": 6.017394608515759e-06, + "loss": 0.061, + "step": 42170 + }, + { + "epoch": 2.6433540139123894, + "grad_norm": 0.022470830008387566, + "learning_rate": 6.006839627620274e-06, + "loss": 0.0417, + "step": 42180 + }, + { + "epoch": 2.643980698126214, + "grad_norm": 0.03461216017603874, + "learning_rate": 5.996284646724789e-06, + "loss": 0.0546, + "step": 42190 + }, + { + "epoch": 2.644607382340039, + "grad_norm": 0.02822648175060749, + "learning_rate": 5.985729665829305e-06, + "loss": 0.0752, + "step": 42200 + }, + { + "epoch": 2.6452340665538636, + "grad_norm": 4.36279821395874, + "learning_rate": 5.975174684933821e-06, + "loss": 0.1409, + "step": 42210 + }, + { + "epoch": 2.645860750767688, + "grad_norm": 0.024720776826143265, + "learning_rate": 5.964619704038336e-06, + "loss": 0.0329, + "step": 42220 + }, + { + "epoch": 2.6464874349815126, + "grad_norm": 0.040048062801361084, + "learning_rate": 5.954064723142852e-06, + "loss": 0.0036, + "step": 42230 + }, + { + "epoch": 2.6471141191953373, + "grad_norm": 0.05158762261271477, + "learning_rate": 5.9435097422473666e-06, + "loss": 0.0581, + "step": 42240 + }, + { + "epoch": 2.647740803409162, + "grad_norm": 0.018962504342198372, + "learning_rate": 5.932954761351882e-06, + "loss": 0.2953, + "step": 42250 + }, + { + "epoch": 2.648367487622987, + "grad_norm": 0.021726837381720543, + "learning_rate": 5.922399780456397e-06, + "loss": 0.0464, + "step": 42260 + }, + { + "epoch": 2.6489941718368115, + "grad_norm": 0.02034616470336914, + "learning_rate": 5.911844799560913e-06, + "loss": 0.054, + "step": 42270 + }, + { + "epoch": 2.6496208560506362, + "grad_norm": 0.04526104778051376, + "learning_rate": 5.901289818665429e-06, + "loss": 0.1052, + "step": 42280 + }, + { + "epoch": 2.650247540264461, + "grad_norm": 0.02887248992919922, + "learning_rate": 5.890734837769944e-06, + "loss": 0.006, + "step": 42290 + }, + { + "epoch": 2.6508742244782852, + "grad_norm": 5.1538472175598145, + "learning_rate": 5.880179856874459e-06, + "loss": 0.1128, + "step": 42300 + }, + { + "epoch": 2.65150090869211, + "grad_norm": 0.046616777777671814, + "learning_rate": 5.869624875978975e-06, + "loss": 0.1021, + "step": 42310 + }, + { + "epoch": 2.6521275929059347, + "grad_norm": 0.05382724851369858, + "learning_rate": 5.8590698950834905e-06, + "loss": 0.0401, + "step": 42320 + }, + { + "epoch": 2.6527542771197594, + "grad_norm": 0.027329618111252785, + "learning_rate": 5.848514914188006e-06, + "loss": 0.0029, + "step": 42330 + }, + { + "epoch": 2.653380961333584, + "grad_norm": 0.05983074754476547, + "learning_rate": 5.837959933292521e-06, + "loss": 0.0678, + "step": 42340 + }, + { + "epoch": 2.6540076455474084, + "grad_norm": 0.03666955605149269, + "learning_rate": 5.827404952397036e-06, + "loss": 0.132, + "step": 42350 + }, + { + "epoch": 2.654634329761233, + "grad_norm": 0.03756636381149292, + "learning_rate": 5.816849971501552e-06, + "loss": 0.0721, + "step": 42360 + }, + { + "epoch": 2.655261013975058, + "grad_norm": 0.02370397001504898, + "learning_rate": 5.806294990606067e-06, + "loss": 0.0888, + "step": 42370 + }, + { + "epoch": 2.6558876981888826, + "grad_norm": 0.0890141949057579, + "learning_rate": 5.795740009710583e-06, + "loss": 0.0935, + "step": 42380 + }, + { + "epoch": 2.6565143824027073, + "grad_norm": 0.01145352516323328, + "learning_rate": 5.7851850288150985e-06, + "loss": 0.0907, + "step": 42390 + }, + { + "epoch": 2.657141066616532, + "grad_norm": 11.319497108459473, + "learning_rate": 5.7746300479196135e-06, + "loss": 0.0856, + "step": 42400 + }, + { + "epoch": 2.6577677508303568, + "grad_norm": 1.578118920326233, + "learning_rate": 5.7640750670241285e-06, + "loss": 0.0597, + "step": 42410 + }, + { + "epoch": 2.6583944350441815, + "grad_norm": 0.2819182276725769, + "learning_rate": 5.753520086128644e-06, + "loss": 0.0032, + "step": 42420 + }, + { + "epoch": 2.6590211192580058, + "grad_norm": 0.009528083726763725, + "learning_rate": 5.74296510523316e-06, + "loss": 0.0017, + "step": 42430 + }, + { + "epoch": 2.6596478034718305, + "grad_norm": 2.445085287094116, + "learning_rate": 5.732410124337675e-06, + "loss": 0.0389, + "step": 42440 + }, + { + "epoch": 2.660274487685655, + "grad_norm": 0.010715600103139877, + "learning_rate": 5.72185514344219e-06, + "loss": 0.0801, + "step": 42450 + }, + { + "epoch": 2.66090117189948, + "grad_norm": 0.009130342863500118, + "learning_rate": 5.711300162546706e-06, + "loss": 0.0024, + "step": 42460 + }, + { + "epoch": 2.6615278561133047, + "grad_norm": 14.391807556152344, + "learning_rate": 5.700745181651222e-06, + "loss": 0.042, + "step": 42470 + }, + { + "epoch": 2.662154540327129, + "grad_norm": 0.7011982202529907, + "learning_rate": 5.690190200755737e-06, + "loss": 0.2109, + "step": 42480 + }, + { + "epoch": 2.6627812245409537, + "grad_norm": 0.0479697659611702, + "learning_rate": 5.679635219860252e-06, + "loss": 0.0387, + "step": 42490 + }, + { + "epoch": 2.6634079087547784, + "grad_norm": 0.02062498964369297, + "learning_rate": 5.669080238964767e-06, + "loss": 0.0018, + "step": 42500 + }, + { + "epoch": 2.664034592968603, + "grad_norm": 0.01683782786130905, + "learning_rate": 5.658525258069283e-06, + "loss": 0.0011, + "step": 42510 + }, + { + "epoch": 2.664661277182428, + "grad_norm": 0.006330487783998251, + "learning_rate": 5.647970277173799e-06, + "loss": 0.05, + "step": 42520 + }, + { + "epoch": 2.6652879613962526, + "grad_norm": 0.010017489083111286, + "learning_rate": 5.637415296278314e-06, + "loss": 0.096, + "step": 42530 + }, + { + "epoch": 2.6659146456100773, + "grad_norm": 0.01938549615442753, + "learning_rate": 5.62686031538283e-06, + "loss": 0.1787, + "step": 42540 + }, + { + "epoch": 2.6665413298239016, + "grad_norm": 0.02217995375394821, + "learning_rate": 5.616305334487345e-06, + "loss": 0.0657, + "step": 42550 + }, + { + "epoch": 2.6671680140377263, + "grad_norm": 0.039555083960294724, + "learning_rate": 5.6057503535918604e-06, + "loss": 0.0936, + "step": 42560 + }, + { + "epoch": 2.667794698251551, + "grad_norm": 28.735490798950195, + "learning_rate": 5.595195372696375e-06, + "loss": 0.1826, + "step": 42570 + }, + { + "epoch": 2.6684213824653757, + "grad_norm": 0.01792915351688862, + "learning_rate": 5.584640391800891e-06, + "loss": 0.1354, + "step": 42580 + }, + { + "epoch": 2.6690480666792005, + "grad_norm": 0.5480226278305054, + "learning_rate": 5.574085410905407e-06, + "loss": 0.1323, + "step": 42590 + }, + { + "epoch": 2.6696747508930248, + "grad_norm": 0.5054864883422852, + "learning_rate": 5.563530430009922e-06, + "loss": 0.0902, + "step": 42600 + }, + { + "epoch": 2.6703014351068495, + "grad_norm": 1.5177534818649292, + "learning_rate": 5.552975449114437e-06, + "loss": 0.0593, + "step": 42610 + }, + { + "epoch": 2.670928119320674, + "grad_norm": 0.1519448608160019, + "learning_rate": 5.542420468218953e-06, + "loss": 0.0512, + "step": 42620 + }, + { + "epoch": 2.671554803534499, + "grad_norm": 0.008680077269673347, + "learning_rate": 5.5318654873234685e-06, + "loss": 0.0316, + "step": 42630 + }, + { + "epoch": 2.6721814877483236, + "grad_norm": 0.04235262796282768, + "learning_rate": 5.521310506427984e-06, + "loss": 0.0016, + "step": 42640 + }, + { + "epoch": 2.6728081719621484, + "grad_norm": 0.014904645271599293, + "learning_rate": 5.5107555255324984e-06, + "loss": 0.0385, + "step": 42650 + }, + { + "epoch": 2.673434856175973, + "grad_norm": 0.011473514139652252, + "learning_rate": 5.500200544637014e-06, + "loss": 0.0452, + "step": 42660 + }, + { + "epoch": 2.674061540389798, + "grad_norm": 20.41715431213379, + "learning_rate": 5.48964556374153e-06, + "loss": 0.0704, + "step": 42670 + }, + { + "epoch": 2.674688224603622, + "grad_norm": 0.011435385793447495, + "learning_rate": 5.479090582846046e-06, + "loss": 0.2295, + "step": 42680 + }, + { + "epoch": 2.675314908817447, + "grad_norm": 0.013783792965114117, + "learning_rate": 5.468535601950561e-06, + "loss": 0.1978, + "step": 42690 + }, + { + "epoch": 2.6759415930312715, + "grad_norm": 0.053753383457660675, + "learning_rate": 5.457980621055076e-06, + "loss": 0.0721, + "step": 42700 + }, + { + "epoch": 2.6765682772450963, + "grad_norm": 0.12864500284194946, + "learning_rate": 5.4474256401595916e-06, + "loss": 0.0034, + "step": 42710 + }, + { + "epoch": 2.677194961458921, + "grad_norm": 15.265963554382324, + "learning_rate": 5.4368706592641065e-06, + "loss": 0.0486, + "step": 42720 + }, + { + "epoch": 2.6778216456727453, + "grad_norm": 0.01681533455848694, + "learning_rate": 5.426315678368622e-06, + "loss": 0.0048, + "step": 42730 + }, + { + "epoch": 2.67844832988657, + "grad_norm": 0.6825605630874634, + "learning_rate": 5.415760697473138e-06, + "loss": 0.1534, + "step": 42740 + }, + { + "epoch": 2.6790750141003947, + "grad_norm": 0.016611248254776, + "learning_rate": 5.405205716577653e-06, + "loss": 0.0404, + "step": 42750 + }, + { + "epoch": 2.6797016983142194, + "grad_norm": 0.017577961087226868, + "learning_rate": 5.394650735682168e-06, + "loss": 0.139, + "step": 42760 + }, + { + "epoch": 2.680328382528044, + "grad_norm": 6.641668796539307, + "learning_rate": 5.384095754786684e-06, + "loss": 0.1234, + "step": 42770 + }, + { + "epoch": 2.680955066741869, + "grad_norm": 0.4626294672489166, + "learning_rate": 5.3735407738912e-06, + "loss": 0.0406, + "step": 42780 + }, + { + "epoch": 2.6815817509556936, + "grad_norm": 10.911255836486816, + "learning_rate": 5.3629857929957155e-06, + "loss": 0.1176, + "step": 42790 + }, + { + "epoch": 2.6822084351695183, + "grad_norm": 13.511198997497559, + "learning_rate": 5.35243081210023e-06, + "loss": 0.0905, + "step": 42800 + }, + { + "epoch": 2.6828351193833426, + "grad_norm": 0.0641942247748375, + "learning_rate": 5.341875831204745e-06, + "loss": 0.0522, + "step": 42810 + }, + { + "epoch": 2.6834618035971673, + "grad_norm": 0.009677493013441563, + "learning_rate": 5.331320850309261e-06, + "loss": 0.0094, + "step": 42820 + }, + { + "epoch": 2.684088487810992, + "grad_norm": 0.06767957657575607, + "learning_rate": 5.320765869413777e-06, + "loss": 0.0327, + "step": 42830 + }, + { + "epoch": 2.684715172024817, + "grad_norm": 0.049471303820610046, + "learning_rate": 5.310210888518292e-06, + "loss": 0.072, + "step": 42840 + }, + { + "epoch": 2.685341856238641, + "grad_norm": 0.06735648959875107, + "learning_rate": 5.299655907622808e-06, + "loss": 0.2579, + "step": 42850 + }, + { + "epoch": 2.685968540452466, + "grad_norm": 0.09247054904699326, + "learning_rate": 5.289100926727323e-06, + "loss": 0.0933, + "step": 42860 + }, + { + "epoch": 2.6865952246662905, + "grad_norm": 0.04064527153968811, + "learning_rate": 5.2785459458318385e-06, + "loss": 0.1325, + "step": 42870 + }, + { + "epoch": 2.6872219088801153, + "grad_norm": 0.009018697775900364, + "learning_rate": 5.2679909649363535e-06, + "loss": 0.0428, + "step": 42880 + }, + { + "epoch": 2.68784859309394, + "grad_norm": 6.484643459320068, + "learning_rate": 5.257435984040869e-06, + "loss": 0.1189, + "step": 42890 + }, + { + "epoch": 2.6884752773077647, + "grad_norm": 0.013771232217550278, + "learning_rate": 5.246881003145385e-06, + "loss": 0.0026, + "step": 42900 + }, + { + "epoch": 2.6891019615215894, + "grad_norm": 0.022826027125120163, + "learning_rate": 5.2363260222499e-06, + "loss": 0.0361, + "step": 42910 + }, + { + "epoch": 2.689728645735414, + "grad_norm": 0.02315569669008255, + "learning_rate": 5.225771041354415e-06, + "loss": 0.0551, + "step": 42920 + }, + { + "epoch": 2.6903553299492384, + "grad_norm": 0.06086333841085434, + "learning_rate": 5.215216060458931e-06, + "loss": 0.0964, + "step": 42930 + }, + { + "epoch": 2.690982014163063, + "grad_norm": 0.3857003152370453, + "learning_rate": 5.204661079563447e-06, + "loss": 0.1914, + "step": 42940 + }, + { + "epoch": 2.691608698376888, + "grad_norm": 9.57112979888916, + "learning_rate": 5.1941060986679616e-06, + "loss": 0.2328, + "step": 42950 + }, + { + "epoch": 2.6922353825907126, + "grad_norm": 4.99583625793457, + "learning_rate": 5.1835511177724765e-06, + "loss": 0.1707, + "step": 42960 + }, + { + "epoch": 2.6928620668045373, + "grad_norm": 0.023815227672457695, + "learning_rate": 5.172996136876992e-06, + "loss": 0.0038, + "step": 42970 + }, + { + "epoch": 2.6934887510183616, + "grad_norm": 0.05912555754184723, + "learning_rate": 5.162441155981508e-06, + "loss": 0.0479, + "step": 42980 + }, + { + "epoch": 2.6941154352321863, + "grad_norm": 0.14293870329856873, + "learning_rate": 5.151886175086024e-06, + "loss": 0.1847, + "step": 42990 + }, + { + "epoch": 2.694742119446011, + "grad_norm": 0.02170496992766857, + "learning_rate": 5.141331194190539e-06, + "loss": 0.0728, + "step": 43000 + }, + { + "epoch": 2.6953688036598358, + "grad_norm": 0.4979763925075531, + "learning_rate": 5.130776213295054e-06, + "loss": 0.1014, + "step": 43010 + }, + { + "epoch": 2.6959954878736605, + "grad_norm": 4.050300121307373, + "learning_rate": 5.12022123239957e-06, + "loss": 0.1959, + "step": 43020 + }, + { + "epoch": 2.6966221720874852, + "grad_norm": 0.0454852357506752, + "learning_rate": 5.109666251504085e-06, + "loss": 0.1131, + "step": 43030 + }, + { + "epoch": 2.69724885630131, + "grad_norm": 0.37142908573150635, + "learning_rate": 5.0991112706086e-06, + "loss": 0.0955, + "step": 43040 + }, + { + "epoch": 2.6978755405151347, + "grad_norm": 0.08890983462333679, + "learning_rate": 5.088556289713116e-06, + "loss": 0.073, + "step": 43050 + }, + { + "epoch": 2.698502224728959, + "grad_norm": 0.04773983359336853, + "learning_rate": 5.078001308817631e-06, + "loss": 0.1204, + "step": 43060 + }, + { + "epoch": 2.6991289089427837, + "grad_norm": 0.06058727577328682, + "learning_rate": 5.067446327922146e-06, + "loss": 0.0382, + "step": 43070 + }, + { + "epoch": 2.6997555931566084, + "grad_norm": 0.09812531620264053, + "learning_rate": 5.056891347026662e-06, + "loss": 0.0313, + "step": 43080 + }, + { + "epoch": 2.700382277370433, + "grad_norm": 0.06097055599093437, + "learning_rate": 5.046336366131178e-06, + "loss": 0.0044, + "step": 43090 + }, + { + "epoch": 2.701008961584258, + "grad_norm": 0.1884794533252716, + "learning_rate": 5.0357813852356935e-06, + "loss": 0.0592, + "step": 43100 + }, + { + "epoch": 2.701635645798082, + "grad_norm": 0.12037229537963867, + "learning_rate": 5.0252264043402085e-06, + "loss": 0.0371, + "step": 43110 + }, + { + "epoch": 2.702262330011907, + "grad_norm": 0.5690087080001831, + "learning_rate": 5.0146714234447234e-06, + "loss": 0.0464, + "step": 43120 + }, + { + "epoch": 2.7028890142257316, + "grad_norm": 0.030820699408650398, + "learning_rate": 5.004116442549239e-06, + "loss": 0.0863, + "step": 43130 + }, + { + "epoch": 2.7035156984395563, + "grad_norm": 0.033750128000974655, + "learning_rate": 4.993561461653755e-06, + "loss": 0.0341, + "step": 43140 + }, + { + "epoch": 2.704142382653381, + "grad_norm": 0.0517287403345108, + "learning_rate": 4.98300648075827e-06, + "loss": 0.1082, + "step": 43150 + }, + { + "epoch": 2.7047690668672058, + "grad_norm": 0.01474336814135313, + "learning_rate": 4.972451499862785e-06, + "loss": 0.0034, + "step": 43160 + }, + { + "epoch": 2.7053957510810305, + "grad_norm": 0.19128337502479553, + "learning_rate": 4.961896518967301e-06, + "loss": 0.0952, + "step": 43170 + }, + { + "epoch": 2.706022435294855, + "grad_norm": 0.1466163694858551, + "learning_rate": 4.951341538071817e-06, + "loss": 0.038, + "step": 43180 + }, + { + "epoch": 2.7066491195086795, + "grad_norm": 7.711380958557129, + "learning_rate": 4.9407865571763315e-06, + "loss": 0.2064, + "step": 43190 + }, + { + "epoch": 2.707275803722504, + "grad_norm": 0.025164706632494926, + "learning_rate": 4.930231576280847e-06, + "loss": 0.0944, + "step": 43200 + }, + { + "epoch": 2.707902487936329, + "grad_norm": 5.459824085235596, + "learning_rate": 4.919676595385362e-06, + "loss": 0.2002, + "step": 43210 + }, + { + "epoch": 2.7085291721501537, + "grad_norm": 6.968769550323486, + "learning_rate": 4.909121614489878e-06, + "loss": 0.1127, + "step": 43220 + }, + { + "epoch": 2.709155856363978, + "grad_norm": 0.03392601013183594, + "learning_rate": 4.898566633594393e-06, + "loss": 0.037, + "step": 43230 + }, + { + "epoch": 2.7097825405778027, + "grad_norm": 0.13544121384620667, + "learning_rate": 4.888011652698909e-06, + "loss": 0.0745, + "step": 43240 + }, + { + "epoch": 2.7104092247916274, + "grad_norm": 7.2782301902771, + "learning_rate": 4.877456671803425e-06, + "loss": 0.0351, + "step": 43250 + }, + { + "epoch": 2.711035909005452, + "grad_norm": 0.06163564696907997, + "learning_rate": 4.86690169090794e-06, + "loss": 0.1478, + "step": 43260 + }, + { + "epoch": 2.711662593219277, + "grad_norm": 0.0893465131521225, + "learning_rate": 4.856346710012455e-06, + "loss": 0.0038, + "step": 43270 + }, + { + "epoch": 2.7122892774331016, + "grad_norm": 0.6175368428230286, + "learning_rate": 4.84579172911697e-06, + "loss": 0.1493, + "step": 43280 + }, + { + "epoch": 2.7129159616469263, + "grad_norm": 0.24023815989494324, + "learning_rate": 4.835236748221486e-06, + "loss": 0.0617, + "step": 43290 + }, + { + "epoch": 2.713542645860751, + "grad_norm": 0.01608484983444214, + "learning_rate": 4.824681767326002e-06, + "loss": 0.0713, + "step": 43300 + }, + { + "epoch": 2.7141693300745753, + "grad_norm": 0.17486777901649475, + "learning_rate": 4.814126786430517e-06, + "loss": 0.002, + "step": 43310 + }, + { + "epoch": 2.7147960142884, + "grad_norm": 0.24492332339286804, + "learning_rate": 4.803571805535032e-06, + "loss": 0.0289, + "step": 43320 + }, + { + "epoch": 2.7154226985022247, + "grad_norm": 7.103209972381592, + "learning_rate": 4.793016824639548e-06, + "loss": 0.0583, + "step": 43330 + }, + { + "epoch": 2.7160493827160495, + "grad_norm": 0.01576782949268818, + "learning_rate": 4.7824618437440635e-06, + "loss": 0.0413, + "step": 43340 + }, + { + "epoch": 2.716676066929874, + "grad_norm": 0.016768546774983406, + "learning_rate": 4.7719068628485785e-06, + "loss": 0.0285, + "step": 43350 + }, + { + "epoch": 2.7173027511436985, + "grad_norm": 0.014253930188715458, + "learning_rate": 4.761351881953094e-06, + "loss": 0.1184, + "step": 43360 + }, + { + "epoch": 2.717929435357523, + "grad_norm": 0.009223722852766514, + "learning_rate": 4.750796901057609e-06, + "loss": 0.099, + "step": 43370 + }, + { + "epoch": 2.718556119571348, + "grad_norm": 0.06819354742765427, + "learning_rate": 4.740241920162124e-06, + "loss": 0.0382, + "step": 43380 + }, + { + "epoch": 2.7191828037851726, + "grad_norm": 0.007454665377736092, + "learning_rate": 4.72968693926664e-06, + "loss": 0.2593, + "step": 43390 + }, + { + "epoch": 2.7198094879989974, + "grad_norm": 0.02343241311609745, + "learning_rate": 4.719131958371156e-06, + "loss": 0.0025, + "step": 43400 + }, + { + "epoch": 2.720436172212822, + "grad_norm": 0.007642934564501047, + "learning_rate": 4.708576977475672e-06, + "loss": 0.0171, + "step": 43410 + }, + { + "epoch": 2.721062856426647, + "grad_norm": 0.006195179186761379, + "learning_rate": 4.698021996580186e-06, + "loss": 0.1173, + "step": 43420 + }, + { + "epoch": 2.7216895406404715, + "grad_norm": 0.0281064510345459, + "learning_rate": 4.6874670156847015e-06, + "loss": 0.0045, + "step": 43430 + }, + { + "epoch": 2.722316224854296, + "grad_norm": 0.0708283931016922, + "learning_rate": 4.676912034789217e-06, + "loss": 0.0497, + "step": 43440 + }, + { + "epoch": 2.7229429090681205, + "grad_norm": 0.009736607782542706, + "learning_rate": 4.666357053893733e-06, + "loss": 0.0016, + "step": 43450 + }, + { + "epoch": 2.7235695932819453, + "grad_norm": 0.014862255193293095, + "learning_rate": 4.655802072998248e-06, + "loss": 0.0018, + "step": 43460 + }, + { + "epoch": 2.72419627749577, + "grad_norm": 0.3423691689968109, + "learning_rate": 4.645247092102763e-06, + "loss": 0.1195, + "step": 43470 + }, + { + "epoch": 2.7248229617095943, + "grad_norm": 1.0811691284179688, + "learning_rate": 4.634692111207279e-06, + "loss": 0.0056, + "step": 43480 + }, + { + "epoch": 2.725449645923419, + "grad_norm": 0.31755921244621277, + "learning_rate": 4.624137130311795e-06, + "loss": 0.3132, + "step": 43490 + }, + { + "epoch": 2.7260763301372437, + "grad_norm": 6.94637393951416, + "learning_rate": 4.61358214941631e-06, + "loss": 0.1208, + "step": 43500 + }, + { + "epoch": 2.7267030143510684, + "grad_norm": 0.23933982849121094, + "learning_rate": 4.603027168520825e-06, + "loss": 0.1155, + "step": 43510 + }, + { + "epoch": 2.727329698564893, + "grad_norm": 0.018548715859651566, + "learning_rate": 4.59247218762534e-06, + "loss": 0.0049, + "step": 43520 + }, + { + "epoch": 2.727956382778718, + "grad_norm": 0.02049705944955349, + "learning_rate": 4.581917206729856e-06, + "loss": 0.04, + "step": 43530 + }, + { + "epoch": 2.7285830669925426, + "grad_norm": 0.28012970089912415, + "learning_rate": 4.571362225834371e-06, + "loss": 0.0044, + "step": 43540 + }, + { + "epoch": 2.7292097512063673, + "grad_norm": 0.16389213502407074, + "learning_rate": 4.560807244938887e-06, + "loss": 0.0456, + "step": 43550 + }, + { + "epoch": 2.7298364354201916, + "grad_norm": 0.27304649353027344, + "learning_rate": 4.550252264043403e-06, + "loss": 0.1046, + "step": 43560 + }, + { + "epoch": 2.7304631196340163, + "grad_norm": 0.28648197650909424, + "learning_rate": 4.539697283147918e-06, + "loss": 0.0498, + "step": 43570 + }, + { + "epoch": 2.731089803847841, + "grad_norm": 0.013310969807207584, + "learning_rate": 4.529142302252433e-06, + "loss": 0.0713, + "step": 43580 + }, + { + "epoch": 2.731716488061666, + "grad_norm": 0.01846585050225258, + "learning_rate": 4.5185873213569485e-06, + "loss": 0.0194, + "step": 43590 + }, + { + "epoch": 2.7323431722754905, + "grad_norm": 10.159256935119629, + "learning_rate": 4.508032340461464e-06, + "loss": 0.0576, + "step": 43600 + }, + { + "epoch": 2.732969856489315, + "grad_norm": 0.012528217397630215, + "learning_rate": 4.49747735956598e-06, + "loss": 0.2147, + "step": 43610 + }, + { + "epoch": 2.7335965407031395, + "grad_norm": 4.751760482788086, + "learning_rate": 4.486922378670495e-06, + "loss": 0.0491, + "step": 43620 + }, + { + "epoch": 2.7342232249169642, + "grad_norm": 0.0792778953909874, + "learning_rate": 4.47636739777501e-06, + "loss": 0.1169, + "step": 43630 + }, + { + "epoch": 2.734849909130789, + "grad_norm": 0.20727191865444183, + "learning_rate": 4.465812416879526e-06, + "loss": 0.0531, + "step": 43640 + }, + { + "epoch": 2.7354765933446137, + "grad_norm": 0.009766854345798492, + "learning_rate": 4.455257435984042e-06, + "loss": 0.0051, + "step": 43650 + }, + { + "epoch": 2.7361032775584384, + "grad_norm": 7.263557434082031, + "learning_rate": 4.4447024550885565e-06, + "loss": 0.0657, + "step": 43660 + }, + { + "epoch": 2.736729961772263, + "grad_norm": 0.3022579252719879, + "learning_rate": 4.4341474741930715e-06, + "loss": 0.0035, + "step": 43670 + }, + { + "epoch": 2.737356645986088, + "grad_norm": 0.016718635335564613, + "learning_rate": 4.423592493297587e-06, + "loss": 0.0057, + "step": 43680 + }, + { + "epoch": 2.737983330199912, + "grad_norm": 0.004681083839386702, + "learning_rate": 4.413037512402102e-06, + "loss": 0.0038, + "step": 43690 + }, + { + "epoch": 2.738610014413737, + "grad_norm": 0.05114516615867615, + "learning_rate": 4.402482531506618e-06, + "loss": 0.0013, + "step": 43700 + }, + { + "epoch": 2.7392366986275616, + "grad_norm": 0.010205262340605259, + "learning_rate": 4.391927550611134e-06, + "loss": 0.0857, + "step": 43710 + }, + { + "epoch": 2.7398633828413863, + "grad_norm": 0.20754745602607727, + "learning_rate": 4.381372569715649e-06, + "loss": 0.0807, + "step": 43720 + }, + { + "epoch": 2.740490067055211, + "grad_norm": 25.73821258544922, + "learning_rate": 4.370817588820164e-06, + "loss": 0.1039, + "step": 43730 + }, + { + "epoch": 2.7411167512690353, + "grad_norm": 23.46526527404785, + "learning_rate": 4.36026260792468e-06, + "loss": 0.0647, + "step": 43740 + }, + { + "epoch": 2.74174343548286, + "grad_norm": 6.308553695678711, + "learning_rate": 4.349707627029195e-06, + "loss": 0.0606, + "step": 43750 + }, + { + "epoch": 2.7423701196966848, + "grad_norm": 0.009312220849096775, + "learning_rate": 4.339152646133711e-06, + "loss": 0.0727, + "step": 43760 + }, + { + "epoch": 2.7429968039105095, + "grad_norm": 0.026370780542492867, + "learning_rate": 4.328597665238226e-06, + "loss": 0.0012, + "step": 43770 + }, + { + "epoch": 2.743623488124334, + "grad_norm": 1.8105483055114746, + "learning_rate": 4.318042684342741e-06, + "loss": 0.2181, + "step": 43780 + }, + { + "epoch": 2.744250172338159, + "grad_norm": 0.026521051302552223, + "learning_rate": 4.307487703447257e-06, + "loss": 0.0558, + "step": 43790 + }, + { + "epoch": 2.7448768565519837, + "grad_norm": 0.056364260613918304, + "learning_rate": 4.296932722551773e-06, + "loss": 0.0425, + "step": 43800 + }, + { + "epoch": 2.7455035407658084, + "grad_norm": 0.056610897183418274, + "learning_rate": 4.286377741656288e-06, + "loss": 0.1089, + "step": 43810 + }, + { + "epoch": 2.7461302249796327, + "grad_norm": 0.016765808686614037, + "learning_rate": 4.2758227607608035e-06, + "loss": 0.0718, + "step": 43820 + }, + { + "epoch": 2.7467569091934574, + "grad_norm": 1.5205049514770508, + "learning_rate": 4.2652677798653184e-06, + "loss": 0.1873, + "step": 43830 + }, + { + "epoch": 2.747383593407282, + "grad_norm": 0.021429814398288727, + "learning_rate": 4.254712798969834e-06, + "loss": 0.1228, + "step": 43840 + }, + { + "epoch": 2.748010277621107, + "grad_norm": 0.4221652150154114, + "learning_rate": 4.244157818074349e-06, + "loss": 0.0062, + "step": 43850 + }, + { + "epoch": 2.748636961834931, + "grad_norm": 0.024672698229551315, + "learning_rate": 4.233602837178865e-06, + "loss": 0.0617, + "step": 43860 + }, + { + "epoch": 2.749263646048756, + "grad_norm": 0.3984994888305664, + "learning_rate": 4.223047856283381e-06, + "loss": 0.0381, + "step": 43870 + }, + { + "epoch": 2.7498903302625806, + "grad_norm": 0.023568619042634964, + "learning_rate": 4.212492875387896e-06, + "loss": 0.0323, + "step": 43880 + }, + { + "epoch": 2.7505170144764053, + "grad_norm": 0.8084057569503784, + "learning_rate": 4.201937894492411e-06, + "loss": 0.1702, + "step": 43890 + }, + { + "epoch": 2.75114369869023, + "grad_norm": 53.31449508666992, + "learning_rate": 4.1913829135969265e-06, + "loss": 0.1677, + "step": 43900 + }, + { + "epoch": 2.7517703829040547, + "grad_norm": 0.021764956414699554, + "learning_rate": 4.180827932701442e-06, + "loss": 0.0817, + "step": 43910 + }, + { + "epoch": 2.7523970671178795, + "grad_norm": 0.05249863490462303, + "learning_rate": 4.170272951805958e-06, + "loss": 0.0425, + "step": 43920 + }, + { + "epoch": 2.753023751331704, + "grad_norm": 6.336426258087158, + "learning_rate": 4.159717970910472e-06, + "loss": 0.1102, + "step": 43930 + }, + { + "epoch": 2.7536504355455285, + "grad_norm": 0.8154854774475098, + "learning_rate": 4.149162990014988e-06, + "loss": 0.0805, + "step": 43940 + }, + { + "epoch": 2.754277119759353, + "grad_norm": 0.028954818844795227, + "learning_rate": 4.138608009119504e-06, + "loss": 0.0646, + "step": 43950 + }, + { + "epoch": 2.754903803973178, + "grad_norm": 0.01812795363366604, + "learning_rate": 4.12805302822402e-06, + "loss": 0.1344, + "step": 43960 + }, + { + "epoch": 2.7555304881870026, + "grad_norm": 0.1904141902923584, + "learning_rate": 4.117498047328535e-06, + "loss": 0.1025, + "step": 43970 + }, + { + "epoch": 2.7561571724008274, + "grad_norm": 0.15014392137527466, + "learning_rate": 4.10694306643305e-06, + "loss": 0.0035, + "step": 43980 + }, + { + "epoch": 2.7567838566146516, + "grad_norm": 0.16566021740436554, + "learning_rate": 4.096388085537565e-06, + "loss": 0.1814, + "step": 43990 + }, + { + "epoch": 2.7574105408284764, + "grad_norm": 0.4417349398136139, + "learning_rate": 4.085833104642081e-06, + "loss": 0.0033, + "step": 44000 + }, + { + "epoch": 2.758037225042301, + "grad_norm": 0.03424863517284393, + "learning_rate": 4.075278123746596e-06, + "loss": 0.0012, + "step": 44010 + }, + { + "epoch": 2.758663909256126, + "grad_norm": 0.02660740353167057, + "learning_rate": 4.064723142851112e-06, + "loss": 0.0299, + "step": 44020 + }, + { + "epoch": 2.7592905934699505, + "grad_norm": 0.1258479654788971, + "learning_rate": 4.054168161955627e-06, + "loss": 0.0038, + "step": 44030 + }, + { + "epoch": 2.7599172776837753, + "grad_norm": 0.13935960829257965, + "learning_rate": 4.043613181060142e-06, + "loss": 0.0374, + "step": 44040 + }, + { + "epoch": 2.7605439618976, + "grad_norm": 0.08836539089679718, + "learning_rate": 4.033058200164658e-06, + "loss": 0.0768, + "step": 44050 + }, + { + "epoch": 2.7611706461114247, + "grad_norm": 0.18964870274066925, + "learning_rate": 4.0225032192691735e-06, + "loss": 0.106, + "step": 44060 + }, + { + "epoch": 2.761797330325249, + "grad_norm": 0.010779556818306446, + "learning_rate": 4.011948238373689e-06, + "loss": 0.0014, + "step": 44070 + }, + { + "epoch": 2.7624240145390737, + "grad_norm": 0.21891902387142181, + "learning_rate": 4.001393257478204e-06, + "loss": 0.0294, + "step": 44080 + }, + { + "epoch": 2.7630506987528984, + "grad_norm": 0.1056990772485733, + "learning_rate": 3.990838276582719e-06, + "loss": 0.0478, + "step": 44090 + }, + { + "epoch": 2.763677382966723, + "grad_norm": 0.1947309374809265, + "learning_rate": 3.980283295687235e-06, + "loss": 0.1257, + "step": 44100 + }, + { + "epoch": 2.764304067180548, + "grad_norm": 0.9363518357276917, + "learning_rate": 3.969728314791751e-06, + "loss": 0.0757, + "step": 44110 + }, + { + "epoch": 2.764930751394372, + "grad_norm": 0.006408642511814833, + "learning_rate": 3.959173333896266e-06, + "loss": 0.0577, + "step": 44120 + }, + { + "epoch": 2.765557435608197, + "grad_norm": 0.36477547883987427, + "learning_rate": 3.9486183530007816e-06, + "loss": 0.0014, + "step": 44130 + }, + { + "epoch": 2.7661841198220216, + "grad_norm": 0.11764029413461685, + "learning_rate": 3.9380633721052965e-06, + "loss": 0.0283, + "step": 44140 + }, + { + "epoch": 2.7668108040358463, + "grad_norm": 1.2040200233459473, + "learning_rate": 3.927508391209812e-06, + "loss": 0.0767, + "step": 44150 + }, + { + "epoch": 2.767437488249671, + "grad_norm": 0.04894113168120384, + "learning_rate": 3.916953410314327e-06, + "loss": 0.0558, + "step": 44160 + }, + { + "epoch": 2.768064172463496, + "grad_norm": 7.974740982055664, + "learning_rate": 3.906398429418843e-06, + "loss": 0.0612, + "step": 44170 + }, + { + "epoch": 2.7686908566773205, + "grad_norm": 0.5689123272895813, + "learning_rate": 3.895843448523358e-06, + "loss": 0.0887, + "step": 44180 + }, + { + "epoch": 2.769317540891145, + "grad_norm": 0.12117575854063034, + "learning_rate": 3.885288467627874e-06, + "loss": 0.0454, + "step": 44190 + }, + { + "epoch": 2.7699442251049695, + "grad_norm": 0.05510355532169342, + "learning_rate": 3.874733486732389e-06, + "loss": 0.0018, + "step": 44200 + }, + { + "epoch": 2.7705709093187942, + "grad_norm": 0.34615781903266907, + "learning_rate": 3.864178505836905e-06, + "loss": 0.1697, + "step": 44210 + }, + { + "epoch": 2.771197593532619, + "grad_norm": 0.016674628481268883, + "learning_rate": 3.85362352494142e-06, + "loss": 0.0888, + "step": 44220 + }, + { + "epoch": 2.7718242777464437, + "grad_norm": 0.009896052069962025, + "learning_rate": 3.843068544045935e-06, + "loss": 0.004, + "step": 44230 + }, + { + "epoch": 2.772450961960268, + "grad_norm": 11.549787521362305, + "learning_rate": 3.83251356315045e-06, + "loss": 0.0569, + "step": 44240 + }, + { + "epoch": 2.7730776461740927, + "grad_norm": 0.005293003749102354, + "learning_rate": 3.821958582254966e-06, + "loss": 0.1791, + "step": 44250 + }, + { + "epoch": 2.7737043303879174, + "grad_norm": 0.01065013650804758, + "learning_rate": 3.811403601359482e-06, + "loss": 0.0196, + "step": 44260 + }, + { + "epoch": 2.774331014601742, + "grad_norm": 0.4858344793319702, + "learning_rate": 3.8008486204639973e-06, + "loss": 0.0956, + "step": 44270 + }, + { + "epoch": 2.774957698815567, + "grad_norm": 0.007913574576377869, + "learning_rate": 3.7902936395685123e-06, + "loss": 0.0668, + "step": 44280 + }, + { + "epoch": 2.7755843830293916, + "grad_norm": 0.36157143115997314, + "learning_rate": 3.779738658673028e-06, + "loss": 0.1573, + "step": 44290 + }, + { + "epoch": 2.7762110672432163, + "grad_norm": 0.010120880790054798, + "learning_rate": 3.7691836777775435e-06, + "loss": 0.0908, + "step": 44300 + }, + { + "epoch": 2.776837751457041, + "grad_norm": 0.016430944204330444, + "learning_rate": 3.7586286968820593e-06, + "loss": 0.0009, + "step": 44310 + }, + { + "epoch": 2.7774644356708653, + "grad_norm": 0.011352479457855225, + "learning_rate": 3.7480737159865742e-06, + "loss": 0.0313, + "step": 44320 + }, + { + "epoch": 2.77809111988469, + "grad_norm": 0.011910725384950638, + "learning_rate": 3.7375187350910896e-06, + "loss": 0.0014, + "step": 44330 + }, + { + "epoch": 2.7787178040985148, + "grad_norm": 0.03391356021165848, + "learning_rate": 3.7269637541956054e-06, + "loss": 0.1373, + "step": 44340 + }, + { + "epoch": 2.7793444883123395, + "grad_norm": 0.06280501186847687, + "learning_rate": 3.7164087733001204e-06, + "loss": 0.0012, + "step": 44350 + }, + { + "epoch": 2.779971172526164, + "grad_norm": 0.01791159249842167, + "learning_rate": 3.7058537924046357e-06, + "loss": 0.0778, + "step": 44360 + }, + { + "epoch": 2.7805978567399885, + "grad_norm": 0.007814787328243256, + "learning_rate": 3.6952988115091515e-06, + "loss": 0.0069, + "step": 44370 + }, + { + "epoch": 2.781224540953813, + "grad_norm": 0.03260798752307892, + "learning_rate": 3.684743830613667e-06, + "loss": 0.0707, + "step": 44380 + }, + { + "epoch": 2.781851225167638, + "grad_norm": 0.04626648873090744, + "learning_rate": 3.674188849718182e-06, + "loss": 0.0044, + "step": 44390 + }, + { + "epoch": 2.7824779093814627, + "grad_norm": 0.004598037339746952, + "learning_rate": 3.6636338688226973e-06, + "loss": 0.0621, + "step": 44400 + }, + { + "epoch": 2.7831045935952874, + "grad_norm": 16.925655364990234, + "learning_rate": 3.653078887927213e-06, + "loss": 0.0674, + "step": 44410 + }, + { + "epoch": 2.783731277809112, + "grad_norm": 0.01036140788346529, + "learning_rate": 3.642523907031729e-06, + "loss": 0.0396, + "step": 44420 + }, + { + "epoch": 2.784357962022937, + "grad_norm": 0.004241311922669411, + "learning_rate": 3.6319689261362434e-06, + "loss": 0.1163, + "step": 44430 + }, + { + "epoch": 2.7849846462367616, + "grad_norm": 0.003902185009792447, + "learning_rate": 3.621413945240759e-06, + "loss": 0.0272, + "step": 44440 + }, + { + "epoch": 2.785611330450586, + "grad_norm": 0.474302738904953, + "learning_rate": 3.6108589643452746e-06, + "loss": 0.0135, + "step": 44450 + }, + { + "epoch": 2.7862380146644106, + "grad_norm": 0.011814618483185768, + "learning_rate": 3.6003039834497904e-06, + "loss": 0.1892, + "step": 44460 + }, + { + "epoch": 2.7868646988782353, + "grad_norm": 0.1012982577085495, + "learning_rate": 3.5897490025543053e-06, + "loss": 0.1607, + "step": 44470 + }, + { + "epoch": 2.78749138309206, + "grad_norm": 0.03258698061108589, + "learning_rate": 3.5791940216588207e-06, + "loss": 0.1343, + "step": 44480 + }, + { + "epoch": 2.7881180673058843, + "grad_norm": 0.004049960989505053, + "learning_rate": 3.5686390407633365e-06, + "loss": 0.2204, + "step": 44490 + }, + { + "epoch": 2.788744751519709, + "grad_norm": 0.008130665868520737, + "learning_rate": 3.558084059867852e-06, + "loss": 0.0028, + "step": 44500 + }, + { + "epoch": 2.7893714357335337, + "grad_norm": 9.015934944152832, + "learning_rate": 3.547529078972367e-06, + "loss": 0.082, + "step": 44510 + }, + { + "epoch": 2.7899981199473585, + "grad_norm": 0.00387417059391737, + "learning_rate": 3.5369740980768827e-06, + "loss": 0.0949, + "step": 44520 + }, + { + "epoch": 2.790624804161183, + "grad_norm": 4.448673725128174, + "learning_rate": 3.526419117181398e-06, + "loss": 0.0473, + "step": 44530 + }, + { + "epoch": 2.791251488375008, + "grad_norm": 0.11645185202360153, + "learning_rate": 3.515864136285914e-06, + "loss": 0.0798, + "step": 44540 + }, + { + "epoch": 2.7918781725888326, + "grad_norm": 0.047189708799123764, + "learning_rate": 3.505309155390429e-06, + "loss": 0.0368, + "step": 44550 + }, + { + "epoch": 2.7925048568026574, + "grad_norm": 16.466087341308594, + "learning_rate": 3.494754174494944e-06, + "loss": 0.09, + "step": 44560 + }, + { + "epoch": 2.7931315410164816, + "grad_norm": 0.07427942007780075, + "learning_rate": 3.48419919359946e-06, + "loss": 0.1788, + "step": 44570 + }, + { + "epoch": 2.7937582252303064, + "grad_norm": 0.014952383004128933, + "learning_rate": 3.4736442127039754e-06, + "loss": 0.0356, + "step": 44580 + }, + { + "epoch": 2.794384909444131, + "grad_norm": 0.5309322476387024, + "learning_rate": 3.4630892318084903e-06, + "loss": 0.0368, + "step": 44590 + }, + { + "epoch": 2.795011593657956, + "grad_norm": 4.181199550628662, + "learning_rate": 3.452534250913006e-06, + "loss": 0.1609, + "step": 44600 + }, + { + "epoch": 2.7956382778717805, + "grad_norm": 0.31589823961257935, + "learning_rate": 3.4419792700175215e-06, + "loss": 0.0745, + "step": 44610 + }, + { + "epoch": 2.796264962085605, + "grad_norm": 4.957091331481934, + "learning_rate": 3.4314242891220373e-06, + "loss": 0.0768, + "step": 44620 + }, + { + "epoch": 2.7968916462994295, + "grad_norm": 0.6653972268104553, + "learning_rate": 3.4208693082265523e-06, + "loss": 0.1005, + "step": 44630 + }, + { + "epoch": 2.7975183305132543, + "grad_norm": 0.03658928722143173, + "learning_rate": 3.4103143273310677e-06, + "loss": 0.0479, + "step": 44640 + }, + { + "epoch": 2.798145014727079, + "grad_norm": 0.028675353154540062, + "learning_rate": 3.3997593464355835e-06, + "loss": 0.01, + "step": 44650 + }, + { + "epoch": 2.7987716989409037, + "grad_norm": 0.05703501030802727, + "learning_rate": 3.389204365540099e-06, + "loss": 0.0476, + "step": 44660 + }, + { + "epoch": 2.7993983831547284, + "grad_norm": 0.025796543806791306, + "learning_rate": 3.378649384644614e-06, + "loss": 0.2466, + "step": 44670 + }, + { + "epoch": 2.800025067368553, + "grad_norm": 0.06946486234664917, + "learning_rate": 3.368094403749129e-06, + "loss": 0.0533, + "step": 44680 + }, + { + "epoch": 2.800651751582378, + "grad_norm": 0.03723221644759178, + "learning_rate": 3.357539422853645e-06, + "loss": 0.1658, + "step": 44690 + }, + { + "epoch": 2.801278435796202, + "grad_norm": 0.20439843833446503, + "learning_rate": 3.34698444195816e-06, + "loss": 0.0314, + "step": 44700 + }, + { + "epoch": 2.801905120010027, + "grad_norm": 9.646413803100586, + "learning_rate": 3.3364294610626753e-06, + "loss": 0.0337, + "step": 44710 + }, + { + "epoch": 2.8025318042238516, + "grad_norm": 124.05106353759766, + "learning_rate": 3.325874480167191e-06, + "loss": 0.0586, + "step": 44720 + }, + { + "epoch": 2.8031584884376763, + "grad_norm": 0.05693316087126732, + "learning_rate": 3.3153194992717065e-06, + "loss": 0.0022, + "step": 44730 + }, + { + "epoch": 2.803785172651501, + "grad_norm": 0.04369988664984703, + "learning_rate": 3.3047645183762215e-06, + "loss": 0.0427, + "step": 44740 + }, + { + "epoch": 2.8044118568653253, + "grad_norm": 0.03682061284780502, + "learning_rate": 3.2942095374807373e-06, + "loss": 0.0777, + "step": 44750 + }, + { + "epoch": 2.80503854107915, + "grad_norm": 0.3942996859550476, + "learning_rate": 3.2836545565852527e-06, + "loss": 0.123, + "step": 44760 + }, + { + "epoch": 2.805665225292975, + "grad_norm": 0.03973536938428879, + "learning_rate": 3.2730995756897685e-06, + "loss": 0.0912, + "step": 44770 + }, + { + "epoch": 2.8062919095067995, + "grad_norm": 0.3577216863632202, + "learning_rate": 3.2625445947942834e-06, + "loss": 0.0692, + "step": 44780 + }, + { + "epoch": 2.8069185937206242, + "grad_norm": 0.13281461596488953, + "learning_rate": 3.251989613898799e-06, + "loss": 0.1269, + "step": 44790 + }, + { + "epoch": 2.807545277934449, + "grad_norm": 0.1817847192287445, + "learning_rate": 3.2414346330033146e-06, + "loss": 0.0766, + "step": 44800 + }, + { + "epoch": 2.8081719621482737, + "grad_norm": 0.05994454398751259, + "learning_rate": 3.23087965210783e-06, + "loss": 0.038, + "step": 44810 + }, + { + "epoch": 2.808798646362098, + "grad_norm": 0.007000461686402559, + "learning_rate": 3.220324671212345e-06, + "loss": 0.1416, + "step": 44820 + }, + { + "epoch": 2.8094253305759227, + "grad_norm": 0.06378606706857681, + "learning_rate": 3.2097696903168607e-06, + "loss": 0.0422, + "step": 44830 + }, + { + "epoch": 2.8100520147897474, + "grad_norm": 0.007673150394111872, + "learning_rate": 3.199214709421376e-06, + "loss": 0.0294, + "step": 44840 + }, + { + "epoch": 2.810678699003572, + "grad_norm": 0.09554049372673035, + "learning_rate": 3.188659728525892e-06, + "loss": 0.0039, + "step": 44850 + }, + { + "epoch": 2.811305383217397, + "grad_norm": 0.07531704008579254, + "learning_rate": 3.178104747630407e-06, + "loss": 0.0449, + "step": 44860 + }, + { + "epoch": 2.811932067431221, + "grad_norm": 0.2533637583255768, + "learning_rate": 3.1675497667349223e-06, + "loss": 0.0041, + "step": 44870 + }, + { + "epoch": 2.812558751645046, + "grad_norm": 8.10892391204834, + "learning_rate": 3.156994785839438e-06, + "loss": 0.0659, + "step": 44880 + }, + { + "epoch": 2.8131854358588706, + "grad_norm": 7.151573657989502, + "learning_rate": 3.1464398049439535e-06, + "loss": 0.1933, + "step": 44890 + }, + { + "epoch": 2.8138121200726953, + "grad_norm": 0.3158617913722992, + "learning_rate": 3.1358848240484684e-06, + "loss": 0.0357, + "step": 44900 + }, + { + "epoch": 2.81443880428652, + "grad_norm": 0.012602360919117928, + "learning_rate": 3.125329843152984e-06, + "loss": 0.1188, + "step": 44910 + }, + { + "epoch": 2.8150654885003448, + "grad_norm": 5.404448509216309, + "learning_rate": 3.1147748622574996e-06, + "loss": 0.0822, + "step": 44920 + }, + { + "epoch": 2.8156921727141695, + "grad_norm": 0.0632733628153801, + "learning_rate": 3.104219881362015e-06, + "loss": 0.0099, + "step": 44930 + }, + { + "epoch": 2.816318856927994, + "grad_norm": 0.19753728806972504, + "learning_rate": 3.0936649004665304e-06, + "loss": 0.1135, + "step": 44940 + }, + { + "epoch": 2.8169455411418185, + "grad_norm": 5.0890092849731445, + "learning_rate": 3.0831099195710457e-06, + "loss": 0.1338, + "step": 44950 + }, + { + "epoch": 2.8175722253556432, + "grad_norm": 0.8431811332702637, + "learning_rate": 3.072554938675561e-06, + "loss": 0.0986, + "step": 44960 + }, + { + "epoch": 2.818198909569468, + "grad_norm": 0.20394667983055115, + "learning_rate": 3.0619999577800765e-06, + "loss": 0.0036, + "step": 44970 + }, + { + "epoch": 2.8188255937832927, + "grad_norm": 0.30946293473243713, + "learning_rate": 3.0514449768845923e-06, + "loss": 0.0354, + "step": 44980 + }, + { + "epoch": 2.8194522779971174, + "grad_norm": 0.1506856381893158, + "learning_rate": 3.0408899959891073e-06, + "loss": 0.0177, + "step": 44990 + }, + { + "epoch": 2.8200789622109417, + "grad_norm": 0.2589752972126007, + "learning_rate": 3.030335015093623e-06, + "loss": 0.0036, + "step": 45000 + }, + { + "epoch": 2.8207056464247664, + "grad_norm": 0.020266905426979065, + "learning_rate": 3.0197800341981384e-06, + "loss": 0.0016, + "step": 45010 + }, + { + "epoch": 2.821332330638591, + "grad_norm": 10.223530769348145, + "learning_rate": 3.0092250533026534e-06, + "loss": 0.1038, + "step": 45020 + }, + { + "epoch": 2.821959014852416, + "grad_norm": 0.005880449432879686, + "learning_rate": 2.998670072407169e-06, + "loss": 0.0008, + "step": 45030 + }, + { + "epoch": 2.8225856990662406, + "grad_norm": 0.00848589837551117, + "learning_rate": 2.988115091511684e-06, + "loss": 0.0746, + "step": 45040 + }, + { + "epoch": 2.8232123832800653, + "grad_norm": 0.02407548949122429, + "learning_rate": 2.9775601106162e-06, + "loss": 0.0358, + "step": 45050 + }, + { + "epoch": 2.82383906749389, + "grad_norm": 0.058489829301834106, + "learning_rate": 2.9670051297207154e-06, + "loss": 0.0485, + "step": 45060 + }, + { + "epoch": 2.8244657517077147, + "grad_norm": 172.251220703125, + "learning_rate": 2.9564501488252307e-06, + "loss": 0.096, + "step": 45070 + }, + { + "epoch": 2.825092435921539, + "grad_norm": 0.23265521228313446, + "learning_rate": 2.945895167929746e-06, + "loss": 0.0534, + "step": 45080 + }, + { + "epoch": 2.8257191201353637, + "grad_norm": 0.03863035887479782, + "learning_rate": 2.9353401870342615e-06, + "loss": 0.1314, + "step": 45090 + }, + { + "epoch": 2.8263458043491885, + "grad_norm": 0.027446454390883446, + "learning_rate": 2.924785206138777e-06, + "loss": 0.0281, + "step": 45100 + }, + { + "epoch": 2.826972488563013, + "grad_norm": 20.515501022338867, + "learning_rate": 2.9142302252432927e-06, + "loss": 0.0892, + "step": 45110 + }, + { + "epoch": 2.8275991727768375, + "grad_norm": 0.011976310983300209, + "learning_rate": 2.9036752443478076e-06, + "loss": 0.003, + "step": 45120 + }, + { + "epoch": 2.828225856990662, + "grad_norm": 0.042540911585092545, + "learning_rate": 2.8931202634523234e-06, + "loss": 0.1174, + "step": 45130 + }, + { + "epoch": 2.828852541204487, + "grad_norm": 7.581174373626709, + "learning_rate": 2.882565282556839e-06, + "loss": 0.0382, + "step": 45140 + }, + { + "epoch": 2.8294792254183116, + "grad_norm": 0.22137735784053802, + "learning_rate": 2.872010301661354e-06, + "loss": 0.0034, + "step": 45150 + }, + { + "epoch": 2.8301059096321364, + "grad_norm": 0.2696286737918854, + "learning_rate": 2.8614553207658696e-06, + "loss": 0.0279, + "step": 45160 + }, + { + "epoch": 2.830732593845961, + "grad_norm": 0.15648046135902405, + "learning_rate": 2.850900339870385e-06, + "loss": 0.0128, + "step": 45170 + }, + { + "epoch": 2.831359278059786, + "grad_norm": 0.03201805427670479, + "learning_rate": 2.8403453589749003e-06, + "loss": 0.097, + "step": 45180 + }, + { + "epoch": 2.8319859622736105, + "grad_norm": 0.17333370447158813, + "learning_rate": 2.8297903780794157e-06, + "loss": 0.0033, + "step": 45190 + }, + { + "epoch": 2.832612646487435, + "grad_norm": 0.009709888137876987, + "learning_rate": 2.819235397183931e-06, + "loss": 0.0018, + "step": 45200 + }, + { + "epoch": 2.8332393307012596, + "grad_norm": 0.012017948552966118, + "learning_rate": 2.808680416288447e-06, + "loss": 0.0017, + "step": 45210 + }, + { + "epoch": 2.8338660149150843, + "grad_norm": 0.01653211936354637, + "learning_rate": 2.798125435392962e-06, + "loss": 0.0024, + "step": 45220 + }, + { + "epoch": 2.834492699128909, + "grad_norm": 6.492367744445801, + "learning_rate": 2.7875704544974777e-06, + "loss": 0.1434, + "step": 45230 + }, + { + "epoch": 2.8351193833427337, + "grad_norm": 11.736387252807617, + "learning_rate": 2.777015473601993e-06, + "loss": 0.123, + "step": 45240 + }, + { + "epoch": 2.835746067556558, + "grad_norm": 0.013091763481497765, + "learning_rate": 2.7664604927065084e-06, + "loss": 0.1348, + "step": 45250 + }, + { + "epoch": 2.8363727517703827, + "grad_norm": 3.22715163230896, + "learning_rate": 2.755905511811024e-06, + "loss": 0.0459, + "step": 45260 + }, + { + "epoch": 2.8369994359842075, + "grad_norm": 15.400287628173828, + "learning_rate": 2.745350530915539e-06, + "loss": 0.0604, + "step": 45270 + }, + { + "epoch": 2.837626120198032, + "grad_norm": 0.014584527350962162, + "learning_rate": 2.7347955500200546e-06, + "loss": 0.0825, + "step": 45280 + }, + { + "epoch": 2.838252804411857, + "grad_norm": 0.24854691326618195, + "learning_rate": 2.7242405691245704e-06, + "loss": 0.002, + "step": 45290 + }, + { + "epoch": 2.8388794886256816, + "grad_norm": 0.1337384581565857, + "learning_rate": 2.7136855882290853e-06, + "loss": 0.0022, + "step": 45300 + }, + { + "epoch": 2.8395061728395063, + "grad_norm": 0.014081398025155067, + "learning_rate": 2.703130607333601e-06, + "loss": 0.0442, + "step": 45310 + }, + { + "epoch": 2.840132857053331, + "grad_norm": 0.017172805964946747, + "learning_rate": 2.692575626438116e-06, + "loss": 0.0428, + "step": 45320 + }, + { + "epoch": 2.8407595412671554, + "grad_norm": 0.021060975268483162, + "learning_rate": 2.682020645542632e-06, + "loss": 0.1414, + "step": 45330 + }, + { + "epoch": 2.84138622548098, + "grad_norm": 0.04201918840408325, + "learning_rate": 2.6714656646471473e-06, + "loss": 0.0713, + "step": 45340 + }, + { + "epoch": 2.842012909694805, + "grad_norm": 0.01611120067536831, + "learning_rate": 2.6609106837516622e-06, + "loss": 0.0448, + "step": 45350 + }, + { + "epoch": 2.8426395939086295, + "grad_norm": 0.021956700831651688, + "learning_rate": 2.650355702856178e-06, + "loss": 0.0068, + "step": 45360 + }, + { + "epoch": 2.8432662781224542, + "grad_norm": 0.031278882175683975, + "learning_rate": 2.6398007219606934e-06, + "loss": 0.0365, + "step": 45370 + }, + { + "epoch": 2.8438929623362785, + "grad_norm": 0.059587206691503525, + "learning_rate": 2.629245741065209e-06, + "loss": 0.0497, + "step": 45380 + }, + { + "epoch": 2.8445196465501033, + "grad_norm": 0.033658090978860855, + "learning_rate": 2.618690760169724e-06, + "loss": 0.1026, + "step": 45390 + }, + { + "epoch": 2.845146330763928, + "grad_norm": 0.4833011329174042, + "learning_rate": 2.6081357792742396e-06, + "loss": 0.0172, + "step": 45400 + }, + { + "epoch": 2.8457730149777527, + "grad_norm": 0.017121130600571632, + "learning_rate": 2.597580798378755e-06, + "loss": 0.1607, + "step": 45410 + }, + { + "epoch": 2.8463996991915774, + "grad_norm": 0.0939975306391716, + "learning_rate": 2.5870258174832703e-06, + "loss": 0.0868, + "step": 45420 + }, + { + "epoch": 2.847026383405402, + "grad_norm": 0.1767207384109497, + "learning_rate": 2.5764708365877857e-06, + "loss": 0.0336, + "step": 45430 + }, + { + "epoch": 2.847653067619227, + "grad_norm": 0.012052887119352818, + "learning_rate": 2.5659158556923015e-06, + "loss": 0.0405, + "step": 45440 + }, + { + "epoch": 2.8482797518330516, + "grad_norm": 0.10825521498918533, + "learning_rate": 2.5553608747968165e-06, + "loss": 0.0965, + "step": 45450 + }, + { + "epoch": 2.848906436046876, + "grad_norm": 0.20288436114788055, + "learning_rate": 2.5448058939013323e-06, + "loss": 0.0956, + "step": 45460 + }, + { + "epoch": 2.8495331202607006, + "grad_norm": 0.015252267010509968, + "learning_rate": 2.5342509130058477e-06, + "loss": 0.0015, + "step": 45470 + }, + { + "epoch": 2.8501598044745253, + "grad_norm": 0.0075327022932469845, + "learning_rate": 2.523695932110363e-06, + "loss": 0.0031, + "step": 45480 + }, + { + "epoch": 2.85078648868835, + "grad_norm": 0.024453936144709587, + "learning_rate": 2.5131409512148784e-06, + "loss": 0.0186, + "step": 45490 + }, + { + "epoch": 2.8514131729021743, + "grad_norm": 0.015601756051182747, + "learning_rate": 2.502585970319394e-06, + "loss": 0.0584, + "step": 45500 + }, + { + "epoch": 2.852039857115999, + "grad_norm": 0.020709333941340446, + "learning_rate": 2.492030989423909e-06, + "loss": 0.0372, + "step": 45510 + }, + { + "epoch": 2.852666541329824, + "grad_norm": 9.91942024230957, + "learning_rate": 2.481476008528425e-06, + "loss": 0.0665, + "step": 45520 + }, + { + "epoch": 2.8532932255436485, + "grad_norm": 0.011118386872112751, + "learning_rate": 2.47092102763294e-06, + "loss": 0.001, + "step": 45530 + }, + { + "epoch": 2.8539199097574732, + "grad_norm": 0.00901950802654028, + "learning_rate": 2.4603660467374557e-06, + "loss": 0.0011, + "step": 45540 + }, + { + "epoch": 2.854546593971298, + "grad_norm": 0.09835624694824219, + "learning_rate": 2.4498110658419707e-06, + "loss": 0.0642, + "step": 45550 + }, + { + "epoch": 2.8551732781851227, + "grad_norm": 9.125321388244629, + "learning_rate": 2.4392560849464865e-06, + "loss": 0.0341, + "step": 45560 + }, + { + "epoch": 2.8557999623989474, + "grad_norm": 0.007022942416369915, + "learning_rate": 2.428701104051002e-06, + "loss": 0.0011, + "step": 45570 + }, + { + "epoch": 2.8564266466127717, + "grad_norm": 0.13939209282398224, + "learning_rate": 2.4181461231555173e-06, + "loss": 0.0816, + "step": 45580 + }, + { + "epoch": 2.8570533308265964, + "grad_norm": 0.010702289640903473, + "learning_rate": 2.4075911422600326e-06, + "loss": 0.0439, + "step": 45590 + }, + { + "epoch": 2.857680015040421, + "grad_norm": 9.077207565307617, + "learning_rate": 2.397036161364548e-06, + "loss": 0.0487, + "step": 45600 + }, + { + "epoch": 2.858306699254246, + "grad_norm": 0.005782410968095064, + "learning_rate": 2.3864811804690634e-06, + "loss": 0.0007, + "step": 45610 + }, + { + "epoch": 2.8589333834680706, + "grad_norm": 0.006889980286359787, + "learning_rate": 2.375926199573579e-06, + "loss": 0.0009, + "step": 45620 + }, + { + "epoch": 2.859560067681895, + "grad_norm": 0.14798372983932495, + "learning_rate": 2.365371218678094e-06, + "loss": 0.0306, + "step": 45630 + }, + { + "epoch": 2.8601867518957196, + "grad_norm": 0.009764185175299644, + "learning_rate": 2.35481623778261e-06, + "loss": 0.0012, + "step": 45640 + }, + { + "epoch": 2.8608134361095443, + "grad_norm": 6.804908752441406, + "learning_rate": 2.3442612568871254e-06, + "loss": 0.12, + "step": 45650 + }, + { + "epoch": 2.861440120323369, + "grad_norm": 0.008579867891967297, + "learning_rate": 2.3337062759916407e-06, + "loss": 0.027, + "step": 45660 + }, + { + "epoch": 2.8620668045371938, + "grad_norm": 0.00904889777302742, + "learning_rate": 2.323151295096156e-06, + "loss": 0.0005, + "step": 45670 + }, + { + "epoch": 2.8626934887510185, + "grad_norm": 5.600223541259766, + "learning_rate": 2.312596314200671e-06, + "loss": 0.1602, + "step": 45680 + }, + { + "epoch": 2.863320172964843, + "grad_norm": 0.03661729022860527, + "learning_rate": 2.302041333305187e-06, + "loss": 0.0022, + "step": 45690 + }, + { + "epoch": 2.863946857178668, + "grad_norm": 0.07130564749240875, + "learning_rate": 2.2914863524097023e-06, + "loss": 0.0046, + "step": 45700 + }, + { + "epoch": 2.864573541392492, + "grad_norm": 0.005574720446020365, + "learning_rate": 2.2809313715142176e-06, + "loss": 0.0299, + "step": 45710 + }, + { + "epoch": 2.865200225606317, + "grad_norm": 0.006751071196049452, + "learning_rate": 2.270376390618733e-06, + "loss": 0.0568, + "step": 45720 + }, + { + "epoch": 2.8658269098201417, + "grad_norm": 0.6684667468070984, + "learning_rate": 2.2598214097232484e-06, + "loss": 0.0062, + "step": 45730 + }, + { + "epoch": 2.8664535940339664, + "grad_norm": 1.4935647249221802, + "learning_rate": 2.2492664288277638e-06, + "loss": 0.1296, + "step": 45740 + }, + { + "epoch": 2.867080278247791, + "grad_norm": 0.006710459478199482, + "learning_rate": 2.2387114479322796e-06, + "loss": 0.0375, + "step": 45750 + }, + { + "epoch": 2.8677069624616154, + "grad_norm": 15.230756759643555, + "learning_rate": 2.2281564670367945e-06, + "loss": 0.1664, + "step": 45760 + }, + { + "epoch": 2.86833364667544, + "grad_norm": 1.371279239654541, + "learning_rate": 2.2176014861413103e-06, + "loss": 0.0035, + "step": 45770 + }, + { + "epoch": 2.868960330889265, + "grad_norm": 0.05189768597483635, + "learning_rate": 2.2070465052458253e-06, + "loss": 0.0907, + "step": 45780 + }, + { + "epoch": 2.8695870151030896, + "grad_norm": 0.06382831186056137, + "learning_rate": 2.196491524350341e-06, + "loss": 0.2114, + "step": 45790 + }, + { + "epoch": 2.8702136993169143, + "grad_norm": 0.02207627147436142, + "learning_rate": 2.1859365434548565e-06, + "loss": 0.0651, + "step": 45800 + }, + { + "epoch": 2.870840383530739, + "grad_norm": 0.01618020236492157, + "learning_rate": 2.175381562559372e-06, + "loss": 0.0205, + "step": 45810 + }, + { + "epoch": 2.8714670677445637, + "grad_norm": 0.27104535698890686, + "learning_rate": 2.1648265816638872e-06, + "loss": 0.0584, + "step": 45820 + }, + { + "epoch": 2.872093751958388, + "grad_norm": 0.07692761719226837, + "learning_rate": 2.1542716007684026e-06, + "loss": 0.112, + "step": 45830 + }, + { + "epoch": 2.8727204361722127, + "grad_norm": 7.452527046203613, + "learning_rate": 2.143716619872918e-06, + "loss": 0.0788, + "step": 45840 + }, + { + "epoch": 2.8733471203860375, + "grad_norm": 0.011884380131959915, + "learning_rate": 2.133161638977434e-06, + "loss": 0.0006, + "step": 45850 + }, + { + "epoch": 2.873973804599862, + "grad_norm": 0.00841989554464817, + "learning_rate": 2.1226066580819488e-06, + "loss": 0.0029, + "step": 45860 + }, + { + "epoch": 2.874600488813687, + "grad_norm": 0.3218337297439575, + "learning_rate": 2.1120516771864646e-06, + "loss": 0.1013, + "step": 45870 + }, + { + "epoch": 2.875227173027511, + "grad_norm": 0.3481862246990204, + "learning_rate": 2.10149669629098e-06, + "loss": 0.1067, + "step": 45880 + }, + { + "epoch": 2.875853857241336, + "grad_norm": 0.014908461831510067, + "learning_rate": 2.0909417153954953e-06, + "loss": 0.0643, + "step": 45890 + }, + { + "epoch": 2.8764805414551606, + "grad_norm": 0.005841756239533424, + "learning_rate": 2.0803867345000107e-06, + "loss": 0.0578, + "step": 45900 + }, + { + "epoch": 2.8771072256689854, + "grad_norm": 0.007631808519363403, + "learning_rate": 2.069831753604526e-06, + "loss": 0.0035, + "step": 45910 + }, + { + "epoch": 2.87773390988281, + "grad_norm": 0.00997697003185749, + "learning_rate": 2.0592767727090415e-06, + "loss": 0.0051, + "step": 45920 + }, + { + "epoch": 2.878360594096635, + "grad_norm": 0.01217371691018343, + "learning_rate": 2.048721791813557e-06, + "loss": 0.2205, + "step": 45930 + }, + { + "epoch": 2.8789872783104595, + "grad_norm": 4.541561603546143, + "learning_rate": 2.0381668109180722e-06, + "loss": 0.3388, + "step": 45940 + }, + { + "epoch": 2.8796139625242843, + "grad_norm": 0.1472068428993225, + "learning_rate": 2.027611830022588e-06, + "loss": 0.0013, + "step": 45950 + }, + { + "epoch": 2.8802406467381085, + "grad_norm": 0.3544577360153198, + "learning_rate": 2.017056849127103e-06, + "loss": 0.0574, + "step": 45960 + }, + { + "epoch": 2.8808673309519333, + "grad_norm": 0.181627556681633, + "learning_rate": 2.006501868231619e-06, + "loss": 0.0161, + "step": 45970 + }, + { + "epoch": 2.881494015165758, + "grad_norm": 0.009927576407790184, + "learning_rate": 1.995946887336134e-06, + "loss": 0.0427, + "step": 45980 + }, + { + "epoch": 2.8821206993795827, + "grad_norm": 0.010722942650318146, + "learning_rate": 1.9853919064406496e-06, + "loss": 0.0541, + "step": 45990 + }, + { + "epoch": 2.8827473835934074, + "grad_norm": 0.087717205286026, + "learning_rate": 1.974836925545165e-06, + "loss": 0.1293, + "step": 46000 + }, + { + "epoch": 2.8833740678072317, + "grad_norm": 0.810115396976471, + "learning_rate": 1.96428194464968e-06, + "loss": 0.1102, + "step": 46010 + }, + { + "epoch": 2.8840007520210564, + "grad_norm": 0.2306370884180069, + "learning_rate": 1.9537269637541957e-06, + "loss": 0.0255, + "step": 46020 + }, + { + "epoch": 2.884627436234881, + "grad_norm": 0.09505550563335419, + "learning_rate": 1.943171982858711e-06, + "loss": 0.0287, + "step": 46030 + }, + { + "epoch": 2.885254120448706, + "grad_norm": 0.007353039458394051, + "learning_rate": 1.9326170019632265e-06, + "loss": 0.0801, + "step": 46040 + }, + { + "epoch": 2.8858808046625306, + "grad_norm": 0.0182589590549469, + "learning_rate": 1.922062021067742e-06, + "loss": 0.0027, + "step": 46050 + }, + { + "epoch": 2.8865074888763553, + "grad_norm": 0.008695291355252266, + "learning_rate": 1.9115070401722572e-06, + "loss": 0.0059, + "step": 46060 + }, + { + "epoch": 2.88713417309018, + "grad_norm": 0.08065420389175415, + "learning_rate": 1.9009520592767726e-06, + "loss": 0.108, + "step": 46070 + }, + { + "epoch": 2.887760857304005, + "grad_norm": 0.047303736209869385, + "learning_rate": 1.8903970783812882e-06, + "loss": 0.0261, + "step": 46080 + }, + { + "epoch": 2.888387541517829, + "grad_norm": 0.21223130822181702, + "learning_rate": 1.8798420974858036e-06, + "loss": 0.0461, + "step": 46090 + }, + { + "epoch": 2.889014225731654, + "grad_norm": 0.023573195561766624, + "learning_rate": 1.8692871165903192e-06, + "loss": 0.0967, + "step": 46100 + }, + { + "epoch": 2.8896409099454785, + "grad_norm": 0.007523571141064167, + "learning_rate": 1.8587321356948343e-06, + "loss": 0.0229, + "step": 46110 + }, + { + "epoch": 2.8902675941593032, + "grad_norm": 0.3306170403957367, + "learning_rate": 1.84817715479935e-06, + "loss": 0.0654, + "step": 46120 + }, + { + "epoch": 2.8908942783731275, + "grad_norm": 0.04974093288183212, + "learning_rate": 1.8376221739038653e-06, + "loss": 0.0948, + "step": 46130 + }, + { + "epoch": 2.8915209625869522, + "grad_norm": 0.14429162442684174, + "learning_rate": 1.827067193008381e-06, + "loss": 0.0017, + "step": 46140 + }, + { + "epoch": 2.892147646800777, + "grad_norm": 0.006346757058054209, + "learning_rate": 1.816512212112896e-06, + "loss": 0.0255, + "step": 46150 + }, + { + "epoch": 2.8927743310146017, + "grad_norm": 0.02242646925151348, + "learning_rate": 1.8059572312174117e-06, + "loss": 0.0803, + "step": 46160 + }, + { + "epoch": 2.8934010152284264, + "grad_norm": 0.026843460276722908, + "learning_rate": 1.7954022503219268e-06, + "loss": 0.0557, + "step": 46170 + }, + { + "epoch": 2.894027699442251, + "grad_norm": 0.005104394629597664, + "learning_rate": 1.7848472694264424e-06, + "loss": 0.0641, + "step": 46180 + }, + { + "epoch": 2.894654383656076, + "grad_norm": 16.5250244140625, + "learning_rate": 1.7742922885309578e-06, + "loss": 0.2605, + "step": 46190 + }, + { + "epoch": 2.8952810678699006, + "grad_norm": 0.08836544305086136, + "learning_rate": 1.7637373076354734e-06, + "loss": 0.0622, + "step": 46200 + }, + { + "epoch": 2.895907752083725, + "grad_norm": 0.011723213829100132, + "learning_rate": 1.7531823267399886e-06, + "loss": 0.1145, + "step": 46210 + }, + { + "epoch": 2.8965344362975496, + "grad_norm": 0.03772498294711113, + "learning_rate": 1.7426273458445042e-06, + "loss": 0.1735, + "step": 46220 + }, + { + "epoch": 2.8971611205113743, + "grad_norm": 4.869604587554932, + "learning_rate": 1.7320723649490195e-06, + "loss": 0.0396, + "step": 46230 + }, + { + "epoch": 2.897787804725199, + "grad_norm": 0.10343295335769653, + "learning_rate": 1.7215173840535351e-06, + "loss": 0.0015, + "step": 46240 + }, + { + "epoch": 2.8984144889390238, + "grad_norm": 0.21444806456565857, + "learning_rate": 1.7109624031580503e-06, + "loss": 0.0045, + "step": 46250 + }, + { + "epoch": 2.899041173152848, + "grad_norm": 0.011719079688191414, + "learning_rate": 1.700407422262566e-06, + "loss": 0.04, + "step": 46260 + }, + { + "epoch": 2.8996678573666728, + "grad_norm": 0.02197698876261711, + "learning_rate": 1.6898524413670813e-06, + "loss": 0.0023, + "step": 46270 + }, + { + "epoch": 2.9002945415804975, + "grad_norm": 0.07769487798213959, + "learning_rate": 1.6792974604715969e-06, + "loss": 0.0963, + "step": 46280 + }, + { + "epoch": 2.900921225794322, + "grad_norm": 9.129709243774414, + "learning_rate": 1.668742479576112e-06, + "loss": 0.037, + "step": 46290 + }, + { + "epoch": 2.901547910008147, + "grad_norm": 0.2846452295780182, + "learning_rate": 1.6581874986806276e-06, + "loss": 0.2469, + "step": 46300 + }, + { + "epoch": 2.9021745942219717, + "grad_norm": 0.0827544555068016, + "learning_rate": 1.6476325177851428e-06, + "loss": 0.0516, + "step": 46310 + }, + { + "epoch": 2.9028012784357964, + "grad_norm": 0.2681815028190613, + "learning_rate": 1.6370775368896584e-06, + "loss": 0.0819, + "step": 46320 + }, + { + "epoch": 2.903427962649621, + "grad_norm": 0.011799097061157227, + "learning_rate": 1.6265225559941738e-06, + "loss": 0.0402, + "step": 46330 + }, + { + "epoch": 2.9040546468634454, + "grad_norm": 0.5588979125022888, + "learning_rate": 1.615967575098689e-06, + "loss": 0.0788, + "step": 46340 + }, + { + "epoch": 2.90468133107727, + "grad_norm": 0.015121783129870892, + "learning_rate": 1.6054125942032045e-06, + "loss": 0.0032, + "step": 46350 + }, + { + "epoch": 2.905308015291095, + "grad_norm": 0.07492361217737198, + "learning_rate": 1.59485761330772e-06, + "loss": 0.0238, + "step": 46360 + }, + { + "epoch": 2.9059346995049196, + "grad_norm": 0.0074353525415062904, + "learning_rate": 1.5843026324122355e-06, + "loss": 0.031, + "step": 46370 + }, + { + "epoch": 2.9065613837187443, + "grad_norm": 0.057144444435834885, + "learning_rate": 1.5737476515167507e-06, + "loss": 0.0749, + "step": 46380 + }, + { + "epoch": 2.9071880679325686, + "grad_norm": 0.32816392183303833, + "learning_rate": 1.5631926706212663e-06, + "loss": 0.0029, + "step": 46390 + }, + { + "epoch": 2.9078147521463933, + "grad_norm": 0.15893450379371643, + "learning_rate": 1.5526376897257817e-06, + "loss": 0.1622, + "step": 46400 + }, + { + "epoch": 2.908441436360218, + "grad_norm": 0.39106377959251404, + "learning_rate": 1.542082708830297e-06, + "loss": 0.0057, + "step": 46410 + }, + { + "epoch": 2.9090681205740427, + "grad_norm": 0.046128787100315094, + "learning_rate": 1.5315277279348126e-06, + "loss": 0.0526, + "step": 46420 + }, + { + "epoch": 2.9096948047878675, + "grad_norm": 0.01425962895154953, + "learning_rate": 1.520972747039328e-06, + "loss": 0.0934, + "step": 46430 + }, + { + "epoch": 2.910321489001692, + "grad_norm": 0.014097287319600582, + "learning_rate": 1.5104177661438434e-06, + "loss": 0.002, + "step": 46440 + }, + { + "epoch": 2.910948173215517, + "grad_norm": 0.09383156150579453, + "learning_rate": 1.4998627852483588e-06, + "loss": 0.0871, + "step": 46450 + }, + { + "epoch": 2.911574857429341, + "grad_norm": 0.005777876358479261, + "learning_rate": 1.4893078043528744e-06, + "loss": 0.0014, + "step": 46460 + }, + { + "epoch": 2.912201541643166, + "grad_norm": 0.020350627601146698, + "learning_rate": 1.4787528234573897e-06, + "loss": 0.0513, + "step": 46470 + }, + { + "epoch": 2.9128282258569906, + "grad_norm": 0.007423871662467718, + "learning_rate": 1.4681978425619051e-06, + "loss": 0.065, + "step": 46480 + }, + { + "epoch": 2.9134549100708154, + "grad_norm": 0.33089637756347656, + "learning_rate": 1.4576428616664203e-06, + "loss": 0.0027, + "step": 46490 + }, + { + "epoch": 2.91408159428464, + "grad_norm": 0.005864977836608887, + "learning_rate": 1.4470878807709359e-06, + "loss": 0.002, + "step": 46500 + }, + { + "epoch": 2.9147082784984644, + "grad_norm": 7.691262722015381, + "learning_rate": 1.4365328998754513e-06, + "loss": 0.0641, + "step": 46510 + }, + { + "epoch": 2.915334962712289, + "grad_norm": 0.5212214589118958, + "learning_rate": 1.4259779189799666e-06, + "loss": 0.1364, + "step": 46520 + }, + { + "epoch": 2.915961646926114, + "grad_norm": 0.08162923902273178, + "learning_rate": 1.415422938084482e-06, + "loss": 0.0629, + "step": 46530 + }, + { + "epoch": 2.9165883311399385, + "grad_norm": 1.3101518154144287, + "learning_rate": 1.4048679571889974e-06, + "loss": 0.0516, + "step": 46540 + }, + { + "epoch": 2.9172150153537633, + "grad_norm": 0.06720486283302307, + "learning_rate": 1.394312976293513e-06, + "loss": 0.04, + "step": 46550 + }, + { + "epoch": 2.917841699567588, + "grad_norm": 0.2428748905658722, + "learning_rate": 1.3837579953980284e-06, + "loss": 0.0667, + "step": 46560 + }, + { + "epoch": 2.9184683837814127, + "grad_norm": 0.19993221759796143, + "learning_rate": 1.3732030145025438e-06, + "loss": 0.1962, + "step": 46570 + }, + { + "epoch": 2.9190950679952374, + "grad_norm": 0.02945621870458126, + "learning_rate": 1.3626480336070591e-06, + "loss": 0.258, + "step": 46580 + }, + { + "epoch": 2.9197217522090617, + "grad_norm": 0.1247883215546608, + "learning_rate": 1.3520930527115745e-06, + "loss": 0.1295, + "step": 46590 + }, + { + "epoch": 2.9203484364228864, + "grad_norm": 0.011887336149811745, + "learning_rate": 1.3415380718160901e-06, + "loss": 0.0717, + "step": 46600 + }, + { + "epoch": 2.920975120636711, + "grad_norm": 0.010727292858064175, + "learning_rate": 1.3309830909206055e-06, + "loss": 0.0272, + "step": 46610 + }, + { + "epoch": 2.921601804850536, + "grad_norm": 0.19950401782989502, + "learning_rate": 1.3204281100251209e-06, + "loss": 0.1339, + "step": 46620 + }, + { + "epoch": 2.9222284890643606, + "grad_norm": 0.018687257543206215, + "learning_rate": 1.3098731291296363e-06, + "loss": 0.1271, + "step": 46630 + }, + { + "epoch": 2.922855173278185, + "grad_norm": 0.13185709714889526, + "learning_rate": 1.2993181482341519e-06, + "loss": 0.1095, + "step": 46640 + }, + { + "epoch": 2.9234818574920096, + "grad_norm": 0.015231356024742126, + "learning_rate": 1.2887631673386672e-06, + "loss": 0.0017, + "step": 46650 + }, + { + "epoch": 2.9241085417058343, + "grad_norm": 0.012139714322984219, + "learning_rate": 1.2782081864431826e-06, + "loss": 0.0944, + "step": 46660 + }, + { + "epoch": 2.924735225919659, + "grad_norm": 0.012931933626532555, + "learning_rate": 1.267653205547698e-06, + "loss": 0.0102, + "step": 46670 + }, + { + "epoch": 2.925361910133484, + "grad_norm": 0.02020031027495861, + "learning_rate": 1.2570982246522134e-06, + "loss": 0.0012, + "step": 46680 + }, + { + "epoch": 2.9259885943473085, + "grad_norm": 0.4495704174041748, + "learning_rate": 1.246543243756729e-06, + "loss": 0.0709, + "step": 46690 + }, + { + "epoch": 2.9266152785611332, + "grad_norm": 0.006982484832406044, + "learning_rate": 1.2359882628612443e-06, + "loss": 0.0012, + "step": 46700 + }, + { + "epoch": 2.927241962774958, + "grad_norm": 0.005636376328766346, + "learning_rate": 1.2254332819657597e-06, + "loss": 0.0822, + "step": 46710 + }, + { + "epoch": 2.9278686469887822, + "grad_norm": 0.010224459692835808, + "learning_rate": 1.2148783010702751e-06, + "loss": 0.0009, + "step": 46720 + }, + { + "epoch": 2.928495331202607, + "grad_norm": 0.7171324491500854, + "learning_rate": 1.2043233201747905e-06, + "loss": 0.1434, + "step": 46730 + }, + { + "epoch": 2.9291220154164317, + "grad_norm": 0.2830602824687958, + "learning_rate": 1.193768339279306e-06, + "loss": 0.02, + "step": 46740 + }, + { + "epoch": 2.9297486996302564, + "grad_norm": 0.011865504086017609, + "learning_rate": 1.1832133583838215e-06, + "loss": 0.1664, + "step": 46750 + }, + { + "epoch": 2.9303753838440807, + "grad_norm": 0.015045304782688618, + "learning_rate": 1.1726583774883368e-06, + "loss": 0.0011, + "step": 46760 + }, + { + "epoch": 2.9310020680579054, + "grad_norm": 0.02452757954597473, + "learning_rate": 1.1621033965928522e-06, + "loss": 0.0009, + "step": 46770 + }, + { + "epoch": 2.93162875227173, + "grad_norm": 0.007630917243659496, + "learning_rate": 1.1515484156973678e-06, + "loss": 0.0451, + "step": 46780 + }, + { + "epoch": 2.932255436485555, + "grad_norm": 0.05825293809175491, + "learning_rate": 1.1409934348018832e-06, + "loss": 0.0955, + "step": 46790 + }, + { + "epoch": 2.9328821206993796, + "grad_norm": 5.381073951721191, + "learning_rate": 1.1304384539063986e-06, + "loss": 0.0924, + "step": 46800 + }, + { + "epoch": 2.9335088049132043, + "grad_norm": 0.2285376489162445, + "learning_rate": 1.119883473010914e-06, + "loss": 0.0039, + "step": 46810 + }, + { + "epoch": 2.934135489127029, + "grad_norm": 0.11770034581422806, + "learning_rate": 1.1093284921154293e-06, + "loss": 0.0365, + "step": 46820 + }, + { + "epoch": 2.9347621733408538, + "grad_norm": 0.009794503450393677, + "learning_rate": 1.0987735112199447e-06, + "loss": 0.1226, + "step": 46830 + }, + { + "epoch": 2.935388857554678, + "grad_norm": 0.010756062343716621, + "learning_rate": 1.08821853032446e-06, + "loss": 0.0753, + "step": 46840 + }, + { + "epoch": 2.9360155417685028, + "grad_norm": 2.7289679050445557, + "learning_rate": 1.0776635494289755e-06, + "loss": 0.0095, + "step": 46850 + }, + { + "epoch": 2.9366422259823275, + "grad_norm": 0.5125044584274292, + "learning_rate": 1.0671085685334909e-06, + "loss": 0.0563, + "step": 46860 + }, + { + "epoch": 2.937268910196152, + "grad_norm": 32.42184066772461, + "learning_rate": 1.0565535876380065e-06, + "loss": 0.157, + "step": 46870 + }, + { + "epoch": 2.937895594409977, + "grad_norm": 0.024757886305451393, + "learning_rate": 1.0459986067425218e-06, + "loss": 0.0593, + "step": 46880 + }, + { + "epoch": 2.938522278623801, + "grad_norm": 10.523122787475586, + "learning_rate": 1.0354436258470372e-06, + "loss": 0.0794, + "step": 46890 + }, + { + "epoch": 2.939148962837626, + "grad_norm": 0.024635594338178635, + "learning_rate": 1.0248886449515526e-06, + "loss": 0.0035, + "step": 46900 + }, + { + "epoch": 2.9397756470514507, + "grad_norm": 0.01637181080877781, + "learning_rate": 1.014333664056068e-06, + "loss": 0.0008, + "step": 46910 + }, + { + "epoch": 2.9404023312652754, + "grad_norm": 0.008923073299229145, + "learning_rate": 1.0037786831605836e-06, + "loss": 0.0071, + "step": 46920 + }, + { + "epoch": 2.9410290154791, + "grad_norm": 0.06598563492298126, + "learning_rate": 9.93223702265099e-07, + "loss": 0.0846, + "step": 46930 + }, + { + "epoch": 2.941655699692925, + "grad_norm": 0.06929517537355423, + "learning_rate": 9.826687213696143e-07, + "loss": 0.039, + "step": 46940 + }, + { + "epoch": 2.9422823839067496, + "grad_norm": 12.796843528747559, + "learning_rate": 9.721137404741297e-07, + "loss": 0.1326, + "step": 46950 + }, + { + "epoch": 2.9429090681205743, + "grad_norm": 0.06685430556535721, + "learning_rate": 9.61558759578645e-07, + "loss": 0.002, + "step": 46960 + }, + { + "epoch": 2.9435357523343986, + "grad_norm": 0.004804515279829502, + "learning_rate": 9.510037786831606e-07, + "loss": 0.1405, + "step": 46970 + }, + { + "epoch": 2.9441624365482233, + "grad_norm": 1.1889760494232178, + "learning_rate": 9.404487977876761e-07, + "loss": 0.083, + "step": 46980 + }, + { + "epoch": 2.944789120762048, + "grad_norm": 0.6792351603507996, + "learning_rate": 9.298938168921914e-07, + "loss": 0.003, + "step": 46990 + }, + { + "epoch": 2.9454158049758727, + "grad_norm": 0.012094573117792606, + "learning_rate": 9.193388359967069e-07, + "loss": 0.138, + "step": 47000 + }, + { + "epoch": 2.9460424891896975, + "grad_norm": 0.025209341198205948, + "learning_rate": 9.087838551012223e-07, + "loss": 0.1022, + "step": 47010 + }, + { + "epoch": 2.9466691734035217, + "grad_norm": 0.015056479722261429, + "learning_rate": 8.982288742057378e-07, + "loss": 0.087, + "step": 47020 + }, + { + "epoch": 2.9472958576173465, + "grad_norm": 0.1191815584897995, + "learning_rate": 8.876738933102532e-07, + "loss": 0.0039, + "step": 47030 + }, + { + "epoch": 2.947922541831171, + "grad_norm": 0.09055612236261368, + "learning_rate": 8.771189124147686e-07, + "loss": 0.0542, + "step": 47040 + }, + { + "epoch": 2.948549226044996, + "grad_norm": 0.031856048852205276, + "learning_rate": 8.66563931519284e-07, + "loss": 0.0304, + "step": 47050 + }, + { + "epoch": 2.9491759102588206, + "grad_norm": 0.005399190355092287, + "learning_rate": 8.560089506237994e-07, + "loss": 0.0488, + "step": 47060 + }, + { + "epoch": 2.9498025944726454, + "grad_norm": 0.07480963319540024, + "learning_rate": 8.454539697283149e-07, + "loss": 0.1056, + "step": 47070 + }, + { + "epoch": 2.95042927868647, + "grad_norm": 0.037359680980443954, + "learning_rate": 8.348989888328303e-07, + "loss": 0.0486, + "step": 47080 + }, + { + "epoch": 2.951055962900295, + "grad_norm": 0.03391611948609352, + "learning_rate": 8.243440079373458e-07, + "loss": 0.075, + "step": 47090 + }, + { + "epoch": 2.951682647114119, + "grad_norm": 0.3366416096687317, + "learning_rate": 8.137890270418612e-07, + "loss": 0.1217, + "step": 47100 + }, + { + "epoch": 2.952309331327944, + "grad_norm": 0.2528597116470337, + "learning_rate": 8.032340461463765e-07, + "loss": 0.1417, + "step": 47110 + }, + { + "epoch": 2.9529360155417685, + "grad_norm": 0.1561173051595688, + "learning_rate": 7.92679065250892e-07, + "loss": 0.0294, + "step": 47120 + }, + { + "epoch": 2.9535626997555933, + "grad_norm": 0.006439610850065947, + "learning_rate": 7.821240843554074e-07, + "loss": 0.0761, + "step": 47130 + }, + { + "epoch": 2.9541893839694175, + "grad_norm": 0.09393543750047684, + "learning_rate": 7.715691034599228e-07, + "loss": 0.1152, + "step": 47140 + }, + { + "epoch": 2.9548160681832423, + "grad_norm": 0.015306937508285046, + "learning_rate": 7.610141225644382e-07, + "loss": 0.1413, + "step": 47150 + }, + { + "epoch": 2.955442752397067, + "grad_norm": 0.013179498724639416, + "learning_rate": 7.504591416689537e-07, + "loss": 0.0375, + "step": 47160 + }, + { + "epoch": 2.9560694366108917, + "grad_norm": 0.02790948562324047, + "learning_rate": 7.39904160773469e-07, + "loss": 0.0948, + "step": 47170 + }, + { + "epoch": 2.9566961208247164, + "grad_norm": 0.3824322521686554, + "learning_rate": 7.293491798779845e-07, + "loss": 0.2018, + "step": 47180 + }, + { + "epoch": 2.957322805038541, + "grad_norm": 0.12762394547462463, + "learning_rate": 7.187941989824999e-07, + "loss": 0.1251, + "step": 47190 + }, + { + "epoch": 2.957949489252366, + "grad_norm": 0.007191898766905069, + "learning_rate": 7.082392180870153e-07, + "loss": 0.0033, + "step": 47200 + }, + { + "epoch": 2.9585761734661906, + "grad_norm": 0.03603360056877136, + "learning_rate": 6.976842371915308e-07, + "loss": 0.0031, + "step": 47210 + }, + { + "epoch": 2.959202857680015, + "grad_norm": 0.0791572779417038, + "learning_rate": 6.871292562960462e-07, + "loss": 0.0563, + "step": 47220 + }, + { + "epoch": 2.9598295418938396, + "grad_norm": 0.15367931127548218, + "learning_rate": 6.765742754005615e-07, + "loss": 0.0826, + "step": 47230 + }, + { + "epoch": 2.9604562261076643, + "grad_norm": 9.065729141235352, + "learning_rate": 6.660192945050769e-07, + "loss": 0.127, + "step": 47240 + }, + { + "epoch": 2.961082910321489, + "grad_norm": 0.014689859934151173, + "learning_rate": 6.554643136095924e-07, + "loss": 0.0907, + "step": 47250 + }, + { + "epoch": 2.961709594535314, + "grad_norm": 0.012969248928129673, + "learning_rate": 6.449093327141078e-07, + "loss": 0.0636, + "step": 47260 + }, + { + "epoch": 2.962336278749138, + "grad_norm": 0.0509127713739872, + "learning_rate": 6.343543518186233e-07, + "loss": 0.0992, + "step": 47270 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 0.40878334641456604, + "learning_rate": 6.237993709231387e-07, + "loss": 0.0273, + "step": 47280 + }, + { + "epoch": 2.9635896471767875, + "grad_norm": 0.0912589356303215, + "learning_rate": 6.13244390027654e-07, + "loss": 0.0213, + "step": 47290 + }, + { + "epoch": 2.9642163313906122, + "grad_norm": 0.005653919652104378, + "learning_rate": 6.026894091321695e-07, + "loss": 0.0164, + "step": 47300 + }, + { + "epoch": 2.964843015604437, + "grad_norm": 0.01706555485725403, + "learning_rate": 5.921344282366849e-07, + "loss": 0.0996, + "step": 47310 + }, + { + "epoch": 2.9654696998182617, + "grad_norm": 0.11770886182785034, + "learning_rate": 5.815794473412004e-07, + "loss": 0.0924, + "step": 47320 + }, + { + "epoch": 2.9660963840320864, + "grad_norm": 0.05868373066186905, + "learning_rate": 5.710244664457158e-07, + "loss": 0.1727, + "step": 47330 + }, + { + "epoch": 2.966723068245911, + "grad_norm": 10.509180068969727, + "learning_rate": 5.604694855502311e-07, + "loss": 0.1099, + "step": 47340 + }, + { + "epoch": 2.9673497524597354, + "grad_norm": 0.5250189900398254, + "learning_rate": 5.499145046547466e-07, + "loss": 0.0871, + "step": 47350 + }, + { + "epoch": 2.96797643667356, + "grad_norm": 0.005643198732286692, + "learning_rate": 5.39359523759262e-07, + "loss": 0.0155, + "step": 47360 + }, + { + "epoch": 2.968603120887385, + "grad_norm": 0.24792006611824036, + "learning_rate": 5.288045428637775e-07, + "loss": 0.0357, + "step": 47370 + }, + { + "epoch": 2.9692298051012096, + "grad_norm": 0.2950761020183563, + "learning_rate": 5.182495619682929e-07, + "loss": 0.026, + "step": 47380 + }, + { + "epoch": 2.9698564893150343, + "grad_norm": 0.00568793248385191, + "learning_rate": 5.076945810728084e-07, + "loss": 0.0264, + "step": 47390 + }, + { + "epoch": 2.9704831735288586, + "grad_norm": 0.008012909442186356, + "learning_rate": 4.971396001773236e-07, + "loss": 0.0018, + "step": 47400 + }, + { + "epoch": 2.9711098577426833, + "grad_norm": 0.04426155611872673, + "learning_rate": 4.865846192818391e-07, + "loss": 0.0976, + "step": 47410 + }, + { + "epoch": 2.971736541956508, + "grad_norm": 0.009348000399768353, + "learning_rate": 4.760296383863545e-07, + "loss": 0.041, + "step": 47420 + }, + { + "epoch": 2.9723632261703328, + "grad_norm": 0.09541311115026474, + "learning_rate": 4.6547465749086994e-07, + "loss": 0.0034, + "step": 47430 + }, + { + "epoch": 2.9729899103841575, + "grad_norm": 0.013340012170374393, + "learning_rate": 4.549196765953854e-07, + "loss": 0.0014, + "step": 47440 + }, + { + "epoch": 2.973616594597982, + "grad_norm": 0.04422794282436371, + "learning_rate": 4.443646956999008e-07, + "loss": 0.0389, + "step": 47450 + }, + { + "epoch": 2.974243278811807, + "grad_norm": 0.09784085303544998, + "learning_rate": 4.338097148044162e-07, + "loss": 0.1464, + "step": 47460 + }, + { + "epoch": 2.9748699630256312, + "grad_norm": 0.00832357257604599, + "learning_rate": 4.232547339089316e-07, + "loss": 0.03, + "step": 47470 + }, + { + "epoch": 2.975496647239456, + "grad_norm": 8.156325340270996, + "learning_rate": 4.1269975301344706e-07, + "loss": 0.1161, + "step": 47480 + }, + { + "epoch": 2.9761233314532807, + "grad_norm": 6.328774452209473, + "learning_rate": 4.021447721179625e-07, + "loss": 0.0816, + "step": 47490 + }, + { + "epoch": 2.9767500156671054, + "grad_norm": 0.010376522317528725, + "learning_rate": 3.915897912224779e-07, + "loss": 0.0299, + "step": 47500 + }, + { + "epoch": 2.97737669988093, + "grad_norm": 0.0181084256619215, + "learning_rate": 3.8103481032699336e-07, + "loss": 0.0526, + "step": 47510 + }, + { + "epoch": 2.9780033840947544, + "grad_norm": 0.04073436185717583, + "learning_rate": 3.7047982943150874e-07, + "loss": 0.0011, + "step": 47520 + }, + { + "epoch": 2.978630068308579, + "grad_norm": 0.2758326232433319, + "learning_rate": 3.599248485360242e-07, + "loss": 0.0034, + "step": 47530 + }, + { + "epoch": 2.979256752522404, + "grad_norm": 0.011556997895240784, + "learning_rate": 3.4936986764053955e-07, + "loss": 0.0011, + "step": 47540 + }, + { + "epoch": 2.9798834367362286, + "grad_norm": 0.03783591091632843, + "learning_rate": 3.38814886745055e-07, + "loss": 0.0521, + "step": 47550 + }, + { + "epoch": 2.9805101209500533, + "grad_norm": 0.5519959926605225, + "learning_rate": 3.282599058495704e-07, + "loss": 0.0333, + "step": 47560 + }, + { + "epoch": 2.981136805163878, + "grad_norm": 0.05450139567255974, + "learning_rate": 3.1770492495408586e-07, + "loss": 0.0707, + "step": 47570 + }, + { + "epoch": 2.9817634893777027, + "grad_norm": 0.056759271770715714, + "learning_rate": 3.071499440586013e-07, + "loss": 0.0408, + "step": 47580 + }, + { + "epoch": 2.9823901735915275, + "grad_norm": 12.344095230102539, + "learning_rate": 2.965949631631167e-07, + "loss": 0.1028, + "step": 47590 + }, + { + "epoch": 2.9830168578053518, + "grad_norm": 4.790780544281006, + "learning_rate": 2.860399822676321e-07, + "loss": 0.1723, + "step": 47600 + }, + { + "epoch": 2.9836435420191765, + "grad_norm": 0.1742323935031891, + "learning_rate": 2.7548500137214754e-07, + "loss": 0.105, + "step": 47610 + }, + { + "epoch": 2.984270226233001, + "grad_norm": 0.011358167044818401, + "learning_rate": 2.649300204766629e-07, + "loss": 0.2667, + "step": 47620 + }, + { + "epoch": 2.984896910446826, + "grad_norm": 0.010282701812684536, + "learning_rate": 2.5437503958117835e-07, + "loss": 0.0754, + "step": 47630 + }, + { + "epoch": 2.9855235946606506, + "grad_norm": 0.030036158859729767, + "learning_rate": 2.438200586856938e-07, + "loss": 0.0637, + "step": 47640 + }, + { + "epoch": 2.986150278874475, + "grad_norm": 0.008501702919602394, + "learning_rate": 2.3326507779020922e-07, + "loss": 0.1259, + "step": 47650 + }, + { + "epoch": 2.9867769630882997, + "grad_norm": 0.026741160079836845, + "learning_rate": 2.2271009689472465e-07, + "loss": 0.0055, + "step": 47660 + }, + { + "epoch": 2.9874036473021244, + "grad_norm": 0.008301452733576298, + "learning_rate": 2.1215511599924006e-07, + "loss": 0.2303, + "step": 47670 + }, + { + "epoch": 2.988030331515949, + "grad_norm": 0.005024346057325602, + "learning_rate": 2.016001351037555e-07, + "loss": 0.1822, + "step": 47680 + }, + { + "epoch": 2.988657015729774, + "grad_norm": 0.014699382707476616, + "learning_rate": 1.910451542082709e-07, + "loss": 0.0462, + "step": 47690 + }, + { + "epoch": 2.9892836999435985, + "grad_norm": 0.08590447157621384, + "learning_rate": 1.8049017331278633e-07, + "loss": 0.002, + "step": 47700 + }, + { + "epoch": 2.9899103841574233, + "grad_norm": 0.08910279721021652, + "learning_rate": 1.6993519241730171e-07, + "loss": 0.115, + "step": 47710 + }, + { + "epoch": 2.990537068371248, + "grad_norm": 8.29712963104248, + "learning_rate": 1.5938021152181715e-07, + "loss": 0.0961, + "step": 47720 + }, + { + "epoch": 2.9911637525850723, + "grad_norm": 1.3991972208023071, + "learning_rate": 1.4882523062633258e-07, + "loss": 0.0508, + "step": 47730 + }, + { + "epoch": 2.991790436798897, + "grad_norm": 4.287667274475098, + "learning_rate": 1.3827024973084802e-07, + "loss": 0.0809, + "step": 47740 + }, + { + "epoch": 2.9924171210127217, + "grad_norm": 0.1272421032190323, + "learning_rate": 1.277152688353634e-07, + "loss": 0.0012, + "step": 47750 + }, + { + "epoch": 2.9930438052265464, + "grad_norm": 0.025945696979761124, + "learning_rate": 1.1716028793987883e-07, + "loss": 0.123, + "step": 47760 + }, + { + "epoch": 2.9936704894403707, + "grad_norm": 0.06106176972389221, + "learning_rate": 1.0660530704439426e-07, + "loss": 0.0575, + "step": 47770 + }, + { + "epoch": 2.9942971736541955, + "grad_norm": 0.02119859866797924, + "learning_rate": 9.605032614890967e-08, + "loss": 0.0722, + "step": 47780 + }, + { + "epoch": 2.99492385786802, + "grad_norm": 0.01732785254716873, + "learning_rate": 8.54953452534251e-08, + "loss": 0.0866, + "step": 47790 + }, + { + "epoch": 2.995550542081845, + "grad_norm": 0.009875914081931114, + "learning_rate": 7.494036435794051e-08, + "loss": 0.0629, + "step": 47800 + }, + { + "epoch": 2.9961772262956696, + "grad_norm": 0.043381739407777786, + "learning_rate": 6.438538346245594e-08, + "loss": 0.0017, + "step": 47810 + }, + { + "epoch": 2.9968039105094944, + "grad_norm": 0.041683148592710495, + "learning_rate": 5.383040256697135e-08, + "loss": 0.0534, + "step": 47820 + }, + { + "epoch": 2.997430594723319, + "grad_norm": 0.05090497434139252, + "learning_rate": 4.327542167148678e-08, + "loss": 0.0547, + "step": 47830 + }, + { + "epoch": 2.998057278937144, + "grad_norm": 0.018531568348407745, + "learning_rate": 3.27204407760022e-08, + "loss": 0.0027, + "step": 47840 + }, + { + "epoch": 2.998683963150968, + "grad_norm": 0.22370028495788574, + "learning_rate": 2.2165459880517617e-08, + "loss": 0.0557, + "step": 47850 + }, + { + "epoch": 2.999310647364793, + "grad_norm": 3.741847038269043, + "learning_rate": 1.1610478985033039e-08, + "loss": 0.0429, + "step": 47860 + }, + { + "epoch": 2.9999373315786175, + "grad_norm": 0.8121204376220703, + "learning_rate": 1.055498089548458e-09, + "loss": 0.0538, + "step": 47870 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.9655334482218393, + "eval_f1": 0.9651329077512896, + "eval_loss": 0.1548488736152649, + "eval_precision": 0.9648537768510945, + "eval_recall": 0.9655334482218393, + "eval_runtime": 288.3598, + "eval_samples_per_second": 110.678, + "eval_steps_per_second": 13.837, + "step": 47871 + } + ], + "logging_steps": 10, + "max_steps": 47871, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.073077472849101e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}