{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 5604, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1443.5744044969551, "learning_rate": 5.91715976331361e-08, "loss": 12.2344, "step": 1 }, { "epoch": 0.0, "grad_norm": 1534.8176448250863, "learning_rate": 1.183431952662722e-07, "loss": 11.7344, "step": 2 }, { "epoch": 0.0, "grad_norm": 1326.3321019398716, "learning_rate": 1.775147928994083e-07, "loss": 11.6875, "step": 3 }, { "epoch": 0.0, "grad_norm": 1288.3465989587996, "learning_rate": 2.366863905325444e-07, "loss": 11.0938, "step": 4 }, { "epoch": 0.0, "grad_norm": 1341.804033668729, "learning_rate": 2.958579881656805e-07, "loss": 12.3125, "step": 5 }, { "epoch": 0.0, "grad_norm": 1329.3145585197879, "learning_rate": 3.550295857988166e-07, "loss": 11.0938, "step": 6 }, { "epoch": 0.0, "grad_norm": 1130.227729481188, "learning_rate": 4.1420118343195276e-07, "loss": 11.375, "step": 7 }, { "epoch": 0.01, "grad_norm": 1160.6261885441586, "learning_rate": 4.733727810650888e-07, "loss": 9.8906, "step": 8 }, { "epoch": 0.01, "grad_norm": 1039.7040454148366, "learning_rate": 5.32544378698225e-07, "loss": 9.8984, "step": 9 }, { "epoch": 0.01, "grad_norm": 835.4908181410407, "learning_rate": 5.91715976331361e-07, "loss": 8.6406, "step": 10 }, { "epoch": 0.01, "grad_norm": 687.6534892135122, "learning_rate": 6.50887573964497e-07, "loss": 7.5391, "step": 11 }, { "epoch": 0.01, "grad_norm": 354.312170952273, "learning_rate": 7.100591715976332e-07, "loss": 7.4141, "step": 12 }, { "epoch": 0.01, "grad_norm": 365.8199705384397, "learning_rate": 7.692307692307694e-07, "loss": 7.0469, "step": 13 }, { "epoch": 0.01, "grad_norm": 1216.9260067540185, "learning_rate": 8.284023668639055e-07, "loss": 8.3516, "step": 14 }, { "epoch": 0.01, "grad_norm": 1474.4218345737838, "learning_rate": 8.875739644970415e-07, "loss": 8.1016, "step": 15 }, { "epoch": 0.01, "grad_norm": 1393.9506087028988, "learning_rate": 9.467455621301776e-07, "loss": 8.0, "step": 16 }, { "epoch": 0.01, "grad_norm": 1475.954883625004, "learning_rate": 1.0059171597633138e-06, "loss": 8.7891, "step": 17 }, { "epoch": 0.01, "grad_norm": 1219.677012756364, "learning_rate": 1.06508875739645e-06, "loss": 6.4844, "step": 18 }, { "epoch": 0.01, "grad_norm": 938.9090098183609, "learning_rate": 1.1242603550295859e-06, "loss": 4.9531, "step": 19 }, { "epoch": 0.01, "grad_norm": 732.6809380140464, "learning_rate": 1.183431952662722e-06, "loss": 3.7578, "step": 20 }, { "epoch": 0.01, "grad_norm": 312.90545925368025, "learning_rate": 1.242603550295858e-06, "loss": 2.3359, "step": 21 }, { "epoch": 0.02, "grad_norm": 131.42857798633014, "learning_rate": 1.301775147928994e-06, "loss": 2.5195, "step": 22 }, { "epoch": 0.02, "grad_norm": 204.75611392809807, "learning_rate": 1.3609467455621303e-06, "loss": 2.1074, "step": 23 }, { "epoch": 0.02, "grad_norm": 281.53999585238597, "learning_rate": 1.4201183431952664e-06, "loss": 2.3008, "step": 24 }, { "epoch": 0.02, "grad_norm": 407.1340542086892, "learning_rate": 1.4792899408284026e-06, "loss": 2.3633, "step": 25 }, { "epoch": 0.02, "grad_norm": 513.0873413511963, "learning_rate": 1.5384615384615387e-06, "loss": 2.457, "step": 26 }, { "epoch": 0.02, "grad_norm": 461.55622845849354, "learning_rate": 1.5976331360946749e-06, "loss": 2.1602, "step": 27 }, { "epoch": 0.02, "grad_norm": 478.542794042002, "learning_rate": 1.656804733727811e-06, "loss": 2.1387, "step": 28 }, { "epoch": 0.02, "grad_norm": 344.6282465781462, "learning_rate": 1.7159763313609468e-06, "loss": 1.7637, "step": 29 }, { "epoch": 0.02, "grad_norm": 362.57393430326465, "learning_rate": 1.775147928994083e-06, "loss": 1.6602, "step": 30 }, { "epoch": 0.02, "grad_norm": 168.67532989078214, "learning_rate": 1.834319526627219e-06, "loss": 1.3105, "step": 31 }, { "epoch": 0.02, "grad_norm": 78.2335154608893, "learning_rate": 1.8934911242603552e-06, "loss": 1.0537, "step": 32 }, { "epoch": 0.02, "grad_norm": 74.86312041211706, "learning_rate": 1.952662721893491e-06, "loss": 0.9717, "step": 33 }, { "epoch": 0.02, "grad_norm": 173.72567437330426, "learning_rate": 2.0118343195266275e-06, "loss": 1.0166, "step": 34 }, { "epoch": 0.02, "grad_norm": 244.678248816213, "learning_rate": 2.0710059171597635e-06, "loss": 1.1279, "step": 35 }, { "epoch": 0.03, "grad_norm": 289.70307970146496, "learning_rate": 2.1301775147929e-06, "loss": 1.1523, "step": 36 }, { "epoch": 0.03, "grad_norm": 261.404070412537, "learning_rate": 2.1893491124260358e-06, "loss": 1.0615, "step": 37 }, { "epoch": 0.03, "grad_norm": 248.72543936315603, "learning_rate": 2.2485207100591717e-06, "loss": 0.96, "step": 38 }, { "epoch": 0.03, "grad_norm": 175.19387912539813, "learning_rate": 2.307692307692308e-06, "loss": 0.8223, "step": 39 }, { "epoch": 0.03, "grad_norm": 129.57230819430404, "learning_rate": 2.366863905325444e-06, "loss": 0.7617, "step": 40 }, { "epoch": 0.03, "grad_norm": 64.75648312193796, "learning_rate": 2.42603550295858e-06, "loss": 0.6279, "step": 41 }, { "epoch": 0.03, "grad_norm": 40.345496542665984, "learning_rate": 2.485207100591716e-06, "loss": 0.6426, "step": 42 }, { "epoch": 0.03, "grad_norm": 112.96546034948359, "learning_rate": 2.5443786982248527e-06, "loss": 0.6133, "step": 43 }, { "epoch": 0.03, "grad_norm": 113.11724928693071, "learning_rate": 2.603550295857988e-06, "loss": 0.6172, "step": 44 }, { "epoch": 0.03, "grad_norm": 170.2583256015434, "learning_rate": 2.6627218934911246e-06, "loss": 0.6826, "step": 45 }, { "epoch": 0.03, "grad_norm": 181.13824595705816, "learning_rate": 2.7218934911242605e-06, "loss": 0.6807, "step": 46 }, { "epoch": 0.03, "grad_norm": 127.83994169123808, "learning_rate": 2.7810650887573965e-06, "loss": 0.5532, "step": 47 }, { "epoch": 0.03, "grad_norm": 160.92696853722236, "learning_rate": 2.840236686390533e-06, "loss": 0.6377, "step": 48 }, { "epoch": 0.03, "grad_norm": 83.85759914507452, "learning_rate": 2.8994082840236688e-06, "loss": 0.5171, "step": 49 }, { "epoch": 0.04, "grad_norm": 34.82212793694129, "learning_rate": 2.958579881656805e-06, "loss": 0.4097, "step": 50 }, { "epoch": 0.04, "grad_norm": 25.956713751406728, "learning_rate": 3.017751479289941e-06, "loss": 0.4434, "step": 51 }, { "epoch": 0.04, "grad_norm": 78.75487723337257, "learning_rate": 3.0769230769230774e-06, "loss": 0.394, "step": 52 }, { "epoch": 0.04, "grad_norm": 154.51220671292447, "learning_rate": 3.1360946745562134e-06, "loss": 0.5396, "step": 53 }, { "epoch": 0.04, "grad_norm": 167.45386551318256, "learning_rate": 3.1952662721893497e-06, "loss": 0.5225, "step": 54 }, { "epoch": 0.04, "grad_norm": 153.7596657181405, "learning_rate": 3.2544378698224853e-06, "loss": 0.4897, "step": 55 }, { "epoch": 0.04, "grad_norm": 104.90584941846187, "learning_rate": 3.313609467455622e-06, "loss": 0.4272, "step": 56 }, { "epoch": 0.04, "grad_norm": 80.49574128984705, "learning_rate": 3.3727810650887576e-06, "loss": 0.395, "step": 57 }, { "epoch": 0.04, "grad_norm": 44.08211560352312, "learning_rate": 3.4319526627218935e-06, "loss": 0.3936, "step": 58 }, { "epoch": 0.04, "grad_norm": 74.39313269747478, "learning_rate": 3.49112426035503e-06, "loss": 0.4253, "step": 59 }, { "epoch": 0.04, "grad_norm": 69.8169714038155, "learning_rate": 3.550295857988166e-06, "loss": 0.3569, "step": 60 }, { "epoch": 0.04, "grad_norm": 89.54284154509011, "learning_rate": 3.609467455621302e-06, "loss": 0.3887, "step": 61 }, { "epoch": 0.04, "grad_norm": 103.65697090523831, "learning_rate": 3.668639053254438e-06, "loss": 0.3853, "step": 62 }, { "epoch": 0.04, "grad_norm": 77.98627320346968, "learning_rate": 3.7278106508875745e-06, "loss": 0.3628, "step": 63 }, { "epoch": 0.05, "grad_norm": 42.66416290101905, "learning_rate": 3.7869822485207104e-06, "loss": 0.3237, "step": 64 }, { "epoch": 0.05, "grad_norm": 33.27388124719927, "learning_rate": 3.846153846153847e-06, "loss": 0.3032, "step": 65 }, { "epoch": 0.05, "grad_norm": 36.86492361076572, "learning_rate": 3.905325443786982e-06, "loss": 0.3418, "step": 66 }, { "epoch": 0.05, "grad_norm": 66.58133496209655, "learning_rate": 3.964497041420119e-06, "loss": 0.3433, "step": 67 }, { "epoch": 0.05, "grad_norm": 34.40834590692991, "learning_rate": 4.023668639053255e-06, "loss": 0.3169, "step": 68 }, { "epoch": 0.05, "grad_norm": 89.33579913199095, "learning_rate": 4.0828402366863906e-06, "loss": 0.3481, "step": 69 }, { "epoch": 0.05, "grad_norm": 75.01768722175284, "learning_rate": 4.142011834319527e-06, "loss": 0.3252, "step": 70 }, { "epoch": 0.05, "grad_norm": 16.173648614704415, "learning_rate": 4.201183431952663e-06, "loss": 0.2759, "step": 71 }, { "epoch": 0.05, "grad_norm": 18.990275484108082, "learning_rate": 4.2603550295858e-06, "loss": 0.2705, "step": 72 }, { "epoch": 0.05, "grad_norm": 38.76375650821992, "learning_rate": 4.319526627218935e-06, "loss": 0.2842, "step": 73 }, { "epoch": 0.05, "grad_norm": 56.67502763908607, "learning_rate": 4.3786982248520715e-06, "loss": 0.3154, "step": 74 }, { "epoch": 0.05, "grad_norm": 66.38202402483932, "learning_rate": 4.437869822485207e-06, "loss": 0.334, "step": 75 }, { "epoch": 0.05, "grad_norm": 26.659837428204344, "learning_rate": 4.497041420118343e-06, "loss": 0.2437, "step": 76 }, { "epoch": 0.05, "grad_norm": 49.89366358764825, "learning_rate": 4.55621301775148e-06, "loss": 0.2893, "step": 77 }, { "epoch": 0.06, "grad_norm": 22.724969702099184, "learning_rate": 4.615384615384616e-06, "loss": 0.2725, "step": 78 }, { "epoch": 0.06, "grad_norm": 28.80561993848226, "learning_rate": 4.674556213017752e-06, "loss": 0.3044, "step": 79 }, { "epoch": 0.06, "grad_norm": 29.286530669940298, "learning_rate": 4.733727810650888e-06, "loss": 0.2842, "step": 80 }, { "epoch": 0.06, "grad_norm": 14.16660585102508, "learning_rate": 4.792899408284024e-06, "loss": 0.2437, "step": 81 }, { "epoch": 0.06, "grad_norm": 73.9926353523121, "learning_rate": 4.85207100591716e-06, "loss": 0.2827, "step": 82 }, { "epoch": 0.06, "grad_norm": 29.878951527569956, "learning_rate": 4.911242603550296e-06, "loss": 0.2458, "step": 83 }, { "epoch": 0.06, "grad_norm": 11.484475177918583, "learning_rate": 4.970414201183432e-06, "loss": 0.2476, "step": 84 }, { "epoch": 0.06, "grad_norm": 55.59592780267639, "learning_rate": 5.029585798816569e-06, "loss": 0.2793, "step": 85 }, { "epoch": 0.06, "grad_norm": 50.21555063218247, "learning_rate": 5.088757396449705e-06, "loss": 0.2581, "step": 86 }, { "epoch": 0.06, "grad_norm": 13.528834407068818, "learning_rate": 5.14792899408284e-06, "loss": 0.239, "step": 87 }, { "epoch": 0.06, "grad_norm": 13.041769271462492, "learning_rate": 5.207100591715976e-06, "loss": 0.2402, "step": 88 }, { "epoch": 0.06, "grad_norm": 21.52430533549206, "learning_rate": 5.266272189349113e-06, "loss": 0.2417, "step": 89 }, { "epoch": 0.06, "grad_norm": 13.677037672727517, "learning_rate": 5.325443786982249e-06, "loss": 0.2317, "step": 90 }, { "epoch": 0.06, "grad_norm": 38.59157227506834, "learning_rate": 5.384615384615385e-06, "loss": 0.2336, "step": 91 }, { "epoch": 0.07, "grad_norm": 50.86665965832706, "learning_rate": 5.443786982248521e-06, "loss": 0.2283, "step": 92 }, { "epoch": 0.07, "grad_norm": 13.565157782160531, "learning_rate": 5.502958579881657e-06, "loss": 0.229, "step": 93 }, { "epoch": 0.07, "grad_norm": 9.612894516114126, "learning_rate": 5.562130177514793e-06, "loss": 0.2034, "step": 94 }, { "epoch": 0.07, "grad_norm": 41.99675072884753, "learning_rate": 5.621301775147929e-06, "loss": 0.218, "step": 95 }, { "epoch": 0.07, "grad_norm": 21.606516634808887, "learning_rate": 5.680473372781066e-06, "loss": 0.238, "step": 96 }, { "epoch": 0.07, "grad_norm": 38.612734423443584, "learning_rate": 5.739644970414202e-06, "loss": 0.2727, "step": 97 }, { "epoch": 0.07, "grad_norm": 10.14173463955849, "learning_rate": 5.7988165680473375e-06, "loss": 0.2144, "step": 98 }, { "epoch": 0.07, "grad_norm": 11.726835683501431, "learning_rate": 5.857988165680474e-06, "loss": 0.2217, "step": 99 }, { "epoch": 0.07, "grad_norm": 33.271941153238885, "learning_rate": 5.91715976331361e-06, "loss": 0.2334, "step": 100 }, { "epoch": 0.07, "grad_norm": 28.312522667414854, "learning_rate": 5.976331360946747e-06, "loss": 0.229, "step": 101 }, { "epoch": 0.07, "grad_norm": 39.47150040026952, "learning_rate": 6.035502958579882e-06, "loss": 0.2241, "step": 102 }, { "epoch": 0.07, "grad_norm": 10.387184326679442, "learning_rate": 6.0946745562130185e-06, "loss": 0.1914, "step": 103 }, { "epoch": 0.07, "grad_norm": 9.393251230339942, "learning_rate": 6.153846153846155e-06, "loss": 0.1992, "step": 104 }, { "epoch": 0.07, "grad_norm": 43.36911762706155, "learning_rate": 6.21301775147929e-06, "loss": 0.2244, "step": 105 }, { "epoch": 0.08, "grad_norm": 54.08032944347241, "learning_rate": 6.272189349112427e-06, "loss": 0.2598, "step": 106 }, { "epoch": 0.08, "grad_norm": 34.12457211661539, "learning_rate": 6.331360946745563e-06, "loss": 0.2126, "step": 107 }, { "epoch": 0.08, "grad_norm": 10.87841011317557, "learning_rate": 6.3905325443786995e-06, "loss": 0.1987, "step": 108 }, { "epoch": 0.08, "grad_norm": 45.90938123438191, "learning_rate": 6.449704142011834e-06, "loss": 0.2549, "step": 109 }, { "epoch": 0.08, "grad_norm": 43.25371651313902, "learning_rate": 6.5088757396449705e-06, "loss": 0.239, "step": 110 }, { "epoch": 0.08, "grad_norm": 37.74000118926859, "learning_rate": 6.568047337278107e-06, "loss": 0.2085, "step": 111 }, { "epoch": 0.08, "grad_norm": 8.833634104200893, "learning_rate": 6.627218934911244e-06, "loss": 0.1685, "step": 112 }, { "epoch": 0.08, "grad_norm": 36.89708866713719, "learning_rate": 6.686390532544379e-06, "loss": 0.2686, "step": 113 }, { "epoch": 0.08, "grad_norm": 50.89388673898367, "learning_rate": 6.745562130177515e-06, "loss": 0.2107, "step": 114 }, { "epoch": 0.08, "grad_norm": 44.94360423791429, "learning_rate": 6.8047337278106515e-06, "loss": 0.2383, "step": 115 }, { "epoch": 0.08, "grad_norm": 12.889886449107443, "learning_rate": 6.863905325443787e-06, "loss": 0.2217, "step": 116 }, { "epoch": 0.08, "grad_norm": 51.161996812885185, "learning_rate": 6.923076923076923e-06, "loss": 0.1895, "step": 117 }, { "epoch": 0.08, "grad_norm": 48.48659614955822, "learning_rate": 6.98224852071006e-06, "loss": 0.2227, "step": 118 }, { "epoch": 0.08, "grad_norm": 11.885655124875624, "learning_rate": 7.041420118343196e-06, "loss": 0.1958, "step": 119 }, { "epoch": 0.09, "grad_norm": 27.132063430841765, "learning_rate": 7.100591715976332e-06, "loss": 0.2239, "step": 120 }, { "epoch": 0.09, "grad_norm": 18.86057434855715, "learning_rate": 7.159763313609468e-06, "loss": 0.1902, "step": 121 }, { "epoch": 0.09, "grad_norm": 12.637608877658806, "learning_rate": 7.218934911242604e-06, "loss": 0.2109, "step": 122 }, { "epoch": 0.09, "grad_norm": 10.047033973462321, "learning_rate": 7.278106508875741e-06, "loss": 0.1904, "step": 123 }, { "epoch": 0.09, "grad_norm": 11.709619680883495, "learning_rate": 7.337278106508876e-06, "loss": 0.1729, "step": 124 }, { "epoch": 0.09, "grad_norm": 44.025278094830405, "learning_rate": 7.396449704142013e-06, "loss": 0.2012, "step": 125 }, { "epoch": 0.09, "grad_norm": 5.579611221108369, "learning_rate": 7.455621301775149e-06, "loss": 0.1807, "step": 126 }, { "epoch": 0.09, "grad_norm": 9.178080204391332, "learning_rate": 7.5147928994082845e-06, "loss": 0.1792, "step": 127 }, { "epoch": 0.09, "grad_norm": 10.70187830067804, "learning_rate": 7.573964497041421e-06, "loss": 0.1648, "step": 128 }, { "epoch": 0.09, "grad_norm": 41.67469230519406, "learning_rate": 7.633136094674556e-06, "loss": 0.189, "step": 129 }, { "epoch": 0.09, "grad_norm": 56.145533909909716, "learning_rate": 7.692307692307694e-06, "loss": 0.1772, "step": 130 }, { "epoch": 0.09, "grad_norm": 12.886508074747201, "learning_rate": 7.751479289940829e-06, "loss": 0.2095, "step": 131 }, { "epoch": 0.09, "grad_norm": 92.16611785910685, "learning_rate": 7.810650887573965e-06, "loss": 0.2383, "step": 132 }, { "epoch": 0.09, "grad_norm": 40.64519507288005, "learning_rate": 7.869822485207102e-06, "loss": 0.2139, "step": 133 }, { "epoch": 0.1, "grad_norm": 14.527350282989591, "learning_rate": 7.928994082840237e-06, "loss": 0.2041, "step": 134 }, { "epoch": 0.1, "grad_norm": 82.46586621777837, "learning_rate": 7.988165680473373e-06, "loss": 0.2083, "step": 135 }, { "epoch": 0.1, "grad_norm": 60.35861295571972, "learning_rate": 8.04733727810651e-06, "loss": 0.2388, "step": 136 }, { "epoch": 0.1, "grad_norm": 11.976620441569606, "learning_rate": 8.106508875739646e-06, "loss": 0.1814, "step": 137 }, { "epoch": 0.1, "grad_norm": 76.28662037424316, "learning_rate": 8.165680473372781e-06, "loss": 0.218, "step": 138 }, { "epoch": 0.1, "grad_norm": 49.990057565286385, "learning_rate": 8.224852071005918e-06, "loss": 0.2263, "step": 139 }, { "epoch": 0.1, "grad_norm": 9.12483970210935, "learning_rate": 8.284023668639054e-06, "loss": 0.1628, "step": 140 }, { "epoch": 0.1, "grad_norm": 34.40581343675325, "learning_rate": 8.343195266272191e-06, "loss": 0.2046, "step": 141 }, { "epoch": 0.1, "grad_norm": 86.73910967064404, "learning_rate": 8.402366863905327e-06, "loss": 0.2832, "step": 142 }, { "epoch": 0.1, "grad_norm": 44.998603587663474, "learning_rate": 8.461538461538462e-06, "loss": 0.2, "step": 143 }, { "epoch": 0.1, "grad_norm": 60.632795608186285, "learning_rate": 8.5207100591716e-06, "loss": 0.163, "step": 144 }, { "epoch": 0.1, "grad_norm": 139.66966654303818, "learning_rate": 8.579881656804735e-06, "loss": 0.356, "step": 145 }, { "epoch": 0.1, "grad_norm": 21.720375790615257, "learning_rate": 8.63905325443787e-06, "loss": 0.1716, "step": 146 }, { "epoch": 0.1, "grad_norm": 93.50974348316393, "learning_rate": 8.698224852071006e-06, "loss": 0.2695, "step": 147 }, { "epoch": 0.11, "grad_norm": 77.34688239840453, "learning_rate": 8.757396449704143e-06, "loss": 0.2104, "step": 148 }, { "epoch": 0.11, "grad_norm": 38.11609599313387, "learning_rate": 8.816568047337279e-06, "loss": 0.1985, "step": 149 }, { "epoch": 0.11, "grad_norm": 34.022306259789644, "learning_rate": 8.875739644970414e-06, "loss": 0.1743, "step": 150 }, { "epoch": 0.11, "grad_norm": 86.92363217035724, "learning_rate": 8.934911242603551e-06, "loss": 0.2617, "step": 151 }, { "epoch": 0.11, "grad_norm": 43.71673888744464, "learning_rate": 8.994082840236687e-06, "loss": 0.2148, "step": 152 }, { "epoch": 0.11, "grad_norm": 31.570603625815373, "learning_rate": 9.053254437869822e-06, "loss": 0.1858, "step": 153 }, { "epoch": 0.11, "grad_norm": 61.077459467399706, "learning_rate": 9.11242603550296e-06, "loss": 0.2427, "step": 154 }, { "epoch": 0.11, "grad_norm": 50.10622818300115, "learning_rate": 9.171597633136095e-06, "loss": 0.2407, "step": 155 }, { "epoch": 0.11, "grad_norm": 29.673041634897242, "learning_rate": 9.230769230769232e-06, "loss": 0.1736, "step": 156 }, { "epoch": 0.11, "grad_norm": 34.729480106635634, "learning_rate": 9.289940828402368e-06, "loss": 0.1782, "step": 157 }, { "epoch": 0.11, "grad_norm": 44.993817096293725, "learning_rate": 9.349112426035503e-06, "loss": 0.1809, "step": 158 }, { "epoch": 0.11, "grad_norm": 8.899448788867884, "learning_rate": 9.40828402366864e-06, "loss": 0.1951, "step": 159 }, { "epoch": 0.11, "grad_norm": 18.286850457168683, "learning_rate": 9.467455621301776e-06, "loss": 0.1606, "step": 160 }, { "epoch": 0.11, "grad_norm": 56.53072371599443, "learning_rate": 9.526627218934912e-06, "loss": 0.2, "step": 161 }, { "epoch": 0.12, "grad_norm": 5.264038479497928, "learning_rate": 9.585798816568049e-06, "loss": 0.1448, "step": 162 }, { "epoch": 0.12, "grad_norm": 43.211370059501874, "learning_rate": 9.644970414201184e-06, "loss": 0.1975, "step": 163 }, { "epoch": 0.12, "grad_norm": 14.273967210767667, "learning_rate": 9.70414201183432e-06, "loss": 0.1809, "step": 164 }, { "epoch": 0.12, "grad_norm": 32.98648840761662, "learning_rate": 9.763313609467457e-06, "loss": 0.1685, "step": 165 }, { "epoch": 0.12, "grad_norm": 33.93062311124414, "learning_rate": 9.822485207100593e-06, "loss": 0.1956, "step": 166 }, { "epoch": 0.12, "grad_norm": 51.5363090967256, "learning_rate": 9.88165680473373e-06, "loss": 0.187, "step": 167 }, { "epoch": 0.12, "grad_norm": 9.60396802401412, "learning_rate": 9.940828402366864e-06, "loss": 0.1592, "step": 168 }, { "epoch": 0.12, "grad_norm": 25.22452245463198, "learning_rate": 1e-05, "loss": 0.1738, "step": 169 }, { "epoch": 0.12, "grad_norm": 57.51781109179427, "learning_rate": 9.999999164703534e-06, "loss": 0.21, "step": 170 }, { "epoch": 0.12, "grad_norm": 19.002582848688764, "learning_rate": 9.999996658814406e-06, "loss": 0.1428, "step": 171 }, { "epoch": 0.12, "grad_norm": 12.276224778176067, "learning_rate": 9.999992482333461e-06, "loss": 0.1465, "step": 172 }, { "epoch": 0.12, "grad_norm": 44.678917780683705, "learning_rate": 9.99998663526209e-06, "loss": 0.1617, "step": 173 }, { "epoch": 0.12, "grad_norm": 6.960442853339805, "learning_rate": 9.99997911760225e-06, "loss": 0.1443, "step": 174 }, { "epoch": 0.12, "grad_norm": 14.261742691312454, "learning_rate": 9.99996992935645e-06, "loss": 0.1682, "step": 175 }, { "epoch": 0.13, "grad_norm": 24.032339010668686, "learning_rate": 9.99995907052776e-06, "loss": 0.1469, "step": 176 }, { "epoch": 0.13, "grad_norm": 58.31377133614243, "learning_rate": 9.99994654111981e-06, "loss": 0.2073, "step": 177 }, { "epoch": 0.13, "grad_norm": 23.70246515992443, "learning_rate": 9.999932341136785e-06, "loss": 0.1718, "step": 178 }, { "epoch": 0.13, "grad_norm": 26.891031514237994, "learning_rate": 9.999916470583429e-06, "loss": 0.1554, "step": 179 }, { "epoch": 0.13, "grad_norm": 57.563572942900166, "learning_rate": 9.999898929465047e-06, "loss": 0.1926, "step": 180 }, { "epoch": 0.13, "grad_norm": 36.34968696505108, "learning_rate": 9.999879717787495e-06, "loss": 0.1558, "step": 181 }, { "epoch": 0.13, "grad_norm": 7.794023955932686, "learning_rate": 9.999858835557197e-06, "loss": 0.145, "step": 182 }, { "epoch": 0.13, "grad_norm": 47.23120098027718, "learning_rate": 9.999836282781128e-06, "loss": 0.178, "step": 183 }, { "epoch": 0.13, "grad_norm": 40.114505438540625, "learning_rate": 9.999812059466825e-06, "loss": 0.2019, "step": 184 }, { "epoch": 0.13, "grad_norm": 25.380586865046148, "learning_rate": 9.999786165622379e-06, "loss": 0.1559, "step": 185 }, { "epoch": 0.13, "grad_norm": 29.25336121583889, "learning_rate": 9.999758601256441e-06, "loss": 0.1888, "step": 186 }, { "epoch": 0.13, "grad_norm": 64.84408243732132, "learning_rate": 9.999729366378224e-06, "loss": 0.1921, "step": 187 }, { "epoch": 0.13, "grad_norm": 22.31665172260842, "learning_rate": 9.999698460997493e-06, "loss": 0.1838, "step": 188 }, { "epoch": 0.13, "grad_norm": 18.476404539169845, "learning_rate": 9.999665885124577e-06, "loss": 0.1499, "step": 189 }, { "epoch": 0.14, "grad_norm": 41.418357640800615, "learning_rate": 9.99963163877036e-06, "loss": 0.2021, "step": 190 }, { "epoch": 0.14, "grad_norm": 31.53664585439554, "learning_rate": 9.99959572194628e-06, "loss": 0.2122, "step": 191 }, { "epoch": 0.14, "grad_norm": 21.575514536412193, "learning_rate": 9.999558134664342e-06, "loss": 0.1829, "step": 192 }, { "epoch": 0.14, "grad_norm": 39.02562270046022, "learning_rate": 9.999518876937102e-06, "loss": 0.1566, "step": 193 }, { "epoch": 0.14, "grad_norm": 33.24695587894434, "learning_rate": 9.999477948777678e-06, "loss": 0.1599, "step": 194 }, { "epoch": 0.14, "grad_norm": 45.78127128268334, "learning_rate": 9.999435350199745e-06, "loss": 0.1709, "step": 195 }, { "epoch": 0.14, "grad_norm": 42.42344263018681, "learning_rate": 9.999391081217536e-06, "loss": 0.1919, "step": 196 }, { "epoch": 0.14, "grad_norm": 13.73579636286334, "learning_rate": 9.999345141845842e-06, "loss": 0.1562, "step": 197 }, { "epoch": 0.14, "grad_norm": 42.85891427323893, "learning_rate": 9.99929753210001e-06, "loss": 0.1689, "step": 198 }, { "epoch": 0.14, "grad_norm": 6.259144765611043, "learning_rate": 9.999248251995951e-06, "loss": 0.1653, "step": 199 }, { "epoch": 0.14, "grad_norm": 24.27283819335801, "learning_rate": 9.999197301550127e-06, "loss": 0.1798, "step": 200 }, { "epoch": 0.14, "grad_norm": 27.899669266770783, "learning_rate": 9.999144680779564e-06, "loss": 0.1731, "step": 201 }, { "epoch": 0.14, "grad_norm": 11.179927364455224, "learning_rate": 9.999090389701844e-06, "loss": 0.1702, "step": 202 }, { "epoch": 0.14, "grad_norm": 19.847681296179072, "learning_rate": 9.999034428335103e-06, "loss": 0.1591, "step": 203 }, { "epoch": 0.15, "grad_norm": 11.752832888535757, "learning_rate": 9.998976796698043e-06, "loss": 0.1545, "step": 204 }, { "epoch": 0.15, "grad_norm": 20.029510235909346, "learning_rate": 9.998917494809917e-06, "loss": 0.1603, "step": 205 }, { "epoch": 0.15, "grad_norm": 15.214337375526133, "learning_rate": 9.998856522690538e-06, "loss": 0.1699, "step": 206 }, { "epoch": 0.15, "grad_norm": 12.852230970076132, "learning_rate": 9.998793880360283e-06, "loss": 0.1493, "step": 207 }, { "epoch": 0.15, "grad_norm": 42.16352485230185, "learning_rate": 9.998729567840077e-06, "loss": 0.2166, "step": 208 }, { "epoch": 0.15, "grad_norm": 43.29824172685643, "learning_rate": 9.998663585151409e-06, "loss": 0.1897, "step": 209 }, { "epoch": 0.15, "grad_norm": 42.33267438448777, "learning_rate": 9.998595932316327e-06, "loss": 0.1721, "step": 210 }, { "epoch": 0.15, "grad_norm": 8.107364224481227, "learning_rate": 9.998526609357432e-06, "loss": 0.1514, "step": 211 }, { "epoch": 0.15, "grad_norm": 40.03181460372598, "learning_rate": 9.998455616297889e-06, "loss": 0.1746, "step": 212 }, { "epoch": 0.15, "grad_norm": 49.846724341530525, "learning_rate": 9.998382953161417e-06, "loss": 0.2153, "step": 213 }, { "epoch": 0.15, "grad_norm": 23.55333318015724, "learning_rate": 9.998308619972292e-06, "loss": 0.1775, "step": 214 }, { "epoch": 0.15, "grad_norm": 49.65058025675812, "learning_rate": 9.998232616755354e-06, "loss": 0.1721, "step": 215 }, { "epoch": 0.15, "grad_norm": 30.06267959273612, "learning_rate": 9.998154943535996e-06, "loss": 0.1575, "step": 216 }, { "epoch": 0.15, "grad_norm": 31.9421224193554, "learning_rate": 9.998075600340166e-06, "loss": 0.179, "step": 217 }, { "epoch": 0.16, "grad_norm": 44.71443339581383, "learning_rate": 9.997994587194381e-06, "loss": 0.1708, "step": 218 }, { "epoch": 0.16, "grad_norm": 5.825036612048357, "learning_rate": 9.997911904125704e-06, "loss": 0.141, "step": 219 }, { "epoch": 0.16, "grad_norm": 9.105912112927012, "learning_rate": 9.997827551161762e-06, "loss": 0.1552, "step": 220 }, { "epoch": 0.16, "grad_norm": 30.337684796583403, "learning_rate": 9.997741528330739e-06, "loss": 0.1497, "step": 221 }, { "epoch": 0.16, "grad_norm": 5.256728997034248, "learning_rate": 9.997653835661376e-06, "loss": 0.1792, "step": 222 }, { "epoch": 0.16, "grad_norm": 11.710837732666267, "learning_rate": 9.997564473182976e-06, "loss": 0.1416, "step": 223 }, { "epoch": 0.16, "grad_norm": 6.138467739217358, "learning_rate": 9.997473440925394e-06, "loss": 0.1589, "step": 224 }, { "epoch": 0.16, "grad_norm": 10.011674393770035, "learning_rate": 9.997380738919045e-06, "loss": 0.1641, "step": 225 }, { "epoch": 0.16, "grad_norm": 45.558916966443206, "learning_rate": 9.997286367194903e-06, "loss": 0.1786, "step": 226 }, { "epoch": 0.16, "grad_norm": 22.828341098912837, "learning_rate": 9.9971903257845e-06, "loss": 0.1472, "step": 227 }, { "epoch": 0.16, "grad_norm": 32.44780904932061, "learning_rate": 9.997092614719926e-06, "loss": 0.1599, "step": 228 }, { "epoch": 0.16, "grad_norm": 37.937074029999266, "learning_rate": 9.996993234033826e-06, "loss": 0.1521, "step": 229 }, { "epoch": 0.16, "grad_norm": 22.707824607414384, "learning_rate": 9.996892183759407e-06, "loss": 0.1743, "step": 230 }, { "epoch": 0.16, "grad_norm": 46.45915188550997, "learning_rate": 9.99678946393043e-06, "loss": 0.1892, "step": 231 }, { "epoch": 0.17, "grad_norm": 24.319677361782876, "learning_rate": 9.996685074581216e-06, "loss": 0.1489, "step": 232 }, { "epoch": 0.17, "grad_norm": 18.661367787669693, "learning_rate": 9.996579015746645e-06, "loss": 0.1721, "step": 233 }, { "epoch": 0.17, "grad_norm": 57.942764629266655, "learning_rate": 9.996471287462151e-06, "loss": 0.1649, "step": 234 }, { "epoch": 0.17, "grad_norm": 13.048230837396387, "learning_rate": 9.99636188976373e-06, "loss": 0.1602, "step": 235 }, { "epoch": 0.17, "grad_norm": 8.709658285204258, "learning_rate": 9.996250822687932e-06, "loss": 0.1438, "step": 236 }, { "epoch": 0.17, "grad_norm": 31.617100347623733, "learning_rate": 9.996138086271869e-06, "loss": 0.1556, "step": 237 }, { "epoch": 0.17, "grad_norm": 35.27416734620715, "learning_rate": 9.996023680553204e-06, "loss": 0.1575, "step": 238 }, { "epoch": 0.17, "grad_norm": 9.153491233670788, "learning_rate": 9.995907605570167e-06, "loss": 0.1797, "step": 239 }, { "epoch": 0.17, "grad_norm": 7.987526278742785, "learning_rate": 9.995789861361538e-06, "loss": 0.2065, "step": 240 }, { "epoch": 0.17, "grad_norm": 64.32372496086278, "learning_rate": 9.995670447966658e-06, "loss": 0.1824, "step": 241 }, { "epoch": 0.17, "grad_norm": 15.645118470787404, "learning_rate": 9.995549365425426e-06, "loss": 0.1591, "step": 242 }, { "epoch": 0.17, "grad_norm": 4.7785990929647815, "learning_rate": 9.995426613778297e-06, "loss": 0.1615, "step": 243 }, { "epoch": 0.17, "grad_norm": 44.643088379285764, "learning_rate": 9.995302193066286e-06, "loss": 0.1528, "step": 244 }, { "epoch": 0.17, "grad_norm": 29.669669631908086, "learning_rate": 9.995176103330962e-06, "loss": 0.1643, "step": 245 }, { "epoch": 0.18, "grad_norm": 5.187868389208368, "learning_rate": 9.995048344614455e-06, "loss": 0.1855, "step": 246 }, { "epoch": 0.18, "grad_norm": 64.34845710145298, "learning_rate": 9.994918916959453e-06, "loss": 0.2019, "step": 247 }, { "epoch": 0.18, "grad_norm": 27.545718441791433, "learning_rate": 9.994787820409198e-06, "loss": 0.1816, "step": 248 }, { "epoch": 0.18, "grad_norm": 16.691064146642706, "learning_rate": 9.994655055007491e-06, "loss": 0.1548, "step": 249 }, { "epoch": 0.18, "grad_norm": 32.47918718068585, "learning_rate": 9.994520620798696e-06, "loss": 0.1423, "step": 250 }, { "epoch": 0.18, "grad_norm": 62.24279005895381, "learning_rate": 9.994384517827726e-06, "loss": 0.1908, "step": 251 }, { "epoch": 0.18, "grad_norm": 6.306470473367477, "learning_rate": 9.994246746140057e-06, "loss": 0.1655, "step": 252 }, { "epoch": 0.18, "grad_norm": 69.9581859530375, "learning_rate": 9.99410730578172e-06, "loss": 0.2007, "step": 253 }, { "epoch": 0.18, "grad_norm": 35.84402462268306, "learning_rate": 9.993966196799304e-06, "loss": 0.1494, "step": 254 }, { "epoch": 0.18, "grad_norm": 6.728361882638175, "learning_rate": 9.993823419239959e-06, "loss": 0.1531, "step": 255 }, { "epoch": 0.18, "grad_norm": 30.744672622096342, "learning_rate": 9.993678973151388e-06, "loss": 0.1378, "step": 256 }, { "epoch": 0.18, "grad_norm": 72.68699910576366, "learning_rate": 9.993532858581853e-06, "loss": 0.2354, "step": 257 }, { "epoch": 0.18, "grad_norm": 10.87452116844784, "learning_rate": 9.993385075580173e-06, "loss": 0.1495, "step": 258 }, { "epoch": 0.18, "grad_norm": 27.98767655605202, "learning_rate": 9.993235624195728e-06, "loss": 0.1709, "step": 259 }, { "epoch": 0.19, "grad_norm": 33.83809754783617, "learning_rate": 9.993084504478448e-06, "loss": 0.167, "step": 260 }, { "epoch": 0.19, "grad_norm": 5.335493128269599, "learning_rate": 9.99293171647883e-06, "loss": 0.1222, "step": 261 }, { "epoch": 0.19, "grad_norm": 13.010955660808273, "learning_rate": 9.992777260247916e-06, "loss": 0.1418, "step": 262 }, { "epoch": 0.19, "grad_norm": 5.84935039014272, "learning_rate": 9.99262113583732e-06, "loss": 0.1448, "step": 263 }, { "epoch": 0.19, "grad_norm": 15.275558317087714, "learning_rate": 9.992463343299203e-06, "loss": 0.1398, "step": 264 }, { "epoch": 0.19, "grad_norm": 11.577795034557614, "learning_rate": 9.992303882686288e-06, "loss": 0.1459, "step": 265 }, { "epoch": 0.19, "grad_norm": 21.96507681305609, "learning_rate": 9.99214275405185e-06, "loss": 0.1466, "step": 266 }, { "epoch": 0.19, "grad_norm": 6.113512224090927, "learning_rate": 9.991979957449729e-06, "loss": 0.149, "step": 267 }, { "epoch": 0.19, "grad_norm": 7.810162297246139, "learning_rate": 9.991815492934318e-06, "loss": 0.1479, "step": 268 }, { "epoch": 0.19, "grad_norm": 32.60107223287011, "learning_rate": 9.991649360560565e-06, "loss": 0.1899, "step": 269 }, { "epoch": 0.19, "grad_norm": 29.946520202273987, "learning_rate": 9.99148156038398e-06, "loss": 0.1804, "step": 270 }, { "epoch": 0.19, "grad_norm": 10.367035744107707, "learning_rate": 9.991312092460626e-06, "loss": 0.1296, "step": 271 }, { "epoch": 0.19, "grad_norm": 62.72671197950166, "learning_rate": 9.991140956847128e-06, "loss": 0.207, "step": 272 }, { "epoch": 0.19, "grad_norm": 4.62480621517589, "learning_rate": 9.990968153600664e-06, "loss": 0.1626, "step": 273 }, { "epoch": 0.2, "grad_norm": 19.419784824468074, "learning_rate": 9.990793682778973e-06, "loss": 0.139, "step": 274 }, { "epoch": 0.2, "grad_norm": 52.135212954885354, "learning_rate": 9.990617544440346e-06, "loss": 0.1566, "step": 275 }, { "epoch": 0.2, "grad_norm": 20.1662562272831, "learning_rate": 9.990439738643635e-06, "loss": 0.1516, "step": 276 }, { "epoch": 0.2, "grad_norm": 21.742228599596476, "learning_rate": 9.99026026544825e-06, "loss": 0.1516, "step": 277 }, { "epoch": 0.2, "grad_norm": 47.92258723772546, "learning_rate": 9.990079124914156e-06, "loss": 0.1448, "step": 278 }, { "epoch": 0.2, "grad_norm": 30.997197668797394, "learning_rate": 9.989896317101873e-06, "loss": 0.1375, "step": 279 }, { "epoch": 0.2, "grad_norm": 31.062475355277208, "learning_rate": 9.989711842072482e-06, "loss": 0.1689, "step": 280 }, { "epoch": 0.2, "grad_norm": 53.09032787145096, "learning_rate": 9.989525699887619e-06, "loss": 0.1543, "step": 281 }, { "epoch": 0.2, "grad_norm": 31.435016992194903, "learning_rate": 9.989337890609478e-06, "loss": 0.1792, "step": 282 }, { "epoch": 0.2, "grad_norm": 17.590958097222614, "learning_rate": 9.98914841430081e-06, "loss": 0.1298, "step": 283 }, { "epoch": 0.2, "grad_norm": 7.770018952416595, "learning_rate": 9.988957271024922e-06, "loss": 0.119, "step": 284 }, { "epoch": 0.2, "grad_norm": 67.99377764408035, "learning_rate": 9.988764460845676e-06, "loss": 0.2058, "step": 285 }, { "epoch": 0.2, "grad_norm": 37.91613195879691, "learning_rate": 9.9885699838275e-06, "loss": 0.1306, "step": 286 }, { "epoch": 0.2, "grad_norm": 26.261219214046495, "learning_rate": 9.988373840035366e-06, "loss": 0.1443, "step": 287 }, { "epoch": 0.21, "grad_norm": 42.836761940466495, "learning_rate": 9.988176029534814e-06, "loss": 0.161, "step": 288 }, { "epoch": 0.21, "grad_norm": 39.39654040488338, "learning_rate": 9.987976552391933e-06, "loss": 0.1567, "step": 289 }, { "epoch": 0.21, "grad_norm": 11.790620806136728, "learning_rate": 9.987775408673373e-06, "loss": 0.1569, "step": 290 }, { "epoch": 0.21, "grad_norm": 53.31062556612756, "learning_rate": 9.987572598446337e-06, "loss": 0.1945, "step": 291 }, { "epoch": 0.21, "grad_norm": 56.490600010392015, "learning_rate": 9.987368121778594e-06, "loss": 0.1736, "step": 292 }, { "epoch": 0.21, "grad_norm": 11.122995442967113, "learning_rate": 9.98716197873846e-06, "loss": 0.1388, "step": 293 }, { "epoch": 0.21, "grad_norm": 92.30833385479315, "learning_rate": 9.98695416939481e-06, "loss": 0.231, "step": 294 }, { "epoch": 0.21, "grad_norm": 35.983480070160525, "learning_rate": 9.986744693817077e-06, "loss": 0.1768, "step": 295 }, { "epoch": 0.21, "grad_norm": 16.767014422656075, "learning_rate": 9.986533552075252e-06, "loss": 0.1654, "step": 296 }, { "epoch": 0.21, "grad_norm": 34.16924846592324, "learning_rate": 9.986320744239883e-06, "loss": 0.1589, "step": 297 }, { "epoch": 0.21, "grad_norm": 51.06145849537146, "learning_rate": 9.98610627038207e-06, "loss": 0.1792, "step": 298 }, { "epoch": 0.21, "grad_norm": 14.179575987397332, "learning_rate": 9.985890130573474e-06, "loss": 0.1523, "step": 299 }, { "epoch": 0.21, "grad_norm": 32.87174916074473, "learning_rate": 9.98567232488631e-06, "loss": 0.1482, "step": 300 }, { "epoch": 0.21, "grad_norm": 20.716840238877516, "learning_rate": 9.985452853393353e-06, "loss": 0.1276, "step": 301 }, { "epoch": 0.22, "grad_norm": 39.212614000189724, "learning_rate": 9.985231716167933e-06, "loss": 0.15, "step": 302 }, { "epoch": 0.22, "grad_norm": 14.977736600228983, "learning_rate": 9.985008913283933e-06, "loss": 0.1562, "step": 303 }, { "epoch": 0.22, "grad_norm": 36.10332759742872, "learning_rate": 9.984784444815799e-06, "loss": 0.191, "step": 304 }, { "epoch": 0.22, "grad_norm": 8.909269570291388, "learning_rate": 9.984558310838528e-06, "loss": 0.1589, "step": 305 }, { "epoch": 0.22, "grad_norm": 28.923562213086704, "learning_rate": 9.984330511427676e-06, "loss": 0.1799, "step": 306 }, { "epoch": 0.22, "grad_norm": 13.490403752500722, "learning_rate": 9.984101046659353e-06, "loss": 0.1479, "step": 307 }, { "epoch": 0.22, "grad_norm": 30.913895702455253, "learning_rate": 9.983869916610232e-06, "loss": 0.1377, "step": 308 }, { "epoch": 0.22, "grad_norm": 53.35070194289101, "learning_rate": 9.983637121357534e-06, "loss": 0.1831, "step": 309 }, { "epoch": 0.22, "grad_norm": 19.191550606900922, "learning_rate": 9.983402660979042e-06, "loss": 0.1614, "step": 310 }, { "epoch": 0.22, "grad_norm": 46.927298296283055, "learning_rate": 9.983166535553093e-06, "loss": 0.1902, "step": 311 }, { "epoch": 0.22, "grad_norm": 13.604114451062268, "learning_rate": 9.98292874515858e-06, "loss": 0.1169, "step": 312 }, { "epoch": 0.22, "grad_norm": 9.658236921460949, "learning_rate": 9.982689289874956e-06, "loss": 0.1569, "step": 313 }, { "epoch": 0.22, "grad_norm": 6.620979122118501, "learning_rate": 9.982448169782226e-06, "loss": 0.1575, "step": 314 }, { "epoch": 0.22, "grad_norm": 9.20134743823668, "learning_rate": 9.98220538496095e-06, "loss": 0.1302, "step": 315 }, { "epoch": 0.23, "grad_norm": 21.83470224282846, "learning_rate": 9.98196093549225e-06, "loss": 0.1381, "step": 316 }, { "epoch": 0.23, "grad_norm": 5.019438949112746, "learning_rate": 9.9817148214578e-06, "loss": 0.1637, "step": 317 }, { "epoch": 0.23, "grad_norm": 18.531774403521805, "learning_rate": 9.981467042939833e-06, "loss": 0.1531, "step": 318 }, { "epoch": 0.23, "grad_norm": 6.772842664240888, "learning_rate": 9.981217600021133e-06, "loss": 0.1455, "step": 319 }, { "epoch": 0.23, "grad_norm": 4.688710490868121, "learning_rate": 9.980966492785048e-06, "loss": 0.1639, "step": 320 }, { "epoch": 0.23, "grad_norm": 12.259163694801227, "learning_rate": 9.980713721315473e-06, "loss": 0.1166, "step": 321 }, { "epoch": 0.23, "grad_norm": 7.099541659387623, "learning_rate": 9.98045928569687e-06, "loss": 0.1406, "step": 322 }, { "epoch": 0.23, "grad_norm": 12.661009836373966, "learning_rate": 9.98020318601424e-06, "loss": 0.1086, "step": 323 }, { "epoch": 0.23, "grad_norm": 7.038778597302965, "learning_rate": 9.97994542235316e-06, "loss": 0.1447, "step": 324 }, { "epoch": 0.23, "grad_norm": 21.75909194513975, "learning_rate": 9.979685994799753e-06, "loss": 0.1561, "step": 325 }, { "epoch": 0.23, "grad_norm": 35.89676316531792, "learning_rate": 9.979424903440695e-06, "loss": 0.1526, "step": 326 }, { "epoch": 0.23, "grad_norm": 26.47012119449328, "learning_rate": 9.979162148363222e-06, "loss": 0.1331, "step": 327 }, { "epoch": 0.23, "grad_norm": 41.47116585357957, "learning_rate": 9.978897729655127e-06, "loss": 0.1527, "step": 328 }, { "epoch": 0.23, "grad_norm": 68.50658423771155, "learning_rate": 9.978631647404755e-06, "loss": 0.2024, "step": 329 }, { "epoch": 0.24, "grad_norm": 5.536910352068493, "learning_rate": 9.97836390170101e-06, "loss": 0.1448, "step": 330 }, { "epoch": 0.24, "grad_norm": 62.646920155000565, "learning_rate": 9.978094492633353e-06, "loss": 0.1959, "step": 331 }, { "epoch": 0.24, "grad_norm": 74.9129334802522, "learning_rate": 9.977823420291796e-06, "loss": 0.2213, "step": 332 }, { "epoch": 0.24, "grad_norm": 4.926863527790253, "learning_rate": 9.97755068476691e-06, "loss": 0.1501, "step": 333 }, { "epoch": 0.24, "grad_norm": 29.900030068841634, "learning_rate": 9.977276286149821e-06, "loss": 0.1589, "step": 334 }, { "epoch": 0.24, "grad_norm": 64.98822420575073, "learning_rate": 9.977000224532211e-06, "loss": 0.1938, "step": 335 }, { "epoch": 0.24, "grad_norm": 43.849118052159554, "learning_rate": 9.976722500006318e-06, "loss": 0.1895, "step": 336 }, { "epoch": 0.24, "grad_norm": 16.878076037266382, "learning_rate": 9.976443112664932e-06, "loss": 0.1444, "step": 337 }, { "epoch": 0.24, "grad_norm": 58.54099348004542, "learning_rate": 9.976162062601407e-06, "loss": 0.1805, "step": 338 }, { "epoch": 0.24, "grad_norm": 37.890609096404795, "learning_rate": 9.97587934990964e-06, "loss": 0.1713, "step": 339 }, { "epoch": 0.24, "grad_norm": 5.701668469320554, "learning_rate": 9.975594974684096e-06, "loss": 0.1388, "step": 340 }, { "epoch": 0.24, "grad_norm": 7.179582917571072, "learning_rate": 9.975308937019787e-06, "loss": 0.1361, "step": 341 }, { "epoch": 0.24, "grad_norm": 38.612872861365716, "learning_rate": 9.975021237012286e-06, "loss": 0.1589, "step": 342 }, { "epoch": 0.24, "grad_norm": 28.653520911802577, "learning_rate": 9.974731874757717e-06, "loss": 0.1484, "step": 343 }, { "epoch": 0.25, "grad_norm": 28.38881703138808, "learning_rate": 9.974440850352762e-06, "loss": 0.1755, "step": 344 }, { "epoch": 0.25, "grad_norm": 48.04652377386355, "learning_rate": 9.974148163894658e-06, "loss": 0.1395, "step": 345 }, { "epoch": 0.25, "grad_norm": 28.398568336832877, "learning_rate": 9.973853815481196e-06, "loss": 0.1409, "step": 346 }, { "epoch": 0.25, "grad_norm": 11.919991106134674, "learning_rate": 9.973557805210724e-06, "loss": 0.1555, "step": 347 }, { "epoch": 0.25, "grad_norm": 9.837518594770932, "learning_rate": 9.973260133182145e-06, "loss": 0.1455, "step": 348 }, { "epoch": 0.25, "grad_norm": 27.31277900154392, "learning_rate": 9.972960799494915e-06, "loss": 0.1361, "step": 349 }, { "epoch": 0.25, "grad_norm": 12.083930952779552, "learning_rate": 9.972659804249047e-06, "loss": 0.1295, "step": 350 }, { "epoch": 0.25, "grad_norm": 7.435839166347306, "learning_rate": 9.972357147545113e-06, "loss": 0.1345, "step": 351 }, { "epoch": 0.25, "grad_norm": 28.605070082539534, "learning_rate": 9.972052829484231e-06, "loss": 0.1387, "step": 352 }, { "epoch": 0.25, "grad_norm": 13.819008184513397, "learning_rate": 9.971746850168084e-06, "loss": 0.1255, "step": 353 }, { "epoch": 0.25, "grad_norm": 15.558085688257078, "learning_rate": 9.971439209698902e-06, "loss": 0.1755, "step": 354 }, { "epoch": 0.25, "grad_norm": 14.892786074375907, "learning_rate": 9.971129908179474e-06, "loss": 0.1552, "step": 355 }, { "epoch": 0.25, "grad_norm": 9.814580259790407, "learning_rate": 9.970818945713145e-06, "loss": 0.1426, "step": 356 }, { "epoch": 0.25, "grad_norm": 8.246334439430278, "learning_rate": 9.970506322403813e-06, "loss": 0.1237, "step": 357 }, { "epoch": 0.26, "grad_norm": 36.7630773678966, "learning_rate": 9.970192038355928e-06, "loss": 0.1527, "step": 358 }, { "epoch": 0.26, "grad_norm": 18.698463504899674, "learning_rate": 9.969876093674502e-06, "loss": 0.1565, "step": 359 }, { "epoch": 0.26, "grad_norm": 34.419554162278324, "learning_rate": 9.969558488465097e-06, "loss": 0.1506, "step": 360 }, { "epoch": 0.26, "grad_norm": 36.93690244776614, "learning_rate": 9.969239222833829e-06, "loss": 0.1531, "step": 361 }, { "epoch": 0.26, "grad_norm": 33.49368454947981, "learning_rate": 9.968918296887374e-06, "loss": 0.1509, "step": 362 }, { "epoch": 0.26, "grad_norm": 25.28539142873406, "learning_rate": 9.968595710732955e-06, "loss": 0.1499, "step": 363 }, { "epoch": 0.26, "grad_norm": 23.43222923122621, "learning_rate": 9.968271464478357e-06, "loss": 0.1312, "step": 364 }, { "epoch": 0.26, "grad_norm": 19.51822930107335, "learning_rate": 9.967945558231917e-06, "loss": 0.144, "step": 365 }, { "epoch": 0.26, "grad_norm": 41.83722751889746, "learning_rate": 9.967617992102526e-06, "loss": 0.1533, "step": 366 }, { "epoch": 0.26, "grad_norm": 30.973390532695422, "learning_rate": 9.967288766199628e-06, "loss": 0.13, "step": 367 }, { "epoch": 0.26, "grad_norm": 9.81797488242334, "learning_rate": 9.966957880633225e-06, "loss": 0.1371, "step": 368 }, { "epoch": 0.26, "grad_norm": 18.097159112396604, "learning_rate": 9.966625335513873e-06, "loss": 0.1356, "step": 369 }, { "epoch": 0.26, "grad_norm": 5.8534602258826025, "learning_rate": 9.96629113095268e-06, "loss": 0.1406, "step": 370 }, { "epoch": 0.26, "grad_norm": 12.63522722168294, "learning_rate": 9.965955267061309e-06, "loss": 0.1616, "step": 371 }, { "epoch": 0.27, "grad_norm": 12.281840014620311, "learning_rate": 9.965617743951982e-06, "loss": 0.1528, "step": 372 }, { "epoch": 0.27, "grad_norm": 11.010291018078059, "learning_rate": 9.965278561737466e-06, "loss": 0.1039, "step": 373 }, { "epoch": 0.27, "grad_norm": 23.593676742558646, "learning_rate": 9.964937720531094e-06, "loss": 0.1334, "step": 374 }, { "epoch": 0.27, "grad_norm": 11.379963157519395, "learning_rate": 9.964595220446744e-06, "loss": 0.1658, "step": 375 }, { "epoch": 0.27, "grad_norm": 8.612895660033113, "learning_rate": 9.964251061598853e-06, "loss": 0.1724, "step": 376 }, { "epoch": 0.27, "grad_norm": 10.169065570952082, "learning_rate": 9.96390524410241e-06, "loss": 0.1475, "step": 377 }, { "epoch": 0.27, "grad_norm": 24.25698722300466, "learning_rate": 9.96355776807296e-06, "loss": 0.1232, "step": 378 }, { "epoch": 0.27, "grad_norm": 17.4894072996316, "learning_rate": 9.9632086336266e-06, "loss": 0.1412, "step": 379 }, { "epoch": 0.27, "grad_norm": 14.429277148705365, "learning_rate": 9.962857840879983e-06, "loss": 0.1322, "step": 380 }, { "epoch": 0.27, "grad_norm": 21.967245963743874, "learning_rate": 9.962505389950317e-06, "loss": 0.1565, "step": 381 }, { "epoch": 0.27, "grad_norm": 13.836611609961958, "learning_rate": 9.962151280955359e-06, "loss": 0.1473, "step": 382 }, { "epoch": 0.27, "grad_norm": 6.777348182737883, "learning_rate": 9.961795514013424e-06, "loss": 0.1934, "step": 383 }, { "epoch": 0.27, "grad_norm": 14.417467203134215, "learning_rate": 9.961438089243384e-06, "loss": 0.1414, "step": 384 }, { "epoch": 0.27, "grad_norm": 6.704688741271175, "learning_rate": 9.961079006764659e-06, "loss": 0.198, "step": 385 }, { "epoch": 0.28, "grad_norm": 6.379293993035889, "learning_rate": 9.960718266697223e-06, "loss": 0.115, "step": 386 }, { "epoch": 0.28, "grad_norm": 13.477558643143759, "learning_rate": 9.960355869161609e-06, "loss": 0.1284, "step": 387 }, { "epoch": 0.28, "grad_norm": 5.698091583062664, "learning_rate": 9.959991814278898e-06, "loss": 0.1287, "step": 388 }, { "epoch": 0.28, "grad_norm": 22.68559034944762, "learning_rate": 9.95962610217073e-06, "loss": 0.1202, "step": 389 }, { "epoch": 0.28, "grad_norm": 11.617129910996631, "learning_rate": 9.959258732959296e-06, "loss": 0.1614, "step": 390 }, { "epoch": 0.28, "grad_norm": 22.012768496639392, "learning_rate": 9.958889706767341e-06, "loss": 0.1481, "step": 391 }, { "epoch": 0.28, "grad_norm": 42.213262359287036, "learning_rate": 9.95851902371816e-06, "loss": 0.1747, "step": 392 }, { "epoch": 0.28, "grad_norm": 10.688717536140965, "learning_rate": 9.95814668393561e-06, "loss": 0.1383, "step": 393 }, { "epoch": 0.28, "grad_norm": 41.26203920787363, "learning_rate": 9.957772687544094e-06, "loss": 0.1768, "step": 394 }, { "epoch": 0.28, "grad_norm": 10.068392998011415, "learning_rate": 9.95739703466857e-06, "loss": 0.136, "step": 395 }, { "epoch": 0.28, "grad_norm": 7.825594261580451, "learning_rate": 9.957019725434554e-06, "loss": 0.1346, "step": 396 }, { "epoch": 0.28, "grad_norm": 14.922584492469332, "learning_rate": 9.956640759968111e-06, "loss": 0.1091, "step": 397 }, { "epoch": 0.28, "grad_norm": 15.362761377543631, "learning_rate": 9.956260138395857e-06, "loss": 0.1241, "step": 398 }, { "epoch": 0.28, "grad_norm": 14.478039428734272, "learning_rate": 9.955877860844969e-06, "loss": 0.1665, "step": 399 }, { "epoch": 0.29, "grad_norm": 15.342154188172143, "learning_rate": 9.955493927443171e-06, "loss": 0.1294, "step": 400 }, { "epoch": 0.29, "grad_norm": 16.72367814770503, "learning_rate": 9.955108338318743e-06, "loss": 0.1521, "step": 401 }, { "epoch": 0.29, "grad_norm": 12.948093169878645, "learning_rate": 9.954721093600517e-06, "loss": 0.1439, "step": 402 }, { "epoch": 0.29, "grad_norm": 4.973524690035784, "learning_rate": 9.95433219341788e-06, "loss": 0.1324, "step": 403 }, { "epoch": 0.29, "grad_norm": 15.47228950899974, "learning_rate": 9.953941637900769e-06, "loss": 0.1686, "step": 404 }, { "epoch": 0.29, "grad_norm": 10.160262840433568, "learning_rate": 9.953549427179676e-06, "loss": 0.1477, "step": 405 }, { "epoch": 0.29, "grad_norm": 25.931016059983655, "learning_rate": 9.953155561385646e-06, "loss": 0.1356, "step": 406 }, { "epoch": 0.29, "grad_norm": 5.62503524150755, "learning_rate": 9.952760040650278e-06, "loss": 0.1508, "step": 407 }, { "epoch": 0.29, "grad_norm": 6.370817286061716, "learning_rate": 9.95236286510572e-06, "loss": 0.1111, "step": 408 }, { "epoch": 0.29, "grad_norm": 17.969755002141245, "learning_rate": 9.95196403488468e-06, "loss": 0.1555, "step": 409 }, { "epoch": 0.29, "grad_norm": 15.602862431905702, "learning_rate": 9.951563550120412e-06, "loss": 0.1444, "step": 410 }, { "epoch": 0.29, "grad_norm": 24.892905576485063, "learning_rate": 9.951161410946725e-06, "loss": 0.142, "step": 411 }, { "epoch": 0.29, "grad_norm": 26.823371334885348, "learning_rate": 9.950757617497983e-06, "loss": 0.1376, "step": 412 }, { "epoch": 0.29, "grad_norm": 42.11109194980369, "learning_rate": 9.950352169909101e-06, "loss": 0.1213, "step": 413 }, { "epoch": 0.3, "grad_norm": 11.958217327562295, "learning_rate": 9.949945068315544e-06, "loss": 0.1626, "step": 414 }, { "epoch": 0.3, "grad_norm": 17.71107942350547, "learning_rate": 9.949536312853334e-06, "loss": 0.166, "step": 415 }, { "epoch": 0.3, "grad_norm": 33.36805721946892, "learning_rate": 9.949125903659042e-06, "loss": 0.1525, "step": 416 }, { "epoch": 0.3, "grad_norm": 10.084727395074816, "learning_rate": 9.948713840869797e-06, "loss": 0.1426, "step": 417 }, { "epoch": 0.3, "grad_norm": 29.756453469589164, "learning_rate": 9.948300124623274e-06, "loss": 0.1035, "step": 418 }, { "epoch": 0.3, "grad_norm": 5.824683629550345, "learning_rate": 9.947884755057703e-06, "loss": 0.14, "step": 419 }, { "epoch": 0.3, "grad_norm": 11.852603046561496, "learning_rate": 9.947467732311868e-06, "loss": 0.1642, "step": 420 }, { "epoch": 0.3, "grad_norm": 14.091641894723889, "learning_rate": 9.947049056525104e-06, "loss": 0.1337, "step": 421 }, { "epoch": 0.3, "grad_norm": 11.772867686006098, "learning_rate": 9.9466287278373e-06, "loss": 0.1405, "step": 422 }, { "epoch": 0.3, "grad_norm": 20.227967360512032, "learning_rate": 9.946206746388892e-06, "loss": 0.1464, "step": 423 }, { "epoch": 0.3, "grad_norm": 28.120732815939572, "learning_rate": 9.94578311232087e-06, "loss": 0.1587, "step": 424 }, { "epoch": 0.3, "grad_norm": 6.836326247915467, "learning_rate": 9.945357825774786e-06, "loss": 0.1632, "step": 425 }, { "epoch": 0.3, "grad_norm": 38.068508076771145, "learning_rate": 9.944930886892731e-06, "loss": 0.1635, "step": 426 }, { "epoch": 0.3, "grad_norm": 17.53039371409448, "learning_rate": 9.944502295817353e-06, "loss": 0.1393, "step": 427 }, { "epoch": 0.31, "grad_norm": 5.744624734916593, "learning_rate": 9.944072052691853e-06, "loss": 0.1299, "step": 428 }, { "epoch": 0.31, "grad_norm": 36.21266814484719, "learning_rate": 9.943640157659984e-06, "loss": 0.1241, "step": 429 }, { "epoch": 0.31, "grad_norm": 27.99729719398013, "learning_rate": 9.94320661086605e-06, "loss": 0.1172, "step": 430 }, { "epoch": 0.31, "grad_norm": 32.61398458459067, "learning_rate": 9.942771412454906e-06, "loss": 0.1688, "step": 431 }, { "epoch": 0.31, "grad_norm": 29.53139637962103, "learning_rate": 9.942334562571961e-06, "loss": 0.14, "step": 432 }, { "epoch": 0.31, "grad_norm": 70.15413954052487, "learning_rate": 9.941896061363173e-06, "loss": 0.1909, "step": 433 }, { "epoch": 0.31, "grad_norm": 41.57928064536469, "learning_rate": 9.941455908975054e-06, "loss": 0.1348, "step": 434 }, { "epoch": 0.31, "grad_norm": 38.37885664713142, "learning_rate": 9.941014105554668e-06, "loss": 0.1616, "step": 435 }, { "epoch": 0.31, "grad_norm": 49.537343514648235, "learning_rate": 9.94057065124963e-06, "loss": 0.1626, "step": 436 }, { "epoch": 0.31, "grad_norm": 54.24518982500366, "learning_rate": 9.940125546208107e-06, "loss": 0.1528, "step": 437 }, { "epoch": 0.31, "grad_norm": 5.439592562127047, "learning_rate": 9.939678790578813e-06, "loss": 0.1382, "step": 438 }, { "epoch": 0.31, "grad_norm": 50.401662662406025, "learning_rate": 9.93923038451102e-06, "loss": 0.1444, "step": 439 }, { "epoch": 0.31, "grad_norm": 27.565998455504264, "learning_rate": 9.938780328154549e-06, "loss": 0.1638, "step": 440 }, { "epoch": 0.31, "grad_norm": 29.80949294384711, "learning_rate": 9.938328621659775e-06, "loss": 0.177, "step": 441 }, { "epoch": 0.32, "grad_norm": 39.42630244801424, "learning_rate": 9.937875265177615e-06, "loss": 0.1831, "step": 442 }, { "epoch": 0.32, "grad_norm": 5.22622431582067, "learning_rate": 9.937420258859547e-06, "loss": 0.1394, "step": 443 }, { "epoch": 0.32, "grad_norm": 32.328178133199046, "learning_rate": 9.9369636028576e-06, "loss": 0.1349, "step": 444 }, { "epoch": 0.32, "grad_norm": 6.504231110559711, "learning_rate": 9.936505297324346e-06, "loss": 0.1211, "step": 445 }, { "epoch": 0.32, "grad_norm": 20.781251334768122, "learning_rate": 9.936045342412917e-06, "loss": 0.1482, "step": 446 }, { "epoch": 0.32, "grad_norm": 21.21248632778865, "learning_rate": 9.93558373827699e-06, "loss": 0.1327, "step": 447 }, { "epoch": 0.32, "grad_norm": 12.707782785702369, "learning_rate": 9.935120485070799e-06, "loss": 0.1494, "step": 448 }, { "epoch": 0.32, "grad_norm": 9.673210776385321, "learning_rate": 9.934655582949123e-06, "loss": 0.0964, "step": 449 }, { "epoch": 0.32, "grad_norm": 9.888011994441523, "learning_rate": 9.934189032067296e-06, "loss": 0.1555, "step": 450 }, { "epoch": 0.32, "grad_norm": 32.12516155863512, "learning_rate": 9.933720832581197e-06, "loss": 0.1355, "step": 451 }, { "epoch": 0.32, "grad_norm": 22.322386591262855, "learning_rate": 9.933250984647266e-06, "loss": 0.1368, "step": 452 }, { "epoch": 0.32, "grad_norm": 15.223180132034958, "learning_rate": 9.932779488422484e-06, "loss": 0.1383, "step": 453 }, { "epoch": 0.32, "grad_norm": 52.31779619507051, "learning_rate": 9.93230634406439e-06, "loss": 0.1522, "step": 454 }, { "epoch": 0.32, "grad_norm": 29.017829224877428, "learning_rate": 9.931831551731067e-06, "loss": 0.1445, "step": 455 }, { "epoch": 0.33, "grad_norm": 18.997292779869753, "learning_rate": 9.931355111581154e-06, "loss": 0.1554, "step": 456 }, { "epoch": 0.33, "grad_norm": 34.735932438290625, "learning_rate": 9.930877023773837e-06, "loss": 0.134, "step": 457 }, { "epoch": 0.33, "grad_norm": 45.54908316077474, "learning_rate": 9.930397288468853e-06, "loss": 0.1663, "step": 458 }, { "epoch": 0.33, "grad_norm": 9.033708570010118, "learning_rate": 9.929915905826494e-06, "loss": 0.1084, "step": 459 }, { "epoch": 0.33, "grad_norm": 4.323441132876292, "learning_rate": 9.9294328760076e-06, "loss": 0.1165, "step": 460 }, { "epoch": 0.33, "grad_norm": 47.919864734101694, "learning_rate": 9.928948199173552e-06, "loss": 0.1626, "step": 461 }, { "epoch": 0.33, "grad_norm": 4.461706134664676, "learning_rate": 9.928461875486297e-06, "loss": 0.1068, "step": 462 }, { "epoch": 0.33, "grad_norm": 15.03570267608453, "learning_rate": 9.927973905108323e-06, "loss": 0.1066, "step": 463 }, { "epoch": 0.33, "grad_norm": 29.02078498555869, "learning_rate": 9.927484288202671e-06, "loss": 0.1425, "step": 464 }, { "epoch": 0.33, "grad_norm": 25.201159295770108, "learning_rate": 9.926993024932929e-06, "loss": 0.1377, "step": 465 }, { "epoch": 0.33, "grad_norm": 5.395550286499605, "learning_rate": 9.926500115463238e-06, "loss": 0.1176, "step": 466 }, { "epoch": 0.33, "grad_norm": 11.158696809668983, "learning_rate": 9.926005559958287e-06, "loss": 0.1361, "step": 467 }, { "epoch": 0.33, "grad_norm": 25.515101194445457, "learning_rate": 9.925509358583319e-06, "loss": 0.1162, "step": 468 }, { "epoch": 0.33, "grad_norm": 18.28360522188909, "learning_rate": 9.92501151150412e-06, "loss": 0.1367, "step": 469 }, { "epoch": 0.34, "grad_norm": 31.778596429481162, "learning_rate": 9.924512018887036e-06, "loss": 0.1229, "step": 470 }, { "epoch": 0.34, "grad_norm": 13.711321378139074, "learning_rate": 9.924010880898952e-06, "loss": 0.1389, "step": 471 }, { "epoch": 0.34, "grad_norm": 5.3412870602780185, "learning_rate": 9.923508097707306e-06, "loss": 0.1394, "step": 472 }, { "epoch": 0.34, "grad_norm": 39.662327865375396, "learning_rate": 9.923003669480094e-06, "loss": 0.187, "step": 473 }, { "epoch": 0.34, "grad_norm": 6.585151391933716, "learning_rate": 9.922497596385848e-06, "loss": 0.1266, "step": 474 }, { "epoch": 0.34, "grad_norm": 25.903061542242025, "learning_rate": 9.92198987859366e-06, "loss": 0.0936, "step": 475 }, { "epoch": 0.34, "grad_norm": 3.982867596376823, "learning_rate": 9.921480516273168e-06, "loss": 0.1238, "step": 476 }, { "epoch": 0.34, "grad_norm": 13.564020531973936, "learning_rate": 9.920969509594558e-06, "loss": 0.126, "step": 477 }, { "epoch": 0.34, "grad_norm": 10.136096874644808, "learning_rate": 9.920456858728567e-06, "loss": 0.1329, "step": 478 }, { "epoch": 0.34, "grad_norm": 4.415457519145033, "learning_rate": 9.919942563846482e-06, "loss": 0.1044, "step": 479 }, { "epoch": 0.34, "grad_norm": 6.035743042678794, "learning_rate": 9.919426625120137e-06, "loss": 0.1689, "step": 480 }, { "epoch": 0.34, "grad_norm": 10.45092970947201, "learning_rate": 9.918909042721918e-06, "loss": 0.136, "step": 481 }, { "epoch": 0.34, "grad_norm": 4.68056597961224, "learning_rate": 9.918389816824759e-06, "loss": 0.1423, "step": 482 }, { "epoch": 0.34, "grad_norm": 20.55146614949313, "learning_rate": 9.917868947602144e-06, "loss": 0.1532, "step": 483 }, { "epoch": 0.35, "grad_norm": 34.77111147030742, "learning_rate": 9.917346435228102e-06, "loss": 0.1746, "step": 484 }, { "epoch": 0.35, "grad_norm": 5.0196084233144695, "learning_rate": 9.916822279877217e-06, "loss": 0.1279, "step": 485 }, { "epoch": 0.35, "grad_norm": 3.941526827167951, "learning_rate": 9.91629648172462e-06, "loss": 0.1146, "step": 486 }, { "epoch": 0.35, "grad_norm": 12.890997507039701, "learning_rate": 9.915769040945984e-06, "loss": 0.1028, "step": 487 }, { "epoch": 0.35, "grad_norm": 27.88496903893723, "learning_rate": 9.915239957717542e-06, "loss": 0.1274, "step": 488 }, { "epoch": 0.35, "grad_norm": 9.00778567508186, "learning_rate": 9.91470923221607e-06, "loss": 0.1411, "step": 489 }, { "epoch": 0.35, "grad_norm": 23.484883782999354, "learning_rate": 9.914176864618891e-06, "loss": 0.1384, "step": 490 }, { "epoch": 0.35, "grad_norm": 4.929422863937659, "learning_rate": 9.913642855103881e-06, "loss": 0.0734, "step": 491 }, { "epoch": 0.35, "grad_norm": 5.36947027897346, "learning_rate": 9.913107203849464e-06, "loss": 0.1497, "step": 492 }, { "epoch": 0.35, "grad_norm": 24.750798415291204, "learning_rate": 9.912569911034607e-06, "loss": 0.1672, "step": 493 }, { "epoch": 0.35, "grad_norm": 6.881614535806723, "learning_rate": 9.912030976838832e-06, "loss": 0.1456, "step": 494 }, { "epoch": 0.35, "grad_norm": 33.54932127569477, "learning_rate": 9.911490401442205e-06, "loss": 0.1489, "step": 495 }, { "epoch": 0.35, "grad_norm": 20.00784172694422, "learning_rate": 9.910948185025345e-06, "loss": 0.1536, "step": 496 }, { "epoch": 0.35, "grad_norm": 5.238510328306569, "learning_rate": 9.910404327769414e-06, "loss": 0.1207, "step": 497 }, { "epoch": 0.36, "grad_norm": 30.054761593942306, "learning_rate": 9.909858829856127e-06, "loss": 0.1364, "step": 498 }, { "epoch": 0.36, "grad_norm": 7.592134090075237, "learning_rate": 9.909311691467744e-06, "loss": 0.1006, "step": 499 }, { "epoch": 0.36, "grad_norm": 5.936467444451623, "learning_rate": 9.908762912787073e-06, "loss": 0.1512, "step": 500 }, { "epoch": 0.36, "eval_avg_AUC": 0.8148136168882592, "eval_avg_Accuracy": 0.7284897214854111, "eval_avg_Accuracy-right": 0.8730272596843616, "eval_avg_Accuracy-wrong": 0.4764612235615192, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.6962655285457768, "eval_last_AUC": 0.8285519781961206, "eval_last_Accuracy": 0.7570457559681698, "eval_last_Accuracy-right": 0.8302465110212599, "eval_last_Accuracy-wrong": 0.6294064134637253, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.6999852156206944, "eval_max_AUC": 0.7665118487150447, "eval_max_Accuracy": 0.6521054376657824, "eval_max_Accuracy-right": 0.9773053345506717, "eval_max_Accuracy-wrong": 0.08505799408687742, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.6174539818202015, "eval_min_AUC": 0.8243709198076318, "eval_min_Accuracy": 0.7511604774535809, "eval_min_Accuracy-right": 0.7600104343289422, "eval_min_Accuracy-wrong": 0.735728906072322, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.6964298045694274, "eval_prod_AUC": 0.8209843050796547, "eval_prod_Accuracy": 0.6548408488063661, "eval_prod_Accuracy-right": 0.49582626842311206, "eval_prod_Accuracy-wrong": 0.9321128041846714, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.6870361522901595, "eval_runtime": 248.2297, "eval_samples_per_second": 97.2, "eval_steps_per_second": 3.038, "eval_sum_AUC": 0.6857568992387502, "eval_sum_Accuracy": 0.6382211538461539, "eval_sum_Accuracy-right": 0.9962827703143342, "eval_sum_Accuracy-wrong": 0.013873095292244713, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.6648248225648956, "step": 500 }, { "epoch": 0.36, "grad_norm": 7.521136475767055, "learning_rate": 9.908212493997473e-06, "loss": 0.1351, "step": 501 }, { "epoch": 0.36, "grad_norm": 24.8215182959066, "learning_rate": 9.90766043528285e-06, "loss": 0.1265, "step": 502 }, { "epoch": 0.36, "grad_norm": 20.262220801212766, "learning_rate": 9.907106736827654e-06, "loss": 0.1382, "step": 503 }, { "epoch": 0.36, "grad_norm": 6.612067632756019, "learning_rate": 9.906551398816886e-06, "loss": 0.1176, "step": 504 }, { "epoch": 0.36, "grad_norm": 10.684225952657133, "learning_rate": 9.9059944214361e-06, "loss": 0.1636, "step": 505 }, { "epoch": 0.36, "grad_norm": 12.984066186445835, "learning_rate": 9.905435804871387e-06, "loss": 0.1366, "step": 506 }, { "epoch": 0.36, "grad_norm": 24.763096461281147, "learning_rate": 9.904875549309391e-06, "loss": 0.1584, "step": 507 }, { "epoch": 0.36, "grad_norm": 5.7311046700749975, "learning_rate": 9.904313654937308e-06, "loss": 0.1486, "step": 508 }, { "epoch": 0.36, "grad_norm": 51.33110066179473, "learning_rate": 9.903750121942873e-06, "loss": 0.1875, "step": 509 }, { "epoch": 0.36, "grad_norm": 12.408291940217502, "learning_rate": 9.903184950514378e-06, "loss": 0.1373, "step": 510 }, { "epoch": 0.36, "grad_norm": 7.255602958809023, "learning_rate": 9.90261814084065e-06, "loss": 0.1256, "step": 511 }, { "epoch": 0.37, "grad_norm": 17.357961967373914, "learning_rate": 9.902049693111077e-06, "loss": 0.1616, "step": 512 }, { "epoch": 0.37, "grad_norm": 28.352859541701022, "learning_rate": 9.901479607515587e-06, "loss": 0.1322, "step": 513 }, { "epoch": 0.37, "grad_norm": 18.349020721142626, "learning_rate": 9.900907884244654e-06, "loss": 0.1311, "step": 514 }, { "epoch": 0.37, "grad_norm": 7.109150918544185, "learning_rate": 9.900334523489303e-06, "loss": 0.1604, "step": 515 }, { "epoch": 0.37, "grad_norm": 29.842240993629268, "learning_rate": 9.899759525441101e-06, "loss": 0.1586, "step": 516 }, { "epoch": 0.37, "grad_norm": 32.448571178210294, "learning_rate": 9.899182890292171e-06, "loss": 0.1516, "step": 517 }, { "epoch": 0.37, "grad_norm": 37.541295217424135, "learning_rate": 9.898604618235175e-06, "loss": 0.1541, "step": 518 }, { "epoch": 0.37, "grad_norm": 59.74203765524549, "learning_rate": 9.898024709463322e-06, "loss": 0.1914, "step": 519 }, { "epoch": 0.37, "grad_norm": 51.553681345435216, "learning_rate": 9.897443164170375e-06, "loss": 0.1547, "step": 520 }, { "epoch": 0.37, "grad_norm": 17.65120220241073, "learning_rate": 9.896859982550636e-06, "loss": 0.1357, "step": 521 }, { "epoch": 0.37, "grad_norm": 4.5369157168625405, "learning_rate": 9.89627516479896e-06, "loss": 0.1232, "step": 522 }, { "epoch": 0.37, "grad_norm": 28.586892727146566, "learning_rate": 9.895688711110739e-06, "loss": 0.1234, "step": 523 }, { "epoch": 0.37, "grad_norm": 4.714799632443614, "learning_rate": 9.895100621681923e-06, "loss": 0.1794, "step": 524 }, { "epoch": 0.37, "grad_norm": 10.894818046361022, "learning_rate": 9.894510896709003e-06, "loss": 0.1145, "step": 525 }, { "epoch": 0.38, "grad_norm": 11.020267897450193, "learning_rate": 9.893919536389017e-06, "loss": 0.1331, "step": 526 }, { "epoch": 0.38, "grad_norm": 6.603140469816235, "learning_rate": 9.89332654091955e-06, "loss": 0.1317, "step": 527 }, { "epoch": 0.38, "grad_norm": 5.317710681474739, "learning_rate": 9.892731910498731e-06, "loss": 0.134, "step": 528 }, { "epoch": 0.38, "grad_norm": 18.48919116376951, "learning_rate": 9.892135645325238e-06, "loss": 0.127, "step": 529 }, { "epoch": 0.38, "grad_norm": 15.110852007766283, "learning_rate": 9.891537745598293e-06, "loss": 0.1333, "step": 530 }, { "epoch": 0.38, "grad_norm": 25.48093356108608, "learning_rate": 9.89093821151767e-06, "loss": 0.1425, "step": 531 }, { "epoch": 0.38, "grad_norm": 14.211704092158119, "learning_rate": 9.89033704328368e-06, "loss": 0.1261, "step": 532 }, { "epoch": 0.38, "grad_norm": 12.49842219441739, "learning_rate": 9.889734241097186e-06, "loss": 0.1227, "step": 533 }, { "epoch": 0.38, "grad_norm": 6.538899092333007, "learning_rate": 9.889129805159595e-06, "loss": 0.1333, "step": 534 }, { "epoch": 0.38, "grad_norm": 11.229548690859028, "learning_rate": 9.888523735672861e-06, "loss": 0.1207, "step": 535 }, { "epoch": 0.38, "grad_norm": 14.706186292987425, "learning_rate": 9.887916032839482e-06, "loss": 0.1376, "step": 536 }, { "epoch": 0.38, "grad_norm": 15.889794280637537, "learning_rate": 9.887306696862504e-06, "loss": 0.1122, "step": 537 }, { "epoch": 0.38, "grad_norm": 9.453535010562696, "learning_rate": 9.886695727945515e-06, "loss": 0.131, "step": 538 }, { "epoch": 0.38, "grad_norm": 11.612128252341902, "learning_rate": 9.886083126292655e-06, "loss": 0.1492, "step": 539 }, { "epoch": 0.39, "grad_norm": 7.075171549803063, "learning_rate": 9.885468892108603e-06, "loss": 0.1272, "step": 540 }, { "epoch": 0.39, "grad_norm": 26.914960079687482, "learning_rate": 9.884853025598587e-06, "loss": 0.1514, "step": 541 }, { "epoch": 0.39, "grad_norm": 43.779863637748186, "learning_rate": 9.884235526968377e-06, "loss": 0.1256, "step": 542 }, { "epoch": 0.39, "grad_norm": 37.73960304494291, "learning_rate": 9.883616396424294e-06, "loss": 0.1553, "step": 543 }, { "epoch": 0.39, "grad_norm": 12.454022843924934, "learning_rate": 9.8829956341732e-06, "loss": 0.1204, "step": 544 }, { "epoch": 0.39, "grad_norm": 43.19321310165527, "learning_rate": 9.882373240422503e-06, "loss": 0.1389, "step": 545 }, { "epoch": 0.39, "grad_norm": 48.79232289830731, "learning_rate": 9.881749215380156e-06, "loss": 0.13, "step": 546 }, { "epoch": 0.39, "grad_norm": 18.76530769424551, "learning_rate": 9.881123559254658e-06, "loss": 0.1158, "step": 547 }, { "epoch": 0.39, "grad_norm": 62.57220924147817, "learning_rate": 9.880496272255053e-06, "loss": 0.158, "step": 548 }, { "epoch": 0.39, "grad_norm": 12.500417488061052, "learning_rate": 9.879867354590926e-06, "loss": 0.1196, "step": 549 }, { "epoch": 0.39, "grad_norm": 15.875141788148147, "learning_rate": 9.879236806472414e-06, "loss": 0.1333, "step": 550 }, { "epoch": 0.39, "grad_norm": 7.301629680806376, "learning_rate": 9.878604628110194e-06, "loss": 0.1799, "step": 551 }, { "epoch": 0.39, "grad_norm": 6.264465519965361, "learning_rate": 9.877970819715485e-06, "loss": 0.1427, "step": 552 }, { "epoch": 0.39, "grad_norm": 30.949313988770307, "learning_rate": 9.87733538150006e-06, "loss": 0.142, "step": 553 }, { "epoch": 0.4, "grad_norm": 6.380549224768007, "learning_rate": 9.876698313676225e-06, "loss": 0.1191, "step": 554 }, { "epoch": 0.4, "grad_norm": 10.66859233407463, "learning_rate": 9.876059616456842e-06, "loss": 0.1405, "step": 555 }, { "epoch": 0.4, "grad_norm": 20.946002565023058, "learning_rate": 9.875419290055305e-06, "loss": 0.1102, "step": 556 }, { "epoch": 0.4, "grad_norm": 30.897928782214787, "learning_rate": 9.874777334685565e-06, "loss": 0.1711, "step": 557 }, { "epoch": 0.4, "grad_norm": 13.033106438202273, "learning_rate": 9.874133750562108e-06, "loss": 0.1622, "step": 558 }, { "epoch": 0.4, "grad_norm": 40.36685994127496, "learning_rate": 9.873488537899967e-06, "loss": 0.2061, "step": 559 }, { "epoch": 0.4, "grad_norm": 19.50274995202192, "learning_rate": 9.872841696914721e-06, "loss": 0.1294, "step": 560 }, { "epoch": 0.4, "grad_norm": 11.932353294948314, "learning_rate": 9.872193227822492e-06, "loss": 0.1265, "step": 561 }, { "epoch": 0.4, "grad_norm": 20.43294471193903, "learning_rate": 9.871543130839944e-06, "loss": 0.1456, "step": 562 }, { "epoch": 0.4, "grad_norm": 42.946806904632204, "learning_rate": 9.870891406184288e-06, "loss": 0.1626, "step": 563 }, { "epoch": 0.4, "grad_norm": 33.149765129877956, "learning_rate": 9.870238054073275e-06, "loss": 0.1593, "step": 564 }, { "epoch": 0.4, "grad_norm": 5.929277127548148, "learning_rate": 9.869583074725206e-06, "loss": 0.1477, "step": 565 }, { "epoch": 0.4, "grad_norm": 39.416395744901585, "learning_rate": 9.868926468358919e-06, "loss": 0.1432, "step": 566 }, { "epoch": 0.4, "grad_norm": 43.22729095180558, "learning_rate": 9.868268235193796e-06, "loss": 0.1406, "step": 567 }, { "epoch": 0.41, "grad_norm": 15.377542148018398, "learning_rate": 9.867608375449772e-06, "loss": 0.1235, "step": 568 }, { "epoch": 0.41, "grad_norm": 23.10191212472918, "learning_rate": 9.866946889347311e-06, "loss": 0.127, "step": 569 }, { "epoch": 0.41, "grad_norm": 32.3797801287966, "learning_rate": 9.866283777107432e-06, "loss": 0.1323, "step": 570 }, { "epoch": 0.41, "grad_norm": 6.095686964503419, "learning_rate": 9.865619038951692e-06, "loss": 0.1375, "step": 571 }, { "epoch": 0.41, "grad_norm": 9.775807625906909, "learning_rate": 9.864952675102193e-06, "loss": 0.1379, "step": 572 }, { "epoch": 0.41, "grad_norm": 6.286767820247743, "learning_rate": 9.864284685781578e-06, "loss": 0.1425, "step": 573 }, { "epoch": 0.41, "grad_norm": 6.147344431329742, "learning_rate": 9.863615071213036e-06, "loss": 0.1304, "step": 574 }, { "epoch": 0.41, "grad_norm": 28.412228671248563, "learning_rate": 9.862943831620298e-06, "loss": 0.1273, "step": 575 }, { "epoch": 0.41, "grad_norm": 30.96935582463328, "learning_rate": 9.862270967227636e-06, "loss": 0.1459, "step": 576 }, { "epoch": 0.41, "grad_norm": 4.987197903594031, "learning_rate": 9.861596478259869e-06, "loss": 0.139, "step": 577 }, { "epoch": 0.41, "grad_norm": 63.33976410883062, "learning_rate": 9.860920364942353e-06, "loss": 0.1904, "step": 578 }, { "epoch": 0.41, "grad_norm": 4.66543675696079, "learning_rate": 9.860242627500994e-06, "loss": 0.1125, "step": 579 }, { "epoch": 0.41, "grad_norm": 11.0806252525076, "learning_rate": 9.859563266162231e-06, "loss": 0.1321, "step": 580 }, { "epoch": 0.41, "grad_norm": 5.080418185339296, "learning_rate": 9.858882281153058e-06, "loss": 0.1157, "step": 581 }, { "epoch": 0.42, "grad_norm": 23.03682096474836, "learning_rate": 9.858199672701e-06, "loss": 0.1392, "step": 582 }, { "epoch": 0.42, "grad_norm": 31.28197071942006, "learning_rate": 9.85751544103413e-06, "loss": 0.1129, "step": 583 }, { "epoch": 0.42, "grad_norm": 9.68760436913685, "learning_rate": 9.856829586381065e-06, "loss": 0.1071, "step": 584 }, { "epoch": 0.42, "grad_norm": 73.28546422256532, "learning_rate": 9.856142108970958e-06, "loss": 0.1958, "step": 585 }, { "epoch": 0.42, "grad_norm": 18.720122616928183, "learning_rate": 9.855453009033512e-06, "loss": 0.1326, "step": 586 }, { "epoch": 0.42, "grad_norm": 15.463997560094482, "learning_rate": 9.854762286798965e-06, "loss": 0.1453, "step": 587 }, { "epoch": 0.42, "grad_norm": 7.391237164066708, "learning_rate": 9.854069942498102e-06, "loss": 0.1965, "step": 588 }, { "epoch": 0.42, "grad_norm": 8.599884051273923, "learning_rate": 9.853375976362245e-06, "loss": 0.1635, "step": 589 }, { "epoch": 0.42, "grad_norm": 4.441491117087743, "learning_rate": 9.852680388623266e-06, "loss": 0.1158, "step": 590 }, { "epoch": 0.42, "grad_norm": 34.51436948860912, "learning_rate": 9.85198317951357e-06, "loss": 0.1531, "step": 591 }, { "epoch": 0.42, "grad_norm": 17.99454842623697, "learning_rate": 9.851284349266107e-06, "loss": 0.1305, "step": 592 }, { "epoch": 0.42, "grad_norm": 16.123249191913267, "learning_rate": 9.850583898114372e-06, "loss": 0.1575, "step": 593 }, { "epoch": 0.42, "grad_norm": 48.69893321264437, "learning_rate": 9.849881826292399e-06, "loss": 0.1558, "step": 594 }, { "epoch": 0.42, "grad_norm": 34.46486038497999, "learning_rate": 9.84917813403476e-06, "loss": 0.1367, "step": 595 }, { "epoch": 0.43, "grad_norm": 9.597468969380602, "learning_rate": 9.848472821576572e-06, "loss": 0.1439, "step": 596 }, { "epoch": 0.43, "grad_norm": 52.9719490703774, "learning_rate": 9.847765889153497e-06, "loss": 0.1819, "step": 597 }, { "epoch": 0.43, "grad_norm": 36.233987937356446, "learning_rate": 9.847057337001731e-06, "loss": 0.1594, "step": 598 }, { "epoch": 0.43, "grad_norm": 20.295048431141648, "learning_rate": 9.846347165358014e-06, "loss": 0.1284, "step": 599 }, { "epoch": 0.43, "grad_norm": 42.84524320177862, "learning_rate": 9.84563537445963e-06, "loss": 0.1504, "step": 600 }, { "epoch": 0.43, "grad_norm": 62.1195121067531, "learning_rate": 9.844921964544398e-06, "loss": 0.1758, "step": 601 }, { "epoch": 0.43, "grad_norm": 47.2653880369296, "learning_rate": 9.844206935850687e-06, "loss": 0.1831, "step": 602 }, { "epoch": 0.43, "grad_norm": 29.4931297961525, "learning_rate": 9.843490288617397e-06, "loss": 0.1008, "step": 603 }, { "epoch": 0.43, "grad_norm": 78.26510483270128, "learning_rate": 9.842772023083972e-06, "loss": 0.1829, "step": 604 }, { "epoch": 0.43, "grad_norm": 72.53305356196587, "learning_rate": 9.842052139490403e-06, "loss": 0.2009, "step": 605 }, { "epoch": 0.43, "grad_norm": 23.74689339670262, "learning_rate": 9.841330638077213e-06, "loss": 0.139, "step": 606 }, { "epoch": 0.43, "grad_norm": 58.871630884771356, "learning_rate": 9.840607519085467e-06, "loss": 0.1533, "step": 607 }, { "epoch": 0.43, "grad_norm": 60.04727742225944, "learning_rate": 9.839882782756778e-06, "loss": 0.1533, "step": 608 }, { "epoch": 0.43, "grad_norm": 34.76813691419715, "learning_rate": 9.839156429333291e-06, "loss": 0.1517, "step": 609 }, { "epoch": 0.44, "grad_norm": 39.487093806576866, "learning_rate": 9.838428459057694e-06, "loss": 0.1306, "step": 610 }, { "epoch": 0.44, "grad_norm": 71.43206314131315, "learning_rate": 9.837698872173214e-06, "loss": 0.166, "step": 611 }, { "epoch": 0.44, "grad_norm": 49.09265681558195, "learning_rate": 9.836967668923623e-06, "loss": 0.1689, "step": 612 }, { "epoch": 0.44, "grad_norm": 6.462674294507499, "learning_rate": 9.836234849553228e-06, "loss": 0.1088, "step": 613 }, { "epoch": 0.44, "grad_norm": 70.19656687365818, "learning_rate": 9.835500414306875e-06, "loss": 0.2119, "step": 614 }, { "epoch": 0.44, "grad_norm": 62.173880695304916, "learning_rate": 9.834764363429956e-06, "loss": 0.1681, "step": 615 }, { "epoch": 0.44, "grad_norm": 43.48786940185642, "learning_rate": 9.8340266971684e-06, "loss": 0.172, "step": 616 }, { "epoch": 0.44, "grad_norm": 26.124867896320307, "learning_rate": 9.83328741576867e-06, "loss": 0.1342, "step": 617 }, { "epoch": 0.44, "grad_norm": 47.911183471370855, "learning_rate": 9.832546519477778e-06, "loss": 0.179, "step": 618 }, { "epoch": 0.44, "grad_norm": 89.25182761050898, "learning_rate": 9.831804008543271e-06, "loss": 0.2285, "step": 619 }, { "epoch": 0.44, "grad_norm": 23.042941448066955, "learning_rate": 9.831059883213234e-06, "loss": 0.1616, "step": 620 }, { "epoch": 0.44, "grad_norm": 58.09372311927322, "learning_rate": 9.830314143736292e-06, "loss": 0.1641, "step": 621 }, { "epoch": 0.44, "grad_norm": 28.97753294613849, "learning_rate": 9.829566790361615e-06, "loss": 0.1344, "step": 622 }, { "epoch": 0.44, "grad_norm": 49.28980917332516, "learning_rate": 9.828817823338903e-06, "loss": 0.1614, "step": 623 }, { "epoch": 0.45, "grad_norm": 32.04413626104762, "learning_rate": 9.828067242918402e-06, "loss": 0.1404, "step": 624 }, { "epoch": 0.45, "grad_norm": 4.78476590401764, "learning_rate": 9.827315049350895e-06, "loss": 0.1169, "step": 625 }, { "epoch": 0.45, "grad_norm": 81.63675673285175, "learning_rate": 9.826561242887704e-06, "loss": 0.1953, "step": 626 }, { "epoch": 0.45, "grad_norm": 48.11606730737452, "learning_rate": 9.825805823780687e-06, "loss": 0.1619, "step": 627 }, { "epoch": 0.45, "grad_norm": 3.907109159794875, "learning_rate": 9.825048792282247e-06, "loss": 0.131, "step": 628 }, { "epoch": 0.45, "grad_norm": 11.103553566829753, "learning_rate": 9.824290148645322e-06, "loss": 0.1139, "step": 629 }, { "epoch": 0.45, "grad_norm": 51.12278081641119, "learning_rate": 9.823529893123384e-06, "loss": 0.1527, "step": 630 }, { "epoch": 0.45, "grad_norm": 64.29064695958792, "learning_rate": 9.822768025970456e-06, "loss": 0.1838, "step": 631 }, { "epoch": 0.45, "grad_norm": 22.424660906876873, "learning_rate": 9.822004547441088e-06, "loss": 0.1398, "step": 632 }, { "epoch": 0.45, "grad_norm": 5.706055561297325, "learning_rate": 9.821239457790373e-06, "loss": 0.1428, "step": 633 }, { "epoch": 0.45, "grad_norm": 59.7429449871699, "learning_rate": 9.82047275727394e-06, "loss": 0.176, "step": 634 }, { "epoch": 0.45, "grad_norm": 67.77135894876831, "learning_rate": 9.81970444614796e-06, "loss": 0.1714, "step": 635 }, { "epoch": 0.45, "grad_norm": 10.668865071226387, "learning_rate": 9.81893452466914e-06, "loss": 0.1267, "step": 636 }, { "epoch": 0.45, "grad_norm": 44.76001772659544, "learning_rate": 9.818162993094724e-06, "loss": 0.1423, "step": 637 }, { "epoch": 0.46, "grad_norm": 53.39779453147049, "learning_rate": 9.817389851682494e-06, "loss": 0.1842, "step": 638 }, { "epoch": 0.46, "grad_norm": 24.28617527330543, "learning_rate": 9.816615100690773e-06, "loss": 0.1235, "step": 639 }, { "epoch": 0.46, "grad_norm": 19.154914114241624, "learning_rate": 9.81583874037842e-06, "loss": 0.1217, "step": 640 }, { "epoch": 0.46, "grad_norm": 10.613120002801404, "learning_rate": 9.815060771004831e-06, "loss": 0.1311, "step": 641 }, { "epoch": 0.46, "grad_norm": 27.240227333671324, "learning_rate": 9.81428119282994e-06, "loss": 0.1553, "step": 642 }, { "epoch": 0.46, "grad_norm": 21.0473186526304, "learning_rate": 9.813500006114216e-06, "loss": 0.1239, "step": 643 }, { "epoch": 0.46, "grad_norm": 15.104115765634024, "learning_rate": 9.812717211118673e-06, "loss": 0.1184, "step": 644 }, { "epoch": 0.46, "grad_norm": 12.249235905236901, "learning_rate": 9.811932808104852e-06, "loss": 0.1505, "step": 645 }, { "epoch": 0.46, "grad_norm": 5.4945638837559105, "learning_rate": 9.811146797334838e-06, "loss": 0.1547, "step": 646 }, { "epoch": 0.46, "grad_norm": 15.362754513773162, "learning_rate": 9.810359179071255e-06, "loss": 0.1442, "step": 647 }, { "epoch": 0.46, "grad_norm": 19.255632442668595, "learning_rate": 9.809569953577258e-06, "loss": 0.0961, "step": 648 }, { "epoch": 0.46, "grad_norm": 8.391685162385883, "learning_rate": 9.808779121116542e-06, "loss": 0.1421, "step": 649 }, { "epoch": 0.46, "grad_norm": 15.810940924831158, "learning_rate": 9.807986681953341e-06, "loss": 0.1198, "step": 650 }, { "epoch": 0.46, "grad_norm": 9.376506844728723, "learning_rate": 9.807192636352422e-06, "loss": 0.1383, "step": 651 }, { "epoch": 0.47, "grad_norm": 17.64775555566848, "learning_rate": 9.80639698457909e-06, "loss": 0.108, "step": 652 }, { "epoch": 0.47, "grad_norm": 19.559096284384445, "learning_rate": 9.805599726899188e-06, "loss": 0.1316, "step": 653 }, { "epoch": 0.47, "grad_norm": 9.461717264545886, "learning_rate": 9.804800863579094e-06, "loss": 0.1169, "step": 654 }, { "epoch": 0.47, "grad_norm": 20.446164874079532, "learning_rate": 9.804000394885723e-06, "loss": 0.13, "step": 655 }, { "epoch": 0.47, "grad_norm": 51.40477529762973, "learning_rate": 9.803198321086527e-06, "loss": 0.2056, "step": 656 }, { "epoch": 0.47, "grad_norm": 7.068665322892394, "learning_rate": 9.802394642449494e-06, "loss": 0.1346, "step": 657 }, { "epoch": 0.47, "grad_norm": 10.538054958638275, "learning_rate": 9.801589359243147e-06, "loss": 0.1362, "step": 658 }, { "epoch": 0.47, "grad_norm": 35.66887445030199, "learning_rate": 9.800782471736547e-06, "loss": 0.135, "step": 659 }, { "epoch": 0.47, "grad_norm": 31.297567953445206, "learning_rate": 9.799973980199288e-06, "loss": 0.1344, "step": 660 }, { "epoch": 0.47, "grad_norm": 22.996584888633382, "learning_rate": 9.799163884901506e-06, "loss": 0.1359, "step": 661 }, { "epoch": 0.47, "grad_norm": 38.59277267402653, "learning_rate": 9.798352186113867e-06, "loss": 0.1353, "step": 662 }, { "epoch": 0.47, "grad_norm": 43.973533584689015, "learning_rate": 9.797538884107574e-06, "loss": 0.1582, "step": 663 }, { "epoch": 0.47, "grad_norm": 32.03919442027451, "learning_rate": 9.796723979154366e-06, "loss": 0.1212, "step": 664 }, { "epoch": 0.47, "grad_norm": 5.076183766829495, "learning_rate": 9.795907471526518e-06, "loss": 0.1411, "step": 665 }, { "epoch": 0.48, "grad_norm": 53.124967231457354, "learning_rate": 9.79508936149684e-06, "loss": 0.1311, "step": 666 }, { "epoch": 0.48, "grad_norm": 39.74430808850389, "learning_rate": 9.79426964933868e-06, "loss": 0.1692, "step": 667 }, { "epoch": 0.48, "grad_norm": 9.484329880517071, "learning_rate": 9.793448335325919e-06, "loss": 0.1332, "step": 668 }, { "epoch": 0.48, "grad_norm": 13.266390261252008, "learning_rate": 9.792625419732969e-06, "loss": 0.121, "step": 669 }, { "epoch": 0.48, "grad_norm": 22.195451826375955, "learning_rate": 9.791800902834787e-06, "loss": 0.0991, "step": 670 }, { "epoch": 0.48, "grad_norm": 28.240346235800146, "learning_rate": 9.790974784906855e-06, "loss": 0.1233, "step": 671 }, { "epoch": 0.48, "grad_norm": 5.289549119474038, "learning_rate": 9.790147066225198e-06, "loss": 0.1588, "step": 672 }, { "epoch": 0.48, "grad_norm": 40.07083822351059, "learning_rate": 9.789317747066369e-06, "loss": 0.1315, "step": 673 }, { "epoch": 0.48, "grad_norm": 17.30108556214769, "learning_rate": 9.788486827707462e-06, "loss": 0.1672, "step": 674 }, { "epoch": 0.48, "grad_norm": 14.124723292613249, "learning_rate": 9.7876543084261e-06, "loss": 0.103, "step": 675 }, { "epoch": 0.48, "grad_norm": 27.255940377336177, "learning_rate": 9.786820189500443e-06, "loss": 0.1493, "step": 676 }, { "epoch": 0.48, "grad_norm": 32.00415533490613, "learning_rate": 9.785984471209186e-06, "loss": 0.1235, "step": 677 }, { "epoch": 0.48, "grad_norm": 3.6327392186328873, "learning_rate": 9.785147153831562e-06, "loss": 0.1182, "step": 678 }, { "epoch": 0.48, "grad_norm": 4.6193887433702985, "learning_rate": 9.784308237647329e-06, "loss": 0.1451, "step": 679 }, { "epoch": 0.49, "grad_norm": 9.595692250193478, "learning_rate": 9.783467722936786e-06, "loss": 0.1777, "step": 680 }, { "epoch": 0.49, "grad_norm": 37.73689869043474, "learning_rate": 9.782625609980767e-06, "loss": 0.1315, "step": 681 }, { "epoch": 0.49, "grad_norm": 52.16154314059323, "learning_rate": 9.781781899060635e-06, "loss": 0.1628, "step": 682 }, { "epoch": 0.49, "grad_norm": 4.752406905653005, "learning_rate": 9.78093659045829e-06, "loss": 0.1372, "step": 683 }, { "epoch": 0.49, "grad_norm": 32.17012777708828, "learning_rate": 9.780089684456164e-06, "loss": 0.1354, "step": 684 }, { "epoch": 0.49, "grad_norm": 30.835260778351316, "learning_rate": 9.779241181337228e-06, "loss": 0.1133, "step": 685 }, { "epoch": 0.49, "grad_norm": 32.630866103987174, "learning_rate": 9.778391081384979e-06, "loss": 0.1271, "step": 686 }, { "epoch": 0.49, "grad_norm": 23.691240269860604, "learning_rate": 9.777539384883453e-06, "loss": 0.1061, "step": 687 }, { "epoch": 0.49, "grad_norm": 10.821205705393695, "learning_rate": 9.776686092117216e-06, "loss": 0.1611, "step": 688 }, { "epoch": 0.49, "grad_norm": 28.128381007348157, "learning_rate": 9.775831203371371e-06, "loss": 0.1252, "step": 689 }, { "epoch": 0.49, "grad_norm": 62.024712894378, "learning_rate": 9.774974718931551e-06, "loss": 0.2048, "step": 690 }, { "epoch": 0.49, "grad_norm": 30.29155638874213, "learning_rate": 9.774116639083923e-06, "loss": 0.1371, "step": 691 }, { "epoch": 0.49, "grad_norm": 17.014556677775154, "learning_rate": 9.773256964115189e-06, "loss": 0.0955, "step": 692 }, { "epoch": 0.49, "grad_norm": 55.463045257964794, "learning_rate": 9.772395694312583e-06, "loss": 0.1831, "step": 693 }, { "epoch": 0.5, "grad_norm": 24.022624545256317, "learning_rate": 9.771532829963865e-06, "loss": 0.1633, "step": 694 }, { "epoch": 0.5, "grad_norm": 20.571501004321036, "learning_rate": 9.770668371357344e-06, "loss": 0.1271, "step": 695 }, { "epoch": 0.5, "grad_norm": 30.701880427913235, "learning_rate": 9.769802318781842e-06, "loss": 0.1296, "step": 696 }, { "epoch": 0.5, "grad_norm": 25.625148935174835, "learning_rate": 9.76893467252673e-06, "loss": 0.1271, "step": 697 }, { "epoch": 0.5, "grad_norm": 5.240286750024976, "learning_rate": 9.768065432881903e-06, "loss": 0.1227, "step": 698 }, { "epoch": 0.5, "grad_norm": 19.49975018445689, "learning_rate": 9.767194600137789e-06, "loss": 0.1124, "step": 699 }, { "epoch": 0.5, "grad_norm": 7.416082012842279, "learning_rate": 9.766322174585347e-06, "loss": 0.1313, "step": 700 }, { "epoch": 0.5, "grad_norm": 15.26643688822226, "learning_rate": 9.765448156516077e-06, "loss": 0.1049, "step": 701 }, { "epoch": 0.5, "grad_norm": 23.46621543879915, "learning_rate": 9.764572546222e-06, "loss": 0.1229, "step": 702 }, { "epoch": 0.5, "grad_norm": 20.60346076669723, "learning_rate": 9.763695343995674e-06, "loss": 0.1364, "step": 703 }, { "epoch": 0.5, "grad_norm": 14.594095858724636, "learning_rate": 9.762816550130192e-06, "loss": 0.0992, "step": 704 }, { "epoch": 0.5, "grad_norm": 21.631405632050786, "learning_rate": 9.76193616491917e-06, "loss": 0.1521, "step": 705 }, { "epoch": 0.5, "grad_norm": 8.107810318183903, "learning_rate": 9.761054188656766e-06, "loss": 0.1497, "step": 706 }, { "epoch": 0.5, "grad_norm": 13.5375760091303, "learning_rate": 9.760170621637661e-06, "loss": 0.1255, "step": 707 }, { "epoch": 0.51, "grad_norm": 10.731462678449901, "learning_rate": 9.759285464157073e-06, "loss": 0.1245, "step": 708 }, { "epoch": 0.51, "grad_norm": 3.6574842453813687, "learning_rate": 9.758398716510751e-06, "loss": 0.1086, "step": 709 }, { "epoch": 0.51, "grad_norm": 30.727691788456823, "learning_rate": 9.75751037899497e-06, "loss": 0.1281, "step": 710 }, { "epoch": 0.51, "grad_norm": 27.202375481436885, "learning_rate": 9.756620451906543e-06, "loss": 0.1276, "step": 711 }, { "epoch": 0.51, "grad_norm": 18.595561761908904, "learning_rate": 9.75572893554281e-06, "loss": 0.1384, "step": 712 }, { "epoch": 0.51, "grad_norm": 14.144175158912619, "learning_rate": 9.754835830201645e-06, "loss": 0.1586, "step": 713 }, { "epoch": 0.51, "grad_norm": 37.849196456539325, "learning_rate": 9.753941136181448e-06, "loss": 0.145, "step": 714 }, { "epoch": 0.51, "grad_norm": 35.69633971283892, "learning_rate": 9.753044853781155e-06, "loss": 0.1268, "step": 715 }, { "epoch": 0.51, "grad_norm": 7.152193957134935, "learning_rate": 9.75214698330023e-06, "loss": 0.1831, "step": 716 }, { "epoch": 0.51, "grad_norm": 27.653182844039453, "learning_rate": 9.751247525038669e-06, "loss": 0.1306, "step": 717 }, { "epoch": 0.51, "grad_norm": 27.593230490233363, "learning_rate": 9.750346479296998e-06, "loss": 0.1471, "step": 718 }, { "epoch": 0.51, "grad_norm": 4.305679091229352, "learning_rate": 9.74944384637627e-06, "loss": 0.1173, "step": 719 }, { "epoch": 0.51, "grad_norm": 27.29753578820685, "learning_rate": 9.748539626578076e-06, "loss": 0.1168, "step": 720 }, { "epoch": 0.51, "grad_norm": 5.465815777296448, "learning_rate": 9.747633820204527e-06, "loss": 0.1176, "step": 721 }, { "epoch": 0.52, "grad_norm": 12.977436523645396, "learning_rate": 9.746726427558276e-06, "loss": 0.1294, "step": 722 }, { "epoch": 0.52, "grad_norm": 8.50325075384258, "learning_rate": 9.745817448942496e-06, "loss": 0.1541, "step": 723 }, { "epoch": 0.52, "grad_norm": 25.299637458129375, "learning_rate": 9.744906884660894e-06, "loss": 0.146, "step": 724 }, { "epoch": 0.52, "grad_norm": 29.392294563570957, "learning_rate": 9.743994735017708e-06, "loss": 0.1144, "step": 725 }, { "epoch": 0.52, "grad_norm": 9.051587116954162, "learning_rate": 9.743081000317703e-06, "loss": 0.1433, "step": 726 }, { "epoch": 0.52, "grad_norm": 38.587545048429, "learning_rate": 9.742165680866173e-06, "loss": 0.1388, "step": 727 }, { "epoch": 0.52, "grad_norm": 23.864218562404943, "learning_rate": 9.741248776968947e-06, "loss": 0.1458, "step": 728 }, { "epoch": 0.52, "grad_norm": 17.858968253120967, "learning_rate": 9.740330288932379e-06, "loss": 0.1136, "step": 729 }, { "epoch": 0.52, "grad_norm": 5.168166530378955, "learning_rate": 9.73941021706335e-06, "loss": 0.1274, "step": 730 }, { "epoch": 0.52, "grad_norm": 20.348246500543265, "learning_rate": 9.738488561669272e-06, "loss": 0.1316, "step": 731 }, { "epoch": 0.52, "grad_norm": 31.053717210440414, "learning_rate": 9.737565323058094e-06, "loss": 0.1594, "step": 732 }, { "epoch": 0.52, "grad_norm": 21.28339041051843, "learning_rate": 9.736640501538281e-06, "loss": 0.1228, "step": 733 }, { "epoch": 0.52, "grad_norm": 15.286309070528286, "learning_rate": 9.735714097418835e-06, "loss": 0.1377, "step": 734 }, { "epoch": 0.52, "grad_norm": 27.796625488655682, "learning_rate": 9.734786111009287e-06, "loss": 0.1254, "step": 735 }, { "epoch": 0.53, "grad_norm": 7.5056897186412606, "learning_rate": 9.73385654261969e-06, "loss": 0.0974, "step": 736 }, { "epoch": 0.53, "grad_norm": 4.791484316668564, "learning_rate": 9.732925392560634e-06, "loss": 0.131, "step": 737 }, { "epoch": 0.53, "grad_norm": 14.411536482139912, "learning_rate": 9.731992661143233e-06, "loss": 0.1107, "step": 738 }, { "epoch": 0.53, "grad_norm": 15.29447975137697, "learning_rate": 9.731058348679128e-06, "loss": 0.1244, "step": 739 }, { "epoch": 0.53, "grad_norm": 26.753002422549987, "learning_rate": 9.73012245548049e-06, "loss": 0.1729, "step": 740 }, { "epoch": 0.53, "grad_norm": 15.269880803641675, "learning_rate": 9.729184981860023e-06, "loss": 0.1367, "step": 741 }, { "epoch": 0.53, "grad_norm": 23.286568445497416, "learning_rate": 9.728245928130949e-06, "loss": 0.1204, "step": 742 }, { "epoch": 0.53, "grad_norm": 12.124695582884476, "learning_rate": 9.727305294607024e-06, "loss": 0.1174, "step": 743 }, { "epoch": 0.53, "grad_norm": 5.340039718599179, "learning_rate": 9.726363081602532e-06, "loss": 0.1272, "step": 744 }, { "epoch": 0.53, "grad_norm": 8.03978216102685, "learning_rate": 9.725419289432287e-06, "loss": 0.1349, "step": 745 }, { "epoch": 0.53, "grad_norm": 12.900890675498434, "learning_rate": 9.724473918411624e-06, "loss": 0.1295, "step": 746 }, { "epoch": 0.53, "grad_norm": 12.021350602091163, "learning_rate": 9.723526968856408e-06, "loss": 0.1057, "step": 747 }, { "epoch": 0.53, "grad_norm": 10.989225561921891, "learning_rate": 9.722578441083035e-06, "loss": 0.0867, "step": 748 }, { "epoch": 0.53, "grad_norm": 17.269981813172723, "learning_rate": 9.721628335408423e-06, "loss": 0.1116, "step": 749 }, { "epoch": 0.54, "grad_norm": 30.291178256649264, "learning_rate": 9.720676652150025e-06, "loss": 0.1224, "step": 750 }, { "epoch": 0.54, "grad_norm": 6.188545883869058, "learning_rate": 9.719723391625813e-06, "loss": 0.1566, "step": 751 }, { "epoch": 0.54, "grad_norm": 11.183215999542815, "learning_rate": 9.718768554154287e-06, "loss": 0.1754, "step": 752 }, { "epoch": 0.54, "grad_norm": 12.300039794356254, "learning_rate": 9.717812140054479e-06, "loss": 0.1091, "step": 753 }, { "epoch": 0.54, "grad_norm": 43.092405088706, "learning_rate": 9.716854149645945e-06, "loss": 0.1327, "step": 754 }, { "epoch": 0.54, "grad_norm": 14.989995368482804, "learning_rate": 9.715894583248764e-06, "loss": 0.1329, "step": 755 }, { "epoch": 0.54, "grad_norm": 4.450637838006689, "learning_rate": 9.714933441183549e-06, "loss": 0.1046, "step": 756 }, { "epoch": 0.54, "grad_norm": 4.370431682700567, "learning_rate": 9.713970723771432e-06, "loss": 0.094, "step": 757 }, { "epoch": 0.54, "grad_norm": 5.439106294822273, "learning_rate": 9.713006431334076e-06, "loss": 0.1075, "step": 758 }, { "epoch": 0.54, "grad_norm": 6.568446270079447, "learning_rate": 9.71204056419367e-06, "loss": 0.1646, "step": 759 }, { "epoch": 0.54, "grad_norm": 12.719463921354748, "learning_rate": 9.711073122672928e-06, "loss": 0.1296, "step": 760 }, { "epoch": 0.54, "grad_norm": 32.803026706716224, "learning_rate": 9.71010410709509e-06, "loss": 0.1307, "step": 761 }, { "epoch": 0.54, "grad_norm": 9.96453423011384, "learning_rate": 9.70913351778392e-06, "loss": 0.1228, "step": 762 }, { "epoch": 0.54, "grad_norm": 20.833929487014423, "learning_rate": 9.708161355063714e-06, "loss": 0.1479, "step": 763 }, { "epoch": 0.55, "grad_norm": 39.33943975858833, "learning_rate": 9.707187619259286e-06, "loss": 0.123, "step": 764 }, { "epoch": 0.55, "grad_norm": 7.376268195351014, "learning_rate": 9.706212310695981e-06, "loss": 0.125, "step": 765 }, { "epoch": 0.55, "grad_norm": 9.440271953630097, "learning_rate": 9.705235429699666e-06, "loss": 0.1115, "step": 766 }, { "epoch": 0.55, "grad_norm": 20.249022741934898, "learning_rate": 9.704256976596737e-06, "loss": 0.1263, "step": 767 }, { "epoch": 0.55, "grad_norm": 26.41981337531156, "learning_rate": 9.703276951714114e-06, "loss": 0.1115, "step": 768 }, { "epoch": 0.55, "grad_norm": 36.217352696254, "learning_rate": 9.70229535537924e-06, "loss": 0.0952, "step": 769 }, { "epoch": 0.55, "grad_norm": 14.397083314980746, "learning_rate": 9.701312187920084e-06, "loss": 0.1769, "step": 770 }, { "epoch": 0.55, "grad_norm": 26.0325827243847, "learning_rate": 9.700327449665143e-06, "loss": 0.1141, "step": 771 }, { "epoch": 0.55, "grad_norm": 36.351045967305154, "learning_rate": 9.699341140943434e-06, "loss": 0.1384, "step": 772 }, { "epoch": 0.55, "grad_norm": 23.25267147893353, "learning_rate": 9.698353262084501e-06, "loss": 0.1324, "step": 773 }, { "epoch": 0.55, "grad_norm": 8.482877435007008, "learning_rate": 9.697363813418414e-06, "loss": 0.1206, "step": 774 }, { "epoch": 0.55, "grad_norm": 12.560594535886262, "learning_rate": 9.696372795275766e-06, "loss": 0.1587, "step": 775 }, { "epoch": 0.55, "grad_norm": 14.141509443672007, "learning_rate": 9.695380207987675e-06, "loss": 0.0968, "step": 776 }, { "epoch": 0.55, "grad_norm": 23.18138524562932, "learning_rate": 9.69438605188578e-06, "loss": 0.1436, "step": 777 }, { "epoch": 0.56, "grad_norm": 18.79428105301295, "learning_rate": 9.69339032730225e-06, "loss": 0.1552, "step": 778 }, { "epoch": 0.56, "grad_norm": 30.44553819976487, "learning_rate": 9.692393034569776e-06, "loss": 0.1146, "step": 779 }, { "epoch": 0.56, "grad_norm": 6.399140231533663, "learning_rate": 9.69139417402157e-06, "loss": 0.1072, "step": 780 }, { "epoch": 0.56, "grad_norm": 19.39038831015829, "learning_rate": 9.690393745991368e-06, "loss": 0.1361, "step": 781 }, { "epoch": 0.56, "grad_norm": 16.807652464179398, "learning_rate": 9.689391750813436e-06, "loss": 0.1516, "step": 782 }, { "epoch": 0.56, "grad_norm": 12.595719018779906, "learning_rate": 9.688388188822556e-06, "loss": 0.1456, "step": 783 }, { "epoch": 0.56, "grad_norm": 78.16077311228811, "learning_rate": 9.687383060354038e-06, "loss": 0.2327, "step": 784 }, { "epoch": 0.56, "grad_norm": 12.822682377103986, "learning_rate": 9.686376365743714e-06, "loss": 0.1251, "step": 785 }, { "epoch": 0.56, "grad_norm": 35.69822535004326, "learning_rate": 9.685368105327938e-06, "loss": 0.1688, "step": 786 }, { "epoch": 0.56, "grad_norm": 27.540323241637438, "learning_rate": 9.684358279443593e-06, "loss": 0.1223, "step": 787 }, { "epoch": 0.56, "grad_norm": 52.99880117052681, "learning_rate": 9.683346888428074e-06, "loss": 0.1387, "step": 788 }, { "epoch": 0.56, "grad_norm": 38.310273349733706, "learning_rate": 9.68233393261931e-06, "loss": 0.156, "step": 789 }, { "epoch": 0.56, "grad_norm": 34.726834784529174, "learning_rate": 9.681319412355748e-06, "loss": 0.0992, "step": 790 }, { "epoch": 0.56, "grad_norm": 68.29196974086027, "learning_rate": 9.680303327976356e-06, "loss": 0.1697, "step": 791 }, { "epoch": 0.57, "grad_norm": 44.244078950175414, "learning_rate": 9.679285679820628e-06, "loss": 0.1471, "step": 792 }, { "epoch": 0.57, "grad_norm": 10.851951807112226, "learning_rate": 9.67826646822858e-06, "loss": 0.1339, "step": 793 }, { "epoch": 0.57, "grad_norm": 49.15989066226097, "learning_rate": 9.677245693540749e-06, "loss": 0.1746, "step": 794 }, { "epoch": 0.57, "grad_norm": 29.52416076684413, "learning_rate": 9.676223356098194e-06, "loss": 0.1154, "step": 795 }, { "epoch": 0.57, "grad_norm": 9.38163332741748, "learning_rate": 9.675199456242499e-06, "loss": 0.1305, "step": 796 }, { "epoch": 0.57, "grad_norm": 10.925248900398602, "learning_rate": 9.674173994315764e-06, "loss": 0.1724, "step": 797 }, { "epoch": 0.57, "grad_norm": 17.954529179430356, "learning_rate": 9.67314697066062e-06, "loss": 0.1324, "step": 798 }, { "epoch": 0.57, "grad_norm": 19.573255779518618, "learning_rate": 9.672118385620209e-06, "loss": 0.1199, "step": 799 }, { "epoch": 0.57, "grad_norm": 45.10260866334031, "learning_rate": 9.671088239538204e-06, "loss": 0.168, "step": 800 }, { "epoch": 0.57, "grad_norm": 4.096128376621521, "learning_rate": 9.670056532758798e-06, "loss": 0.113, "step": 801 }, { "epoch": 0.57, "grad_norm": 41.288709677166764, "learning_rate": 9.669023265626698e-06, "loss": 0.1699, "step": 802 }, { "epoch": 0.57, "grad_norm": 50.206876725438384, "learning_rate": 9.66798843848714e-06, "loss": 0.1154, "step": 803 }, { "epoch": 0.57, "grad_norm": 18.766537968172486, "learning_rate": 9.666952051685882e-06, "loss": 0.1078, "step": 804 }, { "epoch": 0.57, "grad_norm": 27.654720064496317, "learning_rate": 9.665914105569196e-06, "loss": 0.1472, "step": 805 }, { "epoch": 0.58, "grad_norm": 30.465348594344608, "learning_rate": 9.664874600483883e-06, "loss": 0.1125, "step": 806 }, { "epoch": 0.58, "grad_norm": 35.45942332075495, "learning_rate": 9.663833536777256e-06, "loss": 0.1239, "step": 807 }, { "epoch": 0.58, "grad_norm": 4.15194949775956, "learning_rate": 9.662790914797158e-06, "loss": 0.1382, "step": 808 }, { "epoch": 0.58, "grad_norm": 5.445521315197055, "learning_rate": 9.661746734891947e-06, "loss": 0.1438, "step": 809 }, { "epoch": 0.58, "grad_norm": 18.926741667241487, "learning_rate": 9.6607009974105e-06, "loss": 0.1327, "step": 810 }, { "epoch": 0.58, "grad_norm": 34.07683918961295, "learning_rate": 9.659653702702223e-06, "loss": 0.1337, "step": 811 }, { "epoch": 0.58, "grad_norm": 31.666273539929072, "learning_rate": 9.658604851117032e-06, "loss": 0.1421, "step": 812 }, { "epoch": 0.58, "grad_norm": 7.290269985502701, "learning_rate": 9.65755444300537e-06, "loss": 0.1259, "step": 813 }, { "epoch": 0.58, "grad_norm": 20.748229179136143, "learning_rate": 9.656502478718197e-06, "loss": 0.1207, "step": 814 }, { "epoch": 0.58, "grad_norm": 18.76253063384707, "learning_rate": 9.655448958606994e-06, "loss": 0.1289, "step": 815 }, { "epoch": 0.58, "grad_norm": 47.068157538168485, "learning_rate": 9.654393883023763e-06, "loss": 0.1449, "step": 816 }, { "epoch": 0.58, "grad_norm": 16.508111889804976, "learning_rate": 9.653337252321023e-06, "loss": 0.137, "step": 817 }, { "epoch": 0.58, "grad_norm": 6.161976884102493, "learning_rate": 9.652279066851811e-06, "loss": 0.126, "step": 818 }, { "epoch": 0.58, "grad_norm": 69.48129884052561, "learning_rate": 9.651219326969694e-06, "loss": 0.179, "step": 819 }, { "epoch": 0.59, "grad_norm": 25.548064760384744, "learning_rate": 9.650158033028743e-06, "loss": 0.1292, "step": 820 }, { "epoch": 0.59, "grad_norm": 22.848051753734424, "learning_rate": 9.64909518538356e-06, "loss": 0.1185, "step": 821 }, { "epoch": 0.59, "grad_norm": 38.528184137095906, "learning_rate": 9.648030784389264e-06, "loss": 0.1333, "step": 822 }, { "epoch": 0.59, "grad_norm": 40.289153016277766, "learning_rate": 9.646964830401487e-06, "loss": 0.1868, "step": 823 }, { "epoch": 0.59, "grad_norm": 28.63000932230696, "learning_rate": 9.645897323776386e-06, "loss": 0.1309, "step": 824 }, { "epoch": 0.59, "grad_norm": 29.390175324133896, "learning_rate": 9.644828264870634e-06, "loss": 0.1494, "step": 825 }, { "epoch": 0.59, "grad_norm": 29.520098412687467, "learning_rate": 9.643757654041423e-06, "loss": 0.1147, "step": 826 }, { "epoch": 0.59, "grad_norm": 30.864194139048347, "learning_rate": 9.642685491646467e-06, "loss": 0.1078, "step": 827 }, { "epoch": 0.59, "grad_norm": 42.21262734452298, "learning_rate": 9.641611778043992e-06, "loss": 0.1384, "step": 828 }, { "epoch": 0.59, "grad_norm": 12.587684898717356, "learning_rate": 9.64053651359275e-06, "loss": 0.1387, "step": 829 }, { "epoch": 0.59, "grad_norm": 36.28856657814182, "learning_rate": 9.639459698652e-06, "loss": 0.1475, "step": 830 }, { "epoch": 0.59, "grad_norm": 40.3795735401778, "learning_rate": 9.63838133358153e-06, "loss": 0.1454, "step": 831 }, { "epoch": 0.59, "grad_norm": 56.58394981425765, "learning_rate": 9.637301418741643e-06, "loss": 0.1733, "step": 832 }, { "epoch": 0.59, "grad_norm": 9.372522640119524, "learning_rate": 9.636219954493157e-06, "loss": 0.1172, "step": 833 }, { "epoch": 0.6, "grad_norm": 5.10785762912234, "learning_rate": 9.635136941197409e-06, "loss": 0.1244, "step": 834 }, { "epoch": 0.6, "grad_norm": 47.76352747100944, "learning_rate": 9.634052379216256e-06, "loss": 0.1663, "step": 835 }, { "epoch": 0.6, "grad_norm": 27.543918826169143, "learning_rate": 9.632966268912067e-06, "loss": 0.1227, "step": 836 }, { "epoch": 0.6, "grad_norm": 27.936130596657357, "learning_rate": 9.631878610647734e-06, "loss": 0.14, "step": 837 }, { "epoch": 0.6, "grad_norm": 35.17792432965117, "learning_rate": 9.630789404786664e-06, "loss": 0.156, "step": 838 }, { "epoch": 0.6, "grad_norm": 28.829265986491016, "learning_rate": 9.629698651692779e-06, "loss": 0.1443, "step": 839 }, { "epoch": 0.6, "grad_norm": 24.10923790760327, "learning_rate": 9.62860635173052e-06, "loss": 0.1364, "step": 840 }, { "epoch": 0.6, "grad_norm": 4.678265188180967, "learning_rate": 9.627512505264847e-06, "loss": 0.1251, "step": 841 }, { "epoch": 0.6, "grad_norm": 30.518872167549794, "learning_rate": 9.626417112661233e-06, "loss": 0.1193, "step": 842 }, { "epoch": 0.6, "grad_norm": 4.30248553766188, "learning_rate": 9.62532017428567e-06, "loss": 0.1294, "step": 843 }, { "epoch": 0.6, "grad_norm": 8.95928112006393, "learning_rate": 9.624221690504663e-06, "loss": 0.1318, "step": 844 }, { "epoch": 0.6, "grad_norm": 9.975229344522406, "learning_rate": 9.623121661685239e-06, "loss": 0.1351, "step": 845 }, { "epoch": 0.6, "grad_norm": 5.673008949146638, "learning_rate": 9.622020088194934e-06, "loss": 0.1339, "step": 846 }, { "epoch": 0.6, "grad_norm": 5.7187307082075485, "learning_rate": 9.62091697040181e-06, "loss": 0.109, "step": 847 }, { "epoch": 0.61, "grad_norm": 19.086793156437807, "learning_rate": 9.619812308674434e-06, "loss": 0.1528, "step": 848 }, { "epoch": 0.61, "grad_norm": 15.448285601263763, "learning_rate": 9.618706103381896e-06, "loss": 0.137, "step": 849 }, { "epoch": 0.61, "grad_norm": 8.43458261032492, "learning_rate": 9.6175983548938e-06, "loss": 0.1368, "step": 850 }, { "epoch": 0.61, "grad_norm": 6.695331340739746, "learning_rate": 9.616489063580265e-06, "loss": 0.1494, "step": 851 }, { "epoch": 0.61, "grad_norm": 16.36884007886458, "learning_rate": 9.615378229811927e-06, "loss": 0.098, "step": 852 }, { "epoch": 0.61, "grad_norm": 6.709638349793122, "learning_rate": 9.614265853959935e-06, "loss": 0.1013, "step": 853 }, { "epoch": 0.61, "grad_norm": 13.96540985996891, "learning_rate": 9.613151936395952e-06, "loss": 0.1692, "step": 854 }, { "epoch": 0.61, "grad_norm": 4.1809794075417095, "learning_rate": 9.612036477492163e-06, "loss": 0.1151, "step": 855 }, { "epoch": 0.61, "grad_norm": 31.864538660439695, "learning_rate": 9.610919477621262e-06, "loss": 0.094, "step": 856 }, { "epoch": 0.61, "grad_norm": 11.028949180354989, "learning_rate": 9.609800937156459e-06, "loss": 0.1671, "step": 857 }, { "epoch": 0.61, "grad_norm": 13.125900295808705, "learning_rate": 9.60868085647148e-06, "loss": 0.1697, "step": 858 }, { "epoch": 0.61, "grad_norm": 26.03409453086144, "learning_rate": 9.607559235940562e-06, "loss": 0.1094, "step": 859 }, { "epoch": 0.61, "grad_norm": 14.356106276553628, "learning_rate": 9.60643607593846e-06, "loss": 0.1176, "step": 860 }, { "epoch": 0.61, "grad_norm": 10.48182805841283, "learning_rate": 9.605311376840446e-06, "loss": 0.1034, "step": 861 }, { "epoch": 0.62, "grad_norm": 4.737169065727766, "learning_rate": 9.604185139022302e-06, "loss": 0.1119, "step": 862 }, { "epoch": 0.62, "grad_norm": 11.543502918761671, "learning_rate": 9.603057362860323e-06, "loss": 0.1512, "step": 863 }, { "epoch": 0.62, "grad_norm": 13.137753210423396, "learning_rate": 9.60192804873132e-06, "loss": 0.0869, "step": 864 }, { "epoch": 0.62, "grad_norm": 8.238531279627713, "learning_rate": 9.60079719701262e-06, "loss": 0.1251, "step": 865 }, { "epoch": 0.62, "grad_norm": 4.442828257491122, "learning_rate": 9.599664808082058e-06, "loss": 0.1073, "step": 866 }, { "epoch": 0.62, "grad_norm": 45.80139113041787, "learning_rate": 9.598530882317992e-06, "loss": 0.1849, "step": 867 }, { "epoch": 0.62, "grad_norm": 5.013274249831894, "learning_rate": 9.59739542009928e-06, "loss": 0.1652, "step": 868 }, { "epoch": 0.62, "grad_norm": 20.94648720784535, "learning_rate": 9.596258421805306e-06, "loss": 0.1349, "step": 869 }, { "epoch": 0.62, "grad_norm": 20.308753874252467, "learning_rate": 9.595119887815962e-06, "loss": 0.1345, "step": 870 }, { "epoch": 0.62, "grad_norm": 30.38587830318456, "learning_rate": 9.593979818511655e-06, "loss": 0.1326, "step": 871 }, { "epoch": 0.62, "grad_norm": 7.113951857489782, "learning_rate": 9.592838214273298e-06, "loss": 0.1516, "step": 872 }, { "epoch": 0.62, "grad_norm": 4.08267552970721, "learning_rate": 9.591695075482326e-06, "loss": 0.1118, "step": 873 }, { "epoch": 0.62, "grad_norm": 21.290502900984546, "learning_rate": 9.590550402520683e-06, "loss": 0.1262, "step": 874 }, { "epoch": 0.62, "grad_norm": 40.747754501062765, "learning_rate": 9.589404195770821e-06, "loss": 0.145, "step": 875 }, { "epoch": 0.63, "grad_norm": 5.363040561381, "learning_rate": 9.588256455615716e-06, "loss": 0.1309, "step": 876 }, { "epoch": 0.63, "grad_norm": 9.164698662773475, "learning_rate": 9.587107182438846e-06, "loss": 0.1272, "step": 877 }, { "epoch": 0.63, "grad_norm": 46.7965238441854, "learning_rate": 9.585956376624204e-06, "loss": 0.1318, "step": 878 }, { "epoch": 0.63, "grad_norm": 8.889294742395169, "learning_rate": 9.584804038556297e-06, "loss": 0.1427, "step": 879 }, { "epoch": 0.63, "grad_norm": 8.060406285399305, "learning_rate": 9.58365016862014e-06, "loss": 0.1256, "step": 880 }, { "epoch": 0.63, "grad_norm": 40.00127209719933, "learning_rate": 9.582494767201265e-06, "loss": 0.1309, "step": 881 }, { "epoch": 0.63, "grad_norm": 13.84245575000966, "learning_rate": 9.581337834685713e-06, "loss": 0.104, "step": 882 }, { "epoch": 0.63, "grad_norm": 23.402757032695074, "learning_rate": 9.580179371460034e-06, "loss": 0.1289, "step": 883 }, { "epoch": 0.63, "grad_norm": 24.43621959954114, "learning_rate": 9.579019377911296e-06, "loss": 0.1401, "step": 884 }, { "epoch": 0.63, "grad_norm": 9.753446651852203, "learning_rate": 9.57785785442707e-06, "loss": 0.1206, "step": 885 }, { "epoch": 0.63, "grad_norm": 9.301431458818122, "learning_rate": 9.576694801395447e-06, "loss": 0.1083, "step": 886 }, { "epoch": 0.63, "grad_norm": 15.99251554227728, "learning_rate": 9.57553021920502e-06, "loss": 0.1708, "step": 887 }, { "epoch": 0.63, "grad_norm": 27.354550811963918, "learning_rate": 9.574364108244903e-06, "loss": 0.1095, "step": 888 }, { "epoch": 0.63, "grad_norm": 14.453620875280626, "learning_rate": 9.573196468904711e-06, "loss": 0.1439, "step": 889 }, { "epoch": 0.64, "grad_norm": 23.906822227072045, "learning_rate": 9.572027301574576e-06, "loss": 0.177, "step": 890 }, { "epoch": 0.64, "grad_norm": 40.74283625046122, "learning_rate": 9.570856606645139e-06, "loss": 0.1349, "step": 891 }, { "epoch": 0.64, "grad_norm": 32.77881193498861, "learning_rate": 9.569684384507547e-06, "loss": 0.1088, "step": 892 }, { "epoch": 0.64, "grad_norm": 9.788598652893294, "learning_rate": 9.568510635553466e-06, "loss": 0.1145, "step": 893 }, { "epoch": 0.64, "grad_norm": 5.486909948066521, "learning_rate": 9.567335360175065e-06, "loss": 0.131, "step": 894 }, { "epoch": 0.64, "grad_norm": 6.250156911609917, "learning_rate": 9.566158558765026e-06, "loss": 0.1603, "step": 895 }, { "epoch": 0.64, "grad_norm": 23.236140333378174, "learning_rate": 9.564980231716541e-06, "loss": 0.1176, "step": 896 }, { "epoch": 0.64, "grad_norm": 37.16329810519581, "learning_rate": 9.56380037942331e-06, "loss": 0.1315, "step": 897 }, { "epoch": 0.64, "grad_norm": 8.545747912275482, "learning_rate": 9.562619002279541e-06, "loss": 0.1576, "step": 898 }, { "epoch": 0.64, "grad_norm": 4.7428484359296625, "learning_rate": 9.561436100679959e-06, "loss": 0.1204, "step": 899 }, { "epoch": 0.64, "grad_norm": 34.822828382532826, "learning_rate": 9.56025167501979e-06, "loss": 0.1407, "step": 900 }, { "epoch": 0.64, "grad_norm": 4.274669158416529, "learning_rate": 9.559065725694775e-06, "loss": 0.1305, "step": 901 }, { "epoch": 0.64, "grad_norm": 5.2730681558009245, "learning_rate": 9.55787825310116e-06, "loss": 0.1417, "step": 902 }, { "epoch": 0.64, "grad_norm": 4.919028484956142, "learning_rate": 9.5566892576357e-06, "loss": 0.0978, "step": 903 }, { "epoch": 0.65, "grad_norm": 8.4710721096915, "learning_rate": 9.555498739695665e-06, "loss": 0.1519, "step": 904 }, { "epoch": 0.65, "grad_norm": 7.993053552415446, "learning_rate": 9.554306699678827e-06, "loss": 0.1193, "step": 905 }, { "epoch": 0.65, "grad_norm": 29.810998416607553, "learning_rate": 9.553113137983467e-06, "loss": 0.1377, "step": 906 }, { "epoch": 0.65, "grad_norm": 10.703199156948056, "learning_rate": 9.551918055008378e-06, "loss": 0.125, "step": 907 }, { "epoch": 0.65, "grad_norm": 9.792361535663362, "learning_rate": 9.55072145115286e-06, "loss": 0.1246, "step": 908 }, { "epoch": 0.65, "grad_norm": 38.43165453677999, "learning_rate": 9.54952332681672e-06, "loss": 0.1543, "step": 909 }, { "epoch": 0.65, "grad_norm": 20.810549271538743, "learning_rate": 9.54832368240027e-06, "loss": 0.1169, "step": 910 }, { "epoch": 0.65, "grad_norm": 14.216738798712827, "learning_rate": 9.54712251830434e-06, "loss": 0.1195, "step": 911 }, { "epoch": 0.65, "grad_norm": 37.7812086135415, "learning_rate": 9.545919834930257e-06, "loss": 0.1229, "step": 912 }, { "epoch": 0.65, "grad_norm": 16.088887693282086, "learning_rate": 9.54471563267986e-06, "loss": 0.1521, "step": 913 }, { "epoch": 0.65, "grad_norm": 6.086877374953741, "learning_rate": 9.543509911955497e-06, "loss": 0.1245, "step": 914 }, { "epoch": 0.65, "grad_norm": 6.884647790510014, "learning_rate": 9.542302673160021e-06, "loss": 0.1477, "step": 915 }, { "epoch": 0.65, "grad_norm": 13.471052689354279, "learning_rate": 9.541093916696793e-06, "loss": 0.1655, "step": 916 }, { "epoch": 0.65, "grad_norm": 5.860831191922153, "learning_rate": 9.539883642969681e-06, "loss": 0.0962, "step": 917 }, { "epoch": 0.66, "grad_norm": 9.950951525429048, "learning_rate": 9.53867185238306e-06, "loss": 0.1212, "step": 918 }, { "epoch": 0.66, "grad_norm": 29.397346489598785, "learning_rate": 9.53745854534181e-06, "loss": 0.1531, "step": 919 }, { "epoch": 0.66, "grad_norm": 14.700115456334249, "learning_rate": 9.536243722251321e-06, "loss": 0.1633, "step": 920 }, { "epoch": 0.66, "grad_norm": 31.465682765345253, "learning_rate": 9.53502738351749e-06, "loss": 0.1273, "step": 921 }, { "epoch": 0.66, "grad_norm": 13.342265225292849, "learning_rate": 9.533809529546716e-06, "loss": 0.0986, "step": 922 }, { "epoch": 0.66, "grad_norm": 8.06223070740461, "learning_rate": 9.532590160745906e-06, "loss": 0.1138, "step": 923 }, { "epoch": 0.66, "grad_norm": 5.891965791948293, "learning_rate": 9.531369277522475e-06, "loss": 0.1008, "step": 924 }, { "epoch": 0.66, "grad_norm": 14.385809821534846, "learning_rate": 9.530146880284343e-06, "loss": 0.1107, "step": 925 }, { "epoch": 0.66, "grad_norm": 5.594656018466403, "learning_rate": 9.528922969439935e-06, "loss": 0.1097, "step": 926 }, { "epoch": 0.66, "grad_norm": 23.884572371985932, "learning_rate": 9.527697545398183e-06, "loss": 0.1483, "step": 927 }, { "epoch": 0.66, "grad_norm": 5.235374187689162, "learning_rate": 9.526470608568521e-06, "loss": 0.1179, "step": 928 }, { "epoch": 0.66, "grad_norm": 9.498044658058308, "learning_rate": 9.525242159360897e-06, "loss": 0.1262, "step": 929 }, { "epoch": 0.66, "grad_norm": 10.326025536830267, "learning_rate": 9.524012198185755e-06, "loss": 0.1978, "step": 930 }, { "epoch": 0.66, "grad_norm": 15.950579961915821, "learning_rate": 9.522780725454048e-06, "loss": 0.1472, "step": 931 }, { "epoch": 0.67, "grad_norm": 16.911939075772374, "learning_rate": 9.521547741577232e-06, "loss": 0.1405, "step": 932 }, { "epoch": 0.67, "grad_norm": 8.840723626936537, "learning_rate": 9.520313246967277e-06, "loss": 0.1378, "step": 933 }, { "epoch": 0.67, "grad_norm": 10.194781971670784, "learning_rate": 9.519077242036643e-06, "loss": 0.1351, "step": 934 }, { "epoch": 0.67, "grad_norm": 26.571594739816774, "learning_rate": 9.517839727198306e-06, "loss": 0.1461, "step": 935 }, { "epoch": 0.67, "grad_norm": 25.488817918137755, "learning_rate": 9.516600702865742e-06, "loss": 0.1245, "step": 936 }, { "epoch": 0.67, "grad_norm": 9.428264723234935, "learning_rate": 9.51536016945293e-06, "loss": 0.1256, "step": 937 }, { "epoch": 0.67, "grad_norm": 9.398832656295712, "learning_rate": 9.514118127374358e-06, "loss": 0.118, "step": 938 }, { "epoch": 0.67, "grad_norm": 28.08087002312531, "learning_rate": 9.512874577045016e-06, "loss": 0.1302, "step": 939 }, { "epoch": 0.67, "grad_norm": 20.624474099676195, "learning_rate": 9.511629518880394e-06, "loss": 0.1273, "step": 940 }, { "epoch": 0.67, "grad_norm": 12.613846962609959, "learning_rate": 9.510382953296492e-06, "loss": 0.0986, "step": 941 }, { "epoch": 0.67, "grad_norm": 35.918107603359275, "learning_rate": 9.50913488070981e-06, "loss": 0.1638, "step": 942 }, { "epoch": 0.67, "grad_norm": 26.49689819650897, "learning_rate": 9.50788530153735e-06, "loss": 0.1265, "step": 943 }, { "epoch": 0.67, "grad_norm": 16.996031473631124, "learning_rate": 9.506634216196621e-06, "loss": 0.1117, "step": 944 }, { "epoch": 0.67, "grad_norm": 29.06015069003101, "learning_rate": 9.505381625105636e-06, "loss": 0.1605, "step": 945 }, { "epoch": 0.68, "grad_norm": 30.453905417068526, "learning_rate": 9.504127528682907e-06, "loss": 0.1602, "step": 946 }, { "epoch": 0.68, "grad_norm": 11.628167915371439, "learning_rate": 9.502871927347452e-06, "loss": 0.101, "step": 947 }, { "epoch": 0.68, "grad_norm": 10.321842222130279, "learning_rate": 9.501614821518789e-06, "loss": 0.1279, "step": 948 }, { "epoch": 0.68, "grad_norm": 4.367606237846145, "learning_rate": 9.500356211616941e-06, "loss": 0.1444, "step": 949 }, { "epoch": 0.68, "grad_norm": 52.609164817487056, "learning_rate": 9.499096098062435e-06, "loss": 0.1614, "step": 950 }, { "epoch": 0.68, "grad_norm": 4.849007077901183, "learning_rate": 9.497834481276293e-06, "loss": 0.1133, "step": 951 }, { "epoch": 0.68, "grad_norm": 6.628839587514296, "learning_rate": 9.496571361680052e-06, "loss": 0.1362, "step": 952 }, { "epoch": 0.68, "grad_norm": 22.70492961795664, "learning_rate": 9.495306739695738e-06, "loss": 0.1348, "step": 953 }, { "epoch": 0.68, "grad_norm": 23.438670549885423, "learning_rate": 9.494040615745887e-06, "loss": 0.1139, "step": 954 }, { "epoch": 0.68, "grad_norm": 3.777828080048308, "learning_rate": 9.492772990253535e-06, "loss": 0.1149, "step": 955 }, { "epoch": 0.68, "grad_norm": 17.01356783506953, "learning_rate": 9.49150386364222e-06, "loss": 0.1173, "step": 956 }, { "epoch": 0.68, "grad_norm": 20.86435129792081, "learning_rate": 9.490233236335977e-06, "loss": 0.1227, "step": 957 }, { "epoch": 0.68, "grad_norm": 21.168010427297975, "learning_rate": 9.488961108759349e-06, "loss": 0.0913, "step": 958 }, { "epoch": 0.68, "grad_norm": 12.351204194124849, "learning_rate": 9.487687481337377e-06, "loss": 0.1135, "step": 959 }, { "epoch": 0.69, "grad_norm": 4.269203529612833, "learning_rate": 9.486412354495605e-06, "loss": 0.1016, "step": 960 }, { "epoch": 0.69, "grad_norm": 5.879287993430144, "learning_rate": 9.485135728660073e-06, "loss": 0.1355, "step": 961 }, { "epoch": 0.69, "grad_norm": 19.29718867284448, "learning_rate": 9.48385760425733e-06, "loss": 0.1233, "step": 962 }, { "epoch": 0.69, "grad_norm": 10.109327624294433, "learning_rate": 9.482577981714417e-06, "loss": 0.1221, "step": 963 }, { "epoch": 0.69, "grad_norm": 11.843077711835422, "learning_rate": 9.481296861458881e-06, "loss": 0.1158, "step": 964 }, { "epoch": 0.69, "grad_norm": 6.528475309427824, "learning_rate": 9.480014243918769e-06, "loss": 0.168, "step": 965 }, { "epoch": 0.69, "grad_norm": 15.294376985491054, "learning_rate": 9.478730129522627e-06, "loss": 0.1021, "step": 966 }, { "epoch": 0.69, "grad_norm": 23.53373087484837, "learning_rate": 9.477444518699501e-06, "loss": 0.156, "step": 967 }, { "epoch": 0.69, "grad_norm": 12.345322636300342, "learning_rate": 9.476157411878937e-06, "loss": 0.1395, "step": 968 }, { "epoch": 0.69, "grad_norm": 29.858200916606965, "learning_rate": 9.474868809490984e-06, "loss": 0.1115, "step": 969 }, { "epoch": 0.69, "grad_norm": 4.955084258234279, "learning_rate": 9.473578711966185e-06, "loss": 0.1597, "step": 970 }, { "epoch": 0.69, "grad_norm": 11.600246069094517, "learning_rate": 9.472287119735588e-06, "loss": 0.1439, "step": 971 }, { "epoch": 0.69, "grad_norm": 10.893917106674843, "learning_rate": 9.470994033230735e-06, "loss": 0.1113, "step": 972 }, { "epoch": 0.69, "grad_norm": 5.074697713863284, "learning_rate": 9.469699452883672e-06, "loss": 0.1354, "step": 973 }, { "epoch": 0.7, "grad_norm": 10.174208973947078, "learning_rate": 9.468403379126943e-06, "loss": 0.1121, "step": 974 }, { "epoch": 0.7, "grad_norm": 18.846130115274406, "learning_rate": 9.46710581239359e-06, "loss": 0.163, "step": 975 }, { "epoch": 0.7, "grad_norm": 4.592869078244344, "learning_rate": 9.465806753117153e-06, "loss": 0.1381, "step": 976 }, { "epoch": 0.7, "grad_norm": 3.7149921395013608, "learning_rate": 9.464506201731674e-06, "loss": 0.0979, "step": 977 }, { "epoch": 0.7, "grad_norm": 5.5795453248401605, "learning_rate": 9.463204158671687e-06, "loss": 0.1036, "step": 978 }, { "epoch": 0.7, "grad_norm": 9.827533012507683, "learning_rate": 9.461900624372233e-06, "loss": 0.1218, "step": 979 }, { "epoch": 0.7, "grad_norm": 9.285235509914184, "learning_rate": 9.460595599268848e-06, "loss": 0.1506, "step": 980 }, { "epoch": 0.7, "grad_norm": 6.469405449404304, "learning_rate": 9.45928908379756e-06, "loss": 0.0923, "step": 981 }, { "epoch": 0.7, "grad_norm": 13.088900389735285, "learning_rate": 9.457981078394905e-06, "loss": 0.1519, "step": 982 }, { "epoch": 0.7, "grad_norm": 11.416521356456194, "learning_rate": 9.45667158349791e-06, "loss": 0.1038, "step": 983 }, { "epoch": 0.7, "grad_norm": 7.000029309311584, "learning_rate": 9.4553605995441e-06, "loss": 0.1067, "step": 984 }, { "epoch": 0.7, "grad_norm": 6.179225960011602, "learning_rate": 9.4540481269715e-06, "loss": 0.1224, "step": 985 }, { "epoch": 0.7, "grad_norm": 12.010485857421125, "learning_rate": 9.452734166218635e-06, "loss": 0.1298, "step": 986 }, { "epoch": 0.7, "grad_norm": 28.699516768299603, "learning_rate": 9.451418717724518e-06, "loss": 0.1122, "step": 987 }, { "epoch": 0.71, "grad_norm": 17.47564053882029, "learning_rate": 9.45010178192867e-06, "loss": 0.105, "step": 988 }, { "epoch": 0.71, "grad_norm": 22.22446380391295, "learning_rate": 9.448783359271102e-06, "loss": 0.1224, "step": 989 }, { "epoch": 0.71, "grad_norm": 8.196080255624146, "learning_rate": 9.44746345019232e-06, "loss": 0.1234, "step": 990 }, { "epoch": 0.71, "grad_norm": 23.353760929453095, "learning_rate": 9.446142055133333e-06, "loss": 0.1925, "step": 991 }, { "epoch": 0.71, "grad_norm": 12.684837863530596, "learning_rate": 9.444819174535647e-06, "loss": 0.1752, "step": 992 }, { "epoch": 0.71, "grad_norm": 41.91167980648941, "learning_rate": 9.443494808841255e-06, "loss": 0.1741, "step": 993 }, { "epoch": 0.71, "grad_norm": 25.021333305393867, "learning_rate": 9.442168958492657e-06, "loss": 0.1377, "step": 994 }, { "epoch": 0.71, "grad_norm": 17.303187659047722, "learning_rate": 9.44084162393284e-06, "loss": 0.1587, "step": 995 }, { "epoch": 0.71, "grad_norm": 13.881013346273633, "learning_rate": 9.439512805605294e-06, "loss": 0.1152, "step": 996 }, { "epoch": 0.71, "grad_norm": 5.679523342858859, "learning_rate": 9.438182503954002e-06, "loss": 0.1433, "step": 997 }, { "epoch": 0.71, "grad_norm": 5.809722363046939, "learning_rate": 9.43685071942344e-06, "loss": 0.1212, "step": 998 }, { "epoch": 0.71, "grad_norm": 16.300528267719702, "learning_rate": 9.435517452458584e-06, "loss": 0.1003, "step": 999 }, { "epoch": 0.71, "grad_norm": 7.142200136073567, "learning_rate": 9.434182703504904e-06, "loss": 0.1167, "step": 1000 }, { "epoch": 0.71, "eval_avg_AUC": 0.8306426631221906, "eval_avg_Accuracy": 0.697198275862069, "eval_avg_Accuracy-right": 0.9357636624494587, "eval_avg_Accuracy-wrong": 0.2812144644075506, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.7007286233364818, "eval_last_AUC": 0.8499919642378029, "eval_last_Accuracy": 0.7737483421750663, "eval_last_Accuracy-right": 0.8576366244945872, "eval_last_Accuracy-wrong": 0.6274732772344781, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.7122224635862333, "eval_max_AUC": 0.7712377534346357, "eval_max_Accuracy": 0.6421170424403183, "eval_max_Accuracy-right": 0.9876744489370027, "eval_max_Accuracy-wrong": 0.03957243575164885, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.6283448927147767, "eval_min_AUC": 0.8459536118200042, "eval_min_Accuracy": 0.7756962864721485, "eval_min_Accuracy-right": 0.8238554845441503, "eval_min_Accuracy-wrong": 0.6917216283829885, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.7067690279119097, "eval_prod_AUC": 0.8340437720336431, "eval_prod_Accuracy": 0.7153929045092838, "eval_prod_Accuracy-right": 0.5985391939480892, "eval_prod_Accuracy-wrong": 0.9191494200591313, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.6925029632488553, "eval_runtime": 246.6967, "eval_samples_per_second": 97.804, "eval_steps_per_second": 3.056, "eval_sum_AUC": 0.6696619705759399, "eval_sum_Accuracy": 0.6358173076923077, "eval_sum_Accuracy-right": 0.9999347854441111, "eval_sum_Accuracy-wrong": 0.0009097111667045713, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.6722441685783858, "step": 1000 }, { "epoch": 0.71, "grad_norm": 26.90301939765578, "learning_rate": 9.432846473008363e-06, "loss": 0.1073, "step": 1001 }, { "epoch": 0.72, "grad_norm": 22.473750563787483, "learning_rate": 9.431508761415422e-06, "loss": 0.1453, "step": 1002 }, { "epoch": 0.72, "grad_norm": 8.035685660671625, "learning_rate": 9.430169569173034e-06, "loss": 0.1307, "step": 1003 }, { "epoch": 0.72, "grad_norm": 11.372775149242225, "learning_rate": 9.428828896728645e-06, "loss": 0.1189, "step": 1004 }, { "epoch": 0.72, "grad_norm": 14.367906249168973, "learning_rate": 9.427486744530205e-06, "loss": 0.1089, "step": 1005 }, { "epoch": 0.72, "grad_norm": 21.80296312308416, "learning_rate": 9.426143113026147e-06, "loss": 0.1641, "step": 1006 }, { "epoch": 0.72, "grad_norm": 12.689136775090986, "learning_rate": 9.424798002665405e-06, "loss": 0.1456, "step": 1007 }, { "epoch": 0.72, "grad_norm": 19.914315829482117, "learning_rate": 9.423451413897406e-06, "loss": 0.1381, "step": 1008 }, { "epoch": 0.72, "grad_norm": 17.487013924724515, "learning_rate": 9.42210334717207e-06, "loss": 0.1842, "step": 1009 }, { "epoch": 0.72, "grad_norm": 41.591369148501364, "learning_rate": 9.42075380293981e-06, "loss": 0.1582, "step": 1010 }, { "epoch": 0.72, "grad_norm": 21.325164316590822, "learning_rate": 9.419402781651537e-06, "loss": 0.1091, "step": 1011 }, { "epoch": 0.72, "grad_norm": 27.610236895046608, "learning_rate": 9.418050283758647e-06, "loss": 0.1791, "step": 1012 }, { "epoch": 0.72, "grad_norm": 48.91961383487278, "learning_rate": 9.416696309713038e-06, "loss": 0.1597, "step": 1013 }, { "epoch": 0.72, "grad_norm": 6.314404735669536, "learning_rate": 9.415340859967099e-06, "loss": 0.1124, "step": 1014 }, { "epoch": 0.72, "grad_norm": 4.467559964579186, "learning_rate": 9.413983934973709e-06, "loss": 0.1421, "step": 1015 }, { "epoch": 0.73, "grad_norm": 39.483948098381305, "learning_rate": 9.412625535186242e-06, "loss": 0.1479, "step": 1016 }, { "epoch": 0.73, "grad_norm": 25.24978912021175, "learning_rate": 9.411265661058565e-06, "loss": 0.1482, "step": 1017 }, { "epoch": 0.73, "grad_norm": 13.0958518186262, "learning_rate": 9.409904313045038e-06, "loss": 0.1525, "step": 1018 }, { "epoch": 0.73, "grad_norm": 15.169402604256561, "learning_rate": 9.408541491600511e-06, "loss": 0.1279, "step": 1019 }, { "epoch": 0.73, "grad_norm": 65.13621930141626, "learning_rate": 9.407177197180328e-06, "loss": 0.1594, "step": 1020 }, { "epoch": 0.73, "grad_norm": 47.31199672873694, "learning_rate": 9.405811430240329e-06, "loss": 0.1588, "step": 1021 }, { "epoch": 0.73, "grad_norm": 15.929281558822629, "learning_rate": 9.404444191236837e-06, "loss": 0.1355, "step": 1022 }, { "epoch": 0.73, "grad_norm": 43.31429781114566, "learning_rate": 9.403075480626674e-06, "loss": 0.1611, "step": 1023 }, { "epoch": 0.73, "grad_norm": 50.56085481452662, "learning_rate": 9.401705298867151e-06, "loss": 0.1643, "step": 1024 }, { "epoch": 0.73, "grad_norm": 36.11556641873227, "learning_rate": 9.400333646416073e-06, "loss": 0.1428, "step": 1025 }, { "epoch": 0.73, "grad_norm": 6.0767580228440075, "learning_rate": 9.398960523731735e-06, "loss": 0.1222, "step": 1026 }, { "epoch": 0.73, "grad_norm": 48.47371243487692, "learning_rate": 9.397585931272919e-06, "loss": 0.1434, "step": 1027 }, { "epoch": 0.73, "grad_norm": 47.87986732416734, "learning_rate": 9.396209869498905e-06, "loss": 0.1599, "step": 1028 }, { "epoch": 0.73, "grad_norm": 39.188166800682154, "learning_rate": 9.39483233886946e-06, "loss": 0.1154, "step": 1029 }, { "epoch": 0.74, "grad_norm": 10.175502788260841, "learning_rate": 9.393453339844842e-06, "loss": 0.1465, "step": 1030 }, { "epoch": 0.74, "grad_norm": 15.843982783240834, "learning_rate": 9.392072872885802e-06, "loss": 0.1418, "step": 1031 }, { "epoch": 0.74, "grad_norm": 51.04924891166709, "learning_rate": 9.39069093845358e-06, "loss": 0.1541, "step": 1032 }, { "epoch": 0.74, "grad_norm": 28.092328125663045, "learning_rate": 9.389307537009902e-06, "loss": 0.1537, "step": 1033 }, { "epoch": 0.74, "grad_norm": 30.374904328612487, "learning_rate": 9.387922669016992e-06, "loss": 0.1475, "step": 1034 }, { "epoch": 0.74, "grad_norm": 16.81604856905053, "learning_rate": 9.386536334937557e-06, "loss": 0.1382, "step": 1035 }, { "epoch": 0.74, "grad_norm": 37.54277803011268, "learning_rate": 9.385148535234799e-06, "loss": 0.1389, "step": 1036 }, { "epoch": 0.74, "grad_norm": 37.991731569487285, "learning_rate": 9.383759270372408e-06, "loss": 0.1583, "step": 1037 }, { "epoch": 0.74, "grad_norm": 43.38772303094576, "learning_rate": 9.382368540814563e-06, "loss": 0.1724, "step": 1038 }, { "epoch": 0.74, "grad_norm": 17.689671523576397, "learning_rate": 9.380976347025932e-06, "loss": 0.1157, "step": 1039 }, { "epoch": 0.74, "grad_norm": 44.41572273988692, "learning_rate": 9.379582689471671e-06, "loss": 0.1479, "step": 1040 }, { "epoch": 0.74, "grad_norm": 24.499163894929794, "learning_rate": 9.378187568617431e-06, "loss": 0.1245, "step": 1041 }, { "epoch": 0.74, "grad_norm": 34.745047025595184, "learning_rate": 9.376790984929348e-06, "loss": 0.1395, "step": 1042 }, { "epoch": 0.74, "grad_norm": 18.610150565591297, "learning_rate": 9.37539293887404e-06, "loss": 0.1469, "step": 1043 }, { "epoch": 0.75, "grad_norm": 29.510494891674, "learning_rate": 9.373993430918626e-06, "loss": 0.1155, "step": 1044 }, { "epoch": 0.75, "grad_norm": 26.89748864189952, "learning_rate": 9.372592461530708e-06, "loss": 0.1505, "step": 1045 }, { "epoch": 0.75, "grad_norm": 10.407826158103616, "learning_rate": 9.371190031178372e-06, "loss": 0.1257, "step": 1046 }, { "epoch": 0.75, "grad_norm": 20.146410557268332, "learning_rate": 9.369786140330198e-06, "loss": 0.1201, "step": 1047 }, { "epoch": 0.75, "grad_norm": 4.649809459235866, "learning_rate": 9.368380789455251e-06, "loss": 0.1188, "step": 1048 }, { "epoch": 0.75, "grad_norm": 20.935495774130015, "learning_rate": 9.36697397902309e-06, "loss": 0.1434, "step": 1049 }, { "epoch": 0.75, "grad_norm": 25.501227504620445, "learning_rate": 9.365565709503748e-06, "loss": 0.1395, "step": 1050 }, { "epoch": 0.75, "grad_norm": 7.080004066515803, "learning_rate": 9.364155981367761e-06, "loss": 0.0883, "step": 1051 }, { "epoch": 0.75, "grad_norm": 10.423934524437326, "learning_rate": 9.36274479508614e-06, "loss": 0.1187, "step": 1052 }, { "epoch": 0.75, "grad_norm": 34.962167370390645, "learning_rate": 9.361332151130396e-06, "loss": 0.12, "step": 1053 }, { "epoch": 0.75, "grad_norm": 21.079610427912726, "learning_rate": 9.359918049972512e-06, "loss": 0.1295, "step": 1054 }, { "epoch": 0.75, "grad_norm": 28.618833107684583, "learning_rate": 9.358502492084969e-06, "loss": 0.1395, "step": 1055 }, { "epoch": 0.75, "grad_norm": 5.8759001732284855, "learning_rate": 9.35708547794073e-06, "loss": 0.1281, "step": 1056 }, { "epoch": 0.75, "grad_norm": 37.51265565785163, "learning_rate": 9.355667008013249e-06, "loss": 0.1451, "step": 1057 }, { "epoch": 0.76, "grad_norm": 63.60202101689444, "learning_rate": 9.354247082776459e-06, "loss": 0.1753, "step": 1058 }, { "epoch": 0.76, "grad_norm": 23.159854031756225, "learning_rate": 9.352825702704784e-06, "loss": 0.132, "step": 1059 }, { "epoch": 0.76, "grad_norm": 46.848693056166454, "learning_rate": 9.351402868273136e-06, "loss": 0.1409, "step": 1060 }, { "epoch": 0.76, "grad_norm": 35.31714858575622, "learning_rate": 9.349978579956908e-06, "loss": 0.1536, "step": 1061 }, { "epoch": 0.76, "grad_norm": 39.11227629802165, "learning_rate": 9.348552838231983e-06, "loss": 0.1378, "step": 1062 }, { "epoch": 0.76, "grad_norm": 14.341752586653023, "learning_rate": 9.347125643574726e-06, "loss": 0.1119, "step": 1063 }, { "epoch": 0.76, "grad_norm": 31.512222693182753, "learning_rate": 9.345696996461992e-06, "loss": 0.1544, "step": 1064 }, { "epoch": 0.76, "grad_norm": 44.78047175611448, "learning_rate": 9.344266897371114e-06, "loss": 0.1526, "step": 1065 }, { "epoch": 0.76, "grad_norm": 13.860702506152542, "learning_rate": 9.34283534677992e-06, "loss": 0.132, "step": 1066 }, { "epoch": 0.76, "grad_norm": 9.914053857460699, "learning_rate": 9.341402345166714e-06, "loss": 0.146, "step": 1067 }, { "epoch": 0.76, "grad_norm": 5.244521794867931, "learning_rate": 9.33996789301029e-06, "loss": 0.1675, "step": 1068 }, { "epoch": 0.76, "grad_norm": 30.55515143164621, "learning_rate": 9.338531990789926e-06, "loss": 0.1207, "step": 1069 }, { "epoch": 0.76, "grad_norm": 11.578710866057671, "learning_rate": 9.33709463898538e-06, "loss": 0.1359, "step": 1070 }, { "epoch": 0.76, "grad_norm": 7.390861205292069, "learning_rate": 9.335655838076902e-06, "loss": 0.0799, "step": 1071 }, { "epoch": 0.77, "grad_norm": 4.0312126352617454, "learning_rate": 9.33421558854522e-06, "loss": 0.1311, "step": 1072 }, { "epoch": 0.77, "grad_norm": 16.82866199649571, "learning_rate": 9.332773890871548e-06, "loss": 0.1306, "step": 1073 }, { "epoch": 0.77, "grad_norm": 23.34045634748492, "learning_rate": 9.331330745537586e-06, "loss": 0.1274, "step": 1074 }, { "epoch": 0.77, "grad_norm": 21.331301913161205, "learning_rate": 9.329886153025513e-06, "loss": 0.1211, "step": 1075 }, { "epoch": 0.77, "grad_norm": 11.711724010388526, "learning_rate": 9.328440113817995e-06, "loss": 0.1337, "step": 1076 }, { "epoch": 0.77, "grad_norm": 22.866056035390898, "learning_rate": 9.326992628398182e-06, "loss": 0.1196, "step": 1077 }, { "epoch": 0.77, "grad_norm": 6.234289721717951, "learning_rate": 9.325543697249706e-06, "loss": 0.1526, "step": 1078 }, { "epoch": 0.77, "grad_norm": 29.859928319090212, "learning_rate": 9.324093320856679e-06, "loss": 0.137, "step": 1079 }, { "epoch": 0.77, "grad_norm": 15.250096335579789, "learning_rate": 9.3226414997037e-06, "loss": 0.116, "step": 1080 }, { "epoch": 0.77, "grad_norm": 13.223197640483088, "learning_rate": 9.32118823427585e-06, "loss": 0.1439, "step": 1081 }, { "epoch": 0.77, "grad_norm": 7.3596389314819675, "learning_rate": 9.319733525058694e-06, "loss": 0.1238, "step": 1082 }, { "epoch": 0.77, "grad_norm": 5.286656373449186, "learning_rate": 9.318277372538274e-06, "loss": 0.1317, "step": 1083 }, { "epoch": 0.77, "grad_norm": 14.109356967510104, "learning_rate": 9.316819777201119e-06, "loss": 0.1257, "step": 1084 }, { "epoch": 0.77, "grad_norm": 10.223027875056157, "learning_rate": 9.315360739534235e-06, "loss": 0.1648, "step": 1085 }, { "epoch": 0.78, "grad_norm": 4.052529390869049, "learning_rate": 9.313900260025121e-06, "loss": 0.1232, "step": 1086 }, { "epoch": 0.78, "grad_norm": 14.816538587879734, "learning_rate": 9.312438339161746e-06, "loss": 0.1198, "step": 1087 }, { "epoch": 0.78, "grad_norm": 5.283898279259918, "learning_rate": 9.310974977432565e-06, "loss": 0.1261, "step": 1088 }, { "epoch": 0.78, "grad_norm": 19.36116992418242, "learning_rate": 9.309510175326515e-06, "loss": 0.1118, "step": 1089 }, { "epoch": 0.78, "grad_norm": 5.860358397763082, "learning_rate": 9.308043933333012e-06, "loss": 0.1211, "step": 1090 }, { "epoch": 0.78, "grad_norm": 12.600425083069407, "learning_rate": 9.306576251941957e-06, "loss": 0.1387, "step": 1091 }, { "epoch": 0.78, "grad_norm": 49.923681618239925, "learning_rate": 9.305107131643729e-06, "loss": 0.1647, "step": 1092 }, { "epoch": 0.78, "grad_norm": 9.41603756614823, "learning_rate": 9.303636572929188e-06, "loss": 0.1338, "step": 1093 }, { "epoch": 0.78, "grad_norm": 15.05350452195723, "learning_rate": 9.302164576289674e-06, "loss": 0.1469, "step": 1094 }, { "epoch": 0.78, "grad_norm": 28.33566516662762, "learning_rate": 9.30069114221701e-06, "loss": 0.153, "step": 1095 }, { "epoch": 0.78, "grad_norm": 23.243076526598593, "learning_rate": 9.299216271203498e-06, "loss": 0.1603, "step": 1096 }, { "epoch": 0.78, "grad_norm": 10.582545901960945, "learning_rate": 9.297739963741918e-06, "loss": 0.1294, "step": 1097 }, { "epoch": 0.78, "grad_norm": 11.667560754149541, "learning_rate": 9.296262220325535e-06, "loss": 0.11, "step": 1098 }, { "epoch": 0.78, "grad_norm": 10.168707735076364, "learning_rate": 9.294783041448088e-06, "loss": 0.127, "step": 1099 }, { "epoch": 0.79, "grad_norm": 12.57179862351945, "learning_rate": 9.293302427603796e-06, "loss": 0.1414, "step": 1100 }, { "epoch": 0.79, "grad_norm": 26.656118865639158, "learning_rate": 9.291820379287364e-06, "loss": 0.1296, "step": 1101 }, { "epoch": 0.79, "grad_norm": 5.172589483208834, "learning_rate": 9.29033689699397e-06, "loss": 0.1492, "step": 1102 }, { "epoch": 0.79, "grad_norm": 14.01791519086498, "learning_rate": 9.288851981219273e-06, "loss": 0.1327, "step": 1103 }, { "epoch": 0.79, "grad_norm": 33.859914601995875, "learning_rate": 9.28736563245941e-06, "loss": 0.1571, "step": 1104 }, { "epoch": 0.79, "grad_norm": 21.956222104782483, "learning_rate": 9.285877851210999e-06, "loss": 0.1, "step": 1105 }, { "epoch": 0.79, "grad_norm": 18.10448242341278, "learning_rate": 9.284388637971136e-06, "loss": 0.1234, "step": 1106 }, { "epoch": 0.79, "grad_norm": 30.84897554982973, "learning_rate": 9.282897993237392e-06, "loss": 0.1714, "step": 1107 }, { "epoch": 0.79, "grad_norm": 37.9856161444381, "learning_rate": 9.281405917507824e-06, "loss": 0.1244, "step": 1108 }, { "epoch": 0.79, "grad_norm": 15.563367998567477, "learning_rate": 9.279912411280958e-06, "loss": 0.1194, "step": 1109 }, { "epoch": 0.79, "grad_norm": 12.245185274327808, "learning_rate": 9.278417475055803e-06, "loss": 0.1119, "step": 1110 }, { "epoch": 0.79, "grad_norm": 12.455063691988519, "learning_rate": 9.276921109331845e-06, "loss": 0.1165, "step": 1111 }, { "epoch": 0.79, "grad_norm": 25.8213778871432, "learning_rate": 9.275423314609049e-06, "loss": 0.152, "step": 1112 }, { "epoch": 0.79, "grad_norm": 13.869919626594394, "learning_rate": 9.273924091387855e-06, "loss": 0.115, "step": 1113 }, { "epoch": 0.8, "grad_norm": 4.074864485432894, "learning_rate": 9.272423440169181e-06, "loss": 0.1091, "step": 1114 }, { "epoch": 0.8, "grad_norm": 5.656458241700619, "learning_rate": 9.270921361454424e-06, "loss": 0.0948, "step": 1115 }, { "epoch": 0.8, "grad_norm": 37.09019385402894, "learning_rate": 9.269417855745453e-06, "loss": 0.1337, "step": 1116 }, { "epoch": 0.8, "grad_norm": 33.08524762310668, "learning_rate": 9.267912923544621e-06, "loss": 0.1455, "step": 1117 }, { "epoch": 0.8, "grad_norm": 8.26175581668237, "learning_rate": 9.266406565354753e-06, "loss": 0.1221, "step": 1118 }, { "epoch": 0.8, "grad_norm": 23.067596741868993, "learning_rate": 9.26489878167915e-06, "loss": 0.1331, "step": 1119 }, { "epoch": 0.8, "grad_norm": 28.961300839321403, "learning_rate": 9.263389573021592e-06, "loss": 0.1156, "step": 1120 }, { "epoch": 0.8, "grad_norm": 36.58260181012153, "learning_rate": 9.261878939886332e-06, "loss": 0.1499, "step": 1121 }, { "epoch": 0.8, "grad_norm": 15.19087787168311, "learning_rate": 9.2603668827781e-06, "loss": 0.1367, "step": 1122 }, { "epoch": 0.8, "grad_norm": 20.099032625961726, "learning_rate": 9.258853402202106e-06, "loss": 0.1182, "step": 1123 }, { "epoch": 0.8, "grad_norm": 33.55489255269871, "learning_rate": 9.25733849866403e-06, "loss": 0.1436, "step": 1124 }, { "epoch": 0.8, "grad_norm": 22.799341496962658, "learning_rate": 9.255822172670028e-06, "loss": 0.119, "step": 1125 }, { "epoch": 0.8, "grad_norm": 10.031052291448315, "learning_rate": 9.254304424726734e-06, "loss": 0.1013, "step": 1126 }, { "epoch": 0.8, "grad_norm": 6.390605015029899, "learning_rate": 9.252785255341256e-06, "loss": 0.1116, "step": 1127 }, { "epoch": 0.81, "grad_norm": 8.188962911089087, "learning_rate": 9.251264665021178e-06, "loss": 0.1255, "step": 1128 }, { "epoch": 0.81, "grad_norm": 28.64616641930042, "learning_rate": 9.249742654274554e-06, "loss": 0.1237, "step": 1129 }, { "epoch": 0.81, "grad_norm": 9.921072334184103, "learning_rate": 9.24821922360992e-06, "loss": 0.1051, "step": 1130 }, { "epoch": 0.81, "grad_norm": 22.18891754085567, "learning_rate": 9.246694373536277e-06, "loss": 0.1155, "step": 1131 }, { "epoch": 0.81, "grad_norm": 22.768049432903226, "learning_rate": 9.245168104563112e-06, "loss": 0.1561, "step": 1132 }, { "epoch": 0.81, "grad_norm": 4.694267206038536, "learning_rate": 9.243640417200376e-06, "loss": 0.1177, "step": 1133 }, { "epoch": 0.81, "grad_norm": 19.472208799074128, "learning_rate": 9.242111311958502e-06, "loss": 0.1261, "step": 1134 }, { "epoch": 0.81, "grad_norm": 7.137828446129272, "learning_rate": 9.240580789348385e-06, "loss": 0.1493, "step": 1135 }, { "epoch": 0.81, "grad_norm": 17.02804572717195, "learning_rate": 9.23904884988141e-06, "loss": 0.1364, "step": 1136 }, { "epoch": 0.81, "grad_norm": 7.782199051410428, "learning_rate": 9.237515494069417e-06, "loss": 0.1243, "step": 1137 }, { "epoch": 0.81, "grad_norm": 5.708252427463937, "learning_rate": 9.235980722424737e-06, "loss": 0.1117, "step": 1138 }, { "epoch": 0.81, "grad_norm": 4.366900740290782, "learning_rate": 9.234444535460161e-06, "loss": 0.1058, "step": 1139 }, { "epoch": 0.81, "grad_norm": 7.735023185477729, "learning_rate": 9.232906933688959e-06, "loss": 0.1042, "step": 1140 }, { "epoch": 0.81, "grad_norm": 17.878356014797415, "learning_rate": 9.231367917624872e-06, "loss": 0.1143, "step": 1141 }, { "epoch": 0.82, "grad_norm": 13.500635215475153, "learning_rate": 9.229827487782115e-06, "loss": 0.1472, "step": 1142 }, { "epoch": 0.82, "grad_norm": 28.914775597777798, "learning_rate": 9.228285644675372e-06, "loss": 0.1571, "step": 1143 }, { "epoch": 0.82, "grad_norm": 7.679656527001652, "learning_rate": 9.226742388819804e-06, "loss": 0.1243, "step": 1144 }, { "epoch": 0.82, "grad_norm": 42.56589275402226, "learning_rate": 9.225197720731039e-06, "loss": 0.1748, "step": 1145 }, { "epoch": 0.82, "grad_norm": 33.46124830306206, "learning_rate": 9.223651640925181e-06, "loss": 0.1602, "step": 1146 }, { "epoch": 0.82, "grad_norm": 11.354447656553011, "learning_rate": 9.222104149918804e-06, "loss": 0.1125, "step": 1147 }, { "epoch": 0.82, "grad_norm": 4.943626062124622, "learning_rate": 9.220555248228954e-06, "loss": 0.1055, "step": 1148 }, { "epoch": 0.82, "grad_norm": 22.06199413082995, "learning_rate": 9.219004936373146e-06, "loss": 0.1202, "step": 1149 }, { "epoch": 0.82, "grad_norm": 78.80684998307608, "learning_rate": 9.217453214869368e-06, "loss": 0.2026, "step": 1150 }, { "epoch": 0.82, "grad_norm": 33.723692179365536, "learning_rate": 9.21590008423608e-06, "loss": 0.1237, "step": 1151 }, { "epoch": 0.82, "grad_norm": 20.8665304253155, "learning_rate": 9.214345544992214e-06, "loss": 0.136, "step": 1152 }, { "epoch": 0.82, "grad_norm": 66.63596872046561, "learning_rate": 9.212789597657167e-06, "loss": 0.1742, "step": 1153 }, { "epoch": 0.82, "grad_norm": 39.21810406123508, "learning_rate": 9.21123224275081e-06, "loss": 0.1675, "step": 1154 }, { "epoch": 0.82, "grad_norm": 23.06081056803674, "learning_rate": 9.209673480793486e-06, "loss": 0.1375, "step": 1155 }, { "epoch": 0.83, "grad_norm": 25.325897571173787, "learning_rate": 9.208113312306006e-06, "loss": 0.1434, "step": 1156 }, { "epoch": 0.83, "grad_norm": 52.68852183497776, "learning_rate": 9.206551737809653e-06, "loss": 0.1707, "step": 1157 }, { "epoch": 0.83, "grad_norm": 4.831358394803084, "learning_rate": 9.204988757826173e-06, "loss": 0.1233, "step": 1158 }, { "epoch": 0.83, "grad_norm": 16.788606151895603, "learning_rate": 9.203424372877791e-06, "loss": 0.116, "step": 1159 }, { "epoch": 0.83, "grad_norm": 10.314075233780184, "learning_rate": 9.201858583487195e-06, "loss": 0.1305, "step": 1160 }, { "epoch": 0.83, "grad_norm": 24.77340581749608, "learning_rate": 9.200291390177546e-06, "loss": 0.1249, "step": 1161 }, { "epoch": 0.83, "grad_norm": 13.838543735694156, "learning_rate": 9.198722793472471e-06, "loss": 0.0964, "step": 1162 }, { "epoch": 0.83, "grad_norm": 4.489793512918954, "learning_rate": 9.197152793896068e-06, "loss": 0.1027, "step": 1163 }, { "epoch": 0.83, "grad_norm": 17.370296763714304, "learning_rate": 9.195581391972903e-06, "loss": 0.1373, "step": 1164 }, { "epoch": 0.83, "grad_norm": 8.574790093828888, "learning_rate": 9.194008588228011e-06, "loss": 0.1179, "step": 1165 }, { "epoch": 0.83, "grad_norm": 10.243110764589119, "learning_rate": 9.192434383186894e-06, "loss": 0.1274, "step": 1166 }, { "epoch": 0.83, "grad_norm": 7.676977628339888, "learning_rate": 9.190858777375523e-06, "loss": 0.1256, "step": 1167 }, { "epoch": 0.83, "grad_norm": 19.690069281828507, "learning_rate": 9.18928177132034e-06, "loss": 0.1165, "step": 1168 }, { "epoch": 0.83, "grad_norm": 7.038428559464359, "learning_rate": 9.187703365548248e-06, "loss": 0.1094, "step": 1169 }, { "epoch": 0.84, "grad_norm": 26.547275502331654, "learning_rate": 9.186123560586623e-06, "loss": 0.2321, "step": 1170 }, { "epoch": 0.84, "grad_norm": 13.193929639465265, "learning_rate": 9.18454235696331e-06, "loss": 0.1295, "step": 1171 }, { "epoch": 0.84, "grad_norm": 26.734089366689073, "learning_rate": 9.182959755206613e-06, "loss": 0.1381, "step": 1172 }, { "epoch": 0.84, "grad_norm": 6.039923169051226, "learning_rate": 9.181375755845314e-06, "loss": 0.1196, "step": 1173 }, { "epoch": 0.84, "grad_norm": 5.543614416837632, "learning_rate": 9.179790359408655e-06, "loss": 0.1206, "step": 1174 }, { "epoch": 0.84, "grad_norm": 7.105282043598741, "learning_rate": 9.178203566426344e-06, "loss": 0.1093, "step": 1175 }, { "epoch": 0.84, "grad_norm": 20.186482641410777, "learning_rate": 9.176615377428563e-06, "loss": 0.1453, "step": 1176 }, { "epoch": 0.84, "grad_norm": 4.872272725289261, "learning_rate": 9.175025792945951e-06, "loss": 0.0986, "step": 1177 }, { "epoch": 0.84, "grad_norm": 8.738372660616186, "learning_rate": 9.173434813509618e-06, "loss": 0.0973, "step": 1178 }, { "epoch": 0.84, "grad_norm": 13.83532907369495, "learning_rate": 9.171842439651143e-06, "loss": 0.1072, "step": 1179 }, { "epoch": 0.84, "grad_norm": 6.269615641040455, "learning_rate": 9.170248671902565e-06, "loss": 0.106, "step": 1180 }, { "epoch": 0.84, "grad_norm": 19.59295084898656, "learning_rate": 9.168653510796392e-06, "loss": 0.1466, "step": 1181 }, { "epoch": 0.84, "grad_norm": 7.073455582971626, "learning_rate": 9.167056956865596e-06, "loss": 0.1572, "step": 1182 }, { "epoch": 0.84, "grad_norm": 11.573875087408759, "learning_rate": 9.165459010643618e-06, "loss": 0.1233, "step": 1183 }, { "epoch": 0.85, "grad_norm": 4.690591395107708, "learning_rate": 9.16385967266436e-06, "loss": 0.111, "step": 1184 }, { "epoch": 0.85, "grad_norm": 5.771275364512612, "learning_rate": 9.16225894346219e-06, "loss": 0.1333, "step": 1185 }, { "epoch": 0.85, "grad_norm": 24.721033576277165, "learning_rate": 9.160656823571942e-06, "loss": 0.1638, "step": 1186 }, { "epoch": 0.85, "grad_norm": 15.57598172877004, "learning_rate": 9.159053313528913e-06, "loss": 0.1168, "step": 1187 }, { "epoch": 0.85, "grad_norm": 26.636854607871165, "learning_rate": 9.15744841386887e-06, "loss": 0.1198, "step": 1188 }, { "epoch": 0.85, "grad_norm": 3.98440431085169, "learning_rate": 9.155842125128033e-06, "loss": 0.1067, "step": 1189 }, { "epoch": 0.85, "grad_norm": 31.917676167530633, "learning_rate": 9.154234447843098e-06, "loss": 0.1306, "step": 1190 }, { "epoch": 0.85, "grad_norm": 6.019593781975672, "learning_rate": 9.152625382551217e-06, "loss": 0.0909, "step": 1191 }, { "epoch": 0.85, "grad_norm": 8.806640969390639, "learning_rate": 9.15101492979001e-06, "loss": 0.1046, "step": 1192 }, { "epoch": 0.85, "grad_norm": 28.841021287717343, "learning_rate": 9.149403090097557e-06, "loss": 0.1171, "step": 1193 }, { "epoch": 0.85, "grad_norm": 16.465965710527545, "learning_rate": 9.147789864012408e-06, "loss": 0.1447, "step": 1194 }, { "epoch": 0.85, "grad_norm": 24.666832031038798, "learning_rate": 9.146175252073568e-06, "loss": 0.1456, "step": 1195 }, { "epoch": 0.85, "grad_norm": 19.683015165025946, "learning_rate": 9.144559254820511e-06, "loss": 0.1213, "step": 1196 }, { "epoch": 0.85, "grad_norm": 5.122237884315906, "learning_rate": 9.14294187279317e-06, "loss": 0.1357, "step": 1197 }, { "epoch": 0.86, "grad_norm": 6.419084999748459, "learning_rate": 9.141323106531943e-06, "loss": 0.1353, "step": 1198 }, { "epoch": 0.86, "grad_norm": 33.4219861921213, "learning_rate": 9.139702956577693e-06, "loss": 0.1405, "step": 1199 }, { "epoch": 0.86, "grad_norm": 23.625021627700484, "learning_rate": 9.138081423471736e-06, "loss": 0.1046, "step": 1200 }, { "epoch": 0.86, "grad_norm": 11.839297028784905, "learning_rate": 9.136458507755862e-06, "loss": 0.1106, "step": 1201 }, { "epoch": 0.86, "grad_norm": 12.004864533298464, "learning_rate": 9.134834209972314e-06, "loss": 0.104, "step": 1202 }, { "epoch": 0.86, "grad_norm": 34.08869419046535, "learning_rate": 9.133208530663801e-06, "loss": 0.1288, "step": 1203 }, { "epoch": 0.86, "grad_norm": 26.979912078853985, "learning_rate": 9.131581470373495e-06, "loss": 0.1449, "step": 1204 }, { "epoch": 0.86, "grad_norm": 26.374270596425987, "learning_rate": 9.129953029645022e-06, "loss": 0.1167, "step": 1205 }, { "epoch": 0.86, "grad_norm": 3.895140545361581, "learning_rate": 9.128323209022478e-06, "loss": 0.1267, "step": 1206 }, { "epoch": 0.86, "grad_norm": 51.67643816360633, "learning_rate": 9.126692009050415e-06, "loss": 0.1334, "step": 1207 }, { "epoch": 0.86, "grad_norm": 6.7195350284212845, "learning_rate": 9.125059430273848e-06, "loss": 0.1033, "step": 1208 }, { "epoch": 0.86, "grad_norm": 4.68146047091333, "learning_rate": 9.123425473238253e-06, "loss": 0.1194, "step": 1209 }, { "epoch": 0.86, "grad_norm": 10.479614467784797, "learning_rate": 9.121790138489564e-06, "loss": 0.1483, "step": 1210 }, { "epoch": 0.86, "grad_norm": 8.88151888755877, "learning_rate": 9.120153426574177e-06, "loss": 0.1454, "step": 1211 }, { "epoch": 0.87, "grad_norm": 21.761167483206385, "learning_rate": 9.118515338038947e-06, "loss": 0.1039, "step": 1212 }, { "epoch": 0.87, "grad_norm": 15.054311022200379, "learning_rate": 9.11687587343119e-06, "loss": 0.127, "step": 1213 }, { "epoch": 0.87, "grad_norm": 4.715530741583724, "learning_rate": 9.115235033298682e-06, "loss": 0.1182, "step": 1214 }, { "epoch": 0.87, "grad_norm": 6.72131217945236, "learning_rate": 9.113592818189661e-06, "loss": 0.1331, "step": 1215 }, { "epoch": 0.87, "grad_norm": 7.593220467655973, "learning_rate": 9.111949228652816e-06, "loss": 0.1128, "step": 1216 }, { "epoch": 0.87, "grad_norm": 4.570725755965095, "learning_rate": 9.110304265237304e-06, "loss": 0.1183, "step": 1217 }, { "epoch": 0.87, "grad_norm": 17.867725431027747, "learning_rate": 9.10865792849274e-06, "loss": 0.1313, "step": 1218 }, { "epoch": 0.87, "grad_norm": 41.74139211185614, "learning_rate": 9.107010218969191e-06, "loss": 0.1473, "step": 1219 }, { "epoch": 0.87, "grad_norm": 13.698548574340123, "learning_rate": 9.10536113721719e-06, "loss": 0.1401, "step": 1220 }, { "epoch": 0.87, "grad_norm": 56.22024108773497, "learning_rate": 9.103710683787728e-06, "loss": 0.129, "step": 1221 }, { "epoch": 0.87, "grad_norm": 10.143381456660101, "learning_rate": 9.102058859232247e-06, "loss": 0.1459, "step": 1222 }, { "epoch": 0.87, "grad_norm": 8.58414656315145, "learning_rate": 9.100405664102656e-06, "loss": 0.1403, "step": 1223 }, { "epoch": 0.87, "grad_norm": 33.73692819416245, "learning_rate": 9.098751098951317e-06, "loss": 0.1604, "step": 1224 }, { "epoch": 0.87, "grad_norm": 17.199990598047965, "learning_rate": 9.09709516433105e-06, "loss": 0.1375, "step": 1225 }, { "epoch": 0.88, "grad_norm": 15.42145961364144, "learning_rate": 9.095437860795138e-06, "loss": 0.1305, "step": 1226 }, { "epoch": 0.88, "grad_norm": 36.82138829966084, "learning_rate": 9.09377918889731e-06, "loss": 0.1387, "step": 1227 }, { "epoch": 0.88, "grad_norm": 4.248193826270128, "learning_rate": 9.092119149191765e-06, "loss": 0.1298, "step": 1228 }, { "epoch": 0.88, "grad_norm": 7.168241999958529, "learning_rate": 9.090457742233152e-06, "loss": 0.0919, "step": 1229 }, { "epoch": 0.88, "grad_norm": 4.413448398731609, "learning_rate": 9.088794968576575e-06, "loss": 0.1368, "step": 1230 }, { "epoch": 0.88, "grad_norm": 10.297298848033256, "learning_rate": 9.087130828777598e-06, "loss": 0.1672, "step": 1231 }, { "epoch": 0.88, "grad_norm": 3.995001961847258, "learning_rate": 9.085465323392243e-06, "loss": 0.1167, "step": 1232 }, { "epoch": 0.88, "grad_norm": 5.999725996925589, "learning_rate": 9.083798452976988e-06, "loss": 0.1295, "step": 1233 }, { "epoch": 0.88, "grad_norm": 20.375505744587326, "learning_rate": 9.082130218088762e-06, "loss": 0.1641, "step": 1234 }, { "epoch": 0.88, "grad_norm": 3.932899862057339, "learning_rate": 9.080460619284954e-06, "loss": 0.1132, "step": 1235 }, { "epoch": 0.88, "grad_norm": 27.645263230888336, "learning_rate": 9.07878965712341e-06, "loss": 0.1514, "step": 1236 }, { "epoch": 0.88, "grad_norm": 10.21802883875594, "learning_rate": 9.077117332162427e-06, "loss": 0.1427, "step": 1237 }, { "epoch": 0.88, "grad_norm": 18.322663661950923, "learning_rate": 9.075443644960761e-06, "loss": 0.1166, "step": 1238 }, { "epoch": 0.88, "grad_norm": 4.316841214679732, "learning_rate": 9.07376859607762e-06, "loss": 0.0997, "step": 1239 }, { "epoch": 0.89, "grad_norm": 6.244511662257982, "learning_rate": 9.072092186072675e-06, "loss": 0.1416, "step": 1240 }, { "epoch": 0.89, "grad_norm": 19.860679127095636, "learning_rate": 9.070414415506038e-06, "loss": 0.1356, "step": 1241 }, { "epoch": 0.89, "grad_norm": 10.605522811610166, "learning_rate": 9.068735284938288e-06, "loss": 0.1052, "step": 1242 }, { "epoch": 0.89, "grad_norm": 17.842863169949126, "learning_rate": 9.067054794930452e-06, "loss": 0.1169, "step": 1243 }, { "epoch": 0.89, "grad_norm": 8.718865016595677, "learning_rate": 9.065372946044014e-06, "loss": 0.1428, "step": 1244 }, { "epoch": 0.89, "grad_norm": 20.576896857972773, "learning_rate": 9.063689738840911e-06, "loss": 0.1407, "step": 1245 }, { "epoch": 0.89, "grad_norm": 7.3907765645746935, "learning_rate": 9.06200517388353e-06, "loss": 0.1586, "step": 1246 }, { "epoch": 0.89, "grad_norm": 4.412682061788436, "learning_rate": 9.060319251734723e-06, "loss": 0.1168, "step": 1247 }, { "epoch": 0.89, "grad_norm": 9.216748606683582, "learning_rate": 9.058631972957783e-06, "loss": 0.179, "step": 1248 }, { "epoch": 0.89, "grad_norm": 13.251080346982283, "learning_rate": 9.056943338116461e-06, "loss": 0.1057, "step": 1249 }, { "epoch": 0.89, "grad_norm": 13.556345489969091, "learning_rate": 9.055253347774961e-06, "loss": 0.1252, "step": 1250 }, { "epoch": 0.89, "grad_norm": 8.70070366178762, "learning_rate": 9.053562002497943e-06, "loss": 0.121, "step": 1251 }, { "epoch": 0.89, "grad_norm": 4.542057911871553, "learning_rate": 9.051869302850515e-06, "loss": 0.0962, "step": 1252 }, { "epoch": 0.89, "grad_norm": 16.718582711323283, "learning_rate": 9.05017524939824e-06, "loss": 0.1232, "step": 1253 }, { "epoch": 0.9, "grad_norm": 13.719584747865103, "learning_rate": 9.048479842707132e-06, "loss": 0.1292, "step": 1254 }, { "epoch": 0.9, "grad_norm": 3.570019714838347, "learning_rate": 9.046783083343657e-06, "loss": 0.0828, "step": 1255 }, { "epoch": 0.9, "grad_norm": 8.629289413960565, "learning_rate": 9.045084971874738e-06, "loss": 0.1039, "step": 1256 }, { "epoch": 0.9, "grad_norm": 6.7497642996638705, "learning_rate": 9.043385508867741e-06, "loss": 0.0898, "step": 1257 }, { "epoch": 0.9, "grad_norm": 20.039832970250274, "learning_rate": 9.041684694890492e-06, "loss": 0.1107, "step": 1258 }, { "epoch": 0.9, "grad_norm": 15.178772636850542, "learning_rate": 9.03998253051126e-06, "loss": 0.1654, "step": 1259 }, { "epoch": 0.9, "grad_norm": 27.037718152114405, "learning_rate": 9.038279016298773e-06, "loss": 0.1035, "step": 1260 }, { "epoch": 0.9, "grad_norm": 28.49390797177503, "learning_rate": 9.036574152822206e-06, "loss": 0.1362, "step": 1261 }, { "epoch": 0.9, "grad_norm": 8.695671686306866, "learning_rate": 9.034867940651186e-06, "loss": 0.1486, "step": 1262 }, { "epoch": 0.9, "grad_norm": 7.332210434139537, "learning_rate": 9.033160380355789e-06, "loss": 0.1077, "step": 1263 }, { "epoch": 0.9, "grad_norm": 4.95118182352966, "learning_rate": 9.031451472506544e-06, "loss": 0.1095, "step": 1264 }, { "epoch": 0.9, "grad_norm": 8.219397289040563, "learning_rate": 9.029741217674428e-06, "loss": 0.1373, "step": 1265 }, { "epoch": 0.9, "grad_norm": 28.958428037574063, "learning_rate": 9.02802961643087e-06, "loss": 0.1364, "step": 1266 }, { "epoch": 0.9, "grad_norm": 27.25383513627309, "learning_rate": 9.026316669347747e-06, "loss": 0.1401, "step": 1267 }, { "epoch": 0.91, "grad_norm": 7.0655709620800975, "learning_rate": 9.024602376997387e-06, "loss": 0.1539, "step": 1268 }, { "epoch": 0.91, "grad_norm": 45.88384945997455, "learning_rate": 9.022886739952565e-06, "loss": 0.1285, "step": 1269 }, { "epoch": 0.91, "grad_norm": 26.66446887230687, "learning_rate": 9.02116975878651e-06, "loss": 0.1138, "step": 1270 }, { "epoch": 0.91, "grad_norm": 9.862633449690026, "learning_rate": 9.019451434072894e-06, "loss": 0.1599, "step": 1271 }, { "epoch": 0.91, "grad_norm": 12.445395355203088, "learning_rate": 9.017731766385844e-06, "loss": 0.1327, "step": 1272 }, { "epoch": 0.91, "grad_norm": 9.717224738562036, "learning_rate": 9.016010756299934e-06, "loss": 0.1062, "step": 1273 }, { "epoch": 0.91, "grad_norm": 33.84566034930815, "learning_rate": 9.014288404390182e-06, "loss": 0.1376, "step": 1274 }, { "epoch": 0.91, "grad_norm": 17.609479593872077, "learning_rate": 9.012564711232059e-06, "loss": 0.1116, "step": 1275 }, { "epoch": 0.91, "grad_norm": 5.705629267795132, "learning_rate": 9.010839677401484e-06, "loss": 0.1307, "step": 1276 }, { "epoch": 0.91, "grad_norm": 3.4202600275574486, "learning_rate": 9.009113303474822e-06, "loss": 0.0847, "step": 1277 }, { "epoch": 0.91, "grad_norm": 27.46790309202585, "learning_rate": 9.007385590028887e-06, "loss": 0.1199, "step": 1278 }, { "epoch": 0.91, "grad_norm": 19.134070635982145, "learning_rate": 9.005656537640942e-06, "loss": 0.1385, "step": 1279 }, { "epoch": 0.91, "grad_norm": 4.867626401898946, "learning_rate": 9.003926146888691e-06, "loss": 0.098, "step": 1280 }, { "epoch": 0.91, "grad_norm": 12.301604393023416, "learning_rate": 9.002194418350291e-06, "loss": 0.1766, "step": 1281 }, { "epoch": 0.92, "grad_norm": 13.47168882178934, "learning_rate": 9.000461352604349e-06, "loss": 0.1528, "step": 1282 }, { "epoch": 0.92, "grad_norm": 20.94266276844876, "learning_rate": 8.99872695022991e-06, "loss": 0.13, "step": 1283 }, { "epoch": 0.92, "grad_norm": 7.568146188671397, "learning_rate": 8.996991211806471e-06, "loss": 0.1041, "step": 1284 }, { "epoch": 0.92, "grad_norm": 19.767793790726582, "learning_rate": 8.995254137913977e-06, "loss": 0.1321, "step": 1285 }, { "epoch": 0.92, "grad_norm": 5.925767904518081, "learning_rate": 8.99351572913281e-06, "loss": 0.1204, "step": 1286 }, { "epoch": 0.92, "grad_norm": 17.665286809508974, "learning_rate": 8.991775986043814e-06, "loss": 0.0851, "step": 1287 }, { "epoch": 0.92, "grad_norm": 15.691280452120386, "learning_rate": 8.990034909228262e-06, "loss": 0.1456, "step": 1288 }, { "epoch": 0.92, "grad_norm": 10.619640637451463, "learning_rate": 8.988292499267885e-06, "loss": 0.0916, "step": 1289 }, { "epoch": 0.92, "grad_norm": 39.48387258198995, "learning_rate": 8.986548756744852e-06, "loss": 0.1195, "step": 1290 }, { "epoch": 0.92, "grad_norm": 8.63610318578367, "learning_rate": 8.98480368224178e-06, "loss": 0.1492, "step": 1291 }, { "epoch": 0.92, "grad_norm": 20.03898408603859, "learning_rate": 8.98305727634173e-06, "loss": 0.1219, "step": 1292 }, { "epoch": 0.92, "grad_norm": 7.857773381391331, "learning_rate": 8.981309539628212e-06, "loss": 0.132, "step": 1293 }, { "epoch": 0.92, "grad_norm": 9.80086758659809, "learning_rate": 8.979560472685174e-06, "loss": 0.1019, "step": 1294 }, { "epoch": 0.92, "grad_norm": 6.467420571204822, "learning_rate": 8.977810076097013e-06, "loss": 0.0927, "step": 1295 }, { "epoch": 0.93, "grad_norm": 10.798830013093253, "learning_rate": 8.97605835044857e-06, "loss": 0.1462, "step": 1296 }, { "epoch": 0.93, "grad_norm": 3.3782928073977945, "learning_rate": 8.974305296325125e-06, "loss": 0.0842, "step": 1297 }, { "epoch": 0.93, "grad_norm": 8.272978733091367, "learning_rate": 8.97255091431241e-06, "loss": 0.1271, "step": 1298 }, { "epoch": 0.93, "grad_norm": 16.81966484700589, "learning_rate": 8.970795204996597e-06, "loss": 0.1229, "step": 1299 }, { "epoch": 0.93, "grad_norm": 8.739832847910312, "learning_rate": 8.969038168964298e-06, "loss": 0.1266, "step": 1300 }, { "epoch": 0.93, "grad_norm": 9.173928247267657, "learning_rate": 8.967279806802576e-06, "loss": 0.1152, "step": 1301 }, { "epoch": 0.93, "grad_norm": 22.14994655091031, "learning_rate": 8.965520119098926e-06, "loss": 0.1097, "step": 1302 }, { "epoch": 0.93, "grad_norm": 54.5742266393221, "learning_rate": 8.9637591064413e-06, "loss": 0.1621, "step": 1303 }, { "epoch": 0.93, "grad_norm": 12.6238048135336, "learning_rate": 8.961996769418077e-06, "loss": 0.0958, "step": 1304 }, { "epoch": 0.93, "grad_norm": 29.088793120301332, "learning_rate": 8.960233108618092e-06, "loss": 0.1401, "step": 1305 }, { "epoch": 0.93, "grad_norm": 51.52087793146708, "learning_rate": 8.958468124630617e-06, "loss": 0.1482, "step": 1306 }, { "epoch": 0.93, "grad_norm": 26.746196577981273, "learning_rate": 8.956701818045363e-06, "loss": 0.1227, "step": 1307 }, { "epoch": 0.93, "grad_norm": 35.98626766193302, "learning_rate": 8.954934189452489e-06, "loss": 0.1271, "step": 1308 }, { "epoch": 0.93, "grad_norm": 10.706810349285421, "learning_rate": 8.953165239442589e-06, "loss": 0.1436, "step": 1309 }, { "epoch": 0.94, "grad_norm": 17.689232861773036, "learning_rate": 8.951394968606704e-06, "loss": 0.1155, "step": 1310 }, { "epoch": 0.94, "grad_norm": 7.4873562096415744, "learning_rate": 8.949623377536314e-06, "loss": 0.1095, "step": 1311 }, { "epoch": 0.94, "grad_norm": 8.977404966444077, "learning_rate": 8.947850466823343e-06, "loss": 0.0917, "step": 1312 }, { "epoch": 0.94, "grad_norm": 8.150139134114927, "learning_rate": 8.946076237060148e-06, "loss": 0.1312, "step": 1313 }, { "epoch": 0.94, "grad_norm": 9.547066209117817, "learning_rate": 8.944300688839538e-06, "loss": 0.1211, "step": 1314 }, { "epoch": 0.94, "grad_norm": 6.574430114106702, "learning_rate": 8.942523822754751e-06, "loss": 0.1184, "step": 1315 }, { "epoch": 0.94, "grad_norm": 13.767237626071879, "learning_rate": 8.940745639399477e-06, "loss": 0.1439, "step": 1316 }, { "epoch": 0.94, "grad_norm": 18.217321006016, "learning_rate": 8.938966139367837e-06, "loss": 0.124, "step": 1317 }, { "epoch": 0.94, "grad_norm": 5.12123329875439, "learning_rate": 8.937185323254395e-06, "loss": 0.0945, "step": 1318 }, { "epoch": 0.94, "grad_norm": 20.992000622397523, "learning_rate": 8.935403191654155e-06, "loss": 0.0821, "step": 1319 }, { "epoch": 0.94, "grad_norm": 20.121514370471925, "learning_rate": 8.933619745162559e-06, "loss": 0.1375, "step": 1320 }, { "epoch": 0.94, "grad_norm": 19.672716362914535, "learning_rate": 8.931834984375492e-06, "loss": 0.1133, "step": 1321 }, { "epoch": 0.94, "grad_norm": 8.854511996956678, "learning_rate": 8.930048909889272e-06, "loss": 0.1428, "step": 1322 }, { "epoch": 0.94, "grad_norm": 11.981569840262182, "learning_rate": 8.928261522300665e-06, "loss": 0.0844, "step": 1323 }, { "epoch": 0.95, "grad_norm": 20.95789494137274, "learning_rate": 8.926472822206869e-06, "loss": 0.1074, "step": 1324 }, { "epoch": 0.95, "grad_norm": 9.085896564148001, "learning_rate": 8.924682810205519e-06, "loss": 0.1525, "step": 1325 }, { "epoch": 0.95, "grad_norm": 5.93521143384227, "learning_rate": 8.922891486894692e-06, "loss": 0.1149, "step": 1326 }, { "epoch": 0.95, "grad_norm": 9.576240348621916, "learning_rate": 8.921098852872904e-06, "loss": 0.123, "step": 1327 }, { "epoch": 0.95, "grad_norm": 6.371219220183408, "learning_rate": 8.919304908739106e-06, "loss": 0.1293, "step": 1328 }, { "epoch": 0.95, "grad_norm": 6.67584580080675, "learning_rate": 8.917509655092691e-06, "loss": 0.1666, "step": 1329 }, { "epoch": 0.95, "grad_norm": 4.493405850966914, "learning_rate": 8.915713092533483e-06, "loss": 0.1056, "step": 1330 }, { "epoch": 0.95, "grad_norm": 13.499498266925373, "learning_rate": 8.913915221661748e-06, "loss": 0.1012, "step": 1331 }, { "epoch": 0.95, "grad_norm": 30.9937070736886, "learning_rate": 8.912116043078188e-06, "loss": 0.1466, "step": 1332 }, { "epoch": 0.95, "grad_norm": 14.4905788898255, "learning_rate": 8.910315557383944e-06, "loss": 0.1387, "step": 1333 }, { "epoch": 0.95, "grad_norm": 8.442340648313488, "learning_rate": 8.90851376518059e-06, "loss": 0.1321, "step": 1334 }, { "epoch": 0.95, "grad_norm": 19.692706543472003, "learning_rate": 8.906710667070136e-06, "loss": 0.1663, "step": 1335 }, { "epoch": 0.95, "grad_norm": 7.236348097631177, "learning_rate": 8.904906263655036e-06, "loss": 0.1521, "step": 1336 }, { "epoch": 0.95, "grad_norm": 12.189125432366566, "learning_rate": 8.903100555538169e-06, "loss": 0.1282, "step": 1337 }, { "epoch": 0.96, "grad_norm": 10.711104406960347, "learning_rate": 8.90129354332286e-06, "loss": 0.146, "step": 1338 }, { "epoch": 0.96, "grad_norm": 33.786095997393936, "learning_rate": 8.899485227612865e-06, "loss": 0.1194, "step": 1339 }, { "epoch": 0.96, "grad_norm": 5.881201085589049, "learning_rate": 8.897675609012372e-06, "loss": 0.1199, "step": 1340 }, { "epoch": 0.96, "grad_norm": 5.300747945263061, "learning_rate": 8.895864688126013e-06, "loss": 0.0984, "step": 1341 }, { "epoch": 0.96, "grad_norm": 19.177339508083687, "learning_rate": 8.894052465558846e-06, "loss": 0.0996, "step": 1342 }, { "epoch": 0.96, "grad_norm": 10.531710013911395, "learning_rate": 8.892238941916372e-06, "loss": 0.1389, "step": 1343 }, { "epoch": 0.96, "grad_norm": 8.131493938719796, "learning_rate": 8.890424117804522e-06, "loss": 0.1129, "step": 1344 }, { "epoch": 0.96, "grad_norm": 12.104920527712904, "learning_rate": 8.88860799382966e-06, "loss": 0.1772, "step": 1345 }, { "epoch": 0.96, "grad_norm": 22.119517879117527, "learning_rate": 8.88679057059859e-06, "loss": 0.1145, "step": 1346 }, { "epoch": 0.96, "grad_norm": 4.741445859808426, "learning_rate": 8.884971848718544e-06, "loss": 0.1284, "step": 1347 }, { "epoch": 0.96, "grad_norm": 4.598276011199355, "learning_rate": 8.883151828797194e-06, "loss": 0.1213, "step": 1348 }, { "epoch": 0.96, "grad_norm": 19.888338261433105, "learning_rate": 8.88133051144264e-06, "loss": 0.1331, "step": 1349 }, { "epoch": 0.96, "grad_norm": 26.720524373083247, "learning_rate": 8.87950789726342e-06, "loss": 0.1427, "step": 1350 }, { "epoch": 0.96, "grad_norm": 9.296969295372449, "learning_rate": 8.8776839868685e-06, "loss": 0.1285, "step": 1351 }, { "epoch": 0.97, "grad_norm": 6.519639013034927, "learning_rate": 8.875858780867286e-06, "loss": 0.1084, "step": 1352 }, { "epoch": 0.97, "grad_norm": 5.140053963785225, "learning_rate": 8.87403227986961e-06, "loss": 0.1273, "step": 1353 }, { "epoch": 0.97, "grad_norm": 6.666680845030289, "learning_rate": 8.872204484485743e-06, "loss": 0.1301, "step": 1354 }, { "epoch": 0.97, "grad_norm": 32.518052200093564, "learning_rate": 8.870375395326384e-06, "loss": 0.1344, "step": 1355 }, { "epoch": 0.97, "grad_norm": 24.980724053437154, "learning_rate": 8.868545013002665e-06, "loss": 0.1077, "step": 1356 }, { "epoch": 0.97, "grad_norm": 15.549275682764785, "learning_rate": 8.866713338126152e-06, "loss": 0.13, "step": 1357 }, { "epoch": 0.97, "grad_norm": 10.581867589291187, "learning_rate": 8.86488037130884e-06, "loss": 0.13, "step": 1358 }, { "epoch": 0.97, "grad_norm": 13.682883003986115, "learning_rate": 8.863046113163158e-06, "loss": 0.0698, "step": 1359 }, { "epoch": 0.97, "grad_norm": 67.34606591529005, "learning_rate": 8.861210564301967e-06, "loss": 0.2075, "step": 1360 }, { "epoch": 0.97, "grad_norm": 16.000577042347064, "learning_rate": 8.859373725338558e-06, "loss": 0.1465, "step": 1361 }, { "epoch": 0.97, "grad_norm": 5.734984778309666, "learning_rate": 8.857535596886652e-06, "loss": 0.1035, "step": 1362 }, { "epoch": 0.97, "grad_norm": 19.14624542885837, "learning_rate": 8.855696179560402e-06, "loss": 0.1437, "step": 1363 }, { "epoch": 0.97, "grad_norm": 72.76158246694781, "learning_rate": 8.85385547397439e-06, "loss": 0.1649, "step": 1364 }, { "epoch": 0.97, "grad_norm": 6.2260099295276845, "learning_rate": 8.852013480743632e-06, "loss": 0.1321, "step": 1365 }, { "epoch": 0.98, "grad_norm": 9.131312974812547, "learning_rate": 8.850170200483573e-06, "loss": 0.1362, "step": 1366 }, { "epoch": 0.98, "grad_norm": 11.780427057540104, "learning_rate": 8.848325633810083e-06, "loss": 0.0975, "step": 1367 }, { "epoch": 0.98, "grad_norm": 38.41930656679251, "learning_rate": 8.84647978133947e-06, "loss": 0.1438, "step": 1368 }, { "epoch": 0.98, "grad_norm": 31.457034993977672, "learning_rate": 8.844632643688467e-06, "loss": 0.1525, "step": 1369 }, { "epoch": 0.98, "grad_norm": 9.017217356990798, "learning_rate": 8.842784221474237e-06, "loss": 0.1115, "step": 1370 }, { "epoch": 0.98, "grad_norm": 15.961687118243042, "learning_rate": 8.840934515314372e-06, "loss": 0.173, "step": 1371 }, { "epoch": 0.98, "grad_norm": 35.13831841169741, "learning_rate": 8.839083525826893e-06, "loss": 0.1478, "step": 1372 }, { "epoch": 0.98, "grad_norm": 56.57243122107551, "learning_rate": 8.837231253630247e-06, "loss": 0.1528, "step": 1373 }, { "epoch": 0.98, "grad_norm": 6.078243066957646, "learning_rate": 8.835377699343318e-06, "loss": 0.1129, "step": 1374 }, { "epoch": 0.98, "grad_norm": 4.098116188214989, "learning_rate": 8.83352286358541e-06, "loss": 0.1077, "step": 1375 }, { "epoch": 0.98, "grad_norm": 27.118006043141744, "learning_rate": 8.83166674697626e-06, "loss": 0.1112, "step": 1376 }, { "epoch": 0.98, "grad_norm": 35.911096298188546, "learning_rate": 8.829809350136027e-06, "loss": 0.1365, "step": 1377 }, { "epoch": 0.98, "grad_norm": 25.339967860287878, "learning_rate": 8.827950673685306e-06, "loss": 0.1319, "step": 1378 }, { "epoch": 0.98, "grad_norm": 3.559736283305232, "learning_rate": 8.826090718245112e-06, "loss": 0.1271, "step": 1379 }, { "epoch": 0.99, "grad_norm": 12.481495463900032, "learning_rate": 8.824229484436894e-06, "loss": 0.1123, "step": 1380 }, { "epoch": 0.99, "grad_norm": 6.710361054334273, "learning_rate": 8.822366972882523e-06, "loss": 0.1602, "step": 1381 }, { "epoch": 0.99, "grad_norm": 46.045622161560836, "learning_rate": 8.820503184204299e-06, "loss": 0.1102, "step": 1382 }, { "epoch": 0.99, "grad_norm": 57.381890629821335, "learning_rate": 8.818638119024949e-06, "loss": 0.1418, "step": 1383 }, { "epoch": 0.99, "grad_norm": 32.56557318410978, "learning_rate": 8.816771777967623e-06, "loss": 0.1338, "step": 1384 }, { "epoch": 0.99, "grad_norm": 59.30261725642934, "learning_rate": 8.814904161655904e-06, "loss": 0.1843, "step": 1385 }, { "epoch": 0.99, "grad_norm": 55.152942998068376, "learning_rate": 8.813035270713796e-06, "loss": 0.144, "step": 1386 }, { "epoch": 0.99, "grad_norm": 54.60066261642631, "learning_rate": 8.811165105765732e-06, "loss": 0.14, "step": 1387 }, { "epoch": 0.99, "grad_norm": 5.534246320273303, "learning_rate": 8.809293667436565e-06, "loss": 0.124, "step": 1388 }, { "epoch": 0.99, "grad_norm": 25.51275813460598, "learning_rate": 8.80742095635158e-06, "loss": 0.1329, "step": 1389 }, { "epoch": 0.99, "grad_norm": 41.23937073867847, "learning_rate": 8.805546973136481e-06, "loss": 0.139, "step": 1390 }, { "epoch": 0.99, "grad_norm": 44.99322436803483, "learning_rate": 8.803671718417407e-06, "loss": 0.1451, "step": 1391 }, { "epoch": 0.99, "grad_norm": 19.302534314720663, "learning_rate": 8.80179519282091e-06, "loss": 0.1173, "step": 1392 }, { "epoch": 0.99, "grad_norm": 4.692753538243162, "learning_rate": 8.799917396973976e-06, "loss": 0.0931, "step": 1393 }, { "epoch": 1.0, "grad_norm": 55.478799678782806, "learning_rate": 8.798038331504008e-06, "loss": 0.1469, "step": 1394 }, { "epoch": 1.0, "grad_norm": 51.060808138084866, "learning_rate": 8.79615799703884e-06, "loss": 0.1503, "step": 1395 }, { "epoch": 1.0, "grad_norm": 18.71241704458466, "learning_rate": 8.794276394206722e-06, "loss": 0.1154, "step": 1396 }, { "epoch": 1.0, "grad_norm": 20.47077129185746, "learning_rate": 8.792393523636337e-06, "loss": 0.1069, "step": 1397 }, { "epoch": 1.0, "grad_norm": 50.377448768013004, "learning_rate": 8.790509385956784e-06, "loss": 0.1875, "step": 1398 }, { "epoch": 1.0, "grad_norm": 24.38834863041375, "learning_rate": 8.788623981797592e-06, "loss": 0.145, "step": 1399 }, { "epoch": 1.0, "grad_norm": 18.459905174452402, "learning_rate": 8.786737311788708e-06, "loss": 0.1196, "step": 1400 }, { "epoch": 1.0, "grad_norm": 3.4253124318053634, "learning_rate": 8.784849376560503e-06, "loss": 0.0878, "step": 1401 }, { "epoch": 1.0, "grad_norm": 32.20871321310459, "learning_rate": 8.78296017674377e-06, "loss": 0.0911, "step": 1402 }, { "epoch": 1.0, "grad_norm": 29.868559521129765, "learning_rate": 8.781069712969726e-06, "loss": 0.0909, "step": 1403 }, { "epoch": 1.0, "grad_norm": 28.47627393710747, "learning_rate": 8.779177985870012e-06, "loss": 0.0869, "step": 1404 }, { "epoch": 1.0, "grad_norm": 4.353515032854893, "learning_rate": 8.77728499607669e-06, "loss": 0.0629, "step": 1405 }, { "epoch": 1.0, "grad_norm": 23.98867948652184, "learning_rate": 8.775390744222238e-06, "loss": 0.1105, "step": 1406 }, { "epoch": 1.0, "grad_norm": 20.72490090675031, "learning_rate": 8.773495230939567e-06, "loss": 0.0758, "step": 1407 }, { "epoch": 1.0, "grad_norm": 28.331372328309556, "learning_rate": 8.771598456861998e-06, "loss": 0.0789, "step": 1408 }, { "epoch": 1.01, "grad_norm": 6.026063050859235, "learning_rate": 8.769700422623283e-06, "loss": 0.0461, "step": 1409 }, { "epoch": 1.01, "grad_norm": 26.292659193113025, "learning_rate": 8.767801128857588e-06, "loss": 0.1025, "step": 1410 }, { "epoch": 1.01, "grad_norm": 7.794861855473636, "learning_rate": 8.765900576199502e-06, "loss": 0.0879, "step": 1411 }, { "epoch": 1.01, "grad_norm": 18.682338368358316, "learning_rate": 8.763998765284036e-06, "loss": 0.0944, "step": 1412 }, { "epoch": 1.01, "grad_norm": 11.412070873783431, "learning_rate": 8.76209569674662e-06, "loss": 0.0685, "step": 1413 }, { "epoch": 1.01, "grad_norm": 10.884099960069042, "learning_rate": 8.760191371223104e-06, "loss": 0.0643, "step": 1414 }, { "epoch": 1.01, "grad_norm": 9.034014941112474, "learning_rate": 8.758285789349759e-06, "loss": 0.1039, "step": 1415 }, { "epoch": 1.01, "grad_norm": 7.709293674256186, "learning_rate": 8.756378951763277e-06, "loss": 0.093, "step": 1416 }, { "epoch": 1.01, "grad_norm": 4.1454163736142275, "learning_rate": 8.754470859100765e-06, "loss": 0.0861, "step": 1417 }, { "epoch": 1.01, "grad_norm": 17.634562346783152, "learning_rate": 8.752561511999754e-06, "loss": 0.0907, "step": 1418 }, { "epoch": 1.01, "grad_norm": 5.054823596253129, "learning_rate": 8.750650911098193e-06, "loss": 0.0757, "step": 1419 }, { "epoch": 1.01, "grad_norm": 10.241226207922915, "learning_rate": 8.748739057034447e-06, "loss": 0.081, "step": 1420 }, { "epoch": 1.01, "grad_norm": 4.495722997870084, "learning_rate": 8.746825950447302e-06, "loss": 0.0734, "step": 1421 }, { "epoch": 1.01, "grad_norm": 17.82904323610015, "learning_rate": 8.744911591975967e-06, "loss": 0.0651, "step": 1422 }, { "epoch": 1.02, "grad_norm": 17.484241379617817, "learning_rate": 8.742995982260059e-06, "loss": 0.0819, "step": 1423 }, { "epoch": 1.02, "grad_norm": 18.69316674886684, "learning_rate": 8.741079121939621e-06, "loss": 0.0981, "step": 1424 }, { "epoch": 1.02, "grad_norm": 4.329418925647879, "learning_rate": 8.739161011655113e-06, "loss": 0.087, "step": 1425 }, { "epoch": 1.02, "grad_norm": 4.751184813489952, "learning_rate": 8.737241652047408e-06, "loss": 0.086, "step": 1426 }, { "epoch": 1.02, "grad_norm": 6.326549122631243, "learning_rate": 8.735321043757805e-06, "loss": 0.0751, "step": 1427 }, { "epoch": 1.02, "grad_norm": 10.632712593121198, "learning_rate": 8.73339918742801e-06, "loss": 0.0657, "step": 1428 }, { "epoch": 1.02, "grad_norm": 15.207265708492715, "learning_rate": 8.731476083700154e-06, "loss": 0.117, "step": 1429 }, { "epoch": 1.02, "grad_norm": 11.850694276149138, "learning_rate": 8.729551733216779e-06, "loss": 0.0742, "step": 1430 }, { "epoch": 1.02, "grad_norm": 12.467249641043043, "learning_rate": 8.727626136620848e-06, "loss": 0.0967, "step": 1431 }, { "epoch": 1.02, "grad_norm": 16.67115883838518, "learning_rate": 8.725699294555739e-06, "loss": 0.0746, "step": 1432 }, { "epoch": 1.02, "grad_norm": 21.221164427537694, "learning_rate": 8.723771207665245e-06, "loss": 0.0936, "step": 1433 }, { "epoch": 1.02, "grad_norm": 3.9808876084963707, "learning_rate": 8.721841876593576e-06, "loss": 0.0742, "step": 1434 }, { "epoch": 1.02, "grad_norm": 13.177543099730642, "learning_rate": 8.719911301985355e-06, "loss": 0.0726, "step": 1435 }, { "epoch": 1.02, "grad_norm": 5.383309194526871, "learning_rate": 8.717979484485628e-06, "loss": 0.0803, "step": 1436 }, { "epoch": 1.03, "grad_norm": 8.67535542961095, "learning_rate": 8.716046424739845e-06, "loss": 0.0964, "step": 1437 }, { "epoch": 1.03, "grad_norm": 23.546000226981334, "learning_rate": 8.714112123393882e-06, "loss": 0.0906, "step": 1438 }, { "epoch": 1.03, "grad_norm": 5.478616913562366, "learning_rate": 8.712176581094025e-06, "loss": 0.1, "step": 1439 }, { "epoch": 1.03, "grad_norm": 12.072295746434312, "learning_rate": 8.710239798486972e-06, "loss": 0.1013, "step": 1440 }, { "epoch": 1.03, "grad_norm": 6.156588542394663, "learning_rate": 8.708301776219838e-06, "loss": 0.0717, "step": 1441 }, { "epoch": 1.03, "grad_norm": 7.03749840440953, "learning_rate": 8.706362514940153e-06, "loss": 0.071, "step": 1442 }, { "epoch": 1.03, "grad_norm": 21.908593717607186, "learning_rate": 8.704422015295861e-06, "loss": 0.0908, "step": 1443 }, { "epoch": 1.03, "grad_norm": 12.743381450833912, "learning_rate": 8.702480277935319e-06, "loss": 0.093, "step": 1444 }, { "epoch": 1.03, "grad_norm": 12.237240562039307, "learning_rate": 8.700537303507298e-06, "loss": 0.0611, "step": 1445 }, { "epoch": 1.03, "grad_norm": 31.205512369091654, "learning_rate": 8.69859309266098e-06, "loss": 0.1085, "step": 1446 }, { "epoch": 1.03, "grad_norm": 6.508224635069914, "learning_rate": 8.696647646045962e-06, "loss": 0.0961, "step": 1447 }, { "epoch": 1.03, "grad_norm": 6.020337116432563, "learning_rate": 8.694700964312257e-06, "loss": 0.1194, "step": 1448 }, { "epoch": 1.03, "grad_norm": 11.85315501709236, "learning_rate": 8.692753048110283e-06, "loss": 0.1057, "step": 1449 }, { "epoch": 1.03, "grad_norm": 16.76312648328666, "learning_rate": 8.690803898090878e-06, "loss": 0.0859, "step": 1450 }, { "epoch": 1.04, "grad_norm": 18.181912184082726, "learning_rate": 8.68885351490529e-06, "loss": 0.0883, "step": 1451 }, { "epoch": 1.04, "grad_norm": 15.561501662544323, "learning_rate": 8.686901899205177e-06, "loss": 0.0615, "step": 1452 }, { "epoch": 1.04, "grad_norm": 10.141158740015651, "learning_rate": 8.684949051642609e-06, "loss": 0.0827, "step": 1453 }, { "epoch": 1.04, "grad_norm": 5.131395886113488, "learning_rate": 8.68299497287007e-06, "loss": 0.0664, "step": 1454 }, { "epoch": 1.04, "grad_norm": 35.18287088578419, "learning_rate": 8.681039663540454e-06, "loss": 0.0862, "step": 1455 }, { "epoch": 1.04, "grad_norm": 6.000912763873758, "learning_rate": 8.679083124307064e-06, "loss": 0.0785, "step": 1456 }, { "epoch": 1.04, "grad_norm": 3.954545314588621, "learning_rate": 8.67712535582362e-06, "loss": 0.0663, "step": 1457 }, { "epoch": 1.04, "grad_norm": 16.363053312350825, "learning_rate": 8.675166358744247e-06, "loss": 0.0945, "step": 1458 }, { "epoch": 1.04, "grad_norm": 32.23364016473821, "learning_rate": 8.67320613372348e-06, "loss": 0.1357, "step": 1459 }, { "epoch": 1.04, "grad_norm": 19.06812434731879, "learning_rate": 8.67124468141627e-06, "loss": 0.0738, "step": 1460 }, { "epoch": 1.04, "grad_norm": 9.336167974534424, "learning_rate": 8.669282002477975e-06, "loss": 0.116, "step": 1461 }, { "epoch": 1.04, "grad_norm": 30.216375689397136, "learning_rate": 8.66731809756436e-06, "loss": 0.0871, "step": 1462 }, { "epoch": 1.04, "grad_norm": 9.454651839939022, "learning_rate": 8.665352967331604e-06, "loss": 0.0981, "step": 1463 }, { "epoch": 1.04, "grad_norm": 5.880572461820449, "learning_rate": 8.66338661243629e-06, "loss": 0.0784, "step": 1464 }, { "epoch": 1.05, "grad_norm": 48.047477471559795, "learning_rate": 8.661419033535419e-06, "loss": 0.1255, "step": 1465 }, { "epoch": 1.05, "grad_norm": 5.296796150731295, "learning_rate": 8.659450231286392e-06, "loss": 0.0761, "step": 1466 }, { "epoch": 1.05, "grad_norm": 23.57114189695642, "learning_rate": 8.657480206347024e-06, "loss": 0.0864, "step": 1467 }, { "epoch": 1.05, "grad_norm": 25.506822238745904, "learning_rate": 8.655508959375536e-06, "loss": 0.0881, "step": 1468 }, { "epoch": 1.05, "grad_norm": 8.686210935110786, "learning_rate": 8.653536491030559e-06, "loss": 0.0715, "step": 1469 }, { "epoch": 1.05, "grad_norm": 7.926723814269391, "learning_rate": 8.651562801971131e-06, "loss": 0.0723, "step": 1470 }, { "epoch": 1.05, "grad_norm": 11.287008911087467, "learning_rate": 8.649587892856698e-06, "loss": 0.0819, "step": 1471 }, { "epoch": 1.05, "grad_norm": 17.18659129208043, "learning_rate": 8.647611764347114e-06, "loss": 0.0782, "step": 1472 }, { "epoch": 1.05, "grad_norm": 7.819540078505, "learning_rate": 8.64563441710264e-06, "loss": 0.1422, "step": 1473 }, { "epoch": 1.05, "grad_norm": 17.558856807362183, "learning_rate": 8.643655851783947e-06, "loss": 0.0733, "step": 1474 }, { "epoch": 1.05, "grad_norm": 16.51338622173928, "learning_rate": 8.641676069052104e-06, "loss": 0.0606, "step": 1475 }, { "epoch": 1.05, "grad_norm": 7.477740828151368, "learning_rate": 8.639695069568602e-06, "loss": 0.0682, "step": 1476 }, { "epoch": 1.05, "grad_norm": 14.973985830755504, "learning_rate": 8.637712853995324e-06, "loss": 0.105, "step": 1477 }, { "epoch": 1.05, "grad_norm": 15.603054833838772, "learning_rate": 8.635729422994566e-06, "loss": 0.087, "step": 1478 }, { "epoch": 1.06, "grad_norm": 6.290026085929057, "learning_rate": 8.633744777229029e-06, "loss": 0.0738, "step": 1479 }, { "epoch": 1.06, "grad_norm": 3.7390029716415563, "learning_rate": 8.63175891736182e-06, "loss": 0.0673, "step": 1480 }, { "epoch": 1.06, "grad_norm": 5.473545590164696, "learning_rate": 8.629771844056452e-06, "loss": 0.0686, "step": 1481 }, { "epoch": 1.06, "grad_norm": 14.272289047677544, "learning_rate": 8.627783557976846e-06, "loss": 0.0627, "step": 1482 }, { "epoch": 1.06, "grad_norm": 12.905709838650964, "learning_rate": 8.62579405978732e-06, "loss": 0.0811, "step": 1483 }, { "epoch": 1.06, "grad_norm": 13.0467547255787, "learning_rate": 8.623803350152606e-06, "loss": 0.0734, "step": 1484 }, { "epoch": 1.06, "grad_norm": 9.523481106503748, "learning_rate": 8.621811429737837e-06, "loss": 0.0906, "step": 1485 }, { "epoch": 1.06, "grad_norm": 26.36033723371387, "learning_rate": 8.619818299208548e-06, "loss": 0.0687, "step": 1486 }, { "epoch": 1.06, "grad_norm": 13.671703283276997, "learning_rate": 8.617823959230683e-06, "loss": 0.0829, "step": 1487 }, { "epoch": 1.06, "grad_norm": 13.44761997098398, "learning_rate": 8.615828410470589e-06, "loss": 0.0809, "step": 1488 }, { "epoch": 1.06, "grad_norm": 12.38686214081961, "learning_rate": 8.613831653595013e-06, "loss": 0.1403, "step": 1489 }, { "epoch": 1.06, "grad_norm": 29.543403014919598, "learning_rate": 8.61183368927111e-06, "loss": 0.0684, "step": 1490 }, { "epoch": 1.06, "grad_norm": 9.143382134106462, "learning_rate": 8.609834518166439e-06, "loss": 0.0809, "step": 1491 }, { "epoch": 1.06, "grad_norm": 14.121857063675124, "learning_rate": 8.607834140948958e-06, "loss": 0.0721, "step": 1492 }, { "epoch": 1.07, "grad_norm": 14.681563500765165, "learning_rate": 8.60583255828703e-06, "loss": 0.0598, "step": 1493 }, { "epoch": 1.07, "grad_norm": 11.402342240873446, "learning_rate": 8.603829770849421e-06, "loss": 0.0837, "step": 1494 }, { "epoch": 1.07, "grad_norm": 34.80956350140246, "learning_rate": 8.601825779305302e-06, "loss": 0.0812, "step": 1495 }, { "epoch": 1.07, "grad_norm": 25.664482199439714, "learning_rate": 8.59982058432424e-06, "loss": 0.0872, "step": 1496 }, { "epoch": 1.07, "grad_norm": 4.942475728354198, "learning_rate": 8.597814186576212e-06, "loss": 0.0752, "step": 1497 }, { "epoch": 1.07, "grad_norm": 21.889841226015804, "learning_rate": 8.595806586731589e-06, "loss": 0.0869, "step": 1498 }, { "epoch": 1.07, "grad_norm": 47.39501041858811, "learning_rate": 8.59379778546115e-06, "loss": 0.1328, "step": 1499 }, { "epoch": 1.07, "grad_norm": 23.843981585654273, "learning_rate": 8.591787783436073e-06, "loss": 0.0833, "step": 1500 }, { "epoch": 1.07, "eval_avg_AUC": 0.8187337540188822, "eval_avg_Accuracy": 0.7427470159151194, "eval_avg_Accuracy-right": 0.8404199817399244, "eval_avg_Accuracy-wrong": 0.5724357516488515, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.6880919420787215, "eval_last_AUC": 0.8358783930516435, "eval_last_Accuracy": 0.7628066976127321, "eval_last_Accuracy-right": 0.8261379940002609, "eval_last_Accuracy-wrong": 0.6523766204230157, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.7015684504151366, "eval_max_AUC": 0.7842336113536985, "eval_max_Accuracy": 0.658363726790451, "eval_max_Accuracy-right": 0.9557845311073432, "eval_max_Accuracy-wrong": 0.13975437798498977, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.6534520933484, "eval_min_AUC": 0.822015164482916, "eval_min_Accuracy": 0.7524452917771883, "eval_min_Accuracy-right": 0.7425981479066127, "eval_min_Accuracy-wrong": 0.7696156470320673, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.6912215159082531, "eval_prod_AUC": 0.819309531656854, "eval_prod_Accuracy": 0.5770888594164456, "eval_prod_Accuracy-right": 0.35946263205947565, "eval_prod_Accuracy-wrong": 0.9565612917898567, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.6874273712080468, "eval_runtime": 248.3319, "eval_samples_per_second": 97.16, "eval_steps_per_second": 3.036, "eval_sum_AUC": 0.7137200687510031, "eval_sum_Accuracy": 0.64253149867374, "eval_sum_Accuracy-right": 0.9941959045258901, "eval_sum_Accuracy-wrong": 0.029338185126222424, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.6771763816044019, "step": 1500 }, { "epoch": 1.07, "grad_norm": 22.34310628383116, "learning_rate": 8.589776581327936e-06, "loss": 0.1263, "step": 1501 }, { "epoch": 1.07, "grad_norm": 22.26638647085685, "learning_rate": 8.587764179808716e-06, "loss": 0.0922, "step": 1502 }, { "epoch": 1.07, "grad_norm": 34.88623829684697, "learning_rate": 8.5857505795508e-06, "loss": 0.0751, "step": 1503 }, { "epoch": 1.07, "grad_norm": 19.362303625609805, "learning_rate": 8.583735781226964e-06, "loss": 0.0897, "step": 1504 }, { "epoch": 1.07, "grad_norm": 15.607349323647277, "learning_rate": 8.581719785510391e-06, "loss": 0.0867, "step": 1505 }, { "epoch": 1.07, "grad_norm": 11.194617495715056, "learning_rate": 8.579702593074666e-06, "loss": 0.0877, "step": 1506 }, { "epoch": 1.08, "grad_norm": 19.404643890863266, "learning_rate": 8.577684204593767e-06, "loss": 0.0745, "step": 1507 }, { "epoch": 1.08, "grad_norm": 12.482102245943857, "learning_rate": 8.575664620742073e-06, "loss": 0.0892, "step": 1508 }, { "epoch": 1.08, "grad_norm": 17.587811648576377, "learning_rate": 8.57364384219437e-06, "loss": 0.0742, "step": 1509 }, { "epoch": 1.08, "grad_norm": 19.655816441505596, "learning_rate": 8.571621869625835e-06, "loss": 0.093, "step": 1510 }, { "epoch": 1.08, "grad_norm": 30.03043984591091, "learning_rate": 8.569598703712045e-06, "loss": 0.0831, "step": 1511 }, { "epoch": 1.08, "grad_norm": 21.842875103984415, "learning_rate": 8.56757434512898e-06, "loss": 0.0834, "step": 1512 }, { "epoch": 1.08, "grad_norm": 12.39324221817927, "learning_rate": 8.565548794553016e-06, "loss": 0.0818, "step": 1513 }, { "epoch": 1.08, "grad_norm": 43.49885915979903, "learning_rate": 8.563522052660925e-06, "loss": 0.1049, "step": 1514 }, { "epoch": 1.08, "grad_norm": 6.834088679375381, "learning_rate": 8.561494120129878e-06, "loss": 0.1014, "step": 1515 }, { "epoch": 1.08, "grad_norm": 10.466927403243657, "learning_rate": 8.55946499763745e-06, "loss": 0.0749, "step": 1516 }, { "epoch": 1.08, "grad_norm": 10.911200002806018, "learning_rate": 8.557434685861604e-06, "loss": 0.0834, "step": 1517 }, { "epoch": 1.08, "grad_norm": 33.65082017177862, "learning_rate": 8.555403185480706e-06, "loss": 0.0867, "step": 1518 }, { "epoch": 1.08, "grad_norm": 12.496700495977283, "learning_rate": 8.553370497173518e-06, "loss": 0.0687, "step": 1519 }, { "epoch": 1.08, "grad_norm": 13.198970575902443, "learning_rate": 8.551336621619202e-06, "loss": 0.1044, "step": 1520 }, { "epoch": 1.09, "grad_norm": 6.252606767967176, "learning_rate": 8.549301559497309e-06, "loss": 0.0756, "step": 1521 }, { "epoch": 1.09, "grad_norm": 19.71387444708082, "learning_rate": 8.547265311487794e-06, "loss": 0.0796, "step": 1522 }, { "epoch": 1.09, "grad_norm": 13.494983831348298, "learning_rate": 8.545227878271004e-06, "loss": 0.0994, "step": 1523 }, { "epoch": 1.09, "grad_norm": 31.544098364929408, "learning_rate": 8.543189260527685e-06, "loss": 0.0847, "step": 1524 }, { "epoch": 1.09, "grad_norm": 7.203150409865368, "learning_rate": 8.541149458938972e-06, "loss": 0.0718, "step": 1525 }, { "epoch": 1.09, "grad_norm": 6.754617592326723, "learning_rate": 8.539108474186408e-06, "loss": 0.0932, "step": 1526 }, { "epoch": 1.09, "grad_norm": 28.549577345821078, "learning_rate": 8.53706630695192e-06, "loss": 0.0979, "step": 1527 }, { "epoch": 1.09, "grad_norm": 22.622866020467605, "learning_rate": 8.535022957917833e-06, "loss": 0.0777, "step": 1528 }, { "epoch": 1.09, "grad_norm": 12.70378295713489, "learning_rate": 8.53297842776687e-06, "loss": 0.1194, "step": 1529 }, { "epoch": 1.09, "grad_norm": 9.841315864137869, "learning_rate": 8.530932717182148e-06, "loss": 0.1196, "step": 1530 }, { "epoch": 1.09, "grad_norm": 38.579351262870325, "learning_rate": 8.528885826847173e-06, "loss": 0.0726, "step": 1531 }, { "epoch": 1.09, "grad_norm": 29.95101519238097, "learning_rate": 8.52683775744585e-06, "loss": 0.0677, "step": 1532 }, { "epoch": 1.09, "grad_norm": 18.422090044359035, "learning_rate": 8.524788509662478e-06, "loss": 0.069, "step": 1533 }, { "epoch": 1.09, "grad_norm": 18.93769711941636, "learning_rate": 8.522738084181749e-06, "loss": 0.0963, "step": 1534 }, { "epoch": 1.1, "grad_norm": 22.851640489831993, "learning_rate": 8.52068648168875e-06, "loss": 0.0773, "step": 1535 }, { "epoch": 1.1, "grad_norm": 51.74572885384836, "learning_rate": 8.518633702868955e-06, "loss": 0.1013, "step": 1536 }, { "epoch": 1.1, "grad_norm": 12.047738013956764, "learning_rate": 8.516579748408237e-06, "loss": 0.0697, "step": 1537 }, { "epoch": 1.1, "grad_norm": 8.15886762195816, "learning_rate": 8.514524618992864e-06, "loss": 0.0896, "step": 1538 }, { "epoch": 1.1, "grad_norm": 47.185535045680865, "learning_rate": 8.51246831530949e-06, "loss": 0.1111, "step": 1539 }, { "epoch": 1.1, "grad_norm": 56.966239713258744, "learning_rate": 8.510410838045165e-06, "loss": 0.1395, "step": 1540 }, { "epoch": 1.1, "grad_norm": 13.443718359824823, "learning_rate": 8.508352187887329e-06, "loss": 0.0826, "step": 1541 }, { "epoch": 1.1, "grad_norm": 21.80564065604766, "learning_rate": 8.506292365523816e-06, "loss": 0.0814, "step": 1542 }, { "epoch": 1.1, "grad_norm": 24.864346346179897, "learning_rate": 8.504231371642852e-06, "loss": 0.1256, "step": 1543 }, { "epoch": 1.1, "grad_norm": 54.020150916277935, "learning_rate": 8.502169206933053e-06, "loss": 0.1006, "step": 1544 }, { "epoch": 1.1, "grad_norm": 25.153860020715037, "learning_rate": 8.500105872083424e-06, "loss": 0.0862, "step": 1545 }, { "epoch": 1.1, "grad_norm": 8.72803160180219, "learning_rate": 8.498041367783367e-06, "loss": 0.0757, "step": 1546 }, { "epoch": 1.1, "grad_norm": 44.857111526137345, "learning_rate": 8.49597569472267e-06, "loss": 0.1027, "step": 1547 }, { "epoch": 1.1, "grad_norm": 39.333400063483836, "learning_rate": 8.493908853591515e-06, "loss": 0.0902, "step": 1548 }, { "epoch": 1.11, "grad_norm": 18.48890358440609, "learning_rate": 8.491840845080467e-06, "loss": 0.0837, "step": 1549 }, { "epoch": 1.11, "grad_norm": 5.104802433608296, "learning_rate": 8.489771669880489e-06, "loss": 0.0821, "step": 1550 }, { "epoch": 1.11, "grad_norm": 17.233922736828326, "learning_rate": 8.487701328682932e-06, "loss": 0.0866, "step": 1551 }, { "epoch": 1.11, "grad_norm": 31.87602778903619, "learning_rate": 8.485629822179533e-06, "loss": 0.084, "step": 1552 }, { "epoch": 1.11, "grad_norm": 34.96613218097158, "learning_rate": 8.483557151062423e-06, "loss": 0.0948, "step": 1553 }, { "epoch": 1.11, "grad_norm": 5.749975742639682, "learning_rate": 8.481483316024117e-06, "loss": 0.0853, "step": 1554 }, { "epoch": 1.11, "grad_norm": 12.11538066802083, "learning_rate": 8.479408317757525e-06, "loss": 0.11, "step": 1555 }, { "epoch": 1.11, "grad_norm": 35.93167259883927, "learning_rate": 8.477332156955942e-06, "loss": 0.0968, "step": 1556 }, { "epoch": 1.11, "grad_norm": 45.13111946912449, "learning_rate": 8.475254834313051e-06, "loss": 0.1176, "step": 1557 }, { "epoch": 1.11, "grad_norm": 8.099342716236617, "learning_rate": 8.473176350522925e-06, "loss": 0.0784, "step": 1558 }, { "epoch": 1.11, "grad_norm": 6.454912943269717, "learning_rate": 8.471096706280022e-06, "loss": 0.1095, "step": 1559 }, { "epoch": 1.11, "grad_norm": 27.021317013221832, "learning_rate": 8.469015902279191e-06, "loss": 0.0663, "step": 1560 }, { "epoch": 1.11, "grad_norm": 34.78660476793632, "learning_rate": 8.466933939215669e-06, "loss": 0.1093, "step": 1561 }, { "epoch": 1.11, "grad_norm": 21.9565790223612, "learning_rate": 8.464850817785075e-06, "loss": 0.0702, "step": 1562 }, { "epoch": 1.12, "grad_norm": 11.065785710336767, "learning_rate": 8.462766538683422e-06, "loss": 0.0822, "step": 1563 }, { "epoch": 1.12, "grad_norm": 27.319906992202778, "learning_rate": 8.460681102607106e-06, "loss": 0.0803, "step": 1564 }, { "epoch": 1.12, "grad_norm": 12.384719292234578, "learning_rate": 8.45859451025291e-06, "loss": 0.0873, "step": 1565 }, { "epoch": 1.12, "grad_norm": 8.592013770399037, "learning_rate": 8.456506762317998e-06, "loss": 0.1086, "step": 1566 }, { "epoch": 1.12, "grad_norm": 15.454204756646234, "learning_rate": 8.454417859499932e-06, "loss": 0.1068, "step": 1567 }, { "epoch": 1.12, "grad_norm": 13.97423432264866, "learning_rate": 8.45232780249665e-06, "loss": 0.0629, "step": 1568 }, { "epoch": 1.12, "grad_norm": 8.11051784415923, "learning_rate": 8.450236592006481e-06, "loss": 0.0693, "step": 1569 }, { "epoch": 1.12, "grad_norm": 10.52568011914339, "learning_rate": 8.448144228728135e-06, "loss": 0.0842, "step": 1570 }, { "epoch": 1.12, "grad_norm": 13.957701644591483, "learning_rate": 8.446050713360711e-06, "loss": 0.0625, "step": 1571 }, { "epoch": 1.12, "grad_norm": 8.651605410737629, "learning_rate": 8.443956046603692e-06, "loss": 0.0853, "step": 1572 }, { "epoch": 1.12, "grad_norm": 11.215401871180575, "learning_rate": 8.441860229156944e-06, "loss": 0.0886, "step": 1573 }, { "epoch": 1.12, "grad_norm": 16.2620319467978, "learning_rate": 8.439763261720716e-06, "loss": 0.1127, "step": 1574 }, { "epoch": 1.12, "grad_norm": 19.65795515380451, "learning_rate": 8.43766514499565e-06, "loss": 0.0867, "step": 1575 }, { "epoch": 1.12, "grad_norm": 15.569766631054197, "learning_rate": 8.435565879682759e-06, "loss": 0.0986, "step": 1576 }, { "epoch": 1.13, "grad_norm": 4.803571295203503, "learning_rate": 8.433465466483452e-06, "loss": 0.0811, "step": 1577 }, { "epoch": 1.13, "grad_norm": 8.29721737997988, "learning_rate": 8.431363906099513e-06, "loss": 0.0776, "step": 1578 }, { "epoch": 1.13, "grad_norm": 15.709180796487498, "learning_rate": 8.429261199233114e-06, "loss": 0.0936, "step": 1579 }, { "epoch": 1.13, "grad_norm": 9.65559442729195, "learning_rate": 8.427157346586807e-06, "loss": 0.0811, "step": 1580 }, { "epoch": 1.13, "grad_norm": 9.08245662725313, "learning_rate": 8.42505234886353e-06, "loss": 0.1066, "step": 1581 }, { "epoch": 1.13, "grad_norm": 15.963511956163712, "learning_rate": 8.422946206766598e-06, "loss": 0.0867, "step": 1582 }, { "epoch": 1.13, "grad_norm": 16.28827683392989, "learning_rate": 8.420838920999718e-06, "loss": 0.0611, "step": 1583 }, { "epoch": 1.13, "grad_norm": 28.81251875839677, "learning_rate": 8.418730492266968e-06, "loss": 0.086, "step": 1584 }, { "epoch": 1.13, "grad_norm": 10.509715363597547, "learning_rate": 8.416620921272818e-06, "loss": 0.074, "step": 1585 }, { "epoch": 1.13, "grad_norm": 21.04769017279374, "learning_rate": 8.414510208722111e-06, "loss": 0.0928, "step": 1586 }, { "epoch": 1.13, "grad_norm": 29.427287736707218, "learning_rate": 8.412398355320078e-06, "loss": 0.0986, "step": 1587 }, { "epoch": 1.13, "grad_norm": 12.45398042864309, "learning_rate": 8.410285361772328e-06, "loss": 0.0876, "step": 1588 }, { "epoch": 1.13, "grad_norm": 14.68115002988725, "learning_rate": 8.408171228784847e-06, "loss": 0.0646, "step": 1589 }, { "epoch": 1.13, "grad_norm": 23.379480357894426, "learning_rate": 8.406055957064014e-06, "loss": 0.0698, "step": 1590 }, { "epoch": 1.14, "grad_norm": 33.42786888545678, "learning_rate": 8.403939547316576e-06, "loss": 0.1045, "step": 1591 }, { "epoch": 1.14, "grad_norm": 18.63204059915201, "learning_rate": 8.401822000249661e-06, "loss": 0.0681, "step": 1592 }, { "epoch": 1.14, "grad_norm": 34.22901610610889, "learning_rate": 8.399703316570788e-06, "loss": 0.0841, "step": 1593 }, { "epoch": 1.14, "grad_norm": 7.427652450081075, "learning_rate": 8.397583496987846e-06, "loss": 0.0715, "step": 1594 }, { "epoch": 1.14, "grad_norm": 10.923600709443539, "learning_rate": 8.395462542209106e-06, "loss": 0.0789, "step": 1595 }, { "epoch": 1.14, "grad_norm": 12.337513283657069, "learning_rate": 8.393340452943219e-06, "loss": 0.0902, "step": 1596 }, { "epoch": 1.14, "grad_norm": 27.489704010210236, "learning_rate": 8.391217229899211e-06, "loss": 0.1047, "step": 1597 }, { "epoch": 1.14, "grad_norm": 21.02825485654168, "learning_rate": 8.389092873786495e-06, "loss": 0.0721, "step": 1598 }, { "epoch": 1.14, "grad_norm": 7.16992364078436, "learning_rate": 8.386967385314857e-06, "loss": 0.0811, "step": 1599 }, { "epoch": 1.14, "grad_norm": 12.091998028936846, "learning_rate": 8.384840765194458e-06, "loss": 0.0624, "step": 1600 }, { "epoch": 1.14, "grad_norm": 51.589968347407535, "learning_rate": 8.382713014135846e-06, "loss": 0.1481, "step": 1601 }, { "epoch": 1.14, "grad_norm": 23.84886638204971, "learning_rate": 8.38058413284994e-06, "loss": 0.0939, "step": 1602 }, { "epoch": 1.14, "grad_norm": 21.203344409082234, "learning_rate": 8.37845412204804e-06, "loss": 0.0793, "step": 1603 }, { "epoch": 1.14, "grad_norm": 24.550507200743613, "learning_rate": 8.376322982441821e-06, "loss": 0.0908, "step": 1604 }, { "epoch": 1.15, "grad_norm": 19.77879329537523, "learning_rate": 8.374190714743338e-06, "loss": 0.0679, "step": 1605 }, { "epoch": 1.15, "grad_norm": 29.6562862247157, "learning_rate": 8.37205731966502e-06, "loss": 0.0901, "step": 1606 }, { "epoch": 1.15, "grad_norm": 5.814058600851218, "learning_rate": 8.369922797919672e-06, "loss": 0.0811, "step": 1607 }, { "epoch": 1.15, "grad_norm": 39.8862320984172, "learning_rate": 8.367787150220481e-06, "loss": 0.0906, "step": 1608 }, { "epoch": 1.15, "grad_norm": 28.666925523395125, "learning_rate": 8.365650377281004e-06, "loss": 0.0891, "step": 1609 }, { "epoch": 1.15, "grad_norm": 21.206415015810865, "learning_rate": 8.36351247981518e-06, "loss": 0.0583, "step": 1610 }, { "epoch": 1.15, "grad_norm": 8.871052189203208, "learning_rate": 8.361373458537316e-06, "loss": 0.0898, "step": 1611 }, { "epoch": 1.15, "grad_norm": 32.08612248377674, "learning_rate": 8.359233314162102e-06, "loss": 0.1099, "step": 1612 }, { "epoch": 1.15, "grad_norm": 28.62885152393508, "learning_rate": 8.357092047404598e-06, "loss": 0.0684, "step": 1613 }, { "epoch": 1.15, "grad_norm": 12.122663530621193, "learning_rate": 8.354949658980243e-06, "loss": 0.0867, "step": 1614 }, { "epoch": 1.15, "grad_norm": 10.300802431022989, "learning_rate": 8.352806149604847e-06, "loss": 0.0674, "step": 1615 }, { "epoch": 1.15, "grad_norm": 14.64308369502829, "learning_rate": 8.350661519994596e-06, "loss": 0.1304, "step": 1616 }, { "epoch": 1.15, "grad_norm": 14.480196587169626, "learning_rate": 8.348515770866051e-06, "loss": 0.1102, "step": 1617 }, { "epoch": 1.15, "grad_norm": 25.760410586184832, "learning_rate": 8.346368902936149e-06, "loss": 0.1083, "step": 1618 }, { "epoch": 1.16, "grad_norm": 5.429983401294056, "learning_rate": 8.344220916922195e-06, "loss": 0.0852, "step": 1619 }, { "epoch": 1.16, "grad_norm": 13.695217928576449, "learning_rate": 8.342071813541873e-06, "loss": 0.0719, "step": 1620 }, { "epoch": 1.16, "grad_norm": 18.550814397047013, "learning_rate": 8.339921593513239e-06, "loss": 0.1259, "step": 1621 }, { "epoch": 1.16, "grad_norm": 5.205918109284469, "learning_rate": 8.337770257554721e-06, "loss": 0.0732, "step": 1622 }, { "epoch": 1.16, "grad_norm": 4.675401155226301, "learning_rate": 8.335617806385119e-06, "loss": 0.0649, "step": 1623 }, { "epoch": 1.16, "grad_norm": 5.349145931438254, "learning_rate": 8.333464240723608e-06, "loss": 0.0719, "step": 1624 }, { "epoch": 1.16, "grad_norm": 21.629273939026348, "learning_rate": 8.331309561289734e-06, "loss": 0.089, "step": 1625 }, { "epoch": 1.16, "grad_norm": 13.33043717619918, "learning_rate": 8.329153768803415e-06, "loss": 0.0852, "step": 1626 }, { "epoch": 1.16, "grad_norm": 22.20088070759164, "learning_rate": 8.326996863984942e-06, "loss": 0.1255, "step": 1627 }, { "epoch": 1.16, "grad_norm": 14.843059951898912, "learning_rate": 8.324838847554976e-06, "loss": 0.1042, "step": 1628 }, { "epoch": 1.16, "grad_norm": 28.36135952406115, "learning_rate": 8.322679720234553e-06, "loss": 0.0717, "step": 1629 }, { "epoch": 1.16, "grad_norm": 21.469435660824352, "learning_rate": 8.320519482745076e-06, "loss": 0.0778, "step": 1630 }, { "epoch": 1.16, "grad_norm": 6.953286758260457, "learning_rate": 8.31835813580832e-06, "loss": 0.0997, "step": 1631 }, { "epoch": 1.16, "grad_norm": 45.73029621305888, "learning_rate": 8.316195680146431e-06, "loss": 0.1168, "step": 1632 }, { "epoch": 1.17, "grad_norm": 27.95351189790114, "learning_rate": 8.314032116481927e-06, "loss": 0.1123, "step": 1633 }, { "epoch": 1.17, "grad_norm": 19.796349764504168, "learning_rate": 8.311867445537694e-06, "loss": 0.0734, "step": 1634 }, { "epoch": 1.17, "grad_norm": 30.2548744067372, "learning_rate": 8.30970166803699e-06, "loss": 0.1013, "step": 1635 }, { "epoch": 1.17, "grad_norm": 32.50299572337184, "learning_rate": 8.307534784703438e-06, "loss": 0.0886, "step": 1636 }, { "epoch": 1.17, "grad_norm": 23.322609551734118, "learning_rate": 8.305366796261036e-06, "loss": 0.0909, "step": 1637 }, { "epoch": 1.17, "grad_norm": 18.271103305323894, "learning_rate": 8.303197703434151e-06, "loss": 0.111, "step": 1638 }, { "epoch": 1.17, "grad_norm": 37.2120104735074, "learning_rate": 8.301027506947516e-06, "loss": 0.1141, "step": 1639 }, { "epoch": 1.17, "grad_norm": 26.100003029295397, "learning_rate": 8.298856207526234e-06, "loss": 0.0931, "step": 1640 }, { "epoch": 1.17, "grad_norm": 13.681595047192792, "learning_rate": 8.296683805895777e-06, "loss": 0.0827, "step": 1641 }, { "epoch": 1.17, "grad_norm": 4.15509856357399, "learning_rate": 8.294510302781984e-06, "loss": 0.0536, "step": 1642 }, { "epoch": 1.17, "grad_norm": 26.6273672687637, "learning_rate": 8.29233569891106e-06, "loss": 0.082, "step": 1643 }, { "epoch": 1.17, "grad_norm": 24.756619530194843, "learning_rate": 8.290159995009586e-06, "loss": 0.1047, "step": 1644 }, { "epoch": 1.17, "grad_norm": 23.963631520040913, "learning_rate": 8.2879831918045e-06, "loss": 0.0957, "step": 1645 }, { "epoch": 1.17, "grad_norm": 14.18439751234935, "learning_rate": 8.285805290023119e-06, "loss": 0.1011, "step": 1646 }, { "epoch": 1.18, "grad_norm": 42.43033780709877, "learning_rate": 8.283626290393112e-06, "loss": 0.1035, "step": 1647 }, { "epoch": 1.18, "grad_norm": 50.888225111603596, "learning_rate": 8.28144619364253e-06, "loss": 0.0992, "step": 1648 }, { "epoch": 1.18, "grad_norm": 6.112792402279644, "learning_rate": 8.279265000499783e-06, "loss": 0.076, "step": 1649 }, { "epoch": 1.18, "grad_norm": 22.063075048957163, "learning_rate": 8.277082711693645e-06, "loss": 0.0963, "step": 1650 }, { "epoch": 1.18, "grad_norm": 15.773608935822898, "learning_rate": 8.274899327953261e-06, "loss": 0.1035, "step": 1651 }, { "epoch": 1.18, "grad_norm": 11.5617974343692, "learning_rate": 8.272714850008142e-06, "loss": 0.1187, "step": 1652 }, { "epoch": 1.18, "grad_norm": 15.775555813001663, "learning_rate": 8.270529278588158e-06, "loss": 0.1015, "step": 1653 }, { "epoch": 1.18, "grad_norm": 14.11119372001958, "learning_rate": 8.268342614423553e-06, "loss": 0.0741, "step": 1654 }, { "epoch": 1.18, "grad_norm": 22.852005917074994, "learning_rate": 8.26615485824493e-06, "loss": 0.0905, "step": 1655 }, { "epoch": 1.18, "grad_norm": 10.383090597471917, "learning_rate": 8.263966010783259e-06, "loss": 0.0772, "step": 1656 }, { "epoch": 1.18, "grad_norm": 20.778806982750876, "learning_rate": 8.261776072769878e-06, "loss": 0.0751, "step": 1657 }, { "epoch": 1.18, "grad_norm": 32.605084582643634, "learning_rate": 8.259585044936484e-06, "loss": 0.0916, "step": 1658 }, { "epoch": 1.18, "grad_norm": 16.061701345538165, "learning_rate": 8.257392928015138e-06, "loss": 0.0689, "step": 1659 }, { "epoch": 1.18, "grad_norm": 5.880742039534995, "learning_rate": 8.25519972273827e-06, "loss": 0.0938, "step": 1660 }, { "epoch": 1.19, "grad_norm": 38.374981943860945, "learning_rate": 8.253005429838667e-06, "loss": 0.0822, "step": 1661 }, { "epoch": 1.19, "grad_norm": 31.098642690273483, "learning_rate": 8.250810050049488e-06, "loss": 0.0938, "step": 1662 }, { "epoch": 1.19, "grad_norm": 19.79092470836297, "learning_rate": 8.248613584104245e-06, "loss": 0.073, "step": 1663 }, { "epoch": 1.19, "grad_norm": 10.879540641520684, "learning_rate": 8.246416032736824e-06, "loss": 0.0814, "step": 1664 }, { "epoch": 1.19, "grad_norm": 35.35301070268229, "learning_rate": 8.244217396681461e-06, "loss": 0.0746, "step": 1665 }, { "epoch": 1.19, "grad_norm": 32.156642571249634, "learning_rate": 8.242017676672766e-06, "loss": 0.1055, "step": 1666 }, { "epoch": 1.19, "grad_norm": 5.761163190326076, "learning_rate": 8.239816873445705e-06, "loss": 0.0907, "step": 1667 }, { "epoch": 1.19, "grad_norm": 7.04210178955987, "learning_rate": 8.237614987735607e-06, "loss": 0.0601, "step": 1668 }, { "epoch": 1.19, "grad_norm": 18.6708744832657, "learning_rate": 8.235412020278164e-06, "loss": 0.0577, "step": 1669 }, { "epoch": 1.19, "grad_norm": 18.924197204027514, "learning_rate": 8.233207971809427e-06, "loss": 0.0748, "step": 1670 }, { "epoch": 1.19, "grad_norm": 6.057484238524624, "learning_rate": 8.23100284306581e-06, "loss": 0.0565, "step": 1671 }, { "epoch": 1.19, "grad_norm": 8.302716030736626, "learning_rate": 8.228796634784086e-06, "loss": 0.0578, "step": 1672 }, { "epoch": 1.19, "grad_norm": 7.599559113192868, "learning_rate": 8.226589347701396e-06, "loss": 0.0682, "step": 1673 }, { "epoch": 1.19, "grad_norm": 7.509062555162927, "learning_rate": 8.224380982555226e-06, "loss": 0.0937, "step": 1674 }, { "epoch": 1.2, "grad_norm": 15.548280441790565, "learning_rate": 8.222171540083442e-06, "loss": 0.1221, "step": 1675 }, { "epoch": 1.2, "grad_norm": 19.248751048696665, "learning_rate": 8.219961021024251e-06, "loss": 0.0949, "step": 1676 }, { "epoch": 1.2, "grad_norm": 12.733356442809999, "learning_rate": 8.217749426116238e-06, "loss": 0.0925, "step": 1677 }, { "epoch": 1.2, "grad_norm": 19.53707190829396, "learning_rate": 8.215536756098327e-06, "loss": 0.0745, "step": 1678 }, { "epoch": 1.2, "grad_norm": 5.487045075606509, "learning_rate": 8.21332301170982e-06, "loss": 0.0776, "step": 1679 }, { "epoch": 1.2, "grad_norm": 7.500739390496608, "learning_rate": 8.211108193690369e-06, "loss": 0.1046, "step": 1680 }, { "epoch": 1.2, "grad_norm": 7.048956581103097, "learning_rate": 8.208892302779982e-06, "loss": 0.0927, "step": 1681 }, { "epoch": 1.2, "grad_norm": 13.805405676241328, "learning_rate": 8.206675339719034e-06, "loss": 0.0771, "step": 1682 }, { "epoch": 1.2, "grad_norm": 15.463386393474316, "learning_rate": 8.204457305248253e-06, "loss": 0.0728, "step": 1683 }, { "epoch": 1.2, "grad_norm": 15.88977529722202, "learning_rate": 8.202238200108721e-06, "loss": 0.0798, "step": 1684 }, { "epoch": 1.2, "grad_norm": 17.967914215750977, "learning_rate": 8.200018025041887e-06, "loss": 0.1217, "step": 1685 }, { "epoch": 1.2, "grad_norm": 28.123029815824335, "learning_rate": 8.19779678078955e-06, "loss": 0.084, "step": 1686 }, { "epoch": 1.2, "grad_norm": 34.442573623512985, "learning_rate": 8.195574468093872e-06, "loss": 0.1146, "step": 1687 }, { "epoch": 1.2, "grad_norm": 13.908280626151955, "learning_rate": 8.193351087697366e-06, "loss": 0.0895, "step": 1688 }, { "epoch": 1.21, "grad_norm": 17.926660993388946, "learning_rate": 8.191126640342906e-06, "loss": 0.0702, "step": 1689 }, { "epoch": 1.21, "grad_norm": 18.323048704617886, "learning_rate": 8.18890112677372e-06, "loss": 0.083, "step": 1690 }, { "epoch": 1.21, "grad_norm": 15.46406615699352, "learning_rate": 8.186674547733398e-06, "loss": 0.0956, "step": 1691 }, { "epoch": 1.21, "grad_norm": 12.626608111415358, "learning_rate": 8.184446903965875e-06, "loss": 0.1058, "step": 1692 }, { "epoch": 1.21, "grad_norm": 14.08504431889377, "learning_rate": 8.182218196215452e-06, "loss": 0.1021, "step": 1693 }, { "epoch": 1.21, "grad_norm": 18.58510270452204, "learning_rate": 8.17998842522678e-06, "loss": 0.0598, "step": 1694 }, { "epoch": 1.21, "grad_norm": 10.819894273976141, "learning_rate": 8.17775759174487e-06, "loss": 0.1018, "step": 1695 }, { "epoch": 1.21, "grad_norm": 26.40052974707958, "learning_rate": 8.17552569651508e-06, "loss": 0.0984, "step": 1696 }, { "epoch": 1.21, "grad_norm": 6.349053416270252, "learning_rate": 8.173292740283135e-06, "loss": 0.0953, "step": 1697 }, { "epoch": 1.21, "grad_norm": 19.342874409960686, "learning_rate": 8.171058723795097e-06, "loss": 0.0953, "step": 1698 }, { "epoch": 1.21, "grad_norm": 27.925603947809098, "learning_rate": 8.168823647797401e-06, "loss": 0.1146, "step": 1699 }, { "epoch": 1.21, "grad_norm": 77.29392142046062, "learning_rate": 8.166587513036826e-06, "loss": 0.1232, "step": 1700 }, { "epoch": 1.21, "grad_norm": 16.95185229227817, "learning_rate": 8.164350320260502e-06, "loss": 0.0662, "step": 1701 }, { "epoch": 1.21, "grad_norm": 21.974085417487288, "learning_rate": 8.16211207021592e-06, "loss": 0.0947, "step": 1702 }, { "epoch": 1.22, "grad_norm": 7.92675942999132, "learning_rate": 8.15987276365092e-06, "loss": 0.1031, "step": 1703 }, { "epoch": 1.22, "grad_norm": 8.05360437061264, "learning_rate": 8.157632401313696e-06, "loss": 0.1014, "step": 1704 }, { "epoch": 1.22, "grad_norm": 7.27119929232412, "learning_rate": 8.155390983952795e-06, "loss": 0.0781, "step": 1705 }, { "epoch": 1.22, "grad_norm": 4.3328835898750695, "learning_rate": 8.153148512317117e-06, "loss": 0.0669, "step": 1706 }, { "epoch": 1.22, "grad_norm": 11.753037962222542, "learning_rate": 8.150904987155911e-06, "loss": 0.0864, "step": 1707 }, { "epoch": 1.22, "grad_norm": 16.976159095803407, "learning_rate": 8.148660409218786e-06, "loss": 0.1355, "step": 1708 }, { "epoch": 1.22, "grad_norm": 5.360819721353881, "learning_rate": 8.146414779255689e-06, "loss": 0.1117, "step": 1709 }, { "epoch": 1.22, "grad_norm": 17.94580268964115, "learning_rate": 8.144168098016933e-06, "loss": 0.071, "step": 1710 }, { "epoch": 1.22, "grad_norm": 22.882975417855626, "learning_rate": 8.141920366253173e-06, "loss": 0.089, "step": 1711 }, { "epoch": 1.22, "grad_norm": 8.388813141318186, "learning_rate": 8.139671584715419e-06, "loss": 0.088, "step": 1712 }, { "epoch": 1.22, "grad_norm": 19.66518390061167, "learning_rate": 8.137421754155031e-06, "loss": 0.1162, "step": 1713 }, { "epoch": 1.22, "grad_norm": 5.009113369388294, "learning_rate": 8.13517087532372e-06, "loss": 0.0576, "step": 1714 }, { "epoch": 1.22, "grad_norm": 14.062847580323941, "learning_rate": 8.132918948973543e-06, "loss": 0.0834, "step": 1715 }, { "epoch": 1.22, "grad_norm": 23.85559125219598, "learning_rate": 8.130665975856913e-06, "loss": 0.0881, "step": 1716 }, { "epoch": 1.23, "grad_norm": 25.11264401301826, "learning_rate": 8.128411956726592e-06, "loss": 0.1072, "step": 1717 }, { "epoch": 1.23, "grad_norm": 9.641948331236978, "learning_rate": 8.126156892335686e-06, "loss": 0.0957, "step": 1718 }, { "epoch": 1.23, "grad_norm": 11.36919292282907, "learning_rate": 8.123900783437655e-06, "loss": 0.1229, "step": 1719 }, { "epoch": 1.23, "grad_norm": 7.6768793485009965, "learning_rate": 8.121643630786308e-06, "loss": 0.1084, "step": 1720 }, { "epoch": 1.23, "grad_norm": 11.056594907308932, "learning_rate": 8.1193854351358e-06, "loss": 0.08, "step": 1721 }, { "epoch": 1.23, "grad_norm": 13.02410509965871, "learning_rate": 8.11712619724064e-06, "loss": 0.0721, "step": 1722 }, { "epoch": 1.23, "grad_norm": 8.593130940430317, "learning_rate": 8.114865917855676e-06, "loss": 0.0872, "step": 1723 }, { "epoch": 1.23, "grad_norm": 5.9240949988331035, "learning_rate": 8.112604597736113e-06, "loss": 0.0928, "step": 1724 }, { "epoch": 1.23, "grad_norm": 22.20784644695516, "learning_rate": 8.110342237637501e-06, "loss": 0.0628, "step": 1725 }, { "epoch": 1.23, "grad_norm": 10.921566844245627, "learning_rate": 8.108078838315732e-06, "loss": 0.0618, "step": 1726 }, { "epoch": 1.23, "grad_norm": 8.111574295790442, "learning_rate": 8.105814400527052e-06, "loss": 0.0785, "step": 1727 }, { "epoch": 1.23, "grad_norm": 8.289683871686902, "learning_rate": 8.103548925028054e-06, "loss": 0.1143, "step": 1728 }, { "epoch": 1.23, "grad_norm": 14.203313022838092, "learning_rate": 8.101282412575673e-06, "loss": 0.098, "step": 1729 }, { "epoch": 1.23, "grad_norm": 36.75773831067622, "learning_rate": 8.099014863927192e-06, "loss": 0.0892, "step": 1730 }, { "epoch": 1.24, "grad_norm": 11.032969099373302, "learning_rate": 8.096746279840245e-06, "loss": 0.0819, "step": 1731 }, { "epoch": 1.24, "grad_norm": 27.902605396219545, "learning_rate": 8.094476661072806e-06, "loss": 0.0928, "step": 1732 }, { "epoch": 1.24, "grad_norm": 30.50428948295198, "learning_rate": 8.092206008383195e-06, "loss": 0.0852, "step": 1733 }, { "epoch": 1.24, "grad_norm": 26.430089450059356, "learning_rate": 8.089934322530082e-06, "loss": 0.1184, "step": 1734 }, { "epoch": 1.24, "grad_norm": 21.899501892048658, "learning_rate": 8.087661604272477e-06, "loss": 0.0836, "step": 1735 }, { "epoch": 1.24, "grad_norm": 20.60542568709364, "learning_rate": 8.08538785436974e-06, "loss": 0.097, "step": 1736 }, { "epoch": 1.24, "grad_norm": 22.941207777328003, "learning_rate": 8.08311307358157e-06, "loss": 0.1018, "step": 1737 }, { "epoch": 1.24, "grad_norm": 6.3384979167538775, "learning_rate": 8.080837262668017e-06, "loss": 0.0703, "step": 1738 }, { "epoch": 1.24, "grad_norm": 6.570398124978654, "learning_rate": 8.078560422389472e-06, "loss": 0.0669, "step": 1739 }, { "epoch": 1.24, "grad_norm": 21.414339602584793, "learning_rate": 8.076282553506664e-06, "loss": 0.0767, "step": 1740 }, { "epoch": 1.24, "grad_norm": 18.95372832984155, "learning_rate": 8.074003656780678e-06, "loss": 0.0938, "step": 1741 }, { "epoch": 1.24, "grad_norm": 10.549353249338639, "learning_rate": 8.071723732972933e-06, "loss": 0.0778, "step": 1742 }, { "epoch": 1.24, "grad_norm": 16.938439415836026, "learning_rate": 8.069442782845191e-06, "loss": 0.1041, "step": 1743 }, { "epoch": 1.24, "grad_norm": 27.528040440849797, "learning_rate": 8.067160807159566e-06, "loss": 0.1102, "step": 1744 }, { "epoch": 1.25, "grad_norm": 34.857238335191894, "learning_rate": 8.064877806678504e-06, "loss": 0.1146, "step": 1745 }, { "epoch": 1.25, "grad_norm": 19.588923727022934, "learning_rate": 8.062593782164798e-06, "loss": 0.1074, "step": 1746 }, { "epoch": 1.25, "grad_norm": 11.262300732118138, "learning_rate": 8.060308734381585e-06, "loss": 0.0928, "step": 1747 }, { "epoch": 1.25, "grad_norm": 21.63069752918576, "learning_rate": 8.05802266409234e-06, "loss": 0.0915, "step": 1748 }, { "epoch": 1.25, "grad_norm": 17.730732468468556, "learning_rate": 8.055735572060883e-06, "loss": 0.0682, "step": 1749 }, { "epoch": 1.25, "grad_norm": 15.174261533303634, "learning_rate": 8.053447459051374e-06, "loss": 0.0715, "step": 1750 }, { "epoch": 1.25, "grad_norm": 14.744003542025283, "learning_rate": 8.051158325828315e-06, "loss": 0.0828, "step": 1751 }, { "epoch": 1.25, "grad_norm": 16.141517917244126, "learning_rate": 8.048868173156546e-06, "loss": 0.0897, "step": 1752 }, { "epoch": 1.25, "grad_norm": 27.216388324679485, "learning_rate": 8.046577001801248e-06, "loss": 0.1003, "step": 1753 }, { "epoch": 1.25, "grad_norm": 12.025194670403664, "learning_rate": 8.044284812527949e-06, "loss": 0.1011, "step": 1754 }, { "epoch": 1.25, "grad_norm": 17.72776253943835, "learning_rate": 8.041991606102507e-06, "loss": 0.1263, "step": 1755 }, { "epoch": 1.25, "grad_norm": 6.379107531262259, "learning_rate": 8.039697383291127e-06, "loss": 0.068, "step": 1756 }, { "epoch": 1.25, "grad_norm": 10.098384373784489, "learning_rate": 8.037402144860353e-06, "loss": 0.0898, "step": 1757 }, { "epoch": 1.25, "grad_norm": 11.389445025532513, "learning_rate": 8.035105891577064e-06, "loss": 0.0896, "step": 1758 }, { "epoch": 1.26, "grad_norm": 8.922481057387875, "learning_rate": 8.032808624208485e-06, "loss": 0.1005, "step": 1759 }, { "epoch": 1.26, "grad_norm": 25.126444637378192, "learning_rate": 8.030510343522172e-06, "loss": 0.0884, "step": 1760 }, { "epoch": 1.26, "grad_norm": 12.30642758311427, "learning_rate": 8.02821105028602e-06, "loss": 0.076, "step": 1761 }, { "epoch": 1.26, "grad_norm": 6.33267110220372, "learning_rate": 8.025910745268276e-06, "loss": 0.1035, "step": 1762 }, { "epoch": 1.26, "grad_norm": 19.02072960195617, "learning_rate": 8.023609429237504e-06, "loss": 0.0708, "step": 1763 }, { "epoch": 1.26, "grad_norm": 6.3968909866767545, "learning_rate": 8.021307102962623e-06, "loss": 0.0759, "step": 1764 }, { "epoch": 1.26, "grad_norm": 23.20723618473892, "learning_rate": 8.019003767212881e-06, "loss": 0.0717, "step": 1765 }, { "epoch": 1.26, "grad_norm": 32.55425111351312, "learning_rate": 8.016699422757865e-06, "loss": 0.126, "step": 1766 }, { "epoch": 1.26, "grad_norm": 8.562249690813024, "learning_rate": 8.014394070367499e-06, "loss": 0.1367, "step": 1767 }, { "epoch": 1.26, "grad_norm": 36.530412481845815, "learning_rate": 8.012087710812047e-06, "loss": 0.1062, "step": 1768 }, { "epoch": 1.26, "grad_norm": 38.24443582138145, "learning_rate": 8.009780344862101e-06, "loss": 0.0983, "step": 1769 }, { "epoch": 1.26, "grad_norm": 19.804828163884277, "learning_rate": 8.0074719732886e-06, "loss": 0.0815, "step": 1770 }, { "epoch": 1.26, "grad_norm": 19.841573958043558, "learning_rate": 8.005162596862812e-06, "loss": 0.0888, "step": 1771 }, { "epoch": 1.26, "grad_norm": 36.698744929178375, "learning_rate": 8.002852216356343e-06, "loss": 0.1433, "step": 1772 }, { "epoch": 1.27, "grad_norm": 41.58220436938131, "learning_rate": 8.000540832541132e-06, "loss": 0.1224, "step": 1773 }, { "epoch": 1.27, "grad_norm": 16.701232099627646, "learning_rate": 7.99822844618946e-06, "loss": 0.1053, "step": 1774 }, { "epoch": 1.27, "grad_norm": 23.90585403265861, "learning_rate": 7.995915058073933e-06, "loss": 0.1041, "step": 1775 }, { "epoch": 1.27, "grad_norm": 22.792947908502864, "learning_rate": 7.9936006689675e-06, "loss": 0.0842, "step": 1776 }, { "epoch": 1.27, "grad_norm": 28.537044176142224, "learning_rate": 7.99128527964344e-06, "loss": 0.1215, "step": 1777 }, { "epoch": 1.27, "grad_norm": 11.80089352447319, "learning_rate": 7.988968890875368e-06, "loss": 0.0922, "step": 1778 }, { "epoch": 1.27, "grad_norm": 10.311869719605001, "learning_rate": 7.986651503437233e-06, "loss": 0.0958, "step": 1779 }, { "epoch": 1.27, "grad_norm": 21.233417174050086, "learning_rate": 7.984333118103318e-06, "loss": 0.1084, "step": 1780 }, { "epoch": 1.27, "grad_norm": 8.340066676153791, "learning_rate": 7.982013735648235e-06, "loss": 0.0981, "step": 1781 }, { "epoch": 1.27, "grad_norm": 24.679907926446546, "learning_rate": 7.979693356846937e-06, "loss": 0.1503, "step": 1782 }, { "epoch": 1.27, "grad_norm": 25.530690055700195, "learning_rate": 7.977371982474705e-06, "loss": 0.1339, "step": 1783 }, { "epoch": 1.27, "grad_norm": 7.597467209345345, "learning_rate": 7.975049613307151e-06, "loss": 0.1124, "step": 1784 }, { "epoch": 1.27, "grad_norm": 10.392568087679368, "learning_rate": 7.972726250120225e-06, "loss": 0.1146, "step": 1785 }, { "epoch": 1.27, "grad_norm": 7.338556178755361, "learning_rate": 7.970401893690202e-06, "loss": 0.1012, "step": 1786 }, { "epoch": 1.28, "grad_norm": 22.062178634771666, "learning_rate": 7.968076544793696e-06, "loss": 0.0973, "step": 1787 }, { "epoch": 1.28, "grad_norm": 6.614753659360179, "learning_rate": 7.965750204207647e-06, "loss": 0.0793, "step": 1788 }, { "epoch": 1.28, "grad_norm": 6.37429769749807, "learning_rate": 7.96342287270933e-06, "loss": 0.0891, "step": 1789 }, { "epoch": 1.28, "grad_norm": 6.518825899555654, "learning_rate": 7.96109455107635e-06, "loss": 0.0751, "step": 1790 }, { "epoch": 1.28, "grad_norm": 39.72404138798098, "learning_rate": 7.958765240086639e-06, "loss": 0.1064, "step": 1791 }, { "epoch": 1.28, "grad_norm": 13.961553572881497, "learning_rate": 7.956434940518468e-06, "loss": 0.0696, "step": 1792 }, { "epoch": 1.28, "grad_norm": 13.524395349313767, "learning_rate": 7.954103653150432e-06, "loss": 0.1025, "step": 1793 }, { "epoch": 1.28, "grad_norm": 12.512822700266833, "learning_rate": 7.951771378761455e-06, "loss": 0.0912, "step": 1794 }, { "epoch": 1.28, "grad_norm": 31.031749476523494, "learning_rate": 7.949438118130797e-06, "loss": 0.1554, "step": 1795 }, { "epoch": 1.28, "grad_norm": 21.655240692631608, "learning_rate": 7.94710387203804e-06, "loss": 0.1049, "step": 1796 }, { "epoch": 1.28, "grad_norm": 23.573349737519887, "learning_rate": 7.944768641263101e-06, "loss": 0.0951, "step": 1797 }, { "epoch": 1.28, "grad_norm": 6.423633965336433, "learning_rate": 7.942432426586224e-06, "loss": 0.0883, "step": 1798 }, { "epoch": 1.28, "grad_norm": 33.7924509461897, "learning_rate": 7.94009522878798e-06, "loss": 0.1217, "step": 1799 }, { "epoch": 1.28, "grad_norm": 24.608886378571242, "learning_rate": 7.937757048649274e-06, "loss": 0.1155, "step": 1800 }, { "epoch": 1.29, "grad_norm": 31.5944802631455, "learning_rate": 7.935417886951332e-06, "loss": 0.1301, "step": 1801 }, { "epoch": 1.29, "grad_norm": 24.56355408273472, "learning_rate": 7.933077744475713e-06, "loss": 0.0983, "step": 1802 }, { "epoch": 1.29, "grad_norm": 12.600335223994328, "learning_rate": 7.930736622004301e-06, "loss": 0.0992, "step": 1803 }, { "epoch": 1.29, "grad_norm": 17.365260585180668, "learning_rate": 7.928394520319311e-06, "loss": 0.1122, "step": 1804 }, { "epoch": 1.29, "grad_norm": 5.327789011223336, "learning_rate": 7.926051440203278e-06, "loss": 0.0663, "step": 1805 }, { "epoch": 1.29, "grad_norm": 8.382747080442627, "learning_rate": 7.923707382439073e-06, "loss": 0.082, "step": 1806 }, { "epoch": 1.29, "grad_norm": 9.124410943625675, "learning_rate": 7.921362347809888e-06, "loss": 0.1038, "step": 1807 }, { "epoch": 1.29, "grad_norm": 19.289726689987205, "learning_rate": 7.919016337099242e-06, "loss": 0.1105, "step": 1808 }, { "epoch": 1.29, "grad_norm": 7.964546128423192, "learning_rate": 7.916669351090981e-06, "loss": 0.0864, "step": 1809 }, { "epoch": 1.29, "grad_norm": 4.824052539383046, "learning_rate": 7.914321390569278e-06, "loss": 0.0694, "step": 1810 }, { "epoch": 1.29, "grad_norm": 8.33312536205575, "learning_rate": 7.911972456318629e-06, "loss": 0.1254, "step": 1811 }, { "epoch": 1.29, "grad_norm": 6.507945976935379, "learning_rate": 7.909622549123855e-06, "loss": 0.0936, "step": 1812 }, { "epoch": 1.29, "grad_norm": 13.143972556032457, "learning_rate": 7.907271669770107e-06, "loss": 0.0985, "step": 1813 }, { "epoch": 1.29, "grad_norm": 12.491831143764811, "learning_rate": 7.904919819042855e-06, "loss": 0.1169, "step": 1814 }, { "epoch": 1.3, "grad_norm": 13.22535591359354, "learning_rate": 7.902566997727896e-06, "loss": 0.1021, "step": 1815 }, { "epoch": 1.3, "grad_norm": 11.970708252245554, "learning_rate": 7.900213206611353e-06, "loss": 0.1017, "step": 1816 }, { "epoch": 1.3, "grad_norm": 26.696684320453635, "learning_rate": 7.897858446479672e-06, "loss": 0.1003, "step": 1817 }, { "epoch": 1.3, "grad_norm": 31.522235837524857, "learning_rate": 7.895502718119618e-06, "loss": 0.1056, "step": 1818 }, { "epoch": 1.3, "grad_norm": 11.01303572938718, "learning_rate": 7.89314602231829e-06, "loss": 0.0793, "step": 1819 }, { "epoch": 1.3, "grad_norm": 34.25204366156014, "learning_rate": 7.8907883598631e-06, "loss": 0.1014, "step": 1820 }, { "epoch": 1.3, "grad_norm": 40.92644841373539, "learning_rate": 7.888429731541784e-06, "loss": 0.1143, "step": 1821 }, { "epoch": 1.3, "grad_norm": 5.3982661444680335, "learning_rate": 7.886070138142407e-06, "loss": 0.071, "step": 1822 }, { "epoch": 1.3, "grad_norm": 32.32811463120643, "learning_rate": 7.883709580453354e-06, "loss": 0.1158, "step": 1823 }, { "epoch": 1.3, "grad_norm": 19.031316596008647, "learning_rate": 7.88134805926333e-06, "loss": 0.0977, "step": 1824 }, { "epoch": 1.3, "grad_norm": 15.045104813096087, "learning_rate": 7.878985575361362e-06, "loss": 0.0897, "step": 1825 }, { "epoch": 1.3, "grad_norm": 21.035459837713777, "learning_rate": 7.876622129536801e-06, "loss": 0.1024, "step": 1826 }, { "epoch": 1.3, "grad_norm": 13.660668150169178, "learning_rate": 7.874257722579319e-06, "loss": 0.0901, "step": 1827 }, { "epoch": 1.3, "grad_norm": 13.858105852883831, "learning_rate": 7.871892355278906e-06, "loss": 0.0676, "step": 1828 }, { "epoch": 1.31, "grad_norm": 30.715956025025108, "learning_rate": 7.869526028425878e-06, "loss": 0.1143, "step": 1829 }, { "epoch": 1.31, "grad_norm": 20.527869229321144, "learning_rate": 7.867158742810866e-06, "loss": 0.0834, "step": 1830 }, { "epoch": 1.31, "grad_norm": 7.21553644321985, "learning_rate": 7.864790499224825e-06, "loss": 0.069, "step": 1831 }, { "epoch": 1.31, "grad_norm": 15.834412029043872, "learning_rate": 7.86242129845903e-06, "loss": 0.0689, "step": 1832 }, { "epoch": 1.31, "grad_norm": 16.28611478117609, "learning_rate": 7.860051141305074e-06, "loss": 0.0878, "step": 1833 }, { "epoch": 1.31, "grad_norm": 18.470539205773807, "learning_rate": 7.857680028554873e-06, "loss": 0.0988, "step": 1834 }, { "epoch": 1.31, "grad_norm": 14.517688298308093, "learning_rate": 7.855307961000656e-06, "loss": 0.0839, "step": 1835 }, { "epoch": 1.31, "grad_norm": 8.127093098505616, "learning_rate": 7.852934939434977e-06, "loss": 0.0699, "step": 1836 }, { "epoch": 1.31, "grad_norm": 15.700240348518008, "learning_rate": 7.850560964650707e-06, "loss": 0.1311, "step": 1837 }, { "epoch": 1.31, "grad_norm": 33.191617585225714, "learning_rate": 7.848186037441035e-06, "loss": 0.1548, "step": 1838 }, { "epoch": 1.31, "grad_norm": 38.7790193219392, "learning_rate": 7.845810158599467e-06, "loss": 0.0975, "step": 1839 }, { "epoch": 1.31, "grad_norm": 16.850557590964687, "learning_rate": 7.84343332891983e-06, "loss": 0.0818, "step": 1840 }, { "epoch": 1.31, "grad_norm": 28.72144236111981, "learning_rate": 7.841055549196267e-06, "loss": 0.1003, "step": 1841 }, { "epoch": 1.31, "grad_norm": 21.710573467867395, "learning_rate": 7.838676820223234e-06, "loss": 0.0881, "step": 1842 }, { "epoch": 1.32, "grad_norm": 35.03115815820891, "learning_rate": 7.836297142795515e-06, "loss": 0.0961, "step": 1843 }, { "epoch": 1.32, "grad_norm": 12.832217299261412, "learning_rate": 7.833916517708203e-06, "loss": 0.0826, "step": 1844 }, { "epoch": 1.32, "grad_norm": 35.32560567226581, "learning_rate": 7.831534945756703e-06, "loss": 0.1127, "step": 1845 }, { "epoch": 1.32, "grad_norm": 27.26210477910772, "learning_rate": 7.82915242773675e-06, "loss": 0.0912, "step": 1846 }, { "epoch": 1.32, "grad_norm": 12.005708358639822, "learning_rate": 7.826768964444384e-06, "loss": 0.0798, "step": 1847 }, { "epoch": 1.32, "grad_norm": 21.5695948891362, "learning_rate": 7.824384556675966e-06, "loss": 0.0976, "step": 1848 }, { "epoch": 1.32, "grad_norm": 32.38044189186778, "learning_rate": 7.821999205228168e-06, "loss": 0.13, "step": 1849 }, { "epoch": 1.32, "grad_norm": 78.64683053933734, "learning_rate": 7.819612910897985e-06, "loss": 0.2098, "step": 1850 }, { "epoch": 1.32, "grad_norm": 18.63082381659104, "learning_rate": 7.817225674482717e-06, "loss": 0.0945, "step": 1851 }, { "epoch": 1.32, "grad_norm": 36.11550459481589, "learning_rate": 7.814837496779988e-06, "loss": 0.0802, "step": 1852 }, { "epoch": 1.32, "grad_norm": 57.52467349010068, "learning_rate": 7.812448378587731e-06, "loss": 0.1255, "step": 1853 }, { "epoch": 1.32, "grad_norm": 38.526328381998304, "learning_rate": 7.810058320704194e-06, "loss": 0.11, "step": 1854 }, { "epoch": 1.32, "grad_norm": 7.853053246927483, "learning_rate": 7.807667323927941e-06, "loss": 0.0726, "step": 1855 }, { "epoch": 1.32, "grad_norm": 32.05414416545823, "learning_rate": 7.80527538905785e-06, "loss": 0.0781, "step": 1856 }, { "epoch": 1.33, "grad_norm": 50.865898236236205, "learning_rate": 7.802882516893106e-06, "loss": 0.1028, "step": 1857 }, { "epoch": 1.33, "grad_norm": 31.687847029880817, "learning_rate": 7.800488708233219e-06, "loss": 0.0911, "step": 1858 }, { "epoch": 1.33, "grad_norm": 11.587216812760387, "learning_rate": 7.798093963877998e-06, "loss": 0.0892, "step": 1859 }, { "epoch": 1.33, "grad_norm": 32.49642886667288, "learning_rate": 7.795698284627575e-06, "loss": 0.1219, "step": 1860 }, { "epoch": 1.33, "grad_norm": 32.145261643797554, "learning_rate": 7.793301671282391e-06, "loss": 0.1083, "step": 1861 }, { "epoch": 1.33, "grad_norm": 24.419766007483094, "learning_rate": 7.7909041246432e-06, "loss": 0.0844, "step": 1862 }, { "epoch": 1.33, "grad_norm": 9.398067835525364, "learning_rate": 7.788505645511065e-06, "loss": 0.1044, "step": 1863 }, { "epoch": 1.33, "grad_norm": 30.039981713715175, "learning_rate": 7.786106234687362e-06, "loss": 0.0833, "step": 1864 }, { "epoch": 1.33, "grad_norm": 47.34351073345086, "learning_rate": 7.783705892973782e-06, "loss": 0.1405, "step": 1865 }, { "epoch": 1.33, "grad_norm": 12.349109025106152, "learning_rate": 7.78130462117232e-06, "loss": 0.0813, "step": 1866 }, { "epoch": 1.33, "grad_norm": 9.908058765643409, "learning_rate": 7.778902420085289e-06, "loss": 0.0681, "step": 1867 }, { "epoch": 1.33, "grad_norm": 7.3494219707093515, "learning_rate": 7.776499290515304e-06, "loss": 0.0879, "step": 1868 }, { "epoch": 1.33, "grad_norm": 11.040424391158401, "learning_rate": 7.7740952332653e-06, "loss": 0.0918, "step": 1869 }, { "epoch": 1.33, "grad_norm": 24.22287383306475, "learning_rate": 7.771690249138517e-06, "loss": 0.0968, "step": 1870 }, { "epoch": 1.34, "grad_norm": 11.109944144915485, "learning_rate": 7.769284338938502e-06, "loss": 0.1071, "step": 1871 }, { "epoch": 1.34, "grad_norm": 9.840567242675737, "learning_rate": 7.766877503469117e-06, "loss": 0.1022, "step": 1872 }, { "epoch": 1.34, "grad_norm": 7.497594965854643, "learning_rate": 7.764469743534529e-06, "loss": 0.0892, "step": 1873 }, { "epoch": 1.34, "grad_norm": 34.64644715009004, "learning_rate": 7.762061059939214e-06, "loss": 0.0901, "step": 1874 }, { "epoch": 1.34, "grad_norm": 17.020914763066287, "learning_rate": 7.759651453487963e-06, "loss": 0.0775, "step": 1875 }, { "epoch": 1.34, "grad_norm": 41.443228475209594, "learning_rate": 7.757240924985866e-06, "loss": 0.1339, "step": 1876 }, { "epoch": 1.34, "grad_norm": 22.513277159467158, "learning_rate": 7.754829475238323e-06, "loss": 0.105, "step": 1877 }, { "epoch": 1.34, "grad_norm": 21.265724907023937, "learning_rate": 7.752417105051051e-06, "loss": 0.1527, "step": 1878 }, { "epoch": 1.34, "grad_norm": 48.24610738893118, "learning_rate": 7.750003815230062e-06, "loss": 0.0941, "step": 1879 }, { "epoch": 1.34, "grad_norm": 6.8333978374856015, "learning_rate": 7.747589606581686e-06, "loss": 0.0563, "step": 1880 }, { "epoch": 1.34, "grad_norm": 17.172826245285943, "learning_rate": 7.745174479912551e-06, "loss": 0.1003, "step": 1881 }, { "epoch": 1.34, "grad_norm": 10.693710087910775, "learning_rate": 7.742758436029596e-06, "loss": 0.1112, "step": 1882 }, { "epoch": 1.34, "grad_norm": 18.75748741216541, "learning_rate": 7.740341475740068e-06, "loss": 0.0944, "step": 1883 }, { "epoch": 1.34, "grad_norm": 29.562990385496306, "learning_rate": 7.737923599851519e-06, "loss": 0.0795, "step": 1884 }, { "epoch": 1.35, "grad_norm": 6.523359606962466, "learning_rate": 7.735504809171801e-06, "loss": 0.0846, "step": 1885 }, { "epoch": 1.35, "grad_norm": 6.171598232912832, "learning_rate": 7.733085104509084e-06, "loss": 0.0924, "step": 1886 }, { "epoch": 1.35, "grad_norm": 29.73271224436981, "learning_rate": 7.730664486671831e-06, "loss": 0.1106, "step": 1887 }, { "epoch": 1.35, "grad_norm": 30.655108989409246, "learning_rate": 7.72824295646882e-06, "loss": 0.0946, "step": 1888 }, { "epoch": 1.35, "grad_norm": 11.90587683853757, "learning_rate": 7.725820514709124e-06, "loss": 0.0774, "step": 1889 }, { "epoch": 1.35, "grad_norm": 23.959766879108276, "learning_rate": 7.723397162202128e-06, "loss": 0.1327, "step": 1890 }, { "epoch": 1.35, "grad_norm": 18.9085622070494, "learning_rate": 7.720972899757522e-06, "loss": 0.0879, "step": 1891 }, { "epoch": 1.35, "grad_norm": 23.68721606790896, "learning_rate": 7.718547728185293e-06, "loss": 0.1257, "step": 1892 }, { "epoch": 1.35, "grad_norm": 12.866401919651024, "learning_rate": 7.716121648295738e-06, "loss": 0.0869, "step": 1893 }, { "epoch": 1.35, "grad_norm": 12.203988683415128, "learning_rate": 7.713694660899455e-06, "loss": 0.087, "step": 1894 }, { "epoch": 1.35, "grad_norm": 47.327087832020034, "learning_rate": 7.711266766807345e-06, "loss": 0.1295, "step": 1895 }, { "epoch": 1.35, "grad_norm": 37.47630197076899, "learning_rate": 7.708837966830615e-06, "loss": 0.1118, "step": 1896 }, { "epoch": 1.35, "grad_norm": 15.991272091833782, "learning_rate": 7.706408261780769e-06, "loss": 0.075, "step": 1897 }, { "epoch": 1.35, "grad_norm": 35.91340982420784, "learning_rate": 7.703977652469618e-06, "loss": 0.1104, "step": 1898 }, { "epoch": 1.36, "grad_norm": 31.129407279590517, "learning_rate": 7.701546139709272e-06, "loss": 0.0825, "step": 1899 }, { "epoch": 1.36, "grad_norm": 6.731671111833922, "learning_rate": 7.69911372431215e-06, "loss": 0.1019, "step": 1900 }, { "epoch": 1.36, "grad_norm": 7.665645173275369, "learning_rate": 7.696680407090962e-06, "loss": 0.1041, "step": 1901 }, { "epoch": 1.36, "grad_norm": 14.15220002595494, "learning_rate": 7.694246188858726e-06, "loss": 0.087, "step": 1902 }, { "epoch": 1.36, "grad_norm": 11.320413195892144, "learning_rate": 7.691811070428758e-06, "loss": 0.1177, "step": 1903 }, { "epoch": 1.36, "grad_norm": 39.10964804895109, "learning_rate": 7.689375052614681e-06, "loss": 0.1274, "step": 1904 }, { "epoch": 1.36, "grad_norm": 12.690397207526837, "learning_rate": 7.686938136230408e-06, "loss": 0.1031, "step": 1905 }, { "epoch": 1.36, "grad_norm": 15.07798986443616, "learning_rate": 7.684500322090162e-06, "loss": 0.1309, "step": 1906 }, { "epoch": 1.36, "grad_norm": 3.8068024583617093, "learning_rate": 7.68206161100846e-06, "loss": 0.0712, "step": 1907 }, { "epoch": 1.36, "grad_norm": 7.903182113883539, "learning_rate": 7.679622003800122e-06, "loss": 0.0999, "step": 1908 }, { "epoch": 1.36, "grad_norm": 41.30605521807284, "learning_rate": 7.677181501280266e-06, "loss": 0.0956, "step": 1909 }, { "epoch": 1.36, "grad_norm": 13.709690616051338, "learning_rate": 7.674740104264308e-06, "loss": 0.0869, "step": 1910 }, { "epoch": 1.36, "grad_norm": 13.260512003188966, "learning_rate": 7.672297813567968e-06, "loss": 0.1622, "step": 1911 }, { "epoch": 1.36, "grad_norm": 21.84565087029452, "learning_rate": 7.669854630007257e-06, "loss": 0.1017, "step": 1912 }, { "epoch": 1.37, "grad_norm": 33.420476181478065, "learning_rate": 7.667410554398486e-06, "loss": 0.098, "step": 1913 }, { "epoch": 1.37, "grad_norm": 31.57040112746546, "learning_rate": 7.664965587558271e-06, "loss": 0.1077, "step": 1914 }, { "epoch": 1.37, "grad_norm": 4.308046009484161, "learning_rate": 7.662519730303517e-06, "loss": 0.0771, "step": 1915 }, { "epoch": 1.37, "grad_norm": 60.79999436143695, "learning_rate": 7.660072983451433e-06, "loss": 0.1399, "step": 1916 }, { "epoch": 1.37, "grad_norm": 18.3987711534509, "learning_rate": 7.657625347819522e-06, "loss": 0.1149, "step": 1917 }, { "epoch": 1.37, "grad_norm": 6.265288838813149, "learning_rate": 7.655176824225582e-06, "loss": 0.0807, "step": 1918 }, { "epoch": 1.37, "grad_norm": 6.300914605261491, "learning_rate": 7.652727413487716e-06, "loss": 0.0993, "step": 1919 }, { "epoch": 1.37, "grad_norm": 22.124179459804967, "learning_rate": 7.650277116424313e-06, "loss": 0.0684, "step": 1920 }, { "epoch": 1.37, "grad_norm": 25.625468309309074, "learning_rate": 7.647825933854063e-06, "loss": 0.1117, "step": 1921 }, { "epoch": 1.37, "grad_norm": 7.624552453874793, "learning_rate": 7.645373866595953e-06, "loss": 0.1179, "step": 1922 }, { "epoch": 1.37, "grad_norm": 6.194508526818871, "learning_rate": 7.642920915469265e-06, "loss": 0.0785, "step": 1923 }, { "epoch": 1.37, "grad_norm": 24.148000648762252, "learning_rate": 7.640467081293573e-06, "loss": 0.1417, "step": 1924 }, { "epoch": 1.37, "grad_norm": 25.24596065135585, "learning_rate": 7.638012364888751e-06, "loss": 0.1062, "step": 1925 }, { "epoch": 1.37, "grad_norm": 11.663735609649464, "learning_rate": 7.635556767074965e-06, "loss": 0.0919, "step": 1926 }, { "epoch": 1.38, "grad_norm": 19.983238031321108, "learning_rate": 7.633100288672674e-06, "loss": 0.0861, "step": 1927 }, { "epoch": 1.38, "grad_norm": 24.516987429952046, "learning_rate": 7.630642930502634e-06, "loss": 0.1084, "step": 1928 }, { "epoch": 1.38, "grad_norm": 23.7079785548782, "learning_rate": 7.628184693385896e-06, "loss": 0.0987, "step": 1929 }, { "epoch": 1.38, "grad_norm": 8.632779942934862, "learning_rate": 7.625725578143801e-06, "loss": 0.0803, "step": 1930 }, { "epoch": 1.38, "grad_norm": 26.75562561931583, "learning_rate": 7.6232655855979844e-06, "loss": 0.12, "step": 1931 }, { "epoch": 1.38, "grad_norm": 9.220766214367428, "learning_rate": 7.620804716570376e-06, "loss": 0.1036, "step": 1932 }, { "epoch": 1.38, "grad_norm": 35.404659677885, "learning_rate": 7.618342971883199e-06, "loss": 0.1191, "step": 1933 }, { "epoch": 1.38, "grad_norm": 5.539001124567162, "learning_rate": 7.615880352358967e-06, "loss": 0.0887, "step": 1934 }, { "epoch": 1.38, "grad_norm": 33.35991052555321, "learning_rate": 7.613416858820486e-06, "loss": 0.0751, "step": 1935 }, { "epoch": 1.38, "grad_norm": 43.77244661048231, "learning_rate": 7.6109524920908575e-06, "loss": 0.105, "step": 1936 }, { "epoch": 1.38, "grad_norm": 8.095324647325247, "learning_rate": 7.608487252993471e-06, "loss": 0.1018, "step": 1937 }, { "epoch": 1.38, "grad_norm": 10.482483420441799, "learning_rate": 7.6060211423520095e-06, "loss": 0.0607, "step": 1938 }, { "epoch": 1.38, "grad_norm": 39.33723284296271, "learning_rate": 7.6035541609904425e-06, "loss": 0.1287, "step": 1939 }, { "epoch": 1.38, "grad_norm": 16.80992526081807, "learning_rate": 7.60108630973304e-06, "loss": 0.0977, "step": 1940 }, { "epoch": 1.39, "grad_norm": 14.940668855422615, "learning_rate": 7.598617589404354e-06, "loss": 0.0879, "step": 1941 }, { "epoch": 1.39, "grad_norm": 7.542100439749129, "learning_rate": 7.596148000829229e-06, "loss": 0.1262, "step": 1942 }, { "epoch": 1.39, "grad_norm": 17.47476515065765, "learning_rate": 7.593677544832802e-06, "loss": 0.1219, "step": 1943 }, { "epoch": 1.39, "grad_norm": 7.605399845369206, "learning_rate": 7.5912062222404965e-06, "loss": 0.064, "step": 1944 }, { "epoch": 1.39, "grad_norm": 30.299165014514305, "learning_rate": 7.588734033878031e-06, "loss": 0.1134, "step": 1945 }, { "epoch": 1.39, "grad_norm": 12.53081257535877, "learning_rate": 7.586260980571407e-06, "loss": 0.0906, "step": 1946 }, { "epoch": 1.39, "grad_norm": 28.76809322028387, "learning_rate": 7.5837870631469165e-06, "loss": 0.1503, "step": 1947 }, { "epoch": 1.39, "grad_norm": 21.855556241959924, "learning_rate": 7.581312282431143e-06, "loss": 0.0753, "step": 1948 }, { "epoch": 1.39, "grad_norm": 29.170849942865587, "learning_rate": 7.578836639250958e-06, "loss": 0.0962, "step": 1949 }, { "epoch": 1.39, "grad_norm": 11.690758795177931, "learning_rate": 7.576360134433517e-06, "loss": 0.0917, "step": 1950 }, { "epoch": 1.39, "grad_norm": 12.408711643478298, "learning_rate": 7.5738827688062676e-06, "loss": 0.0862, "step": 1951 }, { "epoch": 1.39, "grad_norm": 13.868396523210292, "learning_rate": 7.571404543196943e-06, "loss": 0.1176, "step": 1952 }, { "epoch": 1.39, "grad_norm": 26.57491798793351, "learning_rate": 7.568925458433567e-06, "loss": 0.1008, "step": 1953 }, { "epoch": 1.39, "grad_norm": 5.974416690063627, "learning_rate": 7.566445515344445e-06, "loss": 0.0807, "step": 1954 }, { "epoch": 1.4, "grad_norm": 14.981211780495261, "learning_rate": 7.563964714758172e-06, "loss": 0.0732, "step": 1955 }, { "epoch": 1.4, "grad_norm": 30.306827143553917, "learning_rate": 7.561483057503632e-06, "loss": 0.1293, "step": 1956 }, { "epoch": 1.4, "grad_norm": 4.761468592139051, "learning_rate": 7.559000544409991e-06, "loss": 0.0891, "step": 1957 }, { "epoch": 1.4, "grad_norm": 10.534845276627891, "learning_rate": 7.556517176306704e-06, "loss": 0.0975, "step": 1958 }, { "epoch": 1.4, "grad_norm": 18.880359885969494, "learning_rate": 7.554032954023508e-06, "loss": 0.0979, "step": 1959 }, { "epoch": 1.4, "grad_norm": 15.143403477025016, "learning_rate": 7.55154787839043e-06, "loss": 0.1144, "step": 1960 }, { "epoch": 1.4, "grad_norm": 15.521438570335969, "learning_rate": 7.5490619502377805e-06, "loss": 0.1129, "step": 1961 }, { "epoch": 1.4, "grad_norm": 18.5242277495622, "learning_rate": 7.546575170396153e-06, "loss": 0.1074, "step": 1962 }, { "epoch": 1.4, "grad_norm": 24.927697963385498, "learning_rate": 7.544087539696427e-06, "loss": 0.1021, "step": 1963 }, { "epoch": 1.4, "grad_norm": 10.703714901569482, "learning_rate": 7.541599058969766e-06, "loss": 0.0956, "step": 1964 }, { "epoch": 1.4, "grad_norm": 34.83739720762142, "learning_rate": 7.539109729047619e-06, "loss": 0.121, "step": 1965 }, { "epoch": 1.4, "grad_norm": 10.37898184340536, "learning_rate": 7.5366195507617155e-06, "loss": 0.0914, "step": 1966 }, { "epoch": 1.4, "grad_norm": 32.17544750214366, "learning_rate": 7.534128524944071e-06, "loss": 0.0848, "step": 1967 }, { "epoch": 1.4, "grad_norm": 47.63891616778719, "learning_rate": 7.531636652426985e-06, "loss": 0.144, "step": 1968 }, { "epoch": 1.41, "grad_norm": 12.949628774093883, "learning_rate": 7.529143934043036e-06, "loss": 0.1061, "step": 1969 }, { "epoch": 1.41, "grad_norm": 32.96795634535262, "learning_rate": 7.526650370625088e-06, "loss": 0.1438, "step": 1970 }, { "epoch": 1.41, "grad_norm": 55.11333254799953, "learning_rate": 7.5241559630062896e-06, "loss": 0.1469, "step": 1971 }, { "epoch": 1.41, "grad_norm": 18.925573978897116, "learning_rate": 7.5216607120200655e-06, "loss": 0.0874, "step": 1972 }, { "epoch": 1.41, "grad_norm": 21.262962891421868, "learning_rate": 7.519164618500127e-06, "loss": 0.0812, "step": 1973 }, { "epoch": 1.41, "grad_norm": 41.7596554934021, "learning_rate": 7.5166676832804655e-06, "loss": 0.1268, "step": 1974 }, { "epoch": 1.41, "grad_norm": 11.588135089329397, "learning_rate": 7.514169907195352e-06, "loss": 0.0864, "step": 1975 }, { "epoch": 1.41, "grad_norm": 5.342692979719796, "learning_rate": 7.511671291079342e-06, "loss": 0.0705, "step": 1976 }, { "epoch": 1.41, "grad_norm": 5.712686440388007, "learning_rate": 7.509171835767268e-06, "loss": 0.0907, "step": 1977 }, { "epoch": 1.41, "grad_norm": 36.70804565028893, "learning_rate": 7.506671542094246e-06, "loss": 0.1252, "step": 1978 }, { "epoch": 1.41, "grad_norm": 10.489002626915171, "learning_rate": 7.504170410895668e-06, "loss": 0.0996, "step": 1979 }, { "epoch": 1.41, "grad_norm": 6.3444733677159295, "learning_rate": 7.501668443007212e-06, "loss": 0.0735, "step": 1980 }, { "epoch": 1.41, "grad_norm": 11.093855896464442, "learning_rate": 7.499165639264828e-06, "loss": 0.115, "step": 1981 }, { "epoch": 1.41, "grad_norm": 33.7558992291076, "learning_rate": 7.496662000504752e-06, "loss": 0.165, "step": 1982 }, { "epoch": 1.42, "grad_norm": 10.907611777929603, "learning_rate": 7.4941575275634945e-06, "loss": 0.11, "step": 1983 }, { "epoch": 1.42, "grad_norm": 28.17923318389303, "learning_rate": 7.49165222127785e-06, "loss": 0.0878, "step": 1984 }, { "epoch": 1.42, "grad_norm": 15.100103791027154, "learning_rate": 7.489146082484882e-06, "loss": 0.0727, "step": 1985 }, { "epoch": 1.42, "grad_norm": 12.562326160266043, "learning_rate": 7.486639112021944e-06, "loss": 0.0812, "step": 1986 }, { "epoch": 1.42, "grad_norm": 31.277970594845105, "learning_rate": 7.484131310726658e-06, "loss": 0.0872, "step": 1987 }, { "epoch": 1.42, "grad_norm": 14.473359713155952, "learning_rate": 7.481622679436929e-06, "loss": 0.0759, "step": 1988 }, { "epoch": 1.42, "grad_norm": 7.391891563984436, "learning_rate": 7.479113218990934e-06, "loss": 0.0832, "step": 1989 }, { "epoch": 1.42, "grad_norm": 6.8735457218956455, "learning_rate": 7.4766029302271335e-06, "loss": 0.108, "step": 1990 }, { "epoch": 1.42, "grad_norm": 17.97877074755628, "learning_rate": 7.474091813984261e-06, "loss": 0.0837, "step": 1991 }, { "epoch": 1.42, "grad_norm": 15.971760147819229, "learning_rate": 7.471579871101326e-06, "loss": 0.1023, "step": 1992 }, { "epoch": 1.42, "grad_norm": 7.376728168558531, "learning_rate": 7.4690671024176165e-06, "loss": 0.1162, "step": 1993 }, { "epoch": 1.42, "grad_norm": 9.360966837136843, "learning_rate": 7.466553508772695e-06, "loss": 0.0769, "step": 1994 }, { "epoch": 1.42, "grad_norm": 8.398769699308323, "learning_rate": 7.4640390910064e-06, "loss": 0.093, "step": 1995 }, { "epoch": 1.42, "grad_norm": 19.282073477802033, "learning_rate": 7.461523849958845e-06, "loss": 0.1223, "step": 1996 }, { "epoch": 1.43, "grad_norm": 9.488837006383456, "learning_rate": 7.459007786470418e-06, "loss": 0.0822, "step": 1997 }, { "epoch": 1.43, "grad_norm": 16.12399085356851, "learning_rate": 7.4564909013817845e-06, "loss": 0.1228, "step": 1998 }, { "epoch": 1.43, "grad_norm": 7.449212798239846, "learning_rate": 7.45397319553388e-06, "loss": 0.0666, "step": 1999 }, { "epoch": 1.43, "grad_norm": 30.988058304482948, "learning_rate": 7.451454669767919e-06, "loss": 0.1119, "step": 2000 }, { "epoch": 1.43, "eval_avg_AUC": 0.7993414523799219, "eval_avg_Accuracy": 0.6962864721485411, "eval_avg_Accuracy-right": 0.895069779574801, "eval_avg_Accuracy-wrong": 0.34967022970206957, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.6534253440240803, "eval_last_AUC": 0.796335909721104, "eval_last_Accuracy": 0.7389340185676393, "eval_last_Accuracy-right": 0.8014868918742664, "eval_last_Accuracy-wrong": 0.6298612690470775, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.6609678828685568, "eval_max_AUC": 0.731932816756531, "eval_max_Accuracy": 0.6443136604774535, "eval_max_Accuracy-right": 0.9846745793661145, "eval_max_Accuracy-wrong": 0.05083011143961792, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.597156835188284, "eval_min_AUC": 0.8068730624550771, "eval_min_Accuracy": 0.7409234084880637, "eval_min_Accuracy-right": 0.7598800052171645, "eval_min_Accuracy-wrong": 0.7078690015919945, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.662910126847143, "eval_prod_AUC": 0.8081306191935946, "eval_prod_Accuracy": 0.7138594164456233, "eval_prod_Accuracy-right": 0.6302334681100822, "eval_prod_Accuracy-wrong": 0.8596770525358198, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.653658027762353, "eval_runtime": 246.3352, "eval_samples_per_second": 97.948, "eval_steps_per_second": 3.061, "eval_sum_AUC": 0.6609696244629366, "eval_sum_Accuracy": 0.6379310344827587, "eval_sum_Accuracy-right": 0.9988261379940002, "eval_sum_Accuracy-wrong": 0.008642256083693428, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.6421408268547636, "step": 2000 }, { "epoch": 1.43, "grad_norm": 13.839120903654035, "learning_rate": 7.448935324925386e-06, "loss": 0.0573, "step": 2001 }, { "epoch": 1.43, "grad_norm": 6.253220256521355, "learning_rate": 7.446415161848043e-06, "loss": 0.0869, "step": 2002 }, { "epoch": 1.43, "grad_norm": 10.932247825123236, "learning_rate": 7.443894181377921e-06, "loss": 0.0869, "step": 2003 }, { "epoch": 1.43, "grad_norm": 14.774479223409482, "learning_rate": 7.441372384357328e-06, "loss": 0.0901, "step": 2004 }, { "epoch": 1.43, "grad_norm": 28.556495798503843, "learning_rate": 7.438849771628844e-06, "loss": 0.1105, "step": 2005 }, { "epoch": 1.43, "grad_norm": 10.991255023773615, "learning_rate": 7.43632634403532e-06, "loss": 0.0887, "step": 2006 }, { "epoch": 1.43, "grad_norm": 10.019756556058779, "learning_rate": 7.433802102419878e-06, "loss": 0.1019, "step": 2007 }, { "epoch": 1.43, "grad_norm": 26.827197018870162, "learning_rate": 7.431277047625918e-06, "loss": 0.1045, "step": 2008 }, { "epoch": 1.43, "grad_norm": 40.7772525438193, "learning_rate": 7.428751180497104e-06, "loss": 0.1014, "step": 2009 }, { "epoch": 1.43, "grad_norm": 10.794657275533215, "learning_rate": 7.426224501877376e-06, "loss": 0.1172, "step": 2010 }, { "epoch": 1.44, "grad_norm": 4.9017737328348465, "learning_rate": 7.423697012610947e-06, "loss": 0.0688, "step": 2011 }, { "epoch": 1.44, "grad_norm": 32.11030414551294, "learning_rate": 7.421168713542294e-06, "loss": 0.1136, "step": 2012 }, { "epoch": 1.44, "grad_norm": 23.50894816197612, "learning_rate": 7.418639605516172e-06, "loss": 0.103, "step": 2013 }, { "epoch": 1.44, "grad_norm": 7.586007382354752, "learning_rate": 7.416109689377603e-06, "loss": 0.121, "step": 2014 }, { "epoch": 1.44, "grad_norm": 8.23229798707242, "learning_rate": 7.413578965971876e-06, "loss": 0.0738, "step": 2015 }, { "epoch": 1.44, "grad_norm": 5.177266007361741, "learning_rate": 7.411047436144556e-06, "loss": 0.0636, "step": 2016 }, { "epoch": 1.44, "grad_norm": 7.750691106855549, "learning_rate": 7.408515100741471e-06, "loss": 0.1152, "step": 2017 }, { "epoch": 1.44, "grad_norm": 33.35658858310154, "learning_rate": 7.405981960608725e-06, "loss": 0.0986, "step": 2018 }, { "epoch": 1.44, "grad_norm": 12.949061067234727, "learning_rate": 7.403448016592685e-06, "loss": 0.0984, "step": 2019 }, { "epoch": 1.44, "grad_norm": 30.906573797223782, "learning_rate": 7.400913269539988e-06, "loss": 0.119, "step": 2020 }, { "epoch": 1.44, "grad_norm": 14.698468315096022, "learning_rate": 7.398377720297541e-06, "loss": 0.0934, "step": 2021 }, { "epoch": 1.44, "grad_norm": 38.759538949910294, "learning_rate": 7.39584136971252e-06, "loss": 0.1078, "step": 2022 }, { "epoch": 1.44, "grad_norm": 12.61303665133732, "learning_rate": 7.393304218632364e-06, "loss": 0.0614, "step": 2023 }, { "epoch": 1.44, "grad_norm": 33.99016387504461, "learning_rate": 7.390766267904783e-06, "loss": 0.1074, "step": 2024 }, { "epoch": 1.45, "grad_norm": 25.92261989754995, "learning_rate": 7.3882275183777554e-06, "loss": 0.1655, "step": 2025 }, { "epoch": 1.45, "grad_norm": 40.11705918481798, "learning_rate": 7.385687970899523e-06, "loss": 0.1057, "step": 2026 }, { "epoch": 1.45, "grad_norm": 10.286685919964656, "learning_rate": 7.3831476263185965e-06, "loss": 0.0775, "step": 2027 }, { "epoch": 1.45, "grad_norm": 15.0192688550375, "learning_rate": 7.380606485483751e-06, "loss": 0.0768, "step": 2028 }, { "epoch": 1.45, "grad_norm": 55.38449400925222, "learning_rate": 7.378064549244031e-06, "loss": 0.1389, "step": 2029 }, { "epoch": 1.45, "grad_norm": 23.97034166915613, "learning_rate": 7.375521818448741e-06, "loss": 0.1333, "step": 2030 }, { "epoch": 1.45, "grad_norm": 13.121072563372268, "learning_rate": 7.372978293947459e-06, "loss": 0.0995, "step": 2031 }, { "epoch": 1.45, "grad_norm": 45.71363021283654, "learning_rate": 7.3704339765900205e-06, "loss": 0.1544, "step": 2032 }, { "epoch": 1.45, "grad_norm": 29.422717589921906, "learning_rate": 7.367888867226531e-06, "loss": 0.1324, "step": 2033 }, { "epoch": 1.45, "grad_norm": 38.96958949899909, "learning_rate": 7.365342966707359e-06, "loss": 0.1, "step": 2034 }, { "epoch": 1.45, "grad_norm": 6.289954542505453, "learning_rate": 7.362796275883135e-06, "loss": 0.0754, "step": 2035 }, { "epoch": 1.45, "grad_norm": 10.57359176786369, "learning_rate": 7.360248795604758e-06, "loss": 0.0861, "step": 2036 }, { "epoch": 1.45, "grad_norm": 24.090251255820395, "learning_rate": 7.3577005267233885e-06, "loss": 0.0708, "step": 2037 }, { "epoch": 1.45, "grad_norm": 37.93968118245042, "learning_rate": 7.355151470090449e-06, "loss": 0.1194, "step": 2038 }, { "epoch": 1.46, "grad_norm": 13.449078150315941, "learning_rate": 7.352601626557628e-06, "loss": 0.0925, "step": 2039 }, { "epoch": 1.46, "grad_norm": 18.95997171971615, "learning_rate": 7.350050996976875e-06, "loss": 0.1005, "step": 2040 }, { "epoch": 1.46, "grad_norm": 26.92934782855485, "learning_rate": 7.347499582200404e-06, "loss": 0.0961, "step": 2041 }, { "epoch": 1.46, "grad_norm": 22.73730072835748, "learning_rate": 7.344947383080687e-06, "loss": 0.1058, "step": 2042 }, { "epoch": 1.46, "grad_norm": 22.503461102204398, "learning_rate": 7.342394400470463e-06, "loss": 0.0862, "step": 2043 }, { "epoch": 1.46, "grad_norm": 20.649364677252795, "learning_rate": 7.339840635222732e-06, "loss": 0.0664, "step": 2044 }, { "epoch": 1.46, "grad_norm": 11.339548594219169, "learning_rate": 7.337286088190754e-06, "loss": 0.0889, "step": 2045 }, { "epoch": 1.46, "grad_norm": 11.400084265838423, "learning_rate": 7.334730760228049e-06, "loss": 0.0951, "step": 2046 }, { "epoch": 1.46, "grad_norm": 14.486076375014688, "learning_rate": 7.332174652188401e-06, "loss": 0.1224, "step": 2047 }, { "epoch": 1.46, "grad_norm": 34.7417013692277, "learning_rate": 7.329617764925853e-06, "loss": 0.1178, "step": 2048 }, { "epoch": 1.46, "grad_norm": 30.777860474477603, "learning_rate": 7.32706009929471e-06, "loss": 0.1587, "step": 2049 }, { "epoch": 1.46, "grad_norm": 18.592979828725138, "learning_rate": 7.324501656149532e-06, "loss": 0.1007, "step": 2050 }, { "epoch": 1.46, "grad_norm": 42.347294863149365, "learning_rate": 7.321942436345146e-06, "loss": 0.1045, "step": 2051 }, { "epoch": 1.46, "grad_norm": 21.001661806227776, "learning_rate": 7.319382440736632e-06, "loss": 0.1101, "step": 2052 }, { "epoch": 1.47, "grad_norm": 19.577035199548465, "learning_rate": 7.316821670179335e-06, "loss": 0.1144, "step": 2053 }, { "epoch": 1.47, "grad_norm": 11.230363742790843, "learning_rate": 7.314260125528854e-06, "loss": 0.1404, "step": 2054 }, { "epoch": 1.47, "grad_norm": 17.499694248271904, "learning_rate": 7.311697807641048e-06, "loss": 0.0714, "step": 2055 }, { "epoch": 1.47, "grad_norm": 23.806058024752186, "learning_rate": 7.3091347173720386e-06, "loss": 0.1033, "step": 2056 }, { "epoch": 1.47, "grad_norm": 6.085617803027079, "learning_rate": 7.3065708555781986e-06, "loss": 0.0939, "step": 2057 }, { "epoch": 1.47, "grad_norm": 34.50468476483311, "learning_rate": 7.304006223116162e-06, "loss": 0.1208, "step": 2058 }, { "epoch": 1.47, "grad_norm": 13.365971501697732, "learning_rate": 7.301440820842822e-06, "loss": 0.09, "step": 2059 }, { "epoch": 1.47, "grad_norm": 10.044928264199827, "learning_rate": 7.298874649615327e-06, "loss": 0.153, "step": 2060 }, { "epoch": 1.47, "grad_norm": 28.362345328223054, "learning_rate": 7.29630771029108e-06, "loss": 0.0945, "step": 2061 }, { "epoch": 1.47, "grad_norm": 6.658453340675622, "learning_rate": 7.293740003727745e-06, "loss": 0.0809, "step": 2062 }, { "epoch": 1.47, "grad_norm": 15.121775553304609, "learning_rate": 7.291171530783241e-06, "loss": 0.1129, "step": 2063 }, { "epoch": 1.47, "grad_norm": 11.416830583602653, "learning_rate": 7.288602292315742e-06, "loss": 0.1333, "step": 2064 }, { "epoch": 1.47, "grad_norm": 5.084441019797188, "learning_rate": 7.286032289183679e-06, "loss": 0.1097, "step": 2065 }, { "epoch": 1.47, "grad_norm": 15.642621559627102, "learning_rate": 7.283461522245736e-06, "loss": 0.1122, "step": 2066 }, { "epoch": 1.48, "grad_norm": 41.35476137894392, "learning_rate": 7.280889992360856e-06, "loss": 0.1206, "step": 2067 }, { "epoch": 1.48, "grad_norm": 21.18129028339432, "learning_rate": 7.278317700388232e-06, "loss": 0.1133, "step": 2068 }, { "epoch": 1.48, "grad_norm": 19.902882472946708, "learning_rate": 7.275744647187318e-06, "loss": 0.1512, "step": 2069 }, { "epoch": 1.48, "grad_norm": 15.939602818419818, "learning_rate": 7.273170833617818e-06, "loss": 0.1044, "step": 2070 }, { "epoch": 1.48, "grad_norm": 31.533034131028536, "learning_rate": 7.2705962605396895e-06, "loss": 0.1394, "step": 2071 }, { "epoch": 1.48, "grad_norm": 9.111943201249398, "learning_rate": 7.268020928813147e-06, "loss": 0.0912, "step": 2072 }, { "epoch": 1.48, "grad_norm": 13.514191499285351, "learning_rate": 7.265444839298656e-06, "loss": 0.0923, "step": 2073 }, { "epoch": 1.48, "grad_norm": 6.330749877489256, "learning_rate": 7.262867992856934e-06, "loss": 0.0975, "step": 2074 }, { "epoch": 1.48, "grad_norm": 30.935621396553604, "learning_rate": 7.260290390348956e-06, "loss": 0.1388, "step": 2075 }, { "epoch": 1.48, "grad_norm": 6.688613763402811, "learning_rate": 7.257712032635946e-06, "loss": 0.1047, "step": 2076 }, { "epoch": 1.48, "grad_norm": 13.33335451161966, "learning_rate": 7.255132920579382e-06, "loss": 0.1041, "step": 2077 }, { "epoch": 1.48, "grad_norm": 8.619130876094033, "learning_rate": 7.252553055040991e-06, "loss": 0.0897, "step": 2078 }, { "epoch": 1.48, "grad_norm": 12.168562646783498, "learning_rate": 7.249972436882756e-06, "loss": 0.1006, "step": 2079 }, { "epoch": 1.48, "grad_norm": 14.278860858221417, "learning_rate": 7.247391066966909e-06, "loss": 0.0814, "step": 2080 }, { "epoch": 1.49, "grad_norm": 13.260089786437403, "learning_rate": 7.244808946155933e-06, "loss": 0.0886, "step": 2081 }, { "epoch": 1.49, "grad_norm": 5.521768030802181, "learning_rate": 7.242226075312564e-06, "loss": 0.0858, "step": 2082 }, { "epoch": 1.49, "grad_norm": 13.198034640001179, "learning_rate": 7.239642455299787e-06, "loss": 0.0963, "step": 2083 }, { "epoch": 1.49, "grad_norm": 35.149002150189034, "learning_rate": 7.237058086980835e-06, "loss": 0.1558, "step": 2084 }, { "epoch": 1.49, "grad_norm": 10.810236800782775, "learning_rate": 7.234472971219197e-06, "loss": 0.08, "step": 2085 }, { "epoch": 1.49, "grad_norm": 14.523629121141207, "learning_rate": 7.231887108878606e-06, "loss": 0.1177, "step": 2086 }, { "epoch": 1.49, "grad_norm": 9.493707671164756, "learning_rate": 7.229300500823047e-06, "loss": 0.1161, "step": 2087 }, { "epoch": 1.49, "grad_norm": 7.760754279339331, "learning_rate": 7.226713147916754e-06, "loss": 0.0781, "step": 2088 }, { "epoch": 1.49, "grad_norm": 10.837060450720395, "learning_rate": 7.22412505102421e-06, "loss": 0.0732, "step": 2089 }, { "epoch": 1.49, "grad_norm": 21.583304857531907, "learning_rate": 7.221536211010147e-06, "loss": 0.0729, "step": 2090 }, { "epoch": 1.49, "grad_norm": 12.445127735446684, "learning_rate": 7.2189466287395425e-06, "loss": 0.0918, "step": 2091 }, { "epoch": 1.49, "grad_norm": 8.94159436880388, "learning_rate": 7.216356305077625e-06, "loss": 0.1051, "step": 2092 }, { "epoch": 1.49, "grad_norm": 18.138755911165397, "learning_rate": 7.21376524088987e-06, "loss": 0.1355, "step": 2093 }, { "epoch": 1.49, "grad_norm": 15.689142765328521, "learning_rate": 7.211173437042001e-06, "loss": 0.0781, "step": 2094 }, { "epoch": 1.5, "grad_norm": 37.30887264410398, "learning_rate": 7.208580894399986e-06, "loss": 0.1034, "step": 2095 }, { "epoch": 1.5, "grad_norm": 27.998517100165653, "learning_rate": 7.205987613830043e-06, "loss": 0.1226, "step": 2096 }, { "epoch": 1.5, "grad_norm": 27.401516777324407, "learning_rate": 7.203393596198635e-06, "loss": 0.1133, "step": 2097 }, { "epoch": 1.5, "grad_norm": 23.244970427895833, "learning_rate": 7.200798842372472e-06, "loss": 0.1119, "step": 2098 }, { "epoch": 1.5, "grad_norm": 24.537751698472384, "learning_rate": 7.198203353218508e-06, "loss": 0.1145, "step": 2099 }, { "epoch": 1.5, "grad_norm": 6.5385945727391395, "learning_rate": 7.195607129603946e-06, "loss": 0.0945, "step": 2100 }, { "epoch": 1.5, "grad_norm": 17.09406318610685, "learning_rate": 7.19301017239623e-06, "loss": 0.0776, "step": 2101 }, { "epoch": 1.5, "grad_norm": 7.953522177542275, "learning_rate": 7.190412482463054e-06, "loss": 0.1013, "step": 2102 }, { "epoch": 1.5, "grad_norm": 31.262974276965277, "learning_rate": 7.187814060672354e-06, "loss": 0.1171, "step": 2103 }, { "epoch": 1.5, "grad_norm": 10.270590362702018, "learning_rate": 7.1852149078923105e-06, "loss": 0.1014, "step": 2104 }, { "epoch": 1.5, "grad_norm": 12.499466735424543, "learning_rate": 7.1826150249913495e-06, "loss": 0.131, "step": 2105 }, { "epoch": 1.5, "grad_norm": 26.68680284528321, "learning_rate": 7.18001441283814e-06, "loss": 0.0918, "step": 2106 }, { "epoch": 1.5, "grad_norm": 42.91007158573037, "learning_rate": 7.1774130723015955e-06, "loss": 0.1384, "step": 2107 }, { "epoch": 1.5, "grad_norm": 8.298129546397226, "learning_rate": 7.17481100425087e-06, "loss": 0.0711, "step": 2108 }, { "epoch": 1.51, "grad_norm": 22.30804777349356, "learning_rate": 7.172208209555365e-06, "loss": 0.077, "step": 2109 }, { "epoch": 1.51, "grad_norm": 21.45735632031128, "learning_rate": 7.1696046890847206e-06, "loss": 0.1058, "step": 2110 }, { "epoch": 1.51, "grad_norm": 21.40607494365893, "learning_rate": 7.167000443708823e-06, "loss": 0.1253, "step": 2111 }, { "epoch": 1.51, "grad_norm": 7.216056021799717, "learning_rate": 7.164395474297798e-06, "loss": 0.067, "step": 2112 }, { "epoch": 1.51, "grad_norm": 22.5190941505317, "learning_rate": 7.161789781722016e-06, "loss": 0.1038, "step": 2113 }, { "epoch": 1.51, "grad_norm": 24.586064614156214, "learning_rate": 7.159183366852085e-06, "loss": 0.1046, "step": 2114 }, { "epoch": 1.51, "grad_norm": 11.51113614776396, "learning_rate": 7.156576230558859e-06, "loss": 0.1046, "step": 2115 }, { "epoch": 1.51, "grad_norm": 70.95164168879344, "learning_rate": 7.153968373713429e-06, "loss": 0.1827, "step": 2116 }, { "epoch": 1.51, "grad_norm": 6.699926065717833, "learning_rate": 7.1513597971871295e-06, "loss": 0.0992, "step": 2117 }, { "epoch": 1.51, "grad_norm": 11.794229177328235, "learning_rate": 7.148750501851532e-06, "loss": 0.0793, "step": 2118 }, { "epoch": 1.51, "grad_norm": 18.355183637296523, "learning_rate": 7.1461404885784545e-06, "loss": 0.1051, "step": 2119 }, { "epoch": 1.51, "grad_norm": 32.86488907052558, "learning_rate": 7.1435297582399475e-06, "loss": 0.1007, "step": 2120 }, { "epoch": 1.51, "grad_norm": 21.387125829223244, "learning_rate": 7.140918311708306e-06, "loss": 0.0792, "step": 2121 }, { "epoch": 1.51, "grad_norm": 13.590993567540462, "learning_rate": 7.138306149856062e-06, "loss": 0.075, "step": 2122 }, { "epoch": 1.52, "grad_norm": 6.447054862968271, "learning_rate": 7.1356932735559905e-06, "loss": 0.085, "step": 2123 }, { "epoch": 1.52, "grad_norm": 25.482506147936167, "learning_rate": 7.133079683681099e-06, "loss": 0.1274, "step": 2124 }, { "epoch": 1.52, "grad_norm": 9.387675092660638, "learning_rate": 7.130465381104635e-06, "loss": 0.0701, "step": 2125 }, { "epoch": 1.52, "grad_norm": 8.715433430188718, "learning_rate": 7.1278503667000885e-06, "loss": 0.0797, "step": 2126 }, { "epoch": 1.52, "grad_norm": 8.423200658182854, "learning_rate": 7.125234641341185e-06, "loss": 0.0934, "step": 2127 }, { "epoch": 1.52, "grad_norm": 6.710068469682492, "learning_rate": 7.1226182059018835e-06, "loss": 0.0822, "step": 2128 }, { "epoch": 1.52, "grad_norm": 13.329201988437877, "learning_rate": 7.120001061256387e-06, "loss": 0.0792, "step": 2129 }, { "epoch": 1.52, "grad_norm": 22.059536956159867, "learning_rate": 7.1173832082791294e-06, "loss": 0.1038, "step": 2130 }, { "epoch": 1.52, "grad_norm": 13.826617022708637, "learning_rate": 7.114764647844788e-06, "loss": 0.1299, "step": 2131 }, { "epoch": 1.52, "grad_norm": 17.18931427687854, "learning_rate": 7.112145380828267e-06, "loss": 0.0983, "step": 2132 }, { "epoch": 1.52, "grad_norm": 12.456228718757368, "learning_rate": 7.109525408104717e-06, "loss": 0.125, "step": 2133 }, { "epoch": 1.52, "grad_norm": 12.457600501785418, "learning_rate": 7.106904730549517e-06, "loss": 0.0661, "step": 2134 }, { "epoch": 1.52, "grad_norm": 19.169864989415867, "learning_rate": 7.104283349038285e-06, "loss": 0.0616, "step": 2135 }, { "epoch": 1.52, "grad_norm": 28.451596210521583, "learning_rate": 7.101661264446875e-06, "loss": 0.1067, "step": 2136 }, { "epoch": 1.53, "grad_norm": 11.557672547776232, "learning_rate": 7.099038477651371e-06, "loss": 0.0716, "step": 2137 }, { "epoch": 1.53, "grad_norm": 13.915155495212185, "learning_rate": 7.096414989528095e-06, "loss": 0.0714, "step": 2138 }, { "epoch": 1.53, "grad_norm": 28.111132386929246, "learning_rate": 7.093790800953606e-06, "loss": 0.0987, "step": 2139 }, { "epoch": 1.53, "grad_norm": 13.342771869341991, "learning_rate": 7.091165912804693e-06, "loss": 0.1157, "step": 2140 }, { "epoch": 1.53, "grad_norm": 10.294032473258458, "learning_rate": 7.088540325958379e-06, "loss": 0.1033, "step": 2141 }, { "epoch": 1.53, "grad_norm": 6.93465060964492, "learning_rate": 7.085914041291921e-06, "loss": 0.0911, "step": 2142 }, { "epoch": 1.53, "grad_norm": 17.880292154170586, "learning_rate": 7.08328705968281e-06, "loss": 0.1404, "step": 2143 }, { "epoch": 1.53, "grad_norm": 22.929359757090396, "learning_rate": 7.080659382008772e-06, "loss": 0.1053, "step": 2144 }, { "epoch": 1.53, "grad_norm": 7.958105578684251, "learning_rate": 7.078031009147759e-06, "loss": 0.1392, "step": 2145 }, { "epoch": 1.53, "grad_norm": 14.942761890735525, "learning_rate": 7.075401941977961e-06, "loss": 0.0994, "step": 2146 }, { "epoch": 1.53, "grad_norm": 6.000079482176476, "learning_rate": 7.072772181377798e-06, "loss": 0.0935, "step": 2147 }, { "epoch": 1.53, "grad_norm": 9.21102239643325, "learning_rate": 7.070141728225922e-06, "loss": 0.0652, "step": 2148 }, { "epoch": 1.53, "grad_norm": 10.92020872783142, "learning_rate": 7.067510583401217e-06, "loss": 0.0845, "step": 2149 }, { "epoch": 1.53, "grad_norm": 4.251963896243572, "learning_rate": 7.0648787477827965e-06, "loss": 0.0601, "step": 2150 }, { "epoch": 1.54, "grad_norm": 7.8790995364041905, "learning_rate": 7.062246222250005e-06, "loss": 0.1079, "step": 2151 }, { "epoch": 1.54, "grad_norm": 16.979064512752753, "learning_rate": 7.05961300768242e-06, "loss": 0.1107, "step": 2152 }, { "epoch": 1.54, "grad_norm": 39.1816186306808, "learning_rate": 7.056979104959847e-06, "loss": 0.1908, "step": 2153 }, { "epoch": 1.54, "grad_norm": 15.074164527605571, "learning_rate": 7.054344514962319e-06, "loss": 0.0779, "step": 2154 }, { "epoch": 1.54, "grad_norm": 19.191667670584966, "learning_rate": 7.051709238570106e-06, "loss": 0.1405, "step": 2155 }, { "epoch": 1.54, "grad_norm": 27.248491502820745, "learning_rate": 7.0490732766637e-06, "loss": 0.1021, "step": 2156 }, { "epoch": 1.54, "grad_norm": 9.506985241043234, "learning_rate": 7.046436630123826e-06, "loss": 0.1384, "step": 2157 }, { "epoch": 1.54, "grad_norm": 6.615642105259546, "learning_rate": 7.043799299831438e-06, "loss": 0.1138, "step": 2158 }, { "epoch": 1.54, "grad_norm": 8.07347512799611, "learning_rate": 7.041161286667713e-06, "loss": 0.1147, "step": 2159 }, { "epoch": 1.54, "grad_norm": 24.75914750945603, "learning_rate": 7.038522591514061e-06, "loss": 0.0977, "step": 2160 }, { "epoch": 1.54, "grad_norm": 20.871761477687055, "learning_rate": 7.035883215252123e-06, "loss": 0.0725, "step": 2161 }, { "epoch": 1.54, "grad_norm": 7.2177050883819165, "learning_rate": 7.03324315876376e-06, "loss": 0.0817, "step": 2162 }, { "epoch": 1.54, "grad_norm": 10.886438037155624, "learning_rate": 7.030602422931065e-06, "loss": 0.0947, "step": 2163 }, { "epoch": 1.54, "grad_norm": 32.18182550206542, "learning_rate": 7.027961008636359e-06, "loss": 0.1033, "step": 2164 }, { "epoch": 1.55, "grad_norm": 10.8123263405742, "learning_rate": 7.025318916762185e-06, "loss": 0.0902, "step": 2165 }, { "epoch": 1.55, "grad_norm": 5.74661964617569, "learning_rate": 7.022676148191315e-06, "loss": 0.1548, "step": 2166 }, { "epoch": 1.55, "grad_norm": 11.66124484632883, "learning_rate": 7.020032703806748e-06, "loss": 0.1166, "step": 2167 }, { "epoch": 1.55, "grad_norm": 5.129462459098309, "learning_rate": 7.017388584491709e-06, "loss": 0.0863, "step": 2168 }, { "epoch": 1.55, "grad_norm": 19.70040390124027, "learning_rate": 7.014743791129644e-06, "loss": 0.1052, "step": 2169 }, { "epoch": 1.55, "grad_norm": 4.783745164454452, "learning_rate": 7.012098324604231e-06, "loss": 0.0804, "step": 2170 }, { "epoch": 1.55, "grad_norm": 5.83062015200628, "learning_rate": 7.009452185799368e-06, "loss": 0.0876, "step": 2171 }, { "epoch": 1.55, "grad_norm": 8.96404912676571, "learning_rate": 7.00680537559918e-06, "loss": 0.0917, "step": 2172 }, { "epoch": 1.55, "grad_norm": 26.88319898284892, "learning_rate": 7.0041578948880155e-06, "loss": 0.0736, "step": 2173 }, { "epoch": 1.55, "grad_norm": 11.385232172270138, "learning_rate": 7.001509744550446e-06, "loss": 0.1053, "step": 2174 }, { "epoch": 1.55, "grad_norm": 7.771562344479279, "learning_rate": 6.998860925471267e-06, "loss": 0.0853, "step": 2175 }, { "epoch": 1.55, "grad_norm": 18.270461704058164, "learning_rate": 6.9962114385355e-06, "loss": 0.0884, "step": 2176 }, { "epoch": 1.55, "grad_norm": 43.811495080381825, "learning_rate": 6.993561284628388e-06, "loss": 0.1189, "step": 2177 }, { "epoch": 1.55, "grad_norm": 9.153350063653045, "learning_rate": 6.990910464635395e-06, "loss": 0.075, "step": 2178 }, { "epoch": 1.56, "grad_norm": 16.548387260500697, "learning_rate": 6.9882589794422105e-06, "loss": 0.0931, "step": 2179 }, { "epoch": 1.56, "grad_norm": 25.197771744639976, "learning_rate": 6.9856068299347455e-06, "loss": 0.1284, "step": 2180 }, { "epoch": 1.56, "grad_norm": 30.717886675806074, "learning_rate": 6.98295401699913e-06, "loss": 0.0989, "step": 2181 }, { "epoch": 1.56, "grad_norm": 22.815922730000807, "learning_rate": 6.980300541521721e-06, "loss": 0.1202, "step": 2182 }, { "epoch": 1.56, "grad_norm": 8.881826965201908, "learning_rate": 6.977646404389092e-06, "loss": 0.0764, "step": 2183 }, { "epoch": 1.56, "grad_norm": 74.805756583807, "learning_rate": 6.9749916064880404e-06, "loss": 0.1982, "step": 2184 }, { "epoch": 1.56, "grad_norm": 15.273061210962908, "learning_rate": 6.972336148705583e-06, "loss": 0.0898, "step": 2185 }, { "epoch": 1.56, "grad_norm": 26.839917671298732, "learning_rate": 6.969680031928959e-06, "loss": 0.1118, "step": 2186 }, { "epoch": 1.56, "grad_norm": 18.784018802631863, "learning_rate": 6.967023257045624e-06, "loss": 0.1224, "step": 2187 }, { "epoch": 1.56, "grad_norm": 42.06255919354335, "learning_rate": 6.96436582494326e-06, "loss": 0.0928, "step": 2188 }, { "epoch": 1.56, "grad_norm": 38.1069266876009, "learning_rate": 6.961707736509759e-06, "loss": 0.1373, "step": 2189 }, { "epoch": 1.56, "grad_norm": 29.757725569858078, "learning_rate": 6.959048992633241e-06, "loss": 0.0897, "step": 2190 }, { "epoch": 1.56, "grad_norm": 4.726770945951311, "learning_rate": 6.956389594202041e-06, "loss": 0.0917, "step": 2191 }, { "epoch": 1.56, "grad_norm": 36.79881925491694, "learning_rate": 6.953729542104713e-06, "loss": 0.1097, "step": 2192 }, { "epoch": 1.57, "grad_norm": 44.56686035867685, "learning_rate": 6.951068837230032e-06, "loss": 0.1515, "step": 2193 }, { "epoch": 1.57, "grad_norm": 27.897014911286362, "learning_rate": 6.9484074804669865e-06, "loss": 0.1221, "step": 2194 }, { "epoch": 1.57, "grad_norm": 17.338158874149745, "learning_rate": 6.945745472704786e-06, "loss": 0.1188, "step": 2195 }, { "epoch": 1.57, "grad_norm": 38.83775106356088, "learning_rate": 6.943082814832858e-06, "loss": 0.0985, "step": 2196 }, { "epoch": 1.57, "grad_norm": 44.22872017990922, "learning_rate": 6.940419507740843e-06, "loss": 0.1453, "step": 2197 }, { "epoch": 1.57, "grad_norm": 23.616953797426014, "learning_rate": 6.937755552318606e-06, "loss": 0.0958, "step": 2198 }, { "epoch": 1.57, "grad_norm": 16.228217397227283, "learning_rate": 6.935090949456219e-06, "loss": 0.0956, "step": 2199 }, { "epoch": 1.57, "grad_norm": 16.949120884184445, "learning_rate": 6.93242570004398e-06, "loss": 0.1097, "step": 2200 }, { "epoch": 1.57, "grad_norm": 45.65816773721843, "learning_rate": 6.929759804972394e-06, "loss": 0.1361, "step": 2201 }, { "epoch": 1.57, "grad_norm": 28.0710224178119, "learning_rate": 6.92709326513219e-06, "loss": 0.11, "step": 2202 }, { "epoch": 1.57, "grad_norm": 13.390078383095538, "learning_rate": 6.924426081414305e-06, "loss": 0.088, "step": 2203 }, { "epoch": 1.57, "grad_norm": 5.427364387155767, "learning_rate": 6.921758254709897e-06, "loss": 0.0829, "step": 2204 }, { "epoch": 1.57, "grad_norm": 26.214900361222632, "learning_rate": 6.919089785910336e-06, "loss": 0.0849, "step": 2205 }, { "epoch": 1.57, "grad_norm": 31.71611812858684, "learning_rate": 6.916420675907207e-06, "loss": 0.0957, "step": 2206 }, { "epoch": 1.58, "grad_norm": 21.55026185223569, "learning_rate": 6.9137509255923085e-06, "loss": 0.1045, "step": 2207 }, { "epoch": 1.58, "grad_norm": 5.845490265416999, "learning_rate": 6.911080535857655e-06, "loss": 0.0778, "step": 2208 }, { "epoch": 1.58, "grad_norm": 18.755449635394918, "learning_rate": 6.908409507595472e-06, "loss": 0.114, "step": 2209 }, { "epoch": 1.58, "grad_norm": 27.69838224283327, "learning_rate": 6.905737841698201e-06, "loss": 0.1097, "step": 2210 }, { "epoch": 1.58, "grad_norm": 13.295405659497893, "learning_rate": 6.903065539058496e-06, "loss": 0.0918, "step": 2211 }, { "epoch": 1.58, "grad_norm": 9.442426195870846, "learning_rate": 6.900392600569219e-06, "loss": 0.0647, "step": 2212 }, { "epoch": 1.58, "grad_norm": 8.137208442510627, "learning_rate": 6.897719027123451e-06, "loss": 0.1304, "step": 2213 }, { "epoch": 1.58, "grad_norm": 29.345677962603688, "learning_rate": 6.895044819614484e-06, "loss": 0.1, "step": 2214 }, { "epoch": 1.58, "grad_norm": 17.490421643869904, "learning_rate": 6.8923699789358185e-06, "loss": 0.0791, "step": 2215 }, { "epoch": 1.58, "grad_norm": 14.661343717167206, "learning_rate": 6.88969450598117e-06, "loss": 0.068, "step": 2216 }, { "epoch": 1.58, "grad_norm": 18.33228470361208, "learning_rate": 6.887018401644463e-06, "loss": 0.1045, "step": 2217 }, { "epoch": 1.58, "grad_norm": 11.382139849610716, "learning_rate": 6.884341666819832e-06, "loss": 0.0778, "step": 2218 }, { "epoch": 1.58, "grad_norm": 9.342522202577735, "learning_rate": 6.881664302401626e-06, "loss": 0.0865, "step": 2219 }, { "epoch": 1.58, "grad_norm": 11.14544309202564, "learning_rate": 6.878986309284401e-06, "loss": 0.0933, "step": 2220 }, { "epoch": 1.59, "grad_norm": 6.725068137788035, "learning_rate": 6.876307688362925e-06, "loss": 0.0895, "step": 2221 }, { "epoch": 1.59, "grad_norm": 5.054484543343092, "learning_rate": 6.873628440532175e-06, "loss": 0.0834, "step": 2222 }, { "epoch": 1.59, "grad_norm": 10.231322227289729, "learning_rate": 6.8709485666873375e-06, "loss": 0.104, "step": 2223 }, { "epoch": 1.59, "grad_norm": 5.956539872126496, "learning_rate": 6.868268067723808e-06, "loss": 0.1002, "step": 2224 }, { "epoch": 1.59, "grad_norm": 17.94119750705428, "learning_rate": 6.86558694453719e-06, "loss": 0.1049, "step": 2225 }, { "epoch": 1.59, "grad_norm": 7.710067333196714, "learning_rate": 6.8629051980233e-06, "loss": 0.0728, "step": 2226 }, { "epoch": 1.59, "grad_norm": 13.507965465766109, "learning_rate": 6.860222829078156e-06, "loss": 0.0928, "step": 2227 }, { "epoch": 1.59, "grad_norm": 15.267717469976887, "learning_rate": 6.857539838597987e-06, "loss": 0.0722, "step": 2228 }, { "epoch": 1.59, "grad_norm": 6.716234484479027, "learning_rate": 6.8548562274792325e-06, "loss": 0.0989, "step": 2229 }, { "epoch": 1.59, "grad_norm": 32.705783237669955, "learning_rate": 6.8521719966185355e-06, "loss": 0.1067, "step": 2230 }, { "epoch": 1.59, "grad_norm": 9.096469392127245, "learning_rate": 6.8494871469127474e-06, "loss": 0.1183, "step": 2231 }, { "epoch": 1.59, "grad_norm": 7.547206226988792, "learning_rate": 6.846801679258926e-06, "loss": 0.1047, "step": 2232 }, { "epoch": 1.59, "grad_norm": 13.029631509644526, "learning_rate": 6.844115594554338e-06, "loss": 0.0861, "step": 2233 }, { "epoch": 1.59, "grad_norm": 43.40421796011947, "learning_rate": 6.841428893696453e-06, "loss": 0.1598, "step": 2234 }, { "epoch": 1.6, "grad_norm": 18.30412375137895, "learning_rate": 6.838741577582946e-06, "loss": 0.1125, "step": 2235 }, { "epoch": 1.6, "grad_norm": 4.815879130885394, "learning_rate": 6.836053647111701e-06, "loss": 0.0886, "step": 2236 }, { "epoch": 1.6, "grad_norm": 20.612409133400302, "learning_rate": 6.833365103180806e-06, "loss": 0.1157, "step": 2237 }, { "epoch": 1.6, "grad_norm": 23.966186458842202, "learning_rate": 6.830675946688552e-06, "loss": 0.0892, "step": 2238 }, { "epoch": 1.6, "grad_norm": 7.940623113295723, "learning_rate": 6.827986178533437e-06, "loss": 0.1117, "step": 2239 }, { "epoch": 1.6, "grad_norm": 7.44672935549102, "learning_rate": 6.825295799614163e-06, "loss": 0.1162, "step": 2240 }, { "epoch": 1.6, "grad_norm": 14.891138354626657, "learning_rate": 6.822604810829634e-06, "loss": 0.0913, "step": 2241 }, { "epoch": 1.6, "grad_norm": 17.653225783254584, "learning_rate": 6.819913213078961e-06, "loss": 0.0951, "step": 2242 }, { "epoch": 1.6, "grad_norm": 5.9242045134536525, "learning_rate": 6.817221007261456e-06, "loss": 0.1039, "step": 2243 }, { "epoch": 1.6, "grad_norm": 14.31038138115803, "learning_rate": 6.814528194276636e-06, "loss": 0.0687, "step": 2244 }, { "epoch": 1.6, "grad_norm": 8.40265784360439, "learning_rate": 6.811834775024219e-06, "loss": 0.1125, "step": 2245 }, { "epoch": 1.6, "grad_norm": 20.076086550456807, "learning_rate": 6.809140750404127e-06, "loss": 0.0917, "step": 2246 }, { "epoch": 1.6, "grad_norm": 23.9239943476738, "learning_rate": 6.8064461213164825e-06, "loss": 0.1105, "step": 2247 }, { "epoch": 1.6, "grad_norm": 7.421604483486793, "learning_rate": 6.803750888661611e-06, "loss": 0.0867, "step": 2248 }, { "epoch": 1.61, "grad_norm": 26.261644106440436, "learning_rate": 6.8010550533400425e-06, "loss": 0.1093, "step": 2249 }, { "epoch": 1.61, "grad_norm": 24.854024525208718, "learning_rate": 6.798358616252503e-06, "loss": 0.09, "step": 2250 }, { "epoch": 1.61, "grad_norm": 28.96708219096894, "learning_rate": 6.795661578299924e-06, "loss": 0.0673, "step": 2251 }, { "epoch": 1.61, "grad_norm": 11.997934589159817, "learning_rate": 6.792963940383436e-06, "loss": 0.132, "step": 2252 }, { "epoch": 1.61, "grad_norm": 51.52301595812061, "learning_rate": 6.790265703404368e-06, "loss": 0.1309, "step": 2253 }, { "epoch": 1.61, "grad_norm": 33.198458196121244, "learning_rate": 6.787566868264253e-06, "loss": 0.1149, "step": 2254 }, { "epoch": 1.61, "grad_norm": 39.63747128767255, "learning_rate": 6.7848674358648195e-06, "loss": 0.1301, "step": 2255 }, { "epoch": 1.61, "grad_norm": 16.270808913561332, "learning_rate": 6.782167407108001e-06, "loss": 0.1317, "step": 2256 }, { "epoch": 1.61, "grad_norm": 32.72395746940256, "learning_rate": 6.779466782895926e-06, "loss": 0.087, "step": 2257 }, { "epoch": 1.61, "grad_norm": 48.91482478473609, "learning_rate": 6.7767655641309234e-06, "loss": 0.1608, "step": 2258 }, { "epoch": 1.61, "grad_norm": 34.08193473685804, "learning_rate": 6.7740637517155205e-06, "loss": 0.1154, "step": 2259 }, { "epoch": 1.61, "grad_norm": 12.943831638411528, "learning_rate": 6.771361346552445e-06, "loss": 0.1069, "step": 2260 }, { "epoch": 1.61, "grad_norm": 7.135693851021515, "learning_rate": 6.7686583495446164e-06, "loss": 0.0879, "step": 2261 }, { "epoch": 1.61, "grad_norm": 37.11558243569602, "learning_rate": 6.765954761595161e-06, "loss": 0.1265, "step": 2262 }, { "epoch": 1.62, "grad_norm": 51.426289905430444, "learning_rate": 6.763250583607392e-06, "loss": 0.1113, "step": 2263 }, { "epoch": 1.62, "grad_norm": 24.538812354381417, "learning_rate": 6.7605458164848316e-06, "loss": 0.1005, "step": 2264 }, { "epoch": 1.62, "grad_norm": 13.374751197386294, "learning_rate": 6.75784046113119e-06, "loss": 0.1232, "step": 2265 }, { "epoch": 1.62, "grad_norm": 19.870883488045923, "learning_rate": 6.755134518450377e-06, "loss": 0.1033, "step": 2266 }, { "epoch": 1.62, "grad_norm": 9.625372295659911, "learning_rate": 6.752427989346497e-06, "loss": 0.1268, "step": 2267 }, { "epoch": 1.62, "grad_norm": 10.575802908582206, "learning_rate": 6.749720874723854e-06, "loss": 0.0843, "step": 2268 }, { "epoch": 1.62, "grad_norm": 25.526322460359744, "learning_rate": 6.747013175486944e-06, "loss": 0.1187, "step": 2269 }, { "epoch": 1.62, "grad_norm": 7.939467806031972, "learning_rate": 6.74430489254046e-06, "loss": 0.088, "step": 2270 }, { "epoch": 1.62, "grad_norm": 4.729990582136349, "learning_rate": 6.741596026789288e-06, "loss": 0.1049, "step": 2271 }, { "epoch": 1.62, "grad_norm": 6.467119491306653, "learning_rate": 6.7388865791385124e-06, "loss": 0.0942, "step": 2272 }, { "epoch": 1.62, "grad_norm": 9.876980913769884, "learning_rate": 6.736176550493411e-06, "loss": 0.1053, "step": 2273 }, { "epoch": 1.62, "grad_norm": 20.755840347838348, "learning_rate": 6.7334659417594514e-06, "loss": 0.116, "step": 2274 }, { "epoch": 1.62, "grad_norm": 17.06569796812762, "learning_rate": 6.730754753842303e-06, "loss": 0.1023, "step": 2275 }, { "epoch": 1.62, "grad_norm": 7.8737047418848904, "learning_rate": 6.728042987647818e-06, "loss": 0.0779, "step": 2276 }, { "epoch": 1.63, "grad_norm": 31.73870174511549, "learning_rate": 6.725330644082054e-06, "loss": 0.1567, "step": 2277 }, { "epoch": 1.63, "grad_norm": 27.556880854348176, "learning_rate": 6.7226177240512516e-06, "loss": 0.085, "step": 2278 }, { "epoch": 1.63, "grad_norm": 11.788183835441814, "learning_rate": 6.7199042284618484e-06, "loss": 0.093, "step": 2279 }, { "epoch": 1.63, "grad_norm": 31.31737563867473, "learning_rate": 6.717190158220475e-06, "loss": 0.1143, "step": 2280 }, { "epoch": 1.63, "grad_norm": 5.564357936167133, "learning_rate": 6.714475514233951e-06, "loss": 0.0759, "step": 2281 }, { "epoch": 1.63, "grad_norm": 46.40291596398642, "learning_rate": 6.71176029740929e-06, "loss": 0.1226, "step": 2282 }, { "epoch": 1.63, "grad_norm": 24.154317765086333, "learning_rate": 6.709044508653697e-06, "loss": 0.1367, "step": 2283 }, { "epoch": 1.63, "grad_norm": 6.925533803113575, "learning_rate": 6.706328148874568e-06, "loss": 0.1111, "step": 2284 }, { "epoch": 1.63, "grad_norm": 17.52523515570555, "learning_rate": 6.703611218979488e-06, "loss": 0.113, "step": 2285 }, { "epoch": 1.63, "grad_norm": 32.63171057290881, "learning_rate": 6.700893719876234e-06, "loss": 0.1052, "step": 2286 }, { "epoch": 1.63, "grad_norm": 35.04462192513845, "learning_rate": 6.698175652472774e-06, "loss": 0.0858, "step": 2287 }, { "epoch": 1.63, "grad_norm": 18.72923605107616, "learning_rate": 6.695457017677263e-06, "loss": 0.1011, "step": 2288 }, { "epoch": 1.63, "grad_norm": 14.768416928691146, "learning_rate": 6.692737816398048e-06, "loss": 0.1317, "step": 2289 }, { "epoch": 1.63, "grad_norm": 21.649718208972313, "learning_rate": 6.6900180495436664e-06, "loss": 0.1016, "step": 2290 }, { "epoch": 1.64, "grad_norm": 16.92349184199488, "learning_rate": 6.68729771802284e-06, "loss": 0.1027, "step": 2291 }, { "epoch": 1.64, "grad_norm": 18.73784013126941, "learning_rate": 6.6845768227444855e-06, "loss": 0.0793, "step": 2292 }, { "epoch": 1.64, "grad_norm": 10.89593946985034, "learning_rate": 6.681855364617702e-06, "loss": 0.0908, "step": 2293 }, { "epoch": 1.64, "grad_norm": 12.205255707572324, "learning_rate": 6.67913334455178e-06, "loss": 0.0995, "step": 2294 }, { "epoch": 1.64, "grad_norm": 16.757751110332908, "learning_rate": 6.676410763456197e-06, "loss": 0.1001, "step": 2295 }, { "epoch": 1.64, "grad_norm": 7.269571127624379, "learning_rate": 6.673687622240619e-06, "loss": 0.088, "step": 2296 }, { "epoch": 1.64, "grad_norm": 9.609560802494752, "learning_rate": 6.670963921814896e-06, "loss": 0.1106, "step": 2297 }, { "epoch": 1.64, "grad_norm": 8.246135299481141, "learning_rate": 6.668239663089069e-06, "loss": 0.1433, "step": 2298 }, { "epoch": 1.64, "grad_norm": 18.24562720984829, "learning_rate": 6.665514846973361e-06, "loss": 0.0901, "step": 2299 }, { "epoch": 1.64, "grad_norm": 13.37161564678813, "learning_rate": 6.662789474378186e-06, "loss": 0.1213, "step": 2300 }, { "epoch": 1.64, "grad_norm": 11.844017421917352, "learning_rate": 6.6600635462141415e-06, "loss": 0.0905, "step": 2301 }, { "epoch": 1.64, "grad_norm": 13.504340961380517, "learning_rate": 6.657337063392011e-06, "loss": 0.1224, "step": 2302 }, { "epoch": 1.64, "grad_norm": 5.623690274333707, "learning_rate": 6.654610026822761e-06, "loss": 0.0778, "step": 2303 }, { "epoch": 1.64, "grad_norm": 10.443915001651478, "learning_rate": 6.651882437417546e-06, "loss": 0.1146, "step": 2304 }, { "epoch": 1.65, "grad_norm": 8.261849012510766, "learning_rate": 6.649154296087705e-06, "loss": 0.1248, "step": 2305 }, { "epoch": 1.65, "grad_norm": 9.712674765149288, "learning_rate": 6.646425603744759e-06, "loss": 0.1034, "step": 2306 }, { "epoch": 1.65, "grad_norm": 8.076830070701273, "learning_rate": 6.643696361300418e-06, "loss": 0.1305, "step": 2307 }, { "epoch": 1.65, "grad_norm": 16.23453792461202, "learning_rate": 6.6409665696665715e-06, "loss": 0.0792, "step": 2308 }, { "epoch": 1.65, "grad_norm": 6.928315712867063, "learning_rate": 6.638236229755292e-06, "loss": 0.1429, "step": 2309 }, { "epoch": 1.65, "grad_norm": 5.653617940327973, "learning_rate": 6.635505342478838e-06, "loss": 0.0847, "step": 2310 }, { "epoch": 1.65, "grad_norm": 9.207528691085546, "learning_rate": 6.632773908749649e-06, "loss": 0.0923, "step": 2311 }, { "epoch": 1.65, "grad_norm": 26.119937550135738, "learning_rate": 6.630041929480349e-06, "loss": 0.1027, "step": 2312 }, { "epoch": 1.65, "grad_norm": 24.180449478429487, "learning_rate": 6.627309405583741e-06, "loss": 0.1044, "step": 2313 }, { "epoch": 1.65, "grad_norm": 20.448643687176528, "learning_rate": 6.624576337972815e-06, "loss": 0.0922, "step": 2314 }, { "epoch": 1.65, "grad_norm": 11.591680766177754, "learning_rate": 6.621842727560737e-06, "loss": 0.0912, "step": 2315 }, { "epoch": 1.65, "grad_norm": 4.828986274241329, "learning_rate": 6.6191085752608575e-06, "loss": 0.0772, "step": 2316 }, { "epoch": 1.65, "grad_norm": 9.954083488247607, "learning_rate": 6.616373881986708e-06, "loss": 0.1047, "step": 2317 }, { "epoch": 1.65, "grad_norm": 5.6263248883934125, "learning_rate": 6.613638648652002e-06, "loss": 0.0961, "step": 2318 }, { "epoch": 1.66, "grad_norm": 5.087116155084636, "learning_rate": 6.610902876170631e-06, "loss": 0.0953, "step": 2319 }, { "epoch": 1.66, "grad_norm": 18.559528074367204, "learning_rate": 6.608166565456666e-06, "loss": 0.1322, "step": 2320 }, { "epoch": 1.66, "grad_norm": 36.09006097390221, "learning_rate": 6.605429717424359e-06, "loss": 0.0972, "step": 2321 }, { "epoch": 1.66, "grad_norm": 5.741647863961911, "learning_rate": 6.602692332988143e-06, "loss": 0.0908, "step": 2322 }, { "epoch": 1.66, "grad_norm": 30.835808916611533, "learning_rate": 6.5999544130626305e-06, "loss": 0.0942, "step": 2323 }, { "epoch": 1.66, "grad_norm": 14.628025775072933, "learning_rate": 6.597215958562608e-06, "loss": 0.1154, "step": 2324 }, { "epoch": 1.66, "grad_norm": 19.30215107454651, "learning_rate": 6.5944769704030465e-06, "loss": 0.0925, "step": 2325 }, { "epoch": 1.66, "grad_norm": 16.28763762345787, "learning_rate": 6.591737449499092e-06, "loss": 0.1129, "step": 2326 }, { "epoch": 1.66, "grad_norm": 8.830045219544676, "learning_rate": 6.58899739676607e-06, "loss": 0.0909, "step": 2327 }, { "epoch": 1.66, "grad_norm": 8.33628287139074, "learning_rate": 6.586256813119482e-06, "loss": 0.0975, "step": 2328 }, { "epoch": 1.66, "grad_norm": 16.449451637998475, "learning_rate": 6.583515699475009e-06, "loss": 0.129, "step": 2329 }, { "epoch": 1.66, "grad_norm": 6.413192128563511, "learning_rate": 6.580774056748508e-06, "loss": 0.0936, "step": 2330 }, { "epoch": 1.66, "grad_norm": 24.55379335050217, "learning_rate": 6.578031885856011e-06, "loss": 0.0896, "step": 2331 }, { "epoch": 1.66, "grad_norm": 16.570585530302036, "learning_rate": 6.575289187713731e-06, "loss": 0.0884, "step": 2332 }, { "epoch": 1.67, "grad_norm": 16.171822796583307, "learning_rate": 6.572545963238053e-06, "loss": 0.0837, "step": 2333 }, { "epoch": 1.67, "grad_norm": 7.644151539511672, "learning_rate": 6.569802213345537e-06, "loss": 0.1268, "step": 2334 }, { "epoch": 1.67, "grad_norm": 14.08472143552305, "learning_rate": 6.5670579389529255e-06, "loss": 0.0915, "step": 2335 }, { "epoch": 1.67, "grad_norm": 10.674980055295068, "learning_rate": 6.56431314097713e-06, "loss": 0.0601, "step": 2336 }, { "epoch": 1.67, "grad_norm": 20.618948771998678, "learning_rate": 6.561567820335236e-06, "loss": 0.0776, "step": 2337 }, { "epoch": 1.67, "grad_norm": 7.837958868858976, "learning_rate": 6.558821977944508e-06, "loss": 0.0777, "step": 2338 }, { "epoch": 1.67, "grad_norm": 10.081314355572308, "learning_rate": 6.556075614722383e-06, "loss": 0.097, "step": 2339 }, { "epoch": 1.67, "grad_norm": 12.979423171549898, "learning_rate": 6.553328731586473e-06, "loss": 0.1097, "step": 2340 }, { "epoch": 1.67, "grad_norm": 31.475378687444337, "learning_rate": 6.550581329454561e-06, "loss": 0.1274, "step": 2341 }, { "epoch": 1.67, "grad_norm": 19.29402235295992, "learning_rate": 6.547833409244606e-06, "loss": 0.0825, "step": 2342 }, { "epoch": 1.67, "grad_norm": 17.39491613562142, "learning_rate": 6.545084971874738e-06, "loss": 0.1018, "step": 2343 }, { "epoch": 1.67, "grad_norm": 6.751666154792371, "learning_rate": 6.542336018263262e-06, "loss": 0.0856, "step": 2344 }, { "epoch": 1.67, "grad_norm": 28.598101139931043, "learning_rate": 6.539586549328656e-06, "loss": 0.1866, "step": 2345 }, { "epoch": 1.67, "grad_norm": 9.960017160676509, "learning_rate": 6.536836565989565e-06, "loss": 0.0786, "step": 2346 }, { "epoch": 1.68, "grad_norm": 12.28562563434325, "learning_rate": 6.534086069164813e-06, "loss": 0.1039, "step": 2347 }, { "epoch": 1.68, "grad_norm": 11.822129310031302, "learning_rate": 6.531335059773392e-06, "loss": 0.0911, "step": 2348 }, { "epoch": 1.68, "grad_norm": 33.14817099069893, "learning_rate": 6.528583538734463e-06, "loss": 0.115, "step": 2349 }, { "epoch": 1.68, "grad_norm": 9.926580522705304, "learning_rate": 6.525831506967361e-06, "loss": 0.1115, "step": 2350 }, { "epoch": 1.68, "grad_norm": 25.639367084236095, "learning_rate": 6.523078965391592e-06, "loss": 0.1034, "step": 2351 }, { "epoch": 1.68, "grad_norm": 6.074092820606506, "learning_rate": 6.520325914926831e-06, "loss": 0.0667, "step": 2352 }, { "epoch": 1.68, "grad_norm": 16.162347392614848, "learning_rate": 6.517572356492922e-06, "loss": 0.0854, "step": 2353 }, { "epoch": 1.68, "grad_norm": 4.991411546020857, "learning_rate": 6.514818291009881e-06, "loss": 0.1069, "step": 2354 }, { "epoch": 1.68, "grad_norm": 15.6323688780031, "learning_rate": 6.512063719397894e-06, "loss": 0.0802, "step": 2355 }, { "epoch": 1.68, "grad_norm": 8.816757264191924, "learning_rate": 6.5093086425773126e-06, "loss": 0.1061, "step": 2356 }, { "epoch": 1.68, "grad_norm": 11.042693972596268, "learning_rate": 6.506553061468659e-06, "loss": 0.0801, "step": 2357 }, { "epoch": 1.68, "grad_norm": 7.60615774529474, "learning_rate": 6.5037969769926256e-06, "loss": 0.1073, "step": 2358 }, { "epoch": 1.68, "grad_norm": 7.938469871759891, "learning_rate": 6.501040390070071e-06, "loss": 0.1047, "step": 2359 }, { "epoch": 1.68, "grad_norm": 8.407806278240889, "learning_rate": 6.498283301622022e-06, "loss": 0.0829, "step": 2360 }, { "epoch": 1.69, "grad_norm": 21.855833457945497, "learning_rate": 6.495525712569673e-06, "loss": 0.0896, "step": 2361 }, { "epoch": 1.69, "grad_norm": 24.55266138680911, "learning_rate": 6.492767623834385e-06, "loss": 0.0958, "step": 2362 }, { "epoch": 1.69, "grad_norm": 7.844762029942168, "learning_rate": 6.490009036337687e-06, "loss": 0.1097, "step": 2363 }, { "epoch": 1.69, "grad_norm": 14.157061696422106, "learning_rate": 6.487249951001276e-06, "loss": 0.0968, "step": 2364 }, { "epoch": 1.69, "grad_norm": 23.749244203785135, "learning_rate": 6.484490368747012e-06, "loss": 0.1128, "step": 2365 }, { "epoch": 1.69, "grad_norm": 6.384281292680775, "learning_rate": 6.4817302904969226e-06, "loss": 0.1133, "step": 2366 }, { "epoch": 1.69, "grad_norm": 12.27197718764886, "learning_rate": 6.4789697171732024e-06, "loss": 0.1234, "step": 2367 }, { "epoch": 1.69, "grad_norm": 6.1705409111581115, "learning_rate": 6.476208649698209e-06, "loss": 0.1206, "step": 2368 }, { "epoch": 1.69, "grad_norm": 9.676461914963904, "learning_rate": 6.473447088994467e-06, "loss": 0.0778, "step": 2369 }, { "epoch": 1.69, "grad_norm": 26.76018722096519, "learning_rate": 6.470685035984667e-06, "loss": 0.1274, "step": 2370 }, { "epoch": 1.69, "grad_norm": 12.24487360044837, "learning_rate": 6.467922491591658e-06, "loss": 0.0955, "step": 2371 }, { "epoch": 1.69, "grad_norm": 27.638340021369807, "learning_rate": 6.465159456738461e-06, "loss": 0.0926, "step": 2372 }, { "epoch": 1.69, "grad_norm": 4.785845976111559, "learning_rate": 6.462395932348257e-06, "loss": 0.0978, "step": 2373 }, { "epoch": 1.69, "grad_norm": 8.312844920909349, "learning_rate": 6.459631919344389e-06, "loss": 0.067, "step": 2374 }, { "epoch": 1.7, "grad_norm": 9.685594864488179, "learning_rate": 6.456867418650366e-06, "loss": 0.0913, "step": 2375 }, { "epoch": 1.7, "grad_norm": 9.9810523595601, "learning_rate": 6.454102431189859e-06, "loss": 0.1007, "step": 2376 }, { "epoch": 1.7, "grad_norm": 6.265965225581489, "learning_rate": 6.4513369578867026e-06, "loss": 0.104, "step": 2377 }, { "epoch": 1.7, "grad_norm": 16.893745136728803, "learning_rate": 6.448570999664894e-06, "loss": 0.1005, "step": 2378 }, { "epoch": 1.7, "grad_norm": 28.908957709905614, "learning_rate": 6.4458045574485875e-06, "loss": 0.1255, "step": 2379 }, { "epoch": 1.7, "grad_norm": 24.22139522152318, "learning_rate": 6.443037632162104e-06, "loss": 0.0996, "step": 2380 }, { "epoch": 1.7, "grad_norm": 26.572515462610134, "learning_rate": 6.440270224729927e-06, "loss": 0.0901, "step": 2381 }, { "epoch": 1.7, "grad_norm": 12.207827481160598, "learning_rate": 6.437502336076695e-06, "loss": 0.1221, "step": 2382 }, { "epoch": 1.7, "grad_norm": 19.41999789803127, "learning_rate": 6.4347339671272155e-06, "loss": 0.0783, "step": 2383 }, { "epoch": 1.7, "grad_norm": 16.873084876197144, "learning_rate": 6.431965118806449e-06, "loss": 0.094, "step": 2384 }, { "epoch": 1.7, "grad_norm": 16.372299469008183, "learning_rate": 6.42919579203952e-06, "loss": 0.1016, "step": 2385 }, { "epoch": 1.7, "grad_norm": 27.497086153318044, "learning_rate": 6.4264259877517124e-06, "loss": 0.1262, "step": 2386 }, { "epoch": 1.7, "grad_norm": 24.053964065962585, "learning_rate": 6.423655706868468e-06, "loss": 0.0745, "step": 2387 }, { "epoch": 1.7, "grad_norm": 22.25200523393893, "learning_rate": 6.4208849503153915e-06, "loss": 0.0837, "step": 2388 }, { "epoch": 1.71, "grad_norm": 11.764255564259074, "learning_rate": 6.418113719018242e-06, "loss": 0.1034, "step": 2389 }, { "epoch": 1.71, "grad_norm": 30.75721694150146, "learning_rate": 6.415342013902939e-06, "loss": 0.1151, "step": 2390 }, { "epoch": 1.71, "grad_norm": 5.427156665392917, "learning_rate": 6.412569835895562e-06, "loss": 0.0956, "step": 2391 }, { "epoch": 1.71, "grad_norm": 18.046997378438945, "learning_rate": 6.409797185922349e-06, "loss": 0.1169, "step": 2392 }, { "epoch": 1.71, "grad_norm": 7.538487314232114, "learning_rate": 6.40702406490969e-06, "loss": 0.1074, "step": 2393 }, { "epoch": 1.71, "grad_norm": 14.150795863197285, "learning_rate": 6.404250473784138e-06, "loss": 0.0994, "step": 2394 }, { "epoch": 1.71, "grad_norm": 24.75329036726323, "learning_rate": 6.401476413472404e-06, "loss": 0.1069, "step": 2395 }, { "epoch": 1.71, "grad_norm": 12.43088365316308, "learning_rate": 6.398701884901348e-06, "loss": 0.1107, "step": 2396 }, { "epoch": 1.71, "grad_norm": 8.74404078917541, "learning_rate": 6.3959268889979956e-06, "loss": 0.0984, "step": 2397 }, { "epoch": 1.71, "grad_norm": 9.577324694920657, "learning_rate": 6.393151426689522e-06, "loss": 0.1191, "step": 2398 }, { "epoch": 1.71, "grad_norm": 13.63480133602996, "learning_rate": 6.390375498903263e-06, "loss": 0.0992, "step": 2399 }, { "epoch": 1.71, "grad_norm": 14.152036274866392, "learning_rate": 6.387599106566705e-06, "loss": 0.0937, "step": 2400 }, { "epoch": 1.71, "grad_norm": 6.121175600708925, "learning_rate": 6.384822250607495e-06, "loss": 0.1039, "step": 2401 }, { "epoch": 1.71, "grad_norm": 6.47698057810036, "learning_rate": 6.382044931953431e-06, "loss": 0.0828, "step": 2402 }, { "epoch": 1.72, "grad_norm": 10.08821825459922, "learning_rate": 6.379267151532467e-06, "loss": 0.135, "step": 2403 }, { "epoch": 1.72, "grad_norm": 3.9574752880861226, "learning_rate": 6.376488910272709e-06, "loss": 0.0742, "step": 2404 }, { "epoch": 1.72, "grad_norm": 5.2069319299192065, "learning_rate": 6.373710209102423e-06, "loss": 0.1099, "step": 2405 }, { "epoch": 1.72, "grad_norm": 26.51909679991199, "learning_rate": 6.370931048950022e-06, "loss": 0.0972, "step": 2406 }, { "epoch": 1.72, "grad_norm": 18.736652018749133, "learning_rate": 6.368151430744075e-06, "loss": 0.1042, "step": 2407 }, { "epoch": 1.72, "grad_norm": 15.498335426749598, "learning_rate": 6.365371355413306e-06, "loss": 0.1053, "step": 2408 }, { "epoch": 1.72, "grad_norm": 35.0436335302218, "learning_rate": 6.362590823886588e-06, "loss": 0.094, "step": 2409 }, { "epoch": 1.72, "grad_norm": 35.01179148908803, "learning_rate": 6.359809837092947e-06, "loss": 0.1014, "step": 2410 }, { "epoch": 1.72, "grad_norm": 8.194118078049293, "learning_rate": 6.357028395961566e-06, "loss": 0.0864, "step": 2411 }, { "epoch": 1.72, "grad_norm": 6.697601824559924, "learning_rate": 6.354246501421777e-06, "loss": 0.12, "step": 2412 }, { "epoch": 1.72, "grad_norm": 15.721750841959683, "learning_rate": 6.3514641544030575e-06, "loss": 0.1021, "step": 2413 }, { "epoch": 1.72, "grad_norm": 23.028362842896957, "learning_rate": 6.348681355835043e-06, "loss": 0.106, "step": 2414 }, { "epoch": 1.72, "grad_norm": 23.07284177900744, "learning_rate": 6.345898106647521e-06, "loss": 0.1036, "step": 2415 }, { "epoch": 1.72, "grad_norm": 14.759239504580846, "learning_rate": 6.3431144077704245e-06, "loss": 0.1005, "step": 2416 }, { "epoch": 1.73, "grad_norm": 7.849400235997493, "learning_rate": 6.340330260133839e-06, "loss": 0.1296, "step": 2417 }, { "epoch": 1.73, "grad_norm": 13.264559348008135, "learning_rate": 6.337545664668001e-06, "loss": 0.1018, "step": 2418 }, { "epoch": 1.73, "grad_norm": 54.345532921290065, "learning_rate": 6.334760622303294e-06, "loss": 0.1444, "step": 2419 }, { "epoch": 1.73, "grad_norm": 11.648785289643577, "learning_rate": 6.331975133970255e-06, "loss": 0.0713, "step": 2420 }, { "epoch": 1.73, "grad_norm": 20.282763003405808, "learning_rate": 6.329189200599566e-06, "loss": 0.0861, "step": 2421 }, { "epoch": 1.73, "grad_norm": 19.267431318750052, "learning_rate": 6.326402823122059e-06, "loss": 0.061, "step": 2422 }, { "epoch": 1.73, "grad_norm": 16.487214621707068, "learning_rate": 6.3236160024687134e-06, "loss": 0.0946, "step": 2423 }, { "epoch": 1.73, "grad_norm": 8.987316243770564, "learning_rate": 6.3208287395706595e-06, "loss": 0.1047, "step": 2424 }, { "epoch": 1.73, "grad_norm": 23.44498703021866, "learning_rate": 6.3180410353591735e-06, "loss": 0.1006, "step": 2425 }, { "epoch": 1.73, "grad_norm": 11.465144785371189, "learning_rate": 6.315252890765678e-06, "loss": 0.0963, "step": 2426 }, { "epoch": 1.73, "grad_norm": 5.166961299823076, "learning_rate": 6.312464306721745e-06, "loss": 0.0905, "step": 2427 }, { "epoch": 1.73, "grad_norm": 9.634991916602496, "learning_rate": 6.309675284159093e-06, "loss": 0.0927, "step": 2428 }, { "epoch": 1.73, "grad_norm": 11.247409453302282, "learning_rate": 6.306885824009585e-06, "loss": 0.0801, "step": 2429 }, { "epoch": 1.73, "grad_norm": 6.056553339693292, "learning_rate": 6.3040959272052315e-06, "loss": 0.0787, "step": 2430 }, { "epoch": 1.74, "grad_norm": 13.238200555357215, "learning_rate": 6.301305594678189e-06, "loss": 0.0916, "step": 2431 }, { "epoch": 1.74, "grad_norm": 5.245856258170428, "learning_rate": 6.2985148273607586e-06, "loss": 0.0818, "step": 2432 }, { "epoch": 1.74, "grad_norm": 16.58705389948258, "learning_rate": 6.29572362618539e-06, "loss": 0.0864, "step": 2433 }, { "epoch": 1.74, "grad_norm": 24.4358198678026, "learning_rate": 6.292931992084672e-06, "loss": 0.1365, "step": 2434 }, { "epoch": 1.74, "grad_norm": 14.195560279980855, "learning_rate": 6.290139925991345e-06, "loss": 0.1036, "step": 2435 }, { "epoch": 1.74, "grad_norm": 14.566630680835873, "learning_rate": 6.287347428838289e-06, "loss": 0.067, "step": 2436 }, { "epoch": 1.74, "grad_norm": 10.846086245631618, "learning_rate": 6.2845545015585275e-06, "loss": 0.126, "step": 2437 }, { "epoch": 1.74, "grad_norm": 6.710131680304093, "learning_rate": 6.281761145085232e-06, "loss": 0.0868, "step": 2438 }, { "epoch": 1.74, "grad_norm": 9.77685196724837, "learning_rate": 6.278967360351712e-06, "loss": 0.0619, "step": 2439 }, { "epoch": 1.74, "grad_norm": 22.26451062145235, "learning_rate": 6.276173148291425e-06, "loss": 0.1013, "step": 2440 }, { "epoch": 1.74, "grad_norm": 4.273890399178918, "learning_rate": 6.273378509837969e-06, "loss": 0.0831, "step": 2441 }, { "epoch": 1.74, "grad_norm": 22.997496293271507, "learning_rate": 6.2705834459250825e-06, "loss": 0.1004, "step": 2442 }, { "epoch": 1.74, "grad_norm": 11.57569324607047, "learning_rate": 6.2677879574866515e-06, "loss": 0.0858, "step": 2443 }, { "epoch": 1.74, "grad_norm": 33.78323186801771, "learning_rate": 6.264992045456699e-06, "loss": 0.111, "step": 2444 }, { "epoch": 1.75, "grad_norm": 10.134971347540446, "learning_rate": 6.262195710769391e-06, "loss": 0.089, "step": 2445 }, { "epoch": 1.75, "grad_norm": 26.861775030048683, "learning_rate": 6.259398954359037e-06, "loss": 0.1116, "step": 2446 }, { "epoch": 1.75, "grad_norm": 7.188038510887187, "learning_rate": 6.256601777160082e-06, "loss": 0.1488, "step": 2447 }, { "epoch": 1.75, "grad_norm": 23.487091229583946, "learning_rate": 6.253804180107116e-06, "loss": 0.1021, "step": 2448 }, { "epoch": 1.75, "grad_norm": 12.589395392272444, "learning_rate": 6.2510061641348695e-06, "loss": 0.1018, "step": 2449 }, { "epoch": 1.75, "grad_norm": 13.31139980832625, "learning_rate": 6.248207730178211e-06, "loss": 0.0819, "step": 2450 }, { "epoch": 1.75, "grad_norm": 12.211913937617608, "learning_rate": 6.245408879172148e-06, "loss": 0.1106, "step": 2451 }, { "epoch": 1.75, "grad_norm": 5.641616344245171, "learning_rate": 6.24260961205183e-06, "loss": 0.0833, "step": 2452 }, { "epoch": 1.75, "grad_norm": 21.256785415127464, "learning_rate": 6.239809929752544e-06, "loss": 0.0796, "step": 2453 }, { "epoch": 1.75, "grad_norm": 10.627318442838185, "learning_rate": 6.237009833209715e-06, "loss": 0.1066, "step": 2454 }, { "epoch": 1.75, "grad_norm": 12.179171841920033, "learning_rate": 6.2342093233589095e-06, "loss": 0.1462, "step": 2455 }, { "epoch": 1.75, "grad_norm": 14.930949343627839, "learning_rate": 6.231408401135828e-06, "loss": 0.1081, "step": 2456 }, { "epoch": 1.75, "grad_norm": 27.796422103824614, "learning_rate": 6.228607067476311e-06, "loss": 0.1116, "step": 2457 }, { "epoch": 1.75, "grad_norm": 19.282120894394577, "learning_rate": 6.225805323316336e-06, "loss": 0.1353, "step": 2458 }, { "epoch": 1.76, "grad_norm": 24.664287072388348, "learning_rate": 6.223003169592018e-06, "loss": 0.1069, "step": 2459 }, { "epoch": 1.76, "grad_norm": 11.014341949865857, "learning_rate": 6.220200607239609e-06, "loss": 0.1069, "step": 2460 }, { "epoch": 1.76, "grad_norm": 9.682585859806537, "learning_rate": 6.217397637195497e-06, "loss": 0.1176, "step": 2461 }, { "epoch": 1.76, "grad_norm": 5.254486314903181, "learning_rate": 6.214594260396206e-06, "loss": 0.078, "step": 2462 }, { "epoch": 1.76, "grad_norm": 6.715958544218641, "learning_rate": 6.211790477778399e-06, "loss": 0.0914, "step": 2463 }, { "epoch": 1.76, "grad_norm": 9.16222865335177, "learning_rate": 6.208986290278866e-06, "loss": 0.0664, "step": 2464 }, { "epoch": 1.76, "grad_norm": 17.220240072763353, "learning_rate": 6.206181698834544e-06, "loss": 0.1018, "step": 2465 }, { "epoch": 1.76, "grad_norm": 16.278402617080406, "learning_rate": 6.2033767043824955e-06, "loss": 0.0988, "step": 2466 }, { "epoch": 1.76, "grad_norm": 16.789027698060593, "learning_rate": 6.200571307859923e-06, "loss": 0.1199, "step": 2467 }, { "epoch": 1.76, "grad_norm": 16.858190409234904, "learning_rate": 6.197765510204161e-06, "loss": 0.1025, "step": 2468 }, { "epoch": 1.76, "grad_norm": 20.311780934271166, "learning_rate": 6.19495931235268e-06, "loss": 0.0986, "step": 2469 }, { "epoch": 1.76, "grad_norm": 8.24075864852246, "learning_rate": 6.19215271524308e-06, "loss": 0.0897, "step": 2470 }, { "epoch": 1.76, "grad_norm": 7.537619256321832, "learning_rate": 6.189345719813099e-06, "loss": 0.1053, "step": 2471 }, { "epoch": 1.76, "grad_norm": 19.985538588188874, "learning_rate": 6.186538327000609e-06, "loss": 0.0933, "step": 2472 }, { "epoch": 1.77, "grad_norm": 10.951691550287652, "learning_rate": 6.183730537743607e-06, "loss": 0.0876, "step": 2473 }, { "epoch": 1.77, "grad_norm": 14.796383785272676, "learning_rate": 6.18092235298023e-06, "loss": 0.1133, "step": 2474 }, { "epoch": 1.77, "grad_norm": 8.222463592646822, "learning_rate": 6.178113773648745e-06, "loss": 0.0845, "step": 2475 }, { "epoch": 1.77, "grad_norm": 9.583268825766998, "learning_rate": 6.175304800687551e-06, "loss": 0.0778, "step": 2476 }, { "epoch": 1.77, "grad_norm": 6.546628402576362, "learning_rate": 6.172495435035176e-06, "loss": 0.1199, "step": 2477 }, { "epoch": 1.77, "grad_norm": 6.515726741394918, "learning_rate": 6.169685677630284e-06, "loss": 0.0916, "step": 2478 }, { "epoch": 1.77, "grad_norm": 6.737944386905074, "learning_rate": 6.1668755294116655e-06, "loss": 0.0611, "step": 2479 }, { "epoch": 1.77, "grad_norm": 9.071570166337342, "learning_rate": 6.1640649913182436e-06, "loss": 0.0988, "step": 2480 }, { "epoch": 1.77, "grad_norm": 5.529604937790912, "learning_rate": 6.161254064289072e-06, "loss": 0.084, "step": 2481 }, { "epoch": 1.77, "grad_norm": 13.061035895196843, "learning_rate": 6.158442749263332e-06, "loss": 0.0971, "step": 2482 }, { "epoch": 1.77, "grad_norm": 17.682527665030868, "learning_rate": 6.155631047180337e-06, "loss": 0.0814, "step": 2483 }, { "epoch": 1.77, "grad_norm": 14.400108863552052, "learning_rate": 6.152818958979529e-06, "loss": 0.0972, "step": 2484 }, { "epoch": 1.77, "grad_norm": 4.875457146337422, "learning_rate": 6.1500064856004796e-06, "loss": 0.0699, "step": 2485 }, { "epoch": 1.77, "grad_norm": 15.446178764057269, "learning_rate": 6.147193627982887e-06, "loss": 0.099, "step": 2486 }, { "epoch": 1.78, "grad_norm": 7.245784964129334, "learning_rate": 6.144380387066581e-06, "loss": 0.1026, "step": 2487 }, { "epoch": 1.78, "grad_norm": 24.36888800556823, "learning_rate": 6.141566763791518e-06, "loss": 0.1219, "step": 2488 }, { "epoch": 1.78, "grad_norm": 11.458177901064403, "learning_rate": 6.138752759097778e-06, "loss": 0.0728, "step": 2489 }, { "epoch": 1.78, "grad_norm": 6.774247738507147, "learning_rate": 6.135938373925576e-06, "loss": 0.0879, "step": 2490 }, { "epoch": 1.78, "grad_norm": 5.476440380457411, "learning_rate": 6.133123609215249e-06, "loss": 0.0761, "step": 2491 }, { "epoch": 1.78, "grad_norm": 5.095496325152822, "learning_rate": 6.130308465907263e-06, "loss": 0.0991, "step": 2492 }, { "epoch": 1.78, "grad_norm": 6.029873546120703, "learning_rate": 6.127492944942209e-06, "loss": 0.0861, "step": 2493 }, { "epoch": 1.78, "grad_norm": 19.09411821013496, "learning_rate": 6.124677047260805e-06, "loss": 0.0801, "step": 2494 }, { "epoch": 1.78, "grad_norm": 18.044110197346168, "learning_rate": 6.121860773803895e-06, "loss": 0.099, "step": 2495 }, { "epoch": 1.78, "grad_norm": 9.097311899728787, "learning_rate": 6.119044125512447e-06, "loss": 0.082, "step": 2496 }, { "epoch": 1.78, "grad_norm": 10.196462877637837, "learning_rate": 6.116227103327559e-06, "loss": 0.1102, "step": 2497 }, { "epoch": 1.78, "grad_norm": 17.411823414220972, "learning_rate": 6.113409708190447e-06, "loss": 0.1019, "step": 2498 }, { "epoch": 1.78, "grad_norm": 18.155557304557057, "learning_rate": 6.1105919410424566e-06, "loss": 0.1013, "step": 2499 }, { "epoch": 1.78, "grad_norm": 16.792200022527275, "learning_rate": 6.107773802825055e-06, "loss": 0.1157, "step": 2500 }, { "epoch": 1.78, "eval_avg_AUC": 0.8162853827527864, "eval_avg_Accuracy": 0.7180039787798409, "eval_avg_Accuracy-right": 0.8944176340159123, "eval_avg_Accuracy-wrong": 0.4103934500795997, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.6879813648500559, "eval_last_AUC": 0.8267063817923214, "eval_last_Accuracy": 0.7504559018567639, "eval_last_Accuracy-right": 0.8129646537107083, "eval_last_Accuracy-wrong": 0.6414600864225608, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.6987671721798714, "eval_max_AUC": 0.7501564696977459, "eval_max_Accuracy": 0.6474635278514589, "eval_max_Accuracy-right": 0.9788704838920047, "eval_max_Accuracy-wrong": 0.0695929042528997, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.6173876633213748, "eval_min_AUC": 0.824010385799939, "eval_min_Accuracy": 0.7522795092838196, "eval_min_Accuracy-right": 0.7631407330116082, "eval_min_Accuracy-wrong": 0.7333409142597226, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.6985406564564841, "eval_prod_AUC": 0.8230216036527745, "eval_prod_Accuracy": 0.712367374005305, "eval_prod_Accuracy-right": 0.617320986044085, "eval_prod_Accuracy-wrong": 0.8780987036615875, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.6893611435134097, "eval_runtime": 247.0285, "eval_samples_per_second": 97.673, "eval_steps_per_second": 3.052, "eval_sum_AUC": 0.6795322314303073, "eval_sum_Accuracy": 0.6392572944297082, "eval_sum_Accuracy-right": 0.9985000652145559, "eval_sum_Accuracy-wrong": 0.01284967022970207, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.6636898117156945, "step": 2500 }, { "epoch": 1.79, "grad_norm": 27.66785481931168, "learning_rate": 6.1049552944798355e-06, "loss": 0.1031, "step": 2501 }, { "epoch": 1.79, "grad_norm": 13.877238096501856, "learning_rate": 6.102136416948513e-06, "loss": 0.0922, "step": 2502 }, { "epoch": 1.79, "grad_norm": 14.832991307429838, "learning_rate": 6.099317171172929e-06, "loss": 0.1277, "step": 2503 }, { "epoch": 1.79, "grad_norm": 13.151505015978923, "learning_rate": 6.0964975580950445e-06, "loss": 0.088, "step": 2504 }, { "epoch": 1.79, "grad_norm": 14.194545354333274, "learning_rate": 6.093677578656946e-06, "loss": 0.1083, "step": 2505 }, { "epoch": 1.79, "grad_norm": 8.548368155403958, "learning_rate": 6.090857233800839e-06, "loss": 0.0778, "step": 2506 }, { "epoch": 1.79, "grad_norm": 29.162206973113896, "learning_rate": 6.0880365244690546e-06, "loss": 0.1195, "step": 2507 }, { "epoch": 1.79, "grad_norm": 8.958049048309908, "learning_rate": 6.085215451604044e-06, "loss": 0.0724, "step": 2508 }, { "epoch": 1.79, "grad_norm": 29.68574871023764, "learning_rate": 6.082394016148379e-06, "loss": 0.0842, "step": 2509 }, { "epoch": 1.79, "grad_norm": 34.56039583857383, "learning_rate": 6.079572219044755e-06, "loss": 0.1259, "step": 2510 }, { "epoch": 1.79, "grad_norm": 12.146084312643838, "learning_rate": 6.076750061235985e-06, "loss": 0.0802, "step": 2511 }, { "epoch": 1.79, "grad_norm": 17.542205198619733, "learning_rate": 6.073927543665008e-06, "loss": 0.1012, "step": 2512 }, { "epoch": 1.79, "grad_norm": 28.234603954607753, "learning_rate": 6.071104667274875e-06, "loss": 0.1429, "step": 2513 }, { "epoch": 1.79, "grad_norm": 22.631420233858943, "learning_rate": 6.068281433008765e-06, "loss": 0.1251, "step": 2514 }, { "epoch": 1.8, "grad_norm": 8.105380110647712, "learning_rate": 6.0654578418099715e-06, "loss": 0.0927, "step": 2515 }, { "epoch": 1.8, "grad_norm": 13.931020901993193, "learning_rate": 6.062633894621909e-06, "loss": 0.0729, "step": 2516 }, { "epoch": 1.8, "grad_norm": 26.71305807576688, "learning_rate": 6.0598095923881105e-06, "loss": 0.1173, "step": 2517 }, { "epoch": 1.8, "grad_norm": 23.253308243155512, "learning_rate": 6.056984936052229e-06, "loss": 0.092, "step": 2518 }, { "epoch": 1.8, "grad_norm": 11.399153561239252, "learning_rate": 6.054159926558033e-06, "loss": 0.1056, "step": 2519 }, { "epoch": 1.8, "grad_norm": 32.15751111995552, "learning_rate": 6.051334564849413e-06, "loss": 0.1127, "step": 2520 }, { "epoch": 1.8, "grad_norm": 7.043809237704044, "learning_rate": 6.048508851870372e-06, "loss": 0.0915, "step": 2521 }, { "epoch": 1.8, "grad_norm": 17.66369609169051, "learning_rate": 6.045682788565036e-06, "loss": 0.0673, "step": 2522 }, { "epoch": 1.8, "grad_norm": 11.479436418010804, "learning_rate": 6.042856375877644e-06, "loss": 0.1146, "step": 2523 }, { "epoch": 1.8, "grad_norm": 12.596499035596006, "learning_rate": 6.040029614752551e-06, "loss": 0.1011, "step": 2524 }, { "epoch": 1.8, "grad_norm": 27.997863595749756, "learning_rate": 6.037202506134234e-06, "loss": 0.0824, "step": 2525 }, { "epoch": 1.8, "grad_norm": 21.9014456038331, "learning_rate": 6.03437505096728e-06, "loss": 0.0857, "step": 2526 }, { "epoch": 1.8, "grad_norm": 11.901316851446808, "learning_rate": 6.0315472501963955e-06, "loss": 0.1244, "step": 2527 }, { "epoch": 1.8, "grad_norm": 8.351044648698084, "learning_rate": 6.028719104766402e-06, "loss": 0.0792, "step": 2528 }, { "epoch": 1.81, "grad_norm": 21.352307377026413, "learning_rate": 6.025890615622233e-06, "loss": 0.1433, "step": 2529 }, { "epoch": 1.81, "grad_norm": 26.395611117406183, "learning_rate": 6.023061783708941e-06, "loss": 0.0824, "step": 2530 }, { "epoch": 1.81, "grad_norm": 8.387064879765818, "learning_rate": 6.020232609971694e-06, "loss": 0.093, "step": 2531 }, { "epoch": 1.81, "grad_norm": 7.527848046304538, "learning_rate": 6.017403095355766e-06, "loss": 0.106, "step": 2532 }, { "epoch": 1.81, "grad_norm": 30.56609985998804, "learning_rate": 6.014573240806553e-06, "loss": 0.1035, "step": 2533 }, { "epoch": 1.81, "grad_norm": 18.801880253810776, "learning_rate": 6.011743047269563e-06, "loss": 0.1088, "step": 2534 }, { "epoch": 1.81, "grad_norm": 9.532091172107885, "learning_rate": 6.008912515690415e-06, "loss": 0.0753, "step": 2535 }, { "epoch": 1.81, "grad_norm": 12.658859923242405, "learning_rate": 6.006081647014842e-06, "loss": 0.0805, "step": 2536 }, { "epoch": 1.81, "grad_norm": 29.195819838210074, "learning_rate": 6.00325044218869e-06, "loss": 0.1105, "step": 2537 }, { "epoch": 1.81, "grad_norm": 18.51162600271363, "learning_rate": 6.000418902157919e-06, "loss": 0.1062, "step": 2538 }, { "epoch": 1.81, "grad_norm": 44.1732405351717, "learning_rate": 5.997587027868598e-06, "loss": 0.1221, "step": 2539 }, { "epoch": 1.81, "grad_norm": 29.309897683209353, "learning_rate": 5.994754820266908e-06, "loss": 0.0969, "step": 2540 }, { "epoch": 1.81, "grad_norm": 8.87554200493296, "learning_rate": 5.991922280299143e-06, "loss": 0.0918, "step": 2541 }, { "epoch": 1.81, "grad_norm": 7.609783728113712, "learning_rate": 5.989089408911706e-06, "loss": 0.0911, "step": 2542 }, { "epoch": 1.82, "grad_norm": 14.418526098296795, "learning_rate": 5.986256207051113e-06, "loss": 0.1036, "step": 2543 }, { "epoch": 1.82, "grad_norm": 58.75440816966157, "learning_rate": 5.98342267566399e-06, "loss": 0.1527, "step": 2544 }, { "epoch": 1.82, "grad_norm": 32.69007877459673, "learning_rate": 5.9805888156970714e-06, "loss": 0.1378, "step": 2545 }, { "epoch": 1.82, "grad_norm": 28.46074577433084, "learning_rate": 5.977754628097203e-06, "loss": 0.0883, "step": 2546 }, { "epoch": 1.82, "grad_norm": 36.58388507387772, "learning_rate": 5.97492011381134e-06, "loss": 0.088, "step": 2547 }, { "epoch": 1.82, "grad_norm": 40.783240549154485, "learning_rate": 5.972085273786547e-06, "loss": 0.0876, "step": 2548 }, { "epoch": 1.82, "grad_norm": 27.81800708078887, "learning_rate": 5.969250108969995e-06, "loss": 0.1101, "step": 2549 }, { "epoch": 1.82, "grad_norm": 8.59147754384461, "learning_rate": 5.966414620308965e-06, "loss": 0.092, "step": 2550 }, { "epoch": 1.82, "grad_norm": 28.42033100421827, "learning_rate": 5.9635788087508474e-06, "loss": 0.1016, "step": 2551 }, { "epoch": 1.82, "grad_norm": 34.74266526203714, "learning_rate": 5.960742675243139e-06, "loss": 0.0794, "step": 2552 }, { "epoch": 1.82, "grad_norm": 21.553513330683376, "learning_rate": 5.957906220733447e-06, "loss": 0.0999, "step": 2553 }, { "epoch": 1.82, "grad_norm": 5.114093682906385, "learning_rate": 5.9550694461694806e-06, "loss": 0.0754, "step": 2554 }, { "epoch": 1.82, "grad_norm": 7.8221315937987494, "learning_rate": 5.95223235249906e-06, "loss": 0.099, "step": 2555 }, { "epoch": 1.82, "grad_norm": 16.717018622807625, "learning_rate": 5.949394940670112e-06, "loss": 0.0839, "step": 2556 }, { "epoch": 1.83, "grad_norm": 10.214246667729665, "learning_rate": 5.946557211630667e-06, "loss": 0.0936, "step": 2557 }, { "epoch": 1.83, "grad_norm": 9.657749645898761, "learning_rate": 5.943719166328864e-06, "loss": 0.0894, "step": 2558 }, { "epoch": 1.83, "grad_norm": 5.149493730592742, "learning_rate": 5.940880805712945e-06, "loss": 0.0777, "step": 2559 }, { "epoch": 1.83, "grad_norm": 6.2592533001332225, "learning_rate": 5.938042130731262e-06, "loss": 0.1044, "step": 2560 }, { "epoch": 1.83, "grad_norm": 32.71064285721712, "learning_rate": 5.935203142332267e-06, "loss": 0.1262, "step": 2561 }, { "epoch": 1.83, "grad_norm": 6.8348479431905105, "learning_rate": 5.932363841464519e-06, "loss": 0.0815, "step": 2562 }, { "epoch": 1.83, "grad_norm": 30.961681239814183, "learning_rate": 5.9295242290766805e-06, "loss": 0.1127, "step": 2563 }, { "epoch": 1.83, "grad_norm": 18.416776056408292, "learning_rate": 5.9266843061175216e-06, "loss": 0.0973, "step": 2564 }, { "epoch": 1.83, "grad_norm": 10.93811275476671, "learning_rate": 5.92384407353591e-06, "loss": 0.1251, "step": 2565 }, { "epoch": 1.83, "grad_norm": 5.447480213973523, "learning_rate": 5.921003532280822e-06, "loss": 0.0801, "step": 2566 }, { "epoch": 1.83, "grad_norm": 7.813801842459522, "learning_rate": 5.918162683301336e-06, "loss": 0.1125, "step": 2567 }, { "epoch": 1.83, "grad_norm": 28.216557976157063, "learning_rate": 5.91532152754663e-06, "loss": 0.1176, "step": 2568 }, { "epoch": 1.83, "grad_norm": 19.011803166843467, "learning_rate": 5.91248006596599e-06, "loss": 0.1005, "step": 2569 }, { "epoch": 1.83, "grad_norm": 22.129204827206834, "learning_rate": 5.909638299508798e-06, "loss": 0.0659, "step": 2570 }, { "epoch": 1.84, "grad_norm": 8.030892883456888, "learning_rate": 5.906796229124543e-06, "loss": 0.101, "step": 2571 }, { "epoch": 1.84, "grad_norm": 16.213774164615035, "learning_rate": 5.903953855762812e-06, "loss": 0.0795, "step": 2572 }, { "epoch": 1.84, "grad_norm": 5.430953096593536, "learning_rate": 5.901111180373298e-06, "loss": 0.1147, "step": 2573 }, { "epoch": 1.84, "grad_norm": 43.58218646987675, "learning_rate": 5.898268203905788e-06, "loss": 0.1244, "step": 2574 }, { "epoch": 1.84, "grad_norm": 36.70832267287658, "learning_rate": 5.895424927310174e-06, "loss": 0.1086, "step": 2575 }, { "epoch": 1.84, "grad_norm": 23.88936445567082, "learning_rate": 5.89258135153645e-06, "loss": 0.1085, "step": 2576 }, { "epoch": 1.84, "grad_norm": 43.53612470901018, "learning_rate": 5.889737477534704e-06, "loss": 0.103, "step": 2577 }, { "epoch": 1.84, "grad_norm": 34.14388783488702, "learning_rate": 5.886893306255129e-06, "loss": 0.1014, "step": 2578 }, { "epoch": 1.84, "grad_norm": 21.552187716115085, "learning_rate": 5.884048838648017e-06, "loss": 0.1003, "step": 2579 }, { "epoch": 1.84, "grad_norm": 6.854450569018791, "learning_rate": 5.881204075663755e-06, "loss": 0.1073, "step": 2580 }, { "epoch": 1.84, "grad_norm": 21.51464575703357, "learning_rate": 5.878359018252831e-06, "loss": 0.1342, "step": 2581 }, { "epoch": 1.84, "grad_norm": 41.8483916842586, "learning_rate": 5.8755136673658365e-06, "loss": 0.1082, "step": 2582 }, { "epoch": 1.84, "grad_norm": 37.636416874159046, "learning_rate": 5.872668023953449e-06, "loss": 0.0976, "step": 2583 }, { "epoch": 1.84, "grad_norm": 7.552098998633584, "learning_rate": 5.869822088966455e-06, "loss": 0.0833, "step": 2584 }, { "epoch": 1.85, "grad_norm": 10.713494577494489, "learning_rate": 5.866975863355734e-06, "loss": 0.0893, "step": 2585 }, { "epoch": 1.85, "grad_norm": 23.2091103383476, "learning_rate": 5.864129348072261e-06, "loss": 0.1205, "step": 2586 }, { "epoch": 1.85, "grad_norm": 31.823573317574535, "learning_rate": 5.861282544067112e-06, "loss": 0.1232, "step": 2587 }, { "epoch": 1.85, "grad_norm": 22.21437732970618, "learning_rate": 5.8584354522914555e-06, "loss": 0.1088, "step": 2588 }, { "epoch": 1.85, "grad_norm": 11.487315525706093, "learning_rate": 5.855588073696559e-06, "loss": 0.0837, "step": 2589 }, { "epoch": 1.85, "grad_norm": 22.281223727113854, "learning_rate": 5.852740409233785e-06, "loss": 0.0732, "step": 2590 }, { "epoch": 1.85, "grad_norm": 24.22884461950451, "learning_rate": 5.849892459854588e-06, "loss": 0.0881, "step": 2591 }, { "epoch": 1.85, "grad_norm": 25.959868893514688, "learning_rate": 5.847044226510524e-06, "loss": 0.0851, "step": 2592 }, { "epoch": 1.85, "grad_norm": 17.515409904214152, "learning_rate": 5.84419571015324e-06, "loss": 0.1182, "step": 2593 }, { "epoch": 1.85, "grad_norm": 14.3143330651017, "learning_rate": 5.8413469117344766e-06, "loss": 0.1311, "step": 2594 }, { "epoch": 1.85, "grad_norm": 13.137514401761287, "learning_rate": 5.838497832206074e-06, "loss": 0.0912, "step": 2595 }, { "epoch": 1.85, "grad_norm": 6.325028784584776, "learning_rate": 5.835648472519958e-06, "loss": 0.0896, "step": 2596 }, { "epoch": 1.85, "grad_norm": 24.617506819797416, "learning_rate": 5.832798833628156e-06, "loss": 0.0967, "step": 2597 }, { "epoch": 1.85, "grad_norm": 5.056349053356649, "learning_rate": 5.829948916482784e-06, "loss": 0.0851, "step": 2598 }, { "epoch": 1.86, "grad_norm": 5.059342030707879, "learning_rate": 5.827098722036053e-06, "loss": 0.0838, "step": 2599 }, { "epoch": 1.86, "grad_norm": 21.709897870658843, "learning_rate": 5.824248251240265e-06, "loss": 0.1256, "step": 2600 }, { "epoch": 1.86, "grad_norm": 21.868681173826214, "learning_rate": 5.8213975050478155e-06, "loss": 0.0929, "step": 2601 }, { "epoch": 1.86, "grad_norm": 12.614721781840734, "learning_rate": 5.818546484411191e-06, "loss": 0.1243, "step": 2602 }, { "epoch": 1.86, "grad_norm": 10.034829947869662, "learning_rate": 5.815695190282974e-06, "loss": 0.1393, "step": 2603 }, { "epoch": 1.86, "grad_norm": 13.607835352795393, "learning_rate": 5.81284362361583e-06, "loss": 0.1298, "step": 2604 }, { "epoch": 1.86, "grad_norm": 10.740168450428905, "learning_rate": 5.809991785362525e-06, "loss": 0.0995, "step": 2605 }, { "epoch": 1.86, "grad_norm": 10.42600482469918, "learning_rate": 5.8071396764759065e-06, "loss": 0.1045, "step": 2606 }, { "epoch": 1.86, "grad_norm": 14.34092985354534, "learning_rate": 5.804287297908923e-06, "loss": 0.099, "step": 2607 }, { "epoch": 1.86, "grad_norm": 12.665461324372817, "learning_rate": 5.801434650614601e-06, "loss": 0.1346, "step": 2608 }, { "epoch": 1.86, "grad_norm": 6.536614697320999, "learning_rate": 5.798581735546066e-06, "loss": 0.1066, "step": 2609 }, { "epoch": 1.86, "grad_norm": 5.547830795525716, "learning_rate": 5.79572855365653e-06, "loss": 0.0737, "step": 2610 }, { "epoch": 1.86, "grad_norm": 22.917140065123103, "learning_rate": 5.792875105899294e-06, "loss": 0.1536, "step": 2611 }, { "epoch": 1.86, "grad_norm": 14.280858621166734, "learning_rate": 5.790021393227747e-06, "loss": 0.1257, "step": 2612 }, { "epoch": 1.87, "grad_norm": 8.601450815452466, "learning_rate": 5.787167416595369e-06, "loss": 0.0939, "step": 2613 }, { "epoch": 1.87, "grad_norm": 23.25278219450862, "learning_rate": 5.784313176955726e-06, "loss": 0.1099, "step": 2614 }, { "epoch": 1.87, "grad_norm": 13.52299424961681, "learning_rate": 5.781458675262472e-06, "loss": 0.0918, "step": 2615 }, { "epoch": 1.87, "grad_norm": 13.140181284280587, "learning_rate": 5.778603912469349e-06, "loss": 0.1211, "step": 2616 }, { "epoch": 1.87, "grad_norm": 17.987639399325246, "learning_rate": 5.775748889530187e-06, "loss": 0.1158, "step": 2617 }, { "epoch": 1.87, "grad_norm": 7.029185285872469, "learning_rate": 5.772893607398901e-06, "loss": 0.0793, "step": 2618 }, { "epoch": 1.87, "grad_norm": 10.243106638432673, "learning_rate": 5.770038067029496e-06, "loss": 0.0837, "step": 2619 }, { "epoch": 1.87, "grad_norm": 7.120185441810166, "learning_rate": 5.76718226937606e-06, "loss": 0.071, "step": 2620 }, { "epoch": 1.87, "grad_norm": 17.79704560468372, "learning_rate": 5.764326215392768e-06, "loss": 0.0668, "step": 2621 }, { "epoch": 1.87, "grad_norm": 23.092612411054002, "learning_rate": 5.761469906033879e-06, "loss": 0.0911, "step": 2622 }, { "epoch": 1.87, "grad_norm": 5.923215155786091, "learning_rate": 5.758613342253743e-06, "loss": 0.0652, "step": 2623 }, { "epoch": 1.87, "grad_norm": 12.23566762430646, "learning_rate": 5.7557565250067896e-06, "loss": 0.093, "step": 2624 }, { "epoch": 1.87, "grad_norm": 29.68819365852448, "learning_rate": 5.752899455247532e-06, "loss": 0.0942, "step": 2625 }, { "epoch": 1.87, "grad_norm": 25.11407988203367, "learning_rate": 5.750042133930571e-06, "loss": 0.1067, "step": 2626 }, { "epoch": 1.88, "grad_norm": 13.029913042547735, "learning_rate": 5.7471845620105925e-06, "loss": 0.095, "step": 2627 }, { "epoch": 1.88, "grad_norm": 8.765283553153818, "learning_rate": 5.744326740442364e-06, "loss": 0.1107, "step": 2628 }, { "epoch": 1.88, "grad_norm": 14.7909670270782, "learning_rate": 5.741468670180737e-06, "loss": 0.1256, "step": 2629 }, { "epoch": 1.88, "grad_norm": 16.345896107400005, "learning_rate": 5.738610352180645e-06, "loss": 0.1219, "step": 2630 }, { "epoch": 1.88, "grad_norm": 6.525593347954734, "learning_rate": 5.735751787397106e-06, "loss": 0.0771, "step": 2631 }, { "epoch": 1.88, "grad_norm": 29.13519994204438, "learning_rate": 5.732892976785218e-06, "loss": 0.1133, "step": 2632 }, { "epoch": 1.88, "grad_norm": 10.037077891178535, "learning_rate": 5.730033921300166e-06, "loss": 0.0765, "step": 2633 }, { "epoch": 1.88, "grad_norm": 11.01310603937196, "learning_rate": 5.7271746218972105e-06, "loss": 0.0965, "step": 2634 }, { "epoch": 1.88, "grad_norm": 10.383120380608432, "learning_rate": 5.724315079531697e-06, "loss": 0.0765, "step": 2635 }, { "epoch": 1.88, "grad_norm": 14.053650424576215, "learning_rate": 5.721455295159053e-06, "loss": 0.1095, "step": 2636 }, { "epoch": 1.88, "grad_norm": 13.49823698629184, "learning_rate": 5.7185952697347844e-06, "loss": 0.1095, "step": 2637 }, { "epoch": 1.88, "grad_norm": 15.100988227402953, "learning_rate": 5.71573500421448e-06, "loss": 0.0826, "step": 2638 }, { "epoch": 1.88, "grad_norm": 18.69455842462284, "learning_rate": 5.712874499553807e-06, "loss": 0.1101, "step": 2639 }, { "epoch": 1.88, "grad_norm": 8.131975095287098, "learning_rate": 5.710013756708513e-06, "loss": 0.1218, "step": 2640 }, { "epoch": 1.89, "grad_norm": 18.309767202091365, "learning_rate": 5.707152776634427e-06, "loss": 0.0981, "step": 2641 }, { "epoch": 1.89, "grad_norm": 17.760090104193882, "learning_rate": 5.704291560287454e-06, "loss": 0.1068, "step": 2642 }, { "epoch": 1.89, "grad_norm": 17.356458991602008, "learning_rate": 5.701430108623578e-06, "loss": 0.0968, "step": 2643 }, { "epoch": 1.89, "grad_norm": 8.346261757755903, "learning_rate": 5.698568422598867e-06, "loss": 0.0783, "step": 2644 }, { "epoch": 1.89, "grad_norm": 10.936135220918278, "learning_rate": 5.69570650316946e-06, "loss": 0.0816, "step": 2645 }, { "epoch": 1.89, "grad_norm": 12.619615725003502, "learning_rate": 5.69284435129158e-06, "loss": 0.105, "step": 2646 }, { "epoch": 1.89, "grad_norm": 5.963461261946324, "learning_rate": 5.689981967921523e-06, "loss": 0.0933, "step": 2647 }, { "epoch": 1.89, "grad_norm": 11.236246645246913, "learning_rate": 5.6871193540156666e-06, "loss": 0.0918, "step": 2648 }, { "epoch": 1.89, "grad_norm": 12.81684610998191, "learning_rate": 5.684256510530461e-06, "loss": 0.1476, "step": 2649 }, { "epoch": 1.89, "grad_norm": 11.153293940717711, "learning_rate": 5.68139343842244e-06, "loss": 0.0962, "step": 2650 }, { "epoch": 1.89, "grad_norm": 14.765702957258346, "learning_rate": 5.678530138648204e-06, "loss": 0.0981, "step": 2651 }, { "epoch": 1.89, "grad_norm": 7.541403950790154, "learning_rate": 5.675666612164436e-06, "loss": 0.1184, "step": 2652 }, { "epoch": 1.89, "grad_norm": 27.66496804411558, "learning_rate": 5.672802859927895e-06, "loss": 0.093, "step": 2653 }, { "epoch": 1.89, "grad_norm": 9.315476185301423, "learning_rate": 5.669938882895412e-06, "loss": 0.0898, "step": 2654 }, { "epoch": 1.9, "grad_norm": 9.178791444866505, "learning_rate": 5.667074682023896e-06, "loss": 0.0924, "step": 2655 }, { "epoch": 1.9, "grad_norm": 13.533905636009482, "learning_rate": 5.664210258270331e-06, "loss": 0.1217, "step": 2656 }, { "epoch": 1.9, "grad_norm": 18.36169164512371, "learning_rate": 5.661345612591771e-06, "loss": 0.0782, "step": 2657 }, { "epoch": 1.9, "grad_norm": 7.538301490553328, "learning_rate": 5.6584807459453515e-06, "loss": 0.0942, "step": 2658 }, { "epoch": 1.9, "grad_norm": 10.264391649732847, "learning_rate": 5.655615659288274e-06, "loss": 0.1205, "step": 2659 }, { "epoch": 1.9, "grad_norm": 6.794709350968707, "learning_rate": 5.652750353577818e-06, "loss": 0.1161, "step": 2660 }, { "epoch": 1.9, "grad_norm": 13.99871974442599, "learning_rate": 5.649884829771337e-06, "loss": 0.1078, "step": 2661 }, { "epoch": 1.9, "grad_norm": 7.867770861552083, "learning_rate": 5.6470190888262545e-06, "loss": 0.0922, "step": 2662 }, { "epoch": 1.9, "grad_norm": 17.41387225248798, "learning_rate": 5.644153131700067e-06, "loss": 0.1163, "step": 2663 }, { "epoch": 1.9, "grad_norm": 39.2761918554314, "learning_rate": 5.6412869593503476e-06, "loss": 0.1678, "step": 2664 }, { "epoch": 1.9, "grad_norm": 9.993724717189073, "learning_rate": 5.638420572734733e-06, "loss": 0.0949, "step": 2665 }, { "epoch": 1.9, "grad_norm": 10.59354372045396, "learning_rate": 5.63555397281094e-06, "loss": 0.0689, "step": 2666 }, { "epoch": 1.9, "grad_norm": 24.937857294603422, "learning_rate": 5.632687160536751e-06, "loss": 0.0983, "step": 2667 }, { "epoch": 1.9, "grad_norm": 41.37417208875139, "learning_rate": 5.629820136870022e-06, "loss": 0.1111, "step": 2668 }, { "epoch": 1.91, "grad_norm": 7.437831972634287, "learning_rate": 5.626952902768678e-06, "loss": 0.1069, "step": 2669 }, { "epoch": 1.91, "grad_norm": 14.905422981253997, "learning_rate": 5.624085459190717e-06, "loss": 0.0795, "step": 2670 }, { "epoch": 1.91, "grad_norm": 18.705922511860052, "learning_rate": 5.621217807094202e-06, "loss": 0.0938, "step": 2671 }, { "epoch": 1.91, "grad_norm": 31.833596818269967, "learning_rate": 5.618349947437272e-06, "loss": 0.1108, "step": 2672 }, { "epoch": 1.91, "grad_norm": 18.633295676824503, "learning_rate": 5.615481881178132e-06, "loss": 0.1321, "step": 2673 }, { "epoch": 1.91, "grad_norm": 4.9713241603062395, "learning_rate": 5.612613609275054e-06, "loss": 0.0859, "step": 2674 }, { "epoch": 1.91, "grad_norm": 11.386906666975248, "learning_rate": 5.609745132686383e-06, "loss": 0.1326, "step": 2675 }, { "epoch": 1.91, "grad_norm": 33.73569341071956, "learning_rate": 5.60687645237053e-06, "loss": 0.108, "step": 2676 }, { "epoch": 1.91, "grad_norm": 37.4374522679626, "learning_rate": 5.604007569285973e-06, "loss": 0.135, "step": 2677 }, { "epoch": 1.91, "grad_norm": 15.482062569547526, "learning_rate": 5.6011384843912605e-06, "loss": 0.098, "step": 2678 }, { "epoch": 1.91, "grad_norm": 23.405290661541976, "learning_rate": 5.598269198645008e-06, "loss": 0.0983, "step": 2679 }, { "epoch": 1.91, "grad_norm": 12.43065289610563, "learning_rate": 5.5953997130058945e-06, "loss": 0.1143, "step": 2680 }, { "epoch": 1.91, "grad_norm": 25.630102473252506, "learning_rate": 5.5925300284326715e-06, "loss": 0.1412, "step": 2681 }, { "epoch": 1.91, "grad_norm": 45.50344170828186, "learning_rate": 5.5896601458841505e-06, "loss": 0.1259, "step": 2682 }, { "epoch": 1.92, "grad_norm": 30.06550381414141, "learning_rate": 5.586790066319217e-06, "loss": 0.0741, "step": 2683 }, { "epoch": 1.92, "grad_norm": 7.271702734925442, "learning_rate": 5.583919790696814e-06, "loss": 0.0865, "step": 2684 }, { "epoch": 1.92, "grad_norm": 24.493565366620036, "learning_rate": 5.581049319975957e-06, "loss": 0.0911, "step": 2685 }, { "epoch": 1.92, "grad_norm": 10.872141679797997, "learning_rate": 5.57817865511572e-06, "loss": 0.1267, "step": 2686 }, { "epoch": 1.92, "grad_norm": 31.622766470405587, "learning_rate": 5.575307797075249e-06, "loss": 0.1034, "step": 2687 }, { "epoch": 1.92, "grad_norm": 33.281994290944546, "learning_rate": 5.572436746813748e-06, "loss": 0.1101, "step": 2688 }, { "epoch": 1.92, "grad_norm": 9.921027242320786, "learning_rate": 5.5695655052904905e-06, "loss": 0.1333, "step": 2689 }, { "epoch": 1.92, "grad_norm": 3.842511723248251, "learning_rate": 5.566694073464812e-06, "loss": 0.0715, "step": 2690 }, { "epoch": 1.92, "grad_norm": 31.392960437519093, "learning_rate": 5.56382245229611e-06, "loss": 0.1133, "step": 2691 }, { "epoch": 1.92, "grad_norm": 33.161077306234404, "learning_rate": 5.560950642743847e-06, "loss": 0.1158, "step": 2692 }, { "epoch": 1.92, "grad_norm": 7.72182359708755, "learning_rate": 5.558078645767547e-06, "loss": 0.082, "step": 2693 }, { "epoch": 1.92, "grad_norm": 33.264247772214716, "learning_rate": 5.5552064623267986e-06, "loss": 0.1349, "step": 2694 }, { "epoch": 1.92, "grad_norm": 18.089787658256412, "learning_rate": 5.5523340933812505e-06, "loss": 0.1017, "step": 2695 }, { "epoch": 1.92, "grad_norm": 15.398888267717105, "learning_rate": 5.549461539890616e-06, "loss": 0.0844, "step": 2696 }, { "epoch": 1.93, "grad_norm": 24.67559393234702, "learning_rate": 5.546588802814669e-06, "loss": 0.0819, "step": 2697 }, { "epoch": 1.93, "grad_norm": 18.90602279218803, "learning_rate": 5.543715883113241e-06, "loss": 0.0938, "step": 2698 }, { "epoch": 1.93, "grad_norm": 26.617935820127663, "learning_rate": 5.540842781746231e-06, "loss": 0.0928, "step": 2699 }, { "epoch": 1.93, "grad_norm": 9.410245020771018, "learning_rate": 5.537969499673598e-06, "loss": 0.1094, "step": 2700 }, { "epoch": 1.93, "grad_norm": 12.46263647968909, "learning_rate": 5.535096037855353e-06, "loss": 0.0858, "step": 2701 }, { "epoch": 1.93, "grad_norm": 24.402438809378204, "learning_rate": 5.532222397251576e-06, "loss": 0.0955, "step": 2702 }, { "epoch": 1.93, "grad_norm": 18.28253890357763, "learning_rate": 5.529348578822403e-06, "loss": 0.132, "step": 2703 }, { "epoch": 1.93, "grad_norm": 18.717809032260746, "learning_rate": 5.526474583528032e-06, "loss": 0.137, "step": 2704 }, { "epoch": 1.93, "grad_norm": 24.233609934720455, "learning_rate": 5.523600412328716e-06, "loss": 0.1124, "step": 2705 }, { "epoch": 1.93, "grad_norm": 12.27153406116996, "learning_rate": 5.520726066184769e-06, "loss": 0.1357, "step": 2706 }, { "epoch": 1.93, "grad_norm": 36.491820592631434, "learning_rate": 5.517851546056566e-06, "loss": 0.0982, "step": 2707 }, { "epoch": 1.93, "grad_norm": 24.71547500695131, "learning_rate": 5.5149768529045355e-06, "loss": 0.0795, "step": 2708 }, { "epoch": 1.93, "grad_norm": 14.636866072311467, "learning_rate": 5.512101987689168e-06, "loss": 0.0864, "step": 2709 }, { "epoch": 1.93, "grad_norm": 7.1436136261471574, "learning_rate": 5.509226951371006e-06, "loss": 0.0834, "step": 2710 }, { "epoch": 1.94, "grad_norm": 20.77019856777048, "learning_rate": 5.506351744910654e-06, "loss": 0.0908, "step": 2711 }, { "epoch": 1.94, "grad_norm": 20.18000054845474, "learning_rate": 5.503476369268773e-06, "loss": 0.0805, "step": 2712 }, { "epoch": 1.94, "grad_norm": 11.97702212144314, "learning_rate": 5.50060082540608e-06, "loss": 0.0823, "step": 2713 }, { "epoch": 1.94, "grad_norm": 18.93799132198841, "learning_rate": 5.4977251142833445e-06, "loss": 0.1274, "step": 2714 }, { "epoch": 1.94, "grad_norm": 18.781493822249985, "learning_rate": 5.494849236861397e-06, "loss": 0.093, "step": 2715 }, { "epoch": 1.94, "grad_norm": 25.586693957909443, "learning_rate": 5.491973194101122e-06, "loss": 0.1316, "step": 2716 }, { "epoch": 1.94, "grad_norm": 33.83182887402268, "learning_rate": 5.4890969869634606e-06, "loss": 0.1128, "step": 2717 }, { "epoch": 1.94, "grad_norm": 39.09734754951871, "learning_rate": 5.486220616409403e-06, "loss": 0.1139, "step": 2718 }, { "epoch": 1.94, "grad_norm": 17.296007817801954, "learning_rate": 5.4833440834e-06, "loss": 0.0948, "step": 2719 }, { "epoch": 1.94, "grad_norm": 36.970414189809944, "learning_rate": 5.480467388896353e-06, "loss": 0.1217, "step": 2720 }, { "epoch": 1.94, "grad_norm": 41.98479812243881, "learning_rate": 5.477590533859623e-06, "loss": 0.1089, "step": 2721 }, { "epoch": 1.94, "grad_norm": 39.39789221503169, "learning_rate": 5.474713519251018e-06, "loss": 0.1365, "step": 2722 }, { "epoch": 1.94, "grad_norm": 10.022368374310624, "learning_rate": 5.471836346031802e-06, "loss": 0.0945, "step": 2723 }, { "epoch": 1.94, "grad_norm": 11.411241170144551, "learning_rate": 5.468959015163293e-06, "loss": 0.0825, "step": 2724 }, { "epoch": 1.95, "grad_norm": 17.404101960272982, "learning_rate": 5.46608152760686e-06, "loss": 0.0732, "step": 2725 }, { "epoch": 1.95, "grad_norm": 39.62197496277501, "learning_rate": 5.463203884323926e-06, "loss": 0.1486, "step": 2726 }, { "epoch": 1.95, "grad_norm": 27.49778841473844, "learning_rate": 5.460326086275964e-06, "loss": 0.1128, "step": 2727 }, { "epoch": 1.95, "grad_norm": 6.379646370483621, "learning_rate": 5.4574481344245015e-06, "loss": 0.1173, "step": 2728 }, { "epoch": 1.95, "grad_norm": 21.947214598255393, "learning_rate": 5.454570029731115e-06, "loss": 0.1093, "step": 2729 }, { "epoch": 1.95, "grad_norm": 18.335934862211666, "learning_rate": 5.451691773157431e-06, "loss": 0.1183, "step": 2730 }, { "epoch": 1.95, "grad_norm": 51.38289661985466, "learning_rate": 5.448813365665129e-06, "loss": 0.1513, "step": 2731 }, { "epoch": 1.95, "grad_norm": 17.977264185528178, "learning_rate": 5.44593480821594e-06, "loss": 0.1239, "step": 2732 }, { "epoch": 1.95, "grad_norm": 6.769399150938259, "learning_rate": 5.443056101771643e-06, "loss": 0.0939, "step": 2733 }, { "epoch": 1.95, "grad_norm": 49.50787284234708, "learning_rate": 5.44017724729407e-06, "loss": 0.1066, "step": 2734 }, { "epoch": 1.95, "grad_norm": 41.47491103746017, "learning_rate": 5.437298245745093e-06, "loss": 0.1199, "step": 2735 }, { "epoch": 1.95, "grad_norm": 31.21920187501198, "learning_rate": 5.434419098086645e-06, "loss": 0.1379, "step": 2736 }, { "epoch": 1.95, "grad_norm": 4.019317654949999, "learning_rate": 5.431539805280702e-06, "loss": 0.0836, "step": 2737 }, { "epoch": 1.95, "grad_norm": 16.908346573560465, "learning_rate": 5.428660368289289e-06, "loss": 0.1027, "step": 2738 }, { "epoch": 1.96, "grad_norm": 54.3446520419247, "learning_rate": 5.42578078807448e-06, "loss": 0.1442, "step": 2739 }, { "epoch": 1.96, "grad_norm": 29.604460610263637, "learning_rate": 5.422901065598395e-06, "loss": 0.0917, "step": 2740 }, { "epoch": 1.96, "grad_norm": 18.052870526381152, "learning_rate": 5.4200212018232024e-06, "loss": 0.0837, "step": 2741 }, { "epoch": 1.96, "grad_norm": 7.485821680212534, "learning_rate": 5.41714119771112e-06, "loss": 0.0809, "step": 2742 }, { "epoch": 1.96, "grad_norm": 23.671337108776125, "learning_rate": 5.414261054224412e-06, "loss": 0.0701, "step": 2743 }, { "epoch": 1.96, "grad_norm": 13.628921621805322, "learning_rate": 5.411380772325383e-06, "loss": 0.132, "step": 2744 }, { "epoch": 1.96, "grad_norm": 58.95361204923983, "learning_rate": 5.408500352976392e-06, "loss": 0.177, "step": 2745 }, { "epoch": 1.96, "grad_norm": 13.789171355412767, "learning_rate": 5.40561979713984e-06, "loss": 0.0959, "step": 2746 }, { "epoch": 1.96, "grad_norm": 8.334704571609194, "learning_rate": 5.402739105778175e-06, "loss": 0.1243, "step": 2747 }, { "epoch": 1.96, "grad_norm": 46.74494923664595, "learning_rate": 5.399858279853889e-06, "loss": 0.1034, "step": 2748 }, { "epoch": 1.96, "grad_norm": 37.41227408696971, "learning_rate": 5.39697732032952e-06, "loss": 0.0923, "step": 2749 }, { "epoch": 1.96, "grad_norm": 41.32562404134608, "learning_rate": 5.394096228167648e-06, "loss": 0.1169, "step": 2750 }, { "epoch": 1.96, "grad_norm": 11.165428505092693, "learning_rate": 5.391215004330903e-06, "loss": 0.0894, "step": 2751 }, { "epoch": 1.96, "grad_norm": 25.320083374694267, "learning_rate": 5.388333649781951e-06, "loss": 0.121, "step": 2752 }, { "epoch": 1.97, "grad_norm": 34.05814695395899, "learning_rate": 5.3854521654835105e-06, "loss": 0.1019, "step": 2753 }, { "epoch": 1.97, "grad_norm": 37.729778469348126, "learning_rate": 5.3825705523983366e-06, "loss": 0.1252, "step": 2754 }, { "epoch": 1.97, "grad_norm": 5.7768601785361735, "learning_rate": 5.37968881148923e-06, "loss": 0.1022, "step": 2755 }, { "epoch": 1.97, "grad_norm": 21.434040827191126, "learning_rate": 5.376806943719033e-06, "loss": 0.106, "step": 2756 }, { "epoch": 1.97, "grad_norm": 16.80181493949827, "learning_rate": 5.373924950050633e-06, "loss": 0.093, "step": 2757 }, { "epoch": 1.97, "grad_norm": 48.37549473256851, "learning_rate": 5.371042831446957e-06, "loss": 0.172, "step": 2758 }, { "epoch": 1.97, "grad_norm": 5.651227352253567, "learning_rate": 5.3681605888709755e-06, "loss": 0.0944, "step": 2759 }, { "epoch": 1.97, "grad_norm": 8.240276586363413, "learning_rate": 5.365278223285698e-06, "loss": 0.1755, "step": 2760 }, { "epoch": 1.97, "grad_norm": 13.781340769294333, "learning_rate": 5.362395735654175e-06, "loss": 0.1061, "step": 2761 }, { "epoch": 1.97, "grad_norm": 36.77153746292403, "learning_rate": 5.3595131269395015e-06, "loss": 0.1104, "step": 2762 }, { "epoch": 1.97, "grad_norm": 26.923003378387772, "learning_rate": 5.356630398104811e-06, "loss": 0.1163, "step": 2763 }, { "epoch": 1.97, "grad_norm": 19.27470596602233, "learning_rate": 5.353747550113274e-06, "loss": 0.0679, "step": 2764 }, { "epoch": 1.97, "grad_norm": 27.07027957372197, "learning_rate": 5.350864583928106e-06, "loss": 0.1111, "step": 2765 }, { "epoch": 1.97, "grad_norm": 7.765524049007483, "learning_rate": 5.347981500512558e-06, "loss": 0.085, "step": 2766 }, { "epoch": 1.98, "grad_norm": 41.86517901009098, "learning_rate": 5.345098300829924e-06, "loss": 0.1456, "step": 2767 }, { "epoch": 1.98, "grad_norm": 5.349243010931892, "learning_rate": 5.342214985843534e-06, "loss": 0.0994, "step": 2768 }, { "epoch": 1.98, "grad_norm": 16.401974170097617, "learning_rate": 5.339331556516755e-06, "loss": 0.0795, "step": 2769 }, { "epoch": 1.98, "grad_norm": 16.03990289754173, "learning_rate": 5.336448013812996e-06, "loss": 0.1182, "step": 2770 }, { "epoch": 1.98, "grad_norm": 17.74040652818955, "learning_rate": 5.333564358695701e-06, "loss": 0.0903, "step": 2771 }, { "epoch": 1.98, "grad_norm": 41.62125267709523, "learning_rate": 5.330680592128355e-06, "loss": 0.1292, "step": 2772 }, { "epoch": 1.98, "grad_norm": 11.822329634056095, "learning_rate": 5.3277967150744755e-06, "loss": 0.0974, "step": 2773 }, { "epoch": 1.98, "grad_norm": 17.12557284468244, "learning_rate": 5.324912728497621e-06, "loss": 0.104, "step": 2774 }, { "epoch": 1.98, "grad_norm": 51.98493528591735, "learning_rate": 5.322028633361386e-06, "loss": 0.1464, "step": 2775 }, { "epoch": 1.98, "grad_norm": 38.343198477712235, "learning_rate": 5.319144430629397e-06, "loss": 0.1155, "step": 2776 }, { "epoch": 1.98, "grad_norm": 13.395904291966062, "learning_rate": 5.316260121265323e-06, "loss": 0.1008, "step": 2777 }, { "epoch": 1.98, "grad_norm": 12.23983059469622, "learning_rate": 5.313375706232864e-06, "loss": 0.1072, "step": 2778 }, { "epoch": 1.98, "grad_norm": 6.743029425887316, "learning_rate": 5.310491186495757e-06, "loss": 0.0966, "step": 2779 }, { "epoch": 1.98, "grad_norm": 28.054949717811493, "learning_rate": 5.307606563017772e-06, "loss": 0.0975, "step": 2780 }, { "epoch": 1.99, "grad_norm": 15.050796478633394, "learning_rate": 5.304721836762717e-06, "loss": 0.0774, "step": 2781 }, { "epoch": 1.99, "grad_norm": 6.720263367169756, "learning_rate": 5.301837008694433e-06, "loss": 0.0818, "step": 2782 }, { "epoch": 1.99, "grad_norm": 11.712277096253679, "learning_rate": 5.298952079776794e-06, "loss": 0.0989, "step": 2783 }, { "epoch": 1.99, "grad_norm": 7.4999136139393965, "learning_rate": 5.296067050973709e-06, "loss": 0.0798, "step": 2784 }, { "epoch": 1.99, "grad_norm": 16.593799394745258, "learning_rate": 5.29318192324912e-06, "loss": 0.0981, "step": 2785 }, { "epoch": 1.99, "grad_norm": 8.159019478221214, "learning_rate": 5.290296697566999e-06, "loss": 0.0942, "step": 2786 }, { "epoch": 1.99, "grad_norm": 12.250548068310936, "learning_rate": 5.287411374891356e-06, "loss": 0.0811, "step": 2787 }, { "epoch": 1.99, "grad_norm": 11.420608184090936, "learning_rate": 5.284525956186231e-06, "loss": 0.098, "step": 2788 }, { "epoch": 1.99, "grad_norm": 6.178523711245514, "learning_rate": 5.281640442415695e-06, "loss": 0.1224, "step": 2789 }, { "epoch": 1.99, "grad_norm": 14.432868868069074, "learning_rate": 5.278754834543852e-06, "loss": 0.0961, "step": 2790 }, { "epoch": 1.99, "grad_norm": 19.832143314823682, "learning_rate": 5.275869133534838e-06, "loss": 0.0934, "step": 2791 }, { "epoch": 1.99, "grad_norm": 6.173140621912715, "learning_rate": 5.272983340352818e-06, "loss": 0.0894, "step": 2792 }, { "epoch": 1.99, "grad_norm": 8.664241283983232, "learning_rate": 5.270097455961991e-06, "loss": 0.0939, "step": 2793 }, { "epoch": 1.99, "grad_norm": 6.199786160737331, "learning_rate": 5.267211481326584e-06, "loss": 0.0717, "step": 2794 }, { "epoch": 2.0, "grad_norm": 9.359083770279224, "learning_rate": 5.264325417410854e-06, "loss": 0.0862, "step": 2795 }, { "epoch": 2.0, "grad_norm": 8.999303639282669, "learning_rate": 5.261439265179089e-06, "loss": 0.0884, "step": 2796 }, { "epoch": 2.0, "grad_norm": 9.531777669594215, "learning_rate": 5.258553025595605e-06, "loss": 0.1085, "step": 2797 }, { "epoch": 2.0, "grad_norm": 10.564778248106444, "learning_rate": 5.255666699624749e-06, "loss": 0.0837, "step": 2798 }, { "epoch": 2.0, "grad_norm": 8.144429532283462, "learning_rate": 5.252780288230899e-06, "loss": 0.0817, "step": 2799 }, { "epoch": 2.0, "grad_norm": 8.479863646274058, "learning_rate": 5.249893792378454e-06, "loss": 0.1122, "step": 2800 }, { "epoch": 2.0, "grad_norm": 8.219194155167031, "learning_rate": 5.24700721303185e-06, "loss": 0.1039, "step": 2801 }, { "epoch": 2.0, "grad_norm": 16.285123515055172, "learning_rate": 5.244120551155544e-06, "loss": 0.1119, "step": 2802 }, { "epoch": 2.0, "grad_norm": 13.901589114079679, "learning_rate": 5.241233807714024e-06, "loss": 0.054, "step": 2803 }, { "epoch": 2.0, "grad_norm": 4.199611427046534, "learning_rate": 5.238346983671805e-06, "loss": 0.0583, "step": 2804 }, { "epoch": 2.0, "grad_norm": 7.9097903544474075, "learning_rate": 5.235460079993429e-06, "loss": 0.0524, "step": 2805 }, { "epoch": 2.0, "grad_norm": 9.10846903963757, "learning_rate": 5.232573097643462e-06, "loss": 0.0448, "step": 2806 }, { "epoch": 2.0, "grad_norm": 8.33526310227755, "learning_rate": 5.229686037586502e-06, "loss": 0.0567, "step": 2807 }, { "epoch": 2.0, "grad_norm": 14.233119308284538, "learning_rate": 5.226798900787167e-06, "loss": 0.0535, "step": 2808 }, { "epoch": 2.0, "grad_norm": 7.672664735913163, "learning_rate": 5.223911688210104e-06, "loss": 0.0562, "step": 2809 }, { "epoch": 2.01, "grad_norm": 11.614815160250359, "learning_rate": 5.221024400819983e-06, "loss": 0.0532, "step": 2810 }, { "epoch": 2.01, "grad_norm": 14.154573219522113, "learning_rate": 5.218137039581504e-06, "loss": 0.0424, "step": 2811 }, { "epoch": 2.01, "grad_norm": 6.576671964529106, "learning_rate": 5.215249605459382e-06, "loss": 0.0618, "step": 2812 }, { "epoch": 2.01, "grad_norm": 11.287896142648293, "learning_rate": 5.212362099418369e-06, "loss": 0.0522, "step": 2813 }, { "epoch": 2.01, "grad_norm": 6.582372740411856, "learning_rate": 5.2094745224232306e-06, "loss": 0.0511, "step": 2814 }, { "epoch": 2.01, "grad_norm": 6.17959564383128, "learning_rate": 5.206586875438759e-06, "loss": 0.0565, "step": 2815 }, { "epoch": 2.01, "grad_norm": 12.734580828635346, "learning_rate": 5.203699159429773e-06, "loss": 0.049, "step": 2816 }, { "epoch": 2.01, "grad_norm": 13.886150462876547, "learning_rate": 5.200811375361112e-06, "loss": 0.0618, "step": 2817 }, { "epoch": 2.01, "grad_norm": 9.75151912924947, "learning_rate": 5.197923524197639e-06, "loss": 0.0517, "step": 2818 }, { "epoch": 2.01, "grad_norm": 8.18582132062964, "learning_rate": 5.195035606904237e-06, "loss": 0.0461, "step": 2819 }, { "epoch": 2.01, "grad_norm": 4.949615940026982, "learning_rate": 5.1921476244458135e-06, "loss": 0.07, "step": 2820 }, { "epoch": 2.01, "grad_norm": 5.522744474789711, "learning_rate": 5.189259577787297e-06, "loss": 0.0469, "step": 2821 }, { "epoch": 2.01, "grad_norm": 13.237191723970465, "learning_rate": 5.186371467893638e-06, "loss": 0.0585, "step": 2822 }, { "epoch": 2.01, "grad_norm": 22.63811767687695, "learning_rate": 5.1834832957298075e-06, "loss": 0.0612, "step": 2823 }, { "epoch": 2.02, "grad_norm": 10.856725695302611, "learning_rate": 5.180595062260797e-06, "loss": 0.0328, "step": 2824 }, { "epoch": 2.02, "grad_norm": 5.140432943530161, "learning_rate": 5.177706768451619e-06, "loss": 0.042, "step": 2825 }, { "epoch": 2.02, "grad_norm": 14.324322382042538, "learning_rate": 5.174818415267308e-06, "loss": 0.0389, "step": 2826 }, { "epoch": 2.02, "grad_norm": 12.103738058031697, "learning_rate": 5.1719300036729135e-06, "loss": 0.0532, "step": 2827 }, { "epoch": 2.02, "grad_norm": 23.584660325094507, "learning_rate": 5.169041534633511e-06, "loss": 0.0512, "step": 2828 }, { "epoch": 2.02, "grad_norm": 10.668112848453543, "learning_rate": 5.166153009114188e-06, "loss": 0.0463, "step": 2829 }, { "epoch": 2.02, "grad_norm": 3.695483605532189, "learning_rate": 5.163264428080057e-06, "loss": 0.0368, "step": 2830 }, { "epoch": 2.02, "grad_norm": 12.550637521816691, "learning_rate": 5.160375792496246e-06, "loss": 0.0579, "step": 2831 }, { "epoch": 2.02, "grad_norm": 7.404223260758192, "learning_rate": 5.157487103327901e-06, "loss": 0.0544, "step": 2832 }, { "epoch": 2.02, "grad_norm": 14.804123391013425, "learning_rate": 5.1545983615401885e-06, "loss": 0.0443, "step": 2833 }, { "epoch": 2.02, "grad_norm": 2.9860623662205845, "learning_rate": 5.151709568098289e-06, "loss": 0.0437, "step": 2834 }, { "epoch": 2.02, "grad_norm": 3.4808084788201925, "learning_rate": 5.1488207239674036e-06, "loss": 0.048, "step": 2835 }, { "epoch": 2.02, "grad_norm": 9.244067790459159, "learning_rate": 5.145931830112748e-06, "loss": 0.0397, "step": 2836 }, { "epoch": 2.02, "grad_norm": 6.7362343327123035, "learning_rate": 5.1430428874995554e-06, "loss": 0.0555, "step": 2837 }, { "epoch": 2.03, "grad_norm": 9.004920379741428, "learning_rate": 5.140153897093076e-06, "loss": 0.0459, "step": 2838 }, { "epoch": 2.03, "grad_norm": 8.516109389469001, "learning_rate": 5.1372648598585725e-06, "loss": 0.0656, "step": 2839 }, { "epoch": 2.03, "grad_norm": 8.640361450205146, "learning_rate": 5.134375776761329e-06, "loss": 0.0576, "step": 2840 }, { "epoch": 2.03, "grad_norm": 5.993357106722759, "learning_rate": 5.131486648766642e-06, "loss": 0.0432, "step": 2841 }, { "epoch": 2.03, "grad_norm": 14.19048628173077, "learning_rate": 5.1285974768398205e-06, "loss": 0.0488, "step": 2842 }, { "epoch": 2.03, "grad_norm": 3.711900865433462, "learning_rate": 5.125708261946192e-06, "loss": 0.0494, "step": 2843 }, { "epoch": 2.03, "grad_norm": 7.5442087251704875, "learning_rate": 5.122819005051096e-06, "loss": 0.0534, "step": 2844 }, { "epoch": 2.03, "grad_norm": 20.77714075942429, "learning_rate": 5.119929707119889e-06, "loss": 0.057, "step": 2845 }, { "epoch": 2.03, "grad_norm": 24.696838420818487, "learning_rate": 5.117040369117937e-06, "loss": 0.071, "step": 2846 }, { "epoch": 2.03, "grad_norm": 12.90688500272758, "learning_rate": 5.114150992010621e-06, "loss": 0.0461, "step": 2847 }, { "epoch": 2.03, "grad_norm": 21.76698605488115, "learning_rate": 5.1112615767633385e-06, "loss": 0.0688, "step": 2848 }, { "epoch": 2.03, "grad_norm": 12.042476678664588, "learning_rate": 5.108372124341494e-06, "loss": 0.0603, "step": 2849 }, { "epoch": 2.03, "grad_norm": 8.711444498022269, "learning_rate": 5.105482635710509e-06, "loss": 0.045, "step": 2850 }, { "epoch": 2.03, "grad_norm": 4.965522711679408, "learning_rate": 5.102593111835815e-06, "loss": 0.0609, "step": 2851 }, { "epoch": 2.04, "grad_norm": 5.78715066740317, "learning_rate": 5.099703553682854e-06, "loss": 0.0474, "step": 2852 }, { "epoch": 2.04, "grad_norm": 14.612001845478728, "learning_rate": 5.096813962217086e-06, "loss": 0.0386, "step": 2853 }, { "epoch": 2.04, "grad_norm": 8.213767104804448, "learning_rate": 5.093924338403971e-06, "loss": 0.0436, "step": 2854 }, { "epoch": 2.04, "grad_norm": 14.21885718408981, "learning_rate": 5.091034683208988e-06, "loss": 0.0649, "step": 2855 }, { "epoch": 2.04, "grad_norm": 9.200406261001536, "learning_rate": 5.088144997597627e-06, "loss": 0.0344, "step": 2856 }, { "epoch": 2.04, "grad_norm": 5.7563067974727264, "learning_rate": 5.085255282535383e-06, "loss": 0.0523, "step": 2857 }, { "epoch": 2.04, "grad_norm": 17.092771383434464, "learning_rate": 5.082365538987765e-06, "loss": 0.0494, "step": 2858 }, { "epoch": 2.04, "grad_norm": 6.860949499099525, "learning_rate": 5.079475767920289e-06, "loss": 0.0499, "step": 2859 }, { "epoch": 2.04, "grad_norm": 22.180665634795407, "learning_rate": 5.076585970298481e-06, "loss": 0.0686, "step": 2860 }, { "epoch": 2.04, "grad_norm": 14.512557797350894, "learning_rate": 5.073696147087878e-06, "loss": 0.0486, "step": 2861 }, { "epoch": 2.04, "grad_norm": 6.190960927702063, "learning_rate": 5.070806299254023e-06, "loss": 0.0379, "step": 2862 }, { "epoch": 2.04, "grad_norm": 7.028780537576393, "learning_rate": 5.067916427762466e-06, "loss": 0.0535, "step": 2863 }, { "epoch": 2.04, "grad_norm": 22.0853084053586, "learning_rate": 5.0650265335787685e-06, "loss": 0.0513, "step": 2864 }, { "epoch": 2.04, "grad_norm": 18.82402778771914, "learning_rate": 5.062136617668497e-06, "loss": 0.0328, "step": 2865 }, { "epoch": 2.05, "grad_norm": 9.35572729496596, "learning_rate": 5.059246680997228e-06, "loss": 0.0499, "step": 2866 }, { "epoch": 2.05, "grad_norm": 15.494949337550196, "learning_rate": 5.05635672453054e-06, "loss": 0.0523, "step": 2867 }, { "epoch": 2.05, "grad_norm": 17.512400175601442, "learning_rate": 5.053466749234023e-06, "loss": 0.0384, "step": 2868 }, { "epoch": 2.05, "grad_norm": 40.02758074357572, "learning_rate": 5.050576756073272e-06, "loss": 0.0751, "step": 2869 }, { "epoch": 2.05, "grad_norm": 18.427268569788204, "learning_rate": 5.047686746013888e-06, "loss": 0.0413, "step": 2870 }, { "epoch": 2.05, "grad_norm": 5.541127274636648, "learning_rate": 5.044796720021474e-06, "loss": 0.0522, "step": 2871 }, { "epoch": 2.05, "grad_norm": 7.965720801969884, "learning_rate": 5.041906679061643e-06, "loss": 0.0359, "step": 2872 }, { "epoch": 2.05, "grad_norm": 19.01943256008662, "learning_rate": 5.039016624100013e-06, "loss": 0.0659, "step": 2873 }, { "epoch": 2.05, "grad_norm": 27.05389706378414, "learning_rate": 5.036126556102202e-06, "loss": 0.063, "step": 2874 }, { "epoch": 2.05, "grad_norm": 5.472693277113262, "learning_rate": 5.033236476033838e-06, "loss": 0.042, "step": 2875 }, { "epoch": 2.05, "grad_norm": 5.360766469879016, "learning_rate": 5.0303463848605495e-06, "loss": 0.0457, "step": 2876 }, { "epoch": 2.05, "grad_norm": 16.162091546785724, "learning_rate": 5.027456283547969e-06, "loss": 0.0628, "step": 2877 }, { "epoch": 2.05, "grad_norm": 21.94294781666631, "learning_rate": 5.0245661730617344e-06, "loss": 0.0573, "step": 2878 }, { "epoch": 2.05, "grad_norm": 12.700207444520526, "learning_rate": 5.0216760543674855e-06, "loss": 0.0488, "step": 2879 }, { "epoch": 2.06, "grad_norm": 10.814485131817554, "learning_rate": 5.0187859284308635e-06, "loss": 0.059, "step": 2880 }, { "epoch": 2.06, "grad_norm": 19.09796261660822, "learning_rate": 5.015895796217514e-06, "loss": 0.056, "step": 2881 }, { "epoch": 2.06, "grad_norm": 21.58846874899979, "learning_rate": 5.013005658693083e-06, "loss": 0.0521, "step": 2882 }, { "epoch": 2.06, "grad_norm": 16.698159335405045, "learning_rate": 5.01011551682322e-06, "loss": 0.0566, "step": 2883 }, { "epoch": 2.06, "grad_norm": 14.016096004860056, "learning_rate": 5.007225371573573e-06, "loss": 0.0602, "step": 2884 }, { "epoch": 2.06, "grad_norm": 4.06089635411483, "learning_rate": 5.004335223909797e-06, "loss": 0.0458, "step": 2885 }, { "epoch": 2.06, "grad_norm": 13.265950504045813, "learning_rate": 5.0014450747975416e-06, "loss": 0.0705, "step": 2886 }, { "epoch": 2.06, "grad_norm": 19.71288313562799, "learning_rate": 4.998554925202459e-06, "loss": 0.0456, "step": 2887 }, { "epoch": 2.06, "grad_norm": 18.997859716152803, "learning_rate": 4.995664776090204e-06, "loss": 0.0703, "step": 2888 }, { "epoch": 2.06, "grad_norm": 12.003644856154164, "learning_rate": 4.9927746284264275e-06, "loss": 0.0547, "step": 2889 }, { "epoch": 2.06, "grad_norm": 5.8612356586077965, "learning_rate": 4.9898844831767826e-06, "loss": 0.0421, "step": 2890 }, { "epoch": 2.06, "grad_norm": 3.857541041219327, "learning_rate": 4.98699434130692e-06, "loss": 0.0409, "step": 2891 }, { "epoch": 2.06, "grad_norm": 27.220331399521893, "learning_rate": 4.984104203782488e-06, "loss": 0.0518, "step": 2892 }, { "epoch": 2.06, "grad_norm": 17.76518467751359, "learning_rate": 4.981214071569139e-06, "loss": 0.0568, "step": 2893 }, { "epoch": 2.07, "grad_norm": 6.706506867503759, "learning_rate": 4.978323945632515e-06, "loss": 0.0346, "step": 2894 }, { "epoch": 2.07, "grad_norm": 6.00943424052001, "learning_rate": 4.975433826938267e-06, "loss": 0.049, "step": 2895 }, { "epoch": 2.07, "grad_norm": 6.204979629155921, "learning_rate": 4.972543716452031e-06, "loss": 0.0735, "step": 2896 }, { "epoch": 2.07, "grad_norm": 10.632788181873034, "learning_rate": 4.969653615139452e-06, "loss": 0.0454, "step": 2897 }, { "epoch": 2.07, "grad_norm": 5.839103370356773, "learning_rate": 4.966763523966163e-06, "loss": 0.0974, "step": 2898 }, { "epoch": 2.07, "grad_norm": 10.849055825875318, "learning_rate": 4.963873443897799e-06, "loss": 0.0521, "step": 2899 }, { "epoch": 2.07, "grad_norm": 7.708000075587648, "learning_rate": 4.96098337589999e-06, "loss": 0.0355, "step": 2900 }, { "epoch": 2.07, "grad_norm": 5.723507585122161, "learning_rate": 4.958093320938358e-06, "loss": 0.0539, "step": 2901 }, { "epoch": 2.07, "grad_norm": 8.158944443681586, "learning_rate": 4.955203279978529e-06, "loss": 0.0671, "step": 2902 }, { "epoch": 2.07, "grad_norm": 9.759906642189957, "learning_rate": 4.952313253986114e-06, "loss": 0.0531, "step": 2903 }, { "epoch": 2.07, "grad_norm": 6.11064821441867, "learning_rate": 4.9494232439267296e-06, "loss": 0.0496, "step": 2904 }, { "epoch": 2.07, "grad_norm": 15.84805318936541, "learning_rate": 4.946533250765977e-06, "loss": 0.0455, "step": 2905 }, { "epoch": 2.07, "grad_norm": 12.764198896161808, "learning_rate": 4.943643275469461e-06, "loss": 0.038, "step": 2906 }, { "epoch": 2.07, "grad_norm": 8.974850741601227, "learning_rate": 4.940753319002773e-06, "loss": 0.053, "step": 2907 }, { "epoch": 2.08, "grad_norm": 13.75551037130389, "learning_rate": 4.937863382331504e-06, "loss": 0.0544, "step": 2908 }, { "epoch": 2.08, "grad_norm": 16.32267799964506, "learning_rate": 4.934973466421234e-06, "loss": 0.062, "step": 2909 }, { "epoch": 2.08, "grad_norm": 10.103817381967463, "learning_rate": 4.932083572237535e-06, "loss": 0.0406, "step": 2910 }, { "epoch": 2.08, "grad_norm": 6.213672979461658, "learning_rate": 4.92919370074598e-06, "loss": 0.0564, "step": 2911 }, { "epoch": 2.08, "grad_norm": 9.834401409500478, "learning_rate": 4.926303852912123e-06, "loss": 0.0533, "step": 2912 }, { "epoch": 2.08, "grad_norm": 6.141094774831048, "learning_rate": 4.9234140297015204e-06, "loss": 0.0361, "step": 2913 }, { "epoch": 2.08, "grad_norm": 17.040575294136367, "learning_rate": 4.920524232079712e-06, "loss": 0.06, "step": 2914 }, { "epoch": 2.08, "grad_norm": 18.436793853047202, "learning_rate": 4.917634461012238e-06, "loss": 0.0632, "step": 2915 }, { "epoch": 2.08, "grad_norm": 11.958023047209824, "learning_rate": 4.914744717464617e-06, "loss": 0.0414, "step": 2916 }, { "epoch": 2.08, "grad_norm": 6.374684872099406, "learning_rate": 4.911855002402375e-06, "loss": 0.0558, "step": 2917 }, { "epoch": 2.08, "grad_norm": 8.56698629233553, "learning_rate": 4.908965316791014e-06, "loss": 0.0278, "step": 2918 }, { "epoch": 2.08, "grad_norm": 8.288813639711439, "learning_rate": 4.906075661596031e-06, "loss": 0.0467, "step": 2919 }, { "epoch": 2.08, "grad_norm": 25.748623981042726, "learning_rate": 4.903186037782917e-06, "loss": 0.0627, "step": 2920 }, { "epoch": 2.08, "grad_norm": 4.1547283981098655, "learning_rate": 4.900296446317146e-06, "loss": 0.0426, "step": 2921 }, { "epoch": 2.09, "grad_norm": 11.060180181270248, "learning_rate": 4.897406888164187e-06, "loss": 0.0411, "step": 2922 }, { "epoch": 2.09, "grad_norm": 9.894982430864497, "learning_rate": 4.8945173642894915e-06, "loss": 0.0428, "step": 2923 }, { "epoch": 2.09, "grad_norm": 10.318786752951437, "learning_rate": 4.8916278756585074e-06, "loss": 0.038, "step": 2924 }, { "epoch": 2.09, "grad_norm": 23.226668767699405, "learning_rate": 4.888738423236664e-06, "loss": 0.0451, "step": 2925 }, { "epoch": 2.09, "grad_norm": 7.663805162430875, "learning_rate": 4.88584900798938e-06, "loss": 0.0311, "step": 2926 }, { "epoch": 2.09, "grad_norm": 11.873525027460229, "learning_rate": 4.882959630882066e-06, "loss": 0.0649, "step": 2927 }, { "epoch": 2.09, "grad_norm": 21.51580574874802, "learning_rate": 4.8800702928801124e-06, "loss": 0.0541, "step": 2928 }, { "epoch": 2.09, "grad_norm": 9.563175048514058, "learning_rate": 4.8771809949489056e-06, "loss": 0.0491, "step": 2929 }, { "epoch": 2.09, "grad_norm": 9.177795863776534, "learning_rate": 4.874291738053809e-06, "loss": 0.058, "step": 2930 }, { "epoch": 2.09, "grad_norm": 14.671314431344912, "learning_rate": 4.871402523160181e-06, "loss": 0.0529, "step": 2931 }, { "epoch": 2.09, "grad_norm": 8.21054929459398, "learning_rate": 4.868513351233359e-06, "loss": 0.0407, "step": 2932 }, { "epoch": 2.09, "grad_norm": 14.943488995962824, "learning_rate": 4.865624223238672e-06, "loss": 0.0537, "step": 2933 }, { "epoch": 2.09, "grad_norm": 9.285607421042759, "learning_rate": 4.862735140141428e-06, "loss": 0.0526, "step": 2934 }, { "epoch": 2.09, "grad_norm": 12.545169261685402, "learning_rate": 4.859846102906927e-06, "loss": 0.0347, "step": 2935 }, { "epoch": 2.1, "grad_norm": 6.418869884194012, "learning_rate": 4.856957112500446e-06, "loss": 0.0605, "step": 2936 }, { "epoch": 2.1, "grad_norm": 10.91319808557055, "learning_rate": 4.854068169887254e-06, "loss": 0.052, "step": 2937 }, { "epoch": 2.1, "grad_norm": 7.162112299173172, "learning_rate": 4.851179276032598e-06, "loss": 0.0621, "step": 2938 }, { "epoch": 2.1, "grad_norm": 17.64808170091903, "learning_rate": 4.848290431901712e-06, "loss": 0.0616, "step": 2939 }, { "epoch": 2.1, "grad_norm": 5.721241146237817, "learning_rate": 4.845401638459813e-06, "loss": 0.0644, "step": 2940 }, { "epoch": 2.1, "grad_norm": 5.215058023030627, "learning_rate": 4.8425128966721e-06, "loss": 0.0445, "step": 2941 }, { "epoch": 2.1, "grad_norm": 8.470561168753651, "learning_rate": 4.8396242075037555e-06, "loss": 0.0458, "step": 2942 }, { "epoch": 2.1, "grad_norm": 12.193270887378144, "learning_rate": 4.836735571919946e-06, "loss": 0.04, "step": 2943 }, { "epoch": 2.1, "grad_norm": 6.51982708722715, "learning_rate": 4.833846990885813e-06, "loss": 0.0451, "step": 2944 }, { "epoch": 2.1, "grad_norm": 14.275220224337577, "learning_rate": 4.830958465366492e-06, "loss": 0.06, "step": 2945 }, { "epoch": 2.1, "grad_norm": 14.485618936801512, "learning_rate": 4.828069996327088e-06, "loss": 0.0452, "step": 2946 }, { "epoch": 2.1, "grad_norm": 27.79698221697965, "learning_rate": 4.825181584732695e-06, "loss": 0.0803, "step": 2947 }, { "epoch": 2.1, "grad_norm": 7.760801102621836, "learning_rate": 4.822293231548382e-06, "loss": 0.051, "step": 2948 }, { "epoch": 2.1, "grad_norm": 18.186535510480933, "learning_rate": 4.819404937739205e-06, "loss": 0.0798, "step": 2949 }, { "epoch": 2.11, "grad_norm": 13.657062142670078, "learning_rate": 4.816516704270194e-06, "loss": 0.0419, "step": 2950 }, { "epoch": 2.11, "grad_norm": 17.763665719452103, "learning_rate": 4.813628532106363e-06, "loss": 0.041, "step": 2951 }, { "epoch": 2.11, "grad_norm": 17.213024245970015, "learning_rate": 4.810740422212705e-06, "loss": 0.0514, "step": 2952 }, { "epoch": 2.11, "grad_norm": 4.415365069317704, "learning_rate": 4.807852375554188e-06, "loss": 0.0439, "step": 2953 }, { "epoch": 2.11, "grad_norm": 4.156693282692543, "learning_rate": 4.804964393095765e-06, "loss": 0.0477, "step": 2954 }, { "epoch": 2.11, "grad_norm": 11.394625354113346, "learning_rate": 4.802076475802362e-06, "loss": 0.0494, "step": 2955 }, { "epoch": 2.11, "grad_norm": 3.3842660382835024, "learning_rate": 4.799188624638889e-06, "loss": 0.0276, "step": 2956 }, { "epoch": 2.11, "grad_norm": 4.446181745984596, "learning_rate": 4.796300840570227e-06, "loss": 0.0522, "step": 2957 }, { "epoch": 2.11, "grad_norm": 15.29577470926494, "learning_rate": 4.793413124561243e-06, "loss": 0.0464, "step": 2958 }, { "epoch": 2.11, "grad_norm": 8.414125335615022, "learning_rate": 4.790525477576773e-06, "loss": 0.0498, "step": 2959 }, { "epoch": 2.11, "grad_norm": 4.031180542596354, "learning_rate": 4.7876379005816325e-06, "loss": 0.032, "step": 2960 }, { "epoch": 2.11, "grad_norm": 3.6969083162862804, "learning_rate": 4.784750394540619e-06, "loss": 0.0463, "step": 2961 }, { "epoch": 2.11, "grad_norm": 13.339709877305827, "learning_rate": 4.781862960418498e-06, "loss": 0.0376, "step": 2962 }, { "epoch": 2.11, "grad_norm": 18.500857965632857, "learning_rate": 4.778975599180019e-06, "loss": 0.0535, "step": 2963 }, { "epoch": 2.12, "grad_norm": 5.964132136696635, "learning_rate": 4.776088311789897e-06, "loss": 0.0497, "step": 2964 }, { "epoch": 2.12, "grad_norm": 4.411962079455377, "learning_rate": 4.773201099212835e-06, "loss": 0.0402, "step": 2965 }, { "epoch": 2.12, "grad_norm": 8.713256952124864, "learning_rate": 4.770313962413499e-06, "loss": 0.0499, "step": 2966 }, { "epoch": 2.12, "grad_norm": 14.98799544031958, "learning_rate": 4.767426902356539e-06, "loss": 0.0379, "step": 2967 }, { "epoch": 2.12, "grad_norm": 6.230504284482281, "learning_rate": 4.7645399200065745e-06, "loss": 0.0564, "step": 2968 }, { "epoch": 2.12, "grad_norm": 7.904911066689615, "learning_rate": 4.761653016328197e-06, "loss": 0.0477, "step": 2969 }, { "epoch": 2.12, "grad_norm": 15.071860947840747, "learning_rate": 4.758766192285979e-06, "loss": 0.0497, "step": 2970 }, { "epoch": 2.12, "grad_norm": 21.895501390170683, "learning_rate": 4.755879448844458e-06, "loss": 0.0709, "step": 2971 }, { "epoch": 2.12, "grad_norm": 13.52662158776065, "learning_rate": 4.752992786968153e-06, "loss": 0.0518, "step": 2972 }, { "epoch": 2.12, "grad_norm": 18.273989464770818, "learning_rate": 4.750106207621546e-06, "loss": 0.0556, "step": 2973 }, { "epoch": 2.12, "grad_norm": 12.57628275065163, "learning_rate": 4.747219711769103e-06, "loss": 0.0445, "step": 2974 }, { "epoch": 2.12, "grad_norm": 8.238153462757012, "learning_rate": 4.74433330037525e-06, "loss": 0.0358, "step": 2975 }, { "epoch": 2.12, "grad_norm": 5.815107502108913, "learning_rate": 4.741446974404396e-06, "loss": 0.0553, "step": 2976 }, { "epoch": 2.12, "grad_norm": 15.551938145304423, "learning_rate": 4.738560734820914e-06, "loss": 0.0397, "step": 2977 }, { "epoch": 2.13, "grad_norm": 11.898776772774422, "learning_rate": 4.735674582589147e-06, "loss": 0.0439, "step": 2978 }, { "epoch": 2.13, "grad_norm": 10.619558463221429, "learning_rate": 4.732788518673418e-06, "loss": 0.0492, "step": 2979 }, { "epoch": 2.13, "grad_norm": 9.889398543545939, "learning_rate": 4.729902544038009e-06, "loss": 0.0441, "step": 2980 }, { "epoch": 2.13, "grad_norm": 5.05100282019345, "learning_rate": 4.7270166596471825e-06, "loss": 0.0447, "step": 2981 }, { "epoch": 2.13, "grad_norm": 17.41976116286379, "learning_rate": 4.724130866465163e-06, "loss": 0.0475, "step": 2982 }, { "epoch": 2.13, "grad_norm": 11.266177641235146, "learning_rate": 4.721245165456149e-06, "loss": 0.0494, "step": 2983 }, { "epoch": 2.13, "grad_norm": 11.20597565365233, "learning_rate": 4.7183595575843055e-06, "loss": 0.069, "step": 2984 }, { "epoch": 2.13, "grad_norm": 5.259086726470608, "learning_rate": 4.715474043813771e-06, "loss": 0.0356, "step": 2985 }, { "epoch": 2.13, "grad_norm": 5.080850839428504, "learning_rate": 4.712588625108645e-06, "loss": 0.0481, "step": 2986 }, { "epoch": 2.13, "grad_norm": 7.956793520224343, "learning_rate": 4.709703302433003e-06, "loss": 0.0567, "step": 2987 }, { "epoch": 2.13, "grad_norm": 21.843191213401248, "learning_rate": 4.706818076750883e-06, "loss": 0.0645, "step": 2988 }, { "epoch": 2.13, "grad_norm": 18.694211673739613, "learning_rate": 4.703932949026291e-06, "loss": 0.0492, "step": 2989 }, { "epoch": 2.13, "grad_norm": 15.151416311688783, "learning_rate": 4.701047920223207e-06, "loss": 0.0446, "step": 2990 }, { "epoch": 2.13, "grad_norm": 10.890580913948755, "learning_rate": 4.6981629913055674e-06, "loss": 0.0569, "step": 2991 }, { "epoch": 2.14, "grad_norm": 7.892486306024457, "learning_rate": 4.695278163237284e-06, "loss": 0.0312, "step": 2992 }, { "epoch": 2.14, "grad_norm": 12.299530495762703, "learning_rate": 4.692393436982229e-06, "loss": 0.0308, "step": 2993 }, { "epoch": 2.14, "grad_norm": 9.153777049929731, "learning_rate": 4.689508813504246e-06, "loss": 0.0351, "step": 2994 }, { "epoch": 2.14, "grad_norm": 3.108518173770672, "learning_rate": 4.686624293767138e-06, "loss": 0.0349, "step": 2995 }, { "epoch": 2.14, "grad_norm": 17.853184369338187, "learning_rate": 4.683739878734678e-06, "loss": 0.0641, "step": 2996 }, { "epoch": 2.14, "grad_norm": 6.466717945483939, "learning_rate": 4.6808555693706045e-06, "loss": 0.0453, "step": 2997 }, { "epoch": 2.14, "grad_norm": 4.829172386888312, "learning_rate": 4.677971366638616e-06, "loss": 0.0391, "step": 2998 }, { "epoch": 2.14, "grad_norm": 6.808172030984788, "learning_rate": 4.67508727150238e-06, "loss": 0.0492, "step": 2999 }, { "epoch": 2.14, "grad_norm": 6.156095884400888, "learning_rate": 4.672203284925525e-06, "loss": 0.0414, "step": 3000 }, { "epoch": 2.14, "eval_avg_AUC": 0.8227856069027939, "eval_avg_Accuracy": 0.7180039787798409, "eval_avg_Accuracy-right": 0.9098734837615756, "eval_avg_Accuracy-wrong": 0.3834432567659768, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.6838678458638056, "eval_last_AUC": 0.8253504247874757, "eval_last_Accuracy": 0.7611903183023873, "eval_last_Accuracy-right": 0.873288117907917, "eval_last_Accuracy-wrong": 0.5657266317944053, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.6932099187920158, "eval_max_AUC": 0.7682526969266754, "eval_max_Accuracy": 0.6413710212201591, "eval_max_Accuracy-right": 0.9846745793661145, "eval_max_Accuracy-wrong": 0.042756424835114853, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.6326595141662732, "eval_min_AUC": 0.8253761205386874, "eval_min_Accuracy": 0.7610245358090185, "eval_min_Accuracy-right": 0.808921351245598, "eval_min_Accuracy-wrong": 0.6775073914032295, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.6877850984496956, "eval_prod_AUC": 0.8274092551394246, "eval_prod_Accuracy": 0.7385195623342176, "eval_prod_Accuracy-right": 0.690556932307291, "eval_prod_Accuracy-wrong": 0.8221514669092563, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.6858525438813741, "eval_runtime": 246.8659, "eval_samples_per_second": 97.737, "eval_steps_per_second": 3.054, "eval_sum_AUC": 0.6815349093354525, "eval_sum_Accuracy": 0.6375994694960212, "eval_sum_Accuracy-right": 0.9964784139820008, "eval_sum_Accuracy-wrong": 0.011826245167159426, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.6539070349976671, "step": 3000 }, { "epoch": 2.14, "grad_norm": 7.733951582678205, "learning_rate": 4.669319407871647e-06, "loss": 0.0526, "step": 3001 }, { "epoch": 2.14, "grad_norm": 13.960495751854996, "learning_rate": 4.666435641304301e-06, "loss": 0.046, "step": 3002 }, { "epoch": 2.14, "grad_norm": 8.77619391019296, "learning_rate": 4.663551986187006e-06, "loss": 0.0649, "step": 3003 }, { "epoch": 2.14, "grad_norm": 6.884626530449051, "learning_rate": 4.660668443483248e-06, "loss": 0.0432, "step": 3004 }, { "epoch": 2.14, "grad_norm": 5.3549337785724465, "learning_rate": 4.657785014156468e-06, "loss": 0.0498, "step": 3005 }, { "epoch": 2.15, "grad_norm": 10.946093760776968, "learning_rate": 4.654901699170077e-06, "loss": 0.0597, "step": 3006 }, { "epoch": 2.15, "grad_norm": 15.763447770059578, "learning_rate": 4.652018499487442e-06, "loss": 0.0491, "step": 3007 }, { "epoch": 2.15, "grad_norm": 5.809896767903205, "learning_rate": 4.649135416071896e-06, "loss": 0.061, "step": 3008 }, { "epoch": 2.15, "grad_norm": 15.298958843270297, "learning_rate": 4.646252449886727e-06, "loss": 0.0544, "step": 3009 }, { "epoch": 2.15, "grad_norm": 10.962570611172566, "learning_rate": 4.6433696018951915e-06, "loss": 0.0578, "step": 3010 }, { "epoch": 2.15, "grad_norm": 8.441821768445967, "learning_rate": 4.640486873060501e-06, "loss": 0.0522, "step": 3011 }, { "epoch": 2.15, "grad_norm": 12.827070504932639, "learning_rate": 4.6376042643458254e-06, "loss": 0.0376, "step": 3012 }, { "epoch": 2.15, "grad_norm": 9.437577892174438, "learning_rate": 4.634721776714305e-06, "loss": 0.0455, "step": 3013 }, { "epoch": 2.15, "grad_norm": 9.188265760228681, "learning_rate": 4.631839411129025e-06, "loss": 0.061, "step": 3014 }, { "epoch": 2.15, "grad_norm": 9.140831955733349, "learning_rate": 4.628957168553044e-06, "loss": 0.0463, "step": 3015 }, { "epoch": 2.15, "grad_norm": 6.621373831056885, "learning_rate": 4.6260750499493665e-06, "loss": 0.0577, "step": 3016 }, { "epoch": 2.15, "grad_norm": 16.423887046208293, "learning_rate": 4.623193056280968e-06, "loss": 0.0422, "step": 3017 }, { "epoch": 2.15, "grad_norm": 8.94491278041153, "learning_rate": 4.6203111885107735e-06, "loss": 0.0443, "step": 3018 }, { "epoch": 2.15, "grad_norm": 10.518796763313492, "learning_rate": 4.617429447601665e-06, "loss": 0.0434, "step": 3019 }, { "epoch": 2.16, "grad_norm": 5.614949678699768, "learning_rate": 4.614547834516492e-06, "loss": 0.0467, "step": 3020 }, { "epoch": 2.16, "grad_norm": 6.229969137081133, "learning_rate": 4.6116663502180495e-06, "loss": 0.0798, "step": 3021 }, { "epoch": 2.16, "grad_norm": 5.066868988185442, "learning_rate": 4.6087849956691e-06, "loss": 0.0394, "step": 3022 }, { "epoch": 2.16, "grad_norm": 11.331501605212363, "learning_rate": 4.605903771832353e-06, "loss": 0.0513, "step": 3023 }, { "epoch": 2.16, "grad_norm": 5.839781229557823, "learning_rate": 4.603022679670482e-06, "loss": 0.041, "step": 3024 }, { "epoch": 2.16, "grad_norm": 14.332881398684489, "learning_rate": 4.6001417201461114e-06, "loss": 0.0539, "step": 3025 }, { "epoch": 2.16, "grad_norm": 16.291207017760183, "learning_rate": 4.597260894221826e-06, "loss": 0.0546, "step": 3026 }, { "epoch": 2.16, "grad_norm": 11.921642342100183, "learning_rate": 4.594380202860162e-06, "loss": 0.0542, "step": 3027 }, { "epoch": 2.16, "grad_norm": 13.932638931859428, "learning_rate": 4.5914996470236094e-06, "loss": 0.0531, "step": 3028 }, { "epoch": 2.16, "grad_norm": 6.7911004552180465, "learning_rate": 4.588619227674619e-06, "loss": 0.031, "step": 3029 }, { "epoch": 2.16, "grad_norm": 19.56600048222998, "learning_rate": 4.58573894577559e-06, "loss": 0.0653, "step": 3030 }, { "epoch": 2.16, "grad_norm": 3.7997903160804594, "learning_rate": 4.5828588022888815e-06, "loss": 0.0432, "step": 3031 }, { "epoch": 2.16, "grad_norm": 4.36456708146258, "learning_rate": 4.5799787981767975e-06, "loss": 0.0533, "step": 3032 }, { "epoch": 2.16, "grad_norm": 12.061053723608204, "learning_rate": 4.577098934401607e-06, "loss": 0.0577, "step": 3033 }, { "epoch": 2.17, "grad_norm": 5.047403317404933, "learning_rate": 4.57421921192552e-06, "loss": 0.043, "step": 3034 }, { "epoch": 2.17, "grad_norm": 6.587631210630952, "learning_rate": 4.5713396317107115e-06, "loss": 0.0443, "step": 3035 }, { "epoch": 2.17, "grad_norm": 5.787472009592416, "learning_rate": 4.568460194719299e-06, "loss": 0.0341, "step": 3036 }, { "epoch": 2.17, "grad_norm": 6.7437175989326965, "learning_rate": 4.565580901913356e-06, "loss": 0.0533, "step": 3037 }, { "epoch": 2.17, "grad_norm": 4.747331536456431, "learning_rate": 4.562701754254909e-06, "loss": 0.0473, "step": 3038 }, { "epoch": 2.17, "grad_norm": 5.768433537381566, "learning_rate": 4.559822752705933e-06, "loss": 0.0473, "step": 3039 }, { "epoch": 2.17, "grad_norm": 3.3083313751743146, "learning_rate": 4.556943898228358e-06, "loss": 0.0335, "step": 3040 }, { "epoch": 2.17, "grad_norm": 14.995712674161572, "learning_rate": 4.55406519178406e-06, "loss": 0.0591, "step": 3041 }, { "epoch": 2.17, "grad_norm": 12.571671708096863, "learning_rate": 4.551186634334873e-06, "loss": 0.0516, "step": 3042 }, { "epoch": 2.17, "grad_norm": 3.8361711880156655, "learning_rate": 4.54830822684257e-06, "loss": 0.0463, "step": 3043 }, { "epoch": 2.17, "grad_norm": 6.242235542738953, "learning_rate": 4.545429970268888e-06, "loss": 0.0573, "step": 3044 }, { "epoch": 2.17, "grad_norm": 2.9572838572416, "learning_rate": 4.542551865575499e-06, "loss": 0.0381, "step": 3045 }, { "epoch": 2.17, "grad_norm": 7.465321058449582, "learning_rate": 4.539673913724037e-06, "loss": 0.0504, "step": 3046 }, { "epoch": 2.17, "grad_norm": 13.582177654022352, "learning_rate": 4.5367961156760745e-06, "loss": 0.0642, "step": 3047 }, { "epoch": 2.18, "grad_norm": 13.106849831794028, "learning_rate": 4.533918472393141e-06, "loss": 0.0544, "step": 3048 }, { "epoch": 2.18, "grad_norm": 8.258854894812476, "learning_rate": 4.531040984836708e-06, "loss": 0.0398, "step": 3049 }, { "epoch": 2.18, "grad_norm": 6.1961201449818875, "learning_rate": 4.5281636539682e-06, "loss": 0.0563, "step": 3050 }, { "epoch": 2.18, "grad_norm": 14.114915646585992, "learning_rate": 4.5252864807489836e-06, "loss": 0.0525, "step": 3051 }, { "epoch": 2.18, "grad_norm": 3.184899979478378, "learning_rate": 4.522409466140379e-06, "loss": 0.0417, "step": 3052 }, { "epoch": 2.18, "grad_norm": 15.445103578110677, "learning_rate": 4.5195326111036475e-06, "loss": 0.0515, "step": 3053 }, { "epoch": 2.18, "grad_norm": 5.519785034152788, "learning_rate": 4.5166559166000035e-06, "loss": 0.0558, "step": 3054 }, { "epoch": 2.18, "grad_norm": 11.233134348940904, "learning_rate": 4.513779383590599e-06, "loss": 0.0532, "step": 3055 }, { "epoch": 2.18, "grad_norm": 6.353767355142566, "learning_rate": 4.510903013036542e-06, "loss": 0.0411, "step": 3056 }, { "epoch": 2.18, "grad_norm": 5.602659776333459, "learning_rate": 4.508026805898878e-06, "loss": 0.0474, "step": 3057 }, { "epoch": 2.18, "grad_norm": 3.55271578684028, "learning_rate": 4.505150763138604e-06, "loss": 0.0438, "step": 3058 }, { "epoch": 2.18, "grad_norm": 5.4020523865517065, "learning_rate": 4.502274885716656e-06, "loss": 0.0441, "step": 3059 }, { "epoch": 2.18, "grad_norm": 4.974542469049749, "learning_rate": 4.499399174593923e-06, "loss": 0.038, "step": 3060 }, { "epoch": 2.18, "grad_norm": 13.183234894928061, "learning_rate": 4.496523630731229e-06, "loss": 0.0536, "step": 3061 }, { "epoch": 2.19, "grad_norm": 7.012776400774284, "learning_rate": 4.493648255089347e-06, "loss": 0.053, "step": 3062 }, { "epoch": 2.19, "grad_norm": 5.113694599229363, "learning_rate": 4.490773048628997e-06, "loss": 0.0441, "step": 3063 }, { "epoch": 2.19, "grad_norm": 6.999872520790414, "learning_rate": 4.487898012310834e-06, "loss": 0.0551, "step": 3064 }, { "epoch": 2.19, "grad_norm": 8.988865058697366, "learning_rate": 4.485023147095466e-06, "loss": 0.0759, "step": 3065 }, { "epoch": 2.19, "grad_norm": 8.608399626100027, "learning_rate": 4.482148453943434e-06, "loss": 0.0555, "step": 3066 }, { "epoch": 2.19, "grad_norm": 5.657772658940335, "learning_rate": 4.479273933815232e-06, "loss": 0.0548, "step": 3067 }, { "epoch": 2.19, "grad_norm": 5.243732810688884, "learning_rate": 4.476399587671285e-06, "loss": 0.0759, "step": 3068 }, { "epoch": 2.19, "grad_norm": 17.83601854064182, "learning_rate": 4.47352541647197e-06, "loss": 0.0443, "step": 3069 }, { "epoch": 2.19, "grad_norm": 8.347841250675605, "learning_rate": 4.470651421177599e-06, "loss": 0.0452, "step": 3070 }, { "epoch": 2.19, "grad_norm": 7.0436872734018765, "learning_rate": 4.467777602748425e-06, "loss": 0.0431, "step": 3071 }, { "epoch": 2.19, "grad_norm": 24.860758320056135, "learning_rate": 4.4649039621446495e-06, "loss": 0.0797, "step": 3072 }, { "epoch": 2.19, "grad_norm": 11.719607525022566, "learning_rate": 4.462030500326403e-06, "loss": 0.0443, "step": 3073 }, { "epoch": 2.19, "grad_norm": 11.724827346428865, "learning_rate": 4.459157218253769e-06, "loss": 0.0717, "step": 3074 }, { "epoch": 2.19, "grad_norm": 13.934654419830443, "learning_rate": 4.456284116886758e-06, "loss": 0.0496, "step": 3075 }, { "epoch": 2.2, "grad_norm": 7.660664913105194, "learning_rate": 4.453411197185334e-06, "loss": 0.0442, "step": 3076 }, { "epoch": 2.2, "grad_norm": 24.60532296154118, "learning_rate": 4.450538460109384e-06, "loss": 0.0543, "step": 3077 }, { "epoch": 2.2, "grad_norm": 9.440853316535325, "learning_rate": 4.447665906618751e-06, "loss": 0.037, "step": 3078 }, { "epoch": 2.2, "grad_norm": 2.5094571457438564, "learning_rate": 4.444793537673204e-06, "loss": 0.0406, "step": 3079 }, { "epoch": 2.2, "grad_norm": 14.051552491510163, "learning_rate": 4.441921354232455e-06, "loss": 0.0598, "step": 3080 }, { "epoch": 2.2, "grad_norm": 4.601130148270938, "learning_rate": 4.439049357256156e-06, "loss": 0.0391, "step": 3081 }, { "epoch": 2.2, "grad_norm": 16.768683231807135, "learning_rate": 4.436177547703891e-06, "loss": 0.0461, "step": 3082 }, { "epoch": 2.2, "grad_norm": 14.287404647048588, "learning_rate": 4.433305926535189e-06, "loss": 0.0694, "step": 3083 }, { "epoch": 2.2, "grad_norm": 10.160614532165187, "learning_rate": 4.430434494709509e-06, "loss": 0.0536, "step": 3084 }, { "epoch": 2.2, "grad_norm": 15.080982562292157, "learning_rate": 4.427563253186253e-06, "loss": 0.0343, "step": 3085 }, { "epoch": 2.2, "grad_norm": 5.675753477001895, "learning_rate": 4.424692202924754e-06, "loss": 0.0356, "step": 3086 }, { "epoch": 2.2, "grad_norm": 5.716909777073017, "learning_rate": 4.421821344884281e-06, "loss": 0.0341, "step": 3087 }, { "epoch": 2.2, "grad_norm": 4.431358994048849, "learning_rate": 4.418950680024046e-06, "loss": 0.0457, "step": 3088 }, { "epoch": 2.2, "grad_norm": 6.596427500698859, "learning_rate": 4.416080209303187e-06, "loss": 0.0395, "step": 3089 }, { "epoch": 2.21, "grad_norm": 7.187035201109175, "learning_rate": 4.413209933680786e-06, "loss": 0.0556, "step": 3090 }, { "epoch": 2.21, "grad_norm": 15.759502614264681, "learning_rate": 4.410339854115849e-06, "loss": 0.0658, "step": 3091 }, { "epoch": 2.21, "grad_norm": 8.965449381966314, "learning_rate": 4.407469971567331e-06, "loss": 0.0542, "step": 3092 }, { "epoch": 2.21, "grad_norm": 13.279415208911773, "learning_rate": 4.4046002869941055e-06, "loss": 0.0518, "step": 3093 }, { "epoch": 2.21, "grad_norm": 5.9208827638453485, "learning_rate": 4.401730801354994e-06, "loss": 0.0453, "step": 3094 }, { "epoch": 2.21, "grad_norm": 4.575198553252729, "learning_rate": 4.39886151560874e-06, "loss": 0.0413, "step": 3095 }, { "epoch": 2.21, "grad_norm": 3.725306026650709, "learning_rate": 4.395992430714028e-06, "loss": 0.0352, "step": 3096 }, { "epoch": 2.21, "grad_norm": 10.572986077833214, "learning_rate": 4.393123547629472e-06, "loss": 0.0573, "step": 3097 }, { "epoch": 2.21, "grad_norm": 4.788737593661607, "learning_rate": 4.390254867313619e-06, "loss": 0.049, "step": 3098 }, { "epoch": 2.21, "grad_norm": 14.988650115590175, "learning_rate": 4.387386390724947e-06, "loss": 0.041, "step": 3099 }, { "epoch": 2.21, "grad_norm": 15.407848641378886, "learning_rate": 4.38451811882187e-06, "loss": 0.0631, "step": 3100 }, { "epoch": 2.21, "grad_norm": 10.332737742350625, "learning_rate": 4.3816500525627284e-06, "loss": 0.0542, "step": 3101 }, { "epoch": 2.21, "grad_norm": 3.3046432737592766, "learning_rate": 4.3787821929057985e-06, "loss": 0.0524, "step": 3102 }, { "epoch": 2.21, "grad_norm": 21.62698520171683, "learning_rate": 4.3759145408092855e-06, "loss": 0.0541, "step": 3103 }, { "epoch": 2.22, "grad_norm": 6.318176228409699, "learning_rate": 4.373047097231324e-06, "loss": 0.047, "step": 3104 }, { "epoch": 2.22, "grad_norm": 5.161442887353655, "learning_rate": 4.370179863129979e-06, "loss": 0.0438, "step": 3105 }, { "epoch": 2.22, "grad_norm": 7.9377251511338205, "learning_rate": 4.367312839463251e-06, "loss": 0.0496, "step": 3106 }, { "epoch": 2.22, "grad_norm": 5.3131144070684595, "learning_rate": 4.3644460271890614e-06, "loss": 0.0415, "step": 3107 }, { "epoch": 2.22, "grad_norm": 8.363837704096479, "learning_rate": 4.361579427265268e-06, "loss": 0.0644, "step": 3108 }, { "epoch": 2.22, "grad_norm": 8.313467422358402, "learning_rate": 4.358713040649654e-06, "loss": 0.0499, "step": 3109 }, { "epoch": 2.22, "grad_norm": 3.1434368387419873, "learning_rate": 4.3558468682999336e-06, "loss": 0.0358, "step": 3110 }, { "epoch": 2.22, "grad_norm": 5.111253029616996, "learning_rate": 4.352980911173747e-06, "loss": 0.0492, "step": 3111 }, { "epoch": 2.22, "grad_norm": 8.357180537855855, "learning_rate": 4.350115170228664e-06, "loss": 0.0505, "step": 3112 }, { "epoch": 2.22, "grad_norm": 7.2716322758839755, "learning_rate": 4.3472496464221845e-06, "loss": 0.0474, "step": 3113 }, { "epoch": 2.22, "grad_norm": 7.2404915807490315, "learning_rate": 4.344384340711728e-06, "loss": 0.0434, "step": 3114 }, { "epoch": 2.22, "grad_norm": 5.148940982244169, "learning_rate": 4.341519254054651e-06, "loss": 0.0416, "step": 3115 }, { "epoch": 2.22, "grad_norm": 5.229546559770345, "learning_rate": 4.338654387408229e-06, "loss": 0.0396, "step": 3116 }, { "epoch": 2.22, "grad_norm": 20.48430154530679, "learning_rate": 4.335789741729671e-06, "loss": 0.0515, "step": 3117 }, { "epoch": 2.23, "grad_norm": 10.359553267941436, "learning_rate": 4.332925317976104e-06, "loss": 0.0451, "step": 3118 }, { "epoch": 2.23, "grad_norm": 12.04521250763658, "learning_rate": 4.330061117104589e-06, "loss": 0.0395, "step": 3119 }, { "epoch": 2.23, "grad_norm": 9.16643051448259, "learning_rate": 4.327197140072108e-06, "loss": 0.0539, "step": 3120 }, { "epoch": 2.23, "grad_norm": 11.188804325737475, "learning_rate": 4.324333387835565e-06, "loss": 0.048, "step": 3121 }, { "epoch": 2.23, "grad_norm": 10.924204030628253, "learning_rate": 4.321469861351799e-06, "loss": 0.0456, "step": 3122 }, { "epoch": 2.23, "grad_norm": 12.396543885172143, "learning_rate": 4.318606561577562e-06, "loss": 0.0711, "step": 3123 }, { "epoch": 2.23, "grad_norm": 7.358079264086069, "learning_rate": 4.31574348946954e-06, "loss": 0.0432, "step": 3124 }, { "epoch": 2.23, "grad_norm": 6.63910667690175, "learning_rate": 4.312880645984334e-06, "loss": 0.0473, "step": 3125 }, { "epoch": 2.23, "grad_norm": 6.926846866544162, "learning_rate": 4.310018032078479e-06, "loss": 0.0427, "step": 3126 }, { "epoch": 2.23, "grad_norm": 5.348927273635156, "learning_rate": 4.307155648708421e-06, "loss": 0.0398, "step": 3127 }, { "epoch": 2.23, "grad_norm": 8.293095802027796, "learning_rate": 4.304293496830542e-06, "loss": 0.0552, "step": 3128 }, { "epoch": 2.23, "grad_norm": 5.164416626873579, "learning_rate": 4.301431577401136e-06, "loss": 0.0558, "step": 3129 }, { "epoch": 2.23, "grad_norm": 10.31913313745745, "learning_rate": 4.298569891376423e-06, "loss": 0.0557, "step": 3130 }, { "epoch": 2.23, "grad_norm": 11.861633104275379, "learning_rate": 4.2957084397125496e-06, "loss": 0.0438, "step": 3131 }, { "epoch": 2.24, "grad_norm": 11.33706132619583, "learning_rate": 4.292847223365574e-06, "loss": 0.0434, "step": 3132 }, { "epoch": 2.24, "grad_norm": 15.917688181318276, "learning_rate": 4.289986243291488e-06, "loss": 0.0623, "step": 3133 }, { "epoch": 2.24, "grad_norm": 4.856782780631129, "learning_rate": 4.287125500446193e-06, "loss": 0.0336, "step": 3134 }, { "epoch": 2.24, "grad_norm": 16.31108504711644, "learning_rate": 4.284264995785521e-06, "loss": 0.0607, "step": 3135 }, { "epoch": 2.24, "grad_norm": 9.762647669216687, "learning_rate": 4.2814047302652155e-06, "loss": 0.0361, "step": 3136 }, { "epoch": 2.24, "grad_norm": 8.749071414210514, "learning_rate": 4.278544704840948e-06, "loss": 0.0448, "step": 3137 }, { "epoch": 2.24, "grad_norm": 6.27371687383982, "learning_rate": 4.275684920468306e-06, "loss": 0.0473, "step": 3138 }, { "epoch": 2.24, "grad_norm": 4.534335338717934, "learning_rate": 4.272825378102791e-06, "loss": 0.0475, "step": 3139 }, { "epoch": 2.24, "grad_norm": 6.9101311343506655, "learning_rate": 4.269966078699836e-06, "loss": 0.0396, "step": 3140 }, { "epoch": 2.24, "grad_norm": 3.184600894702316, "learning_rate": 4.267107023214782e-06, "loss": 0.0396, "step": 3141 }, { "epoch": 2.24, "grad_norm": 10.089000430219308, "learning_rate": 4.264248212602896e-06, "loss": 0.0527, "step": 3142 }, { "epoch": 2.24, "grad_norm": 6.60955809699945, "learning_rate": 4.261389647819355e-06, "loss": 0.0446, "step": 3143 }, { "epoch": 2.24, "grad_norm": 13.226538812036294, "learning_rate": 4.258531329819264e-06, "loss": 0.0481, "step": 3144 }, { "epoch": 2.24, "grad_norm": 10.865699000414827, "learning_rate": 4.255673259557636e-06, "loss": 0.0531, "step": 3145 }, { "epoch": 2.25, "grad_norm": 14.364449970953885, "learning_rate": 4.252815437989408e-06, "loss": 0.055, "step": 3146 }, { "epoch": 2.25, "grad_norm": 9.573885380724388, "learning_rate": 4.24995786606943e-06, "loss": 0.0589, "step": 3147 }, { "epoch": 2.25, "grad_norm": 20.58280975888398, "learning_rate": 4.24710054475247e-06, "loss": 0.0551, "step": 3148 }, { "epoch": 2.25, "grad_norm": 4.894893628781143, "learning_rate": 4.244243474993214e-06, "loss": 0.0312, "step": 3149 }, { "epoch": 2.25, "grad_norm": 10.294083577388148, "learning_rate": 4.241386657746257e-06, "loss": 0.0512, "step": 3150 }, { "epoch": 2.25, "grad_norm": 9.884297976636462, "learning_rate": 4.2385300939661215e-06, "loss": 0.0698, "step": 3151 }, { "epoch": 2.25, "grad_norm": 20.580492702452048, "learning_rate": 4.2356737846072326e-06, "loss": 0.0423, "step": 3152 }, { "epoch": 2.25, "grad_norm": 10.3250929805605, "learning_rate": 4.232817730623941e-06, "loss": 0.0622, "step": 3153 }, { "epoch": 2.25, "grad_norm": 7.020014929183152, "learning_rate": 4.229961932970505e-06, "loss": 0.029, "step": 3154 }, { "epoch": 2.25, "grad_norm": 7.624072768428424, "learning_rate": 4.2271063926010995e-06, "loss": 0.0357, "step": 3155 }, { "epoch": 2.25, "grad_norm": 10.226850797566772, "learning_rate": 4.224251110469814e-06, "loss": 0.0622, "step": 3156 }, { "epoch": 2.25, "grad_norm": 6.615838261687149, "learning_rate": 4.221396087530652e-06, "loss": 0.056, "step": 3157 }, { "epoch": 2.25, "grad_norm": 8.170230954284976, "learning_rate": 4.218541324737529e-06, "loss": 0.0552, "step": 3158 }, { "epoch": 2.25, "grad_norm": 5.928036811380873, "learning_rate": 4.2156868230442756e-06, "loss": 0.0472, "step": 3159 }, { "epoch": 2.26, "grad_norm": 15.829254203511692, "learning_rate": 4.212832583404632e-06, "loss": 0.0587, "step": 3160 }, { "epoch": 2.26, "grad_norm": 11.618419552979391, "learning_rate": 4.2099786067722535e-06, "loss": 0.0403, "step": 3161 }, { "epoch": 2.26, "grad_norm": 2.1222508462469003, "learning_rate": 4.207124894100707e-06, "loss": 0.0364, "step": 3162 }, { "epoch": 2.26, "grad_norm": 4.318748307419997, "learning_rate": 4.2042714463434715e-06, "loss": 0.0391, "step": 3163 }, { "epoch": 2.26, "grad_norm": 9.36228620240982, "learning_rate": 4.201418264453935e-06, "loss": 0.0319, "step": 3164 }, { "epoch": 2.26, "grad_norm": 3.3778091133940125, "learning_rate": 4.198565349385402e-06, "loss": 0.0536, "step": 3165 }, { "epoch": 2.26, "grad_norm": 17.14676052388145, "learning_rate": 4.195712702091079e-06, "loss": 0.0535, "step": 3166 }, { "epoch": 2.26, "grad_norm": 5.024792221872647, "learning_rate": 4.192860323524094e-06, "loss": 0.0481, "step": 3167 }, { "epoch": 2.26, "grad_norm": 6.294621583070762, "learning_rate": 4.190008214637476e-06, "loss": 0.0591, "step": 3168 }, { "epoch": 2.26, "grad_norm": 7.517307671125681, "learning_rate": 4.187156376384171e-06, "loss": 0.0526, "step": 3169 }, { "epoch": 2.26, "grad_norm": 8.592352297300314, "learning_rate": 4.184304809717027e-06, "loss": 0.0632, "step": 3170 }, { "epoch": 2.26, "grad_norm": 14.526658581579898, "learning_rate": 4.18145351558881e-06, "loss": 0.0461, "step": 3171 }, { "epoch": 2.26, "grad_norm": 16.837649318976588, "learning_rate": 4.178602494952187e-06, "loss": 0.0806, "step": 3172 }, { "epoch": 2.26, "grad_norm": 7.440201035737965, "learning_rate": 4.175751748759737e-06, "loss": 0.0474, "step": 3173 }, { "epoch": 2.27, "grad_norm": 4.692217557688077, "learning_rate": 4.1729012779639495e-06, "loss": 0.0359, "step": 3174 }, { "epoch": 2.27, "grad_norm": 5.1727632650733915, "learning_rate": 4.170051083517217e-06, "loss": 0.0648, "step": 3175 }, { "epoch": 2.27, "grad_norm": 8.946075258402107, "learning_rate": 4.167201166371846e-06, "loss": 0.0524, "step": 3176 }, { "epoch": 2.27, "grad_norm": 7.667399977486221, "learning_rate": 4.164351527480042e-06, "loss": 0.049, "step": 3177 }, { "epoch": 2.27, "grad_norm": 5.274714349541263, "learning_rate": 4.161502167793928e-06, "loss": 0.0706, "step": 3178 }, { "epoch": 2.27, "grad_norm": 10.214076104747047, "learning_rate": 4.1586530882655226e-06, "loss": 0.0504, "step": 3179 }, { "epoch": 2.27, "grad_norm": 7.550207984999468, "learning_rate": 4.155804289846762e-06, "loss": 0.0385, "step": 3180 }, { "epoch": 2.27, "grad_norm": 6.718996738912058, "learning_rate": 4.152955773489479e-06, "loss": 0.0577, "step": 3181 }, { "epoch": 2.27, "grad_norm": 4.097197065235624, "learning_rate": 4.150107540145413e-06, "loss": 0.0457, "step": 3182 }, { "epoch": 2.27, "grad_norm": 8.064906232599887, "learning_rate": 4.147259590766219e-06, "loss": 0.0574, "step": 3183 }, { "epoch": 2.27, "grad_norm": 6.518517725711126, "learning_rate": 4.144411926303442e-06, "loss": 0.0383, "step": 3184 }, { "epoch": 2.27, "grad_norm": 5.616507539227108, "learning_rate": 4.141564547708546e-06, "loss": 0.0521, "step": 3185 }, { "epoch": 2.27, "grad_norm": 6.556983372205476, "learning_rate": 4.138717455932888e-06, "loss": 0.0537, "step": 3186 }, { "epoch": 2.27, "grad_norm": 4.742026540119468, "learning_rate": 4.13587065192774e-06, "loss": 0.0479, "step": 3187 }, { "epoch": 2.28, "grad_norm": 16.34355762704361, "learning_rate": 4.133024136644269e-06, "loss": 0.0531, "step": 3188 }, { "epoch": 2.28, "grad_norm": 7.81542554393562, "learning_rate": 4.130177911033546e-06, "loss": 0.04, "step": 3189 }, { "epoch": 2.28, "grad_norm": 13.76182597376324, "learning_rate": 4.127331976046553e-06, "loss": 0.0431, "step": 3190 }, { "epoch": 2.28, "grad_norm": 6.879252527427723, "learning_rate": 4.124486332634165e-06, "loss": 0.067, "step": 3191 }, { "epoch": 2.28, "grad_norm": 9.10827224827748, "learning_rate": 4.121640981747169e-06, "loss": 0.0434, "step": 3192 }, { "epoch": 2.28, "grad_norm": 6.435745967654931, "learning_rate": 4.118795924336245e-06, "loss": 0.0596, "step": 3193 }, { "epoch": 2.28, "grad_norm": 6.612178222538567, "learning_rate": 4.115951161351985e-06, "loss": 0.0446, "step": 3194 }, { "epoch": 2.28, "grad_norm": 14.164355450798242, "learning_rate": 4.113106693744871e-06, "loss": 0.0479, "step": 3195 }, { "epoch": 2.28, "grad_norm": 7.157081086345673, "learning_rate": 4.110262522465298e-06, "loss": 0.0531, "step": 3196 }, { "epoch": 2.28, "grad_norm": 7.031758826153102, "learning_rate": 4.107418648463553e-06, "loss": 0.0307, "step": 3197 }, { "epoch": 2.28, "grad_norm": 13.865702651660351, "learning_rate": 4.104575072689827e-06, "loss": 0.0684, "step": 3198 }, { "epoch": 2.28, "grad_norm": 3.3223935942971883, "learning_rate": 4.101731796094215e-06, "loss": 0.0368, "step": 3199 }, { "epoch": 2.28, "grad_norm": 9.926947145875099, "learning_rate": 4.098888819626704e-06, "loss": 0.0475, "step": 3200 }, { "epoch": 2.28, "grad_norm": 13.764679326338504, "learning_rate": 4.096046144237189e-06, "loss": 0.0667, "step": 3201 }, { "epoch": 2.29, "grad_norm": 4.877567877322836, "learning_rate": 4.093203770875458e-06, "loss": 0.0318, "step": 3202 }, { "epoch": 2.29, "grad_norm": 3.0890455248333235, "learning_rate": 4.090361700491203e-06, "loss": 0.0391, "step": 3203 }, { "epoch": 2.29, "grad_norm": 11.307502997946623, "learning_rate": 4.087519934034011e-06, "loss": 0.0579, "step": 3204 }, { "epoch": 2.29, "grad_norm": 12.286021346963802, "learning_rate": 4.084678472453371e-06, "loss": 0.0425, "step": 3205 }, { "epoch": 2.29, "grad_norm": 20.05845646784748, "learning_rate": 4.081837316698665e-06, "loss": 0.0714, "step": 3206 }, { "epoch": 2.29, "grad_norm": 7.513715346915439, "learning_rate": 4.078996467719179e-06, "loss": 0.0566, "step": 3207 }, { "epoch": 2.29, "grad_norm": 3.871357352532569, "learning_rate": 4.076155926464091e-06, "loss": 0.0322, "step": 3208 }, { "epoch": 2.29, "grad_norm": 8.692473548212876, "learning_rate": 4.07331569388248e-06, "loss": 0.0452, "step": 3209 }, { "epoch": 2.29, "grad_norm": 5.348891593350171, "learning_rate": 4.07047577092332e-06, "loss": 0.0622, "step": 3210 }, { "epoch": 2.29, "grad_norm": 10.667994143010105, "learning_rate": 4.067636158535483e-06, "loss": 0.0429, "step": 3211 }, { "epoch": 2.29, "grad_norm": 14.33390498257431, "learning_rate": 4.064796857667734e-06, "loss": 0.0602, "step": 3212 }, { "epoch": 2.29, "grad_norm": 13.853874395384349, "learning_rate": 4.0619578692687405e-06, "loss": 0.0485, "step": 3213 }, { "epoch": 2.29, "grad_norm": 10.967333214518202, "learning_rate": 4.059119194287056e-06, "loss": 0.0618, "step": 3214 }, { "epoch": 2.29, "grad_norm": 19.491206872437058, "learning_rate": 4.056280833671139e-06, "loss": 0.0689, "step": 3215 }, { "epoch": 2.3, "grad_norm": 15.545263997390558, "learning_rate": 4.053442788369334e-06, "loss": 0.0463, "step": 3216 }, { "epoch": 2.3, "grad_norm": 8.404901220400674, "learning_rate": 4.05060505932989e-06, "loss": 0.049, "step": 3217 }, { "epoch": 2.3, "grad_norm": 10.955981738026576, "learning_rate": 4.04776764750094e-06, "loss": 0.0467, "step": 3218 }, { "epoch": 2.3, "grad_norm": 9.242743867660305, "learning_rate": 4.04493055383052e-06, "loss": 0.0492, "step": 3219 }, { "epoch": 2.3, "grad_norm": 18.269012387060222, "learning_rate": 4.042093779266553e-06, "loss": 0.0473, "step": 3220 }, { "epoch": 2.3, "grad_norm": 4.530310469645218, "learning_rate": 4.0392573247568614e-06, "loss": 0.0731, "step": 3221 }, { "epoch": 2.3, "grad_norm": 10.337877336593875, "learning_rate": 4.036421191249155e-06, "loss": 0.0583, "step": 3222 }, { "epoch": 2.3, "grad_norm": 11.665285153590094, "learning_rate": 4.033585379691036e-06, "loss": 0.101, "step": 3223 }, { "epoch": 2.3, "grad_norm": 10.20335646310635, "learning_rate": 4.030749891030008e-06, "loss": 0.0428, "step": 3224 }, { "epoch": 2.3, "grad_norm": 14.145496210000346, "learning_rate": 4.0279147262134534e-06, "loss": 0.0629, "step": 3225 }, { "epoch": 2.3, "grad_norm": 10.358245511220993, "learning_rate": 4.025079886188661e-06, "loss": 0.0611, "step": 3226 }, { "epoch": 2.3, "grad_norm": 11.004038035963342, "learning_rate": 4.022245371902796e-06, "loss": 0.0431, "step": 3227 }, { "epoch": 2.3, "grad_norm": 13.706420262135827, "learning_rate": 4.01941118430293e-06, "loss": 0.0503, "step": 3228 }, { "epoch": 2.3, "grad_norm": 5.96510652515189, "learning_rate": 4.0165773243360105e-06, "loss": 0.0313, "step": 3229 }, { "epoch": 2.31, "grad_norm": 4.516895579432337, "learning_rate": 4.0137437929488885e-06, "loss": 0.0425, "step": 3230 }, { "epoch": 2.31, "grad_norm": 6.163717649417404, "learning_rate": 4.010910591088296e-06, "loss": 0.0454, "step": 3231 }, { "epoch": 2.31, "grad_norm": 10.726656037501574, "learning_rate": 4.008077719700859e-06, "loss": 0.0592, "step": 3232 }, { "epoch": 2.31, "grad_norm": 13.5104992520412, "learning_rate": 4.005245179733095e-06, "loss": 0.045, "step": 3233 }, { "epoch": 2.31, "grad_norm": 7.879892748265249, "learning_rate": 4.002412972131403e-06, "loss": 0.0378, "step": 3234 }, { "epoch": 2.31, "grad_norm": 7.298147921669261, "learning_rate": 3.999581097842082e-06, "loss": 0.0468, "step": 3235 }, { "epoch": 2.31, "grad_norm": 8.052193186678828, "learning_rate": 3.99674955781131e-06, "loss": 0.0628, "step": 3236 }, { "epoch": 2.31, "grad_norm": 4.805780871133186, "learning_rate": 3.99391835298516e-06, "loss": 0.0336, "step": 3237 }, { "epoch": 2.31, "grad_norm": 11.265202041373872, "learning_rate": 3.991087484309586e-06, "loss": 0.0347, "step": 3238 }, { "epoch": 2.31, "grad_norm": 9.214043476468115, "learning_rate": 3.988256952730439e-06, "loss": 0.0771, "step": 3239 }, { "epoch": 2.31, "grad_norm": 5.8998163572166895, "learning_rate": 3.985426759193449e-06, "loss": 0.0495, "step": 3240 }, { "epoch": 2.31, "grad_norm": 10.144981153013562, "learning_rate": 3.982596904644236e-06, "loss": 0.0382, "step": 3241 }, { "epoch": 2.31, "grad_norm": 5.985568117991297, "learning_rate": 3.979767390028309e-06, "loss": 0.0441, "step": 3242 }, { "epoch": 2.31, "grad_norm": 17.97011714211926, "learning_rate": 3.976938216291059e-06, "loss": 0.0776, "step": 3243 }, { "epoch": 2.32, "grad_norm": 3.31899818255434, "learning_rate": 3.974109384377768e-06, "loss": 0.0443, "step": 3244 }, { "epoch": 2.32, "grad_norm": 10.545724201687834, "learning_rate": 3.971280895233599e-06, "loss": 0.0442, "step": 3245 }, { "epoch": 2.32, "grad_norm": 5.238792598733231, "learning_rate": 3.968452749803605e-06, "loss": 0.0577, "step": 3246 }, { "epoch": 2.32, "grad_norm": 3.7875985593995583, "learning_rate": 3.965624949032723e-06, "loss": 0.0409, "step": 3247 }, { "epoch": 2.32, "grad_norm": 4.594499938098831, "learning_rate": 3.962797493865767e-06, "loss": 0.0485, "step": 3248 }, { "epoch": 2.32, "grad_norm": 5.310912106765515, "learning_rate": 3.959970385247451e-06, "loss": 0.0536, "step": 3249 }, { "epoch": 2.32, "grad_norm": 8.359734666912926, "learning_rate": 3.957143624122359e-06, "loss": 0.0649, "step": 3250 }, { "epoch": 2.32, "grad_norm": 5.073298004365436, "learning_rate": 3.954317211434966e-06, "loss": 0.0496, "step": 3251 }, { "epoch": 2.32, "grad_norm": 6.362507284188202, "learning_rate": 3.951491148129628e-06, "loss": 0.0522, "step": 3252 }, { "epoch": 2.32, "grad_norm": 3.991116571447913, "learning_rate": 3.948665435150589e-06, "loss": 0.0389, "step": 3253 }, { "epoch": 2.32, "grad_norm": 10.386760657877408, "learning_rate": 3.945840073441967e-06, "loss": 0.0495, "step": 3254 }, { "epoch": 2.32, "grad_norm": 5.3677506790255665, "learning_rate": 3.943015063947773e-06, "loss": 0.0417, "step": 3255 }, { "epoch": 2.32, "grad_norm": 7.163430214164329, "learning_rate": 3.940190407611891e-06, "loss": 0.0495, "step": 3256 }, { "epoch": 2.32, "grad_norm": 17.871089845054883, "learning_rate": 3.937366105378093e-06, "loss": 0.0321, "step": 3257 }, { "epoch": 2.33, "grad_norm": 7.221258871476373, "learning_rate": 3.93454215819003e-06, "loss": 0.0586, "step": 3258 }, { "epoch": 2.33, "grad_norm": 8.203863757302269, "learning_rate": 3.931718566991236e-06, "loss": 0.0588, "step": 3259 }, { "epoch": 2.33, "grad_norm": 6.2318167707341505, "learning_rate": 3.9288953327251265e-06, "loss": 0.0539, "step": 3260 }, { "epoch": 2.33, "grad_norm": 8.561571820835148, "learning_rate": 3.9260724563349935e-06, "loss": 0.045, "step": 3261 }, { "epoch": 2.33, "grad_norm": 10.489240447227571, "learning_rate": 3.923249938764016e-06, "loss": 0.0543, "step": 3262 }, { "epoch": 2.33, "grad_norm": 5.63338501785195, "learning_rate": 3.920427780955247e-06, "loss": 0.0695, "step": 3263 }, { "epoch": 2.33, "grad_norm": 7.206481099887481, "learning_rate": 3.917605983851622e-06, "loss": 0.0625, "step": 3264 }, { "epoch": 2.33, "grad_norm": 12.970256281406726, "learning_rate": 3.914784548395959e-06, "loss": 0.048, "step": 3265 }, { "epoch": 2.33, "grad_norm": 16.605000076839822, "learning_rate": 3.911963475530948e-06, "loss": 0.0655, "step": 3266 }, { "epoch": 2.33, "grad_norm": 22.570657256087852, "learning_rate": 3.909142766199163e-06, "loss": 0.0626, "step": 3267 }, { "epoch": 2.33, "grad_norm": 7.757888466221637, "learning_rate": 3.906322421343055e-06, "loss": 0.0389, "step": 3268 }, { "epoch": 2.33, "grad_norm": 18.353788836285396, "learning_rate": 3.903502441904956e-06, "loss": 0.0423, "step": 3269 }, { "epoch": 2.33, "grad_norm": 18.335337367703037, "learning_rate": 3.900682828827072e-06, "loss": 0.041, "step": 3270 }, { "epoch": 2.33, "grad_norm": 20.560967998792574, "learning_rate": 3.897863583051488e-06, "loss": 0.0438, "step": 3271 }, { "epoch": 2.34, "grad_norm": 9.189481542895964, "learning_rate": 3.895044705520167e-06, "loss": 0.0408, "step": 3272 }, { "epoch": 2.34, "grad_norm": 6.240925968107035, "learning_rate": 3.892226197174947e-06, "loss": 0.0535, "step": 3273 }, { "epoch": 2.34, "grad_norm": 11.330018925248265, "learning_rate": 3.889408058957547e-06, "loss": 0.0564, "step": 3274 }, { "epoch": 2.34, "grad_norm": 22.462180664383062, "learning_rate": 3.886590291809554e-06, "loss": 0.0628, "step": 3275 }, { "epoch": 2.34, "grad_norm": 25.572962458851126, "learning_rate": 3.883772896672443e-06, "loss": 0.0505, "step": 3276 }, { "epoch": 2.34, "grad_norm": 16.781204089206188, "learning_rate": 3.8809558744875534e-06, "loss": 0.0385, "step": 3277 }, { "epoch": 2.34, "grad_norm": 3.374198239697441, "learning_rate": 3.878139226196107e-06, "loss": 0.0457, "step": 3278 }, { "epoch": 2.34, "grad_norm": 14.5149616122367, "learning_rate": 3.875322952739196e-06, "loss": 0.0605, "step": 3279 }, { "epoch": 2.34, "grad_norm": 13.155372455067319, "learning_rate": 3.872507055057793e-06, "loss": 0.0413, "step": 3280 }, { "epoch": 2.34, "grad_norm": 14.517540894737511, "learning_rate": 3.8696915340927395e-06, "loss": 0.0463, "step": 3281 }, { "epoch": 2.34, "grad_norm": 12.853273973170033, "learning_rate": 3.866876390784752e-06, "loss": 0.0489, "step": 3282 }, { "epoch": 2.34, "grad_norm": 16.08116245026929, "learning_rate": 3.8640616260744266e-06, "loss": 0.0584, "step": 3283 }, { "epoch": 2.34, "grad_norm": 13.158844571849507, "learning_rate": 3.861247240902223e-06, "loss": 0.0468, "step": 3284 }, { "epoch": 2.34, "grad_norm": 11.605770161229955, "learning_rate": 3.858433236208485e-06, "loss": 0.0745, "step": 3285 }, { "epoch": 2.35, "grad_norm": 14.696854573310642, "learning_rate": 3.85561961293342e-06, "loss": 0.0485, "step": 3286 }, { "epoch": 2.35, "grad_norm": 8.808771550272338, "learning_rate": 3.852806372017115e-06, "loss": 0.0525, "step": 3287 }, { "epoch": 2.35, "grad_norm": 12.28812481591032, "learning_rate": 3.849993514399521e-06, "loss": 0.0457, "step": 3288 }, { "epoch": 2.35, "grad_norm": 7.973779667671355, "learning_rate": 3.847181041020472e-06, "loss": 0.0442, "step": 3289 }, { "epoch": 2.35, "grad_norm": 19.096436336706248, "learning_rate": 3.844368952819666e-06, "loss": 0.0495, "step": 3290 }, { "epoch": 2.35, "grad_norm": 4.254182022740973, "learning_rate": 3.84155725073667e-06, "loss": 0.0621, "step": 3291 }, { "epoch": 2.35, "grad_norm": 8.2181153756375, "learning_rate": 3.838745935710931e-06, "loss": 0.0423, "step": 3292 }, { "epoch": 2.35, "grad_norm": 6.376471586863244, "learning_rate": 3.835935008681757e-06, "loss": 0.0704, "step": 3293 }, { "epoch": 2.35, "grad_norm": 12.63158037914216, "learning_rate": 3.833124470588336e-06, "loss": 0.0662, "step": 3294 }, { "epoch": 2.35, "grad_norm": 13.3387733700678, "learning_rate": 3.830314322369717e-06, "loss": 0.0459, "step": 3295 }, { "epoch": 2.35, "grad_norm": 5.133497815442831, "learning_rate": 3.827504564964825e-06, "loss": 0.038, "step": 3296 }, { "epoch": 2.35, "grad_norm": 8.710809474582618, "learning_rate": 3.82469519931245e-06, "loss": 0.0739, "step": 3297 }, { "epoch": 2.35, "grad_norm": 7.0462004661285835, "learning_rate": 3.8218862263512565e-06, "loss": 0.0383, "step": 3298 }, { "epoch": 2.35, "grad_norm": 5.6385692219029595, "learning_rate": 3.819077647019772e-06, "loss": 0.0709, "step": 3299 }, { "epoch": 2.36, "grad_norm": 24.502025525734307, "learning_rate": 3.816269462256394e-06, "loss": 0.0513, "step": 3300 }, { "epoch": 2.36, "grad_norm": 13.864706482022296, "learning_rate": 3.813461672999394e-06, "loss": 0.0684, "step": 3301 }, { "epoch": 2.36, "grad_norm": 6.156507954757023, "learning_rate": 3.8106542801869007e-06, "loss": 0.0494, "step": 3302 }, { "epoch": 2.36, "grad_norm": 6.807759895501928, "learning_rate": 3.8078472847569215e-06, "loss": 0.0603, "step": 3303 }, { "epoch": 2.36, "grad_norm": 7.8161472508869725, "learning_rate": 3.805040687647321e-06, "loss": 0.0661, "step": 3304 }, { "epoch": 2.36, "grad_norm": 4.692379712993815, "learning_rate": 3.8022344897958402e-06, "loss": 0.0519, "step": 3305 }, { "epoch": 2.36, "grad_norm": 13.215818470193987, "learning_rate": 3.799428692140077e-06, "loss": 0.0547, "step": 3306 }, { "epoch": 2.36, "grad_norm": 13.05567981160067, "learning_rate": 3.7966232956175053e-06, "loss": 0.0408, "step": 3307 }, { "epoch": 2.36, "grad_norm": 11.836967320563483, "learning_rate": 3.793818301165457e-06, "loss": 0.0607, "step": 3308 }, { "epoch": 2.36, "grad_norm": 24.41802793152467, "learning_rate": 3.7910137097211345e-06, "loss": 0.0675, "step": 3309 }, { "epoch": 2.36, "grad_norm": 13.008738936268362, "learning_rate": 3.788209522221604e-06, "loss": 0.0508, "step": 3310 }, { "epoch": 2.36, "grad_norm": 8.453167608671531, "learning_rate": 3.7854057396037934e-06, "loss": 0.0465, "step": 3311 }, { "epoch": 2.36, "grad_norm": 7.942453598993944, "learning_rate": 3.7826023628045037e-06, "loss": 0.0429, "step": 3312 }, { "epoch": 2.36, "grad_norm": 7.2021837650240546, "learning_rate": 3.779799392760391e-06, "loss": 0.0595, "step": 3313 }, { "epoch": 2.37, "grad_norm": 8.110336108153568, "learning_rate": 3.7769968304079833e-06, "loss": 0.0392, "step": 3314 }, { "epoch": 2.37, "grad_norm": 4.770513910427804, "learning_rate": 3.7741946766836657e-06, "loss": 0.0422, "step": 3315 }, { "epoch": 2.37, "grad_norm": 5.485390555701271, "learning_rate": 3.771392932523691e-06, "loss": 0.0442, "step": 3316 }, { "epoch": 2.37, "grad_norm": 3.776761654570514, "learning_rate": 3.768591598864174e-06, "loss": 0.0474, "step": 3317 }, { "epoch": 2.37, "grad_norm": 8.255921845791628, "learning_rate": 3.765790676641092e-06, "loss": 0.0593, "step": 3318 }, { "epoch": 2.37, "grad_norm": 7.84516772890703, "learning_rate": 3.762990166790286e-06, "loss": 0.0421, "step": 3319 }, { "epoch": 2.37, "grad_norm": 21.7601097912396, "learning_rate": 3.760190070247458e-06, "loss": 0.0898, "step": 3320 }, { "epoch": 2.37, "grad_norm": 8.031470132988193, "learning_rate": 3.7573903879481714e-06, "loss": 0.0466, "step": 3321 }, { "epoch": 2.37, "grad_norm": 11.109101536744378, "learning_rate": 3.754591120827854e-06, "loss": 0.0467, "step": 3322 }, { "epoch": 2.37, "grad_norm": 17.1446165094642, "learning_rate": 3.7517922698217914e-06, "loss": 0.0461, "step": 3323 }, { "epoch": 2.37, "grad_norm": 14.65645178329707, "learning_rate": 3.7489938358651334e-06, "loss": 0.0511, "step": 3324 }, { "epoch": 2.37, "grad_norm": 6.9909618648344605, "learning_rate": 3.746195819892885e-06, "loss": 0.0581, "step": 3325 }, { "epoch": 2.37, "grad_norm": 18.094397161249283, "learning_rate": 3.7433982228399205e-06, "loss": 0.0445, "step": 3326 }, { "epoch": 2.37, "grad_norm": 12.802062963646232, "learning_rate": 3.7406010456409648e-06, "loss": 0.05, "step": 3327 }, { "epoch": 2.38, "grad_norm": 16.396058272502454, "learning_rate": 3.73780428923061e-06, "loss": 0.0441, "step": 3328 }, { "epoch": 2.38, "grad_norm": 10.919646375990595, "learning_rate": 3.7350079545433014e-06, "loss": 0.0501, "step": 3329 }, { "epoch": 2.38, "grad_norm": 6.922454337057674, "learning_rate": 3.7322120425133497e-06, "loss": 0.0319, "step": 3330 }, { "epoch": 2.38, "grad_norm": 11.453873310916373, "learning_rate": 3.729416554074917e-06, "loss": 0.0383, "step": 3331 }, { "epoch": 2.38, "grad_norm": 13.08051920997564, "learning_rate": 3.726621490162033e-06, "loss": 0.0369, "step": 3332 }, { "epoch": 2.38, "grad_norm": 10.658265780826953, "learning_rate": 3.7238268517085773e-06, "loss": 0.0862, "step": 3333 }, { "epoch": 2.38, "grad_norm": 15.404614991290057, "learning_rate": 3.7210326396482893e-06, "loss": 0.0499, "step": 3334 }, { "epoch": 2.38, "grad_norm": 14.75043279711972, "learning_rate": 3.718238854914771e-06, "loss": 0.0703, "step": 3335 }, { "epoch": 2.38, "grad_norm": 15.503977295964852, "learning_rate": 3.7154454984414733e-06, "loss": 0.0573, "step": 3336 }, { "epoch": 2.38, "grad_norm": 6.946957800905822, "learning_rate": 3.7126525711617135e-06, "loss": 0.0673, "step": 3337 }, { "epoch": 2.38, "grad_norm": 6.259494768027386, "learning_rate": 3.7098600740086555e-06, "loss": 0.0547, "step": 3338 }, { "epoch": 2.38, "grad_norm": 11.546595002331136, "learning_rate": 3.707068007915329e-06, "loss": 0.056, "step": 3339 }, { "epoch": 2.38, "grad_norm": 17.967131882842132, "learning_rate": 3.704276373814611e-06, "loss": 0.0409, "step": 3340 }, { "epoch": 2.38, "grad_norm": 15.393071791424758, "learning_rate": 3.7014851726392427e-06, "loss": 0.0546, "step": 3341 }, { "epoch": 2.39, "grad_norm": 7.001550477108224, "learning_rate": 3.6986944053218143e-06, "loss": 0.0545, "step": 3342 }, { "epoch": 2.39, "grad_norm": 3.9519019922056784, "learning_rate": 3.69590407279477e-06, "loss": 0.043, "step": 3343 }, { "epoch": 2.39, "grad_norm": 8.960410735070326, "learning_rate": 3.6931141759904175e-06, "loss": 0.0607, "step": 3344 }, { "epoch": 2.39, "grad_norm": 14.847248534040741, "learning_rate": 3.6903247158409077e-06, "loss": 0.0394, "step": 3345 }, { "epoch": 2.39, "grad_norm": 17.388659127488623, "learning_rate": 3.687535693278256e-06, "loss": 0.0561, "step": 3346 }, { "epoch": 2.39, "grad_norm": 6.378086436756875, "learning_rate": 3.6847471092343225e-06, "loss": 0.049, "step": 3347 }, { "epoch": 2.39, "grad_norm": 4.734911486197315, "learning_rate": 3.681958964640828e-06, "loss": 0.052, "step": 3348 }, { "epoch": 2.39, "grad_norm": 3.652535209632719, "learning_rate": 3.679171260429343e-06, "loss": 0.0573, "step": 3349 }, { "epoch": 2.39, "grad_norm": 18.331070603639798, "learning_rate": 3.676383997531288e-06, "loss": 0.0636, "step": 3350 }, { "epoch": 2.39, "grad_norm": 10.55381413312245, "learning_rate": 3.673597176877944e-06, "loss": 0.0585, "step": 3351 }, { "epoch": 2.39, "grad_norm": 8.280879713430096, "learning_rate": 3.670810799400435e-06, "loss": 0.0447, "step": 3352 }, { "epoch": 2.39, "grad_norm": 6.519631412173078, "learning_rate": 3.668024866029747e-06, "loss": 0.0404, "step": 3353 }, { "epoch": 2.39, "grad_norm": 11.576104508421272, "learning_rate": 3.665239377696706e-06, "loss": 0.0468, "step": 3354 }, { "epoch": 2.39, "grad_norm": 4.760444512858353, "learning_rate": 3.6624543353320006e-06, "loss": 0.036, "step": 3355 }, { "epoch": 2.4, "grad_norm": 4.6760439313381905, "learning_rate": 3.659669739866162e-06, "loss": 0.0276, "step": 3356 }, { "epoch": 2.4, "grad_norm": 4.543274966456261, "learning_rate": 3.6568855922295776e-06, "loss": 0.0482, "step": 3357 }, { "epoch": 2.4, "grad_norm": 5.3086205701499, "learning_rate": 3.654101893352482e-06, "loss": 0.0451, "step": 3358 }, { "epoch": 2.4, "grad_norm": 13.06056766375821, "learning_rate": 3.651318644164958e-06, "loss": 0.0627, "step": 3359 }, { "epoch": 2.4, "grad_norm": 8.710508596248417, "learning_rate": 3.6485358455969454e-06, "loss": 0.043, "step": 3360 }, { "epoch": 2.4, "grad_norm": 5.694002302786408, "learning_rate": 3.645753498578225e-06, "loss": 0.0331, "step": 3361 }, { "epoch": 2.4, "grad_norm": 8.42561338264928, "learning_rate": 3.6429716040384346e-06, "loss": 0.034, "step": 3362 }, { "epoch": 2.4, "grad_norm": 6.1770297556861475, "learning_rate": 3.6401901629070524e-06, "loss": 0.0625, "step": 3363 }, { "epoch": 2.4, "grad_norm": 5.919884188773968, "learning_rate": 3.6374091761134147e-06, "loss": 0.0616, "step": 3364 }, { "epoch": 2.4, "grad_norm": 5.43484904633554, "learning_rate": 3.6346286445866953e-06, "loss": 0.0339, "step": 3365 }, { "epoch": 2.4, "grad_norm": 3.2904973516040323, "learning_rate": 3.6318485692559263e-06, "loss": 0.0532, "step": 3366 }, { "epoch": 2.4, "grad_norm": 4.091491571818965, "learning_rate": 3.62906895104998e-06, "loss": 0.0418, "step": 3367 }, { "epoch": 2.4, "grad_norm": 7.015427483416067, "learning_rate": 3.6262897908975787e-06, "loss": 0.0275, "step": 3368 }, { "epoch": 2.4, "grad_norm": 2.8745109759948946, "learning_rate": 3.6235110897272917e-06, "loss": 0.0345, "step": 3369 }, { "epoch": 2.41, "grad_norm": 4.67694078014422, "learning_rate": 3.620732848467535e-06, "loss": 0.041, "step": 3370 }, { "epoch": 2.41, "grad_norm": 7.963957573360893, "learning_rate": 3.6179550680465703e-06, "loss": 0.042, "step": 3371 }, { "epoch": 2.41, "grad_norm": 13.579331192890951, "learning_rate": 3.615177749392506e-06, "loss": 0.0472, "step": 3372 }, { "epoch": 2.41, "grad_norm": 3.9713676441301975, "learning_rate": 3.6124008934332956e-06, "loss": 0.0406, "step": 3373 }, { "epoch": 2.41, "grad_norm": 4.852127016193872, "learning_rate": 3.609624501096739e-06, "loss": 0.0449, "step": 3374 }, { "epoch": 2.41, "grad_norm": 4.550426276023454, "learning_rate": 3.606848573310479e-06, "loss": 0.0425, "step": 3375 }, { "epoch": 2.41, "grad_norm": 12.497366697285011, "learning_rate": 3.6040731110020065e-06, "loss": 0.0394, "step": 3376 }, { "epoch": 2.41, "grad_norm": 18.168602267642527, "learning_rate": 3.6012981150986524e-06, "loss": 0.0843, "step": 3377 }, { "epoch": 2.41, "grad_norm": 12.307783885049966, "learning_rate": 3.598523586527599e-06, "loss": 0.0384, "step": 3378 }, { "epoch": 2.41, "grad_norm": 4.59462797750005, "learning_rate": 3.595749526215862e-06, "loss": 0.0432, "step": 3379 }, { "epoch": 2.41, "grad_norm": 14.017885187186423, "learning_rate": 3.5929759350903117e-06, "loss": 0.046, "step": 3380 }, { "epoch": 2.41, "grad_norm": 24.792119713261677, "learning_rate": 3.5902028140776524e-06, "loss": 0.0671, "step": 3381 }, { "epoch": 2.41, "grad_norm": 12.244242993831454, "learning_rate": 3.5874301641044386e-06, "loss": 0.0345, "step": 3382 }, { "epoch": 2.41, "grad_norm": 7.018502392020034, "learning_rate": 3.5846579860970632e-06, "loss": 0.0583, "step": 3383 }, { "epoch": 2.42, "grad_norm": 16.228689702919265, "learning_rate": 3.58188628098176e-06, "loss": 0.0566, "step": 3384 }, { "epoch": 2.42, "grad_norm": 7.9160236332123395, "learning_rate": 3.579115049684612e-06, "loss": 0.0453, "step": 3385 }, { "epoch": 2.42, "grad_norm": 6.865601457118527, "learning_rate": 3.576344293131533e-06, "loss": 0.0482, "step": 3386 }, { "epoch": 2.42, "grad_norm": 14.727035098444215, "learning_rate": 3.5735740122482896e-06, "loss": 0.0439, "step": 3387 }, { "epoch": 2.42, "grad_norm": 3.831666188573751, "learning_rate": 3.570804207960481e-06, "loss": 0.0428, "step": 3388 }, { "epoch": 2.42, "grad_norm": 6.970567913151844, "learning_rate": 3.5680348811935527e-06, "loss": 0.0429, "step": 3389 }, { "epoch": 2.42, "grad_norm": 4.700224034319951, "learning_rate": 3.565266032872785e-06, "loss": 0.05, "step": 3390 }, { "epoch": 2.42, "grad_norm": 5.060450793558676, "learning_rate": 3.5624976639233056e-06, "loss": 0.0587, "step": 3391 }, { "epoch": 2.42, "grad_norm": 9.739022910580005, "learning_rate": 3.559729775270076e-06, "loss": 0.0383, "step": 3392 }, { "epoch": 2.42, "grad_norm": 5.282482394344644, "learning_rate": 3.5569623678378972e-06, "loss": 0.05, "step": 3393 }, { "epoch": 2.42, "grad_norm": 7.146211172306711, "learning_rate": 3.554195442551416e-06, "loss": 0.0346, "step": 3394 }, { "epoch": 2.42, "grad_norm": 7.117770381713717, "learning_rate": 3.551429000335108e-06, "loss": 0.0453, "step": 3395 }, { "epoch": 2.42, "grad_norm": 4.501430969155091, "learning_rate": 3.5486630421132983e-06, "loss": 0.0406, "step": 3396 }, { "epoch": 2.42, "grad_norm": 9.978686458274396, "learning_rate": 3.5458975688101403e-06, "loss": 0.0475, "step": 3397 }, { "epoch": 2.43, "grad_norm": 9.504820462744869, "learning_rate": 3.5431325813496352e-06, "loss": 0.055, "step": 3398 }, { "epoch": 2.43, "grad_norm": 13.297641172040194, "learning_rate": 3.540368080655612e-06, "loss": 0.064, "step": 3399 }, { "epoch": 2.43, "grad_norm": 5.145894409950023, "learning_rate": 3.5376040676517443e-06, "loss": 0.0508, "step": 3400 }, { "epoch": 2.43, "grad_norm": 4.6754745477101975, "learning_rate": 3.5348405432615407e-06, "loss": 0.0369, "step": 3401 }, { "epoch": 2.43, "grad_norm": 9.183504955229806, "learning_rate": 3.5320775084083425e-06, "loss": 0.0362, "step": 3402 }, { "epoch": 2.43, "grad_norm": 10.016899646390549, "learning_rate": 3.529314964015336e-06, "loss": 0.0524, "step": 3403 }, { "epoch": 2.43, "grad_norm": 7.225269060493574, "learning_rate": 3.526552911005533e-06, "loss": 0.0651, "step": 3404 }, { "epoch": 2.43, "grad_norm": 9.402695057707561, "learning_rate": 3.523791350301793e-06, "loss": 0.0508, "step": 3405 }, { "epoch": 2.43, "grad_norm": 5.528168693207084, "learning_rate": 3.5210302828267984e-06, "loss": 0.0591, "step": 3406 }, { "epoch": 2.43, "grad_norm": 4.978722816391329, "learning_rate": 3.5182697095030795e-06, "loss": 0.0323, "step": 3407 }, { "epoch": 2.43, "grad_norm": 11.785501724674626, "learning_rate": 3.5155096312529913e-06, "loss": 0.0605, "step": 3408 }, { "epoch": 2.43, "grad_norm": 8.863372588103983, "learning_rate": 3.5127500489987252e-06, "loss": 0.0441, "step": 3409 }, { "epoch": 2.43, "grad_norm": 5.336343653134836, "learning_rate": 3.5099909636623148e-06, "loss": 0.0493, "step": 3410 }, { "epoch": 2.43, "grad_norm": 9.86742725134051, "learning_rate": 3.5072323761656163e-06, "loss": 0.0493, "step": 3411 }, { "epoch": 2.44, "grad_norm": 7.055618254025233, "learning_rate": 3.5044742874303297e-06, "loss": 0.0541, "step": 3412 }, { "epoch": 2.44, "grad_norm": 7.776496254142825, "learning_rate": 3.501716698377979e-06, "loss": 0.0308, "step": 3413 }, { "epoch": 2.44, "grad_norm": 7.122528698892253, "learning_rate": 3.4989596099299306e-06, "loss": 0.0427, "step": 3414 }, { "epoch": 2.44, "grad_norm": 16.55113954313631, "learning_rate": 3.496203023007374e-06, "loss": 0.075, "step": 3415 }, { "epoch": 2.44, "grad_norm": 9.731003512719141, "learning_rate": 3.4934469385313418e-06, "loss": 0.0554, "step": 3416 }, { "epoch": 2.44, "grad_norm": 35.12656172388353, "learning_rate": 3.490691357422689e-06, "loss": 0.0719, "step": 3417 }, { "epoch": 2.44, "grad_norm": 7.754354703696009, "learning_rate": 3.487936280602108e-06, "loss": 0.0235, "step": 3418 }, { "epoch": 2.44, "grad_norm": 3.521163253822065, "learning_rate": 3.4851817089901203e-06, "loss": 0.033, "step": 3419 }, { "epoch": 2.44, "grad_norm": 26.224926062833664, "learning_rate": 3.4824276435070804e-06, "loss": 0.0563, "step": 3420 }, { "epoch": 2.44, "grad_norm": 15.61271153737091, "learning_rate": 3.4796740850731716e-06, "loss": 0.0356, "step": 3421 }, { "epoch": 2.44, "grad_norm": 9.031891603705114, "learning_rate": 3.47692103460841e-06, "loss": 0.0486, "step": 3422 }, { "epoch": 2.44, "grad_norm": 6.734795663073578, "learning_rate": 3.474168493032641e-06, "loss": 0.0525, "step": 3423 }, { "epoch": 2.44, "grad_norm": 6.755868031059958, "learning_rate": 3.4714164612655387e-06, "loss": 0.0458, "step": 3424 }, { "epoch": 2.44, "grad_norm": 7.708505272692726, "learning_rate": 3.468664940226609e-06, "loss": 0.0513, "step": 3425 }, { "epoch": 2.45, "grad_norm": 4.506075218775547, "learning_rate": 3.4659139308351885e-06, "loss": 0.0344, "step": 3426 }, { "epoch": 2.45, "grad_norm": 3.992509225070968, "learning_rate": 3.4631634340104357e-06, "loss": 0.044, "step": 3427 }, { "epoch": 2.45, "grad_norm": 7.003251664486951, "learning_rate": 3.460413450671346e-06, "loss": 0.0434, "step": 3428 }, { "epoch": 2.45, "grad_norm": 14.475401829587407, "learning_rate": 3.457663981736739e-06, "loss": 0.0854, "step": 3429 }, { "epoch": 2.45, "grad_norm": 4.483836265471178, "learning_rate": 3.4549150281252635e-06, "loss": 0.0443, "step": 3430 }, { "epoch": 2.45, "grad_norm": 11.13154384738619, "learning_rate": 3.4521665907553957e-06, "loss": 0.0337, "step": 3431 }, { "epoch": 2.45, "grad_norm": 5.800442883625316, "learning_rate": 3.4494186705454402e-06, "loss": 0.0618, "step": 3432 }, { "epoch": 2.45, "grad_norm": 16.842487232611695, "learning_rate": 3.446671268413528e-06, "loss": 0.056, "step": 3433 }, { "epoch": 2.45, "grad_norm": 6.307882602308188, "learning_rate": 3.443924385277617e-06, "loss": 0.0401, "step": 3434 }, { "epoch": 2.45, "grad_norm": 12.347319578908353, "learning_rate": 3.4411780220554937e-06, "loss": 0.049, "step": 3435 }, { "epoch": 2.45, "grad_norm": 6.777317017761099, "learning_rate": 3.4384321796647645e-06, "loss": 0.044, "step": 3436 }, { "epoch": 2.45, "grad_norm": 8.578839854101997, "learning_rate": 3.4356868590228727e-06, "loss": 0.08, "step": 3437 }, { "epoch": 2.45, "grad_norm": 11.34847212257333, "learning_rate": 3.4329420610470745e-06, "loss": 0.0591, "step": 3438 }, { "epoch": 2.45, "grad_norm": 16.488323097081217, "learning_rate": 3.4301977866544634e-06, "loss": 0.0469, "step": 3439 }, { "epoch": 2.46, "grad_norm": 3.5345240893456635, "learning_rate": 3.427454036761948e-06, "loss": 0.037, "step": 3440 }, { "epoch": 2.46, "grad_norm": 10.70351554080197, "learning_rate": 3.4247108122862703e-06, "loss": 0.049, "step": 3441 }, { "epoch": 2.46, "grad_norm": 6.477152114166718, "learning_rate": 3.4219681141439907e-06, "loss": 0.0401, "step": 3442 }, { "epoch": 2.46, "grad_norm": 18.807079075717237, "learning_rate": 3.4192259432514934e-06, "loss": 0.0406, "step": 3443 }, { "epoch": 2.46, "grad_norm": 15.723153609547095, "learning_rate": 3.4164843005249928e-06, "loss": 0.0524, "step": 3444 }, { "epoch": 2.46, "grad_norm": 12.39846598919014, "learning_rate": 3.413743186880519e-06, "loss": 0.0407, "step": 3445 }, { "epoch": 2.46, "grad_norm": 7.3971140149986665, "learning_rate": 3.4110026032339317e-06, "loss": 0.0679, "step": 3446 }, { "epoch": 2.46, "grad_norm": 20.316958331448777, "learning_rate": 3.408262550500908e-06, "loss": 0.0699, "step": 3447 }, { "epoch": 2.46, "grad_norm": 19.17858909141001, "learning_rate": 3.4055230295969556e-06, "loss": 0.0484, "step": 3448 }, { "epoch": 2.46, "grad_norm": 10.402764837758005, "learning_rate": 3.4027840414373924e-06, "loss": 0.0569, "step": 3449 }, { "epoch": 2.46, "grad_norm": 7.135992378182201, "learning_rate": 3.4000455869373716e-06, "loss": 0.0426, "step": 3450 }, { "epoch": 2.46, "grad_norm": 14.292267710034482, "learning_rate": 3.397307667011859e-06, "loss": 0.0685, "step": 3451 }, { "epoch": 2.46, "grad_norm": 22.66042408586546, "learning_rate": 3.394570282575642e-06, "loss": 0.0729, "step": 3452 }, { "epoch": 2.46, "grad_norm": 7.338566737526984, "learning_rate": 3.3918334345433367e-06, "loss": 0.0495, "step": 3453 }, { "epoch": 2.47, "grad_norm": 8.259220087387108, "learning_rate": 3.3890971238293703e-06, "loss": 0.0544, "step": 3454 }, { "epoch": 2.47, "grad_norm": 7.349858298914431, "learning_rate": 3.386361351347999e-06, "loss": 0.0507, "step": 3455 }, { "epoch": 2.47, "grad_norm": 4.307434476542483, "learning_rate": 3.3836261180132914e-06, "loss": 0.0546, "step": 3456 }, { "epoch": 2.47, "grad_norm": 5.554073968741056, "learning_rate": 3.3808914247391437e-06, "loss": 0.0492, "step": 3457 }, { "epoch": 2.47, "grad_norm": 10.475716145946574, "learning_rate": 3.3781572724392642e-06, "loss": 0.0467, "step": 3458 }, { "epoch": 2.47, "grad_norm": 11.333875703512863, "learning_rate": 3.3754236620271876e-06, "loss": 0.0606, "step": 3459 }, { "epoch": 2.47, "grad_norm": 7.737852380637597, "learning_rate": 3.3726905944162615e-06, "loss": 0.0582, "step": 3460 }, { "epoch": 2.47, "grad_norm": 8.626067005065318, "learning_rate": 3.3699580705196527e-06, "loss": 0.0547, "step": 3461 }, { "epoch": 2.47, "grad_norm": 9.788902013697735, "learning_rate": 3.367226091250353e-06, "loss": 0.0514, "step": 3462 }, { "epoch": 2.47, "grad_norm": 7.494197930351465, "learning_rate": 3.3644946575211634e-06, "loss": 0.0322, "step": 3463 }, { "epoch": 2.47, "grad_norm": 6.72717147418577, "learning_rate": 3.36176377024471e-06, "loss": 0.0377, "step": 3464 }, { "epoch": 2.47, "grad_norm": 9.106408875639595, "learning_rate": 3.3590334303334293e-06, "loss": 0.0645, "step": 3465 }, { "epoch": 2.47, "grad_norm": 3.468657818143453, "learning_rate": 3.356303638699583e-06, "loss": 0.0349, "step": 3466 }, { "epoch": 2.47, "grad_norm": 10.178729399836424, "learning_rate": 3.35357439625524e-06, "loss": 0.0491, "step": 3467 }, { "epoch": 2.48, "grad_norm": 7.847892185842703, "learning_rate": 3.3508457039122965e-06, "loss": 0.0514, "step": 3468 }, { "epoch": 2.48, "grad_norm": 12.475199644318261, "learning_rate": 3.348117562582457e-06, "loss": 0.0379, "step": 3469 }, { "epoch": 2.48, "grad_norm": 4.425641585558723, "learning_rate": 3.345389973177241e-06, "loss": 0.0402, "step": 3470 }, { "epoch": 2.48, "grad_norm": 4.957786046660219, "learning_rate": 3.342662936607992e-06, "loss": 0.0685, "step": 3471 }, { "epoch": 2.48, "grad_norm": 10.682715401706243, "learning_rate": 3.3399364537858594e-06, "loss": 0.0541, "step": 3472 }, { "epoch": 2.48, "grad_norm": 5.124834331996581, "learning_rate": 3.3372105256218153e-06, "loss": 0.0482, "step": 3473 }, { "epoch": 2.48, "grad_norm": 6.155139065247913, "learning_rate": 3.334485153026639e-06, "loss": 0.0362, "step": 3474 }, { "epoch": 2.48, "grad_norm": 11.387503864889187, "learning_rate": 3.3317603369109332e-06, "loss": 0.0608, "step": 3475 }, { "epoch": 2.48, "grad_norm": 7.514201902512651, "learning_rate": 3.3290360781851055e-06, "loss": 0.0428, "step": 3476 }, { "epoch": 2.48, "grad_norm": 10.254928567111829, "learning_rate": 3.326312377759383e-06, "loss": 0.0479, "step": 3477 }, { "epoch": 2.48, "grad_norm": 8.581791189780155, "learning_rate": 3.3235892365438038e-06, "loss": 0.0291, "step": 3478 }, { "epoch": 2.48, "grad_norm": 9.933864421671199, "learning_rate": 3.3208666554482216e-06, "loss": 0.0492, "step": 3479 }, { "epoch": 2.48, "grad_norm": 5.1340261357847, "learning_rate": 3.3181446353822997e-06, "loss": 0.0361, "step": 3480 }, { "epoch": 2.48, "grad_norm": 10.211747713964122, "learning_rate": 3.315423177255516e-06, "loss": 0.0725, "step": 3481 }, { "epoch": 2.49, "grad_norm": 7.574056301175818, "learning_rate": 3.312702281977161e-06, "loss": 0.0604, "step": 3482 }, { "epoch": 2.49, "grad_norm": 20.440276470356725, "learning_rate": 3.3099819504563356e-06, "loss": 0.0413, "step": 3483 }, { "epoch": 2.49, "grad_norm": 11.787768676878581, "learning_rate": 3.3072621836019535e-06, "loss": 0.0383, "step": 3484 }, { "epoch": 2.49, "grad_norm": 5.320753627246697, "learning_rate": 3.3045429823227405e-06, "loss": 0.0366, "step": 3485 }, { "epoch": 2.49, "grad_norm": 3.0007273838143593, "learning_rate": 3.3018243475272282e-06, "loss": 0.0377, "step": 3486 }, { "epoch": 2.49, "grad_norm": 15.551358111361543, "learning_rate": 3.2991062801237683e-06, "loss": 0.0653, "step": 3487 }, { "epoch": 2.49, "grad_norm": 14.995269345180393, "learning_rate": 3.296388781020513e-06, "loss": 0.065, "step": 3488 }, { "epoch": 2.49, "grad_norm": 16.071603280041327, "learning_rate": 3.293671851125434e-06, "loss": 0.0832, "step": 3489 }, { "epoch": 2.49, "grad_norm": 11.916063649824876, "learning_rate": 3.2909554913463034e-06, "loss": 0.0473, "step": 3490 }, { "epoch": 2.49, "grad_norm": 21.15674028802956, "learning_rate": 3.2882397025907114e-06, "loss": 0.0777, "step": 3491 }, { "epoch": 2.49, "grad_norm": 16.156481807543038, "learning_rate": 3.2855244857660497e-06, "loss": 0.0443, "step": 3492 }, { "epoch": 2.49, "grad_norm": 16.88354258053397, "learning_rate": 3.2828098417795267e-06, "loss": 0.0409, "step": 3493 }, { "epoch": 2.49, "grad_norm": 22.697696452401754, "learning_rate": 3.2800957715381537e-06, "loss": 0.0652, "step": 3494 }, { "epoch": 2.49, "grad_norm": 4.743314683992511, "learning_rate": 3.2773822759487497e-06, "loss": 0.0425, "step": 3495 }, { "epoch": 2.5, "grad_norm": 14.222623429544777, "learning_rate": 3.2746693559179483e-06, "loss": 0.0468, "step": 3496 }, { "epoch": 2.5, "grad_norm": 36.49482472083385, "learning_rate": 3.2719570123521816e-06, "loss": 0.0753, "step": 3497 }, { "epoch": 2.5, "grad_norm": 18.21815675653801, "learning_rate": 3.2692452461576997e-06, "loss": 0.0572, "step": 3498 }, { "epoch": 2.5, "grad_norm": 12.83475452863996, "learning_rate": 3.266534058240548e-06, "loss": 0.0497, "step": 3499 }, { "epoch": 2.5, "grad_norm": 5.823480145703581, "learning_rate": 3.2638234495065903e-06, "loss": 0.0537, "step": 3500 }, { "epoch": 2.5, "eval_avg_AUC": 0.832346751207196, "eval_avg_Accuracy": 0.739597148541114, "eval_avg_Accuracy-right": 0.8735489761314725, "eval_avg_Accuracy-wrong": 0.5060268364794178, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.7006953165400592, "eval_last_AUC": 0.8475061209281654, "eval_last_Accuracy": 0.7807940981432361, "eval_last_Accuracy-right": 0.8329855223685927, "eval_last_Accuracy-wrong": 0.6897884921537412, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.7097495321532842, "eval_max_AUC": 0.7701603116760396, "eval_max_Accuracy": 0.6501574933687002, "eval_max_Accuracy-right": 0.9781531237772271, "eval_max_Accuracy-wrong": 0.07823516033659313, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.6369597827879677, "eval_min_AUC": 0.8455461246669156, "eval_min_Accuracy": 0.7731681034482759, "eval_min_Accuracy-right": 0.7823138124429373, "eval_min_Accuracy-wrong": 0.7572208323857176, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.7126465395097116, "eval_prod_AUC": 0.8377160248849372, "eval_prod_Accuracy": 0.7180868700265252, "eval_prod_Accuracy-right": 0.6126255380200861, "eval_prod_Accuracy-wrong": 0.9019786217875825, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.7055742832242811, "eval_runtime": 247.3763, "eval_samples_per_second": 97.536, "eval_steps_per_second": 3.048, "eval_sum_AUC": 0.712185339026256, "eval_sum_Accuracy": 0.640873673740053, "eval_sum_Accuracy-right": 0.9953697665318899, "eval_sum_Accuracy-wrong": 0.02274277916761428, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.6744582768230599, "step": 3500 }, { "epoch": 2.5, "grad_norm": 14.845003338865876, "learning_rate": 3.261113420861487e-06, "loss": 0.0569, "step": 3501 }, { "epoch": 2.5, "grad_norm": 20.476101635024914, "learning_rate": 3.258403973210713e-06, "loss": 0.0443, "step": 3502 }, { "epoch": 2.5, "grad_norm": 16.582413967496823, "learning_rate": 3.2556951074595435e-06, "loss": 0.0405, "step": 3503 }, { "epoch": 2.5, "grad_norm": 7.163367417609349, "learning_rate": 3.2529868245130577e-06, "loss": 0.0509, "step": 3504 }, { "epoch": 2.5, "grad_norm": 14.256405396539867, "learning_rate": 3.250279125276148e-06, "loss": 0.0443, "step": 3505 }, { "epoch": 2.5, "grad_norm": 9.355654318932615, "learning_rate": 3.2475720106535036e-06, "loss": 0.0351, "step": 3506 }, { "epoch": 2.5, "grad_norm": 15.4290622991359, "learning_rate": 3.244865481549625e-06, "loss": 0.0385, "step": 3507 }, { "epoch": 2.5, "grad_norm": 5.540807483931574, "learning_rate": 3.24215953886881e-06, "loss": 0.0711, "step": 3508 }, { "epoch": 2.5, "grad_norm": 3.2900577929188204, "learning_rate": 3.2394541835151692e-06, "loss": 0.0322, "step": 3509 }, { "epoch": 2.51, "grad_norm": 5.839214385110157, "learning_rate": 3.2367494163926095e-06, "loss": 0.0355, "step": 3510 }, { "epoch": 2.51, "grad_norm": 9.368493630197404, "learning_rate": 3.234045238404841e-06, "loss": 0.0588, "step": 3511 }, { "epoch": 2.51, "grad_norm": 13.586715948198096, "learning_rate": 3.2313416504553852e-06, "loss": 0.0579, "step": 3512 }, { "epoch": 2.51, "grad_norm": 7.231859040657891, "learning_rate": 3.2286386534475568e-06, "loss": 0.0381, "step": 3513 }, { "epoch": 2.51, "grad_norm": 4.101218107837911, "learning_rate": 3.2259362482844803e-06, "loss": 0.0369, "step": 3514 }, { "epoch": 2.51, "grad_norm": 5.651265778383636, "learning_rate": 3.2232344358690765e-06, "loss": 0.0352, "step": 3515 }, { "epoch": 2.51, "grad_norm": 12.961817876373003, "learning_rate": 3.220533217104075e-06, "loss": 0.0605, "step": 3516 }, { "epoch": 2.51, "grad_norm": 8.658443920435355, "learning_rate": 3.217832592891999e-06, "loss": 0.0306, "step": 3517 }, { "epoch": 2.51, "grad_norm": 7.589369821386444, "learning_rate": 3.2151325641351817e-06, "loss": 0.0788, "step": 3518 }, { "epoch": 2.51, "grad_norm": 7.192128941843156, "learning_rate": 3.2124331317357506e-06, "loss": 0.0422, "step": 3519 }, { "epoch": 2.51, "grad_norm": 5.335264955696329, "learning_rate": 3.2097342965956334e-06, "loss": 0.064, "step": 3520 }, { "epoch": 2.51, "grad_norm": 2.7148073824964096, "learning_rate": 3.2070360596165667e-06, "loss": 0.0297, "step": 3521 }, { "epoch": 2.51, "grad_norm": 14.262742389282423, "learning_rate": 3.204338421700076e-06, "loss": 0.0448, "step": 3522 }, { "epoch": 2.51, "grad_norm": 17.047759589845352, "learning_rate": 3.201641383747498e-06, "loss": 0.0615, "step": 3523 }, { "epoch": 2.52, "grad_norm": 5.1887465158901795, "learning_rate": 3.1989449466599574e-06, "loss": 0.0709, "step": 3524 }, { "epoch": 2.52, "grad_norm": 10.43183697384031, "learning_rate": 3.1962491113383896e-06, "loss": 0.0739, "step": 3525 }, { "epoch": 2.52, "grad_norm": 7.760472498332863, "learning_rate": 3.1935538786835183e-06, "loss": 0.0476, "step": 3526 }, { "epoch": 2.52, "grad_norm": 20.92702203320919, "learning_rate": 3.1908592495958747e-06, "loss": 0.053, "step": 3527 }, { "epoch": 2.52, "grad_norm": 11.407485827565694, "learning_rate": 3.1881652249757823e-06, "loss": 0.0372, "step": 3528 }, { "epoch": 2.52, "grad_norm": 4.8307280957120735, "learning_rate": 3.185471805723365e-06, "loss": 0.045, "step": 3529 }, { "epoch": 2.52, "grad_norm": 8.019327740566352, "learning_rate": 3.1827789927385444e-06, "loss": 0.0433, "step": 3530 }, { "epoch": 2.52, "grad_norm": 5.920055320148531, "learning_rate": 3.18008678692104e-06, "loss": 0.0533, "step": 3531 }, { "epoch": 2.52, "grad_norm": 4.226924007230088, "learning_rate": 3.1773951891703668e-06, "loss": 0.052, "step": 3532 }, { "epoch": 2.52, "grad_norm": 13.516443184870806, "learning_rate": 3.1747042003858386e-06, "loss": 0.0547, "step": 3533 }, { "epoch": 2.52, "grad_norm": 8.21547792246494, "learning_rate": 3.1720138214665643e-06, "loss": 0.0491, "step": 3534 }, { "epoch": 2.52, "grad_norm": 6.142068501852478, "learning_rate": 3.1693240533114496e-06, "loss": 0.0641, "step": 3535 }, { "epoch": 2.52, "grad_norm": 4.150763465920459, "learning_rate": 3.1666348968191955e-06, "loss": 0.0401, "step": 3536 }, { "epoch": 2.52, "grad_norm": 11.729987470097232, "learning_rate": 3.1639463528883007e-06, "loss": 0.0582, "step": 3537 }, { "epoch": 2.53, "grad_norm": 10.409400310191932, "learning_rate": 3.161258422417055e-06, "loss": 0.0383, "step": 3538 }, { "epoch": 2.53, "grad_norm": 6.526722578357467, "learning_rate": 3.1585711063035496e-06, "loss": 0.0496, "step": 3539 }, { "epoch": 2.53, "grad_norm": 4.236653388596862, "learning_rate": 3.155884405445663e-06, "loss": 0.0424, "step": 3540 }, { "epoch": 2.53, "grad_norm": 5.5839787734094095, "learning_rate": 3.153198320741074e-06, "loss": 0.0374, "step": 3541 }, { "epoch": 2.53, "grad_norm": 13.274151144936681, "learning_rate": 3.150512853087253e-06, "loss": 0.0564, "step": 3542 }, { "epoch": 2.53, "grad_norm": 9.510037031594369, "learning_rate": 3.1478280033814657e-06, "loss": 0.0466, "step": 3543 }, { "epoch": 2.53, "grad_norm": 6.862413054864041, "learning_rate": 3.14514377252077e-06, "loss": 0.0373, "step": 3544 }, { "epoch": 2.53, "grad_norm": 5.4795830545549435, "learning_rate": 3.142460161402014e-06, "loss": 0.0541, "step": 3545 }, { "epoch": 2.53, "grad_norm": 14.338633368147894, "learning_rate": 3.139777170921847e-06, "loss": 0.0584, "step": 3546 }, { "epoch": 2.53, "grad_norm": 8.975528387578118, "learning_rate": 3.137094801976701e-06, "loss": 0.0461, "step": 3547 }, { "epoch": 2.53, "grad_norm": 7.4199338093489215, "learning_rate": 3.1344130554628104e-06, "loss": 0.0346, "step": 3548 }, { "epoch": 2.53, "grad_norm": 10.49710065460678, "learning_rate": 3.131731932276193e-06, "loss": 0.0508, "step": 3549 }, { "epoch": 2.53, "grad_norm": 5.33530821774696, "learning_rate": 3.129051433312664e-06, "loss": 0.0599, "step": 3550 }, { "epoch": 2.53, "grad_norm": 5.796697346739547, "learning_rate": 3.1263715594678257e-06, "loss": 0.0528, "step": 3551 }, { "epoch": 2.54, "grad_norm": 3.240653694412567, "learning_rate": 3.1236923116370764e-06, "loss": 0.0381, "step": 3552 }, { "epoch": 2.54, "grad_norm": 2.9372123775383145, "learning_rate": 3.121013690715601e-06, "loss": 0.0376, "step": 3553 }, { "epoch": 2.54, "grad_norm": 7.609415911504751, "learning_rate": 3.118335697598376e-06, "loss": 0.0358, "step": 3554 }, { "epoch": 2.54, "grad_norm": 8.390856165835398, "learning_rate": 3.1156583331801703e-06, "loss": 0.0514, "step": 3555 }, { "epoch": 2.54, "grad_norm": 8.449747018529868, "learning_rate": 3.1129815983555387e-06, "loss": 0.0518, "step": 3556 }, { "epoch": 2.54, "grad_norm": 17.287771804155526, "learning_rate": 3.1103054940188316e-06, "loss": 0.0499, "step": 3557 }, { "epoch": 2.54, "grad_norm": 3.918114158102355, "learning_rate": 3.1076300210641814e-06, "loss": 0.0447, "step": 3558 }, { "epoch": 2.54, "grad_norm": 12.087731186438175, "learning_rate": 3.1049551803855173e-06, "loss": 0.0766, "step": 3559 }, { "epoch": 2.54, "grad_norm": 4.23804644083563, "learning_rate": 3.1022809728765486e-06, "loss": 0.0511, "step": 3560 }, { "epoch": 2.54, "grad_norm": 6.3009649089413236, "learning_rate": 3.0996073994307825e-06, "loss": 0.041, "step": 3561 }, { "epoch": 2.54, "grad_norm": 4.219391874758236, "learning_rate": 3.0969344609415076e-06, "loss": 0.0388, "step": 3562 }, { "epoch": 2.54, "grad_norm": 5.0531747625683945, "learning_rate": 3.0942621583017994e-06, "loss": 0.0349, "step": 3563 }, { "epoch": 2.54, "grad_norm": 9.97710030264506, "learning_rate": 3.0915904924045294e-06, "loss": 0.0431, "step": 3564 }, { "epoch": 2.54, "grad_norm": 9.397820259535381, "learning_rate": 3.088919464142346e-06, "loss": 0.0457, "step": 3565 }, { "epoch": 2.55, "grad_norm": 4.754455760260489, "learning_rate": 3.0862490744076928e-06, "loss": 0.0633, "step": 3566 }, { "epoch": 2.55, "grad_norm": 4.929988907403646, "learning_rate": 3.0835793240927937e-06, "loss": 0.0445, "step": 3567 }, { "epoch": 2.55, "grad_norm": 3.6203167310866937, "learning_rate": 3.0809102140896652e-06, "loss": 0.0346, "step": 3568 }, { "epoch": 2.55, "grad_norm": 8.143345190966631, "learning_rate": 3.078241745290103e-06, "loss": 0.0656, "step": 3569 }, { "epoch": 2.55, "grad_norm": 9.331281186466539, "learning_rate": 3.075573918585696e-06, "loss": 0.0452, "step": 3570 }, { "epoch": 2.55, "grad_norm": 10.05873032023487, "learning_rate": 3.0729067348678127e-06, "loss": 0.0308, "step": 3571 }, { "epoch": 2.55, "grad_norm": 4.45197339309329, "learning_rate": 3.0702401950276066e-06, "loss": 0.0505, "step": 3572 }, { "epoch": 2.55, "grad_norm": 6.511150541662428, "learning_rate": 3.067574299956022e-06, "loss": 0.0391, "step": 3573 }, { "epoch": 2.55, "grad_norm": 6.19790668856283, "learning_rate": 3.0649090505437804e-06, "loss": 0.0618, "step": 3574 }, { "epoch": 2.55, "grad_norm": 4.092533552558913, "learning_rate": 3.062244447681396e-06, "loss": 0.0362, "step": 3575 }, { "epoch": 2.55, "grad_norm": 3.328037295873963, "learning_rate": 3.0595804922591564e-06, "loss": 0.0398, "step": 3576 }, { "epoch": 2.55, "grad_norm": 11.908875157062429, "learning_rate": 3.0569171851671436e-06, "loss": 0.0689, "step": 3577 }, { "epoch": 2.55, "grad_norm": 10.261183867138291, "learning_rate": 3.054254527295215e-06, "loss": 0.0539, "step": 3578 }, { "epoch": 2.55, "grad_norm": 3.6021336026425312, "learning_rate": 3.0515925195330148e-06, "loss": 0.0393, "step": 3579 }, { "epoch": 2.56, "grad_norm": 5.4869403375119425, "learning_rate": 3.048931162769969e-06, "loss": 0.0507, "step": 3580 }, { "epoch": 2.56, "grad_norm": 16.354612895096743, "learning_rate": 3.0462704578952874e-06, "loss": 0.0511, "step": 3581 }, { "epoch": 2.56, "grad_norm": 7.463260414069788, "learning_rate": 3.0436104057979604e-06, "loss": 0.0478, "step": 3582 }, { "epoch": 2.56, "grad_norm": 8.65693289332545, "learning_rate": 3.0409510073667602e-06, "loss": 0.0355, "step": 3583 }, { "epoch": 2.56, "grad_norm": 4.322328157842049, "learning_rate": 3.038292263490242e-06, "loss": 0.0651, "step": 3584 }, { "epoch": 2.56, "grad_norm": 13.68926290517709, "learning_rate": 3.035634175056742e-06, "loss": 0.0544, "step": 3585 }, { "epoch": 2.56, "grad_norm": 10.244078321532088, "learning_rate": 3.0329767429543767e-06, "loss": 0.0561, "step": 3586 }, { "epoch": 2.56, "grad_norm": 4.0632698090854795, "learning_rate": 3.030319968071043e-06, "loss": 0.0345, "step": 3587 }, { "epoch": 2.56, "grad_norm": 2.8950494704080074, "learning_rate": 3.0276638512944177e-06, "loss": 0.0347, "step": 3588 }, { "epoch": 2.56, "grad_norm": 7.721157968690705, "learning_rate": 3.025008393511961e-06, "loss": 0.0644, "step": 3589 }, { "epoch": 2.56, "grad_norm": 14.006638808045752, "learning_rate": 3.022353595610909e-06, "loss": 0.0735, "step": 3590 }, { "epoch": 2.56, "grad_norm": 3.9140223429542447, "learning_rate": 3.01969945847828e-06, "loss": 0.043, "step": 3591 }, { "epoch": 2.56, "grad_norm": 9.900182325928498, "learning_rate": 3.017045983000871e-06, "loss": 0.0489, "step": 3592 }, { "epoch": 2.56, "grad_norm": 4.361887420728005, "learning_rate": 3.014393170065256e-06, "loss": 0.0556, "step": 3593 }, { "epoch": 2.57, "grad_norm": 13.67604306112046, "learning_rate": 3.0117410205577903e-06, "loss": 0.0477, "step": 3594 }, { "epoch": 2.57, "grad_norm": 9.916712836665532, "learning_rate": 3.0090895353646053e-06, "loss": 0.0385, "step": 3595 }, { "epoch": 2.57, "grad_norm": 3.3840417799657025, "learning_rate": 3.006438715371614e-06, "loss": 0.0473, "step": 3596 }, { "epoch": 2.57, "grad_norm": 7.2866366962959095, "learning_rate": 3.0037885614645e-06, "loss": 0.0517, "step": 3597 }, { "epoch": 2.57, "grad_norm": 10.141648265814117, "learning_rate": 3.001139074528735e-06, "loss": 0.0262, "step": 3598 }, { "epoch": 2.57, "grad_norm": 5.09068531568647, "learning_rate": 2.9984902554495556e-06, "loss": 0.0488, "step": 3599 }, { "epoch": 2.57, "grad_norm": 9.031353090048116, "learning_rate": 2.995842105111987e-06, "loss": 0.0409, "step": 3600 }, { "epoch": 2.57, "grad_norm": 6.990355483528864, "learning_rate": 2.99319462440082e-06, "loss": 0.0499, "step": 3601 }, { "epoch": 2.57, "grad_norm": 4.2774820742083435, "learning_rate": 2.990547814200633e-06, "loss": 0.0457, "step": 3602 }, { "epoch": 2.57, "grad_norm": 9.343295864443792, "learning_rate": 2.987901675395771e-06, "loss": 0.0399, "step": 3603 }, { "epoch": 2.57, "grad_norm": 9.517038015733123, "learning_rate": 2.985256208870357e-06, "loss": 0.0621, "step": 3604 }, { "epoch": 2.57, "grad_norm": 7.376829239076995, "learning_rate": 2.982611415508294e-06, "loss": 0.0432, "step": 3605 }, { "epoch": 2.57, "grad_norm": 5.701404340515294, "learning_rate": 2.9799672961932525e-06, "loss": 0.0607, "step": 3606 }, { "epoch": 2.57, "grad_norm": 4.540746888215145, "learning_rate": 2.9773238518086866e-06, "loss": 0.0315, "step": 3607 }, { "epoch": 2.58, "grad_norm": 5.28757599597712, "learning_rate": 2.974681083237816e-06, "loss": 0.0513, "step": 3608 }, { "epoch": 2.58, "grad_norm": 12.344327625926315, "learning_rate": 2.972038991363643e-06, "loss": 0.0467, "step": 3609 }, { "epoch": 2.58, "grad_norm": 10.626885297699232, "learning_rate": 2.9693975770689344e-06, "loss": 0.0414, "step": 3610 }, { "epoch": 2.58, "grad_norm": 5.12103907244829, "learning_rate": 2.9667568412362415e-06, "loss": 0.0318, "step": 3611 }, { "epoch": 2.58, "grad_norm": 13.006583054701983, "learning_rate": 2.9641167847478797e-06, "loss": 0.0516, "step": 3612 }, { "epoch": 2.58, "grad_norm": 7.09256099551172, "learning_rate": 2.96147740848594e-06, "loss": 0.0533, "step": 3613 }, { "epoch": 2.58, "grad_norm": 6.204069778734054, "learning_rate": 2.9588387133322903e-06, "loss": 0.0599, "step": 3614 }, { "epoch": 2.58, "grad_norm": 21.147619366359972, "learning_rate": 2.9562007001685644e-06, "loss": 0.0424, "step": 3615 }, { "epoch": 2.58, "grad_norm": 9.26229905936863, "learning_rate": 2.9535633698761755e-06, "loss": 0.0731, "step": 3616 }, { "epoch": 2.58, "grad_norm": 12.74094699586504, "learning_rate": 2.9509267233363005e-06, "loss": 0.0692, "step": 3617 }, { "epoch": 2.58, "grad_norm": 4.182529756900972, "learning_rate": 2.948290761429895e-06, "loss": 0.0459, "step": 3618 }, { "epoch": 2.58, "grad_norm": 13.62462515024402, "learning_rate": 2.9456554850376805e-06, "loss": 0.0695, "step": 3619 }, { "epoch": 2.58, "grad_norm": 5.9507955856152925, "learning_rate": 2.943020895040155e-06, "loss": 0.0305, "step": 3620 }, { "epoch": 2.58, "grad_norm": 34.83572138414297, "learning_rate": 2.940386992317582e-06, "loss": 0.073, "step": 3621 }, { "epoch": 2.59, "grad_norm": 3.677257365244086, "learning_rate": 2.937753777749996e-06, "loss": 0.032, "step": 3622 }, { "epoch": 2.59, "grad_norm": 13.762342920713616, "learning_rate": 2.9351212522172056e-06, "loss": 0.0811, "step": 3623 }, { "epoch": 2.59, "grad_norm": 10.635154432789744, "learning_rate": 2.9324894165987837e-06, "loss": 0.0372, "step": 3624 }, { "epoch": 2.59, "grad_norm": 11.482810263694958, "learning_rate": 2.9298582717740797e-06, "loss": 0.0728, "step": 3625 }, { "epoch": 2.59, "grad_norm": 22.37942524618786, "learning_rate": 2.9272278186222025e-06, "loss": 0.0446, "step": 3626 }, { "epoch": 2.59, "grad_norm": 9.388487880388006, "learning_rate": 2.9245980580220405e-06, "loss": 0.0342, "step": 3627 }, { "epoch": 2.59, "grad_norm": 6.033256714344545, "learning_rate": 2.921968990852242e-06, "loss": 0.0581, "step": 3628 }, { "epoch": 2.59, "grad_norm": 5.768022756457545, "learning_rate": 2.9193406179912297e-06, "loss": 0.0361, "step": 3629 }, { "epoch": 2.59, "grad_norm": 8.174036176719323, "learning_rate": 2.91671294031719e-06, "loss": 0.037, "step": 3630 }, { "epoch": 2.59, "grad_norm": 10.705237470263434, "learning_rate": 2.91408595870808e-06, "loss": 0.0402, "step": 3631 }, { "epoch": 2.59, "grad_norm": 9.964496232913909, "learning_rate": 2.9114596740416224e-06, "loss": 0.0462, "step": 3632 }, { "epoch": 2.59, "grad_norm": 10.145165458346774, "learning_rate": 2.908834087195308e-06, "loss": 0.046, "step": 3633 }, { "epoch": 2.59, "grad_norm": 5.38954700874282, "learning_rate": 2.9062091990463935e-06, "loss": 0.0441, "step": 3634 }, { "epoch": 2.59, "grad_norm": 13.879887451783997, "learning_rate": 2.903585010471904e-06, "loss": 0.0697, "step": 3635 }, { "epoch": 2.6, "grad_norm": 14.995211769676875, "learning_rate": 2.9009615223486297e-06, "loss": 0.0709, "step": 3636 }, { "epoch": 2.6, "grad_norm": 15.90824468650359, "learning_rate": 2.898338735553128e-06, "loss": 0.0475, "step": 3637 }, { "epoch": 2.6, "grad_norm": 15.039762855178544, "learning_rate": 2.895716650961714e-06, "loss": 0.0751, "step": 3638 }, { "epoch": 2.6, "grad_norm": 8.403369327327413, "learning_rate": 2.8930952694504843e-06, "loss": 0.0774, "step": 3639 }, { "epoch": 2.6, "grad_norm": 2.68859343539843, "learning_rate": 2.8904745918952833e-06, "loss": 0.031, "step": 3640 }, { "epoch": 2.6, "grad_norm": 7.863619642319471, "learning_rate": 2.887854619171735e-06, "loss": 0.0398, "step": 3641 }, { "epoch": 2.6, "grad_norm": 7.469206438001187, "learning_rate": 2.8852353521552135e-06, "loss": 0.0381, "step": 3642 }, { "epoch": 2.6, "grad_norm": 6.029345864226802, "learning_rate": 2.8826167917208727e-06, "loss": 0.0405, "step": 3643 }, { "epoch": 2.6, "grad_norm": 3.487371662761291, "learning_rate": 2.8799989387436137e-06, "loss": 0.0357, "step": 3644 }, { "epoch": 2.6, "grad_norm": 10.641719996018361, "learning_rate": 2.8773817940981186e-06, "loss": 0.0533, "step": 3645 }, { "epoch": 2.6, "grad_norm": 6.635229300228402, "learning_rate": 2.8747653586588183e-06, "loss": 0.0632, "step": 3646 }, { "epoch": 2.6, "grad_norm": 9.041375472119013, "learning_rate": 2.872149633299913e-06, "loss": 0.0475, "step": 3647 }, { "epoch": 2.6, "grad_norm": 7.537386679553337, "learning_rate": 2.8695346188953666e-06, "loss": 0.0461, "step": 3648 }, { "epoch": 2.6, "grad_norm": 5.14722877172696, "learning_rate": 2.866920316318904e-06, "loss": 0.048, "step": 3649 }, { "epoch": 2.61, "grad_norm": 14.424463819672413, "learning_rate": 2.8643067264440116e-06, "loss": 0.0525, "step": 3650 }, { "epoch": 2.61, "grad_norm": 13.853378771747693, "learning_rate": 2.8616938501439384e-06, "loss": 0.0692, "step": 3651 }, { "epoch": 2.61, "grad_norm": 18.270438337227063, "learning_rate": 2.8590816882916948e-06, "loss": 0.0568, "step": 3652 }, { "epoch": 2.61, "grad_norm": 18.947419081569944, "learning_rate": 2.856470241760054e-06, "loss": 0.0561, "step": 3653 }, { "epoch": 2.61, "grad_norm": 4.429111310023221, "learning_rate": 2.8538595114215472e-06, "loss": 0.0626, "step": 3654 }, { "epoch": 2.61, "grad_norm": 4.475739674756901, "learning_rate": 2.8512494981484706e-06, "loss": 0.0581, "step": 3655 }, { "epoch": 2.61, "grad_norm": 7.771125239389673, "learning_rate": 2.848640202812872e-06, "loss": 0.0417, "step": 3656 }, { "epoch": 2.61, "grad_norm": 23.062847695927893, "learning_rate": 2.846031626286574e-06, "loss": 0.0508, "step": 3657 }, { "epoch": 2.61, "grad_norm": 11.972621678603849, "learning_rate": 2.8434237694411414e-06, "loss": 0.0488, "step": 3658 }, { "epoch": 2.61, "grad_norm": 14.32588978216122, "learning_rate": 2.840816633147917e-06, "loss": 0.0611, "step": 3659 }, { "epoch": 2.61, "grad_norm": 5.884934135428629, "learning_rate": 2.8382102182779846e-06, "loss": 0.0441, "step": 3660 }, { "epoch": 2.61, "grad_norm": 9.351592698422403, "learning_rate": 2.8356045257022037e-06, "loss": 0.0354, "step": 3661 }, { "epoch": 2.61, "grad_norm": 6.957720102055877, "learning_rate": 2.832999556291177e-06, "loss": 0.0375, "step": 3662 }, { "epoch": 2.61, "grad_norm": 10.048379667158468, "learning_rate": 2.8303953109152815e-06, "loss": 0.0575, "step": 3663 }, { "epoch": 2.62, "grad_norm": 16.31872426637752, "learning_rate": 2.827791790444638e-06, "loss": 0.045, "step": 3664 }, { "epoch": 2.62, "grad_norm": 5.464392634847121, "learning_rate": 2.8251889957491317e-06, "loss": 0.0614, "step": 3665 }, { "epoch": 2.62, "grad_norm": 7.507048236348415, "learning_rate": 2.822586927698407e-06, "loss": 0.0433, "step": 3666 }, { "epoch": 2.62, "grad_norm": 9.366777000694293, "learning_rate": 2.819985587161861e-06, "loss": 0.0492, "step": 3667 }, { "epoch": 2.62, "grad_norm": 18.342694251690276, "learning_rate": 2.8173849750086513e-06, "loss": 0.0646, "step": 3668 }, { "epoch": 2.62, "grad_norm": 13.408885923969533, "learning_rate": 2.8147850921076903e-06, "loss": 0.072, "step": 3669 }, { "epoch": 2.62, "grad_norm": 5.859363792224824, "learning_rate": 2.8121859393276475e-06, "loss": 0.0466, "step": 3670 }, { "epoch": 2.62, "grad_norm": 6.544712210188832, "learning_rate": 2.809587517536947e-06, "loss": 0.0583, "step": 3671 }, { "epoch": 2.62, "grad_norm": 10.49401514565673, "learning_rate": 2.806989827603771e-06, "loss": 0.053, "step": 3672 }, { "epoch": 2.62, "grad_norm": 4.011757972273914, "learning_rate": 2.8043928703960565e-06, "loss": 0.0323, "step": 3673 }, { "epoch": 2.62, "grad_norm": 2.960978152701493, "learning_rate": 2.8017966467814933e-06, "loss": 0.0303, "step": 3674 }, { "epoch": 2.62, "grad_norm": 14.114822282619247, "learning_rate": 2.7992011576275295e-06, "loss": 0.0411, "step": 3675 }, { "epoch": 2.62, "grad_norm": 7.15626451156831, "learning_rate": 2.7966064038013657e-06, "loss": 0.0564, "step": 3676 }, { "epoch": 2.62, "grad_norm": 9.494513074846147, "learning_rate": 2.7940123861699577e-06, "loss": 0.0399, "step": 3677 }, { "epoch": 2.63, "grad_norm": 5.069632762089934, "learning_rate": 2.7914191056000147e-06, "loss": 0.0523, "step": 3678 }, { "epoch": 2.63, "grad_norm": 10.285203626675607, "learning_rate": 2.788826562958e-06, "loss": 0.0495, "step": 3679 }, { "epoch": 2.63, "grad_norm": 12.855691316396912, "learning_rate": 2.7862347591101326e-06, "loss": 0.051, "step": 3680 }, { "epoch": 2.63, "grad_norm": 8.43463699635779, "learning_rate": 2.7836436949223755e-06, "loss": 0.0451, "step": 3681 }, { "epoch": 2.63, "grad_norm": 6.876214234656952, "learning_rate": 2.78105337126046e-06, "loss": 0.0444, "step": 3682 }, { "epoch": 2.63, "grad_norm": 16.846348195822873, "learning_rate": 2.7784637889898534e-06, "loss": 0.0648, "step": 3683 }, { "epoch": 2.63, "grad_norm": 5.6815124763738485, "learning_rate": 2.7758749489757914e-06, "loss": 0.0646, "step": 3684 }, { "epoch": 2.63, "grad_norm": 10.381951130054908, "learning_rate": 2.7732868520832455e-06, "loss": 0.0453, "step": 3685 }, { "epoch": 2.63, "grad_norm": 11.46923666061598, "learning_rate": 2.770699499176954e-06, "loss": 0.051, "step": 3686 }, { "epoch": 2.63, "grad_norm": 5.726611333652352, "learning_rate": 2.768112891121394e-06, "loss": 0.0314, "step": 3687 }, { "epoch": 2.63, "grad_norm": 15.030148911582819, "learning_rate": 2.7655270287808045e-06, "loss": 0.0557, "step": 3688 }, { "epoch": 2.63, "grad_norm": 15.463593669759398, "learning_rate": 2.762941913019166e-06, "loss": 0.0581, "step": 3689 }, { "epoch": 2.63, "grad_norm": 6.6863472657093315, "learning_rate": 2.760357544700215e-06, "loss": 0.0718, "step": 3690 }, { "epoch": 2.63, "grad_norm": 14.381312588146585, "learning_rate": 2.757773924687437e-06, "loss": 0.0497, "step": 3691 }, { "epoch": 2.64, "grad_norm": 8.724171215718298, "learning_rate": 2.755191053844068e-06, "loss": 0.0458, "step": 3692 }, { "epoch": 2.64, "grad_norm": 8.172858308911552, "learning_rate": 2.7526089330330925e-06, "loss": 0.054, "step": 3693 }, { "epoch": 2.64, "grad_norm": 8.290284625175802, "learning_rate": 2.7500275631172455e-06, "loss": 0.0347, "step": 3694 }, { "epoch": 2.64, "grad_norm": 5.764092240236844, "learning_rate": 2.74744694495901e-06, "loss": 0.0269, "step": 3695 }, { "epoch": 2.64, "grad_norm": 8.50315536403876, "learning_rate": 2.74486707942062e-06, "loss": 0.0414, "step": 3696 }, { "epoch": 2.64, "grad_norm": 17.67269423715195, "learning_rate": 2.7422879673640552e-06, "loss": 0.0577, "step": 3697 }, { "epoch": 2.64, "grad_norm": 13.286685334149144, "learning_rate": 2.7397096096510467e-06, "loss": 0.0527, "step": 3698 }, { "epoch": 2.64, "grad_norm": 5.190362466075019, "learning_rate": 2.7371320071430674e-06, "loss": 0.0354, "step": 3699 }, { "epoch": 2.64, "grad_norm": 7.222587422386342, "learning_rate": 2.7345551607013475e-06, "loss": 0.04, "step": 3700 }, { "epoch": 2.64, "grad_norm": 4.454442361570336, "learning_rate": 2.7319790711868545e-06, "loss": 0.0438, "step": 3701 }, { "epoch": 2.64, "grad_norm": 4.525361617331926, "learning_rate": 2.7294037394603135e-06, "loss": 0.0638, "step": 3702 }, { "epoch": 2.64, "grad_norm": 6.9455554028374324, "learning_rate": 2.7268291663821825e-06, "loss": 0.0435, "step": 3703 }, { "epoch": 2.64, "grad_norm": 4.868096567440874, "learning_rate": 2.7242553528126842e-06, "loss": 0.0508, "step": 3704 }, { "epoch": 2.64, "grad_norm": 6.558699133066451, "learning_rate": 2.72168229961177e-06, "loss": 0.0406, "step": 3705 }, { "epoch": 2.65, "grad_norm": 4.402340494429636, "learning_rate": 2.7191100076391473e-06, "loss": 0.0454, "step": 3706 }, { "epoch": 2.65, "grad_norm": 12.420379796085895, "learning_rate": 2.716538477754266e-06, "loss": 0.0583, "step": 3707 }, { "epoch": 2.65, "grad_norm": 6.1149424905859675, "learning_rate": 2.713967710816323e-06, "loss": 0.0345, "step": 3708 }, { "epoch": 2.65, "grad_norm": 6.348271002030884, "learning_rate": 2.7113977076842597e-06, "loss": 0.0576, "step": 3709 }, { "epoch": 2.65, "grad_norm": 7.584921780363273, "learning_rate": 2.7088284692167604e-06, "loss": 0.0529, "step": 3710 }, { "epoch": 2.65, "grad_norm": 9.51983997797159, "learning_rate": 2.7062599962722563e-06, "loss": 0.0416, "step": 3711 }, { "epoch": 2.65, "grad_norm": 7.319440057301574, "learning_rate": 2.703692289708922e-06, "loss": 0.0438, "step": 3712 }, { "epoch": 2.65, "grad_norm": 26.01010056382804, "learning_rate": 2.701125350384676e-06, "loss": 0.0746, "step": 3713 }, { "epoch": 2.65, "grad_norm": 11.765662745979094, "learning_rate": 2.69855917915718e-06, "loss": 0.0467, "step": 3714 }, { "epoch": 2.65, "grad_norm": 23.418590205484847, "learning_rate": 2.695993776883839e-06, "loss": 0.0595, "step": 3715 }, { "epoch": 2.65, "grad_norm": 14.703328795567954, "learning_rate": 2.693429144421803e-06, "loss": 0.0302, "step": 3716 }, { "epoch": 2.65, "grad_norm": 23.578741481643345, "learning_rate": 2.6908652826279623e-06, "loss": 0.0658, "step": 3717 }, { "epoch": 2.65, "grad_norm": 5.857687742436122, "learning_rate": 2.688302192358952e-06, "loss": 0.0638, "step": 3718 }, { "epoch": 2.65, "grad_norm": 8.405158533466244, "learning_rate": 2.6857398744711472e-06, "loss": 0.0517, "step": 3719 }, { "epoch": 2.66, "grad_norm": 20.846618406862838, "learning_rate": 2.683178329820666e-06, "loss": 0.074, "step": 3720 }, { "epoch": 2.66, "grad_norm": 16.091758689157754, "learning_rate": 2.680617559263368e-06, "loss": 0.0647, "step": 3721 }, { "epoch": 2.66, "grad_norm": 5.701177069467726, "learning_rate": 2.6780575636548544e-06, "loss": 0.0526, "step": 3722 }, { "epoch": 2.66, "grad_norm": 13.59914895192763, "learning_rate": 2.67549834385047e-06, "loss": 0.0457, "step": 3723 }, { "epoch": 2.66, "grad_norm": 6.660108186626482, "learning_rate": 2.67293990070529e-06, "loss": 0.0396, "step": 3724 }, { "epoch": 2.66, "grad_norm": 3.818886071754114, "learning_rate": 2.6703822350741483e-06, "loss": 0.0388, "step": 3725 }, { "epoch": 2.66, "grad_norm": 7.17966247766626, "learning_rate": 2.6678253478116e-06, "loss": 0.0617, "step": 3726 }, { "epoch": 2.66, "grad_norm": 15.063926908466604, "learning_rate": 2.665269239771953e-06, "loss": 0.0401, "step": 3727 }, { "epoch": 2.66, "grad_norm": 17.589769650176894, "learning_rate": 2.662713911809248e-06, "loss": 0.0502, "step": 3728 }, { "epoch": 2.66, "grad_norm": 7.171926503654985, "learning_rate": 2.6601593647772696e-06, "loss": 0.0394, "step": 3729 }, { "epoch": 2.66, "grad_norm": 11.235108267848334, "learning_rate": 2.657605599529538e-06, "loss": 0.055, "step": 3730 }, { "epoch": 2.66, "grad_norm": 6.427120667435036, "learning_rate": 2.6550526169193148e-06, "loss": 0.0506, "step": 3731 }, { "epoch": 2.66, "grad_norm": 3.6786946918606946, "learning_rate": 2.6525004177995984e-06, "loss": 0.0367, "step": 3732 }, { "epoch": 2.66, "grad_norm": 11.498439922011594, "learning_rate": 2.6499490030231255e-06, "loss": 0.0499, "step": 3733 }, { "epoch": 2.67, "grad_norm": 5.251312011644968, "learning_rate": 2.6473983734423725e-06, "loss": 0.0386, "step": 3734 }, { "epoch": 2.67, "grad_norm": 8.575386085669063, "learning_rate": 2.644848529909552e-06, "loss": 0.0491, "step": 3735 }, { "epoch": 2.67, "grad_norm": 8.930496092355702, "learning_rate": 2.6422994732766124e-06, "loss": 0.0331, "step": 3736 }, { "epoch": 2.67, "grad_norm": 6.071195064464973, "learning_rate": 2.6397512043952422e-06, "loss": 0.0522, "step": 3737 }, { "epoch": 2.67, "grad_norm": 7.623010868928134, "learning_rate": 2.637203724116865e-06, "loss": 0.0363, "step": 3738 }, { "epoch": 2.67, "grad_norm": 9.317615708928468, "learning_rate": 2.634657033292644e-06, "loss": 0.0439, "step": 3739 }, { "epoch": 2.67, "grad_norm": 4.679344162012358, "learning_rate": 2.6321111327734693e-06, "loss": 0.041, "step": 3740 }, { "epoch": 2.67, "grad_norm": 10.628255304414862, "learning_rate": 2.6295660234099816e-06, "loss": 0.0474, "step": 3741 }, { "epoch": 2.67, "grad_norm": 3.7547535821491653, "learning_rate": 2.6270217060525416e-06, "loss": 0.0542, "step": 3742 }, { "epoch": 2.67, "grad_norm": 9.99181954191989, "learning_rate": 2.624478181551261e-06, "loss": 0.0369, "step": 3743 }, { "epoch": 2.67, "grad_norm": 7.006118145287793, "learning_rate": 2.62193545075597e-06, "loss": 0.0498, "step": 3744 }, { "epoch": 2.67, "grad_norm": 3.068279415859639, "learning_rate": 2.6193935145162507e-06, "loss": 0.0294, "step": 3745 }, { "epoch": 2.67, "grad_norm": 8.203047800359975, "learning_rate": 2.6168523736814035e-06, "loss": 0.0416, "step": 3746 }, { "epoch": 2.67, "grad_norm": 13.181075968942887, "learning_rate": 2.6143120291004785e-06, "loss": 0.0476, "step": 3747 }, { "epoch": 2.68, "grad_norm": 5.395303454942057, "learning_rate": 2.611772481622246e-06, "loss": 0.0324, "step": 3748 }, { "epoch": 2.68, "grad_norm": 8.82113177422998, "learning_rate": 2.609233732095218e-06, "loss": 0.0478, "step": 3749 }, { "epoch": 2.68, "grad_norm": 14.848385000891767, "learning_rate": 2.6066957813676375e-06, "loss": 0.0469, "step": 3750 }, { "epoch": 2.68, "grad_norm": 4.738432033718035, "learning_rate": 2.604158630287482e-06, "loss": 0.0406, "step": 3751 }, { "epoch": 2.68, "grad_norm": 6.340406715674136, "learning_rate": 2.60162227970246e-06, "loss": 0.0437, "step": 3752 }, { "epoch": 2.68, "grad_norm": 7.33847764367545, "learning_rate": 2.5990867304600136e-06, "loss": 0.0372, "step": 3753 }, { "epoch": 2.68, "grad_norm": 3.166395335908612, "learning_rate": 2.5965519834073172e-06, "loss": 0.0388, "step": 3754 }, { "epoch": 2.68, "grad_norm": 7.976267116665653, "learning_rate": 2.5940180393912767e-06, "loss": 0.0428, "step": 3755 }, { "epoch": 2.68, "grad_norm": 13.601712677500139, "learning_rate": 2.5914848992585293e-06, "loss": 0.0574, "step": 3756 }, { "epoch": 2.68, "grad_norm": 8.172580723390993, "learning_rate": 2.588952563855448e-06, "loss": 0.0438, "step": 3757 }, { "epoch": 2.68, "grad_norm": 7.078598667043399, "learning_rate": 2.5864210340281247e-06, "loss": 0.041, "step": 3758 }, { "epoch": 2.68, "grad_norm": 10.010293320385502, "learning_rate": 2.5838903106224004e-06, "loss": 0.0445, "step": 3759 }, { "epoch": 2.68, "grad_norm": 8.32877855610212, "learning_rate": 2.5813603944838283e-06, "loss": 0.0509, "step": 3760 }, { "epoch": 2.68, "grad_norm": 8.156187892342613, "learning_rate": 2.578831286457708e-06, "loss": 0.0385, "step": 3761 }, { "epoch": 2.69, "grad_norm": 8.634329209857926, "learning_rate": 2.5763029873890542e-06, "loss": 0.0607, "step": 3762 }, { "epoch": 2.69, "grad_norm": 10.967700021524228, "learning_rate": 2.573775498122626e-06, "loss": 0.0626, "step": 3763 }, { "epoch": 2.69, "grad_norm": 7.039557541057048, "learning_rate": 2.5712488195028972e-06, "loss": 0.0526, "step": 3764 }, { "epoch": 2.69, "grad_norm": 4.977594670286856, "learning_rate": 2.5687229523740852e-06, "loss": 0.0497, "step": 3765 }, { "epoch": 2.69, "grad_norm": 14.378683618047381, "learning_rate": 2.566197897580124e-06, "loss": 0.0786, "step": 3766 }, { "epoch": 2.69, "grad_norm": 11.70648699510383, "learning_rate": 2.5636736559646824e-06, "loss": 0.0514, "step": 3767 }, { "epoch": 2.69, "grad_norm": 9.415184037952974, "learning_rate": 2.5611502283711576e-06, "loss": 0.0461, "step": 3768 }, { "epoch": 2.69, "grad_norm": 5.61280952127481, "learning_rate": 2.5586276156426726e-06, "loss": 0.0415, "step": 3769 }, { "epoch": 2.69, "grad_norm": 10.422217664943227, "learning_rate": 2.55610581862208e-06, "loss": 0.0533, "step": 3770 }, { "epoch": 2.69, "grad_norm": 9.032558758185202, "learning_rate": 2.553584838151959e-06, "loss": 0.0513, "step": 3771 }, { "epoch": 2.69, "grad_norm": 9.894389545613095, "learning_rate": 2.5510646750746154e-06, "loss": 0.0457, "step": 3772 }, { "epoch": 2.69, "grad_norm": 9.655381132642049, "learning_rate": 2.548545330232083e-06, "loss": 0.0419, "step": 3773 }, { "epoch": 2.69, "grad_norm": 5.7174887879327745, "learning_rate": 2.5460268044661215e-06, "loss": 0.0477, "step": 3774 }, { "epoch": 2.69, "grad_norm": 16.20332294670294, "learning_rate": 2.5435090986182176e-06, "loss": 0.0663, "step": 3775 }, { "epoch": 2.7, "grad_norm": 7.352189227072673, "learning_rate": 2.5409922135295827e-06, "loss": 0.0466, "step": 3776 }, { "epoch": 2.7, "grad_norm": 5.547528471853324, "learning_rate": 2.538476150041156e-06, "loss": 0.032, "step": 3777 }, { "epoch": 2.7, "grad_norm": 9.148875990223138, "learning_rate": 2.5359609089936006e-06, "loss": 0.0935, "step": 3778 }, { "epoch": 2.7, "grad_norm": 7.890951821809732, "learning_rate": 2.533446491227305e-06, "loss": 0.0345, "step": 3779 }, { "epoch": 2.7, "grad_norm": 11.425432370389863, "learning_rate": 2.5309328975823834e-06, "loss": 0.051, "step": 3780 }, { "epoch": 2.7, "grad_norm": 7.139640801191179, "learning_rate": 2.5284201288986744e-06, "loss": 0.0556, "step": 3781 }, { "epoch": 2.7, "grad_norm": 12.38512234939895, "learning_rate": 2.5259081860157418e-06, "loss": 0.0362, "step": 3782 }, { "epoch": 2.7, "grad_norm": 6.283048395599437, "learning_rate": 2.5233970697728673e-06, "loss": 0.0679, "step": 3783 }, { "epoch": 2.7, "grad_norm": 7.575758374522927, "learning_rate": 2.520886781009068e-06, "loss": 0.0566, "step": 3784 }, { "epoch": 2.7, "grad_norm": 4.885508330549097, "learning_rate": 2.5183773205630726e-06, "loss": 0.0316, "step": 3785 }, { "epoch": 2.7, "grad_norm": 15.694594828315246, "learning_rate": 2.515868689273344e-06, "loss": 0.0845, "step": 3786 }, { "epoch": 2.7, "grad_norm": 9.405452769181828, "learning_rate": 2.513360887978056e-06, "loss": 0.0417, "step": 3787 }, { "epoch": 2.7, "grad_norm": 3.6408201726978895, "learning_rate": 2.510853917515119e-06, "loss": 0.0554, "step": 3788 }, { "epoch": 2.7, "grad_norm": 4.7474944897018405, "learning_rate": 2.50834777872215e-06, "loss": 0.036, "step": 3789 }, { "epoch": 2.71, "grad_norm": 6.32046437072879, "learning_rate": 2.505842472436506e-06, "loss": 0.0375, "step": 3790 }, { "epoch": 2.71, "grad_norm": 4.876168319824173, "learning_rate": 2.5033379994952493e-06, "loss": 0.0405, "step": 3791 }, { "epoch": 2.71, "grad_norm": 3.4487084790696407, "learning_rate": 2.5008343607351733e-06, "loss": 0.0408, "step": 3792 }, { "epoch": 2.71, "grad_norm": 21.368691291832302, "learning_rate": 2.4983315569927895e-06, "loss": 0.0717, "step": 3793 }, { "epoch": 2.71, "grad_norm": 5.394868150765753, "learning_rate": 2.495829589104333e-06, "loss": 0.0406, "step": 3794 }, { "epoch": 2.71, "grad_norm": 7.701013264562421, "learning_rate": 2.493328457905755e-06, "loss": 0.0374, "step": 3795 }, { "epoch": 2.71, "grad_norm": 9.241059362038095, "learning_rate": 2.490828164232732e-06, "loss": 0.0476, "step": 3796 }, { "epoch": 2.71, "grad_norm": 10.070938365629022, "learning_rate": 2.4883287089206582e-06, "loss": 0.0363, "step": 3797 }, { "epoch": 2.71, "grad_norm": 9.776212143814869, "learning_rate": 2.48583009280465e-06, "loss": 0.0464, "step": 3798 }, { "epoch": 2.71, "grad_norm": 3.2565285582268926, "learning_rate": 2.483332316719535e-06, "loss": 0.0363, "step": 3799 }, { "epoch": 2.71, "grad_norm": 3.0244463261051457, "learning_rate": 2.4808353814998747e-06, "loss": 0.0432, "step": 3800 }, { "epoch": 2.71, "grad_norm": 11.23437776694193, "learning_rate": 2.4783392879799345e-06, "loss": 0.0754, "step": 3801 }, { "epoch": 2.71, "grad_norm": 7.504406874948566, "learning_rate": 2.4758440369937125e-06, "loss": 0.0468, "step": 3802 }, { "epoch": 2.71, "grad_norm": 11.839846713260169, "learning_rate": 2.4733496293749116e-06, "loss": 0.0382, "step": 3803 }, { "epoch": 2.72, "grad_norm": 15.580980042235241, "learning_rate": 2.4708560659569665e-06, "loss": 0.0377, "step": 3804 }, { "epoch": 2.72, "grad_norm": 4.915771833170678, "learning_rate": 2.4683633475730158e-06, "loss": 0.0426, "step": 3805 }, { "epoch": 2.72, "grad_norm": 3.7801116481824035, "learning_rate": 2.465871475055931e-06, "loss": 0.0416, "step": 3806 }, { "epoch": 2.72, "grad_norm": 13.257844295682492, "learning_rate": 2.4633804492382866e-06, "loss": 0.0469, "step": 3807 }, { "epoch": 2.72, "grad_norm": 2.321691675555728, "learning_rate": 2.460890270952383e-06, "loss": 0.0398, "step": 3808 }, { "epoch": 2.72, "grad_norm": 5.925469089728795, "learning_rate": 2.4584009410302357e-06, "loss": 0.0483, "step": 3809 }, { "epoch": 2.72, "grad_norm": 5.729798068275378, "learning_rate": 2.4559124603035744e-06, "loss": 0.0408, "step": 3810 }, { "epoch": 2.72, "grad_norm": 8.12961700972875, "learning_rate": 2.4534248296038488e-06, "loss": 0.0488, "step": 3811 }, { "epoch": 2.72, "grad_norm": 3.6114805207114986, "learning_rate": 2.4509380497622208e-06, "loss": 0.031, "step": 3812 }, { "epoch": 2.72, "grad_norm": 10.032594319995873, "learning_rate": 2.448452121609571e-06, "loss": 0.0432, "step": 3813 }, { "epoch": 2.72, "grad_norm": 6.816652452453811, "learning_rate": 2.445967045976493e-06, "loss": 0.0637, "step": 3814 }, { "epoch": 2.72, "grad_norm": 8.219953176187644, "learning_rate": 2.443482823693298e-06, "loss": 0.0431, "step": 3815 }, { "epoch": 2.72, "grad_norm": 6.300399021489175, "learning_rate": 2.4409994555900125e-06, "loss": 0.0393, "step": 3816 }, { "epoch": 2.72, "grad_norm": 10.553580034207647, "learning_rate": 2.4385169424963696e-06, "loss": 0.0486, "step": 3817 }, { "epoch": 2.73, "grad_norm": 3.250056961602972, "learning_rate": 2.4360352852418305e-06, "loss": 0.038, "step": 3818 }, { "epoch": 2.73, "grad_norm": 2.5109869107834233, "learning_rate": 2.4335544846555564e-06, "loss": 0.0341, "step": 3819 }, { "epoch": 2.73, "grad_norm": 3.141809200082095, "learning_rate": 2.431074541566436e-06, "loss": 0.0346, "step": 3820 }, { "epoch": 2.73, "grad_norm": 8.93333143773801, "learning_rate": 2.4285954568030566e-06, "loss": 0.0596, "step": 3821 }, { "epoch": 2.73, "grad_norm": 2.8270654820116854, "learning_rate": 2.426117231193735e-06, "loss": 0.0378, "step": 3822 }, { "epoch": 2.73, "grad_norm": 6.114877716065037, "learning_rate": 2.4236398655664834e-06, "loss": 0.0361, "step": 3823 }, { "epoch": 2.73, "grad_norm": 11.797891191634468, "learning_rate": 2.4211633607490442e-06, "loss": 0.0575, "step": 3824 }, { "epoch": 2.73, "grad_norm": 10.73316573754624, "learning_rate": 2.4186877175688576e-06, "loss": 0.0678, "step": 3825 }, { "epoch": 2.73, "grad_norm": 5.071528261076872, "learning_rate": 2.4162129368530848e-06, "loss": 0.053, "step": 3826 }, { "epoch": 2.73, "grad_norm": 5.82366753919729, "learning_rate": 2.413739019428595e-06, "loss": 0.0371, "step": 3827 }, { "epoch": 2.73, "grad_norm": 4.408441529976167, "learning_rate": 2.41126596612197e-06, "loss": 0.0352, "step": 3828 }, { "epoch": 2.73, "grad_norm": 2.9553768058089136, "learning_rate": 2.408793777759504e-06, "loss": 0.0439, "step": 3829 }, { "epoch": 2.73, "grad_norm": 7.364755485236182, "learning_rate": 2.4063224551672e-06, "loss": 0.0334, "step": 3830 }, { "epoch": 2.73, "grad_norm": 7.684527850853966, "learning_rate": 2.4038519991707725e-06, "loss": 0.0483, "step": 3831 }, { "epoch": 2.74, "grad_norm": 9.183703665017836, "learning_rate": 2.4013824105956483e-06, "loss": 0.054, "step": 3832 }, { "epoch": 2.74, "grad_norm": 4.171026788107983, "learning_rate": 2.3989136902669614e-06, "loss": 0.0352, "step": 3833 }, { "epoch": 2.74, "grad_norm": 19.60009405514915, "learning_rate": 2.396445839009558e-06, "loss": 0.0358, "step": 3834 }, { "epoch": 2.74, "grad_norm": 4.711556978156083, "learning_rate": 2.3939788576479926e-06, "loss": 0.0421, "step": 3835 }, { "epoch": 2.74, "grad_norm": 22.38928410529173, "learning_rate": 2.39151274700653e-06, "loss": 0.0849, "step": 3836 }, { "epoch": 2.74, "grad_norm": 10.767567534815475, "learning_rate": 2.389047507909143e-06, "loss": 0.0384, "step": 3837 }, { "epoch": 2.74, "grad_norm": 12.836118570888207, "learning_rate": 2.3865831411795137e-06, "loss": 0.0568, "step": 3838 }, { "epoch": 2.74, "grad_norm": 15.689055796114143, "learning_rate": 2.3841196476410337e-06, "loss": 0.0585, "step": 3839 }, { "epoch": 2.74, "grad_norm": 7.494309674312284, "learning_rate": 2.3816570281168016e-06, "loss": 0.0571, "step": 3840 }, { "epoch": 2.74, "grad_norm": 16.871108256802874, "learning_rate": 2.379195283429626e-06, "loss": 0.0378, "step": 3841 }, { "epoch": 2.74, "grad_norm": 9.679241876905133, "learning_rate": 2.3767344144020164e-06, "loss": 0.0595, "step": 3842 }, { "epoch": 2.74, "grad_norm": 2.73030324100972, "learning_rate": 2.374274421856202e-06, "loss": 0.0332, "step": 3843 }, { "epoch": 2.74, "grad_norm": 4.984926898926288, "learning_rate": 2.371815306614104e-06, "loss": 0.0441, "step": 3844 }, { "epoch": 2.74, "grad_norm": 6.959555864531863, "learning_rate": 2.3693570694973673e-06, "loss": 0.0458, "step": 3845 }, { "epoch": 2.75, "grad_norm": 19.08144651468584, "learning_rate": 2.366899711327326e-06, "loss": 0.0403, "step": 3846 }, { "epoch": 2.75, "grad_norm": 7.685927254780913, "learning_rate": 2.3644432329250374e-06, "loss": 0.0418, "step": 3847 }, { "epoch": 2.75, "grad_norm": 3.733683829988181, "learning_rate": 2.3619876351112486e-06, "loss": 0.0475, "step": 3848 }, { "epoch": 2.75, "grad_norm": 7.2819284808845905, "learning_rate": 2.3595329187064282e-06, "loss": 0.041, "step": 3849 }, { "epoch": 2.75, "grad_norm": 8.790602910243651, "learning_rate": 2.3570790845307367e-06, "loss": 0.0455, "step": 3850 }, { "epoch": 2.75, "grad_norm": 5.762808139392179, "learning_rate": 2.3546261334040475e-06, "loss": 0.0621, "step": 3851 }, { "epoch": 2.75, "grad_norm": 18.647824678033906, "learning_rate": 2.352174066145938e-06, "loss": 0.0724, "step": 3852 }, { "epoch": 2.75, "grad_norm": 7.320240282215148, "learning_rate": 2.3497228835756887e-06, "loss": 0.0392, "step": 3853 }, { "epoch": 2.75, "grad_norm": 12.389189567383427, "learning_rate": 2.3472725865122854e-06, "loss": 0.0609, "step": 3854 }, { "epoch": 2.75, "grad_norm": 10.228654260988792, "learning_rate": 2.344823175774418e-06, "loss": 0.0463, "step": 3855 }, { "epoch": 2.75, "grad_norm": 6.773649330102428, "learning_rate": 2.3423746521804796e-06, "loss": 0.0524, "step": 3856 }, { "epoch": 2.75, "grad_norm": 5.435268080793936, "learning_rate": 2.339927016548568e-06, "loss": 0.0579, "step": 3857 }, { "epoch": 2.75, "grad_norm": 7.41772531916488, "learning_rate": 2.3374802696964842e-06, "loss": 0.0454, "step": 3858 }, { "epoch": 2.75, "grad_norm": 10.022754338495929, "learning_rate": 2.3350344124417336e-06, "loss": 0.035, "step": 3859 }, { "epoch": 2.76, "grad_norm": 10.610656647476898, "learning_rate": 2.3325894456015154e-06, "loss": 0.0523, "step": 3860 }, { "epoch": 2.76, "grad_norm": 18.39360281841152, "learning_rate": 2.3301453699927477e-06, "loss": 0.0579, "step": 3861 }, { "epoch": 2.76, "grad_norm": 6.048783311715747, "learning_rate": 2.3277021864320332e-06, "loss": 0.0506, "step": 3862 }, { "epoch": 2.76, "grad_norm": 7.751672566932973, "learning_rate": 2.325259895735693e-06, "loss": 0.0463, "step": 3863 }, { "epoch": 2.76, "grad_norm": 18.50386797364024, "learning_rate": 2.322818498719734e-06, "loss": 0.044, "step": 3864 }, { "epoch": 2.76, "grad_norm": 12.855762559723866, "learning_rate": 2.3203779961998795e-06, "loss": 0.0297, "step": 3865 }, { "epoch": 2.76, "grad_norm": 13.336159487903306, "learning_rate": 2.317938388991541e-06, "loss": 0.0365, "step": 3866 }, { "epoch": 2.76, "grad_norm": 16.833109489814944, "learning_rate": 2.3154996779098405e-06, "loss": 0.058, "step": 3867 }, { "epoch": 2.76, "grad_norm": 3.4464387255304665, "learning_rate": 2.313061863769594e-06, "loss": 0.0407, "step": 3868 }, { "epoch": 2.76, "grad_norm": 5.236811429649089, "learning_rate": 2.310624947385322e-06, "loss": 0.0554, "step": 3869 }, { "epoch": 2.76, "grad_norm": 17.42376851125036, "learning_rate": 2.3081889295712434e-06, "loss": 0.0373, "step": 3870 }, { "epoch": 2.76, "grad_norm": 28.16530722294004, "learning_rate": 2.3057538111412765e-06, "loss": 0.0786, "step": 3871 }, { "epoch": 2.76, "grad_norm": 3.5925463205431445, "learning_rate": 2.3033195929090404e-06, "loss": 0.0395, "step": 3872 }, { "epoch": 2.76, "grad_norm": 9.615138697340234, "learning_rate": 2.300886275687852e-06, "loss": 0.0433, "step": 3873 }, { "epoch": 2.77, "grad_norm": 18.44213000447938, "learning_rate": 2.298453860290728e-06, "loss": 0.0474, "step": 3874 }, { "epoch": 2.77, "grad_norm": 14.67959710316391, "learning_rate": 2.296022347530384e-06, "loss": 0.0471, "step": 3875 }, { "epoch": 2.77, "grad_norm": 10.905366471260221, "learning_rate": 2.293591738219233e-06, "loss": 0.0513, "step": 3876 }, { "epoch": 2.77, "grad_norm": 11.816111298594338, "learning_rate": 2.2911620331693867e-06, "loss": 0.0495, "step": 3877 }, { "epoch": 2.77, "grad_norm": 10.254614678819744, "learning_rate": 2.2887332331926555e-06, "loss": 0.06, "step": 3878 }, { "epoch": 2.77, "grad_norm": 16.736637604289378, "learning_rate": 2.2863053391005462e-06, "loss": 0.0427, "step": 3879 }, { "epoch": 2.77, "grad_norm": 18.759961357816216, "learning_rate": 2.2838783517042628e-06, "loss": 0.0543, "step": 3880 }, { "epoch": 2.77, "grad_norm": 17.80455205698793, "learning_rate": 2.281452271814708e-06, "loss": 0.063, "step": 3881 }, { "epoch": 2.77, "grad_norm": 19.60942208840727, "learning_rate": 2.2790271002424794e-06, "loss": 0.0605, "step": 3882 }, { "epoch": 2.77, "grad_norm": 12.453488405455872, "learning_rate": 2.276602837797872e-06, "loss": 0.0385, "step": 3883 }, { "epoch": 2.77, "grad_norm": 6.709907947321404, "learning_rate": 2.274179485290879e-06, "loss": 0.0788, "step": 3884 }, { "epoch": 2.77, "grad_norm": 20.038024659232907, "learning_rate": 2.271757043531184e-06, "loss": 0.056, "step": 3885 }, { "epoch": 2.77, "grad_norm": 9.050003866061806, "learning_rate": 2.2693355133281706e-06, "loss": 0.0639, "step": 3886 }, { "epoch": 2.77, "grad_norm": 18.58808666026702, "learning_rate": 2.266914895490918e-06, "loss": 0.0564, "step": 3887 }, { "epoch": 2.78, "grad_norm": 27.52059173829719, "learning_rate": 2.2644951908282e-06, "loss": 0.0483, "step": 3888 }, { "epoch": 2.78, "grad_norm": 10.592134606963889, "learning_rate": 2.262076400148484e-06, "loss": 0.0627, "step": 3889 }, { "epoch": 2.78, "grad_norm": 8.496491370108698, "learning_rate": 2.2596585242599333e-06, "loss": 0.0794, "step": 3890 }, { "epoch": 2.78, "grad_norm": 17.493489913800556, "learning_rate": 2.257241563970405e-06, "loss": 0.0555, "step": 3891 }, { "epoch": 2.78, "grad_norm": 13.568365004774797, "learning_rate": 2.254825520087451e-06, "loss": 0.064, "step": 3892 }, { "epoch": 2.78, "grad_norm": 4.508400191683394, "learning_rate": 2.2524103934183154e-06, "loss": 0.0353, "step": 3893 }, { "epoch": 2.78, "grad_norm": 12.32194315514444, "learning_rate": 2.249996184769938e-06, "loss": 0.0521, "step": 3894 }, { "epoch": 2.78, "grad_norm": 18.4710969837349, "learning_rate": 2.2475828949489504e-06, "loss": 0.0694, "step": 3895 }, { "epoch": 2.78, "grad_norm": 10.766594698490506, "learning_rate": 2.2451705247616774e-06, "loss": 0.0493, "step": 3896 }, { "epoch": 2.78, "grad_norm": 9.107591867740075, "learning_rate": 2.2427590750141364e-06, "loss": 0.0403, "step": 3897 }, { "epoch": 2.78, "grad_norm": 10.375009405030395, "learning_rate": 2.240348546512039e-06, "loss": 0.0589, "step": 3898 }, { "epoch": 2.78, "grad_norm": 16.177260041162715, "learning_rate": 2.237938940060786e-06, "loss": 0.0407, "step": 3899 }, { "epoch": 2.78, "grad_norm": 10.180497058246122, "learning_rate": 2.235530256465474e-06, "loss": 0.0477, "step": 3900 }, { "epoch": 2.78, "grad_norm": 5.248521888185715, "learning_rate": 2.233122496530884e-06, "loss": 0.0443, "step": 3901 }, { "epoch": 2.79, "grad_norm": 3.581699527389943, "learning_rate": 2.2307156610615e-06, "loss": 0.0369, "step": 3902 }, { "epoch": 2.79, "grad_norm": 9.5196018314941, "learning_rate": 2.2283097508614837e-06, "loss": 0.0368, "step": 3903 }, { "epoch": 2.79, "grad_norm": 3.2933709812401295, "learning_rate": 2.225904766734702e-06, "loss": 0.0442, "step": 3904 }, { "epoch": 2.79, "grad_norm": 12.630550369303542, "learning_rate": 2.2235007094846963e-06, "loss": 0.0475, "step": 3905 }, { "epoch": 2.79, "grad_norm": 10.408411930140902, "learning_rate": 2.2210975799147143e-06, "loss": 0.0349, "step": 3906 }, { "epoch": 2.79, "grad_norm": 4.291974830052436, "learning_rate": 2.21869537882768e-06, "loss": 0.0429, "step": 3907 }, { "epoch": 2.79, "grad_norm": 7.91993089844047, "learning_rate": 2.21629410702622e-06, "loss": 0.0311, "step": 3908 }, { "epoch": 2.79, "grad_norm": 7.232659660731117, "learning_rate": 2.2138937653126393e-06, "loss": 0.0378, "step": 3909 }, { "epoch": 2.79, "grad_norm": 6.846272738804037, "learning_rate": 2.2114943544889366e-06, "loss": 0.0432, "step": 3910 }, { "epoch": 2.79, "grad_norm": 6.8102817293655065, "learning_rate": 2.2090958753568013e-06, "loss": 0.0545, "step": 3911 }, { "epoch": 2.79, "grad_norm": 15.66677503356628, "learning_rate": 2.206698328717609e-06, "loss": 0.0679, "step": 3912 }, { "epoch": 2.79, "grad_norm": 8.851017334282632, "learning_rate": 2.2043017153724253e-06, "loss": 0.0357, "step": 3913 }, { "epoch": 2.79, "grad_norm": 4.348470099639396, "learning_rate": 2.2019060361220036e-06, "loss": 0.0399, "step": 3914 }, { "epoch": 2.79, "grad_norm": 18.351757414727047, "learning_rate": 2.199511291766783e-06, "loss": 0.0485, "step": 3915 }, { "epoch": 2.8, "grad_norm": 13.228376315595304, "learning_rate": 2.1971174831068944e-06, "loss": 0.0478, "step": 3916 }, { "epoch": 2.8, "grad_norm": 4.989234913705305, "learning_rate": 2.1947246109421514e-06, "loss": 0.0687, "step": 3917 }, { "epoch": 2.8, "grad_norm": 7.578293430391057, "learning_rate": 2.192332676072061e-06, "loss": 0.0698, "step": 3918 }, { "epoch": 2.8, "grad_norm": 19.50726319750263, "learning_rate": 2.189941679295807e-06, "loss": 0.0401, "step": 3919 }, { "epoch": 2.8, "grad_norm": 7.056050545567341, "learning_rate": 2.1875516214122723e-06, "loss": 0.0625, "step": 3920 }, { "epoch": 2.8, "grad_norm": 6.43155113737614, "learning_rate": 2.185162503220013e-06, "loss": 0.0531, "step": 3921 }, { "epoch": 2.8, "grad_norm": 7.53621423016585, "learning_rate": 2.182774325517285e-06, "loss": 0.0531, "step": 3922 }, { "epoch": 2.8, "grad_norm": 5.175804512791806, "learning_rate": 2.180387089102016e-06, "loss": 0.0406, "step": 3923 }, { "epoch": 2.8, "grad_norm": 5.263521350393166, "learning_rate": 2.1780007947718336e-06, "loss": 0.0337, "step": 3924 }, { "epoch": 2.8, "grad_norm": 6.01139176635299, "learning_rate": 2.175615443324035e-06, "loss": 0.0613, "step": 3925 }, { "epoch": 2.8, "grad_norm": 11.818360574772607, "learning_rate": 2.173231035555618e-06, "loss": 0.0646, "step": 3926 }, { "epoch": 2.8, "grad_norm": 3.039048127733819, "learning_rate": 2.170847572263252e-06, "loss": 0.0482, "step": 3927 }, { "epoch": 2.8, "grad_norm": 6.788911571364, "learning_rate": 2.1684650542432985e-06, "loss": 0.0548, "step": 3928 }, { "epoch": 2.8, "grad_norm": 4.333329054284375, "learning_rate": 2.166083482291801e-06, "loss": 0.0405, "step": 3929 }, { "epoch": 2.81, "grad_norm": 7.014460540954454, "learning_rate": 2.1637028572044867e-06, "loss": 0.0438, "step": 3930 }, { "epoch": 2.81, "grad_norm": 5.847965777925317, "learning_rate": 2.1613231797767668e-06, "loss": 0.066, "step": 3931 }, { "epoch": 2.81, "grad_norm": 8.636014633570216, "learning_rate": 2.158944450803736e-06, "loss": 0.0629, "step": 3932 }, { "epoch": 2.81, "grad_norm": 29.011178688159877, "learning_rate": 2.1565666710801714e-06, "loss": 0.0831, "step": 3933 }, { "epoch": 2.81, "grad_norm": 4.645775512919015, "learning_rate": 2.1541898414005343e-06, "loss": 0.0553, "step": 3934 }, { "epoch": 2.81, "grad_norm": 4.5723835589756465, "learning_rate": 2.1518139625589663e-06, "loss": 0.0328, "step": 3935 }, { "epoch": 2.81, "grad_norm": 9.046555105138385, "learning_rate": 2.1494390353492935e-06, "loss": 0.0498, "step": 3936 }, { "epoch": 2.81, "grad_norm": 5.551419544385709, "learning_rate": 2.1470650605650235e-06, "loss": 0.0357, "step": 3937 }, { "epoch": 2.81, "grad_norm": 5.766470083781369, "learning_rate": 2.144692038999345e-06, "loss": 0.0416, "step": 3938 }, { "epoch": 2.81, "grad_norm": 11.936429313135362, "learning_rate": 2.142319971445129e-06, "loss": 0.0496, "step": 3939 }, { "epoch": 2.81, "grad_norm": 4.825676884200282, "learning_rate": 2.139948858694926e-06, "loss": 0.0468, "step": 3940 }, { "epoch": 2.81, "grad_norm": 4.708123288700695, "learning_rate": 2.137578701540971e-06, "loss": 0.0472, "step": 3941 }, { "epoch": 2.81, "grad_norm": 3.6296186806200073, "learning_rate": 2.1352095007751754e-06, "loss": 0.045, "step": 3942 }, { "epoch": 2.81, "grad_norm": 5.596968703398852, "learning_rate": 2.132841257189137e-06, "loss": 0.0561, "step": 3943 }, { "epoch": 2.82, "grad_norm": 3.7089513216445256, "learning_rate": 2.1304739715741235e-06, "loss": 0.0627, "step": 3944 }, { "epoch": 2.82, "grad_norm": 3.45323572738163, "learning_rate": 2.128107644721096e-06, "loss": 0.0333, "step": 3945 }, { "epoch": 2.82, "grad_norm": 5.788593095757397, "learning_rate": 2.1257422774206816e-06, "loss": 0.045, "step": 3946 }, { "epoch": 2.82, "grad_norm": 9.321850934523075, "learning_rate": 2.1233778704632002e-06, "loss": 0.0383, "step": 3947 }, { "epoch": 2.82, "grad_norm": 10.186691315669988, "learning_rate": 2.1210144246386378e-06, "loss": 0.0892, "step": 3948 }, { "epoch": 2.82, "grad_norm": 8.396952907923813, "learning_rate": 2.1186519407366725e-06, "loss": 0.0443, "step": 3949 }, { "epoch": 2.82, "grad_norm": 2.555619165023534, "learning_rate": 2.1162904195466455e-06, "loss": 0.0361, "step": 3950 }, { "epoch": 2.82, "grad_norm": 8.059103832940904, "learning_rate": 2.113929861857594e-06, "loss": 0.0584, "step": 3951 }, { "epoch": 2.82, "grad_norm": 5.033265961432398, "learning_rate": 2.1115702684582177e-06, "loss": 0.0424, "step": 3952 }, { "epoch": 2.82, "grad_norm": 4.060885661594244, "learning_rate": 2.1092116401369033e-06, "loss": 0.0406, "step": 3953 }, { "epoch": 2.82, "grad_norm": 2.7540242910146486, "learning_rate": 2.1068539776817115e-06, "loss": 0.0282, "step": 3954 }, { "epoch": 2.82, "grad_norm": 4.013085063098618, "learning_rate": 2.1044972818803816e-06, "loss": 0.0482, "step": 3955 }, { "epoch": 2.82, "grad_norm": 11.249338957476022, "learning_rate": 2.1021415535203294e-06, "loss": 0.0404, "step": 3956 }, { "epoch": 2.82, "grad_norm": 11.67130080350898, "learning_rate": 2.0997867933886467e-06, "loss": 0.0558, "step": 3957 }, { "epoch": 2.83, "grad_norm": 11.149614862400087, "learning_rate": 2.0974330022721044e-06, "loss": 0.0512, "step": 3958 }, { "epoch": 2.83, "grad_norm": 3.687014650829844, "learning_rate": 2.0950801809571466e-06, "loss": 0.0612, "step": 3959 }, { "epoch": 2.83, "grad_norm": 4.395001654489367, "learning_rate": 2.0927283302298944e-06, "loss": 0.0432, "step": 3960 }, { "epoch": 2.83, "grad_norm": 5.284072609548135, "learning_rate": 2.0903774508761477e-06, "loss": 0.0447, "step": 3961 }, { "epoch": 2.83, "grad_norm": 3.343071674546592, "learning_rate": 2.0880275436813726e-06, "loss": 0.0218, "step": 3962 }, { "epoch": 2.83, "grad_norm": 6.360795545549215, "learning_rate": 2.0856786094307247e-06, "loss": 0.053, "step": 3963 }, { "epoch": 2.83, "grad_norm": 6.794862698130726, "learning_rate": 2.0833306489090186e-06, "loss": 0.0465, "step": 3964 }, { "epoch": 2.83, "grad_norm": 5.79082889481297, "learning_rate": 2.08098366290076e-06, "loss": 0.0308, "step": 3965 }, { "epoch": 2.83, "grad_norm": 10.3999370599744, "learning_rate": 2.078637652190112e-06, "loss": 0.0356, "step": 3966 }, { "epoch": 2.83, "grad_norm": 5.198526561829731, "learning_rate": 2.0762926175609287e-06, "loss": 0.045, "step": 3967 }, { "epoch": 2.83, "grad_norm": 7.195220899911224, "learning_rate": 2.0739485597967237e-06, "loss": 0.064, "step": 3968 }, { "epoch": 2.83, "grad_norm": 5.666052330346906, "learning_rate": 2.0716054796806916e-06, "loss": 0.0467, "step": 3969 }, { "epoch": 2.83, "grad_norm": 4.662416911777877, "learning_rate": 2.0692633779956998e-06, "loss": 0.0317, "step": 3970 }, { "epoch": 2.83, "grad_norm": 8.017085740979171, "learning_rate": 2.0669222555242884e-06, "loss": 0.0516, "step": 3971 }, { "epoch": 2.84, "grad_norm": 7.265789210058505, "learning_rate": 2.064582113048669e-06, "loss": 0.0457, "step": 3972 }, { "epoch": 2.84, "grad_norm": 8.795311756133641, "learning_rate": 2.0622429513507275e-06, "loss": 0.0427, "step": 3973 }, { "epoch": 2.84, "grad_norm": 6.9365340050039865, "learning_rate": 2.05990477121202e-06, "loss": 0.063, "step": 3974 }, { "epoch": 2.84, "grad_norm": 3.914484245239362, "learning_rate": 2.0575675734137773e-06, "loss": 0.0345, "step": 3975 }, { "epoch": 2.84, "grad_norm": 5.314654954698931, "learning_rate": 2.0552313587369003e-06, "loss": 0.0421, "step": 3976 }, { "epoch": 2.84, "grad_norm": 3.3533779142183926, "learning_rate": 2.052896127961963e-06, "loss": 0.0424, "step": 3977 }, { "epoch": 2.84, "grad_norm": 19.476559355360912, "learning_rate": 2.050561881869205e-06, "loss": 0.0509, "step": 3978 }, { "epoch": 2.84, "grad_norm": 6.938628294930286, "learning_rate": 2.048228621238547e-06, "loss": 0.0466, "step": 3979 }, { "epoch": 2.84, "grad_norm": 10.94306529980738, "learning_rate": 2.0458963468495692e-06, "loss": 0.0442, "step": 3980 }, { "epoch": 2.84, "grad_norm": 4.791706216599774, "learning_rate": 2.0435650594815338e-06, "loss": 0.0298, "step": 3981 }, { "epoch": 2.84, "grad_norm": 20.151160656020064, "learning_rate": 2.0412347599133607e-06, "loss": 0.0598, "step": 3982 }, { "epoch": 2.84, "grad_norm": 12.255216096956687, "learning_rate": 2.0389054489236534e-06, "loss": 0.0381, "step": 3983 }, { "epoch": 2.84, "grad_norm": 4.855045793700611, "learning_rate": 2.03657712729067e-06, "loss": 0.0604, "step": 3984 }, { "epoch": 2.84, "grad_norm": 8.182350191418786, "learning_rate": 2.034249795792355e-06, "loss": 0.0321, "step": 3985 }, { "epoch": 2.85, "grad_norm": 7.707765374797664, "learning_rate": 2.031923455206306e-06, "loss": 0.0521, "step": 3986 }, { "epoch": 2.85, "grad_norm": 6.104221297033886, "learning_rate": 2.0295981063098e-06, "loss": 0.0394, "step": 3987 }, { "epoch": 2.85, "grad_norm": 11.93087451550472, "learning_rate": 2.027273749879777e-06, "loss": 0.0406, "step": 3988 }, { "epoch": 2.85, "grad_norm": 5.2406447108361265, "learning_rate": 2.02495038669285e-06, "loss": 0.0346, "step": 3989 }, { "epoch": 2.85, "grad_norm": 13.604224730402509, "learning_rate": 2.0226280175252966e-06, "loss": 0.0383, "step": 3990 }, { "epoch": 2.85, "grad_norm": 2.555103971529308, "learning_rate": 2.020306643153063e-06, "loss": 0.0337, "step": 3991 }, { "epoch": 2.85, "grad_norm": 6.369185477762121, "learning_rate": 2.0179862643517657e-06, "loss": 0.0542, "step": 3992 }, { "epoch": 2.85, "grad_norm": 3.4873107821184797, "learning_rate": 2.015666881896684e-06, "loss": 0.0335, "step": 3993 }, { "epoch": 2.85, "grad_norm": 6.806890531404817, "learning_rate": 2.0133484965627683e-06, "loss": 0.0486, "step": 3994 }, { "epoch": 2.85, "grad_norm": 11.163092400988585, "learning_rate": 2.0110311091246333e-06, "loss": 0.0443, "step": 3995 }, { "epoch": 2.85, "grad_norm": 8.775320024586614, "learning_rate": 2.0087147203565614e-06, "loss": 0.0558, "step": 3996 }, { "epoch": 2.85, "grad_norm": 4.638711681258063, "learning_rate": 2.0063993310325013e-06, "loss": 0.0591, "step": 3997 }, { "epoch": 2.85, "grad_norm": 11.201379197569311, "learning_rate": 2.0040849419260682e-06, "loss": 0.0336, "step": 3998 }, { "epoch": 2.85, "grad_norm": 8.662046945406965, "learning_rate": 2.0017715538105416e-06, "loss": 0.05, "step": 3999 }, { "epoch": 2.86, "grad_norm": 22.627187773041392, "learning_rate": 1.9994591674588677e-06, "loss": 0.0756, "step": 4000 }, { "epoch": 2.86, "eval_avg_AUC": 0.8313536271084199, "eval_avg_Accuracy": 0.7335875331564987, "eval_avg_Accuracy-right": 0.8922655536715794, "eval_avg_Accuracy-wrong": 0.4569024334773709, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.7065878301129395, "eval_last_AUC": 0.8360746856019163, "eval_last_Accuracy": 0.7791777188328912, "eval_last_Accuracy-right": 0.8478544411112561, "eval_last_Accuracy-wrong": 0.6594268819649761, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.7097758926367477, "eval_max_AUC": 0.779076192285081, "eval_max_Accuracy": 0.6495358090185677, "eval_max_Accuracy-right": 0.9808921351245597, "eval_max_Accuracy-wrong": 0.07175346827382306, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.6517312365875962, "eval_min_AUC": 0.8398533477848513, "eval_min_Accuracy": 0.7703498010610079, "eval_min_Accuracy-right": 0.7896178427024912, "eval_min_Accuracy-wrong": 0.7367523311348647, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.7123435959578684, "eval_prod_AUC": 0.834872802991024, "eval_prod_Accuracy": 0.7446949602122016, "eval_prod_Accuracy-right": 0.6746445806704057, "eval_prod_Accuracy-wrong": 0.8668410279736184, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.707421683014454, "eval_runtime": 248.5729, "eval_samples_per_second": 97.066, "eval_steps_per_second": 3.033, "eval_sum_AUC": 0.7054043711817337, "eval_sum_Accuracy": 0.6395059681697612, "eval_sum_Accuracy-right": 0.9969349158732229, "eval_sum_Accuracy-wrong": 0.016261087104844214, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.682774019703003, "step": 4000 }, { "epoch": 2.86, "grad_norm": 11.448506614643406, "learning_rate": 1.9971477836436575e-06, "loss": 0.0907, "step": 4001 }, { "epoch": 2.86, "grad_norm": 17.106042357096115, "learning_rate": 1.99483740313719e-06, "loss": 0.046, "step": 4002 }, { "epoch": 2.86, "grad_norm": 15.306913920279596, "learning_rate": 1.9925280267114e-06, "loss": 0.0353, "step": 4003 }, { "epoch": 2.86, "grad_norm": 4.772152424590818, "learning_rate": 1.9902196551379006e-06, "loss": 0.0455, "step": 4004 }, { "epoch": 2.86, "grad_norm": 15.68768042844194, "learning_rate": 1.987912289187954e-06, "loss": 0.0629, "step": 4005 }, { "epoch": 2.86, "grad_norm": 8.371248909494858, "learning_rate": 1.9856059296325027e-06, "loss": 0.0479, "step": 4006 }, { "epoch": 2.86, "grad_norm": 6.182577891454594, "learning_rate": 1.9833005772421354e-06, "loss": 0.0281, "step": 4007 }, { "epoch": 2.86, "grad_norm": 10.517434691180428, "learning_rate": 1.980996232787121e-06, "loss": 0.044, "step": 4008 }, { "epoch": 2.86, "grad_norm": 12.669315091660465, "learning_rate": 1.978692897037377e-06, "loss": 0.0345, "step": 4009 }, { "epoch": 2.86, "grad_norm": 5.040911189426195, "learning_rate": 1.9763905707624975e-06, "loss": 0.0225, "step": 4010 }, { "epoch": 2.86, "grad_norm": 6.772416865182909, "learning_rate": 1.974089254731727e-06, "loss": 0.0538, "step": 4011 }, { "epoch": 2.86, "grad_norm": 6.111698799497366, "learning_rate": 1.97178894971398e-06, "loss": 0.0297, "step": 4012 }, { "epoch": 2.86, "grad_norm": 5.376778629697909, "learning_rate": 1.9694896564778317e-06, "loss": 0.0508, "step": 4013 }, { "epoch": 2.87, "grad_norm": 12.684429026836513, "learning_rate": 1.9671913757915173e-06, "loss": 0.0394, "step": 4014 }, { "epoch": 2.87, "grad_norm": 10.673329195030192, "learning_rate": 1.964894108422936e-06, "loss": 0.0573, "step": 4015 }, { "epoch": 2.87, "grad_norm": 9.962999370202702, "learning_rate": 1.962597855139648e-06, "loss": 0.0633, "step": 4016 }, { "epoch": 2.87, "grad_norm": 5.930744399839919, "learning_rate": 1.960302616708873e-06, "loss": 0.0561, "step": 4017 }, { "epoch": 2.87, "grad_norm": 2.99617559332668, "learning_rate": 1.9580083938974937e-06, "loss": 0.0344, "step": 4018 }, { "epoch": 2.87, "grad_norm": 6.613066828727661, "learning_rate": 1.9557151874720526e-06, "loss": 0.0646, "step": 4019 }, { "epoch": 2.87, "grad_norm": 14.130923755778449, "learning_rate": 1.953422998198754e-06, "loss": 0.0476, "step": 4020 }, { "epoch": 2.87, "grad_norm": 5.093142217656671, "learning_rate": 1.9511318268434554e-06, "loss": 0.0422, "step": 4021 }, { "epoch": 2.87, "grad_norm": 5.700589187453384, "learning_rate": 1.9488416741716877e-06, "loss": 0.0416, "step": 4022 }, { "epoch": 2.87, "grad_norm": 11.290562714679753, "learning_rate": 1.946552540948625e-06, "loss": 0.0374, "step": 4023 }, { "epoch": 2.87, "grad_norm": 11.076994659685992, "learning_rate": 1.944264427939118e-06, "loss": 0.064, "step": 4024 }, { "epoch": 2.87, "grad_norm": 5.073760434659358, "learning_rate": 1.941977335907659e-06, "loss": 0.034, "step": 4025 }, { "epoch": 2.87, "grad_norm": 12.555237380180236, "learning_rate": 1.939691265618417e-06, "loss": 0.0356, "step": 4026 }, { "epoch": 2.87, "grad_norm": 13.260024757677835, "learning_rate": 1.9374062178352036e-06, "loss": 0.043, "step": 4027 }, { "epoch": 2.88, "grad_norm": 8.049256823016284, "learning_rate": 1.935122193321499e-06, "loss": 0.0456, "step": 4028 }, { "epoch": 2.88, "grad_norm": 4.427445039334935, "learning_rate": 1.932839192840436e-06, "loss": 0.0403, "step": 4029 }, { "epoch": 2.88, "grad_norm": 11.153086971868019, "learning_rate": 1.930557217154809e-06, "loss": 0.0361, "step": 4030 }, { "epoch": 2.88, "grad_norm": 6.049221219619234, "learning_rate": 1.9282762670270693e-06, "loss": 0.0492, "step": 4031 }, { "epoch": 2.88, "grad_norm": 7.07032061230794, "learning_rate": 1.925996343219323e-06, "loss": 0.0557, "step": 4032 }, { "epoch": 2.88, "grad_norm": 11.430527196674731, "learning_rate": 1.923717446493336e-06, "loss": 0.054, "step": 4033 }, { "epoch": 2.88, "grad_norm": 16.819939441212135, "learning_rate": 1.9214395776105297e-06, "loss": 0.0526, "step": 4034 }, { "epoch": 2.88, "grad_norm": 12.959453782895224, "learning_rate": 1.919162737331983e-06, "loss": 0.047, "step": 4035 }, { "epoch": 2.88, "grad_norm": 4.702665094500624, "learning_rate": 1.9168869264184296e-06, "loss": 0.0423, "step": 4036 }, { "epoch": 2.88, "grad_norm": 8.969495139042449, "learning_rate": 1.9146121456302613e-06, "loss": 0.038, "step": 4037 }, { "epoch": 2.88, "grad_norm": 5.982997771608221, "learning_rate": 1.9123383957275237e-06, "loss": 0.0415, "step": 4038 }, { "epoch": 2.88, "grad_norm": 21.08738392970205, "learning_rate": 1.91006567746992e-06, "loss": 0.0711, "step": 4039 }, { "epoch": 2.88, "grad_norm": 6.035083987282505, "learning_rate": 1.907793991616806e-06, "loss": 0.0494, "step": 4040 }, { "epoch": 2.88, "grad_norm": 8.001816643118843, "learning_rate": 1.9055233389271955e-06, "loss": 0.0515, "step": 4041 }, { "epoch": 2.89, "grad_norm": 5.808783157897872, "learning_rate": 1.9032537201597556e-06, "loss": 0.0374, "step": 4042 }, { "epoch": 2.89, "grad_norm": 7.463408759803654, "learning_rate": 1.9009851360728077e-06, "loss": 0.0522, "step": 4043 }, { "epoch": 2.89, "grad_norm": 7.169003885468974, "learning_rate": 1.898717587424328e-06, "loss": 0.0566, "step": 4044 }, { "epoch": 2.89, "grad_norm": 6.208200904132249, "learning_rate": 1.8964510749719484e-06, "loss": 0.0567, "step": 4045 }, { "epoch": 2.89, "grad_norm": 16.35211171549844, "learning_rate": 1.8941855994729497e-06, "loss": 0.0486, "step": 4046 }, { "epoch": 2.89, "grad_norm": 5.281197291589123, "learning_rate": 1.8919211616842703e-06, "loss": 0.0331, "step": 4047 }, { "epoch": 2.89, "grad_norm": 4.747918718572736, "learning_rate": 1.8896577623625017e-06, "loss": 0.037, "step": 4048 }, { "epoch": 2.89, "grad_norm": 12.046638045605734, "learning_rate": 1.887395402263888e-06, "loss": 0.0693, "step": 4049 }, { "epoch": 2.89, "grad_norm": 27.115553942502387, "learning_rate": 1.8851340821443248e-06, "loss": 0.0374, "step": 4050 }, { "epoch": 2.89, "grad_norm": 18.92193070578465, "learning_rate": 1.882873802759362e-06, "loss": 0.0794, "step": 4051 }, { "epoch": 2.89, "grad_norm": 4.295273630927773, "learning_rate": 1.8806145648642005e-06, "loss": 0.04, "step": 4052 }, { "epoch": 2.89, "grad_norm": 3.412350246216868, "learning_rate": 1.8783563692136936e-06, "loss": 0.0385, "step": 4053 }, { "epoch": 2.89, "grad_norm": 17.67225398516106, "learning_rate": 1.8760992165623465e-06, "loss": 0.0724, "step": 4054 }, { "epoch": 2.89, "grad_norm": 8.36568199802075, "learning_rate": 1.873843107664316e-06, "loss": 0.0547, "step": 4055 }, { "epoch": 2.9, "grad_norm": 9.290828968139262, "learning_rate": 1.87158804327341e-06, "loss": 0.0535, "step": 4056 }, { "epoch": 2.9, "grad_norm": 2.802306481599526, "learning_rate": 1.8693340241430874e-06, "loss": 0.0308, "step": 4057 }, { "epoch": 2.9, "grad_norm": 10.43756004493309, "learning_rate": 1.867081051026458e-06, "loss": 0.057, "step": 4058 }, { "epoch": 2.9, "grad_norm": 6.681088881904337, "learning_rate": 1.8648291246762818e-06, "loss": 0.0321, "step": 4059 }, { "epoch": 2.9, "grad_norm": 8.252176413766616, "learning_rate": 1.8625782458449693e-06, "loss": 0.0511, "step": 4060 }, { "epoch": 2.9, "grad_norm": 8.406152709881708, "learning_rate": 1.860328415284583e-06, "loss": 0.0521, "step": 4061 }, { "epoch": 2.9, "grad_norm": 2.896995014403766, "learning_rate": 1.8580796337468276e-06, "loss": 0.0422, "step": 4062 }, { "epoch": 2.9, "grad_norm": 3.8791001120200477, "learning_rate": 1.8558319019830695e-06, "loss": 0.0373, "step": 4063 }, { "epoch": 2.9, "grad_norm": 5.53865330305396, "learning_rate": 1.853585220744311e-06, "loss": 0.0411, "step": 4064 }, { "epoch": 2.9, "grad_norm": 12.29757774570903, "learning_rate": 1.851339590781217e-06, "loss": 0.0444, "step": 4065 }, { "epoch": 2.9, "grad_norm": 17.39424270516312, "learning_rate": 1.8490950128440877e-06, "loss": 0.0834, "step": 4066 }, { "epoch": 2.9, "grad_norm": 14.625296601227284, "learning_rate": 1.8468514876828847e-06, "loss": 0.0585, "step": 4067 }, { "epoch": 2.9, "grad_norm": 18.641308415923643, "learning_rate": 1.844609016047204e-06, "loss": 0.0481, "step": 4068 }, { "epoch": 2.9, "grad_norm": 9.87551268358642, "learning_rate": 1.8423675986863054e-06, "loss": 0.0361, "step": 4069 }, { "epoch": 2.91, "grad_norm": 3.7267052490862635, "learning_rate": 1.8401272363490818e-06, "loss": 0.0443, "step": 4070 }, { "epoch": 2.91, "grad_norm": 10.562034474063116, "learning_rate": 1.8378879297840818e-06, "loss": 0.0477, "step": 4071 }, { "epoch": 2.91, "grad_norm": 5.802811004992916, "learning_rate": 1.8356496797395002e-06, "loss": 0.0434, "step": 4072 }, { "epoch": 2.91, "grad_norm": 6.436346892902528, "learning_rate": 1.8334124869631765e-06, "loss": 0.0345, "step": 4073 }, { "epoch": 2.91, "grad_norm": 5.010512163189531, "learning_rate": 1.8311763522025994e-06, "loss": 0.0364, "step": 4074 }, { "epoch": 2.91, "grad_norm": 7.935289328219277, "learning_rate": 1.828941276204903e-06, "loss": 0.0379, "step": 4075 }, { "epoch": 2.91, "grad_norm": 5.923070187953769, "learning_rate": 1.8267072597168673e-06, "loss": 0.0304, "step": 4076 }, { "epoch": 2.91, "grad_norm": 3.8603791640597245, "learning_rate": 1.8244743034849193e-06, "loss": 0.0446, "step": 4077 }, { "epoch": 2.91, "grad_norm": 3.915093269488006, "learning_rate": 1.8222424082551303e-06, "loss": 0.0691, "step": 4078 }, { "epoch": 2.91, "grad_norm": 5.595291860210755, "learning_rate": 1.820011574773221e-06, "loss": 0.0426, "step": 4079 }, { "epoch": 2.91, "grad_norm": 6.802755351874992, "learning_rate": 1.8177818037845485e-06, "loss": 0.0476, "step": 4080 }, { "epoch": 2.91, "grad_norm": 3.6813348946660613, "learning_rate": 1.8155530960341273e-06, "loss": 0.0359, "step": 4081 }, { "epoch": 2.91, "grad_norm": 18.71702922304564, "learning_rate": 1.8133254522666033e-06, "loss": 0.0569, "step": 4082 }, { "epoch": 2.91, "grad_norm": 7.6921438088763985, "learning_rate": 1.8110988732262808e-06, "loss": 0.0419, "step": 4083 }, { "epoch": 2.92, "grad_norm": 10.044378122355903, "learning_rate": 1.8088733596570945e-06, "loss": 0.0382, "step": 4084 }, { "epoch": 2.92, "grad_norm": 6.114486237395475, "learning_rate": 1.806648912302636e-06, "loss": 0.061, "step": 4085 }, { "epoch": 2.92, "grad_norm": 3.3399112721858772, "learning_rate": 1.8044255319061287e-06, "loss": 0.0387, "step": 4086 }, { "epoch": 2.92, "grad_norm": 12.412698383313344, "learning_rate": 1.8022032192104517e-06, "loss": 0.0405, "step": 4087 }, { "epoch": 2.92, "grad_norm": 30.82021867460914, "learning_rate": 1.7999819749581154e-06, "loss": 0.0749, "step": 4088 }, { "epoch": 2.92, "grad_norm": 11.480240630642303, "learning_rate": 1.797761799891281e-06, "loss": 0.0581, "step": 4089 }, { "epoch": 2.92, "grad_norm": 7.582474162510632, "learning_rate": 1.7955426947517507e-06, "loss": 0.059, "step": 4090 }, { "epoch": 2.92, "grad_norm": 10.63345521267209, "learning_rate": 1.793324660280968e-06, "loss": 0.0526, "step": 4091 }, { "epoch": 2.92, "grad_norm": 8.103239978569194, "learning_rate": 1.7911076972200193e-06, "loss": 0.0411, "step": 4092 }, { "epoch": 2.92, "grad_norm": 11.925157615703743, "learning_rate": 1.7888918063096334e-06, "loss": 0.0408, "step": 4093 }, { "epoch": 2.92, "grad_norm": 14.374780754417847, "learning_rate": 1.7866769882901814e-06, "loss": 0.0412, "step": 4094 }, { "epoch": 2.92, "grad_norm": 6.109178998467284, "learning_rate": 1.784463243901674e-06, "loss": 0.0528, "step": 4095 }, { "epoch": 2.92, "grad_norm": 9.609524368121408, "learning_rate": 1.7822505738837648e-06, "loss": 0.0651, "step": 4096 }, { "epoch": 2.92, "grad_norm": 5.040866179019186, "learning_rate": 1.7800389789757483e-06, "loss": 0.0445, "step": 4097 }, { "epoch": 2.93, "grad_norm": 5.714985544615724, "learning_rate": 1.7778284599165597e-06, "loss": 0.0487, "step": 4098 }, { "epoch": 2.93, "grad_norm": 7.619378628354265, "learning_rate": 1.7756190174447734e-06, "loss": 0.0436, "step": 4099 }, { "epoch": 2.93, "grad_norm": 3.2474864738073133, "learning_rate": 1.7734106522986061e-06, "loss": 0.0462, "step": 4100 }, { "epoch": 2.93, "grad_norm": 4.3973092959361875, "learning_rate": 1.7712033652159133e-06, "loss": 0.0531, "step": 4101 }, { "epoch": 2.93, "grad_norm": 7.4959152699641765, "learning_rate": 1.7689971569341907e-06, "loss": 0.0576, "step": 4102 }, { "epoch": 2.93, "grad_norm": 3.749459966128013, "learning_rate": 1.7667920281905738e-06, "loss": 0.0277, "step": 4103 }, { "epoch": 2.93, "grad_norm": 7.533700501363081, "learning_rate": 1.764587979721838e-06, "loss": 0.0647, "step": 4104 }, { "epoch": 2.93, "grad_norm": 10.29526487200459, "learning_rate": 1.7623850122643926e-06, "loss": 0.0401, "step": 4105 }, { "epoch": 2.93, "grad_norm": 12.913824847457555, "learning_rate": 1.7601831265542968e-06, "loss": 0.0405, "step": 4106 }, { "epoch": 2.93, "grad_norm": 6.04341070181785, "learning_rate": 1.7579823233272337e-06, "loss": 0.0359, "step": 4107 }, { "epoch": 2.93, "grad_norm": 19.592526482210516, "learning_rate": 1.7557826033185404e-06, "loss": 0.054, "step": 4108 }, { "epoch": 2.93, "grad_norm": 5.540253325530484, "learning_rate": 1.7535839672631772e-06, "loss": 0.0508, "step": 4109 }, { "epoch": 2.93, "grad_norm": 7.452509099451598, "learning_rate": 1.7513864158957556e-06, "loss": 0.0364, "step": 4110 }, { "epoch": 2.93, "grad_norm": 20.761584265848015, "learning_rate": 1.7491899499505122e-06, "loss": 0.0438, "step": 4111 }, { "epoch": 2.94, "grad_norm": 13.58680016006121, "learning_rate": 1.746994570161334e-06, "loss": 0.0384, "step": 4112 }, { "epoch": 2.94, "grad_norm": 4.603506950318464, "learning_rate": 1.7448002772617324e-06, "loss": 0.0438, "step": 4113 }, { "epoch": 2.94, "grad_norm": 3.8154557106353284, "learning_rate": 1.7426070719848632e-06, "loss": 0.0257, "step": 4114 }, { "epoch": 2.94, "grad_norm": 5.747171185189265, "learning_rate": 1.7404149550635173e-06, "loss": 0.0524, "step": 4115 }, { "epoch": 2.94, "grad_norm": 7.9740227323975335, "learning_rate": 1.7382239272301221e-06, "loss": 0.077, "step": 4116 }, { "epoch": 2.94, "grad_norm": 10.989166783225153, "learning_rate": 1.7360339892167404e-06, "loss": 0.0374, "step": 4117 }, { "epoch": 2.94, "grad_norm": 6.7143857340659165, "learning_rate": 1.7338451417550712e-06, "loss": 0.0756, "step": 4118 }, { "epoch": 2.94, "grad_norm": 12.19837267726741, "learning_rate": 1.7316573855764485e-06, "loss": 0.092, "step": 4119 }, { "epoch": 2.94, "grad_norm": 7.153980131842924, "learning_rate": 1.7294707214118434e-06, "loss": 0.0359, "step": 4120 }, { "epoch": 2.94, "grad_norm": 7.588739529742861, "learning_rate": 1.7272851499918603e-06, "loss": 0.0444, "step": 4121 }, { "epoch": 2.94, "grad_norm": 4.8112730950323765, "learning_rate": 1.725100672046741e-06, "loss": 0.0451, "step": 4122 }, { "epoch": 2.94, "grad_norm": 7.260957042480349, "learning_rate": 1.7229172883063556e-06, "loss": 0.0417, "step": 4123 }, { "epoch": 2.94, "grad_norm": 7.1770527506341235, "learning_rate": 1.7207349995002192e-06, "loss": 0.0321, "step": 4124 }, { "epoch": 2.94, "grad_norm": 8.493330479432425, "learning_rate": 1.7185538063574692e-06, "loss": 0.0701, "step": 4125 }, { "epoch": 2.95, "grad_norm": 18.78832049114435, "learning_rate": 1.7163737096068883e-06, "loss": 0.0322, "step": 4126 }, { "epoch": 2.95, "grad_norm": 3.9921302247739954, "learning_rate": 1.7141947099768818e-06, "loss": 0.0453, "step": 4127 }, { "epoch": 2.95, "grad_norm": 10.454142818755402, "learning_rate": 1.7120168081955001e-06, "loss": 0.0321, "step": 4128 }, { "epoch": 2.95, "grad_norm": 7.925392060998192, "learning_rate": 1.7098400049904163e-06, "loss": 0.0514, "step": 4129 }, { "epoch": 2.95, "grad_norm": 6.926450108002052, "learning_rate": 1.707664301088941e-06, "loss": 0.0338, "step": 4130 }, { "epoch": 2.95, "grad_norm": 3.1024288350216556, "learning_rate": 1.705489697218019e-06, "loss": 0.0466, "step": 4131 }, { "epoch": 2.95, "grad_norm": 9.17160382357441, "learning_rate": 1.7033161941042248e-06, "loss": 0.0503, "step": 4132 }, { "epoch": 2.95, "grad_norm": 9.016353852398503, "learning_rate": 1.7011437924737666e-06, "loss": 0.0489, "step": 4133 }, { "epoch": 2.95, "grad_norm": 9.854573725493205, "learning_rate": 1.6989724930524843e-06, "loss": 0.0579, "step": 4134 }, { "epoch": 2.95, "grad_norm": 6.671723566772397, "learning_rate": 1.6968022965658492e-06, "loss": 0.0335, "step": 4135 }, { "epoch": 2.95, "grad_norm": 2.720192606101922, "learning_rate": 1.694633203738964e-06, "loss": 0.0402, "step": 4136 }, { "epoch": 2.95, "grad_norm": 5.995599861484541, "learning_rate": 1.6924652152965632e-06, "loss": 0.0549, "step": 4137 }, { "epoch": 2.95, "grad_norm": 12.266299701613004, "learning_rate": 1.690298331963014e-06, "loss": 0.0572, "step": 4138 }, { "epoch": 2.95, "grad_norm": 3.4813446495338654, "learning_rate": 1.6881325544623067e-06, "loss": 0.0452, "step": 4139 }, { "epoch": 2.96, "grad_norm": 5.891263821479523, "learning_rate": 1.6859678835180749e-06, "loss": 0.0515, "step": 4140 }, { "epoch": 2.96, "grad_norm": 3.6553475031782474, "learning_rate": 1.6838043198535693e-06, "loss": 0.0418, "step": 4141 }, { "epoch": 2.96, "grad_norm": 5.408411863091127, "learning_rate": 1.681641864191682e-06, "loss": 0.0557, "step": 4142 }, { "epoch": 2.96, "grad_norm": 5.8114627598252895, "learning_rate": 1.6794805172549244e-06, "loss": 0.0398, "step": 4143 }, { "epoch": 2.96, "grad_norm": 7.53548148027711, "learning_rate": 1.6773202797654486e-06, "loss": 0.061, "step": 4144 }, { "epoch": 2.96, "grad_norm": 5.913855035387853, "learning_rate": 1.6751611524450235e-06, "loss": 0.0335, "step": 4145 }, { "epoch": 2.96, "grad_norm": 2.024414954794077, "learning_rate": 1.6730031360150605e-06, "loss": 0.0341, "step": 4146 }, { "epoch": 2.96, "grad_norm": 9.593789286345464, "learning_rate": 1.670846231196588e-06, "loss": 0.0404, "step": 4147 }, { "epoch": 2.96, "grad_norm": 4.308719832906545, "learning_rate": 1.6686904387102692e-06, "loss": 0.0405, "step": 4148 }, { "epoch": 2.96, "grad_norm": 4.491244505461696, "learning_rate": 1.6665357592763948e-06, "loss": 0.0345, "step": 4149 }, { "epoch": 2.96, "grad_norm": 14.320074989143352, "learning_rate": 1.6643821936148834e-06, "loss": 0.0335, "step": 4150 }, { "epoch": 2.96, "grad_norm": 4.632814125620873, "learning_rate": 1.6622297424452817e-06, "loss": 0.0333, "step": 4151 }, { "epoch": 2.96, "grad_norm": 8.547410451451574, "learning_rate": 1.6600784064867625e-06, "loss": 0.049, "step": 4152 }, { "epoch": 2.96, "grad_norm": 11.406023385749469, "learning_rate": 1.6579281864581275e-06, "loss": 0.0543, "step": 4153 }, { "epoch": 2.97, "grad_norm": 6.388068135398717, "learning_rate": 1.6557790830778058e-06, "loss": 0.0488, "step": 4154 }, { "epoch": 2.97, "grad_norm": 4.4616777204213145, "learning_rate": 1.6536310970638525e-06, "loss": 0.0606, "step": 4155 }, { "epoch": 2.97, "grad_norm": 20.563153827350366, "learning_rate": 1.6514842291339494e-06, "loss": 0.0524, "step": 4156 }, { "epoch": 2.97, "grad_norm": 10.870505302773477, "learning_rate": 1.6493384800054052e-06, "loss": 0.0561, "step": 4157 }, { "epoch": 2.97, "grad_norm": 9.278625304721448, "learning_rate": 1.6471938503951546e-06, "loss": 0.0536, "step": 4158 }, { "epoch": 2.97, "grad_norm": 4.924475025051709, "learning_rate": 1.6450503410197582e-06, "loss": 0.0427, "step": 4159 }, { "epoch": 2.97, "grad_norm": 14.546159414440977, "learning_rate": 1.6429079525954023e-06, "loss": 0.0796, "step": 4160 }, { "epoch": 2.97, "grad_norm": 3.274576485929536, "learning_rate": 1.6407666858378985e-06, "loss": 0.0368, "step": 4161 }, { "epoch": 2.97, "grad_norm": 4.750934336503444, "learning_rate": 1.6386265414626834e-06, "loss": 0.0559, "step": 4162 }, { "epoch": 2.97, "grad_norm": 3.540640136850486, "learning_rate": 1.636487520184822e-06, "loss": 0.0367, "step": 4163 }, { "epoch": 2.97, "grad_norm": 4.692046580807771, "learning_rate": 1.6343496227189948e-06, "loss": 0.0359, "step": 4164 }, { "epoch": 2.97, "grad_norm": 10.53276192290032, "learning_rate": 1.632212849779521e-06, "loss": 0.0316, "step": 4165 }, { "epoch": 2.97, "grad_norm": 5.302753116947668, "learning_rate": 1.630077202080328e-06, "loss": 0.0267, "step": 4166 }, { "epoch": 2.97, "grad_norm": 5.259703313914846, "learning_rate": 1.6279426803349828e-06, "loss": 0.0547, "step": 4167 }, { "epoch": 2.98, "grad_norm": 4.082893040513792, "learning_rate": 1.6258092852566625e-06, "loss": 0.0217, "step": 4168 }, { "epoch": 2.98, "grad_norm": 8.23019416830836, "learning_rate": 1.6236770175581807e-06, "loss": 0.0509, "step": 4169 }, { "epoch": 2.98, "grad_norm": 12.438101804745394, "learning_rate": 1.62154587795196e-06, "loss": 0.055, "step": 4170 }, { "epoch": 2.98, "grad_norm": 5.427420456057769, "learning_rate": 1.6194158671500616e-06, "loss": 0.0518, "step": 4171 }, { "epoch": 2.98, "grad_norm": 10.415775308376247, "learning_rate": 1.6172869858641554e-06, "loss": 0.0432, "step": 4172 }, { "epoch": 2.98, "grad_norm": 8.480843745125458, "learning_rate": 1.6151592348055433e-06, "loss": 0.0427, "step": 4173 }, { "epoch": 2.98, "grad_norm": 4.617772973040294, "learning_rate": 1.6130326146851455e-06, "loss": 0.0438, "step": 4174 }, { "epoch": 2.98, "grad_norm": 5.919133113007816, "learning_rate": 1.6109071262135056e-06, "loss": 0.0423, "step": 4175 }, { "epoch": 2.98, "grad_norm": 3.689039215465472, "learning_rate": 1.608782770100789e-06, "loss": 0.0372, "step": 4176 }, { "epoch": 2.98, "grad_norm": 14.067237284232494, "learning_rate": 1.6066595470567825e-06, "loss": 0.0435, "step": 4177 }, { "epoch": 2.98, "grad_norm": 5.854433151920558, "learning_rate": 1.6045374577908944e-06, "loss": 0.0504, "step": 4178 }, { "epoch": 2.98, "grad_norm": 11.6646480285659, "learning_rate": 1.6024165030121542e-06, "loss": 0.0554, "step": 4179 }, { "epoch": 2.98, "grad_norm": 6.011438841677108, "learning_rate": 1.6002966834292116e-06, "loss": 0.0504, "step": 4180 }, { "epoch": 2.98, "grad_norm": 6.227447353492944, "learning_rate": 1.5981779997503405e-06, "loss": 0.0476, "step": 4181 }, { "epoch": 2.99, "grad_norm": 10.349021587684007, "learning_rate": 1.5960604526834266e-06, "loss": 0.077, "step": 4182 }, { "epoch": 2.99, "grad_norm": 6.211039507384463, "learning_rate": 1.5939440429359888e-06, "loss": 0.0554, "step": 4183 }, { "epoch": 2.99, "grad_norm": 2.6513931652813705, "learning_rate": 1.591828771215152e-06, "loss": 0.0348, "step": 4184 }, { "epoch": 2.99, "grad_norm": 9.179016643100248, "learning_rate": 1.5897146382276752e-06, "loss": 0.0577, "step": 4185 }, { "epoch": 2.99, "grad_norm": 7.051485770699399, "learning_rate": 1.587601644679922e-06, "loss": 0.0298, "step": 4186 }, { "epoch": 2.99, "grad_norm": 7.952697092704008, "learning_rate": 1.58548979127789e-06, "loss": 0.0339, "step": 4187 }, { "epoch": 2.99, "grad_norm": 3.6748750294465, "learning_rate": 1.5833790787271819e-06, "loss": 0.0346, "step": 4188 }, { "epoch": 2.99, "grad_norm": 5.611862793582695, "learning_rate": 1.5812695077330325e-06, "loss": 0.056, "step": 4189 }, { "epoch": 2.99, "grad_norm": 11.09492276918743, "learning_rate": 1.5791610790002838e-06, "loss": 0.0451, "step": 4190 }, { "epoch": 2.99, "grad_norm": 8.006779281484823, "learning_rate": 1.577053793233403e-06, "loss": 0.049, "step": 4191 }, { "epoch": 2.99, "grad_norm": 10.760492669788933, "learning_rate": 1.5749476511364726e-06, "loss": 0.0446, "step": 4192 }, { "epoch": 2.99, "grad_norm": 10.776948111144355, "learning_rate": 1.5728426534131946e-06, "loss": 0.0305, "step": 4193 }, { "epoch": 2.99, "grad_norm": 4.0188505044934155, "learning_rate": 1.5707388007668877e-06, "loss": 0.0481, "step": 4194 }, { "epoch": 2.99, "grad_norm": 8.979036207018861, "learning_rate": 1.568636093900488e-06, "loss": 0.0398, "step": 4195 }, { "epoch": 3.0, "grad_norm": 6.689484772564502, "learning_rate": 1.5665345335165488e-06, "loss": 0.0412, "step": 4196 }, { "epoch": 3.0, "grad_norm": 4.5182528489425104, "learning_rate": 1.5644341203172415e-06, "loss": 0.0522, "step": 4197 }, { "epoch": 3.0, "grad_norm": 7.494580739110864, "learning_rate": 1.5623348550043516e-06, "loss": 0.0384, "step": 4198 }, { "epoch": 3.0, "grad_norm": 6.568879027287965, "learning_rate": 1.5602367382792839e-06, "loss": 0.0331, "step": 4199 }, { "epoch": 3.0, "grad_norm": 14.033675146688019, "learning_rate": 1.5581397708430578e-06, "loss": 0.0726, "step": 4200 }, { "epoch": 3.0, "grad_norm": 4.394766382636879, "learning_rate": 1.556043953396309e-06, "loss": 0.0311, "step": 4201 }, { "epoch": 3.0, "grad_norm": 11.599669876581697, "learning_rate": 1.5539492866392891e-06, "loss": 0.0349, "step": 4202 }, { "epoch": 3.0, "grad_norm": 5.095777532033208, "learning_rate": 1.551855771271865e-06, "loss": 0.0338, "step": 4203 }, { "epoch": 3.0, "grad_norm": 8.662438814706617, "learning_rate": 1.5497634079935198e-06, "loss": 0.0439, "step": 4204 }, { "epoch": 3.0, "grad_norm": 1.5906055347775252, "learning_rate": 1.5476721975033498e-06, "loss": 0.0178, "step": 4205 }, { "epoch": 3.0, "grad_norm": 3.918653409467006, "learning_rate": 1.5455821405000703e-06, "loss": 0.0265, "step": 4206 }, { "epoch": 3.0, "grad_norm": 5.99063662385307, "learning_rate": 1.5434932376820039e-06, "loss": 0.034, "step": 4207 }, { "epoch": 3.0, "grad_norm": 5.838954154742759, "learning_rate": 1.5414054897470942e-06, "loss": 0.0314, "step": 4208 }, { "epoch": 3.0, "grad_norm": 1.5387521453912725, "learning_rate": 1.5393188973928957e-06, "loss": 0.0201, "step": 4209 }, { "epoch": 3.0, "grad_norm": 3.3458673133915355, "learning_rate": 1.5372334613165784e-06, "loss": 0.0305, "step": 4210 }, { "epoch": 3.01, "grad_norm": 5.3408721579010185, "learning_rate": 1.5351491822149255e-06, "loss": 0.0208, "step": 4211 }, { "epoch": 3.01, "grad_norm": 5.0928277444132934, "learning_rate": 1.533066060784333e-06, "loss": 0.0292, "step": 4212 }, { "epoch": 3.01, "grad_norm": 4.046163091652899, "learning_rate": 1.5309840977208096e-06, "loss": 0.0229, "step": 4213 }, { "epoch": 3.01, "grad_norm": 2.3567429229007706, "learning_rate": 1.5289032937199793e-06, "loss": 0.0251, "step": 4214 }, { "epoch": 3.01, "grad_norm": 4.662301159431529, "learning_rate": 1.5268236494770772e-06, "loss": 0.0248, "step": 4215 }, { "epoch": 3.01, "grad_norm": 5.949991688960428, "learning_rate": 1.5247451656869499e-06, "loss": 0.0298, "step": 4216 }, { "epoch": 3.01, "grad_norm": 4.549139241488894, "learning_rate": 1.5226678430440588e-06, "loss": 0.0263, "step": 4217 }, { "epoch": 3.01, "grad_norm": 3.180555537523223, "learning_rate": 1.5205916822424755e-06, "loss": 0.0239, "step": 4218 }, { "epoch": 3.01, "grad_norm": 3.443474683242361, "learning_rate": 1.5185166839758836e-06, "loss": 0.0295, "step": 4219 }, { "epoch": 3.01, "grad_norm": 1.5504955901675597, "learning_rate": 1.5164428489375789e-06, "loss": 0.0229, "step": 4220 }, { "epoch": 3.01, "grad_norm": 2.295258883006284, "learning_rate": 1.5143701778204683e-06, "loss": 0.0263, "step": 4221 }, { "epoch": 3.01, "grad_norm": 3.551031768131113, "learning_rate": 1.5122986713170712e-06, "loss": 0.0274, "step": 4222 }, { "epoch": 3.01, "grad_norm": 10.191936949992497, "learning_rate": 1.510228330119512e-06, "loss": 0.0294, "step": 4223 }, { "epoch": 3.01, "grad_norm": 6.993474554478619, "learning_rate": 1.5081591549195357e-06, "loss": 0.0232, "step": 4224 }, { "epoch": 3.02, "grad_norm": 9.562045171559191, "learning_rate": 1.5060911464084864e-06, "loss": 0.0357, "step": 4225 }, { "epoch": 3.02, "grad_norm": 3.0924209278860673, "learning_rate": 1.5040243052773312e-06, "loss": 0.0326, "step": 4226 }, { "epoch": 3.02, "grad_norm": 4.420905003070142, "learning_rate": 1.5019586322166323e-06, "loss": 0.0173, "step": 4227 }, { "epoch": 3.02, "grad_norm": 4.811580679582127, "learning_rate": 1.4998941279165773e-06, "loss": 0.0158, "step": 4228 }, { "epoch": 3.02, "grad_norm": 9.851045271456961, "learning_rate": 1.4978307930669483e-06, "loss": 0.024, "step": 4229 }, { "epoch": 3.02, "grad_norm": 9.584372906933657, "learning_rate": 1.4957686283571498e-06, "loss": 0.0256, "step": 4230 }, { "epoch": 3.02, "grad_norm": 6.770213359448928, "learning_rate": 1.4937076344761858e-06, "loss": 0.0271, "step": 4231 }, { "epoch": 3.02, "grad_norm": 3.7727215155076355, "learning_rate": 1.4916478121126732e-06, "loss": 0.0224, "step": 4232 }, { "epoch": 3.02, "grad_norm": 1.893841240458152, "learning_rate": 1.4895891619548374e-06, "loss": 0.0232, "step": 4233 }, { "epoch": 3.02, "grad_norm": 5.161001637713427, "learning_rate": 1.4875316846905113e-06, "loss": 0.023, "step": 4234 }, { "epoch": 3.02, "grad_norm": 11.061852811785181, "learning_rate": 1.4854753810071364e-06, "loss": 0.028, "step": 4235 }, { "epoch": 3.02, "grad_norm": 8.109821052482838, "learning_rate": 1.4834202515917628e-06, "loss": 0.0312, "step": 4236 }, { "epoch": 3.02, "grad_norm": 11.014754015899051, "learning_rate": 1.4813662971310465e-06, "loss": 0.032, "step": 4237 }, { "epoch": 3.02, "grad_norm": 2.2753130698707484, "learning_rate": 1.4793135183112523e-06, "loss": 0.027, "step": 4238 }, { "epoch": 3.03, "grad_norm": 4.690230867022709, "learning_rate": 1.477261915818251e-06, "loss": 0.0206, "step": 4239 }, { "epoch": 3.03, "grad_norm": 5.281295741354212, "learning_rate": 1.4752114903375243e-06, "loss": 0.0131, "step": 4240 }, { "epoch": 3.03, "grad_norm": 4.3490688386339516, "learning_rate": 1.473162242554151e-06, "loss": 0.0299, "step": 4241 }, { "epoch": 3.03, "grad_norm": 8.960682672351583, "learning_rate": 1.47111417315283e-06, "loss": 0.0282, "step": 4242 }, { "epoch": 3.03, "grad_norm": 8.66598782769347, "learning_rate": 1.4690672828178532e-06, "loss": 0.0362, "step": 4243 }, { "epoch": 3.03, "grad_norm": 5.361898466333456, "learning_rate": 1.467021572233131e-06, "loss": 0.0323, "step": 4244 }, { "epoch": 3.03, "grad_norm": 3.731815327650255, "learning_rate": 1.4649770420821663e-06, "loss": 0.0297, "step": 4245 }, { "epoch": 3.03, "grad_norm": 5.073322490267396, "learning_rate": 1.4629336930480813e-06, "loss": 0.0276, "step": 4246 }, { "epoch": 3.03, "grad_norm": 2.1603771617345915, "learning_rate": 1.4608915258135914e-06, "loss": 0.0354, "step": 4247 }, { "epoch": 3.03, "grad_norm": 6.080643211063934, "learning_rate": 1.4588505410610283e-06, "loss": 0.0316, "step": 4248 }, { "epoch": 3.03, "grad_norm": 4.017000843022467, "learning_rate": 1.4568107394723175e-06, "loss": 0.0245, "step": 4249 }, { "epoch": 3.03, "grad_norm": 2.6670606172944034, "learning_rate": 1.4547721217289972e-06, "loss": 0.0292, "step": 4250 }, { "epoch": 3.03, "grad_norm": 8.81886940865451, "learning_rate": 1.4527346885122073e-06, "loss": 0.0335, "step": 4251 }, { "epoch": 3.03, "grad_norm": 6.820384061010361, "learning_rate": 1.450698440502692e-06, "loss": 0.022, "step": 4252 }, { "epoch": 3.04, "grad_norm": 3.2055510470449873, "learning_rate": 1.4486633783807997e-06, "loss": 0.0227, "step": 4253 }, { "epoch": 3.04, "grad_norm": 6.073201859461071, "learning_rate": 1.4466295028264822e-06, "loss": 0.0244, "step": 4254 }, { "epoch": 3.04, "grad_norm": 2.993722905445915, "learning_rate": 1.4445968145192951e-06, "loss": 0.023, "step": 4255 }, { "epoch": 3.04, "grad_norm": 2.6344293218691632, "learning_rate": 1.4425653141383977e-06, "loss": 0.0262, "step": 4256 }, { "epoch": 3.04, "grad_norm": 3.3473254018552536, "learning_rate": 1.4405350023625514e-06, "loss": 0.0179, "step": 4257 }, { "epoch": 3.04, "grad_norm": 2.0005642649862208, "learning_rate": 1.4385058798701223e-06, "loss": 0.0231, "step": 4258 }, { "epoch": 3.04, "grad_norm": 2.6183840757874544, "learning_rate": 1.4364779473390767e-06, "loss": 0.0266, "step": 4259 }, { "epoch": 3.04, "grad_norm": 3.7496408204859124, "learning_rate": 1.4344512054469855e-06, "loss": 0.0322, "step": 4260 }, { "epoch": 3.04, "grad_norm": 5.046695349286583, "learning_rate": 1.4324256548710202e-06, "loss": 0.0212, "step": 4261 }, { "epoch": 3.04, "grad_norm": 4.538256274604029, "learning_rate": 1.430401296287955e-06, "loss": 0.0237, "step": 4262 }, { "epoch": 3.04, "grad_norm": 3.4393732090188815, "learning_rate": 1.4283781303741662e-06, "loss": 0.0169, "step": 4263 }, { "epoch": 3.04, "grad_norm": 4.128065352323907, "learning_rate": 1.4263561578056307e-06, "loss": 0.0239, "step": 4264 }, { "epoch": 3.04, "grad_norm": 3.7030098567807346, "learning_rate": 1.4243353792579285e-06, "loss": 0.028, "step": 4265 }, { "epoch": 3.04, "grad_norm": 2.497292358728816, "learning_rate": 1.4223157954062344e-06, "loss": 0.0199, "step": 4266 }, { "epoch": 3.05, "grad_norm": 6.153106787419073, "learning_rate": 1.4202974069253362e-06, "loss": 0.0301, "step": 4267 }, { "epoch": 3.05, "grad_norm": 1.5508727812000602, "learning_rate": 1.418280214489608e-06, "loss": 0.0194, "step": 4268 }, { "epoch": 3.05, "grad_norm": 3.9753775875375252, "learning_rate": 1.416264218773038e-06, "loss": 0.0207, "step": 4269 }, { "epoch": 3.05, "grad_norm": 4.719703036216151, "learning_rate": 1.4142494204492007e-06, "loss": 0.0307, "step": 4270 }, { "epoch": 3.05, "grad_norm": 1.8739011534547396, "learning_rate": 1.412235820191285e-06, "loss": 0.014, "step": 4271 }, { "epoch": 3.05, "grad_norm": 1.6215340748185867, "learning_rate": 1.4102234186720653e-06, "loss": 0.0198, "step": 4272 }, { "epoch": 3.05, "grad_norm": 1.1853722180193103, "learning_rate": 1.4082122165639285e-06, "loss": 0.0182, "step": 4273 }, { "epoch": 3.05, "grad_norm": 1.4639779224320038, "learning_rate": 1.4062022145388503e-06, "loss": 0.0155, "step": 4274 }, { "epoch": 3.05, "grad_norm": 3.0442440975692535, "learning_rate": 1.4041934132684116e-06, "loss": 0.034, "step": 4275 }, { "epoch": 3.05, "grad_norm": 1.485809174565446, "learning_rate": 1.4021858134237892e-06, "loss": 0.0212, "step": 4276 }, { "epoch": 3.05, "grad_norm": 2.050713547548853, "learning_rate": 1.4001794156757598e-06, "loss": 0.0276, "step": 4277 }, { "epoch": 3.05, "grad_norm": 3.065013650984198, "learning_rate": 1.398174220694699e-06, "loss": 0.0283, "step": 4278 }, { "epoch": 3.05, "grad_norm": 3.4201497917182877, "learning_rate": 1.3961702291505791e-06, "loss": 0.0275, "step": 4279 }, { "epoch": 3.05, "grad_norm": 2.7541672840549887, "learning_rate": 1.3941674417129714e-06, "loss": 0.023, "step": 4280 }, { "epoch": 3.06, "grad_norm": 2.246885890731216, "learning_rate": 1.3921658590510434e-06, "loss": 0.0336, "step": 4281 }, { "epoch": 3.06, "grad_norm": 3.022641932858734, "learning_rate": 1.3901654818335618e-06, "loss": 0.0173, "step": 4282 }, { "epoch": 3.06, "grad_norm": 6.8132174445899665, "learning_rate": 1.3881663107288918e-06, "loss": 0.0199, "step": 4283 }, { "epoch": 3.06, "grad_norm": 1.867430595204692, "learning_rate": 1.386168346404988e-06, "loss": 0.0254, "step": 4284 }, { "epoch": 3.06, "grad_norm": 3.233486849230603, "learning_rate": 1.3841715895294138e-06, "loss": 0.0185, "step": 4285 }, { "epoch": 3.06, "grad_norm": 4.830562376779117, "learning_rate": 1.3821760407693175e-06, "loss": 0.0284, "step": 4286 }, { "epoch": 3.06, "grad_norm": 1.5354005181744397, "learning_rate": 1.3801817007914543e-06, "loss": 0.0244, "step": 4287 }, { "epoch": 3.06, "grad_norm": 3.8781541269057374, "learning_rate": 1.3781885702621644e-06, "loss": 0.0317, "step": 4288 }, { "epoch": 3.06, "grad_norm": 3.305553379082568, "learning_rate": 1.3761966498473956e-06, "loss": 0.017, "step": 4289 }, { "epoch": 3.06, "grad_norm": 5.335432721094165, "learning_rate": 1.3742059402126818e-06, "loss": 0.0297, "step": 4290 }, { "epoch": 3.06, "grad_norm": 3.030662176498976, "learning_rate": 1.3722164420231565e-06, "loss": 0.0234, "step": 4291 }, { "epoch": 3.06, "grad_norm": 2.2382330008234748, "learning_rate": 1.370228155943548e-06, "loss": 0.0209, "step": 4292 }, { "epoch": 3.06, "grad_norm": 2.1132694656038105, "learning_rate": 1.3682410826381816e-06, "loss": 0.0181, "step": 4293 }, { "epoch": 3.06, "grad_norm": 2.882412146996378, "learning_rate": 1.366255222770973e-06, "loss": 0.0223, "step": 4294 }, { "epoch": 3.07, "grad_norm": 4.0879767931219435, "learning_rate": 1.364270577005436e-06, "loss": 0.0446, "step": 4295 }, { "epoch": 3.07, "grad_norm": 3.898872989029239, "learning_rate": 1.3622871460046778e-06, "loss": 0.0242, "step": 4296 }, { "epoch": 3.07, "grad_norm": 2.6033162498479463, "learning_rate": 1.3603049304313992e-06, "loss": 0.0349, "step": 4297 }, { "epoch": 3.07, "grad_norm": 3.5165832344769536, "learning_rate": 1.3583239309478953e-06, "loss": 0.0233, "step": 4298 }, { "epoch": 3.07, "grad_norm": 6.573066298874758, "learning_rate": 1.3563441482160562e-06, "loss": 0.0214, "step": 4299 }, { "epoch": 3.07, "grad_norm": 2.235003481206468, "learning_rate": 1.35436558289736e-06, "loss": 0.0264, "step": 4300 }, { "epoch": 3.07, "grad_norm": 5.948089488451964, "learning_rate": 1.3523882356528883e-06, "loss": 0.0163, "step": 4301 }, { "epoch": 3.07, "grad_norm": 7.51398793823194, "learning_rate": 1.350412107143303e-06, "loss": 0.031, "step": 4302 }, { "epoch": 3.07, "grad_norm": 2.0676717040990757, "learning_rate": 1.3484371980288712e-06, "loss": 0.0195, "step": 4303 }, { "epoch": 3.07, "grad_norm": 2.0443453458688623, "learning_rate": 1.3464635089694416e-06, "loss": 0.0204, "step": 4304 }, { "epoch": 3.07, "grad_norm": 2.4356846280920754, "learning_rate": 1.344491040624466e-06, "loss": 0.0255, "step": 4305 }, { "epoch": 3.07, "grad_norm": 4.725705710515471, "learning_rate": 1.3425197936529766e-06, "loss": 0.0226, "step": 4306 }, { "epoch": 3.07, "grad_norm": 4.276820785445097, "learning_rate": 1.3405497687136098e-06, "loss": 0.0295, "step": 4307 }, { "epoch": 3.07, "grad_norm": 1.4908882433698298, "learning_rate": 1.3385809664645827e-06, "loss": 0.0192, "step": 4308 }, { "epoch": 3.08, "grad_norm": 2.5380484947293613, "learning_rate": 1.336613387563711e-06, "loss": 0.0266, "step": 4309 }, { "epoch": 3.08, "grad_norm": 2.2945673276968894, "learning_rate": 1.3346470326683986e-06, "loss": 0.0289, "step": 4310 }, { "epoch": 3.08, "grad_norm": 2.4304115620560154, "learning_rate": 1.3326819024356413e-06, "loss": 0.0208, "step": 4311 }, { "epoch": 3.08, "grad_norm": 2.8002395851902047, "learning_rate": 1.3307179975220264e-06, "loss": 0.0295, "step": 4312 }, { "epoch": 3.08, "grad_norm": 2.275129136772214, "learning_rate": 1.3287553185837298e-06, "loss": 0.0194, "step": 4313 }, { "epoch": 3.08, "grad_norm": 7.423216970876992, "learning_rate": 1.3267938662765206e-06, "loss": 0.0204, "step": 4314 }, { "epoch": 3.08, "grad_norm": 2.6068477417666607, "learning_rate": 1.324833641255755e-06, "loss": 0.0209, "step": 4315 }, { "epoch": 3.08, "grad_norm": 2.474988243701278, "learning_rate": 1.3228746441763813e-06, "loss": 0.0143, "step": 4316 }, { "epoch": 3.08, "grad_norm": 2.441680844619571, "learning_rate": 1.3209168756929363e-06, "loss": 0.0305, "step": 4317 }, { "epoch": 3.08, "grad_norm": 7.651310891463135, "learning_rate": 1.3189603364595483e-06, "loss": 0.0259, "step": 4318 }, { "epoch": 3.08, "grad_norm": 1.8489347124819762, "learning_rate": 1.3170050271299316e-06, "loss": 0.0206, "step": 4319 }, { "epoch": 3.08, "grad_norm": 2.050853437133708, "learning_rate": 1.315050948357392e-06, "loss": 0.0201, "step": 4320 }, { "epoch": 3.08, "grad_norm": 3.8678126664888266, "learning_rate": 1.3130981007948247e-06, "loss": 0.0286, "step": 4321 }, { "epoch": 3.08, "grad_norm": 3.5527708798067263, "learning_rate": 1.3111464850947103e-06, "loss": 0.0284, "step": 4322 }, { "epoch": 3.09, "grad_norm": 6.799812210909939, "learning_rate": 1.3091961019091216e-06, "loss": 0.0285, "step": 4323 }, { "epoch": 3.09, "grad_norm": 3.718954096429571, "learning_rate": 1.3072469518897184e-06, "loss": 0.0201, "step": 4324 }, { "epoch": 3.09, "grad_norm": 7.454710537462165, "learning_rate": 1.3052990356877444e-06, "loss": 0.0204, "step": 4325 }, { "epoch": 3.09, "grad_norm": 5.810987162547036, "learning_rate": 1.3033523539540394e-06, "loss": 0.0218, "step": 4326 }, { "epoch": 3.09, "grad_norm": 2.6929964518246527, "learning_rate": 1.3014069073390206e-06, "loss": 0.0237, "step": 4327 }, { "epoch": 3.09, "grad_norm": 1.6508887017292961, "learning_rate": 1.2994626964927042e-06, "loss": 0.0233, "step": 4328 }, { "epoch": 3.09, "grad_norm": 1.694127839814189, "learning_rate": 1.2975197220646807e-06, "loss": 0.0146, "step": 4329 }, { "epoch": 3.09, "grad_norm": 8.55682892697071, "learning_rate": 1.29557798470414e-06, "loss": 0.0198, "step": 4330 }, { "epoch": 3.09, "grad_norm": 4.464695362177069, "learning_rate": 1.293637485059847e-06, "loss": 0.0292, "step": 4331 }, { "epoch": 3.09, "grad_norm": 1.3157782950293537, "learning_rate": 1.291698223780164e-06, "loss": 0.0184, "step": 4332 }, { "epoch": 3.09, "grad_norm": 2.6685124060299343, "learning_rate": 1.2897602015130306e-06, "loss": 0.0253, "step": 4333 }, { "epoch": 3.09, "grad_norm": 3.4823603776992815, "learning_rate": 1.287823418905977e-06, "loss": 0.0304, "step": 4334 }, { "epoch": 3.09, "grad_norm": 5.911506432130707, "learning_rate": 1.2858878766061178e-06, "loss": 0.0257, "step": 4335 }, { "epoch": 3.09, "grad_norm": 2.086775123511386, "learning_rate": 1.2839535752601551e-06, "loss": 0.0324, "step": 4336 }, { "epoch": 3.1, "grad_norm": 2.029837014664569, "learning_rate": 1.2820205155143738e-06, "loss": 0.0204, "step": 4337 }, { "epoch": 3.1, "grad_norm": 7.437040162360332, "learning_rate": 1.2800886980146453e-06, "loss": 0.0263, "step": 4338 }, { "epoch": 3.1, "grad_norm": 12.488964061218802, "learning_rate": 1.2781581234064256e-06, "loss": 0.033, "step": 4339 }, { "epoch": 3.1, "grad_norm": 5.81702543197951, "learning_rate": 1.276228792334756e-06, "loss": 0.0384, "step": 4340 }, { "epoch": 3.1, "grad_norm": 2.7625144524398144, "learning_rate": 1.274300705444262e-06, "loss": 0.0206, "step": 4341 }, { "epoch": 3.1, "grad_norm": 1.9870680625000081, "learning_rate": 1.2723738633791538e-06, "loss": 0.0257, "step": 4342 }, { "epoch": 3.1, "grad_norm": 1.9775795029648342, "learning_rate": 1.2704482667832218e-06, "loss": 0.0261, "step": 4343 }, { "epoch": 3.1, "grad_norm": 7.743239827097478, "learning_rate": 1.2685239162998485e-06, "loss": 0.026, "step": 4344 }, { "epoch": 3.1, "grad_norm": 3.6982912516993784, "learning_rate": 1.2666008125719904e-06, "loss": 0.0281, "step": 4345 }, { "epoch": 3.1, "grad_norm": 5.896769242939257, "learning_rate": 1.2646789562421975e-06, "loss": 0.0206, "step": 4346 }, { "epoch": 3.1, "grad_norm": 5.284872574990886, "learning_rate": 1.2627583479525913e-06, "loss": 0.0313, "step": 4347 }, { "epoch": 3.1, "grad_norm": 6.165105344259164, "learning_rate": 1.2608389883448896e-06, "loss": 0.0283, "step": 4348 }, { "epoch": 3.1, "grad_norm": 3.7245284863056223, "learning_rate": 1.2589208780603795e-06, "loss": 0.025, "step": 4349 }, { "epoch": 3.1, "grad_norm": 2.567851032035511, "learning_rate": 1.2570040177399435e-06, "loss": 0.0274, "step": 4350 }, { "epoch": 3.11, "grad_norm": 4.0847654157063555, "learning_rate": 1.255088408024036e-06, "loss": 0.0256, "step": 4351 }, { "epoch": 3.11, "grad_norm": 4.834804601853921, "learning_rate": 1.2531740495526989e-06, "loss": 0.0204, "step": 4352 }, { "epoch": 3.11, "grad_norm": 3.9081054310744254, "learning_rate": 1.2512609429655553e-06, "loss": 0.0241, "step": 4353 }, { "epoch": 3.11, "grad_norm": 3.942714359609263, "learning_rate": 1.249349088901809e-06, "loss": 0.0239, "step": 4354 }, { "epoch": 3.11, "grad_norm": 6.164893142313988, "learning_rate": 1.247438488000247e-06, "loss": 0.0227, "step": 4355 }, { "epoch": 3.11, "grad_norm": 6.8002871593043235, "learning_rate": 1.245529140899236e-06, "loss": 0.0285, "step": 4356 }, { "epoch": 3.11, "grad_norm": 14.128621503244876, "learning_rate": 1.2436210482367245e-06, "loss": 0.0278, "step": 4357 }, { "epoch": 3.11, "grad_norm": 4.892012988671743, "learning_rate": 1.2417142106502418e-06, "loss": 0.0212, "step": 4358 }, { "epoch": 3.11, "grad_norm": 6.258033264830363, "learning_rate": 1.2398086287768969e-06, "loss": 0.0276, "step": 4359 }, { "epoch": 3.11, "grad_norm": 2.533324048190474, "learning_rate": 1.237904303253381e-06, "loss": 0.0209, "step": 4360 }, { "epoch": 3.11, "grad_norm": 7.187459746139216, "learning_rate": 1.236001234715965e-06, "loss": 0.021, "step": 4361 }, { "epoch": 3.11, "grad_norm": 3.648792787441624, "learning_rate": 1.2340994238004987e-06, "loss": 0.0202, "step": 4362 }, { "epoch": 3.11, "grad_norm": 8.691909830938794, "learning_rate": 1.2321988711424132e-06, "loss": 0.0224, "step": 4363 }, { "epoch": 3.11, "grad_norm": 6.995988388162933, "learning_rate": 1.2302995773767174e-06, "loss": 0.0349, "step": 4364 }, { "epoch": 3.12, "grad_norm": 11.008840944917822, "learning_rate": 1.2284015431380015e-06, "loss": 0.0285, "step": 4365 }, { "epoch": 3.12, "grad_norm": 3.373639756983513, "learning_rate": 1.2265047690604354e-06, "loss": 0.0184, "step": 4366 }, { "epoch": 3.12, "grad_norm": 3.803315273323329, "learning_rate": 1.2246092557777633e-06, "loss": 0.0263, "step": 4367 }, { "epoch": 3.12, "grad_norm": 3.4078700572104674, "learning_rate": 1.2227150039233132e-06, "loss": 0.032, "step": 4368 }, { "epoch": 3.12, "grad_norm": 3.581845625792135, "learning_rate": 1.2208220141299893e-06, "loss": 0.0294, "step": 4369 }, { "epoch": 3.12, "grad_norm": 10.077585671022081, "learning_rate": 1.2189302870302755e-06, "loss": 0.0233, "step": 4370 }, { "epoch": 3.12, "grad_norm": 6.127715448474595, "learning_rate": 1.2170398232562324e-06, "loss": 0.0158, "step": 4371 }, { "epoch": 3.12, "grad_norm": 8.996189920092641, "learning_rate": 1.2151506234395e-06, "loss": 0.0471, "step": 4372 }, { "epoch": 3.12, "grad_norm": 8.082755597103926, "learning_rate": 1.2132626882112935e-06, "loss": 0.0246, "step": 4373 }, { "epoch": 3.12, "grad_norm": 4.081561675170032, "learning_rate": 1.211376018202408e-06, "loss": 0.0271, "step": 4374 }, { "epoch": 3.12, "grad_norm": 7.991360997398173, "learning_rate": 1.2094906140432155e-06, "loss": 0.0232, "step": 4375 }, { "epoch": 3.12, "grad_norm": 5.043322011112299, "learning_rate": 1.2076064763636641e-06, "loss": 0.0167, "step": 4376 }, { "epoch": 3.12, "grad_norm": 3.5060059412563933, "learning_rate": 1.205723605793279e-06, "loss": 0.0177, "step": 4377 }, { "epoch": 3.12, "grad_norm": 3.3044991967413786, "learning_rate": 1.2038420029611625e-06, "loss": 0.0185, "step": 4378 }, { "epoch": 3.13, "grad_norm": 2.5519361887925407, "learning_rate": 1.2019616684959934e-06, "loss": 0.0158, "step": 4379 }, { "epoch": 3.13, "grad_norm": 1.5711224232775167, "learning_rate": 1.2000826030260254e-06, "loss": 0.0196, "step": 4380 }, { "epoch": 3.13, "grad_norm": 3.845792898604567, "learning_rate": 1.1982048071790903e-06, "loss": 0.0327, "step": 4381 }, { "epoch": 3.13, "grad_norm": 1.807799902555709, "learning_rate": 1.1963282815825938e-06, "loss": 0.0217, "step": 4382 }, { "epoch": 3.13, "grad_norm": 3.5441110376934706, "learning_rate": 1.194453026863519e-06, "loss": 0.0205, "step": 4383 }, { "epoch": 3.13, "grad_norm": 2.2644135155675387, "learning_rate": 1.1925790436484219e-06, "loss": 0.0182, "step": 4384 }, { "epoch": 3.13, "grad_norm": 7.720254102788194, "learning_rate": 1.1907063325634376e-06, "loss": 0.0249, "step": 4385 }, { "epoch": 3.13, "grad_norm": 2.967984156703408, "learning_rate": 1.1888348942342697e-06, "loss": 0.0239, "step": 4386 }, { "epoch": 3.13, "grad_norm": 5.143287739884021, "learning_rate": 1.1869647292862051e-06, "loss": 0.0336, "step": 4387 }, { "epoch": 3.13, "grad_norm": 6.7684831711996045, "learning_rate": 1.1850958383440957e-06, "loss": 0.0294, "step": 4388 }, { "epoch": 3.13, "grad_norm": 8.714645017881613, "learning_rate": 1.183228222032378e-06, "loss": 0.0316, "step": 4389 }, { "epoch": 3.13, "grad_norm": 5.477500456705119, "learning_rate": 1.181361880975052e-06, "loss": 0.0294, "step": 4390 }, { "epoch": 3.13, "grad_norm": 3.500435025541258, "learning_rate": 1.1794968157957026e-06, "loss": 0.0218, "step": 4391 }, { "epoch": 3.13, "grad_norm": 4.026778952227533, "learning_rate": 1.1776330271174786e-06, "loss": 0.0235, "step": 4392 }, { "epoch": 3.14, "grad_norm": 1.910801405729195, "learning_rate": 1.1757705155631072e-06, "loss": 0.0224, "step": 4393 }, { "epoch": 3.14, "grad_norm": 4.7870288481139776, "learning_rate": 1.1739092817548887e-06, "loss": 0.0263, "step": 4394 }, { "epoch": 3.14, "grad_norm": 3.7988605929737616, "learning_rate": 1.172049326314696e-06, "loss": 0.0221, "step": 4395 }, { "epoch": 3.14, "grad_norm": 4.220102301388364, "learning_rate": 1.1701906498639741e-06, "loss": 0.0193, "step": 4396 }, { "epoch": 3.14, "grad_norm": 1.7825031992726328, "learning_rate": 1.1683332530237423e-06, "loss": 0.0278, "step": 4397 }, { "epoch": 3.14, "grad_norm": 2.9360054998312166, "learning_rate": 1.1664771364145905e-06, "loss": 0.0293, "step": 4398 }, { "epoch": 3.14, "grad_norm": 4.303354411436024, "learning_rate": 1.1646223006566827e-06, "loss": 0.0363, "step": 4399 }, { "epoch": 3.14, "grad_norm": 2.194395308626148, "learning_rate": 1.162768746369753e-06, "loss": 0.0237, "step": 4400 }, { "epoch": 3.14, "grad_norm": 2.116716075883209, "learning_rate": 1.1609164741731105e-06, "loss": 0.0273, "step": 4401 }, { "epoch": 3.14, "grad_norm": 3.210931319165583, "learning_rate": 1.1590654846856291e-06, "loss": 0.0263, "step": 4402 }, { "epoch": 3.14, "grad_norm": 2.119091545167854, "learning_rate": 1.1572157785257643e-06, "loss": 0.0143, "step": 4403 }, { "epoch": 3.14, "grad_norm": 4.324074704437676, "learning_rate": 1.1553673563115325e-06, "loss": 0.0336, "step": 4404 }, { "epoch": 3.14, "grad_norm": 4.170772991278056, "learning_rate": 1.153520218660531e-06, "loss": 0.038, "step": 4405 }, { "epoch": 3.14, "grad_norm": 4.213138084530538, "learning_rate": 1.1516743661899172e-06, "loss": 0.0237, "step": 4406 }, { "epoch": 3.15, "grad_norm": 11.656757681338371, "learning_rate": 1.1498297995164305e-06, "loss": 0.0331, "step": 4407 }, { "epoch": 3.15, "grad_norm": 3.4983502997263547, "learning_rate": 1.1479865192563683e-06, "loss": 0.0318, "step": 4408 }, { "epoch": 3.15, "grad_norm": 2.8807252835615254, "learning_rate": 1.146144526025612e-06, "loss": 0.0242, "step": 4409 }, { "epoch": 3.15, "grad_norm": 2.6306843568728038, "learning_rate": 1.1443038204396007e-06, "loss": 0.0258, "step": 4410 }, { "epoch": 3.15, "grad_norm": 4.192920419278408, "learning_rate": 1.1424644031133502e-06, "loss": 0.0187, "step": 4411 }, { "epoch": 3.15, "grad_norm": 7.177668219309729, "learning_rate": 1.1406262746614433e-06, "loss": 0.026, "step": 4412 }, { "epoch": 3.15, "grad_norm": 4.873333282003289, "learning_rate": 1.1387894356980334e-06, "loss": 0.0289, "step": 4413 }, { "epoch": 3.15, "grad_norm": 2.9519728668908227, "learning_rate": 1.1369538868368424e-06, "loss": 0.0196, "step": 4414 }, { "epoch": 3.15, "grad_norm": 3.8848350672996834, "learning_rate": 1.1351196286911615e-06, "loss": 0.0194, "step": 4415 }, { "epoch": 3.15, "grad_norm": 4.635840605259712, "learning_rate": 1.1332866618738498e-06, "loss": 0.0232, "step": 4416 }, { "epoch": 3.15, "grad_norm": 1.5581337612715147, "learning_rate": 1.1314549869973363e-06, "loss": 0.0232, "step": 4417 }, { "epoch": 3.15, "grad_norm": 1.7583630718679997, "learning_rate": 1.1296246046736176e-06, "loss": 0.0185, "step": 4418 }, { "epoch": 3.15, "grad_norm": 6.283893780931881, "learning_rate": 1.1277955155142578e-06, "loss": 0.0232, "step": 4419 }, { "epoch": 3.15, "grad_norm": 7.862048061035702, "learning_rate": 1.1259677201303905e-06, "loss": 0.0239, "step": 4420 }, { "epoch": 3.16, "grad_norm": 1.6230472483193787, "learning_rate": 1.1241412191327155e-06, "loss": 0.0157, "step": 4421 }, { "epoch": 3.16, "grad_norm": 2.755073615396238, "learning_rate": 1.1223160131315008e-06, "loss": 0.0298, "step": 4422 }, { "epoch": 3.16, "grad_norm": 2.73806500409641, "learning_rate": 1.1204921027365818e-06, "loss": 0.03, "step": 4423 }, { "epoch": 3.16, "grad_norm": 3.1203954227660895, "learning_rate": 1.1186694885573602e-06, "loss": 0.0197, "step": 4424 }, { "epoch": 3.16, "grad_norm": 3.5066645012673496, "learning_rate": 1.1168481712028061e-06, "loss": 0.0297, "step": 4425 }, { "epoch": 3.16, "grad_norm": 4.2675581356015915, "learning_rate": 1.115028151281457e-06, "loss": 0.022, "step": 4426 }, { "epoch": 3.16, "grad_norm": 3.4456080030453426, "learning_rate": 1.1132094294014106e-06, "loss": 0.0126, "step": 4427 }, { "epoch": 3.16, "grad_norm": 6.579809040631279, "learning_rate": 1.1113920061703416e-06, "loss": 0.0318, "step": 4428 }, { "epoch": 3.16, "grad_norm": 2.475266437209171, "learning_rate": 1.1095758821954788e-06, "loss": 0.0166, "step": 4429 }, { "epoch": 3.16, "grad_norm": 2.0190295382576133, "learning_rate": 1.107761058083629e-06, "loss": 0.0327, "step": 4430 }, { "epoch": 3.16, "grad_norm": 1.9898004038566273, "learning_rate": 1.1059475344411535e-06, "loss": 0.0185, "step": 4431 }, { "epoch": 3.16, "grad_norm": 3.6520762370965887, "learning_rate": 1.104135311873989e-06, "loss": 0.012, "step": 4432 }, { "epoch": 3.16, "grad_norm": 1.790944349770655, "learning_rate": 1.1023243909876275e-06, "loss": 0.0175, "step": 4433 }, { "epoch": 3.16, "grad_norm": 1.4622862012678017, "learning_rate": 1.1005147723871374e-06, "loss": 0.0174, "step": 4434 }, { "epoch": 3.17, "grad_norm": 2.29471929364547, "learning_rate": 1.0987064566771405e-06, "loss": 0.021, "step": 4435 }, { "epoch": 3.17, "grad_norm": 6.60079633509637, "learning_rate": 1.0968994444618313e-06, "loss": 0.0331, "step": 4436 }, { "epoch": 3.17, "grad_norm": 1.9722105559545458, "learning_rate": 1.0950937363449659e-06, "loss": 0.0176, "step": 4437 }, { "epoch": 3.17, "grad_norm": 9.226928463154826, "learning_rate": 1.0932893329298643e-06, "loss": 0.0339, "step": 4438 }, { "epoch": 3.17, "grad_norm": 2.7596617157692784, "learning_rate": 1.0914862348194121e-06, "loss": 0.0234, "step": 4439 }, { "epoch": 3.17, "grad_norm": 5.353123299645156, "learning_rate": 1.0896844426160575e-06, "loss": 0.0313, "step": 4440 }, { "epoch": 3.17, "grad_norm": 7.939115234782665, "learning_rate": 1.0878839569218124e-06, "loss": 0.0187, "step": 4441 }, { "epoch": 3.17, "grad_norm": 3.5070933721087356, "learning_rate": 1.0860847783382534e-06, "loss": 0.0319, "step": 4442 }, { "epoch": 3.17, "grad_norm": 7.566534679772082, "learning_rate": 1.0842869074665186e-06, "loss": 0.0226, "step": 4443 }, { "epoch": 3.17, "grad_norm": 2.5892901109879563, "learning_rate": 1.0824903449073115e-06, "loss": 0.0241, "step": 4444 }, { "epoch": 3.17, "grad_norm": 4.0522225755260655, "learning_rate": 1.0806950912608937e-06, "loss": 0.0407, "step": 4445 }, { "epoch": 3.17, "grad_norm": 4.042715326868905, "learning_rate": 1.0789011471270983e-06, "loss": 0.0237, "step": 4446 }, { "epoch": 3.17, "grad_norm": 5.5516387804921985, "learning_rate": 1.0771085131053087e-06, "loss": 0.0225, "step": 4447 }, { "epoch": 3.17, "grad_norm": 3.6164402815909362, "learning_rate": 1.0753171897944835e-06, "loss": 0.0215, "step": 4448 }, { "epoch": 3.18, "grad_norm": 6.6222474603434405, "learning_rate": 1.0735271777931322e-06, "loss": 0.022, "step": 4449 }, { "epoch": 3.18, "grad_norm": 2.592791914105326, "learning_rate": 1.0717384776993356e-06, "loss": 0.0322, "step": 4450 }, { "epoch": 3.18, "grad_norm": 5.227418309409531, "learning_rate": 1.069951090110728e-06, "loss": 0.0263, "step": 4451 }, { "epoch": 3.18, "grad_norm": 2.77396491744881, "learning_rate": 1.06816501562451e-06, "loss": 0.0195, "step": 4452 }, { "epoch": 3.18, "grad_norm": 1.8546536944682173, "learning_rate": 1.0663802548374424e-06, "loss": 0.0214, "step": 4453 }, { "epoch": 3.18, "grad_norm": 3.6321220341074585, "learning_rate": 1.064596808345847e-06, "loss": 0.0237, "step": 4454 }, { "epoch": 3.18, "grad_norm": 9.088425273383129, "learning_rate": 1.0628146767456066e-06, "loss": 0.031, "step": 4455 }, { "epoch": 3.18, "grad_norm": 2.244296281743473, "learning_rate": 1.061033860632164e-06, "loss": 0.0244, "step": 4456 }, { "epoch": 3.18, "grad_norm": 1.8747595056637634, "learning_rate": 1.0592543606005235e-06, "loss": 0.0254, "step": 4457 }, { "epoch": 3.18, "grad_norm": 3.2313945845341676, "learning_rate": 1.0574761772452486e-06, "loss": 0.0342, "step": 4458 }, { "epoch": 3.18, "grad_norm": 2.0815746855400103, "learning_rate": 1.0556993111604635e-06, "loss": 0.0226, "step": 4459 }, { "epoch": 3.18, "grad_norm": 8.359714562424623, "learning_rate": 1.0539237629398536e-06, "loss": 0.0335, "step": 4460 }, { "epoch": 3.18, "grad_norm": 3.709979629335406, "learning_rate": 1.052149533176659e-06, "loss": 0.0193, "step": 4461 }, { "epoch": 3.18, "grad_norm": 2.239188178419034, "learning_rate": 1.050376622463688e-06, "loss": 0.0217, "step": 4462 }, { "epoch": 3.19, "grad_norm": 4.238067345733504, "learning_rate": 1.0486050313932972e-06, "loss": 0.0246, "step": 4463 }, { "epoch": 3.19, "grad_norm": 2.8338637835412612, "learning_rate": 1.0468347605574137e-06, "loss": 0.0266, "step": 4464 }, { "epoch": 3.19, "grad_norm": 4.869316442569333, "learning_rate": 1.0450658105475126e-06, "loss": 0.0232, "step": 4465 }, { "epoch": 3.19, "grad_norm": 3.5665750122251625, "learning_rate": 1.0432981819546384e-06, "loss": 0.0309, "step": 4466 }, { "epoch": 3.19, "grad_norm": 8.936241009064924, "learning_rate": 1.0415318753693837e-06, "loss": 0.0236, "step": 4467 }, { "epoch": 3.19, "grad_norm": 3.101588523930688, "learning_rate": 1.0397668913819086e-06, "loss": 0.0239, "step": 4468 }, { "epoch": 3.19, "grad_norm": 4.295993293606268, "learning_rate": 1.0380032305819243e-06, "loss": 0.0223, "step": 4469 }, { "epoch": 3.19, "grad_norm": 2.3409226213317025, "learning_rate": 1.0362408935587026e-06, "loss": 0.0309, "step": 4470 }, { "epoch": 3.19, "grad_norm": 2.8493429852061487, "learning_rate": 1.0344798809010748e-06, "loss": 0.0246, "step": 4471 }, { "epoch": 3.19, "grad_norm": 4.991729680693538, "learning_rate": 1.0327201931974262e-06, "loss": 0.0165, "step": 4472 }, { "epoch": 3.19, "grad_norm": 1.8954580681443631, "learning_rate": 1.0309618310357023e-06, "loss": 0.0196, "step": 4473 }, { "epoch": 3.19, "grad_norm": 10.27608130286604, "learning_rate": 1.0292047950034046e-06, "loss": 0.0312, "step": 4474 }, { "epoch": 3.19, "grad_norm": 2.153159817327546, "learning_rate": 1.0274490856875908e-06, "loss": 0.0109, "step": 4475 }, { "epoch": 3.19, "grad_norm": 1.5007061694216104, "learning_rate": 1.0256947036748766e-06, "loss": 0.0186, "step": 4476 }, { "epoch": 3.2, "grad_norm": 1.7574699954655115, "learning_rate": 1.0239416495514331e-06, "loss": 0.0267, "step": 4477 }, { "epoch": 3.2, "grad_norm": 3.1371640873189954, "learning_rate": 1.0221899239029887e-06, "loss": 0.0185, "step": 4478 }, { "epoch": 3.2, "grad_norm": 5.019779692114384, "learning_rate": 1.0204395273148277e-06, "loss": 0.0157, "step": 4479 }, { "epoch": 3.2, "grad_norm": 7.540699269714138, "learning_rate": 1.0186904603717894e-06, "loss": 0.027, "step": 4480 }, { "epoch": 3.2, "grad_norm": 3.9237182973169595, "learning_rate": 1.0169427236582702e-06, "loss": 0.0377, "step": 4481 }, { "epoch": 3.2, "grad_norm": 4.278480950923269, "learning_rate": 1.0151963177582208e-06, "loss": 0.0245, "step": 4482 }, { "epoch": 3.2, "grad_norm": 2.312231473624651, "learning_rate": 1.0134512432551492e-06, "loss": 0.0183, "step": 4483 }, { "epoch": 3.2, "grad_norm": 2.0986535225249416, "learning_rate": 1.0117075007321152e-06, "loss": 0.0143, "step": 4484 }, { "epoch": 3.2, "grad_norm": 5.975648412284588, "learning_rate": 1.009965090771739e-06, "loss": 0.0197, "step": 4485 }, { "epoch": 3.2, "grad_norm": 4.374236320725866, "learning_rate": 1.0082240139561866e-06, "loss": 0.0254, "step": 4486 }, { "epoch": 3.2, "grad_norm": 6.532426395412988, "learning_rate": 1.0064842708671908e-06, "loss": 0.0208, "step": 4487 }, { "epoch": 3.2, "grad_norm": 3.6018002154036113, "learning_rate": 1.0047458620860251e-06, "loss": 0.0195, "step": 4488 }, { "epoch": 3.2, "grad_norm": 2.4807601063417293, "learning_rate": 1.0030087881935308e-06, "loss": 0.0158, "step": 4489 }, { "epoch": 3.2, "grad_norm": 3.4131438838142536, "learning_rate": 1.0012730497700912e-06, "loss": 0.0293, "step": 4490 }, { "epoch": 3.21, "grad_norm": 7.077550750630359, "learning_rate": 9.995386473956531e-07, "loss": 0.0251, "step": 4491 }, { "epoch": 3.21, "grad_norm": 2.258377721601201, "learning_rate": 9.978055816497084e-07, "loss": 0.0227, "step": 4492 }, { "epoch": 3.21, "grad_norm": 2.229914063381325, "learning_rate": 9.960738531113118e-07, "loss": 0.0209, "step": 4493 }, { "epoch": 3.21, "grad_norm": 2.1564691341307487, "learning_rate": 9.94343462359061e-07, "loss": 0.014, "step": 4494 }, { "epoch": 3.21, "grad_norm": 5.382295516557594, "learning_rate": 9.926144099711138e-07, "loss": 0.0277, "step": 4495 }, { "epoch": 3.21, "grad_norm": 2.516122869741995, "learning_rate": 9.90886696525179e-07, "loss": 0.022, "step": 4496 }, { "epoch": 3.21, "grad_norm": 7.685796145048834, "learning_rate": 9.89160322598517e-07, "loss": 0.0294, "step": 4497 }, { "epoch": 3.21, "grad_norm": 2.327309331754003, "learning_rate": 9.874352887679416e-07, "loss": 0.0273, "step": 4498 }, { "epoch": 3.21, "grad_norm": 2.8309636158087965, "learning_rate": 9.857115956098196e-07, "loss": 0.0287, "step": 4499 }, { "epoch": 3.21, "grad_norm": 3.2811544904994556, "learning_rate": 9.839892437000675e-07, "loss": 0.0135, "step": 4500 }, { "epoch": 3.21, "eval_avg_AUC": 0.8345286134092101, "eval_avg_Accuracy": 0.7458968832891246, "eval_avg_Accuracy-right": 0.888352680318247, "eval_avg_Accuracy-wrong": 0.49749829429156245, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.7117713077832741, "eval_last_AUC": 0.8350567927270806, "eval_last_Accuracy": 0.7807526525198939, "eval_last_Accuracy-right": 0.8413982000782575, "eval_last_Accuracy-wrong": 0.6750056856947919, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.7121076559312289, "eval_max_AUC": 0.7854278482735377, "eval_max_Accuracy": 0.6475878647214854, "eval_max_Accuracy-right": 0.9867614451545585, "eval_max_Accuracy-wrong": 0.056174664544007276, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.6558197091469862, "eval_min_AUC": 0.843139889983326, "eval_min_Accuracy": 0.7734996684350133, "eval_min_Accuracy-right": 0.7857049693491587, "eval_min_Accuracy-wrong": 0.7522174209688424, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.715089706106881, "eval_prod_AUC": 0.8407684428232383, "eval_prod_Accuracy": 0.7391826923076923, "eval_prod_Accuracy-right": 0.6629059606104083, "eval_prod_Accuracy-wrong": 0.8721855810780077, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.7128298178825854, "eval_runtime": 247.0161, "eval_samples_per_second": 97.678, "eval_steps_per_second": 3.052, "eval_sum_AUC": 0.7186416616330681, "eval_sum_Accuracy": 0.6397546419098143, "eval_sum_Accuracy-right": 0.9967392722055562, "eval_sum_Accuracy-wrong": 0.017284512167386856, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.6893739044164864, "step": 4500 }, { "epoch": 3.21, "grad_norm": 1.4722866410217152, "learning_rate": 9.822682336141558e-07, "loss": 0.0178, "step": 4501 }, { "epoch": 3.21, "grad_norm": 2.181579380218437, "learning_rate": 9.805485659271064e-07, "loss": 0.0168, "step": 4502 }, { "epoch": 3.21, "grad_norm": 7.752924249109985, "learning_rate": 9.788302412134931e-07, "loss": 0.0352, "step": 4503 }, { "epoch": 3.21, "grad_norm": 1.9281551566524828, "learning_rate": 9.77113260047436e-07, "loss": 0.0192, "step": 4504 }, { "epoch": 3.22, "grad_norm": 1.9477930911053865, "learning_rate": 9.753976230026158e-07, "loss": 0.0234, "step": 4505 }, { "epoch": 3.22, "grad_norm": 6.050182109498725, "learning_rate": 9.736833306522537e-07, "loss": 0.0257, "step": 4506 }, { "epoch": 3.22, "grad_norm": 5.560668219829083, "learning_rate": 9.719703835691314e-07, "loss": 0.0216, "step": 4507 }, { "epoch": 3.22, "grad_norm": 10.427582712165258, "learning_rate": 9.702587823255715e-07, "loss": 0.0413, "step": 4508 }, { "epoch": 3.22, "grad_norm": 2.6181714476401536, "learning_rate": 9.685485274934576e-07, "loss": 0.0179, "step": 4509 }, { "epoch": 3.22, "grad_norm": 9.156873614600924, "learning_rate": 9.66839619644211e-07, "loss": 0.0251, "step": 4510 }, { "epoch": 3.22, "grad_norm": 3.4673271118101643, "learning_rate": 9.651320593488162e-07, "loss": 0.0191, "step": 4511 }, { "epoch": 3.22, "grad_norm": 1.6137388839865046, "learning_rate": 9.634258471777958e-07, "loss": 0.0184, "step": 4512 }, { "epoch": 3.22, "grad_norm": 3.6836288992334048, "learning_rate": 9.617209837012287e-07, "loss": 0.0264, "step": 4513 }, { "epoch": 3.22, "grad_norm": 5.5935012176996635, "learning_rate": 9.600174694887421e-07, "loss": 0.0247, "step": 4514 }, { "epoch": 3.22, "grad_norm": 7.298134909789238, "learning_rate": 9.583153051095107e-07, "loss": 0.0247, "step": 4515 }, { "epoch": 3.22, "grad_norm": 5.308498768012288, "learning_rate": 9.5661449113226e-07, "loss": 0.0287, "step": 4516 }, { "epoch": 3.22, "grad_norm": 1.8179009486270912, "learning_rate": 9.549150281252633e-07, "loss": 0.0206, "step": 4517 }, { "epoch": 3.22, "grad_norm": 3.3811622578866727, "learning_rate": 9.532169166563426e-07, "loss": 0.0284, "step": 4518 }, { "epoch": 3.23, "grad_norm": 3.8222062654617406, "learning_rate": 9.515201572928689e-07, "loss": 0.028, "step": 4519 }, { "epoch": 3.23, "grad_norm": 3.0854257738751163, "learning_rate": 9.49824750601761e-07, "loss": 0.0316, "step": 4520 }, { "epoch": 3.23, "grad_norm": 2.5397879239379098, "learning_rate": 9.481306971494858e-07, "loss": 0.0265, "step": 4521 }, { "epoch": 3.23, "grad_norm": 3.2717124935629953, "learning_rate": 9.464379975020576e-07, "loss": 0.0278, "step": 4522 }, { "epoch": 3.23, "grad_norm": 9.377789209888471, "learning_rate": 9.447466522250393e-07, "loss": 0.0318, "step": 4523 }, { "epoch": 3.23, "grad_norm": 6.441122861364104, "learning_rate": 9.430566618835407e-07, "loss": 0.0314, "step": 4524 }, { "epoch": 3.23, "grad_norm": 2.3160747535915265, "learning_rate": 9.413680270422187e-07, "loss": 0.0289, "step": 4525 }, { "epoch": 3.23, "grad_norm": 1.6669925746710255, "learning_rate": 9.396807482652775e-07, "loss": 0.0241, "step": 4526 }, { "epoch": 3.23, "grad_norm": 2.8847819825771777, "learning_rate": 9.3799482611647e-07, "loss": 0.0266, "step": 4527 }, { "epoch": 3.23, "grad_norm": 4.401247157463592, "learning_rate": 9.363102611590918e-07, "loss": 0.0206, "step": 4528 }, { "epoch": 3.23, "grad_norm": 4.810249161783182, "learning_rate": 9.346270539559882e-07, "loss": 0.0213, "step": 4529 }, { "epoch": 3.23, "grad_norm": 7.9036297509881095, "learning_rate": 9.329452050695497e-07, "loss": 0.0221, "step": 4530 }, { "epoch": 3.23, "grad_norm": 3.570781343969614, "learning_rate": 9.312647150617144e-07, "loss": 0.0207, "step": 4531 }, { "epoch": 3.23, "grad_norm": 1.6397653229972227, "learning_rate": 9.295855844939639e-07, "loss": 0.012, "step": 4532 }, { "epoch": 3.24, "grad_norm": 2.258414374329704, "learning_rate": 9.279078139273279e-07, "loss": 0.0181, "step": 4533 }, { "epoch": 3.24, "grad_norm": 8.950313191107599, "learning_rate": 9.262314039223802e-07, "loss": 0.0312, "step": 4534 }, { "epoch": 3.24, "grad_norm": 2.1815871351729617, "learning_rate": 9.245563550392406e-07, "loss": 0.0223, "step": 4535 }, { "epoch": 3.24, "grad_norm": 4.859815723424763, "learning_rate": 9.22882667837574e-07, "loss": 0.0266, "step": 4536 }, { "epoch": 3.24, "grad_norm": 2.355286650208844, "learning_rate": 9.212103428765912e-07, "loss": 0.0203, "step": 4537 }, { "epoch": 3.24, "grad_norm": 2.0779801100488684, "learning_rate": 9.19539380715046e-07, "loss": 0.0234, "step": 4538 }, { "epoch": 3.24, "grad_norm": 9.338632225920227, "learning_rate": 9.178697819112381e-07, "loss": 0.0253, "step": 4539 }, { "epoch": 3.24, "grad_norm": 2.2899098228257464, "learning_rate": 9.162015470230123e-07, "loss": 0.0175, "step": 4540 }, { "epoch": 3.24, "grad_norm": 1.5376658110359513, "learning_rate": 9.145346766077562e-07, "loss": 0.025, "step": 4541 }, { "epoch": 3.24, "grad_norm": 1.9588815283252403, "learning_rate": 9.128691712224025e-07, "loss": 0.0208, "step": 4542 }, { "epoch": 3.24, "grad_norm": 1.793777776979931, "learning_rate": 9.112050314234272e-07, "loss": 0.0204, "step": 4543 }, { "epoch": 3.24, "grad_norm": 3.048832843843268, "learning_rate": 9.0954225776685e-07, "loss": 0.0146, "step": 4544 }, { "epoch": 3.24, "grad_norm": 2.3438063416343393, "learning_rate": 9.078808508082354e-07, "loss": 0.0206, "step": 4545 }, { "epoch": 3.24, "grad_norm": 1.9916762447835414, "learning_rate": 9.06220811102691e-07, "loss": 0.0257, "step": 4546 }, { "epoch": 3.25, "grad_norm": 5.85438556591717, "learning_rate": 9.045621392048637e-07, "loss": 0.0235, "step": 4547 }, { "epoch": 3.25, "grad_norm": 2.46668861130742, "learning_rate": 9.029048356689507e-07, "loss": 0.0255, "step": 4548 }, { "epoch": 3.25, "grad_norm": 4.512065904072692, "learning_rate": 9.012489010486835e-07, "loss": 0.0242, "step": 4549 }, { "epoch": 3.25, "grad_norm": 3.86233637448217, "learning_rate": 8.995943358973463e-07, "loss": 0.0244, "step": 4550 }, { "epoch": 3.25, "grad_norm": 3.2961520762406575, "learning_rate": 8.979411407677535e-07, "loss": 0.0241, "step": 4551 }, { "epoch": 3.25, "grad_norm": 4.216390230976483, "learning_rate": 8.962893162122749e-07, "loss": 0.0201, "step": 4552 }, { "epoch": 3.25, "grad_norm": 8.348781851060899, "learning_rate": 8.946388627828106e-07, "loss": 0.0311, "step": 4553 }, { "epoch": 3.25, "grad_norm": 6.084513848361836, "learning_rate": 8.929897810308102e-07, "loss": 0.025, "step": 4554 }, { "epoch": 3.25, "grad_norm": 8.2559434065546, "learning_rate": 8.913420715072619e-07, "loss": 0.0396, "step": 4555 }, { "epoch": 3.25, "grad_norm": 4.631280709242924, "learning_rate": 8.896957347626966e-07, "loss": 0.0321, "step": 4556 }, { "epoch": 3.25, "grad_norm": 3.139609791677315, "learning_rate": 8.880507713471853e-07, "loss": 0.0157, "step": 4557 }, { "epoch": 3.25, "grad_norm": 3.308998251970711, "learning_rate": 8.864071818103415e-07, "loss": 0.0179, "step": 4558 }, { "epoch": 3.25, "grad_norm": 2.4597231253581624, "learning_rate": 8.847649667013187e-07, "loss": 0.0248, "step": 4559 }, { "epoch": 3.25, "grad_norm": 4.1940520085889625, "learning_rate": 8.831241265688112e-07, "loss": 0.0284, "step": 4560 }, { "epoch": 3.26, "grad_norm": 9.58472105522457, "learning_rate": 8.814846619610545e-07, "loss": 0.0199, "step": 4561 }, { "epoch": 3.26, "grad_norm": 4.313842958685549, "learning_rate": 8.79846573425826e-07, "loss": 0.0267, "step": 4562 }, { "epoch": 3.26, "grad_norm": 3.509297605663404, "learning_rate": 8.782098615104373e-07, "loss": 0.0182, "step": 4563 }, { "epoch": 3.26, "grad_norm": 1.6077932936017227, "learning_rate": 8.765745267617487e-07, "loss": 0.0247, "step": 4564 }, { "epoch": 3.26, "grad_norm": 2.9038110707158453, "learning_rate": 8.749405697261515e-07, "loss": 0.0228, "step": 4565 }, { "epoch": 3.26, "grad_norm": 3.162924718811875, "learning_rate": 8.733079909495868e-07, "loss": 0.0229, "step": 4566 }, { "epoch": 3.26, "grad_norm": 2.0279848737464015, "learning_rate": 8.716767909775231e-07, "loss": 0.028, "step": 4567 }, { "epoch": 3.26, "grad_norm": 6.216485879376879, "learning_rate": 8.700469703549802e-07, "loss": 0.0333, "step": 4568 }, { "epoch": 3.26, "grad_norm": 7.520962408278871, "learning_rate": 8.684185296265074e-07, "loss": 0.0197, "step": 4569 }, { "epoch": 3.26, "grad_norm": 8.629919683440207, "learning_rate": 8.667914693362006e-07, "loss": 0.0187, "step": 4570 }, { "epoch": 3.26, "grad_norm": 7.682867297333057, "learning_rate": 8.651657900276878e-07, "loss": 0.0235, "step": 4571 }, { "epoch": 3.26, "grad_norm": 6.700794151870362, "learning_rate": 8.635414922441398e-07, "loss": 0.0226, "step": 4572 }, { "epoch": 3.26, "grad_norm": 1.3609985270641234, "learning_rate": 8.61918576528265e-07, "loss": 0.0191, "step": 4573 }, { "epoch": 3.26, "grad_norm": 8.333651139592504, "learning_rate": 8.60297043422309e-07, "loss": 0.0236, "step": 4574 }, { "epoch": 3.27, "grad_norm": 5.2970319558403975, "learning_rate": 8.586768934680572e-07, "loss": 0.0227, "step": 4575 }, { "epoch": 3.27, "grad_norm": 8.021689608056526, "learning_rate": 8.570581272068307e-07, "loss": 0.0183, "step": 4576 }, { "epoch": 3.27, "grad_norm": 2.30192703714936, "learning_rate": 8.554407451794905e-07, "loss": 0.0168, "step": 4577 }, { "epoch": 3.27, "grad_norm": 10.68080512987844, "learning_rate": 8.538247479264327e-07, "loss": 0.0336, "step": 4578 }, { "epoch": 3.27, "grad_norm": 4.080574857143387, "learning_rate": 8.522101359875934e-07, "loss": 0.0234, "step": 4579 }, { "epoch": 3.27, "grad_norm": 6.2548733609393015, "learning_rate": 8.505969099024436e-07, "loss": 0.0318, "step": 4580 }, { "epoch": 3.27, "grad_norm": 2.452170699337819, "learning_rate": 8.489850702099922e-07, "loss": 0.0318, "step": 4581 }, { "epoch": 3.27, "grad_norm": 5.3176103182624805, "learning_rate": 8.473746174487846e-07, "loss": 0.0326, "step": 4582 }, { "epoch": 3.27, "grad_norm": 4.033919981368345, "learning_rate": 8.457655521569036e-07, "loss": 0.0287, "step": 4583 }, { "epoch": 3.27, "grad_norm": 8.624185466236282, "learning_rate": 8.441578748719676e-07, "loss": 0.0274, "step": 4584 }, { "epoch": 3.27, "grad_norm": 2.78523165440918, "learning_rate": 8.425515861311312e-07, "loss": 0.0197, "step": 4585 }, { "epoch": 3.27, "grad_norm": 6.283027986139463, "learning_rate": 8.409466864710858e-07, "loss": 0.0248, "step": 4586 }, { "epoch": 3.27, "grad_norm": 5.122071684243946, "learning_rate": 8.393431764280591e-07, "loss": 0.0175, "step": 4587 }, { "epoch": 3.27, "grad_norm": 7.677661344943328, "learning_rate": 8.377410565378097e-07, "loss": 0.026, "step": 4588 }, { "epoch": 3.28, "grad_norm": 6.845604358260919, "learning_rate": 8.361403273356411e-07, "loss": 0.0262, "step": 4589 }, { "epoch": 3.28, "grad_norm": 1.9332179724262128, "learning_rate": 8.345409893563816e-07, "loss": 0.0208, "step": 4590 }, { "epoch": 3.28, "grad_norm": 4.828115067765296, "learning_rate": 8.329430431344043e-07, "loss": 0.0283, "step": 4591 }, { "epoch": 3.28, "grad_norm": 4.041522711176144, "learning_rate": 8.313464892036083e-07, "loss": 0.0182, "step": 4592 }, { "epoch": 3.28, "grad_norm": 3.5931461402812284, "learning_rate": 8.297513280974362e-07, "loss": 0.0205, "step": 4593 }, { "epoch": 3.28, "grad_norm": 7.322773741524985, "learning_rate": 8.281575603488573e-07, "loss": 0.0245, "step": 4594 }, { "epoch": 3.28, "grad_norm": 2.2037482548752654, "learning_rate": 8.265651864903823e-07, "loss": 0.0228, "step": 4595 }, { "epoch": 3.28, "grad_norm": 1.8543264539363935, "learning_rate": 8.249742070540506e-07, "loss": 0.0205, "step": 4596 }, { "epoch": 3.28, "grad_norm": 4.841219706984167, "learning_rate": 8.233846225714386e-07, "loss": 0.0315, "step": 4597 }, { "epoch": 3.28, "grad_norm": 1.9559510186530535, "learning_rate": 8.217964335736556e-07, "loss": 0.0196, "step": 4598 }, { "epoch": 3.28, "grad_norm": 2.369932136393158, "learning_rate": 8.202096405913462e-07, "loss": 0.0211, "step": 4599 }, { "epoch": 3.28, "grad_norm": 3.149653056045046, "learning_rate": 8.186242441546866e-07, "loss": 0.0208, "step": 4600 }, { "epoch": 3.28, "grad_norm": 5.1208147167576294, "learning_rate": 8.170402447933873e-07, "loss": 0.0377, "step": 4601 }, { "epoch": 3.28, "grad_norm": 2.50638995160429, "learning_rate": 8.154576430366922e-07, "loss": 0.0223, "step": 4602 }, { "epoch": 3.29, "grad_norm": 5.384939207639812, "learning_rate": 8.13876439413378e-07, "loss": 0.028, "step": 4603 }, { "epoch": 3.29, "grad_norm": 1.762765510713304, "learning_rate": 8.122966344517536e-07, "loss": 0.0239, "step": 4604 }, { "epoch": 3.29, "grad_norm": 3.0740858313842443, "learning_rate": 8.107182286796633e-07, "loss": 0.0244, "step": 4605 }, { "epoch": 3.29, "grad_norm": 3.9397427075031013, "learning_rate": 8.091412226244771e-07, "loss": 0.0172, "step": 4606 }, { "epoch": 3.29, "grad_norm": 3.5915667776547204, "learning_rate": 8.07565616813108e-07, "loss": 0.0344, "step": 4607 }, { "epoch": 3.29, "grad_norm": 3.9272905067394714, "learning_rate": 8.059914117719897e-07, "loss": 0.0253, "step": 4608 }, { "epoch": 3.29, "grad_norm": 3.2510066913863462, "learning_rate": 8.044186080270983e-07, "loss": 0.0248, "step": 4609 }, { "epoch": 3.29, "grad_norm": 1.2617444867153567, "learning_rate": 8.028472061039322e-07, "loss": 0.0192, "step": 4610 }, { "epoch": 3.29, "grad_norm": 1.939700065819408, "learning_rate": 8.012772065275304e-07, "loss": 0.021, "step": 4611 }, { "epoch": 3.29, "grad_norm": 2.1320831911446376, "learning_rate": 7.997086098224555e-07, "loss": 0.0199, "step": 4612 }, { "epoch": 3.29, "grad_norm": 1.884317978445364, "learning_rate": 7.981414165128065e-07, "loss": 0.0151, "step": 4613 }, { "epoch": 3.29, "grad_norm": 4.5621264328140905, "learning_rate": 7.965756271222108e-07, "loss": 0.0208, "step": 4614 }, { "epoch": 3.29, "grad_norm": 1.440610499433646, "learning_rate": 7.950112421738282e-07, "loss": 0.0178, "step": 4615 }, { "epoch": 3.29, "grad_norm": 2.258538996875376, "learning_rate": 7.934482621903494e-07, "loss": 0.0262, "step": 4616 }, { "epoch": 3.3, "grad_norm": 3.0165939984057832, "learning_rate": 7.91886687693994e-07, "loss": 0.0165, "step": 4617 }, { "epoch": 3.3, "grad_norm": 5.695857517771355, "learning_rate": 7.903265192065141e-07, "loss": 0.0197, "step": 4618 }, { "epoch": 3.3, "grad_norm": 1.7055585895666883, "learning_rate": 7.887677572491903e-07, "loss": 0.0187, "step": 4619 }, { "epoch": 3.3, "grad_norm": 7.5887897339828685, "learning_rate": 7.872104023428339e-07, "loss": 0.0238, "step": 4620 }, { "epoch": 3.3, "grad_norm": 3.374951107007746, "learning_rate": 7.856544550077883e-07, "loss": 0.0257, "step": 4621 }, { "epoch": 3.3, "grad_norm": 1.855484086420173, "learning_rate": 7.840999157639195e-07, "loss": 0.019, "step": 4622 }, { "epoch": 3.3, "grad_norm": 2.792593588176287, "learning_rate": 7.825467851306335e-07, "loss": 0.0207, "step": 4623 }, { "epoch": 3.3, "grad_norm": 9.742513544372374, "learning_rate": 7.809950636268554e-07, "loss": 0.0224, "step": 4624 }, { "epoch": 3.3, "grad_norm": 5.9915718445374795, "learning_rate": 7.794447517710485e-07, "loss": 0.0223, "step": 4625 }, { "epoch": 3.3, "grad_norm": 2.4603505001707378, "learning_rate": 7.778958500811961e-07, "loss": 0.0317, "step": 4626 }, { "epoch": 3.3, "grad_norm": 2.349151356305383, "learning_rate": 7.7634835907482e-07, "loss": 0.0232, "step": 4627 }, { "epoch": 3.3, "grad_norm": 5.788557646206046, "learning_rate": 7.748022792689613e-07, "loss": 0.0233, "step": 4628 }, { "epoch": 3.3, "grad_norm": 4.051836245160571, "learning_rate": 7.732576111801982e-07, "loss": 0.0125, "step": 4629 }, { "epoch": 3.3, "grad_norm": 2.8753818346406246, "learning_rate": 7.717143553246298e-07, "loss": 0.0306, "step": 4630 }, { "epoch": 3.31, "grad_norm": 2.577591283510448, "learning_rate": 7.701725122178871e-07, "loss": 0.0186, "step": 4631 }, { "epoch": 3.31, "grad_norm": 11.542060297374746, "learning_rate": 7.686320823751298e-07, "loss": 0.0478, "step": 4632 }, { "epoch": 3.31, "grad_norm": 2.371685944489164, "learning_rate": 7.670930663110426e-07, "loss": 0.0251, "step": 4633 }, { "epoch": 3.31, "grad_norm": 1.7468794803617154, "learning_rate": 7.655554645398405e-07, "loss": 0.0228, "step": 4634 }, { "epoch": 3.31, "grad_norm": 1.245714311313327, "learning_rate": 7.640192775752647e-07, "loss": 0.0189, "step": 4635 }, { "epoch": 3.31, "grad_norm": 4.9245303461145715, "learning_rate": 7.624845059305836e-07, "loss": 0.028, "step": 4636 }, { "epoch": 3.31, "grad_norm": 3.7927382042335367, "learning_rate": 7.609511501185929e-07, "loss": 0.0305, "step": 4637 }, { "epoch": 3.31, "grad_norm": 1.8226386480619419, "learning_rate": 7.594192106516151e-07, "loss": 0.024, "step": 4638 }, { "epoch": 3.31, "grad_norm": 4.11296383175397, "learning_rate": 7.578886880414999e-07, "loss": 0.0342, "step": 4639 }, { "epoch": 3.31, "grad_norm": 2.9392679931067014, "learning_rate": 7.563595827996235e-07, "loss": 0.0247, "step": 4640 }, { "epoch": 3.31, "grad_norm": 3.0949822502136284, "learning_rate": 7.548318954368883e-07, "loss": 0.0252, "step": 4641 }, { "epoch": 3.31, "grad_norm": 2.384999555074518, "learning_rate": 7.533056264637228e-07, "loss": 0.0207, "step": 4642 }, { "epoch": 3.31, "grad_norm": 1.6244482154616275, "learning_rate": 7.51780776390082e-07, "loss": 0.0199, "step": 4643 }, { "epoch": 3.31, "grad_norm": 5.849177252436853, "learning_rate": 7.50257345725447e-07, "loss": 0.0227, "step": 4644 }, { "epoch": 3.32, "grad_norm": 7.411666454715126, "learning_rate": 7.487353349788234e-07, "loss": 0.0223, "step": 4645 }, { "epoch": 3.32, "grad_norm": 2.944448161435966, "learning_rate": 7.472147446587452e-07, "loss": 0.0213, "step": 4646 }, { "epoch": 3.32, "grad_norm": 4.2919752995505585, "learning_rate": 7.456955752732659e-07, "loss": 0.0292, "step": 4647 }, { "epoch": 3.32, "grad_norm": 1.7148082801364484, "learning_rate": 7.441778273299738e-07, "loss": 0.0146, "step": 4648 }, { "epoch": 3.32, "grad_norm": 5.5113167274528125, "learning_rate": 7.426615013359706e-07, "loss": 0.0242, "step": 4649 }, { "epoch": 3.32, "grad_norm": 3.4930471463189185, "learning_rate": 7.411465977978949e-07, "loss": 0.0214, "step": 4650 }, { "epoch": 3.32, "grad_norm": 2.7583759156057948, "learning_rate": 7.396331172218996e-07, "loss": 0.0263, "step": 4651 }, { "epoch": 3.32, "grad_norm": 3.0835376005766184, "learning_rate": 7.381210601136702e-07, "loss": 0.0226, "step": 4652 }, { "epoch": 3.32, "grad_norm": 7.939488044101162, "learning_rate": 7.366104269784086e-07, "loss": 0.0222, "step": 4653 }, { "epoch": 3.32, "grad_norm": 7.71559173914037, "learning_rate": 7.351012183208511e-07, "loss": 0.0246, "step": 4654 }, { "epoch": 3.32, "grad_norm": 3.3505208772325212, "learning_rate": 7.335934346452484e-07, "loss": 0.022, "step": 4655 }, { "epoch": 3.32, "grad_norm": 2.522427789926864, "learning_rate": 7.320870764553795e-07, "loss": 0.0258, "step": 4656 }, { "epoch": 3.32, "grad_norm": 4.625965296359435, "learning_rate": 7.305821442545474e-07, "loss": 0.0318, "step": 4657 }, { "epoch": 3.32, "grad_norm": 1.666715669670613, "learning_rate": 7.290786385455778e-07, "loss": 0.0267, "step": 4658 }, { "epoch": 3.33, "grad_norm": 7.423398252798393, "learning_rate": 7.275765598308199e-07, "loss": 0.0457, "step": 4659 }, { "epoch": 3.33, "grad_norm": 4.94580570567993, "learning_rate": 7.26075908612146e-07, "loss": 0.0223, "step": 4660 }, { "epoch": 3.33, "grad_norm": 2.6200308676820714, "learning_rate": 7.245766853909519e-07, "loss": 0.0302, "step": 4661 }, { "epoch": 3.33, "grad_norm": 2.565484956644103, "learning_rate": 7.230788906681558e-07, "loss": 0.0139, "step": 4662 }, { "epoch": 3.33, "grad_norm": 7.519074271081959, "learning_rate": 7.215825249441982e-07, "loss": 0.0438, "step": 4663 }, { "epoch": 3.33, "grad_norm": 2.9349056048339865, "learning_rate": 7.200875887190445e-07, "loss": 0.0319, "step": 4664 }, { "epoch": 3.33, "grad_norm": 1.7802836631328636, "learning_rate": 7.185940824921772e-07, "loss": 0.0256, "step": 4665 }, { "epoch": 3.33, "grad_norm": 2.3165391573905887, "learning_rate": 7.171020067626089e-07, "loss": 0.0241, "step": 4666 }, { "epoch": 3.33, "grad_norm": 10.891239008200467, "learning_rate": 7.156113620288646e-07, "loss": 0.0393, "step": 4667 }, { "epoch": 3.33, "grad_norm": 8.083097463271788, "learning_rate": 7.141221487890027e-07, "loss": 0.0271, "step": 4668 }, { "epoch": 3.33, "grad_norm": 4.11813397569581, "learning_rate": 7.126343675405905e-07, "loss": 0.0164, "step": 4669 }, { "epoch": 3.33, "grad_norm": 5.230913755558933, "learning_rate": 7.111480187807296e-07, "loss": 0.0227, "step": 4670 }, { "epoch": 3.33, "grad_norm": 2.4065813670459244, "learning_rate": 7.096631030060308e-07, "loss": 0.0265, "step": 4671 }, { "epoch": 3.33, "grad_norm": 2.9175150140266624, "learning_rate": 7.081796207126373e-07, "loss": 0.0167, "step": 4672 }, { "epoch": 3.34, "grad_norm": 5.43837835587297, "learning_rate": 7.06697572396205e-07, "loss": 0.0166, "step": 4673 }, { "epoch": 3.34, "grad_norm": 5.613308008082632, "learning_rate": 7.052169585519142e-07, "loss": 0.0162, "step": 4674 }, { "epoch": 3.34, "grad_norm": 4.7970690996692165, "learning_rate": 7.037377796744666e-07, "loss": 0.0214, "step": 4675 }, { "epoch": 3.34, "grad_norm": 6.004092278249741, "learning_rate": 7.022600362580817e-07, "loss": 0.0234, "step": 4676 }, { "epoch": 3.34, "grad_norm": 4.179910056219882, "learning_rate": 7.007837287965024e-07, "loss": 0.014, "step": 4677 }, { "epoch": 3.34, "grad_norm": 5.786550535023316, "learning_rate": 6.993088577829904e-07, "loss": 0.0287, "step": 4678 }, { "epoch": 3.34, "grad_norm": 8.49339045448722, "learning_rate": 6.978354237103264e-07, "loss": 0.0291, "step": 4679 }, { "epoch": 3.34, "grad_norm": 1.9362629401618683, "learning_rate": 6.963634270708137e-07, "loss": 0.0211, "step": 4680 }, { "epoch": 3.34, "grad_norm": 3.2084607601706954, "learning_rate": 6.948928683562722e-07, "loss": 0.0145, "step": 4681 }, { "epoch": 3.34, "grad_norm": 6.388831706059872, "learning_rate": 6.934237480580435e-07, "loss": 0.0261, "step": 4682 }, { "epoch": 3.34, "grad_norm": 8.92008557471272, "learning_rate": 6.919560666669889e-07, "loss": 0.025, "step": 4683 }, { "epoch": 3.34, "grad_norm": 3.5596242184271523, "learning_rate": 6.904898246734864e-07, "loss": 0.024, "step": 4684 }, { "epoch": 3.34, "grad_norm": 5.813474706944849, "learning_rate": 6.890250225674361e-07, "loss": 0.0238, "step": 4685 }, { "epoch": 3.34, "grad_norm": 6.456049406535201, "learning_rate": 6.875616608382562e-07, "loss": 0.0235, "step": 4686 }, { "epoch": 3.35, "grad_norm": 4.27019288004158, "learning_rate": 6.860997399748792e-07, "loss": 0.0241, "step": 4687 }, { "epoch": 3.35, "grad_norm": 1.4468000545374635, "learning_rate": 6.846392604657653e-07, "loss": 0.0173, "step": 4688 }, { "epoch": 3.35, "grad_norm": 2.0063895434320953, "learning_rate": 6.831802227988843e-07, "loss": 0.0128, "step": 4689 }, { "epoch": 3.35, "grad_norm": 4.125314951911715, "learning_rate": 6.817226274617283e-07, "loss": 0.0184, "step": 4690 }, { "epoch": 3.35, "grad_norm": 4.390828758784832, "learning_rate": 6.802664749413079e-07, "loss": 0.0222, "step": 4691 }, { "epoch": 3.35, "grad_norm": 2.617942356146691, "learning_rate": 6.788117657241506e-07, "loss": 0.017, "step": 4692 }, { "epoch": 3.35, "grad_norm": 5.244438256633183, "learning_rate": 6.773585002963007e-07, "loss": 0.0287, "step": 4693 }, { "epoch": 3.35, "grad_norm": 5.848459710599114, "learning_rate": 6.759066791433228e-07, "loss": 0.025, "step": 4694 }, { "epoch": 3.35, "grad_norm": 4.5353239904491645, "learning_rate": 6.744563027502959e-07, "loss": 0.0229, "step": 4695 }, { "epoch": 3.35, "grad_norm": 6.973704437407207, "learning_rate": 6.730073716018187e-07, "loss": 0.0313, "step": 4696 }, { "epoch": 3.35, "grad_norm": 3.2864116478612178, "learning_rate": 6.715598861820055e-07, "loss": 0.0179, "step": 4697 }, { "epoch": 3.35, "grad_norm": 2.120343882460884, "learning_rate": 6.701138469744883e-07, "loss": 0.0195, "step": 4698 }, { "epoch": 3.35, "grad_norm": 2.2826363640967333, "learning_rate": 6.686692544624157e-07, "loss": 0.0249, "step": 4699 }, { "epoch": 3.35, "grad_norm": 2.2453281994267424, "learning_rate": 6.672261091284526e-07, "loss": 0.0298, "step": 4700 }, { "epoch": 3.36, "grad_norm": 4.527463652856993, "learning_rate": 6.657844114547812e-07, "loss": 0.0332, "step": 4701 }, { "epoch": 3.36, "grad_norm": 12.362619407093279, "learning_rate": 6.643441619230989e-07, "loss": 0.0338, "step": 4702 }, { "epoch": 3.36, "grad_norm": 4.943448636464124, "learning_rate": 6.629053610146202e-07, "loss": 0.0238, "step": 4703 }, { "epoch": 3.36, "grad_norm": 3.9420677318165227, "learning_rate": 6.61468009210075e-07, "loss": 0.0211, "step": 4704 }, { "epoch": 3.36, "grad_norm": 2.069578664244523, "learning_rate": 6.600321069897097e-07, "loss": 0.0155, "step": 4705 }, { "epoch": 3.36, "grad_norm": 6.509957366481949, "learning_rate": 6.585976548332856e-07, "loss": 0.0441, "step": 4706 }, { "epoch": 3.36, "grad_norm": 3.550669987246946, "learning_rate": 6.571646532200815e-07, "loss": 0.0255, "step": 4707 }, { "epoch": 3.36, "grad_norm": 2.07784447599088, "learning_rate": 6.557331026288855e-07, "loss": 0.0137, "step": 4708 }, { "epoch": 3.36, "grad_norm": 2.33838698403416, "learning_rate": 6.543030035380099e-07, "loss": 0.017, "step": 4709 }, { "epoch": 3.36, "grad_norm": 4.926223440067987, "learning_rate": 6.528743564252737e-07, "loss": 0.023, "step": 4710 }, { "epoch": 3.36, "grad_norm": 3.0636490626676856, "learning_rate": 6.514471617680184e-07, "loss": 0.0229, "step": 4711 }, { "epoch": 3.36, "grad_norm": 2.4741516925135745, "learning_rate": 6.500214200430921e-07, "loss": 0.017, "step": 4712 }, { "epoch": 3.36, "grad_norm": 2.4767551496918587, "learning_rate": 6.485971317268658e-07, "loss": 0.0192, "step": 4713 }, { "epoch": 3.36, "grad_norm": 1.7457587448524006, "learning_rate": 6.471742972952172e-07, "loss": 0.0236, "step": 4714 }, { "epoch": 3.37, "grad_norm": 3.6635714256934637, "learning_rate": 6.457529172235427e-07, "loss": 0.0201, "step": 4715 }, { "epoch": 3.37, "grad_norm": 3.108350647844174, "learning_rate": 6.44332991986753e-07, "loss": 0.0288, "step": 4716 }, { "epoch": 3.37, "grad_norm": 1.5565324965470344, "learning_rate": 6.429145220592703e-07, "loss": 0.0173, "step": 4717 }, { "epoch": 3.37, "grad_norm": 5.2053739429829005, "learning_rate": 6.414975079150321e-07, "loss": 0.0235, "step": 4718 }, { "epoch": 3.37, "grad_norm": 6.228664595722526, "learning_rate": 6.400819500274891e-07, "loss": 0.0341, "step": 4719 }, { "epoch": 3.37, "grad_norm": 6.550911531984451, "learning_rate": 6.386678488696057e-07, "loss": 0.0226, "step": 4720 }, { "epoch": 3.37, "grad_norm": 1.3109577383123228, "learning_rate": 6.372552049138591e-07, "loss": 0.0179, "step": 4721 }, { "epoch": 3.37, "grad_norm": 4.114856675378657, "learning_rate": 6.358440186322401e-07, "loss": 0.0256, "step": 4722 }, { "epoch": 3.37, "grad_norm": 2.4993011122011786, "learning_rate": 6.344342904962536e-07, "loss": 0.0167, "step": 4723 }, { "epoch": 3.37, "grad_norm": 2.1082760871594544, "learning_rate": 6.330260209769124e-07, "loss": 0.0158, "step": 4724 }, { "epoch": 3.37, "grad_norm": 3.5954228865011024, "learning_rate": 6.316192105447499e-07, "loss": 0.0181, "step": 4725 }, { "epoch": 3.37, "grad_norm": 3.250561893307712, "learning_rate": 6.302138596698032e-07, "loss": 0.0179, "step": 4726 }, { "epoch": 3.37, "grad_norm": 5.034063607262308, "learning_rate": 6.288099688216309e-07, "loss": 0.0226, "step": 4727 }, { "epoch": 3.37, "grad_norm": 3.382398513985597, "learning_rate": 6.27407538469294e-07, "loss": 0.0223, "step": 4728 }, { "epoch": 3.38, "grad_norm": 3.5480387834752256, "learning_rate": 6.260065690813754e-07, "loss": 0.0244, "step": 4729 }, { "epoch": 3.38, "grad_norm": 3.894566614171884, "learning_rate": 6.246070611259603e-07, "loss": 0.038, "step": 4730 }, { "epoch": 3.38, "grad_norm": 4.908257487051644, "learning_rate": 6.232090150706555e-07, "loss": 0.0319, "step": 4731 }, { "epoch": 3.38, "grad_norm": 2.2356350192362955, "learning_rate": 6.218124313825696e-07, "loss": 0.0118, "step": 4732 }, { "epoch": 3.38, "grad_norm": 2.7833053267514285, "learning_rate": 6.204173105283295e-07, "loss": 0.0305, "step": 4733 }, { "epoch": 3.38, "grad_norm": 3.0358977610764937, "learning_rate": 6.190236529740701e-07, "loss": 0.0213, "step": 4734 }, { "epoch": 3.38, "grad_norm": 2.7799160284276017, "learning_rate": 6.176314591854388e-07, "loss": 0.024, "step": 4735 }, { "epoch": 3.38, "grad_norm": 2.411428363895045, "learning_rate": 6.162407296275936e-07, "loss": 0.024, "step": 4736 }, { "epoch": 3.38, "grad_norm": 4.987033320862642, "learning_rate": 6.148514647652026e-07, "loss": 0.0213, "step": 4737 }, { "epoch": 3.38, "grad_norm": 7.294162310830876, "learning_rate": 6.134636650624448e-07, "loss": 0.0306, "step": 4738 }, { "epoch": 3.38, "grad_norm": 1.9285035219081978, "learning_rate": 6.120773309830108e-07, "loss": 0.0151, "step": 4739 }, { "epoch": 3.38, "grad_norm": 2.9779680103281083, "learning_rate": 6.106924629900996e-07, "loss": 0.023, "step": 4740 }, { "epoch": 3.38, "grad_norm": 2.802445642894432, "learning_rate": 6.09309061546422e-07, "loss": 0.0204, "step": 4741 }, { "epoch": 3.38, "grad_norm": 3.5712560891174503, "learning_rate": 6.079271271141979e-07, "loss": 0.0258, "step": 4742 }, { "epoch": 3.39, "grad_norm": 2.7284199278884085, "learning_rate": 6.065466601551578e-07, "loss": 0.0249, "step": 4743 }, { "epoch": 3.39, "grad_norm": 2.1930907915124753, "learning_rate": 6.051676611305401e-07, "loss": 0.0327, "step": 4744 }, { "epoch": 3.39, "grad_norm": 3.0801008957709364, "learning_rate": 6.037901305010951e-07, "loss": 0.0156, "step": 4745 }, { "epoch": 3.39, "grad_norm": 2.2322852894555716, "learning_rate": 6.024140687270813e-07, "loss": 0.0134, "step": 4746 }, { "epoch": 3.39, "grad_norm": 3.7566488570881544, "learning_rate": 6.010394762682659e-07, "loss": 0.018, "step": 4747 }, { "epoch": 3.39, "grad_norm": 2.266574212516343, "learning_rate": 5.996663535839275e-07, "loss": 0.0201, "step": 4748 }, { "epoch": 3.39, "grad_norm": 2.644461947400835, "learning_rate": 5.982947011328489e-07, "loss": 0.0243, "step": 4749 }, { "epoch": 3.39, "grad_norm": 2.973314350873358, "learning_rate": 5.969245193733275e-07, "loss": 0.0241, "step": 4750 }, { "epoch": 3.39, "grad_norm": 4.151307716784977, "learning_rate": 5.955558087631641e-07, "loss": 0.021, "step": 4751 }, { "epoch": 3.39, "grad_norm": 4.048781625741955, "learning_rate": 5.941885697596734e-07, "loss": 0.0216, "step": 4752 }, { "epoch": 3.39, "grad_norm": 3.429703276468199, "learning_rate": 5.928228028196714e-07, "loss": 0.0236, "step": 4753 }, { "epoch": 3.39, "grad_norm": 2.739052446340753, "learning_rate": 5.914585083994906e-07, "loss": 0.017, "step": 4754 }, { "epoch": 3.39, "grad_norm": 3.04638508596632, "learning_rate": 5.900956869549629e-07, "loss": 0.0281, "step": 4755 }, { "epoch": 3.39, "grad_norm": 4.775164840894267, "learning_rate": 5.887343389414363e-07, "loss": 0.0363, "step": 4756 }, { "epoch": 3.4, "grad_norm": 1.9766077837059972, "learning_rate": 5.873744648137592e-07, "loss": 0.032, "step": 4757 }, { "epoch": 3.4, "grad_norm": 1.932804696724512, "learning_rate": 5.860160650262925e-07, "loss": 0.0171, "step": 4758 }, { "epoch": 3.4, "grad_norm": 2.4446932774769956, "learning_rate": 5.846591400329021e-07, "loss": 0.0123, "step": 4759 }, { "epoch": 3.4, "grad_norm": 2.8494754763154027, "learning_rate": 5.833036902869626e-07, "loss": 0.0229, "step": 4760 }, { "epoch": 3.4, "grad_norm": 3.070788144846567, "learning_rate": 5.81949716241354e-07, "loss": 0.0193, "step": 4761 }, { "epoch": 3.4, "grad_norm": 3.3717997476125032, "learning_rate": 5.805972183484654e-07, "loss": 0.0272, "step": 4762 }, { "epoch": 3.4, "grad_norm": 1.891084092238694, "learning_rate": 5.792461970601903e-07, "loss": 0.02, "step": 4763 }, { "epoch": 3.4, "grad_norm": 1.6405267772372447, "learning_rate": 5.778966528279306e-07, "loss": 0.016, "step": 4764 }, { "epoch": 3.4, "grad_norm": 3.494793807973005, "learning_rate": 5.765485861025944e-07, "loss": 0.0191, "step": 4765 }, { "epoch": 3.4, "grad_norm": 2.096825436091188, "learning_rate": 5.752019973345963e-07, "loss": 0.0208, "step": 4766 }, { "epoch": 3.4, "grad_norm": 1.678751370669293, "learning_rate": 5.738568869738537e-07, "loss": 0.0174, "step": 4767 }, { "epoch": 3.4, "grad_norm": 4.166473159072722, "learning_rate": 5.725132554697971e-07, "loss": 0.024, "step": 4768 }, { "epoch": 3.4, "grad_norm": 4.3633134523358335, "learning_rate": 5.711711032713547e-07, "loss": 0.0182, "step": 4769 }, { "epoch": 3.4, "grad_norm": 2.4438998290312313, "learning_rate": 5.698304308269686e-07, "loss": 0.0225, "step": 4770 }, { "epoch": 3.41, "grad_norm": 1.6826909960550536, "learning_rate": 5.684912385845786e-07, "loss": 0.0165, "step": 4771 }, { "epoch": 3.41, "grad_norm": 4.223897905047371, "learning_rate": 5.671535269916373e-07, "loss": 0.0189, "step": 4772 }, { "epoch": 3.41, "grad_norm": 4.451136477185219, "learning_rate": 5.658172964950953e-07, "loss": 0.0172, "step": 4773 }, { "epoch": 3.41, "grad_norm": 2.8782735665276964, "learning_rate": 5.644825475414162e-07, "loss": 0.0149, "step": 4774 }, { "epoch": 3.41, "grad_norm": 5.622361288092509, "learning_rate": 5.631492805765609e-07, "loss": 0.0232, "step": 4775 }, { "epoch": 3.41, "grad_norm": 2.0095858180774, "learning_rate": 5.618174960459999e-07, "loss": 0.0343, "step": 4776 }, { "epoch": 3.41, "grad_norm": 4.312258137394328, "learning_rate": 5.604871943947071e-07, "loss": 0.0214, "step": 4777 }, { "epoch": 3.41, "grad_norm": 12.714949832938647, "learning_rate": 5.591583760671609e-07, "loss": 0.0416, "step": 4778 }, { "epoch": 3.41, "grad_norm": 6.098583691916675, "learning_rate": 5.578310415073451e-07, "loss": 0.0201, "step": 4779 }, { "epoch": 3.41, "grad_norm": 3.0148079241593546, "learning_rate": 5.565051911587455e-07, "loss": 0.0257, "step": 4780 }, { "epoch": 3.41, "grad_norm": 2.1858188497490474, "learning_rate": 5.551808254643543e-07, "loss": 0.0237, "step": 4781 }, { "epoch": 3.41, "grad_norm": 7.095925230827684, "learning_rate": 5.538579448666675e-07, "loss": 0.0175, "step": 4782 }, { "epoch": 3.41, "grad_norm": 2.3749779288397534, "learning_rate": 5.525365498076807e-07, "loss": 0.0241, "step": 4783 }, { "epoch": 3.41, "grad_norm": 2.1169352892214444, "learning_rate": 5.51216640728901e-07, "loss": 0.0263, "step": 4784 }, { "epoch": 3.42, "grad_norm": 1.5113690124988577, "learning_rate": 5.498982180713308e-07, "loss": 0.0244, "step": 4785 }, { "epoch": 3.42, "grad_norm": 5.743204991318629, "learning_rate": 5.485812822754826e-07, "loss": 0.0365, "step": 4786 }, { "epoch": 3.42, "grad_norm": 5.539838806774285, "learning_rate": 5.472658337813664e-07, "loss": 0.0237, "step": 4787 }, { "epoch": 3.42, "grad_norm": 2.172371017801228, "learning_rate": 5.459518730285007e-07, "loss": 0.0176, "step": 4788 }, { "epoch": 3.42, "grad_norm": 7.723251710056527, "learning_rate": 5.446394004559008e-07, "loss": 0.0264, "step": 4789 }, { "epoch": 3.42, "grad_norm": 3.6273060070583125, "learning_rate": 5.43328416502093e-07, "loss": 0.0194, "step": 4790 }, { "epoch": 3.42, "grad_norm": 5.07385094981908, "learning_rate": 5.420189216050969e-07, "loss": 0.0348, "step": 4791 }, { "epoch": 3.42, "grad_norm": 2.3283643337319133, "learning_rate": 5.407109162024409e-07, "loss": 0.0178, "step": 4792 }, { "epoch": 3.42, "grad_norm": 1.9835055011656801, "learning_rate": 5.394044007311544e-07, "loss": 0.0307, "step": 4793 }, { "epoch": 3.42, "grad_norm": 8.575120390262455, "learning_rate": 5.380993756277675e-07, "loss": 0.0278, "step": 4794 }, { "epoch": 3.42, "grad_norm": 2.5516786025822182, "learning_rate": 5.367958413283141e-07, "loss": 0.0219, "step": 4795 }, { "epoch": 3.42, "grad_norm": 9.296882286540368, "learning_rate": 5.354937982683283e-07, "loss": 0.0288, "step": 4796 }, { "epoch": 3.42, "grad_norm": 3.5951696364880306, "learning_rate": 5.341932468828481e-07, "loss": 0.0207, "step": 4797 }, { "epoch": 3.42, "grad_norm": 3.7362598014081576, "learning_rate": 5.328941876064114e-07, "loss": 0.0259, "step": 4798 }, { "epoch": 3.43, "grad_norm": 2.4075304327331613, "learning_rate": 5.315966208730578e-07, "loss": 0.0219, "step": 4799 }, { "epoch": 3.43, "grad_norm": 2.245070060364752, "learning_rate": 5.30300547116328e-07, "loss": 0.024, "step": 4800 }, { "epoch": 3.43, "grad_norm": 4.6692741589089675, "learning_rate": 5.290059667692655e-07, "loss": 0.0308, "step": 4801 }, { "epoch": 3.43, "grad_norm": 3.000431411370373, "learning_rate": 5.277128802644133e-07, "loss": 0.0195, "step": 4802 }, { "epoch": 3.43, "grad_norm": 8.965965626231066, "learning_rate": 5.264212880338154e-07, "loss": 0.0414, "step": 4803 }, { "epoch": 3.43, "grad_norm": 5.876899527347979, "learning_rate": 5.251311905090167e-07, "loss": 0.0178, "step": 4804 }, { "epoch": 3.43, "grad_norm": 4.667587955450332, "learning_rate": 5.238425881210624e-07, "loss": 0.0205, "step": 4805 }, { "epoch": 3.43, "grad_norm": 20.266107099324124, "learning_rate": 5.225554813004996e-07, "loss": 0.0611, "step": 4806 }, { "epoch": 3.43, "grad_norm": 7.450685723335502, "learning_rate": 5.21269870477375e-07, "loss": 0.0396, "step": 4807 }, { "epoch": 3.43, "grad_norm": 2.8697958072570726, "learning_rate": 5.199857560812316e-07, "loss": 0.0218, "step": 4808 }, { "epoch": 3.43, "grad_norm": 5.772178641403427, "learning_rate": 5.187031385411206e-07, "loss": 0.0254, "step": 4809 }, { "epoch": 3.43, "grad_norm": 6.398504975526057, "learning_rate": 5.174220182855844e-07, "loss": 0.0217, "step": 4810 }, { "epoch": 3.43, "grad_norm": 7.595861533557351, "learning_rate": 5.161423957426725e-07, "loss": 0.0276, "step": 4811 }, { "epoch": 3.43, "grad_norm": 10.706839343586863, "learning_rate": 5.148642713399272e-07, "loss": 0.0207, "step": 4812 }, { "epoch": 3.44, "grad_norm": 9.248483629993162, "learning_rate": 5.13587645504397e-07, "loss": 0.0163, "step": 4813 }, { "epoch": 3.44, "grad_norm": 4.5853381448697315, "learning_rate": 5.123125186626227e-07, "loss": 0.032, "step": 4814 }, { "epoch": 3.44, "grad_norm": 2.6956221870281567, "learning_rate": 5.110388912406517e-07, "loss": 0.0199, "step": 4815 }, { "epoch": 3.44, "grad_norm": 6.193292360315304, "learning_rate": 5.097667636640241e-07, "loss": 0.0251, "step": 4816 }, { "epoch": 3.44, "grad_norm": 4.970186592154489, "learning_rate": 5.084961363577817e-07, "loss": 0.0285, "step": 4817 }, { "epoch": 3.44, "grad_norm": 2.2733355420940127, "learning_rate": 5.072270097464649e-07, "loss": 0.0255, "step": 4818 }, { "epoch": 3.44, "grad_norm": 1.282955595029532, "learning_rate": 5.059593842541127e-07, "loss": 0.0167, "step": 4819 }, { "epoch": 3.44, "grad_norm": 2.9173904214804534, "learning_rate": 5.04693260304262e-07, "loss": 0.0229, "step": 4820 }, { "epoch": 3.44, "grad_norm": 8.307122745953784, "learning_rate": 5.034286383199488e-07, "loss": 0.0229, "step": 4821 }, { "epoch": 3.44, "grad_norm": 10.234033109502867, "learning_rate": 5.021655187237067e-07, "loss": 0.0214, "step": 4822 }, { "epoch": 3.44, "grad_norm": 5.625721576167736, "learning_rate": 5.009039019375672e-07, "loss": 0.0244, "step": 4823 }, { "epoch": 3.44, "grad_norm": 6.9193873894305, "learning_rate": 4.996437883830596e-07, "loss": 0.0136, "step": 4824 }, { "epoch": 3.44, "grad_norm": 1.9958571703345365, "learning_rate": 4.983851784812127e-07, "loss": 0.0194, "step": 4825 }, { "epoch": 3.44, "grad_norm": 3.457883216533768, "learning_rate": 4.97128072652549e-07, "loss": 0.0174, "step": 4826 }, { "epoch": 3.45, "grad_norm": 3.264883698161827, "learning_rate": 4.958724713170943e-07, "loss": 0.0245, "step": 4827 }, { "epoch": 3.45, "grad_norm": 1.6502594031454794, "learning_rate": 4.946183748943639e-07, "loss": 0.0254, "step": 4828 }, { "epoch": 3.45, "grad_norm": 1.992537904277229, "learning_rate": 4.933657838033795e-07, "loss": 0.0189, "step": 4829 }, { "epoch": 3.45, "grad_norm": 2.8779533325492452, "learning_rate": 4.921146984626507e-07, "loss": 0.0241, "step": 4830 }, { "epoch": 3.45, "grad_norm": 3.3861004105405037, "learning_rate": 4.908651192901926e-07, "loss": 0.022, "step": 4831 }, { "epoch": 3.45, "grad_norm": 1.7445944302458545, "learning_rate": 4.896170467035089e-07, "loss": 0.0226, "step": 4832 }, { "epoch": 3.45, "grad_norm": 3.5655653643743537, "learning_rate": 4.883704811196072e-07, "loss": 0.0161, "step": 4833 }, { "epoch": 3.45, "grad_norm": 8.66061468381288, "learning_rate": 4.871254229549855e-07, "loss": 0.0523, "step": 4834 }, { "epoch": 3.45, "grad_norm": 2.657255822757317, "learning_rate": 4.858818726256425e-07, "loss": 0.028, "step": 4835 }, { "epoch": 3.45, "grad_norm": 5.472995401509639, "learning_rate": 4.846398305470712e-07, "loss": 0.0291, "step": 4836 }, { "epoch": 3.45, "grad_norm": 5.524011687090876, "learning_rate": 4.833992971342604e-07, "loss": 0.0233, "step": 4837 }, { "epoch": 3.45, "grad_norm": 8.501747778235401, "learning_rate": 4.821602728016955e-07, "loss": 0.0182, "step": 4838 }, { "epoch": 3.45, "grad_norm": 4.991085189378154, "learning_rate": 4.809227579633585e-07, "loss": 0.0291, "step": 4839 }, { "epoch": 3.45, "grad_norm": 5.373771634991806, "learning_rate": 4.796867530327249e-07, "loss": 0.0316, "step": 4840 }, { "epoch": 3.46, "grad_norm": 2.6554569867124536, "learning_rate": 4.784522584227675e-07, "loss": 0.0186, "step": 4841 }, { "epoch": 3.46, "grad_norm": 5.663613995175922, "learning_rate": 4.772192745459536e-07, "loss": 0.0249, "step": 4842 }, { "epoch": 3.46, "grad_norm": 2.1898976437413404, "learning_rate": 4.7598780181424666e-07, "loss": 0.0228, "step": 4843 }, { "epoch": 3.46, "grad_norm": 4.016259077763735, "learning_rate": 4.7475784063910404e-07, "loss": 0.0166, "step": 4844 }, { "epoch": 3.46, "grad_norm": 1.9221511010470675, "learning_rate": 4.7352939143147927e-07, "loss": 0.0231, "step": 4845 }, { "epoch": 3.46, "grad_norm": 3.563250640079929, "learning_rate": 4.72302454601819e-07, "loss": 0.0238, "step": 4846 }, { "epoch": 3.46, "grad_norm": 7.844911546765337, "learning_rate": 4.7107703056006706e-07, "loss": 0.0254, "step": 4847 }, { "epoch": 3.46, "grad_norm": 5.403934146700979, "learning_rate": 4.6985311971565806e-07, "loss": 0.0185, "step": 4848 }, { "epoch": 3.46, "grad_norm": 5.167903062966448, "learning_rate": 4.6863072247752664e-07, "loss": 0.016, "step": 4849 }, { "epoch": 3.46, "grad_norm": 2.1577262913144835, "learning_rate": 4.67409839254096e-07, "loss": 0.0212, "step": 4850 }, { "epoch": 3.46, "grad_norm": 1.3631800246611254, "learning_rate": 4.66190470453286e-07, "loss": 0.0197, "step": 4851 }, { "epoch": 3.46, "grad_norm": 3.344043147865704, "learning_rate": 4.6497261648251134e-07, "loss": 0.02, "step": 4852 }, { "epoch": 3.46, "grad_norm": 3.0874123639808926, "learning_rate": 4.6375627774867925e-07, "loss": 0.0212, "step": 4853 }, { "epoch": 3.46, "grad_norm": 7.328165344305696, "learning_rate": 4.6254145465819134e-07, "loss": 0.0213, "step": 4854 }, { "epoch": 3.47, "grad_norm": 3.224518234244968, "learning_rate": 4.6132814761694234e-07, "loss": 0.016, "step": 4855 }, { "epoch": 3.47, "grad_norm": 4.097008960876435, "learning_rate": 4.6011635703032075e-07, "loss": 0.0252, "step": 4856 }, { "epoch": 3.47, "grad_norm": 2.807322653482841, "learning_rate": 4.589060833032083e-07, "loss": 0.0268, "step": 4857 }, { "epoch": 3.47, "grad_norm": 2.3328664724594423, "learning_rate": 4.5769732683997983e-07, "loss": 0.0319, "step": 4858 }, { "epoch": 3.47, "grad_norm": 1.7226625611795265, "learning_rate": 4.564900880445039e-07, "loss": 0.021, "step": 4859 }, { "epoch": 3.47, "grad_norm": 4.033978626071074, "learning_rate": 4.552843673201407e-07, "loss": 0.0255, "step": 4860 }, { "epoch": 3.47, "grad_norm": 2.467384163882045, "learning_rate": 4.540801650697446e-07, "loss": 0.0245, "step": 4861 }, { "epoch": 3.47, "grad_norm": 2.9446504009525656, "learning_rate": 4.528774816956616e-07, "loss": 0.0178, "step": 4862 }, { "epoch": 3.47, "grad_norm": 2.235471595072864, "learning_rate": 4.516763175997302e-07, "loss": 0.0237, "step": 4863 }, { "epoch": 3.47, "grad_norm": 3.5832326050990857, "learning_rate": 4.5047667318328215e-07, "loss": 0.0277, "step": 4864 }, { "epoch": 3.47, "grad_norm": 7.581065600283904, "learning_rate": 4.492785488471413e-07, "loss": 0.0321, "step": 4865 }, { "epoch": 3.47, "grad_norm": 2.6477077947678427, "learning_rate": 4.480819449916224e-07, "loss": 0.0253, "step": 4866 }, { "epoch": 3.47, "grad_norm": 3.2013043262055714, "learning_rate": 4.468868620165334e-07, "loss": 0.0266, "step": 4867 }, { "epoch": 3.47, "grad_norm": 7.4880088058210115, "learning_rate": 4.4569330032117496e-07, "loss": 0.033, "step": 4868 }, { "epoch": 3.48, "grad_norm": 4.464721969976531, "learning_rate": 4.445012603043347e-07, "loss": 0.0327, "step": 4869 }, { "epoch": 3.48, "grad_norm": 3.1361654089894224, "learning_rate": 4.4331074236430014e-07, "loss": 0.0221, "step": 4870 }, { "epoch": 3.48, "grad_norm": 2.418337459504903, "learning_rate": 4.421217468988409e-07, "loss": 0.0188, "step": 4871 }, { "epoch": 3.48, "grad_norm": 2.973188602787579, "learning_rate": 4.409342743052264e-07, "loss": 0.0176, "step": 4872 }, { "epoch": 3.48, "grad_norm": 4.845221165052784, "learning_rate": 4.3974832498020983e-07, "loss": 0.0243, "step": 4873 }, { "epoch": 3.48, "grad_norm": 2.687507603812843, "learning_rate": 4.385638993200425e-07, "loss": 0.0227, "step": 4874 }, { "epoch": 3.48, "grad_norm": 5.577237802892938, "learning_rate": 4.3738099772045963e-07, "loss": 0.0234, "step": 4875 }, { "epoch": 3.48, "grad_norm": 1.9561421206253133, "learning_rate": 4.3619962057669216e-07, "loss": 0.025, "step": 4876 }, { "epoch": 3.48, "grad_norm": 4.175169583976657, "learning_rate": 4.350197682834606e-07, "loss": 0.0178, "step": 4877 }, { "epoch": 3.48, "grad_norm": 5.29437999575849, "learning_rate": 4.338414412349745e-07, "loss": 0.0204, "step": 4878 }, { "epoch": 3.48, "grad_norm": 2.0279366468727096, "learning_rate": 4.3266463982493566e-07, "loss": 0.0251, "step": 4879 }, { "epoch": 3.48, "grad_norm": 3.208336780233766, "learning_rate": 4.314893644465351e-07, "loss": 0.0241, "step": 4880 }, { "epoch": 3.48, "grad_norm": 3.0834492531396256, "learning_rate": 4.303156154924537e-07, "loss": 0.0139, "step": 4881 }, { "epoch": 3.48, "grad_norm": 3.2142530911093683, "learning_rate": 4.291433933548633e-07, "loss": 0.0255, "step": 4882 }, { "epoch": 3.49, "grad_norm": 2.640834983845967, "learning_rate": 4.279726984254251e-07, "loss": 0.0262, "step": 4883 }, { "epoch": 3.49, "grad_norm": 3.651611286643107, "learning_rate": 4.268035310952906e-07, "loss": 0.0197, "step": 4884 }, { "epoch": 3.49, "grad_norm": 2.2783581009506872, "learning_rate": 4.256358917550979e-07, "loss": 0.0192, "step": 4885 }, { "epoch": 3.49, "grad_norm": 5.428102406876919, "learning_rate": 4.244697807949805e-07, "loss": 0.0196, "step": 4886 }, { "epoch": 3.49, "grad_norm": 4.979308584900138, "learning_rate": 4.2330519860455446e-07, "loss": 0.0229, "step": 4887 }, { "epoch": 3.49, "grad_norm": 4.1409739409071635, "learning_rate": 4.2214214557293133e-07, "loss": 0.0306, "step": 4888 }, { "epoch": 3.49, "grad_norm": 4.79268503474881, "learning_rate": 4.209806220887053e-07, "loss": 0.03, "step": 4889 }, { "epoch": 3.49, "grad_norm": 2.352736577614645, "learning_rate": 4.1982062853996695e-07, "loss": 0.0215, "step": 4890 }, { "epoch": 3.49, "grad_norm": 3.0169446795812007, "learning_rate": 4.1866216531428806e-07, "loss": 0.0196, "step": 4891 }, { "epoch": 3.49, "grad_norm": 4.629541748577467, "learning_rate": 4.1750523279873613e-07, "loss": 0.0195, "step": 4892 }, { "epoch": 3.49, "grad_norm": 1.271554392634293, "learning_rate": 4.1634983137986083e-07, "loss": 0.0134, "step": 4893 }, { "epoch": 3.49, "grad_norm": 2.0119672947069476, "learning_rate": 4.151959614437046e-07, "loss": 0.0247, "step": 4894 }, { "epoch": 3.49, "grad_norm": 5.712184762810063, "learning_rate": 4.1404362337579716e-07, "loss": 0.029, "step": 4895 }, { "epoch": 3.49, "grad_norm": 5.945272020870616, "learning_rate": 4.128928175611546e-07, "loss": 0.029, "step": 4896 }, { "epoch": 3.5, "grad_norm": 4.408542849416339, "learning_rate": 4.1174354438428434e-07, "loss": 0.0188, "step": 4897 }, { "epoch": 3.5, "grad_norm": 7.5322367428462025, "learning_rate": 4.105958042291791e-07, "loss": 0.0363, "step": 4898 }, { "epoch": 3.5, "grad_norm": 1.6314161372537972, "learning_rate": 4.0944959747931945e-07, "loss": 0.0179, "step": 4899 }, { "epoch": 3.5, "grad_norm": 3.334468123285942, "learning_rate": 4.0830492451767566e-07, "loss": 0.0211, "step": 4900 }, { "epoch": 3.5, "grad_norm": 2.5645975778823953, "learning_rate": 4.0716178572670405e-07, "loss": 0.0224, "step": 4901 }, { "epoch": 3.5, "grad_norm": 4.513557316836511, "learning_rate": 4.060201814883474e-07, "loss": 0.0137, "step": 4902 }, { "epoch": 3.5, "grad_norm": 3.9386081706521936, "learning_rate": 4.0488011218403844e-07, "loss": 0.0279, "step": 4903 }, { "epoch": 3.5, "grad_norm": 2.9682175362012773, "learning_rate": 4.0374157819469406e-07, "loss": 0.0221, "step": 4904 }, { "epoch": 3.5, "grad_norm": 4.816340081088018, "learning_rate": 4.0260457990072113e-07, "loss": 0.0267, "step": 4905 }, { "epoch": 3.5, "grad_norm": 1.652181073212692, "learning_rate": 4.014691176820107e-07, "loss": 0.0142, "step": 4906 }, { "epoch": 3.5, "grad_norm": 2.860067524660608, "learning_rate": 4.003351919179421e-07, "loss": 0.0188, "step": 4907 }, { "epoch": 3.5, "grad_norm": 1.120466782402688, "learning_rate": 3.9920280298738125e-07, "loss": 0.0195, "step": 4908 }, { "epoch": 3.5, "grad_norm": 3.1345045016083892, "learning_rate": 3.980719512686809e-07, "loss": 0.0175, "step": 4909 }, { "epoch": 3.5, "grad_norm": 3.7019204611349212, "learning_rate": 3.969426371396773e-07, "loss": 0.0179, "step": 4910 }, { "epoch": 3.51, "grad_norm": 2.976700351913577, "learning_rate": 3.9581486097769905e-07, "loss": 0.0188, "step": 4911 }, { "epoch": 3.51, "grad_norm": 2.486198283586228, "learning_rate": 3.946886231595526e-07, "loss": 0.0215, "step": 4912 }, { "epoch": 3.51, "grad_norm": 2.020627642013814, "learning_rate": 3.935639240615396e-07, "loss": 0.0218, "step": 4913 }, { "epoch": 3.51, "grad_norm": 4.969087632628006, "learning_rate": 3.924407640594391e-07, "loss": 0.0237, "step": 4914 }, { "epoch": 3.51, "grad_norm": 2.887660959477525, "learning_rate": 3.913191435285224e-07, "loss": 0.0196, "step": 4915 }, { "epoch": 3.51, "grad_norm": 1.867078635117958, "learning_rate": 3.9019906284354145e-07, "loss": 0.0118, "step": 4916 }, { "epoch": 3.51, "grad_norm": 1.9465176519470353, "learning_rate": 3.8908052237873863e-07, "loss": 0.0207, "step": 4917 }, { "epoch": 3.51, "grad_norm": 9.8498836097476, "learning_rate": 3.879635225078371e-07, "loss": 0.0282, "step": 4918 }, { "epoch": 3.51, "grad_norm": 2.9397169943458836, "learning_rate": 3.868480636040484e-07, "loss": 0.0266, "step": 4919 }, { "epoch": 3.51, "grad_norm": 4.960618492091106, "learning_rate": 3.857341460400665e-07, "loss": 0.0177, "step": 4920 }, { "epoch": 3.51, "grad_norm": 4.712244580795838, "learning_rate": 3.846217701880739e-07, "loss": 0.0237, "step": 4921 }, { "epoch": 3.51, "grad_norm": 1.6971508399719888, "learning_rate": 3.835109364197348e-07, "loss": 0.018, "step": 4922 }, { "epoch": 3.51, "grad_norm": 1.7472712527944807, "learning_rate": 3.8240164510620017e-07, "loss": 0.0211, "step": 4923 }, { "epoch": 3.51, "grad_norm": 3.5480848152651516, "learning_rate": 3.81293896618104e-07, "loss": 0.0257, "step": 4924 }, { "epoch": 3.52, "grad_norm": 3.256572425168427, "learning_rate": 3.8018769132556644e-07, "loss": 0.0149, "step": 4925 }, { "epoch": 3.52, "grad_norm": 2.4597451706002413, "learning_rate": 3.790830295981912e-07, "loss": 0.0182, "step": 4926 }, { "epoch": 3.52, "grad_norm": 3.5212984506844682, "learning_rate": 3.7797991180506643e-07, "loss": 0.019, "step": 4927 }, { "epoch": 3.52, "grad_norm": 3.4875509288086013, "learning_rate": 3.768783383147623e-07, "loss": 0.0197, "step": 4928 }, { "epoch": 3.52, "grad_norm": 3.163210180212613, "learning_rate": 3.757783094953382e-07, "loss": 0.0292, "step": 4929 }, { "epoch": 3.52, "grad_norm": 2.5198870342212296, "learning_rate": 3.746798257143314e-07, "loss": 0.029, "step": 4930 }, { "epoch": 3.52, "grad_norm": 3.5847902090339847, "learning_rate": 3.735828873387681e-07, "loss": 0.0183, "step": 4931 }, { "epoch": 3.52, "grad_norm": 3.338194522323422, "learning_rate": 3.724874947351531e-07, "loss": 0.0196, "step": 4932 }, { "epoch": 3.52, "grad_norm": 1.9680640652690908, "learning_rate": 3.7139364826948077e-07, "loss": 0.0177, "step": 4933 }, { "epoch": 3.52, "grad_norm": 1.7834906752501898, "learning_rate": 3.7030134830722207e-07, "loss": 0.0166, "step": 4934 }, { "epoch": 3.52, "grad_norm": 2.341415129554604, "learning_rate": 3.692105952133379e-07, "loss": 0.0249, "step": 4935 }, { "epoch": 3.52, "grad_norm": 4.6407785758751166, "learning_rate": 3.681213893522667e-07, "loss": 0.0352, "step": 4936 }, { "epoch": 3.52, "grad_norm": 4.540938320445459, "learning_rate": 3.670337310879335e-07, "loss": 0.0237, "step": 4937 }, { "epoch": 3.52, "grad_norm": 6.387135784803516, "learning_rate": 3.6594762078374536e-07, "loss": 0.0277, "step": 4938 }, { "epoch": 3.53, "grad_norm": 3.665487214795183, "learning_rate": 3.6486305880259085e-07, "loss": 0.0257, "step": 4939 }, { "epoch": 3.53, "grad_norm": 4.212800882402541, "learning_rate": 3.6378004550684355e-07, "loss": 0.0213, "step": 4940 }, { "epoch": 3.53, "grad_norm": 3.2600196073500043, "learning_rate": 3.626985812583572e-07, "loss": 0.0193, "step": 4941 }, { "epoch": 3.53, "grad_norm": 2.0264755974293966, "learning_rate": 3.6161866641847007e-07, "loss": 0.0271, "step": 4942 }, { "epoch": 3.53, "grad_norm": 3.4707348585734024, "learning_rate": 3.6054030134800243e-07, "loss": 0.0162, "step": 4943 }, { "epoch": 3.53, "grad_norm": 2.0961417452701587, "learning_rate": 3.594634864072527e-07, "loss": 0.0232, "step": 4944 }, { "epoch": 3.53, "grad_norm": 4.063463726526555, "learning_rate": 3.583882219560092e-07, "loss": 0.032, "step": 4945 }, { "epoch": 3.53, "grad_norm": 2.098516311167729, "learning_rate": 3.57314508353534e-07, "loss": 0.0345, "step": 4946 }, { "epoch": 3.53, "grad_norm": 3.9001687600424275, "learning_rate": 3.5624234595857787e-07, "loss": 0.0237, "step": 4947 }, { "epoch": 3.53, "grad_norm": 1.3522189099210975, "learning_rate": 3.551717351293676e-07, "loss": 0.0137, "step": 4948 }, { "epoch": 3.53, "grad_norm": 5.448938058753399, "learning_rate": 3.541026762236166e-07, "loss": 0.0335, "step": 4949 }, { "epoch": 3.53, "grad_norm": 3.39809976716535, "learning_rate": 3.5303516959851405e-07, "loss": 0.0267, "step": 4950 }, { "epoch": 3.53, "grad_norm": 2.0973599945236923, "learning_rate": 3.519692156107379e-07, "loss": 0.02, "step": 4951 }, { "epoch": 3.53, "grad_norm": 4.928976500976884, "learning_rate": 3.509048146164401e-07, "loss": 0.0232, "step": 4952 }, { "epoch": 3.54, "grad_norm": 4.341400781737448, "learning_rate": 3.4984196697125827e-07, "loss": 0.021, "step": 4953 }, { "epoch": 3.54, "grad_norm": 1.5588549990820906, "learning_rate": 3.4878067303030836e-07, "loss": 0.0135, "step": 4954 }, { "epoch": 3.54, "grad_norm": 5.866237503465065, "learning_rate": 3.4772093314818957e-07, "loss": 0.0292, "step": 4955 }, { "epoch": 3.54, "grad_norm": 2.889730819504992, "learning_rate": 3.4666274767897967e-07, "loss": 0.0207, "step": 4956 }, { "epoch": 3.54, "grad_norm": 6.8785021783137665, "learning_rate": 3.456061169762392e-07, "loss": 0.0226, "step": 4957 }, { "epoch": 3.54, "grad_norm": 2.4293314184044568, "learning_rate": 3.44551041393007e-07, "loss": 0.0219, "step": 4958 }, { "epoch": 3.54, "grad_norm": 2.5630792149658586, "learning_rate": 3.434975212818048e-07, "loss": 0.0239, "step": 4959 }, { "epoch": 3.54, "grad_norm": 1.6349579915872139, "learning_rate": 3.424455569946317e-07, "loss": 0.0134, "step": 4960 }, { "epoch": 3.54, "grad_norm": 6.31085497252182, "learning_rate": 3.4139514888296975e-07, "loss": 0.0312, "step": 4961 }, { "epoch": 3.54, "grad_norm": 5.08757040363162, "learning_rate": 3.403462972977789e-07, "loss": 0.0244, "step": 4962 }, { "epoch": 3.54, "grad_norm": 4.263621591200577, "learning_rate": 3.392990025895004e-07, "loss": 0.0243, "step": 4963 }, { "epoch": 3.54, "grad_norm": 2.1468229060472144, "learning_rate": 3.3825326510805556e-07, "loss": 0.0219, "step": 4964 }, { "epoch": 3.54, "grad_norm": 3.37958648447092, "learning_rate": 3.372090852028437e-07, "loss": 0.0281, "step": 4965 }, { "epoch": 3.54, "grad_norm": 1.7362602943935712, "learning_rate": 3.361664632227446e-07, "loss": 0.0188, "step": 4966 }, { "epoch": 3.55, "grad_norm": 2.274637522937686, "learning_rate": 3.3512539951611856e-07, "loss": 0.0234, "step": 4967 }, { "epoch": 3.55, "grad_norm": 5.0068330161997805, "learning_rate": 3.3408589443080395e-07, "loss": 0.0206, "step": 4968 }, { "epoch": 3.55, "grad_norm": 2.6474551601173517, "learning_rate": 3.3304794831411804e-07, "loss": 0.0234, "step": 4969 }, { "epoch": 3.55, "grad_norm": 2.496187749477123, "learning_rate": 3.3201156151285994e-07, "loss": 0.0258, "step": 4970 }, { "epoch": 3.55, "grad_norm": 4.010290134183546, "learning_rate": 3.309767343733028e-07, "loss": 0.0176, "step": 4971 }, { "epoch": 3.55, "grad_norm": 1.5675425735480626, "learning_rate": 3.299434672412044e-07, "loss": 0.0194, "step": 4972 }, { "epoch": 3.55, "grad_norm": 2.539100465424783, "learning_rate": 3.2891176046179583e-07, "loss": 0.019, "step": 4973 }, { "epoch": 3.55, "grad_norm": 2.0917188211506823, "learning_rate": 3.278816143797919e-07, "loss": 0.0194, "step": 4974 }, { "epoch": 3.55, "grad_norm": 4.350767461798503, "learning_rate": 3.2685302933938177e-07, "loss": 0.0386, "step": 4975 }, { "epoch": 3.55, "grad_norm": 3.8928169385095948, "learning_rate": 3.2582600568423715e-07, "loss": 0.0244, "step": 4976 }, { "epoch": 3.55, "grad_norm": 5.787909162323301, "learning_rate": 3.2480054375750305e-07, "loss": 0.0193, "step": 4977 }, { "epoch": 3.55, "grad_norm": 2.4001819336533567, "learning_rate": 3.237766439018064e-07, "loss": 0.02, "step": 4978 }, { "epoch": 3.55, "grad_norm": 1.8709562877855663, "learning_rate": 3.227543064592514e-07, "loss": 0.0191, "step": 4979 }, { "epoch": 3.55, "grad_norm": 1.9065701916885103, "learning_rate": 3.2173353177142044e-07, "loss": 0.0203, "step": 4980 }, { "epoch": 3.56, "grad_norm": 2.5120744516072513, "learning_rate": 3.207143201793722e-07, "loss": 0.0187, "step": 4981 }, { "epoch": 3.56, "grad_norm": 1.8222263981036755, "learning_rate": 3.1969667202364496e-07, "loss": 0.0223, "step": 4982 }, { "epoch": 3.56, "grad_norm": 3.4179200252243804, "learning_rate": 3.1868058764425337e-07, "loss": 0.0249, "step": 4983 }, { "epoch": 3.56, "grad_norm": 5.7498072907102635, "learning_rate": 3.1766606738069084e-07, "loss": 0.0275, "step": 4984 }, { "epoch": 3.56, "grad_norm": 5.041392321955332, "learning_rate": 3.166531115719268e-07, "loss": 0.021, "step": 4985 }, { "epoch": 3.56, "grad_norm": 1.822728818625865, "learning_rate": 3.1564172055640994e-07, "loss": 0.0185, "step": 4986 }, { "epoch": 3.56, "grad_norm": 2.7917425319945153, "learning_rate": 3.1463189467206166e-07, "loss": 0.023, "step": 4987 }, { "epoch": 3.56, "grad_norm": 4.366628592986311, "learning_rate": 3.1362363425628763e-07, "loss": 0.0171, "step": 4988 }, { "epoch": 3.56, "grad_norm": 3.3852483744219595, "learning_rate": 3.1261693964596275e-07, "loss": 0.0244, "step": 4989 }, { "epoch": 3.56, "grad_norm": 3.7515129198680555, "learning_rate": 3.116118111774452e-07, "loss": 0.0148, "step": 4990 }, { "epoch": 3.56, "grad_norm": 1.9817429833762845, "learning_rate": 3.106082491865647e-07, "loss": 0.0157, "step": 4991 }, { "epoch": 3.56, "grad_norm": 2.414884784646765, "learning_rate": 3.0960625400863253e-07, "loss": 0.0179, "step": 4992 }, { "epoch": 3.56, "grad_norm": 5.250163439928367, "learning_rate": 3.0860582597843137e-07, "loss": 0.0252, "step": 4993 }, { "epoch": 3.56, "grad_norm": 4.698750314158885, "learning_rate": 3.0760696543022496e-07, "loss": 0.0187, "step": 4994 }, { "epoch": 3.57, "grad_norm": 12.932454054533897, "learning_rate": 3.066096726977502e-07, "loss": 0.0597, "step": 4995 }, { "epoch": 3.57, "grad_norm": 6.094441081177688, "learning_rate": 3.056139481142206e-07, "loss": 0.0201, "step": 4996 }, { "epoch": 3.57, "grad_norm": 7.143468273686904, "learning_rate": 3.0461979201232674e-07, "loss": 0.0393, "step": 4997 }, { "epoch": 3.57, "grad_norm": 1.6228527593437256, "learning_rate": 3.0362720472423503e-07, "loss": 0.018, "step": 4998 }, { "epoch": 3.57, "grad_norm": 8.117921391666709, "learning_rate": 3.026361865815869e-07, "loss": 0.036, "step": 4999 }, { "epoch": 3.57, "grad_norm": 4.951489289600751, "learning_rate": 3.016467379154997e-07, "loss": 0.0274, "step": 5000 }, { "epoch": 3.57, "eval_avg_AUC": 0.8360170574106709, "eval_avg_Accuracy": 0.743824602122016, "eval_avg_Accuracy-right": 0.8943524194600235, "eval_avg_Accuracy-wrong": 0.4813509210825563, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.7115880873199548, "eval_last_AUC": 0.8253922573221322, "eval_last_Accuracy": 0.7799651856763926, "eval_last_Accuracy-right": 0.8460284335463676, "eval_last_Accuracy-wrong": 0.6647714350693654, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.706694551574502, "eval_max_AUC": 0.7861393054105478, "eval_max_Accuracy": 0.647256299734748, "eval_max_Accuracy-right": 0.988065736272336, "eval_max_Accuracy-wrong": 0.05299067546054128, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.6590420344459055, "eval_min_AUC": 0.8438178165751404, "eval_min_Accuracy": 0.7740384615384616, "eval_min_Accuracy-right": 0.7930089996087126, "eval_min_Accuracy-wrong": 0.7409597452808733, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.7097302187281208, "eval_prod_AUC": 0.8398013852657344, "eval_prod_Accuracy": 0.7416279840848806, "eval_prod_Accuracy-right": 0.6696882744228512, "eval_prod_Accuracy-wrong": 0.8670684557652946, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.7074705074291826, "eval_runtime": 246.8796, "eval_samples_per_second": 97.732, "eval_steps_per_second": 3.054, "eval_sum_AUC": 0.7124423224936764, "eval_sum_Accuracy": 0.6394645225464191, "eval_sum_Accuracy-right": 0.9969349158732229, "eval_sum_Accuracy-wrong": 0.016147373209006142, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.6847078223460834, "step": 5000 }, { "epoch": 3.57, "grad_norm": 3.654880951851883, "learning_rate": 3.0065885905656733e-07, "loss": 0.0203, "step": 5001 }, { "epoch": 3.57, "grad_norm": 5.243962114831206, "learning_rate": 2.99672550334858e-07, "loss": 0.0173, "step": 5002 }, { "epoch": 3.57, "grad_norm": 1.4092832840135632, "learning_rate": 2.986878120799158e-07, "loss": 0.0208, "step": 5003 }, { "epoch": 3.57, "grad_norm": 1.554011978745569, "learning_rate": 2.977046446207604e-07, "loss": 0.0176, "step": 5004 }, { "epoch": 3.57, "grad_norm": 4.669169960672686, "learning_rate": 2.967230482858863e-07, "loss": 0.0252, "step": 5005 }, { "epoch": 3.57, "grad_norm": 6.6094996745112695, "learning_rate": 2.957430234032627e-07, "loss": 0.0215, "step": 5006 }, { "epoch": 3.57, "grad_norm": 11.110930559363911, "learning_rate": 2.947645703003338e-07, "loss": 0.0281, "step": 5007 }, { "epoch": 3.57, "grad_norm": 5.9334704383066175, "learning_rate": 2.937876893040209e-07, "loss": 0.0171, "step": 5008 }, { "epoch": 3.58, "grad_norm": 4.423469270196388, "learning_rate": 2.9281238074071463e-07, "loss": 0.0315, "step": 5009 }, { "epoch": 3.58, "grad_norm": 1.592156374591865, "learning_rate": 2.9183864493628756e-07, "loss": 0.0227, "step": 5010 }, { "epoch": 3.58, "grad_norm": 4.119354327917944, "learning_rate": 2.908664822160806e-07, "loss": 0.0254, "step": 5011 }, { "epoch": 3.58, "grad_norm": 4.884003650845199, "learning_rate": 2.898958929049117e-07, "loss": 0.0186, "step": 5012 }, { "epoch": 3.58, "grad_norm": 1.961721413748107, "learning_rate": 2.889268773270731e-07, "loss": 0.0222, "step": 5013 }, { "epoch": 3.58, "grad_norm": 2.7963462297389414, "learning_rate": 2.879594358063303e-07, "loss": 0.0233, "step": 5014 }, { "epoch": 3.58, "grad_norm": 2.468003197724322, "learning_rate": 2.869935686659248e-07, "loss": 0.0169, "step": 5015 }, { "epoch": 3.58, "grad_norm": 8.136562695225999, "learning_rate": 2.8602927622856935e-07, "loss": 0.0215, "step": 5016 }, { "epoch": 3.58, "grad_norm": 5.485663261521494, "learning_rate": 2.8506655881645305e-07, "loss": 0.0159, "step": 5017 }, { "epoch": 3.58, "grad_norm": 2.0505850995875723, "learning_rate": 2.841054167512369e-07, "loss": 0.0294, "step": 5018 }, { "epoch": 3.58, "grad_norm": 7.481500531403321, "learning_rate": 2.8314585035405683e-07, "loss": 0.0284, "step": 5019 }, { "epoch": 3.58, "grad_norm": 2.9777680960797746, "learning_rate": 2.8218785994552136e-07, "loss": 0.028, "step": 5020 }, { "epoch": 3.58, "grad_norm": 1.8732685907666995, "learning_rate": 2.8123144584571326e-07, "loss": 0.0196, "step": 5021 }, { "epoch": 3.58, "grad_norm": 5.086031806559671, "learning_rate": 2.8027660837418813e-07, "loss": 0.0253, "step": 5022 }, { "epoch": 3.59, "grad_norm": 6.7694477711519685, "learning_rate": 2.793233478499752e-07, "loss": 0.0291, "step": 5023 }, { "epoch": 3.59, "grad_norm": 3.4396884785592605, "learning_rate": 2.7837166459157625e-07, "loss": 0.0202, "step": 5024 }, { "epoch": 3.59, "grad_norm": 5.189513945664995, "learning_rate": 2.77421558916966e-07, "loss": 0.0231, "step": 5025 }, { "epoch": 3.59, "grad_norm": 2.550855581854903, "learning_rate": 2.764730311435931e-07, "loss": 0.014, "step": 5026 }, { "epoch": 3.59, "grad_norm": 4.1416552070916355, "learning_rate": 2.755260815883781e-07, "loss": 0.0367, "step": 5027 }, { "epoch": 3.59, "grad_norm": 2.011663446639588, "learning_rate": 2.745807105677145e-07, "loss": 0.0251, "step": 5028 }, { "epoch": 3.59, "grad_norm": 7.231248835410802, "learning_rate": 2.736369183974685e-07, "loss": 0.02, "step": 5029 }, { "epoch": 3.59, "grad_norm": 2.63783442077024, "learning_rate": 2.726947053929768e-07, "loss": 0.0271, "step": 5030 }, { "epoch": 3.59, "grad_norm": 3.454747329266388, "learning_rate": 2.7175407186905367e-07, "loss": 0.017, "step": 5031 }, { "epoch": 3.59, "grad_norm": 5.611368469459556, "learning_rate": 2.708150181399788e-07, "loss": 0.0202, "step": 5032 }, { "epoch": 3.59, "grad_norm": 2.2270437823391966, "learning_rate": 2.698775445195101e-07, "loss": 0.0245, "step": 5033 }, { "epoch": 3.59, "grad_norm": 2.6227149241256744, "learning_rate": 2.689416513208726e-07, "loss": 0.0145, "step": 5034 }, { "epoch": 3.59, "grad_norm": 3.930124224787002, "learning_rate": 2.6800733885676833e-07, "loss": 0.0425, "step": 5035 }, { "epoch": 3.59, "grad_norm": 3.6236506163310427, "learning_rate": 2.6707460743936653e-07, "loss": 0.0212, "step": 5036 }, { "epoch": 3.6, "grad_norm": 7.174749354653931, "learning_rate": 2.6614345738031014e-07, "loss": 0.0288, "step": 5037 }, { "epoch": 3.6, "grad_norm": 3.9949991370218254, "learning_rate": 2.6521388899071467e-07, "loss": 0.0237, "step": 5038 }, { "epoch": 3.6, "grad_norm": 1.4321127479377347, "learning_rate": 2.642859025811656e-07, "loss": 0.017, "step": 5039 }, { "epoch": 3.6, "grad_norm": 4.085364605840262, "learning_rate": 2.633594984617199e-07, "loss": 0.0229, "step": 5040 }, { "epoch": 3.6, "grad_norm": 4.510319037477354, "learning_rate": 2.624346769419078e-07, "loss": 0.0276, "step": 5041 }, { "epoch": 3.6, "grad_norm": 2.265080203783568, "learning_rate": 2.6151143833072824e-07, "loss": 0.0195, "step": 5042 }, { "epoch": 3.6, "grad_norm": 2.381010118389464, "learning_rate": 2.605897829366527e-07, "loss": 0.0164, "step": 5043 }, { "epoch": 3.6, "grad_norm": 2.1997765716885223, "learning_rate": 2.596697110676233e-07, "loss": 0.0253, "step": 5044 }, { "epoch": 3.6, "grad_norm": 2.80303572995716, "learning_rate": 2.5875122303105403e-07, "loss": 0.0303, "step": 5045 }, { "epoch": 3.6, "grad_norm": 2.042229652025857, "learning_rate": 2.5783431913382673e-07, "loss": 0.0197, "step": 5046 }, { "epoch": 3.6, "grad_norm": 3.4939362138330483, "learning_rate": 2.5691899968229904e-07, "loss": 0.0174, "step": 5047 }, { "epoch": 3.6, "grad_norm": 1.4919612882154958, "learning_rate": 2.560052649822925e-07, "loss": 0.0164, "step": 5048 }, { "epoch": 3.6, "grad_norm": 2.9015314648170483, "learning_rate": 2.5509311533910674e-07, "loss": 0.0202, "step": 5049 }, { "epoch": 3.6, "grad_norm": 2.6111830304001855, "learning_rate": 2.5418255105750465e-07, "loss": 0.0182, "step": 5050 }, { "epoch": 3.61, "grad_norm": 3.25436373070333, "learning_rate": 2.532735724417251e-07, "loss": 0.0296, "step": 5051 }, { "epoch": 3.61, "grad_norm": 2.0050448829571654, "learning_rate": 2.52366179795473e-07, "loss": 0.0205, "step": 5052 }, { "epoch": 3.61, "grad_norm": 5.903582016575689, "learning_rate": 2.5146037342192673e-07, "loss": 0.0338, "step": 5053 }, { "epoch": 3.61, "grad_norm": 3.84120253557257, "learning_rate": 2.505561536237311e-07, "loss": 0.0205, "step": 5054 }, { "epoch": 3.61, "grad_norm": 3.1135264434051866, "learning_rate": 2.496535207030043e-07, "loss": 0.0112, "step": 5055 }, { "epoch": 3.61, "grad_norm": 1.5988271065115593, "learning_rate": 2.4875247496133234e-07, "loss": 0.0239, "step": 5056 }, { "epoch": 3.61, "grad_norm": 2.5038282984009146, "learning_rate": 2.4785301669977116e-07, "loss": 0.0233, "step": 5057 }, { "epoch": 3.61, "grad_norm": 3.2800353162346583, "learning_rate": 2.469551462188463e-07, "loss": 0.0207, "step": 5058 }, { "epoch": 3.61, "grad_norm": 2.5254055880544684, "learning_rate": 2.460588638185535e-07, "loss": 0.0244, "step": 5059 }, { "epoch": 3.61, "grad_norm": 2.135851694029928, "learning_rate": 2.45164169798357e-07, "loss": 0.0271, "step": 5060 }, { "epoch": 3.61, "grad_norm": 5.307448036798728, "learning_rate": 2.4427106445719053e-07, "loss": 0.0141, "step": 5061 }, { "epoch": 3.61, "grad_norm": 3.9909825038257694, "learning_rate": 2.4337954809345807e-07, "loss": 0.0143, "step": 5062 }, { "epoch": 3.61, "grad_norm": 5.963517100815372, "learning_rate": 2.4248962100503095e-07, "loss": 0.0257, "step": 5063 }, { "epoch": 3.61, "grad_norm": 3.314407673144177, "learning_rate": 2.416012834892506e-07, "loss": 0.0271, "step": 5064 }, { "epoch": 3.62, "grad_norm": 3.1641223902931577, "learning_rate": 2.4071453584292693e-07, "loss": 0.0187, "step": 5065 }, { "epoch": 3.62, "grad_norm": 2.1701515875255235, "learning_rate": 2.3982937836233954e-07, "loss": 0.0261, "step": 5066 }, { "epoch": 3.62, "grad_norm": 3.6782775860952124, "learning_rate": 2.389458113432347e-07, "loss": 0.0218, "step": 5067 }, { "epoch": 3.62, "grad_norm": 3.2993387317839593, "learning_rate": 2.380638350808301e-07, "loss": 0.0245, "step": 5068 }, { "epoch": 3.62, "grad_norm": 1.5026356633345086, "learning_rate": 2.371834498698089e-07, "loss": 0.0189, "step": 5069 }, { "epoch": 3.62, "grad_norm": 3.729960127563879, "learning_rate": 2.363046560043264e-07, "loss": 0.0256, "step": 5070 }, { "epoch": 3.62, "grad_norm": 3.429117472534288, "learning_rate": 2.3542745377800046e-07, "loss": 0.0197, "step": 5071 }, { "epoch": 3.62, "grad_norm": 2.2548825921747713, "learning_rate": 2.3455184348392446e-07, "loss": 0.014, "step": 5072 }, { "epoch": 3.62, "grad_norm": 5.749312446063484, "learning_rate": 2.3367782541465268e-07, "loss": 0.0181, "step": 5073 }, { "epoch": 3.62, "grad_norm": 4.623364041936797, "learning_rate": 2.3280539986221317e-07, "loss": 0.0201, "step": 5074 }, { "epoch": 3.62, "grad_norm": 2.473930184609748, "learning_rate": 2.3193456711809837e-07, "loss": 0.025, "step": 5075 }, { "epoch": 3.62, "grad_norm": 3.597217833878548, "learning_rate": 2.3106532747327104e-07, "loss": 0.0147, "step": 5076 }, { "epoch": 3.62, "grad_norm": 2.8255040242530125, "learning_rate": 2.3019768121815777e-07, "loss": 0.0132, "step": 5077 }, { "epoch": 3.62, "grad_norm": 2.3758633766254955, "learning_rate": 2.2933162864265836e-07, "loss": 0.015, "step": 5078 }, { "epoch": 3.63, "grad_norm": 4.64748860266476, "learning_rate": 2.2846717003613462e-07, "loss": 0.0245, "step": 5079 }, { "epoch": 3.63, "grad_norm": 3.860781440775316, "learning_rate": 2.2760430568741943e-07, "loss": 0.021, "step": 5080 }, { "epoch": 3.63, "grad_norm": 2.042023533674793, "learning_rate": 2.2674303588481162e-07, "loss": 0.0194, "step": 5081 }, { "epoch": 3.63, "grad_norm": 1.7607599316113298, "learning_rate": 2.258833609160771e-07, "loss": 0.021, "step": 5082 }, { "epoch": 3.63, "grad_norm": 4.441214972475646, "learning_rate": 2.2502528106845e-07, "loss": 0.0197, "step": 5083 }, { "epoch": 3.63, "grad_norm": 4.545608016353152, "learning_rate": 2.241687966286299e-07, "loss": 0.0257, "step": 5084 }, { "epoch": 3.63, "grad_norm": 1.6754502037667738, "learning_rate": 2.233139078827845e-07, "loss": 0.0189, "step": 5085 }, { "epoch": 3.63, "grad_norm": 1.0124530351530259, "learning_rate": 2.2246061511654816e-07, "loss": 0.013, "step": 5086 }, { "epoch": 3.63, "grad_norm": 1.3560170545749979, "learning_rate": 2.2160891861502165e-07, "loss": 0.0203, "step": 5087 }, { "epoch": 3.63, "grad_norm": 3.614265986080177, "learning_rate": 2.2075881866277348e-07, "loss": 0.0145, "step": 5088 }, { "epoch": 3.63, "grad_norm": 1.8367535225634963, "learning_rate": 2.199103155438359e-07, "loss": 0.0218, "step": 5089 }, { "epoch": 3.63, "grad_norm": 2.112045433383258, "learning_rate": 2.1906340954171212e-07, "loss": 0.0324, "step": 5090 }, { "epoch": 3.63, "grad_norm": 2.89243344264716, "learning_rate": 2.1821810093936636e-07, "loss": 0.0231, "step": 5091 }, { "epoch": 3.63, "grad_norm": 2.5341079427529376, "learning_rate": 2.1737439001923488e-07, "loss": 0.0171, "step": 5092 }, { "epoch": 3.64, "grad_norm": 12.959994215228567, "learning_rate": 2.1653227706321388e-07, "loss": 0.055, "step": 5093 }, { "epoch": 3.64, "grad_norm": 2.8515096352994016, "learning_rate": 2.156917623526722e-07, "loss": 0.0296, "step": 5094 }, { "epoch": 3.64, "grad_norm": 3.541543190563925, "learning_rate": 2.1485284616843904e-07, "loss": 0.0184, "step": 5095 }, { "epoch": 3.64, "grad_norm": 3.7930353728065533, "learning_rate": 2.140155287908141e-07, "loss": 0.0248, "step": 5096 }, { "epoch": 3.64, "grad_norm": 3.481361108056802, "learning_rate": 2.131798104995586e-07, "loss": 0.0275, "step": 5097 }, { "epoch": 3.64, "grad_norm": 6.486064903589152, "learning_rate": 2.123456915739025e-07, "loss": 0.0203, "step": 5098 }, { "epoch": 3.64, "grad_norm": 3.4582584714064426, "learning_rate": 2.115131722925401e-07, "loss": 0.0294, "step": 5099 }, { "epoch": 3.64, "grad_norm": 1.3294381184143647, "learning_rate": 2.1068225293363166e-07, "loss": 0.023, "step": 5100 }, { "epoch": 3.64, "grad_norm": 2.312292253078772, "learning_rate": 2.0985293377480342e-07, "loss": 0.0206, "step": 5101 }, { "epoch": 3.64, "grad_norm": 3.2619168492415973, "learning_rate": 2.0902521509314543e-07, "loss": 0.0173, "step": 5102 }, { "epoch": 3.64, "grad_norm": 3.81893103016964, "learning_rate": 2.0819909716521426e-07, "loss": 0.0235, "step": 5103 }, { "epoch": 3.64, "grad_norm": 2.3150610495838744, "learning_rate": 2.0737458026703182e-07, "loss": 0.0168, "step": 5104 }, { "epoch": 3.64, "grad_norm": 4.629763159973222, "learning_rate": 2.0655166467408283e-07, "loss": 0.0433, "step": 5105 }, { "epoch": 3.64, "grad_norm": 5.583557149885652, "learning_rate": 2.057303506613212e-07, "loss": 0.0238, "step": 5106 }, { "epoch": 3.65, "grad_norm": 3.0910682729647805, "learning_rate": 2.049106385031602e-07, "loss": 0.0221, "step": 5107 }, { "epoch": 3.65, "grad_norm": 4.763394187633355, "learning_rate": 2.0409252847348404e-07, "loss": 0.022, "step": 5108 }, { "epoch": 3.65, "grad_norm": 1.864205678587097, "learning_rate": 2.032760208456358e-07, "loss": 0.0174, "step": 5109 }, { "epoch": 3.65, "grad_norm": 5.291056945540742, "learning_rate": 2.0246111589242835e-07, "loss": 0.0265, "step": 5110 }, { "epoch": 3.65, "grad_norm": 1.921185917208169, "learning_rate": 2.0164781388613386e-07, "loss": 0.0202, "step": 5111 }, { "epoch": 3.65, "grad_norm": 7.495862039740579, "learning_rate": 2.0083611509849443e-07, "loss": 0.0235, "step": 5112 }, { "epoch": 3.65, "grad_norm": 1.3689942096771623, "learning_rate": 2.0002601980071145e-07, "loss": 0.0245, "step": 5113 }, { "epoch": 3.65, "grad_norm": 4.366828293872557, "learning_rate": 1.9921752826345397e-07, "loss": 0.0237, "step": 5114 }, { "epoch": 3.65, "grad_norm": 1.9223888609093516, "learning_rate": 1.9841064075685367e-07, "loss": 0.0229, "step": 5115 }, { "epoch": 3.65, "grad_norm": 1.2881973687247583, "learning_rate": 1.9760535755050715e-07, "loss": 0.0125, "step": 5116 }, { "epoch": 3.65, "grad_norm": 2.6967817037967867, "learning_rate": 1.9680167891347356e-07, "loss": 0.016, "step": 5117 }, { "epoch": 3.65, "grad_norm": 3.439604203890696, "learning_rate": 1.9599960511427761e-07, "loss": 0.0323, "step": 5118 }, { "epoch": 3.65, "grad_norm": 1.5596126183501706, "learning_rate": 1.9519913642090715e-07, "loss": 0.0179, "step": 5119 }, { "epoch": 3.65, "grad_norm": 2.9106021839335043, "learning_rate": 1.9440027310081323e-07, "loss": 0.0309, "step": 5120 }, { "epoch": 3.66, "grad_norm": 4.111574300372222, "learning_rate": 1.9360301542091065e-07, "loss": 0.0214, "step": 5121 }, { "epoch": 3.66, "grad_norm": 3.430689782203096, "learning_rate": 1.9280736364757912e-07, "loss": 0.0167, "step": 5122 }, { "epoch": 3.66, "grad_norm": 2.7827580688892324, "learning_rate": 1.9201331804665934e-07, "loss": 0.0135, "step": 5123 }, { "epoch": 3.66, "grad_norm": 1.750804909510771, "learning_rate": 1.9122087888345798e-07, "loss": 0.0179, "step": 5124 }, { "epoch": 3.66, "grad_norm": 3.102904067177262, "learning_rate": 1.9043004642274266e-07, "loss": 0.0181, "step": 5125 }, { "epoch": 3.66, "grad_norm": 1.4913641627966314, "learning_rate": 1.896408209287459e-07, "loss": 0.0184, "step": 5126 }, { "epoch": 3.66, "grad_norm": 1.2369411725617043, "learning_rate": 1.888532026651624e-07, "loss": 0.0197, "step": 5127 }, { "epoch": 3.66, "grad_norm": 3.1080100506244572, "learning_rate": 1.880671918951499e-07, "loss": 0.016, "step": 5128 }, { "epoch": 3.66, "grad_norm": 3.60563971987711, "learning_rate": 1.8728278888132944e-07, "loss": 0.0275, "step": 5129 }, { "epoch": 3.66, "grad_norm": 2.013124995048135, "learning_rate": 1.864999938857842e-07, "loss": 0.0272, "step": 5130 }, { "epoch": 3.66, "grad_norm": 3.569223411896495, "learning_rate": 1.8571880717006218e-07, "loss": 0.013, "step": 5131 }, { "epoch": 3.66, "grad_norm": 2.717475672756776, "learning_rate": 1.8493922899516902e-07, "loss": 0.0257, "step": 5132 }, { "epoch": 3.66, "grad_norm": 3.2812818109675796, "learning_rate": 1.8416125962157971e-07, "loss": 0.0374, "step": 5133 }, { "epoch": 3.66, "grad_norm": 6.769023739742816, "learning_rate": 1.8338489930922632e-07, "loss": 0.0213, "step": 5134 }, { "epoch": 3.67, "grad_norm": 5.479076444685034, "learning_rate": 1.8261014831750633e-07, "loss": 0.0232, "step": 5135 }, { "epoch": 3.67, "grad_norm": 3.1296442552646995, "learning_rate": 1.8183700690527717e-07, "loss": 0.0199, "step": 5136 }, { "epoch": 3.67, "grad_norm": 2.5439097741169814, "learning_rate": 1.810654753308616e-07, "loss": 0.0301, "step": 5137 }, { "epoch": 3.67, "grad_norm": 1.373119819713574, "learning_rate": 1.8029555385204067e-07, "loss": 0.0166, "step": 5138 }, { "epoch": 3.67, "grad_norm": 2.135208763283895, "learning_rate": 1.795272427260608e-07, "loss": 0.0168, "step": 5139 }, { "epoch": 3.67, "grad_norm": 2.092430470504436, "learning_rate": 1.7876054220962835e-07, "loss": 0.0161, "step": 5140 }, { "epoch": 3.67, "grad_norm": 2.8211956696294695, "learning_rate": 1.779954525589128e-07, "loss": 0.0222, "step": 5141 }, { "epoch": 3.67, "grad_norm": 1.7382385750874663, "learning_rate": 1.7723197402954419e-07, "loss": 0.0143, "step": 5142 }, { "epoch": 3.67, "grad_norm": 3.548820464246968, "learning_rate": 1.7647010687661558e-07, "loss": 0.0152, "step": 5143 }, { "epoch": 3.67, "grad_norm": 1.9248454205191103, "learning_rate": 1.757098513546801e-07, "loss": 0.0228, "step": 5144 }, { "epoch": 3.67, "grad_norm": 5.791954592906748, "learning_rate": 1.74951207717754e-07, "loss": 0.0251, "step": 5145 }, { "epoch": 3.67, "grad_norm": 3.654095689991998, "learning_rate": 1.7419417621931388e-07, "loss": 0.021, "step": 5146 }, { "epoch": 3.67, "grad_norm": 3.678062581923196, "learning_rate": 1.7343875711229864e-07, "loss": 0.0274, "step": 5147 }, { "epoch": 3.67, "grad_norm": 2.4221492030173604, "learning_rate": 1.7268495064910574e-07, "loss": 0.0193, "step": 5148 }, { "epoch": 3.68, "grad_norm": 2.665876583116908, "learning_rate": 1.719327570815993e-07, "loss": 0.0284, "step": 5149 }, { "epoch": 3.68, "grad_norm": 4.023954591813659, "learning_rate": 1.711821766610977e-07, "loss": 0.0265, "step": 5150 }, { "epoch": 3.68, "grad_norm": 2.666367457998923, "learning_rate": 1.704332096383865e-07, "loss": 0.0207, "step": 5151 }, { "epoch": 3.68, "grad_norm": 4.801752871559416, "learning_rate": 1.696858562637077e-07, "loss": 0.014, "step": 5152 }, { "epoch": 3.68, "grad_norm": 2.7854192881731974, "learning_rate": 1.689401167867677e-07, "loss": 0.0282, "step": 5153 }, { "epoch": 3.68, "grad_norm": 2.964894722808635, "learning_rate": 1.6819599145672993e-07, "loss": 0.028, "step": 5154 }, { "epoch": 3.68, "grad_norm": 3.7778238501101375, "learning_rate": 1.674534805222222e-07, "loss": 0.0338, "step": 5155 }, { "epoch": 3.68, "grad_norm": 2.277752038612983, "learning_rate": 1.667125842313305e-07, "loss": 0.0182, "step": 5156 }, { "epoch": 3.68, "grad_norm": 1.2367806497125986, "learning_rate": 1.6597330283160184e-07, "loss": 0.0113, "step": 5157 }, { "epoch": 3.68, "grad_norm": 8.140836309513228, "learning_rate": 1.6523563657004416e-07, "loss": 0.0375, "step": 5158 }, { "epoch": 3.68, "grad_norm": 2.5710975609049305, "learning_rate": 1.644995856931253e-07, "loss": 0.02, "step": 5159 }, { "epoch": 3.68, "grad_norm": 2.969233743080048, "learning_rate": 1.6376515044677354e-07, "loss": 0.0219, "step": 5160 }, { "epoch": 3.68, "grad_norm": 2.177218001293877, "learning_rate": 1.630323310763776e-07, "loss": 0.0217, "step": 5161 }, { "epoch": 3.68, "grad_norm": 2.1841237881191446, "learning_rate": 1.6230112782678608e-07, "loss": 0.0266, "step": 5162 }, { "epoch": 3.69, "grad_norm": 2.4817841006206276, "learning_rate": 1.6157154094230744e-07, "loss": 0.0182, "step": 5163 }, { "epoch": 3.69, "grad_norm": 4.016931727249433, "learning_rate": 1.6084357066670997e-07, "loss": 0.0272, "step": 5164 }, { "epoch": 3.69, "grad_norm": 1.879190602814266, "learning_rate": 1.601172172432225e-07, "loss": 0.0209, "step": 5165 }, { "epoch": 3.69, "grad_norm": 2.778061083524743, "learning_rate": 1.5939248091453252e-07, "loss": 0.0218, "step": 5166 }, { "epoch": 3.69, "grad_norm": 5.732208780209183, "learning_rate": 1.5866936192278915e-07, "loss": 0.0325, "step": 5167 }, { "epoch": 3.69, "grad_norm": 1.4706232698161006, "learning_rate": 1.5794786050959797e-07, "loss": 0.0134, "step": 5168 }, { "epoch": 3.69, "grad_norm": 6.98594884392636, "learning_rate": 1.5722797691602842e-07, "loss": 0.0319, "step": 5169 }, { "epoch": 3.69, "grad_norm": 3.349426832853079, "learning_rate": 1.5650971138260473e-07, "loss": 0.0248, "step": 5170 }, { "epoch": 3.69, "grad_norm": 4.976579295378491, "learning_rate": 1.5579306414931493e-07, "loss": 0.0274, "step": 5171 }, { "epoch": 3.69, "grad_norm": 2.608275894549004, "learning_rate": 1.5507803545560195e-07, "loss": 0.0272, "step": 5172 }, { "epoch": 3.69, "grad_norm": 3.1503623838936474, "learning_rate": 1.543646255403719e-07, "loss": 0.0266, "step": 5173 }, { "epoch": 3.69, "grad_norm": 1.5556403344359921, "learning_rate": 1.5365283464198743e-07, "loss": 0.0189, "step": 5174 }, { "epoch": 3.69, "grad_norm": 8.45265240922781, "learning_rate": 1.529426629982711e-07, "loss": 0.0317, "step": 5175 }, { "epoch": 3.69, "grad_norm": 4.4790627118344535, "learning_rate": 1.5223411084650476e-07, "loss": 0.02, "step": 5176 }, { "epoch": 3.7, "grad_norm": 2.2062655844358336, "learning_rate": 1.5152717842342845e-07, "loss": 0.0246, "step": 5177 }, { "epoch": 3.7, "grad_norm": 2.6205269218486786, "learning_rate": 1.5082186596524218e-07, "loss": 0.0175, "step": 5178 }, { "epoch": 3.7, "grad_norm": 3.0051366479489072, "learning_rate": 1.501181737076035e-07, "loss": 0.0238, "step": 5179 }, { "epoch": 3.7, "grad_norm": 4.819673804209162, "learning_rate": 1.4941610188562884e-07, "loss": 0.0199, "step": 5180 }, { "epoch": 3.7, "grad_norm": 5.138577134268833, "learning_rate": 1.4871565073389382e-07, "loss": 0.0246, "step": 5181 }, { "epoch": 3.7, "grad_norm": 3.8138304855316023, "learning_rate": 1.4801682048643183e-07, "loss": 0.0242, "step": 5182 }, { "epoch": 3.7, "grad_norm": 1.4542929428683837, "learning_rate": 1.4731961137673555e-07, "loss": 0.016, "step": 5183 }, { "epoch": 3.7, "grad_norm": 4.104948196555313, "learning_rate": 1.466240236377553e-07, "loss": 0.0257, "step": 5184 }, { "epoch": 3.7, "grad_norm": 3.5735418575307967, "learning_rate": 1.4593005750189958e-07, "loss": 0.0232, "step": 5185 }, { "epoch": 3.7, "grad_norm": 1.8507033348331547, "learning_rate": 1.4523771320103574e-07, "loss": 0.0195, "step": 5186 }, { "epoch": 3.7, "grad_norm": 1.8413134451311026, "learning_rate": 1.4454699096648873e-07, "loss": 0.0345, "step": 5187 }, { "epoch": 3.7, "grad_norm": 3.3278075671092697, "learning_rate": 1.4385789102904168e-07, "loss": 0.0301, "step": 5188 }, { "epoch": 3.7, "grad_norm": 2.3248996332848746, "learning_rate": 1.4317041361893546e-07, "loss": 0.0295, "step": 5189 }, { "epoch": 3.7, "grad_norm": 3.4264176035418945, "learning_rate": 1.4248455896587022e-07, "loss": 0.0327, "step": 5190 }, { "epoch": 3.71, "grad_norm": 1.728019296230289, "learning_rate": 1.418003272990004e-07, "loss": 0.0189, "step": 5191 }, { "epoch": 3.71, "grad_norm": 1.489162413811022, "learning_rate": 1.4111771884694315e-07, "loss": 0.0203, "step": 5192 }, { "epoch": 3.71, "grad_norm": 1.9524657898968014, "learning_rate": 1.4043673383776825e-07, "loss": 0.0123, "step": 5193 }, { "epoch": 3.71, "grad_norm": 2.415123271352204, "learning_rate": 1.3975737249900812e-07, "loss": 0.0266, "step": 5194 }, { "epoch": 3.71, "grad_norm": 2.8692636393398847, "learning_rate": 1.3907963505764731e-07, "loss": 0.0243, "step": 5195 }, { "epoch": 3.71, "grad_norm": 2.8502413687478807, "learning_rate": 1.384035217401325e-07, "loss": 0.0139, "step": 5196 }, { "epoch": 3.71, "grad_norm": 2.862718769125452, "learning_rate": 1.3772903277236404e-07, "loss": 0.0235, "step": 5197 }, { "epoch": 3.71, "grad_norm": 2.1513904659754792, "learning_rate": 1.370561683797028e-07, "loss": 0.0271, "step": 5198 }, { "epoch": 3.71, "grad_norm": 6.519218790872213, "learning_rate": 1.363849287869645e-07, "loss": 0.0378, "step": 5199 }, { "epoch": 3.71, "grad_norm": 4.489049467119617, "learning_rate": 1.3571531421842256e-07, "loss": 0.0273, "step": 5200 }, { "epoch": 3.71, "grad_norm": 2.3577172209252852, "learning_rate": 1.3504732489780849e-07, "loss": 0.0219, "step": 5201 }, { "epoch": 3.71, "grad_norm": 5.7833943763627795, "learning_rate": 1.3438096104830879e-07, "loss": 0.0217, "step": 5202 }, { "epoch": 3.71, "grad_norm": 1.988371201831818, "learning_rate": 1.3371622289256869e-07, "loss": 0.0202, "step": 5203 }, { "epoch": 3.71, "grad_norm": 2.425503910065707, "learning_rate": 1.3305311065269e-07, "loss": 0.0198, "step": 5204 }, { "epoch": 3.72, "grad_norm": 3.090722053136764, "learning_rate": 1.323916245502299e-07, "loss": 0.0245, "step": 5205 }, { "epoch": 3.72, "grad_norm": 4.9627962283694345, "learning_rate": 1.3173176480620442e-07, "loss": 0.0244, "step": 5206 }, { "epoch": 3.72, "grad_norm": 9.206592423133095, "learning_rate": 1.3107353164108273e-07, "loss": 0.0316, "step": 5207 }, { "epoch": 3.72, "grad_norm": 2.7039312687504373, "learning_rate": 1.3041692527479556e-07, "loss": 0.022, "step": 5208 }, { "epoch": 3.72, "grad_norm": 1.3752954379070126, "learning_rate": 1.2976194592672465e-07, "loss": 0.0183, "step": 5209 }, { "epoch": 3.72, "grad_norm": 4.6190385954775905, "learning_rate": 1.2910859381571327e-07, "loss": 0.0233, "step": 5210 }, { "epoch": 3.72, "grad_norm": 3.210019534830187, "learning_rate": 1.284568691600563e-07, "loss": 0.0186, "step": 5211 }, { "epoch": 3.72, "grad_norm": 3.4280035373439843, "learning_rate": 1.2780677217750949e-07, "loss": 0.0217, "step": 5212 }, { "epoch": 3.72, "grad_norm": 2.9802093187192633, "learning_rate": 1.271583030852791e-07, "loss": 0.0247, "step": 5213 }, { "epoch": 3.72, "grad_norm": 1.385873022404817, "learning_rate": 1.2651146210003406e-07, "loss": 0.0176, "step": 5214 }, { "epoch": 3.72, "grad_norm": 5.529817438941661, "learning_rate": 1.2586624943789372e-07, "loss": 0.0199, "step": 5215 }, { "epoch": 3.72, "grad_norm": 2.3117896694300617, "learning_rate": 1.2522266531443616e-07, "loss": 0.0269, "step": 5216 }, { "epoch": 3.72, "grad_norm": 7.010813791691153, "learning_rate": 1.245807099446955e-07, "loss": 0.0401, "step": 5217 }, { "epoch": 3.72, "grad_norm": 6.309724081611123, "learning_rate": 1.239403835431602e-07, "loss": 0.0204, "step": 5218 }, { "epoch": 3.73, "grad_norm": 3.991131983188111, "learning_rate": 1.2330168632377514e-07, "loss": 0.021, "step": 5219 }, { "epoch": 3.73, "grad_norm": 3.3782189157497045, "learning_rate": 1.2266461849994138e-07, "loss": 0.0224, "step": 5220 }, { "epoch": 3.73, "grad_norm": 4.944132812084641, "learning_rate": 1.2202918028451527e-07, "loss": 0.0171, "step": 5221 }, { "epoch": 3.73, "grad_norm": 2.3020823968537423, "learning_rate": 1.2139537188980753e-07, "loss": 0.0197, "step": 5222 }, { "epoch": 3.73, "grad_norm": 2.252902650158216, "learning_rate": 1.207631935275866e-07, "loss": 0.0306, "step": 5223 }, { "epoch": 3.73, "grad_norm": 4.783919269935325, "learning_rate": 1.2013264540907455e-07, "loss": 0.0215, "step": 5224 }, { "epoch": 3.73, "grad_norm": 2.5909512837118926, "learning_rate": 1.1950372774494846e-07, "loss": 0.0176, "step": 5225 }, { "epoch": 3.73, "grad_norm": 4.136611119846169, "learning_rate": 1.1887644074534244e-07, "loss": 0.0186, "step": 5226 }, { "epoch": 3.73, "grad_norm": 2.72318296618443, "learning_rate": 1.182507846198444e-07, "loss": 0.0245, "step": 5227 }, { "epoch": 3.73, "grad_norm": 7.516987600595989, "learning_rate": 1.1762675957749769e-07, "loss": 0.0422, "step": 5228 }, { "epoch": 3.73, "grad_norm": 2.5276403915062757, "learning_rate": 1.1700436582680108e-07, "loss": 0.0191, "step": 5229 }, { "epoch": 3.73, "grad_norm": 4.752529791718225, "learning_rate": 1.1638360357570654e-07, "loss": 0.0293, "step": 5230 }, { "epoch": 3.73, "grad_norm": 2.400911193761536, "learning_rate": 1.157644730316243e-07, "loss": 0.0157, "step": 5231 }, { "epoch": 3.73, "grad_norm": 2.326937968268249, "learning_rate": 1.1514697440141498e-07, "loss": 0.0182, "step": 5232 }, { "epoch": 3.74, "grad_norm": 2.4877722821251047, "learning_rate": 1.1453110789139855e-07, "loss": 0.0208, "step": 5233 }, { "epoch": 3.74, "grad_norm": 3.9936599844023553, "learning_rate": 1.1391687370734594e-07, "loss": 0.018, "step": 5234 }, { "epoch": 3.74, "grad_norm": 3.6287267410137187, "learning_rate": 1.1330427205448579e-07, "loss": 0.0285, "step": 5235 }, { "epoch": 3.74, "grad_norm": 2.4006517456771665, "learning_rate": 1.1269330313749715e-07, "loss": 0.0214, "step": 5236 }, { "epoch": 3.74, "grad_norm": 4.142343883359804, "learning_rate": 1.1208396716051895e-07, "loss": 0.0247, "step": 5237 }, { "epoch": 3.74, "grad_norm": 2.711787066445167, "learning_rate": 1.1147626432713943e-07, "loss": 0.017, "step": 5238 }, { "epoch": 3.74, "grad_norm": 1.7231843572349768, "learning_rate": 1.1087019484040562e-07, "loss": 0.0136, "step": 5239 }, { "epoch": 3.74, "grad_norm": 6.191379324297485, "learning_rate": 1.1026575890281443e-07, "loss": 0.0314, "step": 5240 }, { "epoch": 3.74, "grad_norm": 2.822860417056047, "learning_rate": 1.0966295671632043e-07, "loss": 0.0251, "step": 5241 }, { "epoch": 3.74, "grad_norm": 2.203361342127882, "learning_rate": 1.0906178848233029e-07, "loss": 0.0172, "step": 5242 }, { "epoch": 3.74, "grad_norm": 2.8531402032960096, "learning_rate": 1.0846225440170611e-07, "loss": 0.0256, "step": 5243 }, { "epoch": 3.74, "grad_norm": 1.698947522501088, "learning_rate": 1.0786435467476264e-07, "loss": 0.0149, "step": 5244 }, { "epoch": 3.74, "grad_norm": 4.130192879839065, "learning_rate": 1.072680895012701e-07, "loss": 0.0208, "step": 5245 }, { "epoch": 3.74, "grad_norm": 4.923493442779139, "learning_rate": 1.0667345908045135e-07, "loss": 0.0206, "step": 5246 }, { "epoch": 3.75, "grad_norm": 1.6193563517025997, "learning_rate": 1.0608046361098356e-07, "loss": 0.021, "step": 5247 }, { "epoch": 3.75, "grad_norm": 5.093530894764852, "learning_rate": 1.0548910329099771e-07, "loss": 0.0214, "step": 5248 }, { "epoch": 3.75, "grad_norm": 3.1583303655468944, "learning_rate": 1.048993783180785e-07, "loss": 0.0204, "step": 5249 }, { "epoch": 3.75, "grad_norm": 4.86486432471581, "learning_rate": 1.0431128888926222e-07, "loss": 0.0346, "step": 5250 }, { "epoch": 3.75, "grad_norm": 3.9913107188739105, "learning_rate": 1.0372483520104337e-07, "loss": 0.0359, "step": 5251 }, { "epoch": 3.75, "grad_norm": 2.2381666578064294, "learning_rate": 1.0314001744936409e-07, "loss": 0.0213, "step": 5252 }, { "epoch": 3.75, "grad_norm": 4.551116852075309, "learning_rate": 1.0255683582962583e-07, "loss": 0.0211, "step": 5253 }, { "epoch": 3.75, "grad_norm": 1.057164791329309, "learning_rate": 1.0197529053667721e-07, "loss": 0.0178, "step": 5254 }, { "epoch": 3.75, "grad_norm": 3.1358364708701503, "learning_rate": 1.013953817648261e-07, "loss": 0.0205, "step": 5255 }, { "epoch": 3.75, "grad_norm": 4.521051074903339, "learning_rate": 1.008171097078292e-07, "loss": 0.0273, "step": 5256 }, { "epoch": 3.75, "grad_norm": 2.9904025566820267, "learning_rate": 1.0024047455889918e-07, "loss": 0.0304, "step": 5257 }, { "epoch": 3.75, "grad_norm": 3.778324379127207, "learning_rate": 9.966547651069913e-08, "loss": 0.0278, "step": 5258 }, { "epoch": 3.75, "grad_norm": 10.272189356491223, "learning_rate": 9.909211575534705e-08, "loss": 0.0461, "step": 5259 }, { "epoch": 3.75, "grad_norm": 2.2283715681606227, "learning_rate": 9.852039248441414e-08, "loss": 0.0171, "step": 5260 }, { "epoch": 3.76, "grad_norm": 4.722409299384118, "learning_rate": 9.79503068889226e-08, "loss": 0.0255, "step": 5261 }, { "epoch": 3.76, "grad_norm": 3.4508766889297706, "learning_rate": 9.738185915935005e-08, "loss": 0.0292, "step": 5262 }, { "epoch": 3.76, "grad_norm": 4.496268757517527, "learning_rate": 9.681504948562403e-08, "loss": 0.0217, "step": 5263 }, { "epoch": 3.76, "grad_norm": 5.404507854655809, "learning_rate": 9.624987805712749e-08, "loss": 0.0197, "step": 5264 }, { "epoch": 3.76, "grad_norm": 2.622445035917951, "learning_rate": 9.568634506269381e-08, "loss": 0.0259, "step": 5265 }, { "epoch": 3.76, "grad_norm": 4.884116982411068, "learning_rate": 9.51244506906096e-08, "loss": 0.0258, "step": 5266 }, { "epoch": 3.76, "grad_norm": 1.8627897365580712, "learning_rate": 9.45641951286158e-08, "loss": 0.0194, "step": 5267 }, { "epoch": 3.76, "grad_norm": 2.7000828431244783, "learning_rate": 9.400557856390158e-08, "loss": 0.0199, "step": 5268 }, { "epoch": 3.76, "grad_norm": 3.956492664965638, "learning_rate": 9.344860118311427e-08, "loss": 0.0234, "step": 5269 }, { "epoch": 3.76, "grad_norm": 1.425209166481016, "learning_rate": 9.289326317234726e-08, "loss": 0.0231, "step": 5270 }, { "epoch": 3.76, "grad_norm": 4.097267096486783, "learning_rate": 9.23395647171521e-08, "loss": 0.0235, "step": 5271 }, { "epoch": 3.76, "grad_norm": 2.028670905892367, "learning_rate": 9.178750600252695e-08, "loss": 0.0255, "step": 5272 }, { "epoch": 3.76, "grad_norm": 4.966736021039718, "learning_rate": 9.123708721292756e-08, "loss": 0.028, "step": 5273 }, { "epoch": 3.76, "grad_norm": 3.197737197596851, "learning_rate": 9.06883085322574e-08, "loss": 0.0216, "step": 5274 }, { "epoch": 3.77, "grad_norm": 4.080727773730197, "learning_rate": 9.014117014387424e-08, "loss": 0.0352, "step": 5275 }, { "epoch": 3.77, "grad_norm": 1.1296264280256079, "learning_rate": 8.95956722305874e-08, "loss": 0.0169, "step": 5276 }, { "epoch": 3.77, "grad_norm": 4.057116883266143, "learning_rate": 8.905181497465664e-08, "loss": 0.0221, "step": 5277 }, { "epoch": 3.77, "grad_norm": 3.9647187605506953, "learning_rate": 8.850959855779662e-08, "loss": 0.0233, "step": 5278 }, { "epoch": 3.77, "grad_norm": 2.1040774256861408, "learning_rate": 8.796902316117018e-08, "loss": 0.0213, "step": 5279 }, { "epoch": 3.77, "grad_norm": 4.65723542116145, "learning_rate": 8.743008896539451e-08, "loss": 0.0258, "step": 5280 }, { "epoch": 3.77, "grad_norm": 4.876295934301376, "learning_rate": 8.68927961505378e-08, "loss": 0.0257, "step": 5281 }, { "epoch": 3.77, "grad_norm": 2.8627960416897555, "learning_rate": 8.635714489611868e-08, "loss": 0.0314, "step": 5282 }, { "epoch": 3.77, "grad_norm": 2.273725154988319, "learning_rate": 8.582313538110898e-08, "loss": 0.0161, "step": 5283 }, { "epoch": 3.77, "grad_norm": 1.9282706264267362, "learning_rate": 8.529076778393097e-08, "loss": 0.0254, "step": 5284 }, { "epoch": 3.77, "grad_norm": 2.628970273375051, "learning_rate": 8.476004228245848e-08, "loss": 0.0231, "step": 5285 }, { "epoch": 3.77, "grad_norm": 3.0795335082550763, "learning_rate": 8.42309590540169e-08, "loss": 0.0233, "step": 5286 }, { "epoch": 3.77, "grad_norm": 2.4895174177627926, "learning_rate": 8.370351827538259e-08, "loss": 0.0258, "step": 5287 }, { "epoch": 3.77, "grad_norm": 7.5059982137655075, "learning_rate": 8.317772012278347e-08, "loss": 0.0257, "step": 5288 }, { "epoch": 3.78, "grad_norm": 2.287349453785497, "learning_rate": 8.26535647718979e-08, "loss": 0.0153, "step": 5289 }, { "epoch": 3.78, "grad_norm": 4.007946391371989, "learning_rate": 8.213105239785691e-08, "loss": 0.0142, "step": 5290 }, { "epoch": 3.78, "grad_norm": 3.325150468656642, "learning_rate": 8.161018317524139e-08, "loss": 0.0186, "step": 5291 }, { "epoch": 3.78, "grad_norm": 2.679684188837439, "learning_rate": 8.109095727808269e-08, "loss": 0.0303, "step": 5292 }, { "epoch": 3.78, "grad_norm": 3.923277021778115, "learning_rate": 8.057337487986427e-08, "loss": 0.0271, "step": 5293 }, { "epoch": 3.78, "grad_norm": 2.9481719065310883, "learning_rate": 8.005743615352057e-08, "loss": 0.022, "step": 5294 }, { "epoch": 3.78, "grad_norm": 11.41032621498422, "learning_rate": 7.954314127143481e-08, "loss": 0.0336, "step": 5295 }, { "epoch": 3.78, "grad_norm": 8.438022620378145, "learning_rate": 7.903049040544453e-08, "loss": 0.0272, "step": 5296 }, { "epoch": 3.78, "grad_norm": 1.5690249006305896, "learning_rate": 7.851948372683382e-08, "loss": 0.0239, "step": 5297 }, { "epoch": 3.78, "grad_norm": 1.8028194196876857, "learning_rate": 7.801012140634167e-08, "loss": 0.02, "step": 5298 }, { "epoch": 3.78, "grad_norm": 1.8809847137659024, "learning_rate": 7.750240361415362e-08, "loss": 0.0178, "step": 5299 }, { "epoch": 3.78, "grad_norm": 4.070256377359856, "learning_rate": 7.69963305199084e-08, "loss": 0.0211, "step": 5300 }, { "epoch": 3.78, "grad_norm": 4.871741592906539, "learning_rate": 7.64919022926941e-08, "loss": 0.0213, "step": 5301 }, { "epoch": 3.78, "grad_norm": 2.1231914716571154, "learning_rate": 7.598911910105033e-08, "loss": 0.0186, "step": 5302 }, { "epoch": 3.79, "grad_norm": 2.0372035432976916, "learning_rate": 7.548798111296552e-08, "loss": 0.0222, "step": 5303 }, { "epoch": 3.79, "grad_norm": 3.43083865516561, "learning_rate": 7.498848849588015e-08, "loss": 0.0245, "step": 5304 }, { "epoch": 3.79, "grad_norm": 4.2401607366865335, "learning_rate": 7.449064141668238e-08, "loss": 0.0262, "step": 5305 }, { "epoch": 3.79, "grad_norm": 2.638289900596031, "learning_rate": 7.399444004171364e-08, "loss": 0.0216, "step": 5306 }, { "epoch": 3.79, "grad_norm": 5.203760317859933, "learning_rate": 7.349988453676349e-08, "loss": 0.0264, "step": 5307 }, { "epoch": 3.79, "grad_norm": 1.1903840699355497, "learning_rate": 7.300697506707254e-08, "loss": 0.0174, "step": 5308 }, { "epoch": 3.79, "grad_norm": 2.386179668964028, "learning_rate": 7.251571179732963e-08, "loss": 0.0213, "step": 5309 }, { "epoch": 3.79, "grad_norm": 5.227249096322965, "learning_rate": 7.202609489167734e-08, "loss": 0.0252, "step": 5310 }, { "epoch": 3.79, "grad_norm": 1.9320796097209536, "learning_rate": 7.153812451370312e-08, "loss": 0.0148, "step": 5311 }, { "epoch": 3.79, "grad_norm": 3.182049411942898, "learning_rate": 7.10518008264488e-08, "loss": 0.0228, "step": 5312 }, { "epoch": 3.79, "grad_norm": 3.3538825883283443, "learning_rate": 7.056712399240274e-08, "loss": 0.0275, "step": 5313 }, { "epoch": 3.79, "grad_norm": 2.1138391051762473, "learning_rate": 7.008409417350648e-08, "loss": 0.0259, "step": 5314 }, { "epoch": 3.79, "grad_norm": 4.7881945597014255, "learning_rate": 6.960271153114706e-08, "loss": 0.0272, "step": 5315 }, { "epoch": 3.79, "grad_norm": 1.3152877442321909, "learning_rate": 6.912297622616526e-08, "loss": 0.0118, "step": 5316 }, { "epoch": 3.8, "grad_norm": 2.201245036810093, "learning_rate": 6.864488841884786e-08, "loss": 0.0182, "step": 5317 }, { "epoch": 3.8, "grad_norm": 1.7396078183985866, "learning_rate": 6.816844826893431e-08, "loss": 0.0211, "step": 5318 }, { "epoch": 3.8, "grad_norm": 6.429755754943171, "learning_rate": 6.769365593561117e-08, "loss": 0.0239, "step": 5319 }, { "epoch": 3.8, "grad_norm": 2.4797877328826865, "learning_rate": 6.722051157751597e-08, "loss": 0.0221, "step": 5320 }, { "epoch": 3.8, "grad_norm": 4.190112748365354, "learning_rate": 6.674901535273448e-08, "loss": 0.0164, "step": 5321 }, { "epoch": 3.8, "grad_norm": 3.4184760808635124, "learning_rate": 6.627916741880291e-08, "loss": 0.0369, "step": 5322 }, { "epoch": 3.8, "grad_norm": 2.141574310849085, "learning_rate": 6.581096793270625e-08, "loss": 0.0137, "step": 5323 }, { "epoch": 3.8, "grad_norm": 2.740988732465579, "learning_rate": 6.534441705087768e-08, "loss": 0.0214, "step": 5324 }, { "epoch": 3.8, "grad_norm": 3.057761817664433, "learning_rate": 6.487951492920141e-08, "loss": 0.0271, "step": 5325 }, { "epoch": 3.8, "grad_norm": 4.367102135231493, "learning_rate": 6.441626172300986e-08, "loss": 0.0225, "step": 5326 }, { "epoch": 3.8, "grad_norm": 5.584248486877098, "learning_rate": 6.395465758708419e-08, "loss": 0.018, "step": 5327 }, { "epoch": 3.8, "grad_norm": 5.53656132887622, "learning_rate": 6.349470267565549e-08, "loss": 0.0262, "step": 5328 }, { "epoch": 3.8, "grad_norm": 2.128469356840069, "learning_rate": 6.303639714240196e-08, "loss": 0.0127, "step": 5329 }, { "epoch": 3.8, "grad_norm": 4.588339566836977, "learning_rate": 6.257974114045385e-08, "loss": 0.0219, "step": 5330 }, { "epoch": 3.81, "grad_norm": 1.3886881130197628, "learning_rate": 6.212473482238635e-08, "loss": 0.0196, "step": 5331 }, { "epoch": 3.81, "grad_norm": 11.056473846052311, "learning_rate": 6.167137834022785e-08, "loss": 0.0398, "step": 5332 }, { "epoch": 3.81, "grad_norm": 1.7596324104793302, "learning_rate": 6.121967184545107e-08, "loss": 0.0129, "step": 5333 }, { "epoch": 3.81, "grad_norm": 1.6553378546687454, "learning_rate": 6.076961548898086e-08, "loss": 0.0172, "step": 5334 }, { "epoch": 3.81, "grad_norm": 3.0913951303287965, "learning_rate": 6.032120942118858e-08, "loss": 0.0193, "step": 5335 }, { "epoch": 3.81, "grad_norm": 3.909975477030834, "learning_rate": 5.98744537918955e-08, "loss": 0.0185, "step": 5336 }, { "epoch": 3.81, "grad_norm": 1.5964964171735954, "learning_rate": 5.9429348750371097e-08, "loss": 0.02, "step": 5337 }, { "epoch": 3.81, "grad_norm": 3.4138357917288413, "learning_rate": 5.898589444533254e-08, "loss": 0.0198, "step": 5338 }, { "epoch": 3.81, "grad_norm": 4.010010339656211, "learning_rate": 5.85440910249474e-08, "loss": 0.0468, "step": 5339 }, { "epoch": 3.81, "grad_norm": 2.884838335613086, "learning_rate": 5.810393863682873e-08, "loss": 0.0236, "step": 5340 }, { "epoch": 3.81, "grad_norm": 2.5632337910539778, "learning_rate": 5.7665437428041096e-08, "loss": 0.0254, "step": 5341 }, { "epoch": 3.81, "grad_norm": 6.461929942388153, "learning_rate": 5.722858754509564e-08, "loss": 0.0225, "step": 5342 }, { "epoch": 3.81, "grad_norm": 2.358122356416235, "learning_rate": 5.679338913395116e-08, "loss": 0.0181, "step": 5343 }, { "epoch": 3.81, "grad_norm": 7.677224673110656, "learning_rate": 5.6359842340016904e-08, "loss": 0.0262, "step": 5344 }, { "epoch": 3.82, "grad_norm": 2.168935169589492, "learning_rate": 5.5927947308147545e-08, "loss": 0.0177, "step": 5345 }, { "epoch": 3.82, "grad_norm": 3.466736981204486, "learning_rate": 5.549770418264766e-08, "loss": 0.0177, "step": 5346 }, { "epoch": 3.82, "grad_norm": 3.4138530734232755, "learning_rate": 5.5069113107270034e-08, "loss": 0.0244, "step": 5347 }, { "epoch": 3.82, "grad_norm": 2.065047272470145, "learning_rate": 5.464217422521456e-08, "loss": 0.0221, "step": 5348 }, { "epoch": 3.82, "grad_norm": 2.8946037809002765, "learning_rate": 5.421688767912936e-08, "loss": 0.0186, "step": 5349 }, { "epoch": 3.82, "grad_norm": 4.394964370403741, "learning_rate": 5.3793253611110206e-08, "loss": 0.0197, "step": 5350 }, { "epoch": 3.82, "grad_norm": 2.608996497277368, "learning_rate": 5.3371272162702214e-08, "loss": 0.0177, "step": 5351 }, { "epoch": 3.82, "grad_norm": 1.8025304112213554, "learning_rate": 5.295094347489593e-08, "loss": 0.0207, "step": 5352 }, { "epoch": 3.82, "grad_norm": 2.5613629695229787, "learning_rate": 5.253226768813235e-08, "loss": 0.0164, "step": 5353 }, { "epoch": 3.82, "grad_norm": 4.54003532307264, "learning_rate": 5.211524494229736e-08, "loss": 0.023, "step": 5354 }, { "epoch": 3.82, "grad_norm": 1.904588615161526, "learning_rate": 5.169987537672727e-08, "loss": 0.0179, "step": 5355 }, { "epoch": 3.82, "grad_norm": 1.9393310412596467, "learning_rate": 5.128615913020385e-08, "loss": 0.0222, "step": 5356 }, { "epoch": 3.82, "grad_norm": 3.359468134735287, "learning_rate": 5.087409634095819e-08, "loss": 0.0238, "step": 5357 }, { "epoch": 3.82, "grad_norm": 1.1449746630362878, "learning_rate": 5.046368714666683e-08, "loss": 0.0117, "step": 5358 }, { "epoch": 3.83, "grad_norm": 4.222518213830071, "learning_rate": 5.0054931684457296e-08, "loss": 0.0383, "step": 5359 }, { "epoch": 3.83, "grad_norm": 3.743647719670252, "learning_rate": 4.964783009090035e-08, "loss": 0.0293, "step": 5360 }, { "epoch": 3.83, "grad_norm": 3.833891477000365, "learning_rate": 4.9242382502017185e-08, "loss": 0.0229, "step": 5361 }, { "epoch": 3.83, "grad_norm": 4.168894379967887, "learning_rate": 4.883858905327499e-08, "loss": 0.019, "step": 5362 }, { "epoch": 3.83, "grad_norm": 1.913254240680341, "learning_rate": 4.843644987958862e-08, "loss": 0.0188, "step": 5363 }, { "epoch": 3.83, "grad_norm": 2.110753327008818, "learning_rate": 4.8035965115320604e-08, "loss": 0.0197, "step": 5364 }, { "epoch": 3.83, "grad_norm": 8.56246472776577, "learning_rate": 4.763713489428001e-08, "loss": 0.0337, "step": 5365 }, { "epoch": 3.83, "grad_norm": 3.119553272465091, "learning_rate": 4.723995934972414e-08, "loss": 0.0235, "step": 5366 }, { "epoch": 3.83, "grad_norm": 3.8853086683100098, "learning_rate": 4.684443861435572e-08, "loss": 0.0244, "step": 5367 }, { "epoch": 3.83, "grad_norm": 4.4695595709236855, "learning_rate": 4.6450572820325727e-08, "loss": 0.0252, "step": 5368 }, { "epoch": 3.83, "grad_norm": 2.7874075602983672, "learning_rate": 4.605836209923331e-08, "loss": 0.0192, "step": 5369 }, { "epoch": 3.83, "grad_norm": 2.203531606811125, "learning_rate": 4.566780658212144e-08, "loss": 0.0206, "step": 5370 }, { "epoch": 3.83, "grad_norm": 3.160499631682135, "learning_rate": 4.5278906399483516e-08, "loss": 0.0169, "step": 5371 }, { "epoch": 3.83, "grad_norm": 4.6089108849374805, "learning_rate": 4.489166168125725e-08, "loss": 0.0271, "step": 5372 }, { "epoch": 3.84, "grad_norm": 4.866432991585118, "learning_rate": 4.4506072556829704e-08, "loss": 0.0189, "step": 5373 }, { "epoch": 3.84, "grad_norm": 3.197986454347981, "learning_rate": 4.4122139155031717e-08, "loss": 0.0219, "step": 5374 }, { "epoch": 3.84, "grad_norm": 4.3140025616477145, "learning_rate": 4.373986160414345e-08, "loss": 0.04, "step": 5375 }, { "epoch": 3.84, "grad_norm": 0.9589388375969385, "learning_rate": 4.335924003189107e-08, "loss": 0.0173, "step": 5376 }, { "epoch": 3.84, "grad_norm": 2.9736420672090085, "learning_rate": 4.298027456544674e-08, "loss": 0.0215, "step": 5377 }, { "epoch": 3.84, "grad_norm": 3.0076067234022306, "learning_rate": 4.260296533143027e-08, "loss": 0.0311, "step": 5378 }, { "epoch": 3.84, "grad_norm": 2.9163352666589164, "learning_rate": 4.22273124559075e-08, "loss": 0.0179, "step": 5379 }, { "epoch": 3.84, "grad_norm": 5.350361549448945, "learning_rate": 4.185331606439136e-08, "loss": 0.0296, "step": 5380 }, { "epoch": 3.84, "grad_norm": 3.5143808177875666, "learning_rate": 4.148097628184078e-08, "loss": 0.0268, "step": 5381 }, { "epoch": 3.84, "grad_norm": 1.6488078338220415, "learning_rate": 4.111029323266125e-08, "loss": 0.0212, "step": 5382 }, { "epoch": 3.84, "grad_norm": 2.9180480292411337, "learning_rate": 4.07412670407048e-08, "loss": 0.0167, "step": 5383 }, { "epoch": 3.84, "grad_norm": 3.426743137102039, "learning_rate": 4.037389782927059e-08, "loss": 0.0261, "step": 5384 }, { "epoch": 3.84, "grad_norm": 2.626920868414007, "learning_rate": 4.000818572110265e-08, "loss": 0.0187, "step": 5385 }, { "epoch": 3.84, "grad_norm": 4.358340512942245, "learning_rate": 3.964413083839269e-08, "loss": 0.024, "step": 5386 }, { "epoch": 3.85, "grad_norm": 2.5771043456304734, "learning_rate": 3.9281733302778404e-08, "loss": 0.0135, "step": 5387 }, { "epoch": 3.85, "grad_norm": 3.7108362612772945, "learning_rate": 3.892099323534293e-08, "loss": 0.0251, "step": 5388 }, { "epoch": 3.85, "grad_norm": 3.2641134315951765, "learning_rate": 3.856191075661708e-08, "loss": 0.0207, "step": 5389 }, { "epoch": 3.85, "grad_norm": 4.7954663584761, "learning_rate": 3.8204485986576e-08, "loss": 0.0233, "step": 5390 }, { "epoch": 3.85, "grad_norm": 3.0039195291483587, "learning_rate": 3.784871904464249e-08, "loss": 0.0214, "step": 5391 }, { "epoch": 3.85, "grad_norm": 4.781319594897229, "learning_rate": 3.7494610049684796e-08, "loss": 0.0202, "step": 5392 }, { "epoch": 3.85, "grad_norm": 2.2181342509203716, "learning_rate": 3.714215912001773e-08, "loss": 0.0237, "step": 5393 }, { "epoch": 3.85, "grad_norm": 1.5307656638034723, "learning_rate": 3.6791366373400974e-08, "loss": 0.016, "step": 5394 }, { "epoch": 3.85, "grad_norm": 2.9980957129316477, "learning_rate": 3.6442231927041324e-08, "loss": 0.0209, "step": 5395 }, { "epoch": 3.85, "grad_norm": 2.614325470570068, "learning_rate": 3.609475589759104e-08, "loss": 0.0195, "step": 5396 }, { "epoch": 3.85, "grad_norm": 1.2342786417833946, "learning_rate": 3.574893840114835e-08, "loss": 0.0195, "step": 5397 }, { "epoch": 3.85, "grad_norm": 3.13513680493031, "learning_rate": 3.5404779553257494e-08, "loss": 0.0198, "step": 5398 }, { "epoch": 3.85, "grad_norm": 15.078842855499811, "learning_rate": 3.506227946890761e-08, "loss": 0.042, "step": 5399 }, { "epoch": 3.85, "grad_norm": 4.968249293917528, "learning_rate": 3.4721438262534935e-08, "loss": 0.0286, "step": 5400 }, { "epoch": 3.86, "grad_norm": 2.300715233738611, "learning_rate": 3.438225604802115e-08, "loss": 0.0247, "step": 5401 }, { "epoch": 3.86, "grad_norm": 3.2215687096264367, "learning_rate": 3.404473293869226e-08, "loss": 0.0225, "step": 5402 }, { "epoch": 3.86, "grad_norm": 1.3402505655817432, "learning_rate": 3.370886904732196e-08, "loss": 0.0137, "step": 5403 }, { "epoch": 3.86, "grad_norm": 1.0509369564653657, "learning_rate": 3.33746644861288e-08, "loss": 0.0152, "step": 5404 }, { "epoch": 3.86, "grad_norm": 1.5200283370650576, "learning_rate": 3.30421193667757e-08, "loss": 0.0172, "step": 5405 }, { "epoch": 3.86, "grad_norm": 6.978782111965761, "learning_rate": 3.271123380037322e-08, "loss": 0.0288, "step": 5406 }, { "epoch": 3.86, "grad_norm": 4.070184829965542, "learning_rate": 3.2382007897475695e-08, "loss": 0.0289, "step": 5407 }, { "epoch": 3.86, "grad_norm": 4.409008744759003, "learning_rate": 3.2054441768083477e-08, "loss": 0.0241, "step": 5408 }, { "epoch": 3.86, "grad_norm": 2.2078652879223797, "learning_rate": 3.1728535521643454e-08, "loss": 0.0198, "step": 5409 }, { "epoch": 3.86, "grad_norm": 2.187261175257129, "learning_rate": 3.1404289267046305e-08, "loss": 0.0185, "step": 5410 }, { "epoch": 3.86, "grad_norm": 1.8104547975037084, "learning_rate": 3.1081703112628146e-08, "loss": 0.0207, "step": 5411 }, { "epoch": 3.86, "grad_norm": 2.108463142618941, "learning_rate": 3.0760777166172206e-08, "loss": 0.0204, "step": 5412 }, { "epoch": 3.86, "grad_norm": 4.41782265356589, "learning_rate": 3.0441511534904934e-08, "loss": 0.0266, "step": 5413 }, { "epoch": 3.86, "grad_norm": 2.1563829583102483, "learning_rate": 3.012390632549933e-08, "loss": 0.0298, "step": 5414 }, { "epoch": 3.87, "grad_norm": 7.78806885480321, "learning_rate": 2.9807961644073294e-08, "loss": 0.026, "step": 5415 }, { "epoch": 3.87, "grad_norm": 3.120989169250971, "learning_rate": 2.9493677596189595e-08, "loss": 0.0237, "step": 5416 }, { "epoch": 3.87, "grad_norm": 4.358905498727036, "learning_rate": 2.9181054286855916e-08, "loss": 0.0214, "step": 5417 }, { "epoch": 3.87, "grad_norm": 4.6477516202899, "learning_rate": 2.887009182052647e-08, "loss": 0.0307, "step": 5418 }, { "epoch": 3.87, "grad_norm": 6.1082992817988115, "learning_rate": 2.8560790301098705e-08, "loss": 0.0244, "step": 5419 }, { "epoch": 3.87, "grad_norm": 1.275610383748102, "learning_rate": 2.825314983191718e-08, "loss": 0.0116, "step": 5420 }, { "epoch": 3.87, "grad_norm": 8.547679089993268, "learning_rate": 2.7947170515768562e-08, "loss": 0.0333, "step": 5421 }, { "epoch": 3.87, "grad_norm": 2.957300537052848, "learning_rate": 2.7642852454887736e-08, "loss": 0.0221, "step": 5422 }, { "epoch": 3.87, "grad_norm": 2.033116201364948, "learning_rate": 2.7340195750952813e-08, "loss": 0.0148, "step": 5423 }, { "epoch": 3.87, "grad_norm": 5.438523966512901, "learning_rate": 2.703920050508624e-08, "loss": 0.0304, "step": 5424 }, { "epoch": 3.87, "grad_norm": 5.690418670617803, "learning_rate": 2.673986681785645e-08, "loss": 0.0202, "step": 5425 }, { "epoch": 3.87, "grad_norm": 6.5518385685801555, "learning_rate": 2.6442194789277342e-08, "loss": 0.0322, "step": 5426 }, { "epoch": 3.87, "grad_norm": 1.2401661063067848, "learning_rate": 2.6146184518804908e-08, "loss": 0.014, "step": 5427 }, { "epoch": 3.87, "grad_norm": 3.209009421866879, "learning_rate": 2.5851836105343363e-08, "loss": 0.0168, "step": 5428 }, { "epoch": 3.88, "grad_norm": 2.2479236401385623, "learning_rate": 2.555914964723849e-08, "loss": 0.0192, "step": 5429 }, { "epoch": 3.88, "grad_norm": 3.664875088278122, "learning_rate": 2.5268125242283724e-08, "loss": 0.0293, "step": 5430 }, { "epoch": 3.88, "grad_norm": 4.403075650273632, "learning_rate": 2.4978762987714067e-08, "loss": 0.0368, "step": 5431 }, { "epoch": 3.88, "grad_norm": 6.9310695166002025, "learning_rate": 2.469106298021273e-08, "loss": 0.0316, "step": 5432 }, { "epoch": 3.88, "grad_norm": 2.423140496516659, "learning_rate": 2.4405025315904495e-08, "loss": 0.024, "step": 5433 }, { "epoch": 3.88, "grad_norm": 4.022916173686561, "learning_rate": 2.412065009036013e-08, "loss": 0.0212, "step": 5434 }, { "epoch": 3.88, "grad_norm": 3.837813048775894, "learning_rate": 2.3837937398594747e-08, "loss": 0.0298, "step": 5435 }, { "epoch": 3.88, "grad_norm": 2.1748941776797377, "learning_rate": 2.3556887335067223e-08, "loss": 0.0262, "step": 5436 }, { "epoch": 3.88, "grad_norm": 6.561398321137626, "learning_rate": 2.3277499993682452e-08, "loss": 0.036, "step": 5437 }, { "epoch": 3.88, "grad_norm": 4.357304631831137, "learning_rate": 2.2999775467788532e-08, "loss": 0.0223, "step": 5438 }, { "epoch": 3.88, "grad_norm": 3.486845904406258, "learning_rate": 2.272371385017902e-08, "loss": 0.0272, "step": 5439 }, { "epoch": 3.88, "grad_norm": 4.662953700822703, "learning_rate": 2.244931523309013e-08, "loss": 0.0278, "step": 5440 }, { "epoch": 3.88, "grad_norm": 2.054534031716196, "learning_rate": 2.2176579708204636e-08, "loss": 0.0244, "step": 5441 }, { "epoch": 3.88, "grad_norm": 3.9539255224153704, "learning_rate": 2.190550736664798e-08, "loss": 0.0359, "step": 5442 }, { "epoch": 3.89, "grad_norm": 5.221943937345425, "learning_rate": 2.163609829898994e-08, "loss": 0.0316, "step": 5443 }, { "epoch": 3.89, "grad_norm": 2.6919999185285426, "learning_rate": 2.136835259524628e-08, "loss": 0.0281, "step": 5444 }, { "epoch": 3.89, "grad_norm": 3.252062666573824, "learning_rate": 2.1102270344874887e-08, "loss": 0.0428, "step": 5445 }, { "epoch": 3.89, "grad_norm": 2.2687059601810353, "learning_rate": 2.083785163677965e-08, "loss": 0.0194, "step": 5446 }, { "epoch": 3.89, "grad_norm": 3.2991800662002784, "learning_rate": 2.0575096559306564e-08, "loss": 0.0274, "step": 5447 }, { "epoch": 3.89, "grad_norm": 3.6298689814703224, "learning_rate": 2.0314005200248178e-08, "loss": 0.0195, "step": 5448 }, { "epoch": 3.89, "grad_norm": 3.218939239822081, "learning_rate": 2.0054577646839156e-08, "loss": 0.0142, "step": 5449 }, { "epoch": 3.89, "grad_norm": 2.868578007944478, "learning_rate": 1.979681398575961e-08, "loss": 0.0172, "step": 5450 }, { "epoch": 3.89, "grad_norm": 4.0210996974880056, "learning_rate": 1.954071430313287e-08, "loss": 0.0219, "step": 5451 }, { "epoch": 3.89, "grad_norm": 2.3082378385282873, "learning_rate": 1.9286278684526593e-08, "loss": 0.0243, "step": 5452 }, { "epoch": 3.89, "grad_norm": 3.362288093740942, "learning_rate": 1.9033507214952784e-08, "loss": 0.017, "step": 5453 }, { "epoch": 3.89, "grad_norm": 4.605772008765052, "learning_rate": 1.878239997886666e-08, "loss": 0.023, "step": 5454 }, { "epoch": 3.89, "grad_norm": 5.6829958574551505, "learning_rate": 1.853295706016778e-08, "loss": 0.0274, "step": 5455 }, { "epoch": 3.89, "grad_norm": 2.357378781581144, "learning_rate": 1.8285178542200022e-08, "loss": 0.0222, "step": 5456 }, { "epoch": 3.9, "grad_norm": 2.7214489224447185, "learning_rate": 1.8039064507750503e-08, "loss": 0.0131, "step": 5457 }, { "epoch": 3.9, "grad_norm": 2.8725088439600706, "learning_rate": 1.7794615039050665e-08, "loss": 0.0144, "step": 5458 }, { "epoch": 3.9, "grad_norm": 4.16619545847278, "learning_rate": 1.7551830217775734e-08, "loss": 0.0164, "step": 5459 }, { "epoch": 3.9, "grad_norm": 2.0022748959715058, "learning_rate": 1.7310710125044707e-08, "loss": 0.0226, "step": 5460 }, { "epoch": 3.9, "grad_norm": 2.3611659620346592, "learning_rate": 1.7071254841419805e-08, "loss": 0.0202, "step": 5461 }, { "epoch": 3.9, "grad_norm": 3.864100176432943, "learning_rate": 1.6833464446907588e-08, "loss": 0.0208, "step": 5462 }, { "epoch": 3.9, "grad_norm": 2.0424463543342584, "learning_rate": 1.6597339020958393e-08, "loss": 0.0192, "step": 5463 }, { "epoch": 3.9, "grad_norm": 2.3858731170790857, "learning_rate": 1.6362878642466328e-08, "loss": 0.0205, "step": 5464 }, { "epoch": 3.9, "grad_norm": 2.110909016473477, "learning_rate": 1.6130083389768735e-08, "loss": 0.0182, "step": 5465 }, { "epoch": 3.9, "grad_norm": 3.7439060845341365, "learning_rate": 1.5898953340646728e-08, "loss": 0.029, "step": 5466 }, { "epoch": 3.9, "grad_norm": 4.647959641308654, "learning_rate": 1.5669488572325197e-08, "loss": 0.0206, "step": 5467 }, { "epoch": 3.9, "grad_norm": 2.5611865882124523, "learning_rate": 1.5441689161472816e-08, "loss": 0.0283, "step": 5468 }, { "epoch": 3.9, "grad_norm": 3.131129293544498, "learning_rate": 1.521555518420148e-08, "loss": 0.0255, "step": 5469 }, { "epoch": 3.9, "grad_norm": 2.906211714511769, "learning_rate": 1.499108671606686e-08, "loss": 0.0222, "step": 5470 }, { "epoch": 3.91, "grad_norm": 1.3921197046262057, "learning_rate": 1.4768283832067853e-08, "loss": 0.0206, "step": 5471 }, { "epoch": 3.91, "grad_norm": 3.859282967632713, "learning_rate": 1.4547146606646578e-08, "loss": 0.0236, "step": 5472 }, { "epoch": 3.91, "grad_norm": 5.407318739007233, "learning_rate": 1.4327675113690598e-08, "loss": 0.0219, "step": 5473 }, { "epoch": 3.91, "grad_norm": 4.811797317082379, "learning_rate": 1.4109869426527368e-08, "loss": 0.0205, "step": 5474 }, { "epoch": 3.91, "grad_norm": 2.252930648915975, "learning_rate": 1.3893729617931451e-08, "loss": 0.0241, "step": 5475 }, { "epoch": 3.91, "grad_norm": 1.4816278252113144, "learning_rate": 1.3679255760118415e-08, "loss": 0.0227, "step": 5476 }, { "epoch": 3.91, "grad_norm": 6.986746169818905, "learning_rate": 1.3466447924748716e-08, "loss": 0.0249, "step": 5477 }, { "epoch": 3.91, "grad_norm": 2.1997925480493876, "learning_rate": 1.3255306182924365e-08, "loss": 0.0179, "step": 5478 }, { "epoch": 3.91, "grad_norm": 4.030533592119653, "learning_rate": 1.3045830605192266e-08, "loss": 0.0458, "step": 5479 }, { "epoch": 3.91, "grad_norm": 3.088240734799316, "learning_rate": 1.2838021261541988e-08, "loss": 0.019, "step": 5480 }, { "epoch": 3.91, "grad_norm": 3.0526266172301084, "learning_rate": 1.263187822140688e-08, "loss": 0.0228, "step": 5481 }, { "epoch": 3.91, "grad_norm": 2.888270689172121, "learning_rate": 1.2427401553662955e-08, "loss": 0.0192, "step": 5482 }, { "epoch": 3.91, "grad_norm": 2.4369666827080487, "learning_rate": 1.2224591326628898e-08, "loss": 0.015, "step": 5483 }, { "epoch": 3.91, "grad_norm": 2.2435601031926296, "learning_rate": 1.2023447608068283e-08, "loss": 0.0208, "step": 5484 }, { "epoch": 3.92, "grad_norm": 5.144528764350548, "learning_rate": 1.182397046518735e-08, "loss": 0.0224, "step": 5485 }, { "epoch": 3.92, "grad_norm": 2.2211905772296, "learning_rate": 1.1626159964633899e-08, "loss": 0.0147, "step": 5486 }, { "epoch": 3.92, "grad_norm": 2.2902390229673197, "learning_rate": 1.1430016172501169e-08, "loss": 0.0149, "step": 5487 }, { "epoch": 3.92, "grad_norm": 4.665203417246369, "learning_rate": 1.1235539154323405e-08, "loss": 0.0213, "step": 5488 }, { "epoch": 3.92, "grad_norm": 6.05200035827132, "learning_rate": 1.1042728975079741e-08, "loss": 0.0254, "step": 5489 }, { "epoch": 3.92, "grad_norm": 6.912668006593784, "learning_rate": 1.0851585699191425e-08, "loss": 0.0291, "step": 5490 }, { "epoch": 3.92, "grad_norm": 2.8237061465451463, "learning_rate": 1.0662109390522924e-08, "loss": 0.0197, "step": 5491 }, { "epoch": 3.92, "grad_norm": 5.594205018270895, "learning_rate": 1.047430011238193e-08, "loss": 0.0262, "step": 5492 }, { "epoch": 3.92, "grad_norm": 2.5381484863031885, "learning_rate": 1.028815792751936e-08, "loss": 0.0226, "step": 5493 }, { "epoch": 3.92, "grad_norm": 3.127384381139182, "learning_rate": 1.0103682898128241e-08, "loss": 0.0184, "step": 5494 }, { "epoch": 3.92, "grad_norm": 3.2830582442340375, "learning_rate": 9.920875085845383e-09, "loss": 0.0213, "step": 5495 }, { "epoch": 3.92, "grad_norm": 2.7865544882843336, "learning_rate": 9.739734551749703e-09, "loss": 0.0185, "step": 5496 }, { "epoch": 3.92, "grad_norm": 6.719662127295909, "learning_rate": 9.560261356364452e-09, "loss": 0.0387, "step": 5497 }, { "epoch": 3.92, "grad_norm": 3.0048549069559707, "learning_rate": 9.382455559654446e-09, "loss": 0.026, "step": 5498 }, { "epoch": 3.93, "grad_norm": 2.031697797024053, "learning_rate": 9.206317221027717e-09, "loss": 0.023, "step": 5499 }, { "epoch": 3.93, "grad_norm": 3.903823119328278, "learning_rate": 9.031846399336075e-09, "loss": 0.0327, "step": 5500 }, { "epoch": 3.93, "eval_avg_AUC": 0.8371702589944845, "eval_avg_Accuracy": 0.7440318302387268, "eval_avg_Accuracy-right": 0.8942872049041346, "eval_avg_Accuracy-wrong": 0.48203320445758474, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.7103294303171083, "eval_last_AUC": 0.8319023333640545, "eval_last_Accuracy": 0.7803796419098143, "eval_last_Accuracy-right": 0.8463545063258119, "eval_last_Accuracy-wrong": 0.6653400045485558, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.7067685458768546, "eval_max_AUC": 0.7867669380385187, "eval_max_Accuracy": 0.6470490716180372, "eval_max_Accuracy-right": 0.9887178818312248, "eval_max_Accuracy-wrong": 0.05128496702297021, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.6553866744173231, "eval_min_AUC": 0.8460472659735542, "eval_min_Accuracy": 0.7755305039787799, "eval_min_Accuracy-right": 0.7935307160558237, "eval_min_Accuracy-wrong": 0.7441437343643393, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.711460753125292, "eval_prod_AUC": 0.8437307587767714, "eval_prod_Accuracy": 0.7427470159151194, "eval_prod_Accuracy-right": 0.6701447763140733, "eval_prod_Accuracy-wrong": 0.869342733682056, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.7105555644172705, "eval_runtime": 247.3548, "eval_samples_per_second": 97.544, "eval_steps_per_second": 3.048, "eval_sum_AUC": 0.7143932084431329, "eval_sum_Accuracy": 0.6395474137931034, "eval_sum_Accuracy-right": 0.996869701317334, "eval_sum_Accuracy-wrong": 0.016488514896520354, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.6877217306105692, "step": 5500 }, { "epoch": 3.93, "grad_norm": 3.3371950491903637, "learning_rate": 8.859043152872892e-09, "loss": 0.0194, "step": 5501 }, { "epoch": 3.93, "grad_norm": 1.9530134838227862, "learning_rate": 8.687907539375318e-09, "loss": 0.0204, "step": 5502 }, { "epoch": 3.93, "grad_norm": 1.9341448030769988, "learning_rate": 8.518439616022057e-09, "loss": 0.0219, "step": 5503 }, { "epoch": 3.93, "grad_norm": 2.087075389938105, "learning_rate": 8.350639439436703e-09, "loss": 0.0242, "step": 5504 }, { "epoch": 3.93, "grad_norm": 3.430222763743477, "learning_rate": 8.184507065683855e-09, "loss": 0.0174, "step": 5505 }, { "epoch": 3.93, "grad_norm": 5.6908272831133395, "learning_rate": 8.020042550271889e-09, "loss": 0.02, "step": 5506 }, { "epoch": 3.93, "grad_norm": 2.351915576486446, "learning_rate": 7.857245948150183e-09, "loss": 0.0186, "step": 5507 }, { "epoch": 3.93, "grad_norm": 4.173408896062741, "learning_rate": 7.696117313713559e-09, "loss": 0.0271, "step": 5508 }, { "epoch": 3.93, "grad_norm": 1.7164931573536757, "learning_rate": 7.536656700797284e-09, "loss": 0.0169, "step": 5509 }, { "epoch": 3.93, "grad_norm": 2.99059556731782, "learning_rate": 7.37886416268041e-09, "loss": 0.0201, "step": 5510 }, { "epoch": 3.93, "grad_norm": 3.3403066040208462, "learning_rate": 7.222739752084096e-09, "loss": 0.0251, "step": 5511 }, { "epoch": 3.93, "grad_norm": 8.053052196324328, "learning_rate": 7.068283521172725e-09, "loss": 0.0359, "step": 5512 }, { "epoch": 3.94, "grad_norm": 4.416499688532894, "learning_rate": 6.915495521552795e-09, "loss": 0.018, "step": 5513 }, { "epoch": 3.94, "grad_norm": 4.387845883879492, "learning_rate": 6.764375804274026e-09, "loss": 0.0239, "step": 5514 }, { "epoch": 3.94, "grad_norm": 5.03141411349809, "learning_rate": 6.61492441982714e-09, "loss": 0.0377, "step": 5515 }, { "epoch": 3.94, "grad_norm": 3.757977750589518, "learning_rate": 6.467141418147748e-09, "loss": 0.0122, "step": 5516 }, { "epoch": 3.94, "grad_norm": 2.001295426759492, "learning_rate": 6.321026848613021e-09, "loss": 0.0153, "step": 5517 }, { "epoch": 3.94, "grad_norm": 3.9821414447648973, "learning_rate": 6.176580760041684e-09, "loss": 0.0194, "step": 5518 }, { "epoch": 3.94, "grad_norm": 1.7433347366769623, "learning_rate": 6.033803200696242e-09, "loss": 0.018, "step": 5519 }, { "epoch": 3.94, "grad_norm": 4.301461630452348, "learning_rate": 5.892694218281869e-09, "loss": 0.0371, "step": 5520 }, { "epoch": 3.94, "grad_norm": 4.386923072523771, "learning_rate": 5.753253859944741e-09, "loss": 0.0165, "step": 5521 }, { "epoch": 3.94, "grad_norm": 3.0793034972539677, "learning_rate": 5.615482172275366e-09, "loss": 0.0387, "step": 5522 }, { "epoch": 3.94, "grad_norm": 6.778878857726523, "learning_rate": 5.479379201305257e-09, "loss": 0.0304, "step": 5523 }, { "epoch": 3.94, "grad_norm": 4.863627634913964, "learning_rate": 5.344944992509149e-09, "loss": 0.0256, "step": 5524 }, { "epoch": 3.94, "grad_norm": 2.509596156570167, "learning_rate": 5.212179590803335e-09, "loss": 0.0306, "step": 5525 }, { "epoch": 3.94, "grad_norm": 2.796887352552083, "learning_rate": 5.08108304054844e-09, "loss": 0.0224, "step": 5526 }, { "epoch": 3.95, "grad_norm": 4.106611953848664, "learning_rate": 4.9516553855455395e-09, "loss": 0.0308, "step": 5527 }, { "epoch": 3.95, "grad_norm": 2.2897445226023216, "learning_rate": 4.82389666903893e-09, "loss": 0.0221, "step": 5528 }, { "epoch": 3.95, "grad_norm": 2.7080187516169496, "learning_rate": 4.697806933715021e-09, "loss": 0.0269, "step": 5529 }, { "epoch": 3.95, "grad_norm": 4.343144051787935, "learning_rate": 4.573386221703446e-09, "loss": 0.0267, "step": 5530 }, { "epoch": 3.95, "grad_norm": 2.7596315781432397, "learning_rate": 4.450634574574286e-09, "loss": 0.0331, "step": 5531 }, { "epoch": 3.95, "grad_norm": 4.324439345233048, "learning_rate": 4.329552033341955e-09, "loss": 0.0182, "step": 5532 }, { "epoch": 3.95, "grad_norm": 2.8977509062535747, "learning_rate": 4.210138638462424e-09, "loss": 0.0135, "step": 5533 }, { "epoch": 3.95, "grad_norm": 4.537391924016334, "learning_rate": 4.0923944298337796e-09, "loss": 0.0325, "step": 5534 }, { "epoch": 3.95, "grad_norm": 5.661417948871261, "learning_rate": 3.976319446795662e-09, "loss": 0.0379, "step": 5535 }, { "epoch": 3.95, "grad_norm": 2.8349607349929005, "learning_rate": 3.8619137281326044e-09, "loss": 0.0227, "step": 5536 }, { "epoch": 3.95, "grad_norm": 5.200658373149944, "learning_rate": 3.749177312068475e-09, "loss": 0.0279, "step": 5537 }, { "epoch": 3.95, "grad_norm": 1.8628376484778557, "learning_rate": 3.63811023627092e-09, "loss": 0.0193, "step": 5538 }, { "epoch": 3.95, "grad_norm": 4.724184543652983, "learning_rate": 3.528712537849144e-09, "loss": 0.0267, "step": 5539 }, { "epoch": 3.95, "grad_norm": 3.6749826001937764, "learning_rate": 3.42098425335613e-09, "loss": 0.0311, "step": 5540 }, { "epoch": 3.96, "grad_norm": 4.909671343908744, "learning_rate": 3.3149254187841985e-09, "loss": 0.0288, "step": 5541 }, { "epoch": 3.96, "grad_norm": 3.8677490201938785, "learning_rate": 3.210536069571113e-09, "loss": 0.024, "step": 5542 }, { "epoch": 3.96, "grad_norm": 3.608231101955727, "learning_rate": 3.1078162405939747e-09, "loss": 0.0213, "step": 5543 }, { "epoch": 3.96, "grad_norm": 3.272735238444045, "learning_rate": 3.006765966174774e-09, "loss": 0.0318, "step": 5544 }, { "epoch": 3.96, "grad_norm": 3.1790589293641482, "learning_rate": 2.907385280075392e-09, "loss": 0.0182, "step": 5545 }, { "epoch": 3.96, "grad_norm": 3.649186670073757, "learning_rate": 2.80967421550038e-09, "loss": 0.0286, "step": 5546 }, { "epoch": 3.96, "grad_norm": 3.138795296256432, "learning_rate": 2.7136328050980654e-09, "loss": 0.0284, "step": 5547 }, { "epoch": 3.96, "grad_norm": 3.9420052048363736, "learning_rate": 2.6192610809566697e-09, "loss": 0.0237, "step": 5548 }, { "epoch": 3.96, "grad_norm": 2.182296768945293, "learning_rate": 2.5265590746076373e-09, "loss": 0.0113, "step": 5549 }, { "epoch": 3.96, "grad_norm": 2.6773104562420236, "learning_rate": 2.43552681702508e-09, "loss": 0.0203, "step": 5550 }, { "epoch": 3.96, "grad_norm": 3.367481619945359, "learning_rate": 2.346164338624113e-09, "loss": 0.0228, "step": 5551 }, { "epoch": 3.96, "grad_norm": 5.067110242743501, "learning_rate": 2.2584716692619636e-09, "loss": 0.0237, "step": 5552 }, { "epoch": 3.96, "grad_norm": 2.0718286609725105, "learning_rate": 2.172448838239083e-09, "loss": 0.0221, "step": 5553 }, { "epoch": 3.96, "grad_norm": 2.662417149164622, "learning_rate": 2.08809587429748e-09, "loss": 0.0202, "step": 5554 }, { "epoch": 3.97, "grad_norm": 3.989820634216304, "learning_rate": 2.0054128056201662e-09, "loss": 0.0137, "step": 5555 }, { "epoch": 3.97, "grad_norm": 1.8775842103565406, "learning_rate": 1.924399659833376e-09, "loss": 0.014, "step": 5556 }, { "epoch": 3.97, "grad_norm": 7.413735859817457, "learning_rate": 1.8450564640054569e-09, "loss": 0.046, "step": 5557 }, { "epoch": 3.97, "grad_norm": 3.243264854700322, "learning_rate": 1.7673832446463146e-09, "loss": 0.0276, "step": 5558 }, { "epoch": 3.97, "grad_norm": 7.71185976345551, "learning_rate": 1.6913800277085225e-09, "loss": 0.031, "step": 5559 }, { "epoch": 3.97, "grad_norm": 6.885871743475731, "learning_rate": 1.6170468385845462e-09, "loss": 0.0285, "step": 5560 }, { "epoch": 3.97, "grad_norm": 3.6615117870884992, "learning_rate": 1.5443837021122954e-09, "loss": 0.0207, "step": 5561 }, { "epoch": 3.97, "grad_norm": 4.28734845963075, "learning_rate": 1.473390642569017e-09, "loss": 0.0173, "step": 5562 }, { "epoch": 3.97, "grad_norm": 2.702589837418001, "learning_rate": 1.4040676836746259e-09, "loss": 0.0251, "step": 5563 }, { "epoch": 3.97, "grad_norm": 5.090936212710113, "learning_rate": 1.336414848591705e-09, "loss": 0.0137, "step": 5564 }, { "epoch": 3.97, "grad_norm": 5.178772005848657, "learning_rate": 1.2704321599243951e-09, "loss": 0.0318, "step": 5565 }, { "epoch": 3.97, "grad_norm": 3.0897135807835494, "learning_rate": 1.206119639718395e-09, "loss": 0.0284, "step": 5566 }, { "epoch": 3.97, "grad_norm": 6.956302961997545, "learning_rate": 1.1434773094615158e-09, "loss": 0.0202, "step": 5567 }, { "epoch": 3.97, "grad_norm": 5.512949716160351, "learning_rate": 1.0825051900842377e-09, "loss": 0.0299, "step": 5568 }, { "epoch": 3.98, "grad_norm": 4.34290757998301, "learning_rate": 1.0232033019580423e-09, "loss": 0.0247, "step": 5569 }, { "epoch": 3.98, "grad_norm": 1.9398553518199555, "learning_rate": 9.655716648970804e-10, "loss": 0.0212, "step": 5570 }, { "epoch": 3.98, "grad_norm": 3.452078753978198, "learning_rate": 9.096102981570598e-10, "loss": 0.0181, "step": 5571 }, { "epoch": 3.98, "grad_norm": 1.1011444564361026, "learning_rate": 8.553192204358018e-10, "loss": 0.0116, "step": 5572 }, { "epoch": 3.98, "grad_norm": 2.2777409632755012, "learning_rate": 8.026984498726853e-10, "loss": 0.0277, "step": 5573 }, { "epoch": 3.98, "grad_norm": 4.323892977235624, "learning_rate": 7.517480040497572e-10, "loss": 0.0181, "step": 5574 }, { "epoch": 3.98, "grad_norm": 1.8751831593629684, "learning_rate": 7.024678999900669e-10, "loss": 0.0309, "step": 5575 }, { "epoch": 3.98, "grad_norm": 2.7347866857145986, "learning_rate": 6.548581541593324e-10, "loss": 0.0186, "step": 5576 }, { "epoch": 3.98, "grad_norm": 2.8926365897897406, "learning_rate": 6.08918782464829e-10, "loss": 0.0178, "step": 5577 }, { "epoch": 3.98, "grad_norm": 4.6764673593214505, "learning_rate": 5.646498002553902e-10, "loss": 0.0231, "step": 5578 }, { "epoch": 3.98, "grad_norm": 5.802111064150024, "learning_rate": 5.220512223219621e-10, "loss": 0.0308, "step": 5579 }, { "epoch": 3.98, "grad_norm": 2.0047609631552525, "learning_rate": 4.81123062898714e-10, "loss": 0.0163, "step": 5580 }, { "epoch": 3.98, "grad_norm": 1.0730242231401868, "learning_rate": 4.4186533565915293e-10, "loss": 0.0146, "step": 5581 }, { "epoch": 3.98, "grad_norm": 3.672437423453078, "learning_rate": 4.042780537205637e-10, "loss": 0.0192, "step": 5582 }, { "epoch": 3.99, "grad_norm": 4.753438723651524, "learning_rate": 3.6836122964178934e-10, "loss": 0.0247, "step": 5583 }, { "epoch": 3.99, "grad_norm": 2.127300877016818, "learning_rate": 3.341148754232304e-10, "loss": 0.0155, "step": 5584 }, { "epoch": 3.99, "grad_norm": 7.34220557033025, "learning_rate": 3.015390025068454e-10, "loss": 0.039, "step": 5585 }, { "epoch": 3.99, "grad_norm": 6.750258237012312, "learning_rate": 2.706336217767058e-10, "loss": 0.0253, "step": 5586 }, { "epoch": 3.99, "grad_norm": 2.231439477264196, "learning_rate": 2.4139874355955105e-10, "loss": 0.0146, "step": 5587 }, { "epoch": 3.99, "grad_norm": 3.31797560363478, "learning_rate": 2.138343776231233e-10, "loss": 0.0244, "step": 5588 }, { "epoch": 3.99, "grad_norm": 4.782531101487246, "learning_rate": 1.8794053317672255e-10, "loss": 0.0175, "step": 5589 }, { "epoch": 3.99, "grad_norm": 1.6196649621284065, "learning_rate": 1.6371721887287196e-10, "loss": 0.0192, "step": 5590 }, { "epoch": 3.99, "grad_norm": 1.720724803964426, "learning_rate": 1.4116444280398711e-10, "loss": 0.0169, "step": 5591 }, { "epoch": 3.99, "grad_norm": 3.9654605728386154, "learning_rate": 1.2028221250570683e-10, "loss": 0.015, "step": 5592 }, { "epoch": 3.99, "grad_norm": 3.692236086799763, "learning_rate": 1.0107053495522767e-10, "loss": 0.0205, "step": 5593 }, { "epoch": 3.99, "grad_norm": 2.5848778676255066, "learning_rate": 8.35294165718592e-11, "loss": 0.027, "step": 5594 }, { "epoch": 3.99, "grad_norm": 5.861472760760072, "learning_rate": 6.765886321646874e-11, "loss": 0.021, "step": 5595 }, { "epoch": 3.99, "grad_norm": 6.288974718765686, "learning_rate": 5.345888019092638e-11, "loss": 0.0381, "step": 5596 }, { "epoch": 4.0, "grad_norm": 3.6870045895899226, "learning_rate": 4.092947224032529e-11, "loss": 0.0194, "step": 5597 }, { "epoch": 4.0, "grad_norm": 7.961768982598845, "learning_rate": 3.007064355076139e-11, "loss": 0.0204, "step": 5598 }, { "epoch": 4.0, "grad_norm": 2.888016420410043, "learning_rate": 2.088239775044354e-11, "loss": 0.0248, "step": 5599 }, { "epoch": 4.0, "grad_norm": 6.1509337395528, "learning_rate": 1.3364737909138392e-11, "loss": 0.025, "step": 5600 }, { "epoch": 4.0, "grad_norm": 3.4536548965699705, "learning_rate": 7.517666539280654e-12, "loss": 0.039, "step": 5601 }, { "epoch": 4.0, "grad_norm": 1.643426850528451, "learning_rate": 3.3411855937526273e-12, "loss": 0.0276, "step": 5602 }, { "epoch": 4.0, "grad_norm": 3.699342004217049, "learning_rate": 8.352964681046516e-13, "loss": 0.0218, "step": 5603 }, { "epoch": 4.0, "grad_norm": 1.6873280750994422, "learning_rate": 0.0, "loss": 0.0218, "step": 5604 }, { "epoch": 4.0, "step": 5604, "total_flos": 750239094767616.0, "train_loss": 0.1176493737971587, "train_runtime": 15635.0784, "train_samples_per_second": 22.932, "train_steps_per_second": 0.358 } ], "logging_steps": 1.0, "max_steps": 5604, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 24000, "total_flos": 750239094767616.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }