{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999979819587109, "eval_steps": 500, "global_step": 37164, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 7.78010667114874, "learning_rate": 4.0236686390532546e-07, "loss": 0.5346, "step": 20 }, { "epoch": 0.0, "grad_norm": 6.216957639101055, "learning_rate": 6.153846153846155e-07, "loss": 0.5043, "step": 40 }, { "epoch": 0.0, "grad_norm": 7.990357548396736, "learning_rate": 8.284023668639055e-07, "loss": 0.5017, "step": 60 }, { "epoch": 0.01, "grad_norm": 8.12688107356609, "learning_rate": 1.0414201183431955e-06, "loss": 0.4952, "step": 80 }, { "epoch": 0.01, "grad_norm": 6.530843475685683, "learning_rate": 1.2544378698224851e-06, "loss": 0.5621, "step": 100 }, { "epoch": 0.01, "grad_norm": 6.308467672405027, "learning_rate": 1.4674556213017752e-06, "loss": 0.4549, "step": 120 }, { "epoch": 0.01, "grad_norm": 7.243052870190241, "learning_rate": 1.6804733727810652e-06, "loss": 0.4466, "step": 140 }, { "epoch": 0.01, "grad_norm": 10.219726515841495, "learning_rate": 1.8934911242603552e-06, "loss": 0.3893, "step": 160 }, { "epoch": 0.01, "grad_norm": 6.627649317339657, "learning_rate": 2.106508875739645e-06, "loss": 0.4179, "step": 180 }, { "epoch": 0.02, "grad_norm": 6.44244224679364, "learning_rate": 2.319526627218935e-06, "loss": 0.4226, "step": 200 }, { "epoch": 0.02, "grad_norm": 6.612950721768246, "learning_rate": 2.532544378698225e-06, "loss": 0.3795, "step": 220 }, { "epoch": 0.02, "grad_norm": 6.283517912051673, "learning_rate": 2.7455621301775153e-06, "loss": 0.4276, "step": 240 }, { "epoch": 0.02, "grad_norm": 7.268987062349035, "learning_rate": 2.958579881656805e-06, "loss": 0.3619, "step": 260 }, { "epoch": 0.02, "grad_norm": 8.340583800596072, "learning_rate": 3.171597633136095e-06, "loss": 0.4244, "step": 280 }, { "epoch": 0.02, "grad_norm": 6.75600646477272, "learning_rate": 3.384615384615385e-06, "loss": 0.3852, "step": 300 }, { "epoch": 0.03, "grad_norm": 5.647054711391784, "learning_rate": 3.597633136094675e-06, "loss": 0.3809, "step": 320 }, { "epoch": 0.03, "grad_norm": 7.253045067435066, "learning_rate": 3.8106508875739652e-06, "loss": 0.3858, "step": 340 }, { "epoch": 0.03, "grad_norm": 7.301184351749545, "learning_rate": 4.023668639053255e-06, "loss": 0.3549, "step": 360 }, { "epoch": 0.03, "grad_norm": 7.59195138486003, "learning_rate": 4.236686390532545e-06, "loss": 0.4048, "step": 380 }, { "epoch": 0.03, "grad_norm": 10.124611929532549, "learning_rate": 4.449704142011835e-06, "loss": 0.3646, "step": 400 }, { "epoch": 0.03, "grad_norm": 7.744526068197853, "learning_rate": 4.662721893491124e-06, "loss": 0.3677, "step": 420 }, { "epoch": 0.04, "grad_norm": 8.246611853098463, "learning_rate": 4.875739644970415e-06, "loss": 0.3573, "step": 440 }, { "epoch": 0.04, "grad_norm": 7.675393597081337, "learning_rate": 5.088757396449705e-06, "loss": 0.3714, "step": 460 }, { "epoch": 0.04, "grad_norm": 7.407374558199348, "learning_rate": 5.301775147928995e-06, "loss": 0.3762, "step": 480 }, { "epoch": 0.04, "grad_norm": 7.5336943019480875, "learning_rate": 5.514792899408284e-06, "loss": 0.3423, "step": 500 }, { "epoch": 0.04, "grad_norm": 6.789944607897793, "learning_rate": 5.727810650887574e-06, "loss": 0.3382, "step": 520 }, { "epoch": 0.04, "grad_norm": 6.2048712513712765, "learning_rate": 5.940828402366864e-06, "loss": 0.355, "step": 540 }, { "epoch": 0.05, "grad_norm": 7.7676635768181255, "learning_rate": 6.153846153846153e-06, "loss": 0.3481, "step": 560 }, { "epoch": 0.05, "grad_norm": 5.33865756273462, "learning_rate": 6.366863905325444e-06, "loss": 0.3486, "step": 580 }, { "epoch": 0.05, "grad_norm": 7.215416340807466, "learning_rate": 6.579881656804735e-06, "loss": 0.3378, "step": 600 }, { "epoch": 0.05, "grad_norm": 6.563753135562715, "learning_rate": 6.792899408284025e-06, "loss": 0.3126, "step": 620 }, { "epoch": 0.05, "grad_norm": 8.45042187241426, "learning_rate": 7.005917159763315e-06, "loss": 0.3231, "step": 640 }, { "epoch": 0.05, "grad_norm": 6.734275587016377, "learning_rate": 7.218934911242604e-06, "loss": 0.3533, "step": 660 }, { "epoch": 0.05, "grad_norm": 6.417837093997314, "learning_rate": 7.431952662721894e-06, "loss": 0.3335, "step": 680 }, { "epoch": 0.06, "grad_norm": 8.615382928114506, "learning_rate": 7.644970414201183e-06, "loss": 0.3495, "step": 700 }, { "epoch": 0.06, "grad_norm": 6.6395133490495395, "learning_rate": 7.857988165680473e-06, "loss": 0.3371, "step": 720 }, { "epoch": 0.06, "grad_norm": 9.710125890008053, "learning_rate": 8.071005917159764e-06, "loss": 0.3619, "step": 740 }, { "epoch": 0.06, "grad_norm": 8.21056721557438, "learning_rate": 8.284023668639054e-06, "loss": 0.3479, "step": 760 }, { "epoch": 0.06, "grad_norm": 7.0850591314913185, "learning_rate": 8.497041420118344e-06, "loss": 0.3217, "step": 780 }, { "epoch": 0.06, "grad_norm": 8.709374421841343, "learning_rate": 8.710059171597634e-06, "loss": 0.345, "step": 800 }, { "epoch": 0.07, "grad_norm": 8.703998514988717, "learning_rate": 8.923076923076925e-06, "loss": 0.3544, "step": 820 }, { "epoch": 0.07, "grad_norm": 8.233699532008588, "learning_rate": 9.136094674556215e-06, "loss": 0.3449, "step": 840 }, { "epoch": 0.07, "grad_norm": 6.360611479052244, "learning_rate": 9.349112426035503e-06, "loss": 0.3709, "step": 860 }, { "epoch": 0.07, "grad_norm": 5.21422760141616, "learning_rate": 9.562130177514794e-06, "loss": 0.3031, "step": 880 }, { "epoch": 0.07, "grad_norm": 10.285280127672143, "learning_rate": 9.775147928994084e-06, "loss": 0.314, "step": 900 }, { "epoch": 0.07, "grad_norm": 8.561551897783339, "learning_rate": 9.988165680473372e-06, "loss": 0.34, "step": 920 }, { "epoch": 0.08, "grad_norm": 8.337797520489195, "learning_rate": 1.0201183431952664e-05, "loss": 0.3324, "step": 940 }, { "epoch": 0.08, "grad_norm": 9.531967986532953, "learning_rate": 1.0414201183431953e-05, "loss": 0.359, "step": 960 }, { "epoch": 0.08, "grad_norm": 6.913190435381454, "learning_rate": 1.0627218934911243e-05, "loss": 0.3715, "step": 980 }, { "epoch": 0.08, "grad_norm": 8.525404719704843, "learning_rate": 1.0840236686390533e-05, "loss": 0.2926, "step": 1000 }, { "epoch": 0.08, "grad_norm": 8.186212103501571, "learning_rate": 1.1053254437869825e-05, "loss": 0.351, "step": 1020 }, { "epoch": 0.08, "grad_norm": 5.968895396114415, "learning_rate": 1.1266272189349114e-05, "loss": 0.3325, "step": 1040 }, { "epoch": 0.09, "grad_norm": 9.069989076248202, "learning_rate": 1.1479289940828404e-05, "loss": 0.3075, "step": 1060 }, { "epoch": 0.09, "grad_norm": 6.617321242745247, "learning_rate": 1.1692307692307694e-05, "loss": 0.3321, "step": 1080 }, { "epoch": 0.09, "grad_norm": 8.096724387107496, "learning_rate": 1.1905325443786983e-05, "loss": 0.3508, "step": 1100 }, { "epoch": 0.09, "grad_norm": 7.972357677530315, "learning_rate": 1.2118343195266273e-05, "loss": 0.3031, "step": 1120 }, { "epoch": 0.09, "grad_norm": 7.82798416711515, "learning_rate": 1.2331360946745563e-05, "loss": 0.2912, "step": 1140 }, { "epoch": 0.09, "grad_norm": 10.732472169183794, "learning_rate": 1.2544378698224854e-05, "loss": 0.2956, "step": 1160 }, { "epoch": 0.1, "grad_norm": 7.4415223600685625, "learning_rate": 1.2757396449704142e-05, "loss": 0.3234, "step": 1180 }, { "epoch": 0.1, "grad_norm": 7.006979596197168, "learning_rate": 1.2970414201183432e-05, "loss": 0.3271, "step": 1200 }, { "epoch": 0.1, "grad_norm": 8.00254256852378, "learning_rate": 1.3183431952662723e-05, "loss": 0.3165, "step": 1220 }, { "epoch": 0.1, "grad_norm": 14.742499561979336, "learning_rate": 1.3396449704142011e-05, "loss": 0.3015, "step": 1240 }, { "epoch": 0.1, "grad_norm": 7.14191343340385, "learning_rate": 1.3609467455621301e-05, "loss": 0.3273, "step": 1260 }, { "epoch": 0.1, "grad_norm": 5.720523746760943, "learning_rate": 1.3822485207100593e-05, "loss": 0.3073, "step": 1280 }, { "epoch": 0.1, "grad_norm": 29.467573544915435, "learning_rate": 1.4035502958579883e-05, "loss": 0.2779, "step": 1300 }, { "epoch": 0.11, "grad_norm": 6.058319538140977, "learning_rate": 1.4248520710059172e-05, "loss": 0.3176, "step": 1320 }, { "epoch": 0.11, "grad_norm": 5.979724640901607, "learning_rate": 1.4461538461538462e-05, "loss": 0.2919, "step": 1340 }, { "epoch": 0.11, "grad_norm": 9.296561099611436, "learning_rate": 1.4674556213017754e-05, "loss": 0.3073, "step": 1360 }, { "epoch": 0.11, "grad_norm": 6.766863350601436, "learning_rate": 1.4887573964497044e-05, "loss": 0.3449, "step": 1380 }, { "epoch": 0.11, "grad_norm": 9.0738349271763, "learning_rate": 1.5100591715976333e-05, "loss": 0.3282, "step": 1400 }, { "epoch": 0.11, "grad_norm": 5.238889472577541, "learning_rate": 1.5313609467455623e-05, "loss": 0.2766, "step": 1420 }, { "epoch": 0.12, "grad_norm": 6.068739678701155, "learning_rate": 1.5526627218934912e-05, "loss": 0.3527, "step": 1440 }, { "epoch": 0.12, "grad_norm": 8.761831948666881, "learning_rate": 1.5739644970414204e-05, "loss": 0.309, "step": 1460 }, { "epoch": 0.12, "grad_norm": 5.696276254834093, "learning_rate": 1.5952662721893492e-05, "loss": 0.3334, "step": 1480 }, { "epoch": 0.12, "grad_norm": 5.698639074786848, "learning_rate": 1.616568047337278e-05, "loss": 0.306, "step": 1500 }, { "epoch": 0.12, "grad_norm": 6.244148261549131, "learning_rate": 1.6378698224852073e-05, "loss": 0.3, "step": 1520 }, { "epoch": 0.12, "grad_norm": 6.948838050368144, "learning_rate": 1.659171597633136e-05, "loss": 0.3348, "step": 1540 }, { "epoch": 0.13, "grad_norm": 6.672363835099913, "learning_rate": 1.6804733727810653e-05, "loss": 0.3534, "step": 1560 }, { "epoch": 0.13, "grad_norm": 50.29837243421749, "learning_rate": 1.7017751479289942e-05, "loss": 0.3203, "step": 1580 }, { "epoch": 0.13, "grad_norm": 7.03929850767991, "learning_rate": 1.723076923076923e-05, "loss": 0.297, "step": 1600 }, { "epoch": 0.13, "grad_norm": 7.19469430195841, "learning_rate": 1.7443786982248522e-05, "loss": 0.2978, "step": 1620 }, { "epoch": 0.13, "grad_norm": 7.055651559785636, "learning_rate": 1.765680473372781e-05, "loss": 0.3313, "step": 1640 }, { "epoch": 0.13, "grad_norm": 7.162934891578056, "learning_rate": 1.78698224852071e-05, "loss": 0.3188, "step": 1660 }, { "epoch": 0.14, "grad_norm": 7.176153099629785, "learning_rate": 1.808284023668639e-05, "loss": 0.3297, "step": 1680 }, { "epoch": 0.14, "grad_norm": 6.367781797418254, "learning_rate": 1.8295857988165683e-05, "loss": 0.3042, "step": 1700 }, { "epoch": 0.14, "grad_norm": 6.373173769997794, "learning_rate": 1.8508875739644975e-05, "loss": 0.3266, "step": 1720 }, { "epoch": 0.14, "grad_norm": 18.577261761115807, "learning_rate": 1.8721893491124264e-05, "loss": 0.2736, "step": 1740 }, { "epoch": 0.14, "grad_norm": 4.994768648489008, "learning_rate": 1.8934911242603552e-05, "loss": 0.3012, "step": 1760 }, { "epoch": 0.14, "grad_norm": 6.301807374403158, "learning_rate": 1.9147928994082844e-05, "loss": 0.3394, "step": 1780 }, { "epoch": 0.15, "grad_norm": 7.456967816650055, "learning_rate": 1.9360946745562133e-05, "loss": 0.3329, "step": 1800 }, { "epoch": 0.15, "grad_norm": 6.0971107018925625, "learning_rate": 1.957396449704142e-05, "loss": 0.3523, "step": 1820 }, { "epoch": 0.15, "grad_norm": 8.10379030061416, "learning_rate": 1.9786982248520713e-05, "loss": 0.3013, "step": 1840 }, { "epoch": 0.15, "grad_norm": 7.024454181142386, "learning_rate": 1.9999999960412883e-05, "loss": 0.2762, "step": 1860 }, { "epoch": 0.15, "grad_norm": 8.232339625552946, "learning_rate": 1.9999982542086008e-05, "loss": 0.3448, "step": 1880 }, { "epoch": 0.15, "grad_norm": 7.602098687169412, "learning_rate": 1.9999933454128334e-05, "loss": 0.3398, "step": 1900 }, { "epoch": 0.15, "grad_norm": 13.60330741158854, "learning_rate": 1.9999852696695326e-05, "loss": 0.3449, "step": 1920 }, { "epoch": 0.16, "grad_norm": 6.868480851347135, "learning_rate": 1.9999740270042764e-05, "loss": 0.3047, "step": 1940 }, { "epoch": 0.16, "grad_norm": 8.419546962701075, "learning_rate": 1.9999596174526744e-05, "loss": 0.296, "step": 1960 }, { "epoch": 0.16, "grad_norm": 8.258953437915027, "learning_rate": 1.9999420410603655e-05, "loss": 0.3145, "step": 1980 }, { "epoch": 0.16, "grad_norm": 8.591067141592344, "learning_rate": 1.9999212978830192e-05, "loss": 0.2967, "step": 2000 }, { "epoch": 0.16, "grad_norm": 7.788694443802249, "learning_rate": 1.9998973879863347e-05, "loss": 0.2922, "step": 2020 }, { "epoch": 0.16, "grad_norm": 8.894924525369044, "learning_rate": 1.999870311446042e-05, "loss": 0.2909, "step": 2040 }, { "epoch": 0.17, "grad_norm": 7.71785690762546, "learning_rate": 1.9998400683478994e-05, "loss": 0.3185, "step": 2060 }, { "epoch": 0.17, "grad_norm": 6.423565941270898, "learning_rate": 1.9998066587876964e-05, "loss": 0.331, "step": 2080 }, { "epoch": 0.17, "grad_norm": 9.504973464920754, "learning_rate": 1.9997700828712502e-05, "loss": 0.3163, "step": 2100 }, { "epoch": 0.17, "grad_norm": 5.0751162961967555, "learning_rate": 1.999730340714407e-05, "loss": 0.2935, "step": 2120 }, { "epoch": 0.17, "grad_norm": 6.788588249402219, "learning_rate": 1.9996874324430414e-05, "loss": 0.304, "step": 2140 }, { "epoch": 0.17, "grad_norm": 6.982728358658279, "learning_rate": 1.9996413581930564e-05, "loss": 0.3254, "step": 2160 }, { "epoch": 0.18, "grad_norm": 7.39883321894128, "learning_rate": 1.9995921181103827e-05, "loss": 0.3238, "step": 2180 }, { "epoch": 0.18, "grad_norm": 6.349466388539233, "learning_rate": 1.999539712350977e-05, "loss": 0.3138, "step": 2200 }, { "epoch": 0.18, "grad_norm": 8.460848680137447, "learning_rate": 1.9994841410808238e-05, "loss": 0.2951, "step": 2220 }, { "epoch": 0.18, "grad_norm": 12.228980891015102, "learning_rate": 1.999425404475933e-05, "loss": 0.313, "step": 2240 }, { "epoch": 0.18, "grad_norm": 10.128152992141587, "learning_rate": 1.99936350272234e-05, "loss": 0.3209, "step": 2260 }, { "epoch": 0.18, "grad_norm": 7.366413095980619, "learning_rate": 1.999298436016105e-05, "loss": 0.3508, "step": 2280 }, { "epoch": 0.19, "grad_norm": 5.935510884433497, "learning_rate": 1.9992302045633138e-05, "loss": 0.3087, "step": 2300 }, { "epoch": 0.19, "grad_norm": 9.917097921103624, "learning_rate": 1.9991588085800745e-05, "loss": 0.3272, "step": 2320 }, { "epoch": 0.19, "grad_norm": 8.158864972330328, "learning_rate": 1.9990842482925183e-05, "loss": 0.3097, "step": 2340 }, { "epoch": 0.19, "grad_norm": 5.660258353439845, "learning_rate": 1.999006523936799e-05, "loss": 0.3194, "step": 2360 }, { "epoch": 0.19, "grad_norm": 6.343908148236521, "learning_rate": 1.9989256357590915e-05, "loss": 0.3144, "step": 2380 }, { "epoch": 0.19, "grad_norm": 6.935522124005399, "learning_rate": 1.9988415840155925e-05, "loss": 0.316, "step": 2400 }, { "epoch": 0.2, "grad_norm": 6.118420550913593, "learning_rate": 1.9987543689725172e-05, "loss": 0.2935, "step": 2420 }, { "epoch": 0.2, "grad_norm": 5.852760915435719, "learning_rate": 1.998663990906101e-05, "loss": 0.2982, "step": 2440 }, { "epoch": 0.2, "grad_norm": 10.637858430267903, "learning_rate": 1.9985704501025967e-05, "loss": 0.3263, "step": 2460 }, { "epoch": 0.2, "grad_norm": 5.969298652078407, "learning_rate": 1.9984737468582746e-05, "loss": 0.2785, "step": 2480 }, { "epoch": 0.2, "grad_norm": 7.290609868079292, "learning_rate": 1.998373881479422e-05, "loss": 0.2902, "step": 2500 }, { "epoch": 0.2, "grad_norm": 8.419267863068479, "learning_rate": 1.9982708542823405e-05, "loss": 0.2854, "step": 2520 }, { "epoch": 0.21, "grad_norm": 8.123890492905641, "learning_rate": 1.9981646655933466e-05, "loss": 0.2981, "step": 2540 }, { "epoch": 0.21, "grad_norm": 5.10058603098674, "learning_rate": 1.998055315748771e-05, "loss": 0.2792, "step": 2560 }, { "epoch": 0.21, "grad_norm": 5.6118366078785105, "learning_rate": 1.997942805094955e-05, "loss": 0.2905, "step": 2580 }, { "epoch": 0.21, "grad_norm": 11.075016161215812, "learning_rate": 1.997827133988252e-05, "loss": 0.2902, "step": 2600 }, { "epoch": 0.21, "grad_norm": 8.887207232453743, "learning_rate": 1.997708302795026e-05, "loss": 0.3155, "step": 2620 }, { "epoch": 0.21, "grad_norm": 8.39711790141671, "learning_rate": 1.997586311891649e-05, "loss": 0.286, "step": 2640 }, { "epoch": 0.21, "grad_norm": 5.439201702560111, "learning_rate": 1.9974611616645007e-05, "loss": 0.2933, "step": 2660 }, { "epoch": 0.22, "grad_norm": 7.182735290178756, "learning_rate": 1.9973328525099675e-05, "loss": 0.3267, "step": 2680 }, { "epoch": 0.22, "grad_norm": 8.170254081594555, "learning_rate": 1.997201384834442e-05, "loss": 0.2967, "step": 2700 }, { "epoch": 0.22, "grad_norm": 6.357829888020736, "learning_rate": 1.997066759054319e-05, "loss": 0.3109, "step": 2720 }, { "epoch": 0.22, "grad_norm": 6.180030398494684, "learning_rate": 1.996928975595997e-05, "loss": 0.3054, "step": 2740 }, { "epoch": 0.22, "grad_norm": 7.812651853992933, "learning_rate": 1.996788034895875e-05, "loss": 0.2852, "step": 2760 }, { "epoch": 0.22, "grad_norm": 6.492759192826664, "learning_rate": 1.9966439374003538e-05, "loss": 0.305, "step": 2780 }, { "epoch": 0.23, "grad_norm": 11.941328303638311, "learning_rate": 1.99649668356583e-05, "loss": 0.2922, "step": 2800 }, { "epoch": 0.23, "grad_norm": 8.314661648415811, "learning_rate": 1.9963462738586993e-05, "loss": 0.3102, "step": 2820 }, { "epoch": 0.23, "grad_norm": 7.301474320450444, "learning_rate": 1.996192708755351e-05, "loss": 0.2964, "step": 2840 }, { "epoch": 0.23, "grad_norm": 8.797389284755965, "learning_rate": 1.996035988742171e-05, "loss": 0.2656, "step": 2860 }, { "epoch": 0.23, "grad_norm": 6.614687108304631, "learning_rate": 1.9958761143155357e-05, "loss": 0.2927, "step": 2880 }, { "epoch": 0.23, "grad_norm": 7.623906291629947, "learning_rate": 1.995713085981813e-05, "loss": 0.2788, "step": 2900 }, { "epoch": 0.24, "grad_norm": 5.684942655651583, "learning_rate": 1.9955469042573605e-05, "loss": 0.3051, "step": 2920 }, { "epoch": 0.24, "grad_norm": 8.632124459996572, "learning_rate": 1.9953775696685223e-05, "loss": 0.3002, "step": 2940 }, { "epoch": 0.24, "grad_norm": 4.942563514745161, "learning_rate": 1.99520508275163e-05, "loss": 0.2862, "step": 2960 }, { "epoch": 0.24, "grad_norm": 6.712590574651518, "learning_rate": 1.995029444052999e-05, "loss": 0.2938, "step": 2980 }, { "epoch": 0.24, "grad_norm": 7.09759309831908, "learning_rate": 1.9948506541289266e-05, "loss": 0.3054, "step": 3000 }, { "epoch": 0.24, "grad_norm": 9.042408105595795, "learning_rate": 1.994668713545692e-05, "loss": 0.3041, "step": 3020 }, { "epoch": 0.25, "grad_norm": 6.843804903550461, "learning_rate": 1.994483622879553e-05, "loss": 0.2958, "step": 3040 }, { "epoch": 0.25, "grad_norm": 8.109882081629157, "learning_rate": 1.9942953827167443e-05, "loss": 0.3115, "step": 3060 }, { "epoch": 0.25, "grad_norm": 6.519937602019556, "learning_rate": 1.994103993653476e-05, "loss": 0.2873, "step": 3080 }, { "epoch": 0.25, "grad_norm": 43.825640526729615, "learning_rate": 1.9939094562959324e-05, "loss": 0.3084, "step": 3100 }, { "epoch": 0.25, "grad_norm": 6.740302289754918, "learning_rate": 1.993711771260268e-05, "loss": 0.2898, "step": 3120 }, { "epoch": 0.25, "grad_norm": 8.585263920916868, "learning_rate": 1.993510939172609e-05, "loss": 0.303, "step": 3140 }, { "epoch": 0.26, "grad_norm": 7.715090235382078, "learning_rate": 1.9933069606690468e-05, "loss": 0.3102, "step": 3160 }, { "epoch": 0.26, "grad_norm": 9.780531981807941, "learning_rate": 1.99309983639564e-05, "loss": 0.3077, "step": 3180 }, { "epoch": 0.26, "grad_norm": 4.606128027451412, "learning_rate": 1.99288956700841e-05, "loss": 0.3131, "step": 3200 }, { "epoch": 0.26, "grad_norm": 6.5456142622794875, "learning_rate": 1.9926761531733403e-05, "loss": 0.2899, "step": 3220 }, { "epoch": 0.26, "grad_norm": 6.881966685047346, "learning_rate": 1.9924595955663732e-05, "loss": 0.2834, "step": 3240 }, { "epoch": 0.26, "grad_norm": 6.086009895569889, "learning_rate": 1.9922398948734088e-05, "loss": 0.2887, "step": 3260 }, { "epoch": 0.26, "grad_norm": 5.329585705771699, "learning_rate": 1.992017051790301e-05, "loss": 0.2888, "step": 3280 }, { "epoch": 0.27, "grad_norm": 9.3500534790468, "learning_rate": 1.991791067022858e-05, "loss": 0.3168, "step": 3300 }, { "epoch": 0.27, "grad_norm": 6.741688450171789, "learning_rate": 1.9915619412868387e-05, "loss": 0.2703, "step": 3320 }, { "epoch": 0.27, "grad_norm": 6.86462812934889, "learning_rate": 1.9913296753079484e-05, "loss": 0.3141, "step": 3340 }, { "epoch": 0.27, "grad_norm": 6.6699035733643495, "learning_rate": 1.9910942698218404e-05, "loss": 0.2922, "step": 3360 }, { "epoch": 0.27, "grad_norm": 8.43685481112505, "learning_rate": 1.990855725574111e-05, "loss": 0.2836, "step": 3380 }, { "epoch": 0.27, "grad_norm": 10.06638161800925, "learning_rate": 1.990614043320298e-05, "loss": 0.2949, "step": 3400 }, { "epoch": 0.28, "grad_norm": 8.362504433942911, "learning_rate": 1.9903692238258783e-05, "loss": 0.2897, "step": 3420 }, { "epoch": 0.28, "grad_norm": 6.788699791177713, "learning_rate": 1.9901212678662646e-05, "loss": 0.2907, "step": 3440 }, { "epoch": 0.28, "grad_norm": 6.40238575575375, "learning_rate": 1.989870176226804e-05, "loss": 0.2609, "step": 3460 }, { "epoch": 0.28, "grad_norm": 5.227206933131435, "learning_rate": 1.9896159497027758e-05, "loss": 0.3162, "step": 3480 }, { "epoch": 0.28, "grad_norm": 6.191089860311128, "learning_rate": 1.9893585890993877e-05, "loss": 0.2998, "step": 3500 }, { "epoch": 0.28, "grad_norm": 8.156421644955156, "learning_rate": 1.9890980952317745e-05, "loss": 0.2683, "step": 3520 }, { "epoch": 0.29, "grad_norm": 7.62639892752842, "learning_rate": 1.9888344689249945e-05, "loss": 0.3138, "step": 3540 }, { "epoch": 0.29, "grad_norm": 8.10913885283575, "learning_rate": 1.9885677110140272e-05, "loss": 0.3098, "step": 3560 }, { "epoch": 0.29, "grad_norm": 5.974197538110473, "learning_rate": 1.988297822343771e-05, "loss": 0.2879, "step": 3580 }, { "epoch": 0.29, "grad_norm": 8.13170124417466, "learning_rate": 1.9880248037690406e-05, "loss": 0.2741, "step": 3600 }, { "epoch": 0.29, "grad_norm": 5.373939941911109, "learning_rate": 1.9877486561545635e-05, "loss": 0.2818, "step": 3620 }, { "epoch": 0.29, "grad_norm": 6.876975035910139, "learning_rate": 1.9874693803749786e-05, "loss": 0.2872, "step": 3640 }, { "epoch": 0.3, "grad_norm": 11.88859663115872, "learning_rate": 1.987186977314831e-05, "loss": 0.2787, "step": 3660 }, { "epoch": 0.3, "grad_norm": 5.296482127875842, "learning_rate": 1.9869014478685726e-05, "loss": 0.3125, "step": 3680 }, { "epoch": 0.3, "grad_norm": 10.902431223896663, "learning_rate": 1.986612792940556e-05, "loss": 0.2696, "step": 3700 }, { "epoch": 0.3, "grad_norm": 7.957172435618448, "learning_rate": 1.986321013445034e-05, "loss": 0.2846, "step": 3720 }, { "epoch": 0.3, "grad_norm": 5.49530713404051, "learning_rate": 1.9860261103061555e-05, "loss": 0.2904, "step": 3740 }, { "epoch": 0.3, "grad_norm": 6.7681775640908315, "learning_rate": 1.985728084457963e-05, "loss": 0.2907, "step": 3760 }, { "epoch": 0.31, "grad_norm": 11.417291183282801, "learning_rate": 1.9854269368443898e-05, "loss": 0.3124, "step": 3780 }, { "epoch": 0.31, "grad_norm": 9.165271676007183, "learning_rate": 1.985122668419255e-05, "loss": 0.2938, "step": 3800 }, { "epoch": 0.31, "grad_norm": 9.710590629489802, "learning_rate": 1.984815280146265e-05, "loss": 0.2805, "step": 3820 }, { "epoch": 0.31, "grad_norm": 10.32416184835814, "learning_rate": 1.9845047729990052e-05, "loss": 0.2939, "step": 3840 }, { "epoch": 0.31, "grad_norm": 6.123004510419631, "learning_rate": 1.984191147960941e-05, "loss": 0.3217, "step": 3860 }, { "epoch": 0.31, "grad_norm": 8.419418288045916, "learning_rate": 1.9838744060254113e-05, "loss": 0.2466, "step": 3880 }, { "epoch": 0.31, "grad_norm": 8.941869987837809, "learning_rate": 1.9835545481956295e-05, "loss": 0.3091, "step": 3900 }, { "epoch": 0.32, "grad_norm": 6.854852736746462, "learning_rate": 1.983231575484676e-05, "loss": 0.3094, "step": 3920 }, { "epoch": 0.32, "grad_norm": 10.162127205743055, "learning_rate": 1.9829054889154978e-05, "loss": 0.2988, "step": 3940 }, { "epoch": 0.32, "grad_norm": 6.1276753090877385, "learning_rate": 1.982576289520904e-05, "loss": 0.2875, "step": 3960 }, { "epoch": 0.32, "grad_norm": 6.806977159453115, "learning_rate": 1.982243978343562e-05, "loss": 0.2943, "step": 3980 }, { "epoch": 0.32, "grad_norm": 6.9055487505442015, "learning_rate": 1.9819085564359977e-05, "loss": 0.2911, "step": 4000 }, { "epoch": 0.32, "grad_norm": 7.466453294884225, "learning_rate": 1.9815700248605875e-05, "loss": 0.2902, "step": 4020 }, { "epoch": 0.33, "grad_norm": 5.488844395318609, "learning_rate": 1.9812283846895572e-05, "loss": 0.2773, "step": 4040 }, { "epoch": 0.33, "grad_norm": 5.492586688406755, "learning_rate": 1.9808836370049786e-05, "loss": 0.2942, "step": 4060 }, { "epoch": 0.33, "grad_norm": 6.870365314571275, "learning_rate": 1.980535782898766e-05, "loss": 0.3134, "step": 4080 }, { "epoch": 0.33, "grad_norm": 6.474349542297636, "learning_rate": 1.9801848234726733e-05, "loss": 0.278, "step": 4100 }, { "epoch": 0.33, "grad_norm": 7.02153354250866, "learning_rate": 1.9798307598382887e-05, "loss": 0.3008, "step": 4120 }, { "epoch": 0.33, "grad_norm": 6.5342549251431725, "learning_rate": 1.9794735931170323e-05, "loss": 0.2588, "step": 4140 }, { "epoch": 0.34, "grad_norm": 7.235161691162515, "learning_rate": 1.9791133244401536e-05, "loss": 0.2892, "step": 4160 }, { "epoch": 0.34, "grad_norm": 6.613883714897734, "learning_rate": 1.978749954948726e-05, "loss": 0.3042, "step": 4180 }, { "epoch": 0.34, "grad_norm": 5.588985182579549, "learning_rate": 1.978383485793645e-05, "loss": 0.2895, "step": 4200 }, { "epoch": 0.34, "grad_norm": 41.788686405813685, "learning_rate": 1.9780139181356223e-05, "loss": 0.2967, "step": 4220 }, { "epoch": 0.34, "grad_norm": 8.000329487691184, "learning_rate": 1.9776412531451845e-05, "loss": 0.3068, "step": 4240 }, { "epoch": 0.34, "grad_norm": 8.858664509374336, "learning_rate": 1.977265492002667e-05, "loss": 0.2904, "step": 4260 }, { "epoch": 0.35, "grad_norm": 6.412322055660321, "learning_rate": 1.9768866358982138e-05, "loss": 0.302, "step": 4280 }, { "epoch": 0.35, "grad_norm": 6.15402072878952, "learning_rate": 1.9765046860317697e-05, "loss": 0.2753, "step": 4300 }, { "epoch": 0.35, "grad_norm": 7.363823390602094, "learning_rate": 1.9761196436130792e-05, "loss": 0.3077, "step": 4320 }, { "epoch": 0.35, "grad_norm": 5.820012641709484, "learning_rate": 1.9757315098616813e-05, "loss": 0.3024, "step": 4340 }, { "epoch": 0.35, "grad_norm": 5.291771334516593, "learning_rate": 1.975340286006906e-05, "loss": 0.2732, "step": 4360 }, { "epoch": 0.35, "grad_norm": 5.880570388428466, "learning_rate": 1.9749459732878716e-05, "loss": 0.2491, "step": 4380 }, { "epoch": 0.36, "grad_norm": 5.655270317760537, "learning_rate": 1.9745485729534788e-05, "loss": 0.2803, "step": 4400 }, { "epoch": 0.36, "grad_norm": 6.013092379821028, "learning_rate": 1.974148086262408e-05, "loss": 0.2803, "step": 4420 }, { "epoch": 0.36, "grad_norm": 5.7211652147787975, "learning_rate": 1.9737445144831136e-05, "loss": 0.2637, "step": 4440 }, { "epoch": 0.36, "grad_norm": 8.131187547800137, "learning_rate": 1.973337858893824e-05, "loss": 0.3255, "step": 4460 }, { "epoch": 0.36, "grad_norm": 7.137552013307909, "learning_rate": 1.972928120782533e-05, "loss": 0.2668, "step": 4480 }, { "epoch": 0.36, "grad_norm": 5.150009725617049, "learning_rate": 1.972515301446998e-05, "loss": 0.2854, "step": 4500 }, { "epoch": 0.36, "grad_norm": 6.331424258094408, "learning_rate": 1.972099402194736e-05, "loss": 0.2866, "step": 4520 }, { "epoch": 0.37, "grad_norm": 8.392412446366174, "learning_rate": 1.9716804243430176e-05, "loss": 0.2616, "step": 4540 }, { "epoch": 0.37, "grad_norm": 7.299549339702017, "learning_rate": 1.971258369218867e-05, "loss": 0.2983, "step": 4560 }, { "epoch": 0.37, "grad_norm": 6.2169745129545575, "learning_rate": 1.970833238159051e-05, "loss": 0.276, "step": 4580 }, { "epoch": 0.37, "grad_norm": 7.506632605972847, "learning_rate": 1.9704050325100827e-05, "loss": 0.2951, "step": 4600 }, { "epoch": 0.37, "grad_norm": 6.66483208527068, "learning_rate": 1.969973753628211e-05, "loss": 0.2784, "step": 4620 }, { "epoch": 0.37, "grad_norm": 7.211925104193477, "learning_rate": 1.9695394028794195e-05, "loss": 0.2729, "step": 4640 }, { "epoch": 0.38, "grad_norm": 4.20991234694906, "learning_rate": 1.9691019816394204e-05, "loss": 0.3152, "step": 4660 }, { "epoch": 0.38, "grad_norm": 5.975539574939649, "learning_rate": 1.9686614912936516e-05, "loss": 0.2747, "step": 4680 }, { "epoch": 0.38, "grad_norm": 6.135748637813934, "learning_rate": 1.968217933237272e-05, "loss": 0.3028, "step": 4700 }, { "epoch": 0.38, "grad_norm": 8.994912298940163, "learning_rate": 1.9677713088751562e-05, "loss": 0.3043, "step": 4720 }, { "epoch": 0.38, "grad_norm": 7.649871286543558, "learning_rate": 1.967321619621892e-05, "loss": 0.2577, "step": 4740 }, { "epoch": 0.38, "grad_norm": 6.035703921853307, "learning_rate": 1.9668688669017722e-05, "loss": 0.2596, "step": 4760 }, { "epoch": 0.39, "grad_norm": 5.4070428696843615, "learning_rate": 1.9664130521487946e-05, "loss": 0.2885, "step": 4780 }, { "epoch": 0.39, "grad_norm": 9.68037240943506, "learning_rate": 1.9659541768066545e-05, "loss": 0.2739, "step": 4800 }, { "epoch": 0.39, "grad_norm": 7.032775442165197, "learning_rate": 1.965492242328741e-05, "loss": 0.2832, "step": 4820 }, { "epoch": 0.39, "grad_norm": 7.038266627020968, "learning_rate": 1.9650272501781326e-05, "loss": 0.3053, "step": 4840 }, { "epoch": 0.39, "grad_norm": 4.469246363249616, "learning_rate": 1.9645592018275917e-05, "loss": 0.2922, "step": 4860 }, { "epoch": 0.39, "grad_norm": 9.997476259295432, "learning_rate": 1.964088098759561e-05, "loss": 0.3029, "step": 4880 }, { "epoch": 0.4, "grad_norm": 4.1359256203786705, "learning_rate": 1.9636139424661588e-05, "loss": 0.2885, "step": 4900 }, { "epoch": 0.4, "grad_norm": 6.857022727186512, "learning_rate": 1.9631367344491735e-05, "loss": 0.263, "step": 4920 }, { "epoch": 0.4, "grad_norm": 6.720261230840821, "learning_rate": 1.9626564762200583e-05, "loss": 0.3083, "step": 4940 }, { "epoch": 0.4, "grad_norm": 7.113731977499931, "learning_rate": 1.9621731692999284e-05, "loss": 0.2789, "step": 4960 }, { "epoch": 0.4, "grad_norm": 6.665634774774537, "learning_rate": 1.961686815219555e-05, "loss": 0.2591, "step": 4980 }, { "epoch": 0.4, "grad_norm": 4.599220599612653, "learning_rate": 1.9611974155193597e-05, "loss": 0.2753, "step": 5000 }, { "epoch": 0.41, "grad_norm": 6.564880520618788, "learning_rate": 1.960704971749411e-05, "loss": 0.2805, "step": 5020 }, { "epoch": 0.41, "grad_norm": 7.418663470463415, "learning_rate": 1.9602094854694194e-05, "loss": 0.2782, "step": 5040 }, { "epoch": 0.41, "grad_norm": 5.491847672130194, "learning_rate": 1.9597109582487313e-05, "loss": 0.2702, "step": 5060 }, { "epoch": 0.41, "grad_norm": 12.43956214256869, "learning_rate": 1.9592093916663242e-05, "loss": 0.2972, "step": 5080 }, { "epoch": 0.41, "grad_norm": 8.800236820155485, "learning_rate": 1.958704787310803e-05, "loss": 0.2725, "step": 5100 }, { "epoch": 0.41, "grad_norm": 6.644759742176537, "learning_rate": 1.9581971467803934e-05, "loss": 0.289, "step": 5120 }, { "epoch": 0.41, "grad_norm": 5.009336147526538, "learning_rate": 1.9576864716829377e-05, "loss": 0.2969, "step": 5140 }, { "epoch": 0.42, "grad_norm": 5.803503477935393, "learning_rate": 1.95717276363589e-05, "loss": 0.2774, "step": 5160 }, { "epoch": 0.42, "grad_norm": 5.894993950320594, "learning_rate": 1.95665602426631e-05, "loss": 0.2273, "step": 5180 }, { "epoch": 0.42, "grad_norm": 6.96986305003759, "learning_rate": 1.956136255210859e-05, "loss": 0.2736, "step": 5200 }, { "epoch": 0.42, "grad_norm": 9.605041419937288, "learning_rate": 1.955613458115793e-05, "loss": 0.2907, "step": 5220 }, { "epoch": 0.42, "grad_norm": 6.647244804794919, "learning_rate": 1.9550876346369615e-05, "loss": 0.261, "step": 5240 }, { "epoch": 0.42, "grad_norm": 5.607026269896423, "learning_rate": 1.9545587864397955e-05, "loss": 0.3143, "step": 5260 }, { "epoch": 0.43, "grad_norm": 13.142033450455475, "learning_rate": 1.954026915199309e-05, "loss": 0.2434, "step": 5280 }, { "epoch": 0.43, "grad_norm": 3.248788167531875, "learning_rate": 1.9534920226000902e-05, "loss": 0.2705, "step": 5300 }, { "epoch": 0.43, "grad_norm": 13.164180730181236, "learning_rate": 1.9529541103362962e-05, "loss": 0.2862, "step": 5320 }, { "epoch": 0.43, "grad_norm": 5.831970091880435, "learning_rate": 1.9524131801116487e-05, "loss": 0.3054, "step": 5340 }, { "epoch": 0.43, "grad_norm": 5.446448668681817, "learning_rate": 1.951869233639428e-05, "loss": 0.2671, "step": 5360 }, { "epoch": 0.43, "grad_norm": 6.302315229032403, "learning_rate": 1.951322272642468e-05, "loss": 0.2765, "step": 5380 }, { "epoch": 0.44, "grad_norm": 11.243376074569383, "learning_rate": 1.9507722988531502e-05, "loss": 0.2582, "step": 5400 }, { "epoch": 0.44, "grad_norm": 7.673489589122099, "learning_rate": 1.9502193140133983e-05, "loss": 0.3143, "step": 5420 }, { "epoch": 0.44, "grad_norm": 7.9949329541838265, "learning_rate": 1.9496633198746736e-05, "loss": 0.2862, "step": 5440 }, { "epoch": 0.44, "grad_norm": 4.907903307657898, "learning_rate": 1.9491043181979677e-05, "loss": 0.2926, "step": 5460 }, { "epoch": 0.44, "grad_norm": 8.67569902665337, "learning_rate": 1.9485423107537986e-05, "loss": 0.2741, "step": 5480 }, { "epoch": 0.44, "grad_norm": 7.103858232561379, "learning_rate": 1.9479772993222038e-05, "loss": 0.2767, "step": 5500 }, { "epoch": 0.45, "grad_norm": 8.73970254143099, "learning_rate": 1.947409285692736e-05, "loss": 0.232, "step": 5520 }, { "epoch": 0.45, "grad_norm": 5.799420807956918, "learning_rate": 1.946838271664457e-05, "loss": 0.286, "step": 5540 }, { "epoch": 0.45, "grad_norm": 5.575038878985263, "learning_rate": 1.9462642590459306e-05, "loss": 0.2361, "step": 5560 }, { "epoch": 0.45, "grad_norm": 5.718902752613272, "learning_rate": 1.9456872496552184e-05, "loss": 0.2781, "step": 5580 }, { "epoch": 0.45, "grad_norm": 6.156497233891847, "learning_rate": 1.9451072453198742e-05, "loss": 0.2798, "step": 5600 }, { "epoch": 0.45, "grad_norm": 8.158434812741351, "learning_rate": 1.9445242478769374e-05, "loss": 0.2629, "step": 5620 }, { "epoch": 0.46, "grad_norm": 4.100053898526847, "learning_rate": 1.9439382591729265e-05, "loss": 0.2616, "step": 5640 }, { "epoch": 0.46, "grad_norm": 10.398486291325238, "learning_rate": 1.9433492810638355e-05, "loss": 0.281, "step": 5660 }, { "epoch": 0.46, "grad_norm": 7.054531497011973, "learning_rate": 1.942757315415126e-05, "loss": 0.2899, "step": 5680 }, { "epoch": 0.46, "grad_norm": 5.42866324225203, "learning_rate": 1.9421623641017218e-05, "loss": 0.3102, "step": 5700 }, { "epoch": 0.46, "grad_norm": 7.133905299895763, "learning_rate": 1.941564429008004e-05, "loss": 0.2616, "step": 5720 }, { "epoch": 0.46, "grad_norm": 6.043327611859842, "learning_rate": 1.9409635120278035e-05, "loss": 0.2614, "step": 5740 }, { "epoch": 0.46, "grad_norm": 7.125098244508386, "learning_rate": 1.9403596150643957e-05, "loss": 0.2732, "step": 5760 }, { "epoch": 0.47, "grad_norm": 7.809924868565428, "learning_rate": 1.9397527400304944e-05, "loss": 0.2537, "step": 5780 }, { "epoch": 0.47, "grad_norm": 6.721114625225486, "learning_rate": 1.9391428888482466e-05, "loss": 0.2935, "step": 5800 }, { "epoch": 0.47, "grad_norm": 5.41764363012664, "learning_rate": 1.9385300634492244e-05, "loss": 0.2644, "step": 5820 }, { "epoch": 0.47, "grad_norm": 6.577839969793495, "learning_rate": 1.937914265774421e-05, "loss": 0.2822, "step": 5840 }, { "epoch": 0.47, "grad_norm": 5.446653006796048, "learning_rate": 1.9372954977742437e-05, "loss": 0.2767, "step": 5860 }, { "epoch": 0.47, "grad_norm": 6.276051357995197, "learning_rate": 1.9366737614085067e-05, "loss": 0.2693, "step": 5880 }, { "epoch": 0.48, "grad_norm": 5.1174083399984935, "learning_rate": 1.9360490586464265e-05, "loss": 0.2968, "step": 5900 }, { "epoch": 0.48, "grad_norm": 8.060800772518713, "learning_rate": 1.9354213914666154e-05, "loss": 0.3042, "step": 5920 }, { "epoch": 0.48, "grad_norm": 6.946903213944759, "learning_rate": 1.934790761857074e-05, "loss": 0.2896, "step": 5940 }, { "epoch": 0.48, "grad_norm": 6.9058705221323855, "learning_rate": 1.934157171815187e-05, "loss": 0.2697, "step": 5960 }, { "epoch": 0.48, "grad_norm": 6.140873173559298, "learning_rate": 1.9335206233477138e-05, "loss": 0.3012, "step": 5980 }, { "epoch": 0.48, "grad_norm": 6.2613108710926415, "learning_rate": 1.9328811184707857e-05, "loss": 0.2616, "step": 6000 }, { "epoch": 0.49, "grad_norm": 7.103058610195544, "learning_rate": 1.932238659209897e-05, "loss": 0.2476, "step": 6020 }, { "epoch": 0.49, "grad_norm": 7.721454841424903, "learning_rate": 1.9315932475998994e-05, "loss": 0.2772, "step": 6040 }, { "epoch": 0.49, "grad_norm": 7.2979455852743245, "learning_rate": 1.930944885684996e-05, "loss": 0.2463, "step": 6060 }, { "epoch": 0.49, "grad_norm": 5.194835577851161, "learning_rate": 1.9302935755187335e-05, "loss": 0.2595, "step": 6080 }, { "epoch": 0.49, "grad_norm": 8.826615087967348, "learning_rate": 1.9296393191639976e-05, "loss": 0.2728, "step": 6100 }, { "epoch": 0.49, "grad_norm": 4.8811686686091, "learning_rate": 1.9289821186930038e-05, "loss": 0.2998, "step": 6120 }, { "epoch": 0.5, "grad_norm": 6.458493860362177, "learning_rate": 1.9283219761872943e-05, "loss": 0.2608, "step": 6140 }, { "epoch": 0.5, "grad_norm": 7.035199086314618, "learning_rate": 1.9276588937377293e-05, "loss": 0.2789, "step": 6160 }, { "epoch": 0.5, "grad_norm": 5.133893925330738, "learning_rate": 1.9269928734444792e-05, "loss": 0.2858, "step": 6180 }, { "epoch": 0.5, "grad_norm": 6.888562228890644, "learning_rate": 1.9263239174170203e-05, "loss": 0.263, "step": 6200 }, { "epoch": 0.5, "grad_norm": 7.247178153358964, "learning_rate": 1.9256520277741276e-05, "loss": 0.2887, "step": 6220 }, { "epoch": 0.5, "grad_norm": 5.8972079378636755, "learning_rate": 1.9249772066438676e-05, "loss": 0.2693, "step": 6240 }, { "epoch": 0.51, "grad_norm": 4.962455745470868, "learning_rate": 1.924299456163591e-05, "loss": 0.2749, "step": 6260 }, { "epoch": 0.51, "grad_norm": 7.258216936978947, "learning_rate": 1.9236187784799267e-05, "loss": 0.2957, "step": 6280 }, { "epoch": 0.51, "grad_norm": 6.194723517380506, "learning_rate": 1.9229351757487757e-05, "loss": 0.2773, "step": 6300 }, { "epoch": 0.51, "grad_norm": 6.5263758820129505, "learning_rate": 1.9222486501353027e-05, "loss": 0.293, "step": 6320 }, { "epoch": 0.51, "grad_norm": 6.366073941639377, "learning_rate": 1.9215592038139296e-05, "loss": 0.2755, "step": 6340 }, { "epoch": 0.51, "grad_norm": 116.03864354978906, "learning_rate": 1.9208668389683308e-05, "loss": 0.251, "step": 6360 }, { "epoch": 0.52, "grad_norm": 5.264420821601751, "learning_rate": 1.9201715577914223e-05, "loss": 0.2845, "step": 6380 }, { "epoch": 0.52, "grad_norm": 7.40581166704595, "learning_rate": 1.9194733624853584e-05, "loss": 0.2632, "step": 6400 }, { "epoch": 0.52, "grad_norm": 5.941729193844859, "learning_rate": 1.918772255261523e-05, "loss": 0.2597, "step": 6420 }, { "epoch": 0.52, "grad_norm": 6.312855927658963, "learning_rate": 1.9180682383405227e-05, "loss": 0.2692, "step": 6440 }, { "epoch": 0.52, "grad_norm": 6.071350558765477, "learning_rate": 1.9173613139521798e-05, "loss": 0.2731, "step": 6460 }, { "epoch": 0.52, "grad_norm": 8.993502072541519, "learning_rate": 1.9166514843355254e-05, "loss": 0.2548, "step": 6480 }, { "epoch": 0.52, "grad_norm": 7.005495911170442, "learning_rate": 1.9159387517387924e-05, "loss": 0.2612, "step": 6500 }, { "epoch": 0.53, "grad_norm": 7.665844444744844, "learning_rate": 1.915223118419409e-05, "loss": 0.2501, "step": 6520 }, { "epoch": 0.53, "grad_norm": 6.458334088069092, "learning_rate": 1.9145045866439892e-05, "loss": 0.2762, "step": 6540 }, { "epoch": 0.53, "grad_norm": 6.819348453934704, "learning_rate": 1.9137831586883288e-05, "loss": 0.2826, "step": 6560 }, { "epoch": 0.53, "grad_norm": 5.080353890354994, "learning_rate": 1.9130588368373958e-05, "loss": 0.2738, "step": 6580 }, { "epoch": 0.53, "grad_norm": 5.836344060052037, "learning_rate": 1.912331623385324e-05, "loss": 0.2586, "step": 6600 }, { "epoch": 0.53, "grad_norm": 8.357013119166787, "learning_rate": 1.9116015206354067e-05, "loss": 0.3174, "step": 6620 }, { "epoch": 0.54, "grad_norm": 4.307353833021694, "learning_rate": 1.9108685309000866e-05, "loss": 0.2721, "step": 6640 }, { "epoch": 0.54, "grad_norm": 7.002584797605542, "learning_rate": 1.9101326565009517e-05, "loss": 0.2581, "step": 6660 }, { "epoch": 0.54, "grad_norm": 7.299065402050334, "learning_rate": 1.909393899768726e-05, "loss": 0.2933, "step": 6680 }, { "epoch": 0.54, "grad_norm": 6.57009941780665, "learning_rate": 1.9086522630432638e-05, "loss": 0.2843, "step": 6700 }, { "epoch": 0.54, "grad_norm": 6.857683881351832, "learning_rate": 1.907907748673539e-05, "loss": 0.273, "step": 6720 }, { "epoch": 0.54, "grad_norm": 5.553536898181894, "learning_rate": 1.9071603590176417e-05, "loss": 0.2623, "step": 6740 }, { "epoch": 0.55, "grad_norm": 6.30566096206076, "learning_rate": 1.906410096442768e-05, "loss": 0.2366, "step": 6760 }, { "epoch": 0.55, "grad_norm": 10.352010603508685, "learning_rate": 1.9056569633252136e-05, "loss": 0.2546, "step": 6780 }, { "epoch": 0.55, "grad_norm": 7.80077709337333, "learning_rate": 1.9049009620503663e-05, "loss": 0.2763, "step": 6800 }, { "epoch": 0.55, "grad_norm": 8.03821543687826, "learning_rate": 1.9041420950126976e-05, "loss": 0.2486, "step": 6820 }, { "epoch": 0.55, "grad_norm": 5.390540434685423, "learning_rate": 1.9033803646157558e-05, "loss": 0.2964, "step": 6840 }, { "epoch": 0.55, "grad_norm": 7.430208109717449, "learning_rate": 1.9026157732721585e-05, "loss": 0.2681, "step": 6860 }, { "epoch": 0.56, "grad_norm": 7.857246125768761, "learning_rate": 1.9018483234035845e-05, "loss": 0.2719, "step": 6880 }, { "epoch": 0.56, "grad_norm": 5.565927371122475, "learning_rate": 1.901078017440767e-05, "loss": 0.2703, "step": 6900 }, { "epoch": 0.56, "grad_norm": 6.9976710123560375, "learning_rate": 1.9003048578234843e-05, "loss": 0.2566, "step": 6920 }, { "epoch": 0.56, "grad_norm": 6.478229745256355, "learning_rate": 1.899528847000554e-05, "loss": 0.253, "step": 6940 }, { "epoch": 0.56, "grad_norm": 5.116336291620225, "learning_rate": 1.898749987429823e-05, "loss": 0.2529, "step": 6960 }, { "epoch": 0.56, "grad_norm": 5.587216274900646, "learning_rate": 1.8979682815781627e-05, "loss": 0.2848, "step": 6980 }, { "epoch": 0.57, "grad_norm": 5.679239499396368, "learning_rate": 1.8971837319214586e-05, "loss": 0.2435, "step": 7000 }, { "epoch": 0.57, "grad_norm": 6.1169436663169074, "learning_rate": 1.8963963409446022e-05, "loss": 0.2793, "step": 7020 }, { "epoch": 0.57, "grad_norm": 5.802383972086084, "learning_rate": 1.8956061111414865e-05, "loss": 0.2717, "step": 7040 }, { "epoch": 0.57, "grad_norm": 6.507485407821351, "learning_rate": 1.8948130450149942e-05, "loss": 0.3011, "step": 7060 }, { "epoch": 0.57, "grad_norm": 5.9621692749685415, "learning_rate": 1.8940171450769924e-05, "loss": 0.3076, "step": 7080 }, { "epoch": 0.57, "grad_norm": 54.193219403324576, "learning_rate": 1.8932184138483223e-05, "loss": 0.2629, "step": 7100 }, { "epoch": 0.57, "grad_norm": 7.104622864455409, "learning_rate": 1.8924168538587956e-05, "loss": 0.2714, "step": 7120 }, { "epoch": 0.58, "grad_norm": 7.0480173880874615, "learning_rate": 1.8916124676471797e-05, "loss": 0.2736, "step": 7140 }, { "epoch": 0.58, "grad_norm": 8.412429750327739, "learning_rate": 1.8908052577611958e-05, "loss": 0.2644, "step": 7160 }, { "epoch": 0.58, "grad_norm": 8.78692031185526, "learning_rate": 1.8899952267575083e-05, "loss": 0.2402, "step": 7180 }, { "epoch": 0.58, "grad_norm": 6.684638721458758, "learning_rate": 1.889182377201716e-05, "loss": 0.2542, "step": 7200 }, { "epoch": 0.58, "grad_norm": 6.010627553359556, "learning_rate": 1.8883667116683457e-05, "loss": 0.2838, "step": 7220 }, { "epoch": 0.58, "grad_norm": 7.414666524064716, "learning_rate": 1.887548232740843e-05, "loss": 0.2851, "step": 7240 }, { "epoch": 0.59, "grad_norm": 8.34076720224061, "learning_rate": 1.886726943011564e-05, "loss": 0.2516, "step": 7260 }, { "epoch": 0.59, "grad_norm": 6.094640422225146, "learning_rate": 1.885902845081767e-05, "loss": 0.2313, "step": 7280 }, { "epoch": 0.59, "grad_norm": 4.148097252407384, "learning_rate": 1.8850759415616066e-05, "loss": 0.2689, "step": 7300 }, { "epoch": 0.59, "grad_norm": 7.561235558598041, "learning_rate": 1.8842462350701212e-05, "loss": 0.2983, "step": 7320 }, { "epoch": 0.59, "grad_norm": 7.87091273248736, "learning_rate": 1.883413728235228e-05, "loss": 0.2386, "step": 7340 }, { "epoch": 0.59, "grad_norm": 6.197625708495748, "learning_rate": 1.8825784236937146e-05, "loss": 0.282, "step": 7360 }, { "epoch": 0.6, "grad_norm": 9.524108011926973, "learning_rate": 1.8817403240912283e-05, "loss": 0.2776, "step": 7380 }, { "epoch": 0.6, "grad_norm": 5.147943823246307, "learning_rate": 1.8808994320822693e-05, "loss": 0.2625, "step": 7400 }, { "epoch": 0.6, "grad_norm": 5.764109011612628, "learning_rate": 1.8800557503301827e-05, "loss": 0.2859, "step": 7420 }, { "epoch": 0.6, "grad_norm": 7.7970156609149335, "learning_rate": 1.8792092815071498e-05, "loss": 0.2589, "step": 7440 }, { "epoch": 0.6, "grad_norm": 6.220123292737489, "learning_rate": 1.8783600282941782e-05, "loss": 0.269, "step": 7460 }, { "epoch": 0.6, "grad_norm": 6.875941264134116, "learning_rate": 1.877507993381096e-05, "loss": 0.2624, "step": 7480 }, { "epoch": 0.61, "grad_norm": 5.721394912188018, "learning_rate": 1.8766531794665402e-05, "loss": 0.2571, "step": 7500 }, { "epoch": 0.61, "grad_norm": 6.99318335916291, "learning_rate": 1.8757955892579504e-05, "loss": 0.26, "step": 7520 }, { "epoch": 0.61, "grad_norm": 6.692727585899676, "learning_rate": 1.87493522547156e-05, "loss": 0.2635, "step": 7540 }, { "epoch": 0.61, "grad_norm": 6.5007755110350525, "learning_rate": 1.874072090832386e-05, "loss": 0.2754, "step": 7560 }, { "epoch": 0.61, "grad_norm": 7.775379340923738, "learning_rate": 1.873206188074223e-05, "loss": 0.2708, "step": 7580 }, { "epoch": 0.61, "grad_norm": 4.970941791912674, "learning_rate": 1.872337519939631e-05, "loss": 0.2592, "step": 7600 }, { "epoch": 0.62, "grad_norm": 7.276189396167904, "learning_rate": 1.8714660891799302e-05, "loss": 0.2648, "step": 7620 }, { "epoch": 0.62, "grad_norm": 6.724776718800752, "learning_rate": 1.870591898555191e-05, "loss": 0.2606, "step": 7640 }, { "epoch": 0.62, "grad_norm": 7.735768695454274, "learning_rate": 1.8697149508342237e-05, "loss": 0.2511, "step": 7660 }, { "epoch": 0.62, "grad_norm": 5.05914779633595, "learning_rate": 1.868835248794573e-05, "loss": 0.2609, "step": 7680 }, { "epoch": 0.62, "grad_norm": 8.031314122281715, "learning_rate": 1.8679527952225054e-05, "loss": 0.2718, "step": 7700 }, { "epoch": 0.62, "grad_norm": 5.619780768194464, "learning_rate": 1.867067592913004e-05, "loss": 0.2717, "step": 7720 }, { "epoch": 0.62, "grad_norm": 7.595427904662886, "learning_rate": 1.8661796446697557e-05, "loss": 0.2536, "step": 7740 }, { "epoch": 0.63, "grad_norm": 5.462276616537402, "learning_rate": 1.8652889533051473e-05, "loss": 0.2674, "step": 7760 }, { "epoch": 0.63, "grad_norm": 5.302383844019715, "learning_rate": 1.864395521640252e-05, "loss": 0.2856, "step": 7780 }, { "epoch": 0.63, "grad_norm": 4.703940083284321, "learning_rate": 1.8634993525048227e-05, "loss": 0.2609, "step": 7800 }, { "epoch": 0.63, "grad_norm": 6.585961827134786, "learning_rate": 1.862600448737283e-05, "loss": 0.265, "step": 7820 }, { "epoch": 0.63, "grad_norm": 7.27689896277283, "learning_rate": 1.861698813184717e-05, "loss": 0.3018, "step": 7840 }, { "epoch": 0.63, "grad_norm": 6.231232809733686, "learning_rate": 1.860794448702863e-05, "loss": 0.2268, "step": 7860 }, { "epoch": 0.64, "grad_norm": 7.794911353272152, "learning_rate": 1.8598873581561e-05, "loss": 0.2632, "step": 7880 }, { "epoch": 0.64, "grad_norm": 6.977335614708055, "learning_rate": 1.8589775444174436e-05, "loss": 0.3097, "step": 7900 }, { "epoch": 0.64, "grad_norm": 7.607942857642037, "learning_rate": 1.858065010368533e-05, "loss": 0.2658, "step": 7920 }, { "epoch": 0.64, "grad_norm": 6.109669397778123, "learning_rate": 1.857149758899624e-05, "loss": 0.2613, "step": 7940 }, { "epoch": 0.64, "grad_norm": 6.142102090556645, "learning_rate": 1.8562317929095796e-05, "loss": 0.2769, "step": 7960 }, { "epoch": 0.64, "grad_norm": 3.98370343700879, "learning_rate": 1.8553111153058593e-05, "loss": 0.2642, "step": 7980 }, { "epoch": 0.65, "grad_norm": 6.375900504146025, "learning_rate": 1.8543877290045122e-05, "loss": 0.2646, "step": 8000 }, { "epoch": 0.65, "grad_norm": 7.277577534154136, "learning_rate": 1.853461636930166e-05, "loss": 0.2806, "step": 8020 }, { "epoch": 0.65, "grad_norm": 6.81435963858201, "learning_rate": 1.852532842016019e-05, "loss": 0.2536, "step": 8040 }, { "epoch": 0.65, "grad_norm": 5.854006003712663, "learning_rate": 1.851601347203829e-05, "loss": 0.2447, "step": 8060 }, { "epoch": 0.65, "grad_norm": 7.787886275359923, "learning_rate": 1.8506671554439064e-05, "loss": 0.2663, "step": 8080 }, { "epoch": 0.65, "grad_norm": 9.21089898409568, "learning_rate": 1.849730269695103e-05, "loss": 0.2601, "step": 8100 }, { "epoch": 0.66, "grad_norm": 8.590889527489873, "learning_rate": 1.8487906929248028e-05, "loss": 0.2531, "step": 8120 }, { "epoch": 0.66, "grad_norm": 5.551684548356732, "learning_rate": 1.8478484281089143e-05, "loss": 0.2605, "step": 8140 }, { "epoch": 0.66, "grad_norm": 4.227848217032472, "learning_rate": 1.8469034782318585e-05, "loss": 0.2728, "step": 8160 }, { "epoch": 0.66, "grad_norm": 6.015758166139706, "learning_rate": 1.8459558462865613e-05, "loss": 0.2883, "step": 8180 }, { "epoch": 0.66, "grad_norm": 6.568658173678755, "learning_rate": 1.845005535274444e-05, "loss": 0.2454, "step": 8200 }, { "epoch": 0.66, "grad_norm": 5.985752674217696, "learning_rate": 1.844052548205412e-05, "loss": 0.2442, "step": 8220 }, { "epoch": 0.67, "grad_norm": 6.3219678524060425, "learning_rate": 1.843096888097848e-05, "loss": 0.2912, "step": 8240 }, { "epoch": 0.67, "grad_norm": 4.096257730243316, "learning_rate": 1.8421385579785997e-05, "loss": 0.2636, "step": 8260 }, { "epoch": 0.67, "grad_norm": 6.396648972118899, "learning_rate": 1.8411775608829722e-05, "loss": 0.2324, "step": 8280 }, { "epoch": 0.67, "grad_norm": 4.782379216505, "learning_rate": 1.8402138998547174e-05, "loss": 0.2675, "step": 8300 }, { "epoch": 0.67, "grad_norm": 8.676707198167653, "learning_rate": 1.839247577946025e-05, "loss": 0.2843, "step": 8320 }, { "epoch": 0.67, "grad_norm": 5.39138478992206, "learning_rate": 1.8382785982175118e-05, "loss": 0.2742, "step": 8340 }, { "epoch": 0.67, "grad_norm": 5.818443622984385, "learning_rate": 1.8373069637382136e-05, "loss": 0.26, "step": 8360 }, { "epoch": 0.68, "grad_norm": 8.95366226368456, "learning_rate": 1.8363326775855737e-05, "loss": 0.2687, "step": 8380 }, { "epoch": 0.68, "grad_norm": 7.96756080281063, "learning_rate": 1.8353557428454346e-05, "loss": 0.2425, "step": 8400 }, { "epoch": 0.68, "grad_norm": 6.577104865413394, "learning_rate": 1.8343761626120272e-05, "loss": 0.2688, "step": 8420 }, { "epoch": 0.68, "grad_norm": 6.6269283727065, "learning_rate": 1.8333939399879617e-05, "loss": 0.2808, "step": 8440 }, { "epoch": 0.68, "grad_norm": 7.016348140974161, "learning_rate": 1.8324090780842173e-05, "loss": 0.2511, "step": 8460 }, { "epoch": 0.68, "grad_norm": 7.5454363034081116, "learning_rate": 1.831421580020133e-05, "loss": 0.252, "step": 8480 }, { "epoch": 0.69, "grad_norm": 5.837760589468463, "learning_rate": 1.830431448923396e-05, "loss": 0.2728, "step": 8500 }, { "epoch": 0.69, "grad_norm": 6.154380243306325, "learning_rate": 1.8294386879300353e-05, "loss": 0.2867, "step": 8520 }, { "epoch": 0.69, "grad_norm": 7.71122937485844, "learning_rate": 1.8284433001844073e-05, "loss": 0.2302, "step": 8540 }, { "epoch": 0.69, "grad_norm": 6.86335128201322, "learning_rate": 1.8274452888391894e-05, "loss": 0.2586, "step": 8560 }, { "epoch": 0.69, "grad_norm": 5.661853354206643, "learning_rate": 1.8264446570553682e-05, "loss": 0.2505, "step": 8580 }, { "epoch": 0.69, "grad_norm": 5.982364804963667, "learning_rate": 1.82544140800223e-05, "loss": 0.2673, "step": 8600 }, { "epoch": 0.7, "grad_norm": 4.739028708176796, "learning_rate": 1.824435544857351e-05, "loss": 0.2678, "step": 8620 }, { "epoch": 0.7, "grad_norm": 4.91420533377473, "learning_rate": 1.823427070806587e-05, "loss": 0.2559, "step": 8640 }, { "epoch": 0.7, "grad_norm": 5.618249360419533, "learning_rate": 1.8224159890440623e-05, "loss": 0.2493, "step": 8660 }, { "epoch": 0.7, "grad_norm": 5.896677808188606, "learning_rate": 1.821402302772162e-05, "loss": 0.2585, "step": 8680 }, { "epoch": 0.7, "grad_norm": 6.073985124124518, "learning_rate": 1.82038601520152e-05, "loss": 0.2452, "step": 8700 }, { "epoch": 0.7, "grad_norm": 7.1459209410818, "learning_rate": 1.819367129551008e-05, "loss": 0.2592, "step": 8720 }, { "epoch": 0.71, "grad_norm": 6.390094315335785, "learning_rate": 1.8183456490477287e-05, "loss": 0.2461, "step": 8740 }, { "epoch": 0.71, "grad_norm": 5.294426005863845, "learning_rate": 1.8173215769270015e-05, "loss": 0.2685, "step": 8760 }, { "epoch": 0.71, "grad_norm": 5.276924483715485, "learning_rate": 1.8162949164323554e-05, "loss": 0.2615, "step": 8780 }, { "epoch": 0.71, "grad_norm": 7.331765382932756, "learning_rate": 1.8152656708155173e-05, "loss": 0.2828, "step": 8800 }, { "epoch": 0.71, "grad_norm": 5.361402122667844, "learning_rate": 1.8142338433364012e-05, "loss": 0.2849, "step": 8820 }, { "epoch": 0.71, "grad_norm": 6.712375473487036, "learning_rate": 1.8131994372630995e-05, "loss": 0.2716, "step": 8840 }, { "epoch": 0.72, "grad_norm": 8.103353922148388, "learning_rate": 1.812162455871872e-05, "loss": 0.2703, "step": 8860 }, { "epoch": 0.72, "grad_norm": 4.585974100152074, "learning_rate": 1.8111229024471334e-05, "loss": 0.2386, "step": 8880 }, { "epoch": 0.72, "grad_norm": 6.8332489132512375, "learning_rate": 1.8100807802814467e-05, "loss": 0.2935, "step": 8900 }, { "epoch": 0.72, "grad_norm": 5.556964992180211, "learning_rate": 1.80903609267551e-05, "loss": 0.2404, "step": 8920 }, { "epoch": 0.72, "grad_norm": 6.524527124099894, "learning_rate": 1.8079888429381472e-05, "loss": 0.2477, "step": 8940 }, { "epoch": 0.72, "grad_norm": 6.394125877212817, "learning_rate": 1.8069390343862972e-05, "loss": 0.2585, "step": 8960 }, { "epoch": 0.72, "grad_norm": 7.212304875264878, "learning_rate": 1.805886670345003e-05, "loss": 0.2514, "step": 8980 }, { "epoch": 0.73, "grad_norm": 5.915336602662839, "learning_rate": 1.8048317541474015e-05, "loss": 0.2554, "step": 9000 }, { "epoch": 0.73, "grad_norm": 6.204874325324116, "learning_rate": 1.803774289134714e-05, "loss": 0.2663, "step": 9020 }, { "epoch": 0.73, "grad_norm": 4.9458264028130525, "learning_rate": 1.8027142786562334e-05, "loss": 0.2374, "step": 9040 }, { "epoch": 0.73, "grad_norm": 5.66437734846908, "learning_rate": 1.8016517260693152e-05, "loss": 0.2173, "step": 9060 }, { "epoch": 0.73, "grad_norm": 8.8145498502476, "learning_rate": 1.800586634739367e-05, "loss": 0.2672, "step": 9080 }, { "epoch": 0.73, "grad_norm": 5.225621616310874, "learning_rate": 1.799519008039837e-05, "loss": 0.263, "step": 9100 }, { "epoch": 0.74, "grad_norm": 6.749141497235558, "learning_rate": 1.7984488493522033e-05, "loss": 0.294, "step": 9120 }, { "epoch": 0.74, "grad_norm": 6.5925500148457115, "learning_rate": 1.7973761620659645e-05, "loss": 0.2549, "step": 9140 }, { "epoch": 0.74, "grad_norm": 3.6612011894705097, "learning_rate": 1.7963009495786262e-05, "loss": 0.274, "step": 9160 }, { "epoch": 0.74, "grad_norm": 7.730637018917763, "learning_rate": 1.795223215295694e-05, "loss": 0.2476, "step": 9180 }, { "epoch": 0.74, "grad_norm": 5.253387992852078, "learning_rate": 1.7941429626306597e-05, "loss": 0.2557, "step": 9200 }, { "epoch": 0.74, "grad_norm": 6.185451592355014, "learning_rate": 1.7930601950049918e-05, "loss": 0.2414, "step": 9220 }, { "epoch": 0.75, "grad_norm": 8.613330410148825, "learning_rate": 1.7919749158481238e-05, "loss": 0.252, "step": 9240 }, { "epoch": 0.75, "grad_norm": 4.082779957130279, "learning_rate": 1.7908871285974452e-05, "loss": 0.246, "step": 9260 }, { "epoch": 0.75, "grad_norm": 5.080789002249157, "learning_rate": 1.789796836698288e-05, "loss": 0.2241, "step": 9280 }, { "epoch": 0.75, "grad_norm": 5.616004872409631, "learning_rate": 1.788704043603918e-05, "loss": 0.2635, "step": 9300 }, { "epoch": 0.75, "grad_norm": 4.6896605535132005, "learning_rate": 1.787608752775523e-05, "loss": 0.2496, "step": 9320 }, { "epoch": 0.75, "grad_norm": 6.020003669712304, "learning_rate": 1.786510967682201e-05, "loss": 0.2742, "step": 9340 }, { "epoch": 0.76, "grad_norm": 4.869330214670387, "learning_rate": 1.7854106918009516e-05, "loss": 0.2554, "step": 9360 }, { "epoch": 0.76, "grad_norm": 5.463125770044224, "learning_rate": 1.7843079286166613e-05, "loss": 0.256, "step": 9380 }, { "epoch": 0.76, "grad_norm": 12.859151326084799, "learning_rate": 1.7832026816220964e-05, "loss": 0.3044, "step": 9400 }, { "epoch": 0.76, "grad_norm": 7.462079888408213, "learning_rate": 1.7820949543178893e-05, "loss": 0.2603, "step": 9420 }, { "epoch": 0.76, "grad_norm": 6.251675190537996, "learning_rate": 1.7809847502125287e-05, "loss": 0.2524, "step": 9440 }, { "epoch": 0.76, "grad_norm": 20.20686096910179, "learning_rate": 1.779872072822348e-05, "loss": 0.2727, "step": 9460 }, { "epoch": 0.77, "grad_norm": 11.117280832355938, "learning_rate": 1.7787569256715128e-05, "loss": 0.2751, "step": 9480 }, { "epoch": 0.77, "grad_norm": 6.174365967852932, "learning_rate": 1.7776393122920136e-05, "loss": 0.2465, "step": 9500 }, { "epoch": 0.77, "grad_norm": 6.5845686642808205, "learning_rate": 1.7765192362236505e-05, "loss": 0.2637, "step": 9520 }, { "epoch": 0.77, "grad_norm": 9.227894944405277, "learning_rate": 1.775396701014024e-05, "loss": 0.2594, "step": 9540 }, { "epoch": 0.77, "grad_norm": 6.0294211980015255, "learning_rate": 1.7742717102185233e-05, "loss": 0.2506, "step": 9560 }, { "epoch": 0.77, "grad_norm": 6.611585459356701, "learning_rate": 1.7731442674003153e-05, "loss": 0.256, "step": 9580 }, { "epoch": 0.77, "grad_norm": 6.474013099428535, "learning_rate": 1.772014376130333e-05, "loss": 0.2509, "step": 9600 }, { "epoch": 0.78, "grad_norm": 4.050917561517386, "learning_rate": 1.7708820399872644e-05, "loss": 0.2597, "step": 9620 }, { "epoch": 0.78, "grad_norm": 7.523512541811629, "learning_rate": 1.7697472625575415e-05, "loss": 0.2617, "step": 9640 }, { "epoch": 0.78, "grad_norm": 4.674855993980255, "learning_rate": 1.768610047435328e-05, "loss": 0.2148, "step": 9660 }, { "epoch": 0.78, "grad_norm": 3.581193699152847, "learning_rate": 1.7674703982225084e-05, "loss": 0.2485, "step": 9680 }, { "epoch": 0.78, "grad_norm": 5.995347444394187, "learning_rate": 1.7663283185286778e-05, "loss": 0.2504, "step": 9700 }, { "epoch": 0.78, "grad_norm": 6.106039165812286, "learning_rate": 1.7651838119711278e-05, "loss": 0.2591, "step": 9720 }, { "epoch": 0.79, "grad_norm": 5.544368037680747, "learning_rate": 1.7640368821748374e-05, "loss": 0.2589, "step": 9740 }, { "epoch": 0.79, "grad_norm": 11.908781488384356, "learning_rate": 1.7628875327724604e-05, "loss": 0.24, "step": 9760 }, { "epoch": 0.79, "grad_norm": 5.2162186199664005, "learning_rate": 1.761735767404314e-05, "loss": 0.279, "step": 9780 }, { "epoch": 0.79, "grad_norm": 8.332009731717408, "learning_rate": 1.760581589718369e-05, "loss": 0.2523, "step": 9800 }, { "epoch": 0.79, "grad_norm": 6.811834460305066, "learning_rate": 1.759425003370234e-05, "loss": 0.2422, "step": 9820 }, { "epoch": 0.79, "grad_norm": 10.001650864708848, "learning_rate": 1.758266012023149e-05, "loss": 0.2415, "step": 9840 }, { "epoch": 0.8, "grad_norm": 14.181135321229519, "learning_rate": 1.7571046193479697e-05, "loss": 0.2439, "step": 9860 }, { "epoch": 0.8, "grad_norm": 5.304371617930666, "learning_rate": 1.7559408290231582e-05, "loss": 0.2883, "step": 9880 }, { "epoch": 0.8, "grad_norm": 10.159891549680514, "learning_rate": 1.754774644734771e-05, "loss": 0.2402, "step": 9900 }, { "epoch": 0.8, "grad_norm": 21.596871665189294, "learning_rate": 1.753606070176446e-05, "loss": 0.2646, "step": 9920 }, { "epoch": 0.8, "grad_norm": 3.6266946448855064, "learning_rate": 1.752435109049392e-05, "loss": 0.2463, "step": 9940 }, { "epoch": 0.8, "grad_norm": 7.461139967802549, "learning_rate": 1.7512617650623776e-05, "loss": 0.2343, "step": 9960 }, { "epoch": 0.81, "grad_norm": 5.8844648373593955, "learning_rate": 1.7500860419317183e-05, "loss": 0.251, "step": 9980 }, { "epoch": 0.81, "grad_norm": 9.038354738793856, "learning_rate": 1.7489079433812638e-05, "loss": 0.2494, "step": 10000 }, { "epoch": 0.81, "grad_norm": 8.591404154257724, "learning_rate": 1.7477274731423892e-05, "loss": 0.2374, "step": 10020 }, { "epoch": 0.81, "grad_norm": 5.9870710947999815, "learning_rate": 1.7465446349539797e-05, "loss": 0.2206, "step": 10040 }, { "epoch": 0.81, "grad_norm": 6.228813578147013, "learning_rate": 1.7453594325624224e-05, "loss": 0.2462, "step": 10060 }, { "epoch": 0.81, "grad_norm": 5.257017078287017, "learning_rate": 1.7441718697215904e-05, "loss": 0.2409, "step": 10080 }, { "epoch": 0.82, "grad_norm": 6.952956019716318, "learning_rate": 1.742981950192835e-05, "loss": 0.2521, "step": 10100 }, { "epoch": 0.82, "grad_norm": 5.5548892299756805, "learning_rate": 1.7417896777449706e-05, "loss": 0.2647, "step": 10120 }, { "epoch": 0.82, "grad_norm": 5.73273030739662, "learning_rate": 1.7405950561542636e-05, "loss": 0.2473, "step": 10140 }, { "epoch": 0.82, "grad_norm": 5.8226292447674775, "learning_rate": 1.7393980892044222e-05, "loss": 0.2799, "step": 10160 }, { "epoch": 0.82, "grad_norm": 6.573153903103647, "learning_rate": 1.738198780686582e-05, "loss": 0.2391, "step": 10180 }, { "epoch": 0.82, "grad_norm": 6.2081294015592094, "learning_rate": 1.7369971343992953e-05, "loss": 0.2441, "step": 10200 }, { "epoch": 0.82, "grad_norm": 7.239395541675969, "learning_rate": 1.735793154148519e-05, "loss": 0.2467, "step": 10220 }, { "epoch": 0.83, "grad_norm": 6.574019720880623, "learning_rate": 1.7345868437476016e-05, "loss": 0.2742, "step": 10240 }, { "epoch": 0.83, "grad_norm": 3.932079883792344, "learning_rate": 1.733378207017273e-05, "loss": 0.2799, "step": 10260 }, { "epoch": 0.83, "grad_norm": 7.965596611059161, "learning_rate": 1.7321672477856297e-05, "loss": 0.268, "step": 10280 }, { "epoch": 0.83, "grad_norm": 6.637332593742831, "learning_rate": 1.730953969888126e-05, "loss": 0.281, "step": 10300 }, { "epoch": 0.83, "grad_norm": 4.598400020154981, "learning_rate": 1.729738377167559e-05, "loss": 0.2688, "step": 10320 }, { "epoch": 0.83, "grad_norm": 10.008276375495472, "learning_rate": 1.728520473474057e-05, "loss": 0.2424, "step": 10340 }, { "epoch": 0.84, "grad_norm": 9.609588968019253, "learning_rate": 1.7273002626650693e-05, "loss": 0.2562, "step": 10360 }, { "epoch": 0.84, "grad_norm": 6.246946580790647, "learning_rate": 1.726077748605352e-05, "loss": 0.2536, "step": 10380 }, { "epoch": 0.84, "grad_norm": 5.207954250527354, "learning_rate": 1.724852935166955e-05, "loss": 0.2803, "step": 10400 }, { "epoch": 0.84, "grad_norm": 6.83554630577102, "learning_rate": 1.723625826229212e-05, "loss": 0.2366, "step": 10420 }, { "epoch": 0.84, "grad_norm": 5.2741649888827, "learning_rate": 1.7223964256787275e-05, "loss": 0.2589, "step": 10440 }, { "epoch": 0.84, "grad_norm": 4.504793580943435, "learning_rate": 1.7211647374093644e-05, "loss": 0.2654, "step": 10460 }, { "epoch": 0.85, "grad_norm": 5.074320615196733, "learning_rate": 1.71993076532223e-05, "loss": 0.2531, "step": 10480 }, { "epoch": 0.85, "grad_norm": 7.4921309833960645, "learning_rate": 1.7186945133256663e-05, "loss": 0.2452, "step": 10500 }, { "epoch": 0.85, "grad_norm": 4.773435701909952, "learning_rate": 1.7174559853352366e-05, "loss": 0.2786, "step": 10520 }, { "epoch": 0.85, "grad_norm": 5.190944401366304, "learning_rate": 1.7162151852737114e-05, "loss": 0.2082, "step": 10540 }, { "epoch": 0.85, "grad_norm": 6.8860794956428215, "learning_rate": 1.7149721170710597e-05, "loss": 0.2593, "step": 10560 }, { "epoch": 0.85, "grad_norm": 5.315969613200098, "learning_rate": 1.7137267846644324e-05, "loss": 0.2451, "step": 10580 }, { "epoch": 0.86, "grad_norm": 8.924983723943493, "learning_rate": 1.712479191998153e-05, "loss": 0.2487, "step": 10600 }, { "epoch": 0.86, "grad_norm": 4.785603454868163, "learning_rate": 1.711229343023703e-05, "loss": 0.275, "step": 10620 }, { "epoch": 0.86, "grad_norm": 4.5511584473505895, "learning_rate": 1.709977241699711e-05, "loss": 0.2438, "step": 10640 }, { "epoch": 0.86, "grad_norm": 6.601440573023448, "learning_rate": 1.7087228919919395e-05, "loss": 0.2682, "step": 10660 }, { "epoch": 0.86, "grad_norm": 8.06521205975687, "learning_rate": 1.7074662978732713e-05, "loss": 0.2672, "step": 10680 }, { "epoch": 0.86, "grad_norm": 5.877886448612562, "learning_rate": 1.7062074633236992e-05, "loss": 0.2415, "step": 10700 }, { "epoch": 0.87, "grad_norm": 6.00267509589556, "learning_rate": 1.704946392330311e-05, "loss": 0.245, "step": 10720 }, { "epoch": 0.87, "grad_norm": 18.727472632503616, "learning_rate": 1.703683088887278e-05, "loss": 0.2527, "step": 10740 }, { "epoch": 0.87, "grad_norm": 8.42578939933542, "learning_rate": 1.7024175569958435e-05, "loss": 0.2447, "step": 10760 }, { "epoch": 0.87, "grad_norm": 15.871158165018187, "learning_rate": 1.7011498006643075e-05, "loss": 0.2611, "step": 10780 }, { "epoch": 0.87, "grad_norm": 4.623538224443551, "learning_rate": 1.6998798239080167e-05, "loss": 0.2521, "step": 10800 }, { "epoch": 0.87, "grad_norm": 6.908983060916792, "learning_rate": 1.698607630749349e-05, "loss": 0.2298, "step": 10820 }, { "epoch": 0.88, "grad_norm": 6.502465294111384, "learning_rate": 1.6973332252177036e-05, "loss": 0.2498, "step": 10840 }, { "epoch": 0.88, "grad_norm": 4.978479228853818, "learning_rate": 1.6960566113494865e-05, "loss": 0.252, "step": 10860 }, { "epoch": 0.88, "grad_norm": 5.650381173298351, "learning_rate": 1.694777793188098e-05, "loss": 0.2288, "step": 10880 }, { "epoch": 0.88, "grad_norm": 7.073746360539243, "learning_rate": 1.6934967747839202e-05, "loss": 0.2519, "step": 10900 }, { "epoch": 0.88, "grad_norm": 5.927901369661737, "learning_rate": 1.6922135601943037e-05, "loss": 0.265, "step": 10920 }, { "epoch": 0.88, "grad_norm": 5.53567758715019, "learning_rate": 1.690928153483555e-05, "loss": 0.25, "step": 10940 }, { "epoch": 0.88, "grad_norm": 7.570944618942586, "learning_rate": 1.6896405587229247e-05, "loss": 0.2549, "step": 10960 }, { "epoch": 0.89, "grad_norm": 7.379565103013804, "learning_rate": 1.6883507799905922e-05, "loss": 0.2363, "step": 10980 }, { "epoch": 0.89, "grad_norm": 9.023229502472875, "learning_rate": 1.6870588213716555e-05, "loss": 0.2832, "step": 11000 }, { "epoch": 0.89, "grad_norm": 5.6792260655491855, "learning_rate": 1.6857646869581153e-05, "loss": 0.228, "step": 11020 }, { "epoch": 0.89, "grad_norm": 7.456793627942026, "learning_rate": 1.6844683808488647e-05, "loss": 0.2494, "step": 11040 }, { "epoch": 0.89, "grad_norm": 4.8011477449229885, "learning_rate": 1.6831699071496758e-05, "loss": 0.2634, "step": 11060 }, { "epoch": 0.89, "grad_norm": 6.58057290965885, "learning_rate": 1.681869269973184e-05, "loss": 0.2577, "step": 11080 }, { "epoch": 0.9, "grad_norm": 5.68008811828603, "learning_rate": 1.68056647343888e-05, "loss": 0.2297, "step": 11100 }, { "epoch": 0.9, "grad_norm": 6.528010244716758, "learning_rate": 1.6792615216730907e-05, "loss": 0.2196, "step": 11120 }, { "epoch": 0.9, "grad_norm": 5.853566456861371, "learning_rate": 1.6779544188089715e-05, "loss": 0.2629, "step": 11140 }, { "epoch": 0.9, "grad_norm": 10.986926893405414, "learning_rate": 1.67664516898649e-05, "loss": 0.2302, "step": 11160 }, { "epoch": 0.9, "grad_norm": 7.730824034913035, "learning_rate": 1.6753337763524137e-05, "loss": 0.2336, "step": 11180 }, { "epoch": 0.9, "grad_norm": 7.922173067463235, "learning_rate": 1.6740202450602976e-05, "loss": 0.2686, "step": 11200 }, { "epoch": 0.91, "grad_norm": 5.406865255814246, "learning_rate": 1.67270457927047e-05, "loss": 0.226, "step": 11220 }, { "epoch": 0.91, "grad_norm": 6.843481049848729, "learning_rate": 1.6713867831500195e-05, "loss": 0.2586, "step": 11240 }, { "epoch": 0.91, "grad_norm": 5.49549924323287, "learning_rate": 1.670066860872783e-05, "loss": 0.2627, "step": 11260 }, { "epoch": 0.91, "grad_norm": 6.183808429627808, "learning_rate": 1.6687448166193306e-05, "loss": 0.2749, "step": 11280 }, { "epoch": 0.91, "grad_norm": 4.378810204329709, "learning_rate": 1.667420654576954e-05, "loss": 0.2558, "step": 11300 }, { "epoch": 0.91, "grad_norm": 6.028002244995752, "learning_rate": 1.666094378939652e-05, "loss": 0.2554, "step": 11320 }, { "epoch": 0.92, "grad_norm": 7.776788987779546, "learning_rate": 1.664765993908118e-05, "loss": 0.2326, "step": 11340 }, { "epoch": 0.92, "grad_norm": 7.503277380435426, "learning_rate": 1.663435503689726e-05, "loss": 0.2707, "step": 11360 }, { "epoch": 0.92, "grad_norm": 6.303861845235693, "learning_rate": 1.6621029124985195e-05, "loss": 0.2435, "step": 11380 }, { "epoch": 0.92, "grad_norm": 7.213728574312154, "learning_rate": 1.6607682245551935e-05, "loss": 0.2514, "step": 11400 }, { "epoch": 0.92, "grad_norm": 5.2552293437415525, "learning_rate": 1.6594314440870864e-05, "loss": 0.2397, "step": 11420 }, { "epoch": 0.92, "grad_norm": 6.538249814157013, "learning_rate": 1.6580925753281634e-05, "loss": 0.2655, "step": 11440 }, { "epoch": 0.93, "grad_norm": 5.2378821622768905, "learning_rate": 1.6567516225190035e-05, "loss": 0.2607, "step": 11460 }, { "epoch": 0.93, "grad_norm": 5.674850314010563, "learning_rate": 1.655408589906787e-05, "loss": 0.2723, "step": 11480 }, { "epoch": 0.93, "grad_norm": 7.192949169932349, "learning_rate": 1.654063481745281e-05, "loss": 0.2561, "step": 11500 }, { "epoch": 0.93, "grad_norm": 13.135993930717675, "learning_rate": 1.652716302294828e-05, "loss": 0.2382, "step": 11520 }, { "epoch": 0.93, "grad_norm": 4.887607996691356, "learning_rate": 1.651367055822329e-05, "loss": 0.2863, "step": 11540 }, { "epoch": 0.93, "grad_norm": 7.367579978609729, "learning_rate": 1.6500157466012324e-05, "loss": 0.2379, "step": 11560 }, { "epoch": 0.93, "grad_norm": 8.199270857981157, "learning_rate": 1.6486623789115205e-05, "loss": 0.2432, "step": 11580 }, { "epoch": 0.94, "grad_norm": 6.243091274334211, "learning_rate": 1.6473069570396942e-05, "loss": 0.2635, "step": 11600 }, { "epoch": 0.94, "grad_norm": 5.6352137765892545, "learning_rate": 1.6459494852787622e-05, "loss": 0.2292, "step": 11620 }, { "epoch": 0.94, "grad_norm": 5.2104929401235305, "learning_rate": 1.6445899679282248e-05, "loss": 0.2545, "step": 11640 }, { "epoch": 0.94, "grad_norm": 5.635847694521193, "learning_rate": 1.6432284092940607e-05, "loss": 0.247, "step": 11660 }, { "epoch": 0.94, "grad_norm": 5.853851889115171, "learning_rate": 1.6418648136887152e-05, "loss": 0.2323, "step": 11680 }, { "epoch": 0.94, "grad_norm": 4.98208977143132, "learning_rate": 1.6404991854310846e-05, "loss": 0.238, "step": 11700 }, { "epoch": 0.95, "grad_norm": 5.560280174770714, "learning_rate": 1.6391315288465027e-05, "loss": 0.2589, "step": 11720 }, { "epoch": 0.95, "grad_norm": 11.332988584174231, "learning_rate": 1.637761848266729e-05, "loss": 0.2437, "step": 11740 }, { "epoch": 0.95, "grad_norm": 13.079688339953384, "learning_rate": 1.6363901480299323e-05, "loss": 0.2489, "step": 11760 }, { "epoch": 0.95, "grad_norm": 6.852537601204953, "learning_rate": 1.6350164324806787e-05, "loss": 0.218, "step": 11780 }, { "epoch": 0.95, "grad_norm": 6.384240727219325, "learning_rate": 1.633640705969917e-05, "loss": 0.2419, "step": 11800 }, { "epoch": 0.95, "grad_norm": 4.348764283501352, "learning_rate": 1.6322629728549665e-05, "loss": 0.2037, "step": 11820 }, { "epoch": 0.96, "grad_norm": 5.096264739138052, "learning_rate": 1.6308832374994997e-05, "loss": 0.2502, "step": 11840 }, { "epoch": 0.96, "grad_norm": 4.471177088927129, "learning_rate": 1.6295015042735336e-05, "loss": 0.2435, "step": 11860 }, { "epoch": 0.96, "grad_norm": 7.886308089698534, "learning_rate": 1.6281177775534106e-05, "loss": 0.2367, "step": 11880 }, { "epoch": 0.96, "grad_norm": 5.0872043608074335, "learning_rate": 1.6267320617217886e-05, "loss": 0.2618, "step": 11900 }, { "epoch": 0.96, "grad_norm": 7.332403239597943, "learning_rate": 1.6253443611676247e-05, "loss": 0.2377, "step": 11920 }, { "epoch": 0.96, "grad_norm": 5.2156408493688, "learning_rate": 1.6239546802861628e-05, "loss": 0.2588, "step": 11940 }, { "epoch": 0.97, "grad_norm": 14.389605988283588, "learning_rate": 1.6225630234789186e-05, "loss": 0.2359, "step": 11960 }, { "epoch": 0.97, "grad_norm": 6.61108607154756, "learning_rate": 1.621169395153666e-05, "loss": 0.2454, "step": 11980 }, { "epoch": 0.97, "grad_norm": 5.92623925749379, "learning_rate": 1.6197737997244242e-05, "loss": 0.2504, "step": 12000 }, { "epoch": 0.97, "grad_norm": 6.729876438497323, "learning_rate": 1.6183762416114417e-05, "loss": 0.231, "step": 12020 }, { "epoch": 0.97, "grad_norm": 4.91119912664639, "learning_rate": 1.6169767252411843e-05, "loss": 0.2732, "step": 12040 }, { "epoch": 0.97, "grad_norm": 7.372474108547359, "learning_rate": 1.615575255046319e-05, "loss": 0.2396, "step": 12060 }, { "epoch": 0.98, "grad_norm": 4.844310112839635, "learning_rate": 1.6141718354657023e-05, "loss": 0.2682, "step": 12080 }, { "epoch": 0.98, "grad_norm": 7.827541428550464, "learning_rate": 1.6127664709443642e-05, "loss": 0.2351, "step": 12100 }, { "epoch": 0.98, "grad_norm": 6.394194783450918, "learning_rate": 1.6113591659334952e-05, "loss": 0.277, "step": 12120 }, { "epoch": 0.98, "grad_norm": 6.728544539125102, "learning_rate": 1.609949924890432e-05, "loss": 0.2517, "step": 12140 }, { "epoch": 0.98, "grad_norm": 4.095514979882195, "learning_rate": 1.6085387522786432e-05, "loss": 0.2317, "step": 12160 }, { "epoch": 0.98, "grad_norm": 6.899190893971197, "learning_rate": 1.6071256525677144e-05, "loss": 0.239, "step": 12180 }, { "epoch": 0.98, "grad_norm": 5.002813882583922, "learning_rate": 1.6057106302333366e-05, "loss": 0.2411, "step": 12200 }, { "epoch": 0.99, "grad_norm": 6.7562128367712, "learning_rate": 1.6042936897572883e-05, "loss": 0.2347, "step": 12220 }, { "epoch": 0.99, "grad_norm": 9.896004658604653, "learning_rate": 1.6028748356274247e-05, "loss": 0.2526, "step": 12240 }, { "epoch": 0.99, "grad_norm": 7.972800268940516, "learning_rate": 1.6014540723376623e-05, "loss": 0.2505, "step": 12260 }, { "epoch": 0.99, "grad_norm": 5.170343546862058, "learning_rate": 1.600031404387963e-05, "loss": 0.2478, "step": 12280 }, { "epoch": 0.99, "grad_norm": 6.356344714814083, "learning_rate": 1.5986068362843224e-05, "loss": 0.2767, "step": 12300 }, { "epoch": 0.99, "grad_norm": 6.20794198597022, "learning_rate": 1.5971803725387544e-05, "loss": 0.2533, "step": 12320 }, { "epoch": 1.0, "grad_norm": 7.368279449995274, "learning_rate": 1.5957520176692766e-05, "loss": 0.2706, "step": 12340 }, { "epoch": 1.0, "grad_norm": 9.218421438795374, "learning_rate": 1.594321776199896e-05, "loss": 0.2447, "step": 12360 }, { "epoch": 1.0, "grad_norm": 5.4653346268657845, "learning_rate": 1.592889652660596e-05, "loss": 0.2339, "step": 12380 }, { "epoch": 1.0, "grad_norm": 6.741041667370887, "learning_rate": 1.5914556515873197e-05, "loss": 0.1749, "step": 12400 }, { "epoch": 1.0, "grad_norm": 4.207049838195936, "learning_rate": 1.590019777521959e-05, "loss": 0.1849, "step": 12420 }, { "epoch": 1.0, "grad_norm": 7.1496607666636285, "learning_rate": 1.588582035012336e-05, "loss": 0.1743, "step": 12440 }, { "epoch": 1.01, "grad_norm": 7.5265979882421865, "learning_rate": 1.587142428612191e-05, "loss": 0.1868, "step": 12460 }, { "epoch": 1.01, "grad_norm": 5.651063343012383, "learning_rate": 1.5857009628811692e-05, "loss": 0.1983, "step": 12480 }, { "epoch": 1.01, "grad_norm": 9.202976607727676, "learning_rate": 1.5842576423848034e-05, "loss": 0.1917, "step": 12500 }, { "epoch": 1.01, "grad_norm": 5.832342590483985, "learning_rate": 1.582812471694501e-05, "loss": 0.2189, "step": 12520 }, { "epoch": 1.01, "grad_norm": 6.20925991986496, "learning_rate": 1.5813654553875307e-05, "loss": 0.1941, "step": 12540 }, { "epoch": 1.01, "grad_norm": 6.9734995441552865, "learning_rate": 1.579916598047006e-05, "loss": 0.1722, "step": 12560 }, { "epoch": 1.02, "grad_norm": 5.261181866981142, "learning_rate": 1.578465904261871e-05, "loss": 0.1841, "step": 12580 }, { "epoch": 1.02, "grad_norm": 6.347552583288099, "learning_rate": 1.5770133786268867e-05, "loss": 0.2178, "step": 12600 }, { "epoch": 1.02, "grad_norm": 5.354096329322261, "learning_rate": 1.5755590257426172e-05, "loss": 0.2037, "step": 12620 }, { "epoch": 1.02, "grad_norm": 6.433760955249804, "learning_rate": 1.5741028502154122e-05, "loss": 0.1918, "step": 12640 }, { "epoch": 1.02, "grad_norm": 10.724559043942634, "learning_rate": 1.572644856657396e-05, "loss": 0.1943, "step": 12660 }, { "epoch": 1.02, "grad_norm": 7.14880036647321, "learning_rate": 1.571185049686449e-05, "loss": 0.1971, "step": 12680 }, { "epoch": 1.03, "grad_norm": 5.58282494958642, "learning_rate": 1.5697234339261973e-05, "loss": 0.2066, "step": 12700 }, { "epoch": 1.03, "grad_norm": 5.049152250234184, "learning_rate": 1.5682600140059945e-05, "loss": 0.2155, "step": 12720 }, { "epoch": 1.03, "grad_norm": 5.577269074409907, "learning_rate": 1.5667947945609098e-05, "loss": 0.2307, "step": 12740 }, { "epoch": 1.03, "grad_norm": 7.3636602086666905, "learning_rate": 1.5653277802317107e-05, "loss": 0.1904, "step": 12760 }, { "epoch": 1.03, "grad_norm": 5.824392969812123, "learning_rate": 1.5638589756648507e-05, "loss": 0.1796, "step": 12780 }, { "epoch": 1.03, "grad_norm": 6.363241683808851, "learning_rate": 1.562388385512452e-05, "loss": 0.1792, "step": 12800 }, { "epoch": 1.03, "grad_norm": 8.101137773642606, "learning_rate": 1.560916014432294e-05, "loss": 0.1934, "step": 12820 }, { "epoch": 1.04, "grad_norm": 4.945106731069112, "learning_rate": 1.559441867087796e-05, "loss": 0.2209, "step": 12840 }, { "epoch": 1.04, "grad_norm": 6.2605136180443495, "learning_rate": 1.5579659481480026e-05, "loss": 0.1781, "step": 12860 }, { "epoch": 1.04, "grad_norm": 7.849809107312115, "learning_rate": 1.5564882622875715e-05, "loss": 0.1772, "step": 12880 }, { "epoch": 1.04, "grad_norm": 6.076234028129562, "learning_rate": 1.5550088141867542e-05, "loss": 0.1798, "step": 12900 }, { "epoch": 1.04, "grad_norm": 8.417089571258343, "learning_rate": 1.553527608531386e-05, "loss": 0.2224, "step": 12920 }, { "epoch": 1.04, "grad_norm": 5.434386315534151, "learning_rate": 1.5520446500128666e-05, "loss": 0.1751, "step": 12940 }, { "epoch": 1.05, "grad_norm": 7.365658808789612, "learning_rate": 1.55055994332815e-05, "loss": 0.216, "step": 12960 }, { "epoch": 1.05, "grad_norm": 6.124958583146801, "learning_rate": 1.5490734931797252e-05, "loss": 0.1785, "step": 12980 }, { "epoch": 1.05, "grad_norm": 6.345434106235919, "learning_rate": 1.5475853042756045e-05, "loss": 0.2129, "step": 13000 }, { "epoch": 1.05, "grad_norm": 10.0344020371502, "learning_rate": 1.5460953813293065e-05, "loss": 0.178, "step": 13020 }, { "epoch": 1.05, "grad_norm": 7.7502114051780016, "learning_rate": 1.544603729059842e-05, "loss": 0.1777, "step": 13040 }, { "epoch": 1.05, "grad_norm": 6.198622753624231, "learning_rate": 1.5431103521916996e-05, "loss": 0.2098, "step": 13060 }, { "epoch": 1.06, "grad_norm": 6.48127602670386, "learning_rate": 1.5416152554548302e-05, "loss": 0.164, "step": 13080 }, { "epoch": 1.06, "grad_norm": 7.600239308253696, "learning_rate": 1.5401184435846316e-05, "loss": 0.1847, "step": 13100 }, { "epoch": 1.06, "grad_norm": 6.423081036482468, "learning_rate": 1.5386199213219344e-05, "loss": 0.1873, "step": 13120 }, { "epoch": 1.06, "grad_norm": 7.1989659944439355, "learning_rate": 1.5371196934129854e-05, "loss": 0.2092, "step": 13140 }, { "epoch": 1.06, "grad_norm": 4.613997491830078, "learning_rate": 1.5356177646094348e-05, "loss": 0.1882, "step": 13160 }, { "epoch": 1.06, "grad_norm": 5.629794641682726, "learning_rate": 1.5341141396683202e-05, "loss": 0.1952, "step": 13180 }, { "epoch": 1.07, "grad_norm": 5.86222977330632, "learning_rate": 1.53260882335205e-05, "loss": 0.1857, "step": 13200 }, { "epoch": 1.07, "grad_norm": 5.390116349700223, "learning_rate": 1.5311018204283915e-05, "loss": 0.1862, "step": 13220 }, { "epoch": 1.07, "grad_norm": 4.734598991710353, "learning_rate": 1.5295931356704522e-05, "loss": 0.1922, "step": 13240 }, { "epoch": 1.07, "grad_norm": 6.44238392273467, "learning_rate": 1.5280827738566673e-05, "loss": 0.1823, "step": 13260 }, { "epoch": 1.07, "grad_norm": 4.314282919486737, "learning_rate": 1.5265707397707838e-05, "loss": 0.1904, "step": 13280 }, { "epoch": 1.07, "grad_norm": 6.471785123561109, "learning_rate": 1.525057038201845e-05, "loss": 0.2201, "step": 13300 }, { "epoch": 1.08, "grad_norm": 6.211228619356565, "learning_rate": 1.523541673944176e-05, "loss": 0.1941, "step": 13320 }, { "epoch": 1.08, "grad_norm": 5.109706482939786, "learning_rate": 1.5220246517973674e-05, "loss": 0.205, "step": 13340 }, { "epoch": 1.08, "grad_norm": 7.1474883569847405, "learning_rate": 1.5205059765662611e-05, "loss": 0.1863, "step": 13360 }, { "epoch": 1.08, "grad_norm": 10.992853444090926, "learning_rate": 1.5189856530609351e-05, "loss": 0.2029, "step": 13380 }, { "epoch": 1.08, "grad_norm": 5.481913913723081, "learning_rate": 1.517463686096688e-05, "loss": 0.2004, "step": 13400 }, { "epoch": 1.08, "grad_norm": 5.850124859903834, "learning_rate": 1.5159400804940232e-05, "loss": 0.2029, "step": 13420 }, { "epoch": 1.08, "grad_norm": 5.113867042039441, "learning_rate": 1.5144148410786344e-05, "loss": 0.2166, "step": 13440 }, { "epoch": 1.09, "grad_norm": 6.039714348714059, "learning_rate": 1.51288797268139e-05, "loss": 0.2116, "step": 13460 }, { "epoch": 1.09, "grad_norm": 7.541603158363756, "learning_rate": 1.5113594801383178e-05, "loss": 0.1925, "step": 13480 }, { "epoch": 1.09, "grad_norm": 7.479663488804942, "learning_rate": 1.50982936829059e-05, "loss": 0.1953, "step": 13500 }, { "epoch": 1.09, "grad_norm": 6.7557856877486175, "learning_rate": 1.5082976419845078e-05, "loss": 0.1976, "step": 13520 }, { "epoch": 1.09, "grad_norm": 5.931350253738143, "learning_rate": 1.5067643060714844e-05, "loss": 0.2133, "step": 13540 }, { "epoch": 1.09, "grad_norm": 6.494971457661676, "learning_rate": 1.5052293654080332e-05, "loss": 0.176, "step": 13560 }, { "epoch": 1.1, "grad_norm": 5.762783270305054, "learning_rate": 1.503692824855749e-05, "loss": 0.2096, "step": 13580 }, { "epoch": 1.1, "grad_norm": 6.848342260542276, "learning_rate": 1.5021546892812934e-05, "loss": 0.2034, "step": 13600 }, { "epoch": 1.1, "grad_norm": 5.6448287727059485, "learning_rate": 1.5006149635563817e-05, "loss": 0.1936, "step": 13620 }, { "epoch": 1.1, "grad_norm": 5.83680549100651, "learning_rate": 1.4990736525577642e-05, "loss": 0.2025, "step": 13640 }, { "epoch": 1.1, "grad_norm": 7.41015135946399, "learning_rate": 1.4975307611672127e-05, "loss": 0.2024, "step": 13660 }, { "epoch": 1.1, "grad_norm": 9.778277740797297, "learning_rate": 1.4959862942715043e-05, "loss": 0.1707, "step": 13680 }, { "epoch": 1.11, "grad_norm": 5.046412396561728, "learning_rate": 1.4944402567624065e-05, "loss": 0.1936, "step": 13700 }, { "epoch": 1.11, "grad_norm": 11.086870666327583, "learning_rate": 1.492892653536661e-05, "loss": 0.1979, "step": 13720 }, { "epoch": 1.11, "grad_norm": 6.0254596329111525, "learning_rate": 1.4913434894959693e-05, "loss": 0.1791, "step": 13740 }, { "epoch": 1.11, "grad_norm": 7.033701558289489, "learning_rate": 1.4897927695469756e-05, "loss": 0.1905, "step": 13760 }, { "epoch": 1.11, "grad_norm": 4.474543730422018, "learning_rate": 1.4882404986012523e-05, "loss": 0.1693, "step": 13780 }, { "epoch": 1.11, "grad_norm": 4.1690691824315405, "learning_rate": 1.4866866815752847e-05, "loss": 0.1856, "step": 13800 }, { "epoch": 1.12, "grad_norm": 5.6756955564977964, "learning_rate": 1.4851313233904547e-05, "loss": 0.2053, "step": 13820 }, { "epoch": 1.12, "grad_norm": 11.164112387266075, "learning_rate": 1.4835744289730252e-05, "loss": 0.171, "step": 13840 }, { "epoch": 1.12, "grad_norm": 7.308139120179797, "learning_rate": 1.4820160032541254e-05, "loss": 0.1954, "step": 13860 }, { "epoch": 1.12, "grad_norm": 3.5914657630993294, "learning_rate": 1.4804560511697341e-05, "loss": 0.2246, "step": 13880 }, { "epoch": 1.12, "grad_norm": 6.751209857032397, "learning_rate": 1.4788945776606647e-05, "loss": 0.2013, "step": 13900 }, { "epoch": 1.12, "grad_norm": 6.405950176387068, "learning_rate": 1.477331587672549e-05, "loss": 0.2113, "step": 13920 }, { "epoch": 1.13, "grad_norm": 6.376328572976509, "learning_rate": 1.4757670861558228e-05, "loss": 0.1924, "step": 13940 }, { "epoch": 1.13, "grad_norm": 9.121068656282398, "learning_rate": 1.4742010780657085e-05, "loss": 0.209, "step": 13960 }, { "epoch": 1.13, "grad_norm": 4.7626419486771026, "learning_rate": 1.4726335683622008e-05, "loss": 0.2255, "step": 13980 }, { "epoch": 1.13, "grad_norm": 6.817788687267193, "learning_rate": 1.4710645620100499e-05, "loss": 0.1896, "step": 14000 }, { "epoch": 1.13, "grad_norm": 7.332059084839946, "learning_rate": 1.4694940639787466e-05, "loss": 0.2066, "step": 14020 }, { "epoch": 1.13, "grad_norm": 5.640954681652653, "learning_rate": 1.4679220792425067e-05, "loss": 0.1771, "step": 14040 }, { "epoch": 1.13, "grad_norm": 6.095651183293601, "learning_rate": 1.4663486127802538e-05, "loss": 0.186, "step": 14060 }, { "epoch": 1.14, "grad_norm": 7.092095404760473, "learning_rate": 1.464773669575606e-05, "loss": 0.2142, "step": 14080 }, { "epoch": 1.14, "grad_norm": 4.837369592766958, "learning_rate": 1.463197254616857e-05, "loss": 0.2218, "step": 14100 }, { "epoch": 1.14, "grad_norm": 4.9264497356291646, "learning_rate": 1.4616193728969633e-05, "loss": 0.1938, "step": 14120 }, { "epoch": 1.14, "grad_norm": 9.674187157366852, "learning_rate": 1.4600400294135264e-05, "loss": 0.2098, "step": 14140 }, { "epoch": 1.14, "grad_norm": 6.437523999215732, "learning_rate": 1.4584592291687777e-05, "loss": 0.2029, "step": 14160 }, { "epoch": 1.14, "grad_norm": 6.823030504825212, "learning_rate": 1.4568769771695625e-05, "loss": 0.1877, "step": 14180 }, { "epoch": 1.15, "grad_norm": 7.082752727573593, "learning_rate": 1.4552932784273246e-05, "loss": 0.1928, "step": 14200 }, { "epoch": 1.15, "grad_norm": 11.029950376037581, "learning_rate": 1.45370813795809e-05, "loss": 0.1682, "step": 14220 }, { "epoch": 1.15, "grad_norm": 4.956260098113924, "learning_rate": 1.4521215607824499e-05, "loss": 0.1972, "step": 14240 }, { "epoch": 1.15, "grad_norm": 5.51023882727362, "learning_rate": 1.4505335519255482e-05, "loss": 0.1967, "step": 14260 }, { "epoch": 1.15, "grad_norm": 6.160472680324094, "learning_rate": 1.4489441164170612e-05, "loss": 0.1913, "step": 14280 }, { "epoch": 1.15, "grad_norm": 9.72267194404117, "learning_rate": 1.447353259291185e-05, "loss": 0.1818, "step": 14300 }, { "epoch": 1.16, "grad_norm": 8.889058328709032, "learning_rate": 1.4457609855866181e-05, "loss": 0.2082, "step": 14320 }, { "epoch": 1.16, "grad_norm": 3.973002280859937, "learning_rate": 1.4441673003465458e-05, "loss": 0.1851, "step": 14340 }, { "epoch": 1.16, "grad_norm": 6.797652557445699, "learning_rate": 1.4425722086186236e-05, "loss": 0.191, "step": 14360 }, { "epoch": 1.16, "grad_norm": 7.229624826239175, "learning_rate": 1.4409757154549621e-05, "loss": 0.1891, "step": 14380 }, { "epoch": 1.16, "grad_norm": 6.741986922621174, "learning_rate": 1.4393778259121113e-05, "loss": 0.1868, "step": 14400 }, { "epoch": 1.16, "grad_norm": 4.781423328723562, "learning_rate": 1.4377785450510426e-05, "loss": 0.1953, "step": 14420 }, { "epoch": 1.17, "grad_norm": 5.074661529228375, "learning_rate": 1.436177877937135e-05, "loss": 0.2004, "step": 14440 }, { "epoch": 1.17, "grad_norm": 8.033180093952518, "learning_rate": 1.4345758296401585e-05, "loss": 0.1816, "step": 14460 }, { "epoch": 1.17, "grad_norm": 5.974169762042515, "learning_rate": 1.4329724052342569e-05, "loss": 0.192, "step": 14480 }, { "epoch": 1.17, "grad_norm": 4.656455835490763, "learning_rate": 1.4313676097979326e-05, "loss": 0.1835, "step": 14500 }, { "epoch": 1.17, "grad_norm": 3.1936684555124617, "learning_rate": 1.4297614484140307e-05, "loss": 0.1808, "step": 14520 }, { "epoch": 1.17, "grad_norm": 8.20373974631054, "learning_rate": 1.4281539261697228e-05, "loss": 0.1836, "step": 14540 }, { "epoch": 1.18, "grad_norm": 6.186778941670351, "learning_rate": 1.4265450481564904e-05, "loss": 0.1946, "step": 14560 }, { "epoch": 1.18, "grad_norm": 6.498458092545216, "learning_rate": 1.4249348194701091e-05, "loss": 0.1883, "step": 14580 }, { "epoch": 1.18, "grad_norm": 4.671798654398146, "learning_rate": 1.4233232452106331e-05, "loss": 0.1981, "step": 14600 }, { "epoch": 1.18, "grad_norm": 6.939397878773192, "learning_rate": 1.4217103304823774e-05, "loss": 0.1858, "step": 14620 }, { "epoch": 1.18, "grad_norm": 9.00976329632259, "learning_rate": 1.4200960803939034e-05, "loss": 0.1917, "step": 14640 }, { "epoch": 1.18, "grad_norm": 6.559576346045498, "learning_rate": 1.4184805000580018e-05, "loss": 0.1915, "step": 14660 }, { "epoch": 1.18, "grad_norm": 5.306030822468831, "learning_rate": 1.4168635945916762e-05, "loss": 0.2023, "step": 14680 }, { "epoch": 1.19, "grad_norm": 6.917396077987428, "learning_rate": 1.4152453691161279e-05, "loss": 0.201, "step": 14700 }, { "epoch": 1.19, "grad_norm": 5.388833984060343, "learning_rate": 1.4136258287567386e-05, "loss": 0.1951, "step": 14720 }, { "epoch": 1.19, "grad_norm": 5.741268992759075, "learning_rate": 1.412004978643055e-05, "loss": 0.2052, "step": 14740 }, { "epoch": 1.19, "grad_norm": 8.528637831977798, "learning_rate": 1.4103828239087713e-05, "loss": 0.1911, "step": 14760 }, { "epoch": 1.19, "grad_norm": 7.229831133673562, "learning_rate": 1.4087593696917152e-05, "loss": 0.2147, "step": 14780 }, { "epoch": 1.19, "grad_norm": 8.453977418463722, "learning_rate": 1.4071346211338287e-05, "loss": 0.2056, "step": 14800 }, { "epoch": 1.2, "grad_norm": 4.0493875016839205, "learning_rate": 1.4055085833811543e-05, "loss": 0.1875, "step": 14820 }, { "epoch": 1.2, "grad_norm": 6.315957510536494, "learning_rate": 1.403881261583818e-05, "loss": 0.2049, "step": 14840 }, { "epoch": 1.2, "grad_norm": 6.174807081413619, "learning_rate": 1.4022526608960117e-05, "loss": 0.1887, "step": 14860 }, { "epoch": 1.2, "grad_norm": 5.128483960445226, "learning_rate": 1.4006227864759787e-05, "loss": 0.1958, "step": 14880 }, { "epoch": 1.2, "grad_norm": 6.592332443520915, "learning_rate": 1.3989916434859961e-05, "loss": 0.174, "step": 14900 }, { "epoch": 1.2, "grad_norm": 5.47341406161348, "learning_rate": 1.3973592370923594e-05, "loss": 0.1972, "step": 14920 }, { "epoch": 1.21, "grad_norm": 10.100193860299168, "learning_rate": 1.395725572465366e-05, "loss": 0.217, "step": 14940 }, { "epoch": 1.21, "grad_norm": 5.393687564139285, "learning_rate": 1.394090654779297e-05, "loss": 0.1746, "step": 14960 }, { "epoch": 1.21, "grad_norm": 4.862848248796205, "learning_rate": 1.3924544892124037e-05, "loss": 0.1804, "step": 14980 }, { "epoch": 1.21, "grad_norm": 5.347965295637652, "learning_rate": 1.390817080946889e-05, "loss": 0.1774, "step": 15000 }, { "epoch": 1.21, "grad_norm": 6.040766822188126, "learning_rate": 1.3891784351688921e-05, "loss": 0.2123, "step": 15020 }, { "epoch": 1.21, "grad_norm": 5.8662001265351105, "learning_rate": 1.3875385570684725e-05, "loss": 0.1888, "step": 15040 }, { "epoch": 1.22, "grad_norm": 3.4039196477297584, "learning_rate": 1.3858974518395912e-05, "loss": 0.1776, "step": 15060 }, { "epoch": 1.22, "grad_norm": 5.345588055655486, "learning_rate": 1.384255124680097e-05, "loss": 0.1934, "step": 15080 }, { "epoch": 1.22, "grad_norm": 7.7672522699283455, "learning_rate": 1.3826115807917088e-05, "loss": 0.1881, "step": 15100 }, { "epoch": 1.22, "grad_norm": 14.607170518002183, "learning_rate": 1.3809668253799989e-05, "loss": 0.1992, "step": 15120 }, { "epoch": 1.22, "grad_norm": 7.122439543446391, "learning_rate": 1.379320863654377e-05, "loss": 0.2071, "step": 15140 }, { "epoch": 1.22, "grad_norm": 7.078892142047386, "learning_rate": 1.3776737008280734e-05, "loss": 0.1846, "step": 15160 }, { "epoch": 1.23, "grad_norm": 4.833315926005269, "learning_rate": 1.3760253421181232e-05, "loss": 0.1955, "step": 15180 }, { "epoch": 1.23, "grad_norm": 7.642880346774846, "learning_rate": 1.3743757927453485e-05, "loss": 0.1926, "step": 15200 }, { "epoch": 1.23, "grad_norm": 5.4079945125552, "learning_rate": 1.3727250579343427e-05, "loss": 0.1873, "step": 15220 }, { "epoch": 1.23, "grad_norm": 5.948430330328783, "learning_rate": 1.371073142913454e-05, "loss": 0.207, "step": 15240 }, { "epoch": 1.23, "grad_norm": 5.5464350136347615, "learning_rate": 1.369420052914769e-05, "loss": 0.2059, "step": 15260 }, { "epoch": 1.23, "grad_norm": 6.914142224558012, "learning_rate": 1.3677657931740953e-05, "loss": 0.2101, "step": 15280 }, { "epoch": 1.24, "grad_norm": 9.57042012056836, "learning_rate": 1.3661103689309451e-05, "loss": 0.1845, "step": 15300 }, { "epoch": 1.24, "grad_norm": 4.631446263817481, "learning_rate": 1.3644537854285198e-05, "loss": 0.1676, "step": 15320 }, { "epoch": 1.24, "grad_norm": 7.563813907200117, "learning_rate": 1.3627960479136917e-05, "loss": 0.1959, "step": 15340 }, { "epoch": 1.24, "grad_norm": 7.423882592374514, "learning_rate": 1.3611371616369888e-05, "loss": 0.2119, "step": 15360 }, { "epoch": 1.24, "grad_norm": 8.033278865814227, "learning_rate": 1.3594771318525772e-05, "loss": 0.1999, "step": 15380 }, { "epoch": 1.24, "grad_norm": 4.263461392179193, "learning_rate": 1.3578159638182443e-05, "loss": 0.1623, "step": 15400 }, { "epoch": 1.24, "grad_norm": 5.605534420891448, "learning_rate": 1.3561536627953846e-05, "loss": 0.1878, "step": 15420 }, { "epoch": 1.25, "grad_norm": 7.271725167246008, "learning_rate": 1.3544902340489788e-05, "loss": 0.203, "step": 15440 }, { "epoch": 1.25, "grad_norm": 24.17397448663477, "learning_rate": 1.3528256828475806e-05, "loss": 0.1996, "step": 15460 }, { "epoch": 1.25, "grad_norm": 8.21922387907255, "learning_rate": 1.3511600144632984e-05, "loss": 0.2115, "step": 15480 }, { "epoch": 1.25, "grad_norm": 6.438100546073929, "learning_rate": 1.3494932341717795e-05, "loss": 0.2178, "step": 15500 }, { "epoch": 1.25, "grad_norm": 5.438714078216004, "learning_rate": 1.3478253472521926e-05, "loss": 0.2035, "step": 15520 }, { "epoch": 1.25, "grad_norm": 5.267123317638661, "learning_rate": 1.3461563589872115e-05, "loss": 0.1871, "step": 15540 }, { "epoch": 1.26, "grad_norm": 6.952296244061062, "learning_rate": 1.3444862746629983e-05, "loss": 0.1796, "step": 15560 }, { "epoch": 1.26, "grad_norm": 5.566630169564181, "learning_rate": 1.3428150995691864e-05, "loss": 0.19, "step": 15580 }, { "epoch": 1.26, "grad_norm": 8.317946865138628, "learning_rate": 1.3411428389988643e-05, "loss": 0.1867, "step": 15600 }, { "epoch": 1.26, "grad_norm": 5.8312505364290015, "learning_rate": 1.3394694982485588e-05, "loss": 0.1966, "step": 15620 }, { "epoch": 1.26, "grad_norm": 5.558038014921936, "learning_rate": 1.3377950826182167e-05, "loss": 0.2084, "step": 15640 }, { "epoch": 1.26, "grad_norm": 3.3670931844933434, "learning_rate": 1.3361195974111908e-05, "loss": 0.1886, "step": 15660 }, { "epoch": 1.27, "grad_norm": 6.730872285494118, "learning_rate": 1.3344430479342205e-05, "loss": 0.1991, "step": 15680 }, { "epoch": 1.27, "grad_norm": 5.221936012884134, "learning_rate": 1.3327654394974164e-05, "loss": 0.1871, "step": 15700 }, { "epoch": 1.27, "grad_norm": 8.096789875414892, "learning_rate": 1.3310867774142433e-05, "loss": 0.1799, "step": 15720 }, { "epoch": 1.27, "grad_norm": 4.9068121132509335, "learning_rate": 1.3294070670015026e-05, "loss": 0.1817, "step": 15740 }, { "epoch": 1.27, "grad_norm": 3.9856004214295386, "learning_rate": 1.3277263135793167e-05, "loss": 0.1793, "step": 15760 }, { "epoch": 1.27, "grad_norm": 7.118631561874846, "learning_rate": 1.3260445224711115e-05, "loss": 0.1787, "step": 15780 }, { "epoch": 1.28, "grad_norm": 7.914428581966782, "learning_rate": 1.3243616990035988e-05, "loss": 0.1821, "step": 15800 }, { "epoch": 1.28, "grad_norm": 6.602254147384827, "learning_rate": 1.322677848506761e-05, "loss": 0.1746, "step": 15820 }, { "epoch": 1.28, "grad_norm": 6.142691150589716, "learning_rate": 1.3209929763138333e-05, "loss": 0.1964, "step": 15840 }, { "epoch": 1.28, "grad_norm": 7.318010910855917, "learning_rate": 1.3193070877612863e-05, "loss": 0.1974, "step": 15860 }, { "epoch": 1.28, "grad_norm": 6.924430013948181, "learning_rate": 1.3176201881888104e-05, "loss": 0.1991, "step": 15880 }, { "epoch": 1.28, "grad_norm": 7.883879242504948, "learning_rate": 1.3159322829392978e-05, "loss": 0.1924, "step": 15900 }, { "epoch": 1.29, "grad_norm": 5.220112148848683, "learning_rate": 1.3142433773588259e-05, "loss": 0.2138, "step": 15920 }, { "epoch": 1.29, "grad_norm": 5.620528075363317, "learning_rate": 1.3125534767966406e-05, "loss": 0.1833, "step": 15940 }, { "epoch": 1.29, "grad_norm": 7.394718512387162, "learning_rate": 1.3108625866051393e-05, "loss": 0.1745, "step": 15960 }, { "epoch": 1.29, "grad_norm": 4.8000424144778275, "learning_rate": 1.3091707121398535e-05, "loss": 0.2024, "step": 15980 }, { "epoch": 1.29, "grad_norm": 4.774826621857471, "learning_rate": 1.3074778587594328e-05, "loss": 0.2015, "step": 16000 }, { "epoch": 1.29, "grad_norm": 9.387391712768, "learning_rate": 1.3057840318256265e-05, "loss": 0.1795, "step": 16020 }, { "epoch": 1.29, "grad_norm": 4.963107377940831, "learning_rate": 1.3040892367032682e-05, "loss": 0.1653, "step": 16040 }, { "epoch": 1.3, "grad_norm": 6.997759324175277, "learning_rate": 1.3023934787602572e-05, "loss": 0.2063, "step": 16060 }, { "epoch": 1.3, "grad_norm": 6.190583972900869, "learning_rate": 1.3006967633675432e-05, "loss": 0.2153, "step": 16080 }, { "epoch": 1.3, "grad_norm": 6.824679624981862, "learning_rate": 1.2989990958991077e-05, "loss": 0.1891, "step": 16100 }, { "epoch": 1.3, "grad_norm": 8.754013884741005, "learning_rate": 1.2973004817319479e-05, "loss": 0.1804, "step": 16120 }, { "epoch": 1.3, "grad_norm": 8.960491352232728, "learning_rate": 1.29560092624606e-05, "loss": 0.1923, "step": 16140 }, { "epoch": 1.3, "grad_norm": 4.719281501394497, "learning_rate": 1.2939004348244207e-05, "loss": 0.2186, "step": 16160 }, { "epoch": 1.31, "grad_norm": 6.0936889645380825, "learning_rate": 1.2921990128529713e-05, "loss": 0.2008, "step": 16180 }, { "epoch": 1.31, "grad_norm": 6.559249218102312, "learning_rate": 1.2904966657206013e-05, "loss": 0.1968, "step": 16200 }, { "epoch": 1.31, "grad_norm": 7.485638410293939, "learning_rate": 1.2887933988191297e-05, "loss": 0.1754, "step": 16220 }, { "epoch": 1.31, "grad_norm": 4.949399299007962, "learning_rate": 1.2870892175432887e-05, "loss": 0.1949, "step": 16240 }, { "epoch": 1.31, "grad_norm": 5.031890259751463, "learning_rate": 1.2853841272907068e-05, "loss": 0.1697, "step": 16260 }, { "epoch": 1.31, "grad_norm": 3.2773740128221367, "learning_rate": 1.2836781334618912e-05, "loss": 0.1706, "step": 16280 }, { "epoch": 1.32, "grad_norm": 7.100024972791139, "learning_rate": 1.2819712414602112e-05, "loss": 0.1725, "step": 16300 }, { "epoch": 1.32, "grad_norm": 6.847007498771447, "learning_rate": 1.2802634566918806e-05, "loss": 0.2224, "step": 16320 }, { "epoch": 1.32, "grad_norm": 18.124907394698774, "learning_rate": 1.2785547845659412e-05, "loss": 0.1954, "step": 16340 }, { "epoch": 1.32, "grad_norm": 6.653915067377773, "learning_rate": 1.2768452304942449e-05, "loss": 0.195, "step": 16360 }, { "epoch": 1.32, "grad_norm": 13.653829050105037, "learning_rate": 1.275134799891438e-05, "loss": 0.1771, "step": 16380 }, { "epoch": 1.32, "grad_norm": 5.199783489414919, "learning_rate": 1.2734234981749416e-05, "loss": 0.1697, "step": 16400 }, { "epoch": 1.33, "grad_norm": 4.938204944360832, "learning_rate": 1.2717113307649367e-05, "loss": 0.2153, "step": 16420 }, { "epoch": 1.33, "grad_norm": 5.379988728337729, "learning_rate": 1.2699983030843462e-05, "loss": 0.1807, "step": 16440 }, { "epoch": 1.33, "grad_norm": 5.563035060565681, "learning_rate": 1.2682844205588175e-05, "loss": 0.1723, "step": 16460 }, { "epoch": 1.33, "grad_norm": 4.137830028452154, "learning_rate": 1.2665696886167054e-05, "loss": 0.2015, "step": 16480 }, { "epoch": 1.33, "grad_norm": 5.444985685023187, "learning_rate": 1.2648541126890553e-05, "loss": 0.1891, "step": 16500 }, { "epoch": 1.33, "grad_norm": 6.758562379366711, "learning_rate": 1.2631376982095857e-05, "loss": 0.1794, "step": 16520 }, { "epoch": 1.34, "grad_norm": 6.42547461986865, "learning_rate": 1.2614204506146714e-05, "loss": 0.2072, "step": 16540 }, { "epoch": 1.34, "grad_norm": 6.67478604205973, "learning_rate": 1.2597023753433248e-05, "loss": 0.1752, "step": 16560 }, { "epoch": 1.34, "grad_norm": 5.114865214342277, "learning_rate": 1.2579834778371814e-05, "loss": 0.2129, "step": 16580 }, { "epoch": 1.34, "grad_norm": 6.721543126268775, "learning_rate": 1.2562637635404791e-05, "loss": 0.1774, "step": 16600 }, { "epoch": 1.34, "grad_norm": 5.93001108203901, "learning_rate": 1.2545432379000448e-05, "loss": 0.1773, "step": 16620 }, { "epoch": 1.34, "grad_norm": 5.7523308051939095, "learning_rate": 1.2528219063652729e-05, "loss": 0.2078, "step": 16640 }, { "epoch": 1.34, "grad_norm": 6.871500685256494, "learning_rate": 1.2510997743881129e-05, "loss": 0.1804, "step": 16660 }, { "epoch": 1.35, "grad_norm": 6.975118558142685, "learning_rate": 1.249376847423047e-05, "loss": 0.1923, "step": 16680 }, { "epoch": 1.35, "grad_norm": 5.518168870477368, "learning_rate": 1.2476531309270773e-05, "loss": 0.2043, "step": 16700 }, { "epoch": 1.35, "grad_norm": 7.024733851268127, "learning_rate": 1.2459286303597055e-05, "loss": 0.1957, "step": 16720 }, { "epoch": 1.35, "grad_norm": 5.189450368424653, "learning_rate": 1.244203351182917e-05, "loss": 0.1972, "step": 16740 }, { "epoch": 1.35, "grad_norm": 4.769858359227226, "learning_rate": 1.2424772988611631e-05, "loss": 0.2045, "step": 16760 }, { "epoch": 1.35, "grad_norm": 5.6345946816574575, "learning_rate": 1.2407504788613441e-05, "loss": 0.184, "step": 16780 }, { "epoch": 1.36, "grad_norm": 8.03446545781803, "learning_rate": 1.2390228966527917e-05, "loss": 0.2016, "step": 16800 }, { "epoch": 1.36, "grad_norm": 5.376750572260963, "learning_rate": 1.2372945577072516e-05, "loss": 0.221, "step": 16820 }, { "epoch": 1.36, "grad_norm": 3.9735936922072836, "learning_rate": 1.2355654674988669e-05, "loss": 0.2193, "step": 16840 }, { "epoch": 1.36, "grad_norm": 4.966981585117317, "learning_rate": 1.2338356315041587e-05, "loss": 0.1788, "step": 16860 }, { "epoch": 1.36, "grad_norm": 5.389331607177196, "learning_rate": 1.232105055202012e-05, "loss": 0.2325, "step": 16880 }, { "epoch": 1.36, "grad_norm": 3.4531020772181753, "learning_rate": 1.2303737440736553e-05, "loss": 0.1978, "step": 16900 }, { "epoch": 1.37, "grad_norm": 8.524005426914416, "learning_rate": 1.2286417036026454e-05, "loss": 0.2219, "step": 16920 }, { "epoch": 1.37, "grad_norm": 8.61532713797543, "learning_rate": 1.2269089392748484e-05, "loss": 0.1786, "step": 16940 }, { "epoch": 1.37, "grad_norm": 7.510098080661287, "learning_rate": 1.225175456578423e-05, "loss": 0.192, "step": 16960 }, { "epoch": 1.37, "grad_norm": 5.7571280705405075, "learning_rate": 1.2234412610038045e-05, "loss": 0.1884, "step": 16980 }, { "epoch": 1.37, "grad_norm": 6.431467433327115, "learning_rate": 1.2217063580436841e-05, "loss": 0.1861, "step": 17000 }, { "epoch": 1.37, "grad_norm": 5.997538061494255, "learning_rate": 1.219970753192995e-05, "loss": 0.196, "step": 17020 }, { "epoch": 1.38, "grad_norm": 5.041053272270793, "learning_rate": 1.218234451948893e-05, "loss": 0.1676, "step": 17040 }, { "epoch": 1.38, "grad_norm": 8.26975958810288, "learning_rate": 1.2164974598107398e-05, "loss": 0.1953, "step": 17060 }, { "epoch": 1.38, "grad_norm": 4.861930347014316, "learning_rate": 1.2147597822800843e-05, "loss": 0.2077, "step": 17080 }, { "epoch": 1.38, "grad_norm": 6.4205005966339534, "learning_rate": 1.2130214248606478e-05, "loss": 0.1743, "step": 17100 }, { "epoch": 1.38, "grad_norm": 6.274607928060438, "learning_rate": 1.2112823930583042e-05, "loss": 0.168, "step": 17120 }, { "epoch": 1.38, "grad_norm": 8.376715185548848, "learning_rate": 1.2095426923810631e-05, "loss": 0.1821, "step": 17140 }, { "epoch": 1.39, "grad_norm": 8.347057130026942, "learning_rate": 1.2078023283390532e-05, "loss": 0.201, "step": 17160 }, { "epoch": 1.39, "grad_norm": 6.10042452976507, "learning_rate": 1.2060613064445041e-05, "loss": 0.1864, "step": 17180 }, { "epoch": 1.39, "grad_norm": 6.882586091178972, "learning_rate": 1.204319632211729e-05, "loss": 0.1945, "step": 17200 }, { "epoch": 1.39, "grad_norm": 4.831518270883375, "learning_rate": 1.2025773111571067e-05, "loss": 0.1997, "step": 17220 }, { "epoch": 1.39, "grad_norm": 5.820557290888362, "learning_rate": 1.2008343487990652e-05, "loss": 0.213, "step": 17240 }, { "epoch": 1.39, "grad_norm": 7.590766477620916, "learning_rate": 1.199090750658064e-05, "loss": 0.1943, "step": 17260 }, { "epoch": 1.39, "grad_norm": 5.245303096541225, "learning_rate": 1.1973465222565756e-05, "loss": 0.1935, "step": 17280 }, { "epoch": 1.4, "grad_norm": 5.635466946173672, "learning_rate": 1.1956016691190693e-05, "loss": 0.1937, "step": 17300 }, { "epoch": 1.4, "grad_norm": 9.8612439882403, "learning_rate": 1.1938561967719929e-05, "loss": 0.1998, "step": 17320 }, { "epoch": 1.4, "grad_norm": 7.57010186594224, "learning_rate": 1.1921101107437547e-05, "loss": 0.1859, "step": 17340 }, { "epoch": 1.4, "grad_norm": 5.578111322637294, "learning_rate": 1.190363416564708e-05, "loss": 0.1885, "step": 17360 }, { "epoch": 1.4, "grad_norm": 10.636687907364392, "learning_rate": 1.188616119767132e-05, "loss": 0.2183, "step": 17380 }, { "epoch": 1.4, "grad_norm": 5.149639168848235, "learning_rate": 1.1868682258852135e-05, "loss": 0.1854, "step": 17400 }, { "epoch": 1.41, "grad_norm": 7.37696372575026, "learning_rate": 1.1851197404550314e-05, "loss": 0.1859, "step": 17420 }, { "epoch": 1.41, "grad_norm": 7.336027199681234, "learning_rate": 1.183370669014538e-05, "loss": 0.1685, "step": 17440 }, { "epoch": 1.41, "grad_norm": 5.449592911101573, "learning_rate": 1.181621017103542e-05, "loss": 0.2028, "step": 17460 }, { "epoch": 1.41, "grad_norm": 8.354625001227333, "learning_rate": 1.1798707902636895e-05, "loss": 0.1841, "step": 17480 }, { "epoch": 1.41, "grad_norm": 6.278768324372431, "learning_rate": 1.178119994038449e-05, "loss": 0.1682, "step": 17500 }, { "epoch": 1.41, "grad_norm": 6.121827218539604, "learning_rate": 1.1763686339730911e-05, "loss": 0.1864, "step": 17520 }, { "epoch": 1.42, "grad_norm": 4.642245950398283, "learning_rate": 1.174616715614673e-05, "loss": 0.1919, "step": 17540 }, { "epoch": 1.42, "grad_norm": 12.026938401540649, "learning_rate": 1.1728642445120205e-05, "loss": 0.1876, "step": 17560 }, { "epoch": 1.42, "grad_norm": 5.259501261110696, "learning_rate": 1.1711112262157093e-05, "loss": 0.196, "step": 17580 }, { "epoch": 1.42, "grad_norm": 6.612093751827706, "learning_rate": 1.1693576662780486e-05, "loss": 0.1811, "step": 17600 }, { "epoch": 1.42, "grad_norm": 6.169870867251168, "learning_rate": 1.167603570253063e-05, "loss": 0.1955, "step": 17620 }, { "epoch": 1.42, "grad_norm": 7.205754195375382, "learning_rate": 1.1658489436964753e-05, "loss": 0.1806, "step": 17640 }, { "epoch": 1.43, "grad_norm": 7.67399813297936, "learning_rate": 1.1640937921656882e-05, "loss": 0.198, "step": 17660 }, { "epoch": 1.43, "grad_norm": 5.102867345151343, "learning_rate": 1.1623381212197677e-05, "loss": 0.1663, "step": 17680 }, { "epoch": 1.43, "grad_norm": 6.182588419638686, "learning_rate": 1.1605819364194244e-05, "loss": 0.1972, "step": 17700 }, { "epoch": 1.43, "grad_norm": 8.729593950859055, "learning_rate": 1.1588252433269966e-05, "loss": 0.1978, "step": 17720 }, { "epoch": 1.43, "grad_norm": 4.896187735642024, "learning_rate": 1.1570680475064328e-05, "loss": 0.181, "step": 17740 }, { "epoch": 1.43, "grad_norm": 8.586607472154492, "learning_rate": 1.1553103545232738e-05, "loss": 0.1778, "step": 17760 }, { "epoch": 1.44, "grad_norm": 7.077719973666291, "learning_rate": 1.1535521699446344e-05, "loss": 0.1881, "step": 17780 }, { "epoch": 1.44, "grad_norm": 6.368844445064559, "learning_rate": 1.151793499339187e-05, "loss": 0.1837, "step": 17800 }, { "epoch": 1.44, "grad_norm": 5.603581999541538, "learning_rate": 1.1500343482771433e-05, "loss": 0.1788, "step": 17820 }, { "epoch": 1.44, "grad_norm": 6.124117106428645, "learning_rate": 1.1482747223302362e-05, "loss": 0.2073, "step": 17840 }, { "epoch": 1.44, "grad_norm": 4.896094240852357, "learning_rate": 1.146514627071704e-05, "loss": 0.2018, "step": 17860 }, { "epoch": 1.44, "grad_norm": 8.633063928041105, "learning_rate": 1.1447540680762697e-05, "loss": 0.187, "step": 17880 }, { "epoch": 1.44, "grad_norm": 5.8650395795782595, "learning_rate": 1.1429930509201264e-05, "loss": 0.1884, "step": 17900 }, { "epoch": 1.45, "grad_norm": 3.3374318833847245, "learning_rate": 1.141231581180918e-05, "loss": 0.1755, "step": 17920 }, { "epoch": 1.45, "grad_norm": 9.55305523159018, "learning_rate": 1.1394696644377216e-05, "loss": 0.183, "step": 17940 }, { "epoch": 1.45, "grad_norm": 5.489288302847405, "learning_rate": 1.1377073062710309e-05, "loss": 0.1963, "step": 17960 }, { "epoch": 1.45, "grad_norm": 5.8983191471983805, "learning_rate": 1.1359445122627362e-05, "loss": 0.1895, "step": 17980 }, { "epoch": 1.45, "grad_norm": 7.189465859300051, "learning_rate": 1.1341812879961095e-05, "loss": 0.1673, "step": 18000 }, { "epoch": 1.45, "grad_norm": 7.447490489655281, "learning_rate": 1.1324176390557853e-05, "loss": 0.1809, "step": 18020 }, { "epoch": 1.46, "grad_norm": 5.775175955597824, "learning_rate": 1.1306535710277428e-05, "loss": 0.1791, "step": 18040 }, { "epoch": 1.46, "grad_norm": 8.2814761724716, "learning_rate": 1.1288890894992888e-05, "loss": 0.1819, "step": 18060 }, { "epoch": 1.46, "grad_norm": 4.995617280837431, "learning_rate": 1.12712420005904e-05, "loss": 0.1925, "step": 18080 }, { "epoch": 1.46, "grad_norm": 5.884038263005907, "learning_rate": 1.1253589082969046e-05, "loss": 0.1854, "step": 18100 }, { "epoch": 1.46, "grad_norm": 4.022777792168135, "learning_rate": 1.1235932198040653e-05, "loss": 0.1728, "step": 18120 }, { "epoch": 1.46, "grad_norm": 4.792975515098119, "learning_rate": 1.1218271401729617e-05, "loss": 0.1836, "step": 18140 }, { "epoch": 1.47, "grad_norm": 5.250400164657746, "learning_rate": 1.1200606749972718e-05, "loss": 0.167, "step": 18160 }, { "epoch": 1.47, "grad_norm": 7.457426673152692, "learning_rate": 1.1182938298718945e-05, "loss": 0.1829, "step": 18180 }, { "epoch": 1.47, "grad_norm": 14.206800331502505, "learning_rate": 1.1165266103929328e-05, "loss": 0.1778, "step": 18200 }, { "epoch": 1.47, "grad_norm": 6.502359695534185, "learning_rate": 1.1147590221576754e-05, "loss": 0.1799, "step": 18220 }, { "epoch": 1.47, "grad_norm": 7.1040996052400525, "learning_rate": 1.1129910707645779e-05, "loss": 0.1917, "step": 18240 }, { "epoch": 1.47, "grad_norm": 6.809597692047531, "learning_rate": 1.1112227618132472e-05, "loss": 0.1584, "step": 18260 }, { "epoch": 1.48, "grad_norm": 6.214106220478112, "learning_rate": 1.1094541009044219e-05, "loss": 0.1745, "step": 18280 }, { "epoch": 1.48, "grad_norm": 6.0818007467761, "learning_rate": 1.1076850936399564e-05, "loss": 0.1811, "step": 18300 }, { "epoch": 1.48, "grad_norm": 7.541164852188345, "learning_rate": 1.1059157456228008e-05, "loss": 0.1642, "step": 18320 }, { "epoch": 1.48, "grad_norm": 8.065093340490257, "learning_rate": 1.104146062456986e-05, "loss": 0.1801, "step": 18340 }, { "epoch": 1.48, "grad_norm": 6.675920675840688, "learning_rate": 1.1023760497476028e-05, "loss": 0.1756, "step": 18360 }, { "epoch": 1.48, "grad_norm": 6.6987902445864655, "learning_rate": 1.1006057131007866e-05, "loss": 0.1795, "step": 18380 }, { "epoch": 1.49, "grad_norm": 6.510299738533309, "learning_rate": 1.0988350581236991e-05, "loss": 0.1865, "step": 18400 }, { "epoch": 1.49, "grad_norm": 6.34510225061825, "learning_rate": 1.0970640904245094e-05, "loss": 0.1955, "step": 18420 }, { "epoch": 1.49, "grad_norm": 4.228947726190241, "learning_rate": 1.0952928156123781e-05, "loss": 0.175, "step": 18440 }, { "epoch": 1.49, "grad_norm": 6.238306015624761, "learning_rate": 1.0935212392974372e-05, "loss": 0.1767, "step": 18460 }, { "epoch": 1.49, "grad_norm": 5.8825690027835424, "learning_rate": 1.0917493670907751e-05, "loss": 0.1804, "step": 18480 }, { "epoch": 1.49, "grad_norm": 5.307188542805111, "learning_rate": 1.0899772046044157e-05, "loss": 0.2165, "step": 18500 }, { "epoch": 1.49, "grad_norm": 7.671752182857302, "learning_rate": 1.0882047574513045e-05, "loss": 0.1754, "step": 18520 }, { "epoch": 1.5, "grad_norm": 7.228224840476562, "learning_rate": 1.0864320312452865e-05, "loss": 0.1749, "step": 18540 }, { "epoch": 1.5, "grad_norm": 7.022023025670514, "learning_rate": 1.0846590316010918e-05, "loss": 0.1815, "step": 18560 }, { "epoch": 1.5, "grad_norm": 11.59241800805326, "learning_rate": 1.082885764134316e-05, "loss": 0.1947, "step": 18580 }, { "epoch": 1.5, "grad_norm": 5.496283576035116, "learning_rate": 1.081112234461403e-05, "loss": 0.191, "step": 18600 }, { "epoch": 1.5, "grad_norm": 8.37203471510935, "learning_rate": 1.0793384481996279e-05, "loss": 0.2098, "step": 18620 }, { "epoch": 1.5, "grad_norm": 4.85467675057146, "learning_rate": 1.0775644109670778e-05, "loss": 0.199, "step": 18640 }, { "epoch": 1.51, "grad_norm": 7.2927775672082165, "learning_rate": 1.0757901283826341e-05, "loss": 0.1763, "step": 18660 }, { "epoch": 1.51, "grad_norm": 4.587898523966167, "learning_rate": 1.0740156060659565e-05, "loss": 0.1933, "step": 18680 }, { "epoch": 1.51, "grad_norm": 4.846531420101418, "learning_rate": 1.0722408496374634e-05, "loss": 0.1605, "step": 18700 }, { "epoch": 1.51, "grad_norm": 6.357471377975472, "learning_rate": 1.0704658647183155e-05, "loss": 0.1911, "step": 18720 }, { "epoch": 1.51, "grad_norm": 4.884941528032838, "learning_rate": 1.0686906569303955e-05, "loss": 0.1613, "step": 18740 }, { "epoch": 1.51, "grad_norm": 6.578672698862572, "learning_rate": 1.0669152318962936e-05, "loss": 0.1583, "step": 18760 }, { "epoch": 1.52, "grad_norm": 4.996568837666117, "learning_rate": 1.0651395952392876e-05, "loss": 0.1893, "step": 18780 }, { "epoch": 1.52, "grad_norm": 6.137635879410934, "learning_rate": 1.0633637525833246e-05, "loss": 0.2006, "step": 18800 }, { "epoch": 1.52, "grad_norm": 10.339616116567784, "learning_rate": 1.0615877095530058e-05, "loss": 0.2034, "step": 18820 }, { "epoch": 1.52, "grad_norm": 5.500147904228012, "learning_rate": 1.0598114717735661e-05, "loss": 0.1989, "step": 18840 }, { "epoch": 1.52, "grad_norm": 5.340910932588536, "learning_rate": 1.0580350448708571e-05, "loss": 0.2044, "step": 18860 }, { "epoch": 1.52, "grad_norm": 7.0020980415617275, "learning_rate": 1.0562584344713301e-05, "loss": 0.1873, "step": 18880 }, { "epoch": 1.53, "grad_norm": 6.566893932198695, "learning_rate": 1.0544816462020169e-05, "loss": 0.1672, "step": 18900 }, { "epoch": 1.53, "grad_norm": 21.295579564416062, "learning_rate": 1.052704685690513e-05, "loss": 0.1728, "step": 18920 }, { "epoch": 1.53, "grad_norm": 5.869527987425055, "learning_rate": 1.0509275585649594e-05, "loss": 0.2102, "step": 18940 }, { "epoch": 1.53, "grad_norm": 15.699095798380197, "learning_rate": 1.0491502704540249e-05, "loss": 0.1861, "step": 18960 }, { "epoch": 1.53, "grad_norm": 4.514889983973343, "learning_rate": 1.0473728269868879e-05, "loss": 0.189, "step": 18980 }, { "epoch": 1.53, "grad_norm": 6.288009726033927, "learning_rate": 1.045595233793219e-05, "loss": 0.1626, "step": 19000 }, { "epoch": 1.54, "grad_norm": 5.491080420793532, "learning_rate": 1.0438174965031632e-05, "loss": 0.1763, "step": 19020 }, { "epoch": 1.54, "grad_norm": 5.751789643793079, "learning_rate": 1.0420396207473214e-05, "loss": 0.1938, "step": 19040 }, { "epoch": 1.54, "grad_norm": 6.8267007909359, "learning_rate": 1.0402616121567339e-05, "loss": 0.1965, "step": 19060 }, { "epoch": 1.54, "grad_norm": 9.0429514544921, "learning_rate": 1.0384834763628609e-05, "loss": 0.1956, "step": 19080 }, { "epoch": 1.54, "grad_norm": 4.883758892659512, "learning_rate": 1.0367052189975661e-05, "loss": 0.2052, "step": 19100 }, { "epoch": 1.54, "grad_norm": 4.958192369182968, "learning_rate": 1.0349268456930978e-05, "loss": 0.1595, "step": 19120 }, { "epoch": 1.55, "grad_norm": 7.1048523990512775, "learning_rate": 1.0331483620820718e-05, "loss": 0.1802, "step": 19140 }, { "epoch": 1.55, "grad_norm": 7.81188718299639, "learning_rate": 1.0313697737974532e-05, "loss": 0.1762, "step": 19160 }, { "epoch": 1.55, "grad_norm": 7.41902623164456, "learning_rate": 1.0295910864725385e-05, "loss": 0.1815, "step": 19180 }, { "epoch": 1.55, "grad_norm": 6.487508949464715, "learning_rate": 1.027812305740938e-05, "loss": 0.1868, "step": 19200 }, { "epoch": 1.55, "grad_norm": 6.2665205241809, "learning_rate": 1.0260334372365579e-05, "loss": 0.1786, "step": 19220 }, { "epoch": 1.55, "grad_norm": 10.235739114720019, "learning_rate": 1.0242544865935822e-05, "loss": 0.1974, "step": 19240 }, { "epoch": 1.55, "grad_norm": 7.9210272040028205, "learning_rate": 1.0224754594464548e-05, "loss": 0.1995, "step": 19260 }, { "epoch": 1.56, "grad_norm": 7.599636480772786, "learning_rate": 1.020696361429863e-05, "loss": 0.1838, "step": 19280 }, { "epoch": 1.56, "grad_norm": 6.6901787357374305, "learning_rate": 1.0189171981787176e-05, "loss": 0.1857, "step": 19300 }, { "epoch": 1.56, "grad_norm": 7.859622048956387, "learning_rate": 1.0171379753281365e-05, "loss": 0.1473, "step": 19320 }, { "epoch": 1.56, "grad_norm": 9.023527968972886, "learning_rate": 1.015358698513426e-05, "loss": 0.2087, "step": 19340 }, { "epoch": 1.56, "grad_norm": 7.332951564677237, "learning_rate": 1.0135793733700635e-05, "loss": 0.1723, "step": 19360 }, { "epoch": 1.56, "grad_norm": 6.6064475500042406, "learning_rate": 1.0118000055336792e-05, "loss": 0.1892, "step": 19380 }, { "epoch": 1.57, "grad_norm": 7.156751057104011, "learning_rate": 1.0100206006400388e-05, "loss": 0.1808, "step": 19400 }, { "epoch": 1.57, "grad_norm": 7.071734037861301, "learning_rate": 1.0082411643250256e-05, "loss": 0.1987, "step": 19420 }, { "epoch": 1.57, "grad_norm": 5.576705794819434, "learning_rate": 1.0064617022246218e-05, "loss": 0.1826, "step": 19440 }, { "epoch": 1.57, "grad_norm": 9.630333945880572, "learning_rate": 1.0046822199748918e-05, "loss": 0.1778, "step": 19460 }, { "epoch": 1.57, "grad_norm": 10.401189633333978, "learning_rate": 1.0029027232119637e-05, "loss": 0.1834, "step": 19480 }, { "epoch": 1.57, "grad_norm": 5.393374772794798, "learning_rate": 1.0011232175720113e-05, "loss": 0.1738, "step": 19500 }, { "epoch": 1.58, "grad_norm": 6.924075773515395, "learning_rate": 9.993437086912373e-06, "loss": 0.1917, "step": 19520 }, { "epoch": 1.58, "grad_norm": 8.996239194302989, "learning_rate": 9.975642022058535e-06, "loss": 0.164, "step": 19540 }, { "epoch": 1.58, "grad_norm": 6.440697625799741, "learning_rate": 9.95784703752065e-06, "loss": 0.1846, "step": 19560 }, { "epoch": 1.58, "grad_norm": 4.738285879209919, "learning_rate": 9.940052189660508e-06, "loss": 0.2179, "step": 19580 }, { "epoch": 1.58, "grad_norm": 6.078747244317922, "learning_rate": 9.922257534839473e-06, "loss": 0.1678, "step": 19600 }, { "epoch": 1.58, "grad_norm": 4.399307517265612, "learning_rate": 9.904463129418295e-06, "loss": 0.188, "step": 19620 }, { "epoch": 1.59, "grad_norm": 6.3147663542594135, "learning_rate": 9.886669029756928e-06, "loss": 0.1814, "step": 19640 }, { "epoch": 1.59, "grad_norm": 10.295074351756918, "learning_rate": 9.86887529221437e-06, "loss": 0.1594, "step": 19660 }, { "epoch": 1.59, "grad_norm": 6.127518428057287, "learning_rate": 9.851081973148461e-06, "loss": 0.1583, "step": 19680 }, { "epoch": 1.59, "grad_norm": 7.524745554123201, "learning_rate": 9.833289128915719e-06, "loss": 0.1725, "step": 19700 }, { "epoch": 1.59, "grad_norm": 8.809630447586713, "learning_rate": 9.815496815871163e-06, "loss": 0.1835, "step": 19720 }, { "epoch": 1.59, "grad_norm": 5.0997599152545705, "learning_rate": 9.79770509036812e-06, "loss": 0.1918, "step": 19740 }, { "epoch": 1.6, "grad_norm": 5.7257380913445415, "learning_rate": 9.779914008758064e-06, "loss": 0.179, "step": 19760 }, { "epoch": 1.6, "grad_norm": 6.438675856373519, "learning_rate": 9.762123627390428e-06, "loss": 0.2072, "step": 19780 }, { "epoch": 1.6, "grad_norm": 7.284638627304715, "learning_rate": 9.744334002612426e-06, "loss": 0.1655, "step": 19800 }, { "epoch": 1.6, "grad_norm": 7.624618520730721, "learning_rate": 9.726545190768871e-06, "loss": 0.1907, "step": 19820 }, { "epoch": 1.6, "grad_norm": 5.640863218058795, "learning_rate": 9.70875724820201e-06, "loss": 0.1743, "step": 19840 }, { "epoch": 1.6, "grad_norm": 6.508589424568692, "learning_rate": 9.690970231251332e-06, "loss": 0.1778, "step": 19860 }, { "epoch": 1.6, "grad_norm": 4.719808699723677, "learning_rate": 9.673184196253397e-06, "loss": 0.1842, "step": 19880 }, { "epoch": 1.61, "grad_norm": 5.327888908637108, "learning_rate": 9.655399199541648e-06, "loss": 0.1778, "step": 19900 }, { "epoch": 1.61, "grad_norm": 4.62310607898033, "learning_rate": 9.63761529744625e-06, "loss": 0.159, "step": 19920 }, { "epoch": 1.61, "grad_norm": 4.242984591464857, "learning_rate": 9.61983254629389e-06, "loss": 0.1766, "step": 19940 }, { "epoch": 1.61, "grad_norm": 4.634594013494804, "learning_rate": 9.60205100240762e-06, "loss": 0.186, "step": 19960 }, { "epoch": 1.61, "grad_norm": 6.440683707201506, "learning_rate": 9.584270722106662e-06, "loss": 0.1856, "step": 19980 }, { "epoch": 1.61, "grad_norm": 2.790764895496361, "learning_rate": 9.566491761706234e-06, "loss": 0.1841, "step": 20000 }, { "epoch": 1.62, "grad_norm": 7.195201164651666, "learning_rate": 9.54871417751738e-06, "loss": 0.1723, "step": 20020 }, { "epoch": 1.62, "grad_norm": 4.764709985159199, "learning_rate": 9.530938025846778e-06, "loss": 0.1866, "step": 20040 }, { "epoch": 1.62, "grad_norm": 5.932858550259948, "learning_rate": 9.513163362996577e-06, "loss": 0.1866, "step": 20060 }, { "epoch": 1.62, "grad_norm": 6.464021142906132, "learning_rate": 9.495390245264204e-06, "loss": 0.1868, "step": 20080 }, { "epoch": 1.62, "grad_norm": 5.655476722759092, "learning_rate": 9.477618728942194e-06, "loss": 0.166, "step": 20100 }, { "epoch": 1.62, "grad_norm": 5.962336904666332, "learning_rate": 9.459848870318007e-06, "loss": 0.2101, "step": 20120 }, { "epoch": 1.63, "grad_norm": 4.675684775725181, "learning_rate": 9.44208072567386e-06, "loss": 0.1772, "step": 20140 }, { "epoch": 1.63, "grad_norm": 7.450883124319922, "learning_rate": 9.42431435128654e-06, "loss": 0.1647, "step": 20160 }, { "epoch": 1.63, "grad_norm": 4.965583223967794, "learning_rate": 9.406549803427218e-06, "loss": 0.2103, "step": 20180 }, { "epoch": 1.63, "grad_norm": 6.079517195254289, "learning_rate": 9.388787138361289e-06, "loss": 0.1917, "step": 20200 }, { "epoch": 1.63, "grad_norm": 5.5400797739723515, "learning_rate": 9.371026412348178e-06, "loss": 0.1691, "step": 20220 }, { "epoch": 1.63, "grad_norm": 5.974428361817412, "learning_rate": 9.353267681641178e-06, "loss": 0.1887, "step": 20240 }, { "epoch": 1.64, "grad_norm": 5.87274126501588, "learning_rate": 9.335511002487252e-06, "loss": 0.1888, "step": 20260 }, { "epoch": 1.64, "grad_norm": 5.03986866734462, "learning_rate": 9.31775643112687e-06, "loss": 0.1793, "step": 20280 }, { "epoch": 1.64, "grad_norm": 4.961609321820686, "learning_rate": 9.300004023793826e-06, "loss": 0.1811, "step": 20300 }, { "epoch": 1.64, "grad_norm": 6.09235980486571, "learning_rate": 9.282253836715063e-06, "loss": 0.1699, "step": 20320 }, { "epoch": 1.64, "grad_norm": 6.186755202964333, "learning_rate": 9.264505926110482e-06, "loss": 0.1936, "step": 20340 }, { "epoch": 1.64, "grad_norm": 6.795058856229219, "learning_rate": 9.246760348192785e-06, "loss": 0.1988, "step": 20360 }, { "epoch": 1.65, "grad_norm": 8.626010116914388, "learning_rate": 9.229017159167278e-06, "loss": 0.1753, "step": 20380 }, { "epoch": 1.65, "grad_norm": 7.036939700979724, "learning_rate": 9.211276415231704e-06, "loss": 0.1775, "step": 20400 }, { "epoch": 1.65, "grad_norm": 7.222151578247956, "learning_rate": 9.193538172576061e-06, "loss": 0.2063, "step": 20420 }, { "epoch": 1.65, "grad_norm": 6.261687984855021, "learning_rate": 9.175802487382427e-06, "loss": 0.1875, "step": 20440 }, { "epoch": 1.65, "grad_norm": 6.763660595031022, "learning_rate": 9.158069415824776e-06, "loss": 0.162, "step": 20460 }, { "epoch": 1.65, "grad_norm": 6.645406711322244, "learning_rate": 9.140339014068805e-06, "loss": 0.1701, "step": 20480 }, { "epoch": 1.65, "grad_norm": 6.075259280852945, "learning_rate": 9.122611338271759e-06, "loss": 0.1876, "step": 20500 }, { "epoch": 1.66, "grad_norm": 5.071793679309942, "learning_rate": 9.104886444582239e-06, "loss": 0.1891, "step": 20520 }, { "epoch": 1.66, "grad_norm": 6.0993248112456, "learning_rate": 9.087164389140048e-06, "loss": 0.1773, "step": 20540 }, { "epoch": 1.66, "grad_norm": 3.137955931101823, "learning_rate": 9.069445228075984e-06, "loss": 0.175, "step": 20560 }, { "epoch": 1.66, "grad_norm": 6.87017680822122, "learning_rate": 9.051729017511696e-06, "loss": 0.1781, "step": 20580 }, { "epoch": 1.66, "grad_norm": 6.740478016007273, "learning_rate": 9.034015813559472e-06, "loss": 0.1842, "step": 20600 }, { "epoch": 1.66, "grad_norm": 6.293414885644483, "learning_rate": 9.016305672322082e-06, "loss": 0.1754, "step": 20620 }, { "epoch": 1.67, "grad_norm": 6.770107414261222, "learning_rate": 8.998598649892602e-06, "loss": 0.1832, "step": 20640 }, { "epoch": 1.67, "grad_norm": 6.957667275109727, "learning_rate": 8.98089480235422e-06, "loss": 0.1915, "step": 20660 }, { "epoch": 1.67, "grad_norm": 7.753279699575756, "learning_rate": 8.963194185780076e-06, "loss": 0.2074, "step": 20680 }, { "epoch": 1.67, "grad_norm": 5.572695528675712, "learning_rate": 8.94549685623307e-06, "loss": 0.1675, "step": 20700 }, { "epoch": 1.67, "grad_norm": 8.318273389362469, "learning_rate": 8.927802869765697e-06, "loss": 0.1901, "step": 20720 }, { "epoch": 1.67, "grad_norm": 6.1756703742616645, "learning_rate": 8.91011228241986e-06, "loss": 0.1759, "step": 20740 }, { "epoch": 1.68, "grad_norm": 4.72678645840794, "learning_rate": 8.892425150226697e-06, "loss": 0.1672, "step": 20760 }, { "epoch": 1.68, "grad_norm": 5.064164253114183, "learning_rate": 8.874741529206401e-06, "loss": 0.1832, "step": 20780 }, { "epoch": 1.68, "grad_norm": 5.593557173319701, "learning_rate": 8.857061475368046e-06, "loss": 0.1767, "step": 20800 }, { "epoch": 1.68, "grad_norm": 5.739827985678297, "learning_rate": 8.83938504470941e-06, "loss": 0.1633, "step": 20820 }, { "epoch": 1.68, "grad_norm": 3.8745235027093172, "learning_rate": 8.821712293216792e-06, "loss": 0.1827, "step": 20840 }, { "epoch": 1.68, "grad_norm": 5.758169590139612, "learning_rate": 8.804043276864838e-06, "loss": 0.1799, "step": 20860 }, { "epoch": 1.69, "grad_norm": 9.048806835785985, "learning_rate": 8.786378051616363e-06, "loss": 0.1818, "step": 20880 }, { "epoch": 1.69, "grad_norm": 4.6191342246916545, "learning_rate": 8.768716673422176e-06, "loss": 0.184, "step": 20900 }, { "epoch": 1.69, "grad_norm": 10.64178899596623, "learning_rate": 8.751059198220903e-06, "loss": 0.1868, "step": 20920 }, { "epoch": 1.69, "grad_norm": 5.957903048399736, "learning_rate": 8.733405681938806e-06, "loss": 0.2088, "step": 20940 }, { "epoch": 1.69, "grad_norm": 7.14845190828466, "learning_rate": 8.715756180489609e-06, "loss": 0.1591, "step": 20960 }, { "epoch": 1.69, "grad_norm": 5.026492884909507, "learning_rate": 8.698110749774315e-06, "loss": 0.1692, "step": 20980 }, { "epoch": 1.7, "grad_norm": 6.193357527199346, "learning_rate": 8.680469445681042e-06, "loss": 0.1865, "step": 21000 }, { "epoch": 1.7, "grad_norm": 5.392241393190001, "learning_rate": 8.662832324084831e-06, "loss": 0.1643, "step": 21020 }, { "epoch": 1.7, "grad_norm": 7.209071685667427, "learning_rate": 8.645199440847485e-06, "loss": 0.1699, "step": 21040 }, { "epoch": 1.7, "grad_norm": 6.378055513443605, "learning_rate": 8.62757085181737e-06, "loss": 0.1997, "step": 21060 }, { "epoch": 1.7, "grad_norm": 5.948983100737581, "learning_rate": 8.609946612829258e-06, "loss": 0.1768, "step": 21080 }, { "epoch": 1.7, "grad_norm": 5.843305438262167, "learning_rate": 8.592326779704148e-06, "loss": 0.1819, "step": 21100 }, { "epoch": 1.7, "grad_norm": 4.695464831529554, "learning_rate": 8.574711408249074e-06, "loss": 0.1984, "step": 21120 }, { "epoch": 1.71, "grad_norm": 5.334180148766731, "learning_rate": 8.557100554256944e-06, "loss": 0.18, "step": 21140 }, { "epoch": 1.71, "grad_norm": 3.594041232042909, "learning_rate": 8.53949427350636e-06, "loss": 0.165, "step": 21160 }, { "epoch": 1.71, "grad_norm": 5.690661212066448, "learning_rate": 8.521892621761433e-06, "loss": 0.2051, "step": 21180 }, { "epoch": 1.71, "grad_norm": 8.263658257123618, "learning_rate": 8.504295654771622e-06, "loss": 0.178, "step": 21200 }, { "epoch": 1.71, "grad_norm": 6.964730099763294, "learning_rate": 8.486703428271536e-06, "loss": 0.1718, "step": 21220 }, { "epoch": 1.71, "grad_norm": 7.2364322382017745, "learning_rate": 8.469115997980786e-06, "loss": 0.1609, "step": 21240 }, { "epoch": 1.72, "grad_norm": 7.01112598686971, "learning_rate": 8.451533419603773e-06, "loss": 0.1918, "step": 21260 }, { "epoch": 1.72, "grad_norm": 6.7742526609944385, "learning_rate": 8.433955748829543e-06, "loss": 0.1746, "step": 21280 }, { "epoch": 1.72, "grad_norm": 6.080225878083447, "learning_rate": 8.416383041331594e-06, "loss": 0.1621, "step": 21300 }, { "epoch": 1.72, "grad_norm": 6.122578979331691, "learning_rate": 8.398815352767706e-06, "loss": 0.1866, "step": 21320 }, { "epoch": 1.72, "grad_norm": 9.715953363199073, "learning_rate": 8.38125273877976e-06, "loss": 0.1696, "step": 21340 }, { "epoch": 1.72, "grad_norm": 6.632818538811297, "learning_rate": 8.363695254993569e-06, "loss": 0.182, "step": 21360 }, { "epoch": 1.73, "grad_norm": 5.18153129890793, "learning_rate": 8.346142957018688e-06, "loss": 0.2091, "step": 21380 }, { "epoch": 1.73, "grad_norm": 4.933940908460862, "learning_rate": 8.32859590044826e-06, "loss": 0.1834, "step": 21400 }, { "epoch": 1.73, "grad_norm": 4.130527376581761, "learning_rate": 8.311054140858814e-06, "loss": 0.217, "step": 21420 }, { "epoch": 1.73, "grad_norm": 4.51550356163752, "learning_rate": 8.29351773381011e-06, "loss": 0.2001, "step": 21440 }, { "epoch": 1.73, "grad_norm": 5.869039230613348, "learning_rate": 8.275986734844956e-06, "loss": 0.176, "step": 21460 }, { "epoch": 1.73, "grad_norm": 5.039303039014279, "learning_rate": 8.258461199489026e-06, "loss": 0.2202, "step": 21480 }, { "epoch": 1.74, "grad_norm": 6.5768380616493936, "learning_rate": 8.240941183250689e-06, "loss": 0.1748, "step": 21500 }, { "epoch": 1.74, "grad_norm": 5.013498586372746, "learning_rate": 8.22342674162084e-06, "loss": 0.1933, "step": 21520 }, { "epoch": 1.74, "grad_norm": 4.181027548764225, "learning_rate": 8.205917930072707e-06, "loss": 0.1706, "step": 21540 }, { "epoch": 1.74, "grad_norm": 6.782829698366385, "learning_rate": 8.188414804061698e-06, "loss": 0.1857, "step": 21560 }, { "epoch": 1.74, "grad_norm": 13.591366754088444, "learning_rate": 8.170917419025203e-06, "loss": 0.1467, "step": 21580 }, { "epoch": 1.74, "grad_norm": 6.120949476303091, "learning_rate": 8.153425830382438e-06, "loss": 0.1991, "step": 21600 }, { "epoch": 1.75, "grad_norm": 7.272557124401674, "learning_rate": 8.135940093534249e-06, "loss": 0.1766, "step": 21620 }, { "epoch": 1.75, "grad_norm": 6.349382878150412, "learning_rate": 8.11846026386296e-06, "loss": 0.1989, "step": 21640 }, { "epoch": 1.75, "grad_norm": 3.9867833656488356, "learning_rate": 8.100986396732173e-06, "loss": 0.1831, "step": 21660 }, { "epoch": 1.75, "grad_norm": 5.4985809229416365, "learning_rate": 8.083518547486617e-06, "loss": 0.1851, "step": 21680 }, { "epoch": 1.75, "grad_norm": 5.519318033191571, "learning_rate": 8.066056771451954e-06, "loss": 0.1879, "step": 21700 }, { "epoch": 1.75, "grad_norm": 3.14973956651229, "learning_rate": 8.048601123934609e-06, "loss": 0.1737, "step": 21720 }, { "epoch": 1.75, "grad_norm": 4.004131091247943, "learning_rate": 8.031151660221597e-06, "loss": 0.1667, "step": 21740 }, { "epoch": 1.76, "grad_norm": 6.567536955091622, "learning_rate": 8.013708435580352e-06, "loss": 0.1697, "step": 21760 }, { "epoch": 1.76, "grad_norm": 4.599731495525866, "learning_rate": 7.996271505258542e-06, "loss": 0.1547, "step": 21780 }, { "epoch": 1.76, "grad_norm": 6.65599496558806, "learning_rate": 7.978840924483904e-06, "loss": 0.1774, "step": 21800 }, { "epoch": 1.76, "grad_norm": 8.560171882828884, "learning_rate": 7.961416748464055e-06, "loss": 0.2049, "step": 21820 }, { "epoch": 1.76, "grad_norm": 6.954024696005634, "learning_rate": 7.943999032386336e-06, "loss": 0.1881, "step": 21840 }, { "epoch": 1.76, "grad_norm": 4.630461689811874, "learning_rate": 7.926587831417623e-06, "loss": 0.1881, "step": 21860 }, { "epoch": 1.77, "grad_norm": 7.232546878139292, "learning_rate": 7.90918320070416e-06, "loss": 0.1995, "step": 21880 }, { "epoch": 1.77, "grad_norm": 7.453903957263037, "learning_rate": 7.891785195371375e-06, "loss": 0.1722, "step": 21900 }, { "epoch": 1.77, "grad_norm": 7.759776194383061, "learning_rate": 7.874393870523715e-06, "loss": 0.1695, "step": 21920 }, { "epoch": 1.77, "grad_norm": 10.868257032537139, "learning_rate": 7.857009281244472e-06, "loss": 0.1835, "step": 21940 }, { "epoch": 1.77, "grad_norm": 5.779984289233758, "learning_rate": 7.839631482595597e-06, "loss": 0.1665, "step": 21960 }, { "epoch": 1.77, "grad_norm": 5.114836127824816, "learning_rate": 7.822260529617539e-06, "loss": 0.1882, "step": 21980 }, { "epoch": 1.78, "grad_norm": 5.8070528491627105, "learning_rate": 7.804896477329062e-06, "loss": 0.2043, "step": 22000 }, { "epoch": 1.78, "grad_norm": 3.7936436595812935, "learning_rate": 7.787539380727074e-06, "loss": 0.1828, "step": 22020 }, { "epoch": 1.78, "grad_norm": 7.004553267660414, "learning_rate": 7.770189294786455e-06, "loss": 0.1891, "step": 22040 }, { "epoch": 1.78, "grad_norm": 7.107618580250647, "learning_rate": 7.752846274459873e-06, "loss": 0.1952, "step": 22060 }, { "epoch": 1.78, "grad_norm": 7.293012687334171, "learning_rate": 7.735510374677624e-06, "loss": 0.1668, "step": 22080 }, { "epoch": 1.78, "grad_norm": 5.229665339832979, "learning_rate": 7.718181650347453e-06, "loss": 0.2154, "step": 22100 }, { "epoch": 1.79, "grad_norm": 6.524237121183421, "learning_rate": 7.70086015635437e-06, "loss": 0.1834, "step": 22120 }, { "epoch": 1.79, "grad_norm": 7.363125050150268, "learning_rate": 7.683545947560491e-06, "loss": 0.1865, "step": 22140 }, { "epoch": 1.79, "grad_norm": 7.300432730612564, "learning_rate": 7.666239078804853e-06, "loss": 0.1818, "step": 22160 }, { "epoch": 1.79, "grad_norm": 14.035100259942592, "learning_rate": 7.648939604903252e-06, "loss": 0.191, "step": 22180 }, { "epoch": 1.79, "grad_norm": 6.910011337425548, "learning_rate": 7.631647580648057e-06, "loss": 0.168, "step": 22200 }, { "epoch": 1.79, "grad_norm": 6.7568942995248324, "learning_rate": 7.6143630608080395e-06, "loss": 0.1843, "step": 22220 }, { "epoch": 1.8, "grad_norm": 6.747067132204526, "learning_rate": 7.597086100128209e-06, "loss": 0.1937, "step": 22240 }, { "epoch": 1.8, "grad_norm": 8.5527154190958, "learning_rate": 7.579816753329629e-06, "loss": 0.1818, "step": 22260 }, { "epoch": 1.8, "grad_norm": 7.746379621946916, "learning_rate": 7.562555075109248e-06, "loss": 0.2052, "step": 22280 }, { "epoch": 1.8, "grad_norm": 6.639988193271665, "learning_rate": 7.545301120139724e-06, "loss": 0.1631, "step": 22300 }, { "epoch": 1.8, "grad_norm": 6.767819936156134, "learning_rate": 7.528054943069261e-06, "loss": 0.1661, "step": 22320 }, { "epoch": 1.8, "grad_norm": 5.273647818433291, "learning_rate": 7.510816598521416e-06, "loss": 0.1584, "step": 22340 }, { "epoch": 1.8, "grad_norm": 8.93563631273546, "learning_rate": 7.493586141094952e-06, "loss": 0.1555, "step": 22360 }, { "epoch": 1.81, "grad_norm": 3.0753266447920566, "learning_rate": 7.47636362536364e-06, "loss": 0.1517, "step": 22380 }, { "epoch": 1.81, "grad_norm": 10.30705792722804, "learning_rate": 7.459149105876106e-06, "loss": 0.154, "step": 22400 }, { "epoch": 1.81, "grad_norm": 7.984708476707701, "learning_rate": 7.441942637155638e-06, "loss": 0.1671, "step": 22420 }, { "epoch": 1.81, "grad_norm": 5.866654657582794, "learning_rate": 7.424744273700038e-06, "loss": 0.1886, "step": 22440 }, { "epoch": 1.81, "grad_norm": 4.737602317329208, "learning_rate": 7.407554069981428e-06, "loss": 0.2059, "step": 22460 }, { "epoch": 1.81, "grad_norm": 4.877816193781035, "learning_rate": 7.390372080446089e-06, "loss": 0.198, "step": 22480 }, { "epoch": 1.82, "grad_norm": 4.465809741780349, "learning_rate": 7.373198359514283e-06, "loss": 0.1678, "step": 22500 }, { "epoch": 1.82, "grad_norm": 6.934781753417284, "learning_rate": 7.356032961580083e-06, "loss": 0.18, "step": 22520 }, { "epoch": 1.82, "grad_norm": 6.355881689873095, "learning_rate": 7.338875941011206e-06, "loss": 0.1676, "step": 22540 }, { "epoch": 1.82, "grad_norm": 6.110876810996104, "learning_rate": 7.321727352148833e-06, "loss": 0.1855, "step": 22560 }, { "epoch": 1.82, "grad_norm": 5.355037257245673, "learning_rate": 7.304587249307434e-06, "loss": 0.1804, "step": 22580 }, { "epoch": 1.82, "grad_norm": 6.6548543550416195, "learning_rate": 7.287455686774608e-06, "loss": 0.2034, "step": 22600 }, { "epoch": 1.83, "grad_norm": 9.27569310973663, "learning_rate": 7.270332718810901e-06, "loss": 0.1937, "step": 22620 }, { "epoch": 1.83, "grad_norm": 4.751357838150305, "learning_rate": 7.253218399649638e-06, "loss": 0.1651, "step": 22640 }, { "epoch": 1.83, "grad_norm": 9.734100288935727, "learning_rate": 7.2361127834967505e-06, "loss": 0.1529, "step": 22660 }, { "epoch": 1.83, "grad_norm": 8.120082834507159, "learning_rate": 7.219015924530608e-06, "loss": 0.1747, "step": 22680 }, { "epoch": 1.83, "grad_norm": 5.407215715714017, "learning_rate": 7.201927876901839e-06, "loss": 0.1704, "step": 22700 }, { "epoch": 1.83, "grad_norm": 5.774551387388109, "learning_rate": 7.184848694733164e-06, "loss": 0.161, "step": 22720 }, { "epoch": 1.84, "grad_norm": 6.759114928231145, "learning_rate": 7.167778432119233e-06, "loss": 0.1879, "step": 22740 }, { "epoch": 1.84, "grad_norm": 4.92199338740308, "learning_rate": 7.150717143126433e-06, "loss": 0.1652, "step": 22760 }, { "epoch": 1.84, "grad_norm": 4.178823107172061, "learning_rate": 7.133664881792739e-06, "loss": 0.1785, "step": 22780 }, { "epoch": 1.84, "grad_norm": 6.686026437573051, "learning_rate": 7.116621702127524e-06, "loss": 0.1869, "step": 22800 }, { "epoch": 1.84, "grad_norm": 4.2403070205517, "learning_rate": 7.099587658111403e-06, "loss": 0.1673, "step": 22820 }, { "epoch": 1.84, "grad_norm": 3.965564681031053, "learning_rate": 7.082562803696054e-06, "loss": 0.1606, "step": 22840 }, { "epoch": 1.85, "grad_norm": 7.1312004014078205, "learning_rate": 7.065547192804044e-06, "loss": 0.1833, "step": 22860 }, { "epoch": 1.85, "grad_norm": 6.361412078237849, "learning_rate": 7.048540879328677e-06, "loss": 0.176, "step": 22880 }, { "epoch": 1.85, "grad_norm": 7.594302059247436, "learning_rate": 7.031543917133794e-06, "loss": 0.1622, "step": 22900 }, { "epoch": 1.85, "grad_norm": 10.096625612296556, "learning_rate": 7.014556360053627e-06, "loss": 0.1875, "step": 22920 }, { "epoch": 1.85, "grad_norm": 5.429860679005328, "learning_rate": 6.997578261892612e-06, "loss": 0.1742, "step": 22940 }, { "epoch": 1.85, "grad_norm": 4.64963548113418, "learning_rate": 6.980609676425238e-06, "loss": 0.1645, "step": 22960 }, { "epoch": 1.85, "grad_norm": 7.328249932303926, "learning_rate": 6.963650657395851e-06, "loss": 0.1653, "step": 22980 }, { "epoch": 1.86, "grad_norm": 4.797310178817107, "learning_rate": 6.946701258518505e-06, "loss": 0.1718, "step": 23000 }, { "epoch": 1.86, "grad_norm": 5.93397915500725, "learning_rate": 6.929761533476782e-06, "loss": 0.171, "step": 23020 }, { "epoch": 1.86, "grad_norm": 4.493879394366119, "learning_rate": 6.912831535923627e-06, "loss": 0.1596, "step": 23040 }, { "epoch": 1.86, "grad_norm": 4.5526438818226564, "learning_rate": 6.89591131948117e-06, "loss": 0.1477, "step": 23060 }, { "epoch": 1.86, "grad_norm": 5.57770560261374, "learning_rate": 6.879000937740566e-06, "loss": 0.1911, "step": 23080 }, { "epoch": 1.86, "grad_norm": 5.825009574391105, "learning_rate": 6.862100444261819e-06, "loss": 0.1768, "step": 23100 }, { "epoch": 1.87, "grad_norm": 5.136029725948361, "learning_rate": 6.845209892573611e-06, "loss": 0.1863, "step": 23120 }, { "epoch": 1.87, "grad_norm": 7.989879882826286, "learning_rate": 6.828329336173145e-06, "loss": 0.1763, "step": 23140 }, { "epoch": 1.87, "grad_norm": 6.65685803139121, "learning_rate": 6.8114588285259576e-06, "loss": 0.1755, "step": 23160 }, { "epoch": 1.87, "grad_norm": 9.26693783187225, "learning_rate": 6.794598423065758e-06, "loss": 0.176, "step": 23180 }, { "epoch": 1.87, "grad_norm": 3.574898543618821, "learning_rate": 6.7777481731942616e-06, "loss": 0.1858, "step": 23200 }, { "epoch": 1.87, "grad_norm": 4.831086963108529, "learning_rate": 6.760908132281021e-06, "loss": 0.1796, "step": 23220 }, { "epoch": 1.88, "grad_norm": 7.68773272402437, "learning_rate": 6.744078353663247e-06, "loss": 0.1703, "step": 23240 }, { "epoch": 1.88, "grad_norm": 9.370545816880146, "learning_rate": 6.727258890645652e-06, "loss": 0.18, "step": 23260 }, { "epoch": 1.88, "grad_norm": 9.159176720015646, "learning_rate": 6.710449796500274e-06, "loss": 0.1716, "step": 23280 }, { "epoch": 1.88, "grad_norm": 5.362130304855524, "learning_rate": 6.693651124466311e-06, "loss": 0.168, "step": 23300 }, { "epoch": 1.88, "grad_norm": 4.870253771176025, "learning_rate": 6.676862927749953e-06, "loss": 0.2008, "step": 23320 }, { "epoch": 1.88, "grad_norm": 4.029937293840335, "learning_rate": 6.6600852595242075e-06, "loss": 0.1735, "step": 23340 }, { "epoch": 1.89, "grad_norm": 5.6204264379241025, "learning_rate": 6.643318172928737e-06, "loss": 0.1707, "step": 23360 }, { "epoch": 1.89, "grad_norm": 9.703306236355932, "learning_rate": 6.626561721069688e-06, "loss": 0.1599, "step": 23380 }, { "epoch": 1.89, "grad_norm": 7.347162760458088, "learning_rate": 6.609815957019527e-06, "loss": 0.1703, "step": 23400 }, { "epoch": 1.89, "grad_norm": 12.113550817300446, "learning_rate": 6.593080933816866e-06, "loss": 0.1784, "step": 23420 }, { "epoch": 1.89, "grad_norm": 3.3783362111733486, "learning_rate": 6.576356704466297e-06, "loss": 0.1641, "step": 23440 }, { "epoch": 1.89, "grad_norm": 4.068637613130167, "learning_rate": 6.5596433219382285e-06, "loss": 0.1436, "step": 23460 }, { "epoch": 1.9, "grad_norm": 6.015225800979343, "learning_rate": 6.542940839168712e-06, "loss": 0.1975, "step": 23480 }, { "epoch": 1.9, "grad_norm": 7.2342521376918585, "learning_rate": 6.5262493090592715e-06, "loss": 0.1882, "step": 23500 }, { "epoch": 1.9, "grad_norm": 7.619431760731591, "learning_rate": 6.509568784476753e-06, "loss": 0.1743, "step": 23520 }, { "epoch": 1.9, "grad_norm": 4.213072170259656, "learning_rate": 6.4928993182531345e-06, "loss": 0.1576, "step": 23540 }, { "epoch": 1.9, "grad_norm": 8.226101564425411, "learning_rate": 6.476240963185369e-06, "loss": 0.1565, "step": 23560 }, { "epoch": 1.9, "grad_norm": 8.224634189065512, "learning_rate": 6.459593772035225e-06, "loss": 0.1835, "step": 23580 }, { "epoch": 1.91, "grad_norm": 6.247185035023087, "learning_rate": 6.442957797529104e-06, "loss": 0.1736, "step": 23600 }, { "epoch": 1.91, "grad_norm": 5.9810785123502965, "learning_rate": 6.426333092357886e-06, "loss": 0.1615, "step": 23620 }, { "epoch": 1.91, "grad_norm": 11.84481798656128, "learning_rate": 6.409719709176755e-06, "loss": 0.1888, "step": 23640 }, { "epoch": 1.91, "grad_norm": 7.506850102533956, "learning_rate": 6.393117700605034e-06, "loss": 0.1963, "step": 23660 }, { "epoch": 1.91, "grad_norm": 7.776460084859693, "learning_rate": 6.376527119226023e-06, "loss": 0.1485, "step": 23680 }, { "epoch": 1.91, "grad_norm": 6.3180091914164125, "learning_rate": 6.359948017586827e-06, "loss": 0.1816, "step": 23700 }, { "epoch": 1.91, "grad_norm": 11.20267927275362, "learning_rate": 6.343380448198188e-06, "loss": 0.1652, "step": 23720 }, { "epoch": 1.92, "grad_norm": 6.037469216973155, "learning_rate": 6.326824463534336e-06, "loss": 0.1725, "step": 23740 }, { "epoch": 1.92, "grad_norm": 6.261846682645075, "learning_rate": 6.310280116032791e-06, "loss": 0.1538, "step": 23760 }, { "epoch": 1.92, "grad_norm": 8.59123942376336, "learning_rate": 6.293747458094223e-06, "loss": 0.1737, "step": 23780 }, { "epoch": 1.92, "grad_norm": 10.500292092756426, "learning_rate": 6.277226542082278e-06, "loss": 0.1921, "step": 23800 }, { "epoch": 1.92, "grad_norm": 4.477032435385099, "learning_rate": 6.260717420323409e-06, "loss": 0.1721, "step": 23820 }, { "epoch": 1.92, "grad_norm": 5.503172566318245, "learning_rate": 6.244220145106716e-06, "loss": 0.1668, "step": 23840 }, { "epoch": 1.93, "grad_norm": 13.937620974986471, "learning_rate": 6.227734768683779e-06, "loss": 0.1721, "step": 23860 }, { "epoch": 1.93, "grad_norm": 4.6367027070331135, "learning_rate": 6.211261343268485e-06, "loss": 0.1765, "step": 23880 }, { "epoch": 1.93, "grad_norm": 5.804661212928365, "learning_rate": 6.194799921036879e-06, "loss": 0.1706, "step": 23900 }, { "epoch": 1.93, "grad_norm": 5.838975230670599, "learning_rate": 6.178350554126979e-06, "loss": 0.1684, "step": 23920 }, { "epoch": 1.93, "grad_norm": 6.638228830097108, "learning_rate": 6.161913294638621e-06, "loss": 0.1848, "step": 23940 }, { "epoch": 1.93, "grad_norm": 5.656968213422785, "learning_rate": 6.1454881946333e-06, "loss": 0.1674, "step": 23960 }, { "epoch": 1.94, "grad_norm": 10.46109722035461, "learning_rate": 6.1290753061339925e-06, "loss": 0.1631, "step": 23980 }, { "epoch": 1.94, "grad_norm": 7.0280095071831425, "learning_rate": 6.112674681124998e-06, "loss": 0.1759, "step": 24000 }, { "epoch": 1.94, "grad_norm": 7.401073079016803, "learning_rate": 6.09628637155178e-06, "loss": 0.185, "step": 24020 }, { "epoch": 1.94, "grad_norm": 7.45583155210073, "learning_rate": 6.079910429320789e-06, "loss": 0.1907, "step": 24040 }, { "epoch": 1.94, "grad_norm": 5.361896066514213, "learning_rate": 6.063546906299304e-06, "loss": 0.1661, "step": 24060 }, { "epoch": 1.94, "grad_norm": 4.390297656906118, "learning_rate": 6.047195854315274e-06, "loss": 0.161, "step": 24080 }, { "epoch": 1.95, "grad_norm": 7.3569152997891365, "learning_rate": 6.030857325157148e-06, "loss": 0.183, "step": 24100 }, { "epoch": 1.95, "grad_norm": 9.276295993466274, "learning_rate": 6.014531370573706e-06, "loss": 0.1585, "step": 24120 }, { "epoch": 1.95, "grad_norm": 6.708581228789846, "learning_rate": 5.99821804227391e-06, "loss": 0.1923, "step": 24140 }, { "epoch": 1.95, "grad_norm": 4.565214003093718, "learning_rate": 5.981917391926716e-06, "loss": 0.1618, "step": 24160 }, { "epoch": 1.95, "grad_norm": 4.958953220289099, "learning_rate": 5.9656294711609455e-06, "loss": 0.1766, "step": 24180 }, { "epoch": 1.95, "grad_norm": 8.765162382650892, "learning_rate": 5.949354331565087e-06, "loss": 0.179, "step": 24200 }, { "epoch": 1.96, "grad_norm": 9.338864277112508, "learning_rate": 5.93309202468715e-06, "loss": 0.1772, "step": 24220 }, { "epoch": 1.96, "grad_norm": 6.455990799227259, "learning_rate": 5.916842602034503e-06, "loss": 0.1764, "step": 24240 }, { "epoch": 1.96, "grad_norm": 5.02536557514161, "learning_rate": 5.900606115073703e-06, "loss": 0.1834, "step": 24260 }, { "epoch": 1.96, "grad_norm": 5.6042058818064335, "learning_rate": 5.884382615230334e-06, "loss": 0.1667, "step": 24280 }, { "epoch": 1.96, "grad_norm": 3.5189123085762195, "learning_rate": 5.8681721538888544e-06, "loss": 0.1572, "step": 24300 }, { "epoch": 1.96, "grad_norm": 5.6992692847099855, "learning_rate": 5.85197478239242e-06, "loss": 0.1953, "step": 24320 }, { "epoch": 1.96, "grad_norm": 7.984601221033869, "learning_rate": 5.835790552042726e-06, "loss": 0.1821, "step": 24340 }, { "epoch": 1.97, "grad_norm": 5.326599797739027, "learning_rate": 5.819619514099847e-06, "loss": 0.1899, "step": 24360 }, { "epoch": 1.97, "grad_norm": 5.785676158799588, "learning_rate": 5.80346171978208e-06, "loss": 0.1655, "step": 24380 }, { "epoch": 1.97, "grad_norm": 5.66486255125483, "learning_rate": 5.78731722026576e-06, "loss": 0.1787, "step": 24400 }, { "epoch": 1.97, "grad_norm": 3.0399399302219625, "learning_rate": 5.771186066685136e-06, "loss": 0.1913, "step": 24420 }, { "epoch": 1.97, "grad_norm": 4.538207717223615, "learning_rate": 5.755068310132162e-06, "loss": 0.1486, "step": 24440 }, { "epoch": 1.97, "grad_norm": 8.673712678315818, "learning_rate": 5.738964001656382e-06, "loss": 0.1561, "step": 24460 }, { "epoch": 1.98, "grad_norm": 3.9322192187509035, "learning_rate": 5.722873192264731e-06, "loss": 0.1594, "step": 24480 }, { "epoch": 1.98, "grad_norm": 5.362419487053112, "learning_rate": 5.706795932921395e-06, "loss": 0.1769, "step": 24500 }, { "epoch": 1.98, "grad_norm": 5.243766171745307, "learning_rate": 5.690732274547639e-06, "loss": 0.1674, "step": 24520 }, { "epoch": 1.98, "grad_norm": 6.49132016995219, "learning_rate": 5.674682268021655e-06, "loss": 0.1795, "step": 24540 }, { "epoch": 1.98, "grad_norm": 8.23819975362725, "learning_rate": 5.658645964178398e-06, "loss": 0.1739, "step": 24560 }, { "epoch": 1.98, "grad_norm": 6.291038363918079, "learning_rate": 5.642623413809408e-06, "loss": 0.1574, "step": 24580 }, { "epoch": 1.99, "grad_norm": 5.366637178107015, "learning_rate": 5.626614667662681e-06, "loss": 0.1694, "step": 24600 }, { "epoch": 1.99, "grad_norm": 9.263601664358115, "learning_rate": 5.610619776442482e-06, "loss": 0.1928, "step": 24620 }, { "epoch": 1.99, "grad_norm": 6.177003530809166, "learning_rate": 5.5946387908091995e-06, "loss": 0.1578, "step": 24640 }, { "epoch": 1.99, "grad_norm": 4.122671452664669, "learning_rate": 5.5786717613791675e-06, "loss": 0.1652, "step": 24660 }, { "epoch": 1.99, "grad_norm": 8.598868326647608, "learning_rate": 5.562718738724532e-06, "loss": 0.1829, "step": 24680 }, { "epoch": 1.99, "grad_norm": 5.201437186659346, "learning_rate": 5.54677977337306e-06, "loss": 0.1948, "step": 24700 }, { "epoch": 2.0, "grad_norm": 5.546180531185139, "learning_rate": 5.530854915808009e-06, "loss": 0.1632, "step": 24720 }, { "epoch": 2.0, "grad_norm": 5.599419815798738, "learning_rate": 5.514944216467942e-06, "loss": 0.173, "step": 24740 }, { "epoch": 2.0, "grad_norm": 6.685290390475231, "learning_rate": 5.4990477257465854e-06, "loss": 0.1767, "step": 24760 }, { "epoch": 2.0, "grad_norm": 8.819487702938599, "learning_rate": 5.483165493992667e-06, "loss": 0.1491, "step": 24780 }, { "epoch": 2.0, "grad_norm": 3.9679892704495603, "learning_rate": 5.467297571509735e-06, "loss": 0.1422, "step": 24800 }, { "epoch": 2.0, "grad_norm": 6.793517658152576, "learning_rate": 5.451444008556042e-06, "loss": 0.1183, "step": 24820 }, { "epoch": 2.01, "grad_norm": 5.961884869528241, "learning_rate": 5.435604855344332e-06, "loss": 0.1284, "step": 24840 }, { "epoch": 2.01, "grad_norm": 7.746334808953981, "learning_rate": 5.419780162041731e-06, "loss": 0.1081, "step": 24860 }, { "epoch": 2.01, "grad_norm": 4.4474985013962485, "learning_rate": 5.4039699787695536e-06, "loss": 0.1347, "step": 24880 }, { "epoch": 2.01, "grad_norm": 5.856732802707876, "learning_rate": 5.388174355603166e-06, "loss": 0.1545, "step": 24900 }, { "epoch": 2.01, "grad_norm": 4.924020644664483, "learning_rate": 5.372393342571808e-06, "loss": 0.1499, "step": 24920 }, { "epoch": 2.01, "grad_norm": 5.682977240510656, "learning_rate": 5.356626989658453e-06, "loss": 0.1246, "step": 24940 }, { "epoch": 2.01, "grad_norm": 5.859264972925292, "learning_rate": 5.340875346799646e-06, "loss": 0.1305, "step": 24960 }, { "epoch": 2.02, "grad_norm": 4.623581539899941, "learning_rate": 5.325138463885324e-06, "loss": 0.1264, "step": 24980 }, { "epoch": 2.02, "grad_norm": 7.01967036937822, "learning_rate": 5.309416390758695e-06, "loss": 0.1069, "step": 25000 }, { "epoch": 2.02, "grad_norm": 4.610942199146199, "learning_rate": 5.293709177216038e-06, "loss": 0.1276, "step": 25020 }, { "epoch": 2.02, "grad_norm": 5.183424486215351, "learning_rate": 5.2780168730065965e-06, "loss": 0.1431, "step": 25040 }, { "epoch": 2.02, "grad_norm": 7.406827107410586, "learning_rate": 5.262339527832362e-06, "loss": 0.1424, "step": 25060 }, { "epoch": 2.02, "grad_norm": 4.722117857932919, "learning_rate": 5.2466771913479705e-06, "loss": 0.0988, "step": 25080 }, { "epoch": 2.03, "grad_norm": 6.302812451096825, "learning_rate": 5.2310299131605025e-06, "loss": 0.1376, "step": 25100 }, { "epoch": 2.03, "grad_norm": 7.078472608910602, "learning_rate": 5.215397742829359e-06, "loss": 0.144, "step": 25120 }, { "epoch": 2.03, "grad_norm": 7.602425850654334, "learning_rate": 5.199780729866077e-06, "loss": 0.1272, "step": 25140 }, { "epoch": 2.03, "grad_norm": 8.529011303397162, "learning_rate": 5.184178923734198e-06, "loss": 0.1256, "step": 25160 }, { "epoch": 2.03, "grad_norm": 4.877164506895307, "learning_rate": 5.168592373849094e-06, "loss": 0.1376, "step": 25180 }, { "epoch": 2.03, "grad_norm": 7.728895184323803, "learning_rate": 5.153021129577811e-06, "loss": 0.1214, "step": 25200 }, { "epoch": 2.04, "grad_norm": 5.393362925620798, "learning_rate": 5.1374652402389315e-06, "loss": 0.1292, "step": 25220 }, { "epoch": 2.04, "grad_norm": 5.354829066920577, "learning_rate": 5.121924755102384e-06, "loss": 0.1431, "step": 25240 }, { "epoch": 2.04, "grad_norm": 7.3582267756395145, "learning_rate": 5.10639972338933e-06, "loss": 0.1188, "step": 25260 }, { "epoch": 2.04, "grad_norm": 7.328519050073156, "learning_rate": 5.090890194271968e-06, "loss": 0.1432, "step": 25280 }, { "epoch": 2.04, "grad_norm": 3.806689728390327, "learning_rate": 5.075396216873406e-06, "loss": 0.1443, "step": 25300 }, { "epoch": 2.04, "grad_norm": 6.760581305977065, "learning_rate": 5.059917840267488e-06, "loss": 0.1266, "step": 25320 }, { "epoch": 2.05, "grad_norm": 6.062860851756675, "learning_rate": 5.0444551134786505e-06, "loss": 0.1293, "step": 25340 }, { "epoch": 2.05, "grad_norm": 6.121797308202456, "learning_rate": 5.0290080854817644e-06, "loss": 0.1272, "step": 25360 }, { "epoch": 2.05, "grad_norm": 17.765817391218594, "learning_rate": 5.013576805201969e-06, "loss": 0.1222, "step": 25380 }, { "epoch": 2.05, "grad_norm": 8.588731666870714, "learning_rate": 4.998161321514542e-06, "loss": 0.145, "step": 25400 }, { "epoch": 2.05, "grad_norm": 6.845902892502509, "learning_rate": 4.982761683244707e-06, "loss": 0.143, "step": 25420 }, { "epoch": 2.05, "grad_norm": 6.001793593307747, "learning_rate": 4.967377939167522e-06, "loss": 0.1154, "step": 25440 }, { "epoch": 2.06, "grad_norm": 6.5641471612161135, "learning_rate": 4.952010138007682e-06, "loss": 0.1172, "step": 25460 }, { "epoch": 2.06, "grad_norm": 25.650122260384183, "learning_rate": 4.936658328439415e-06, "loss": 0.1182, "step": 25480 }, { "epoch": 2.06, "grad_norm": 5.092025183975739, "learning_rate": 4.92132255908627e-06, "loss": 0.1304, "step": 25500 }, { "epoch": 2.06, "grad_norm": 5.661300848835485, "learning_rate": 4.906002878521012e-06, "loss": 0.1212, "step": 25520 }, { "epoch": 2.06, "grad_norm": 5.880854510023727, "learning_rate": 4.8906993352654324e-06, "loss": 0.1593, "step": 25540 }, { "epoch": 2.06, "grad_norm": 6.493643660496159, "learning_rate": 4.875411977790225e-06, "loss": 0.1266, "step": 25560 }, { "epoch": 2.06, "grad_norm": 5.831800677269783, "learning_rate": 4.860140854514814e-06, "loss": 0.1241, "step": 25580 }, { "epoch": 2.07, "grad_norm": 3.2429436594691574, "learning_rate": 4.8448860138072e-06, "loss": 0.1331, "step": 25600 }, { "epoch": 2.07, "grad_norm": 6.416709437372251, "learning_rate": 4.829647503983822e-06, "loss": 0.1093, "step": 25620 }, { "epoch": 2.07, "grad_norm": 5.959547688263065, "learning_rate": 4.81442537330938e-06, "loss": 0.1256, "step": 25640 }, { "epoch": 2.07, "grad_norm": 4.681823548653095, "learning_rate": 4.799219669996716e-06, "loss": 0.1459, "step": 25660 }, { "epoch": 2.07, "grad_norm": 3.449801403355737, "learning_rate": 4.784030442206621e-06, "loss": 0.1257, "step": 25680 }, { "epoch": 2.07, "grad_norm": 7.289045491254941, "learning_rate": 4.768857738047723e-06, "loss": 0.124, "step": 25700 }, { "epoch": 2.08, "grad_norm": 6.517132701291285, "learning_rate": 4.753701605576295e-06, "loss": 0.1453, "step": 25720 }, { "epoch": 2.08, "grad_norm": 7.181713926570433, "learning_rate": 4.73856209279614e-06, "loss": 0.1047, "step": 25740 }, { "epoch": 2.08, "grad_norm": 5.011428931665478, "learning_rate": 4.723439247658417e-06, "loss": 0.134, "step": 25760 }, { "epoch": 2.08, "grad_norm": 4.972723565616148, "learning_rate": 4.708333118061484e-06, "loss": 0.1445, "step": 25780 }, { "epoch": 2.08, "grad_norm": 4.810280027743967, "learning_rate": 4.693243751850772e-06, "loss": 0.1121, "step": 25800 }, { "epoch": 2.08, "grad_norm": 5.014039927021972, "learning_rate": 4.678171196818602e-06, "loss": 0.1349, "step": 25820 }, { "epoch": 2.09, "grad_norm": 5.744956084798192, "learning_rate": 4.663115500704064e-06, "loss": 0.1376, "step": 25840 }, { "epoch": 2.09, "grad_norm": 7.060959099307941, "learning_rate": 4.648076711192836e-06, "loss": 0.1538, "step": 25860 }, { "epoch": 2.09, "grad_norm": 6.671974485133576, "learning_rate": 4.633054875917063e-06, "loss": 0.1311, "step": 25880 }, { "epoch": 2.09, "grad_norm": 5.316396025144006, "learning_rate": 4.618050042455172e-06, "loss": 0.135, "step": 25900 }, { "epoch": 2.09, "grad_norm": 5.775031627221634, "learning_rate": 4.603062258331763e-06, "loss": 0.138, "step": 25920 }, { "epoch": 2.09, "grad_norm": 7.608587141981036, "learning_rate": 4.588091571017425e-06, "loss": 0.128, "step": 25940 }, { "epoch": 2.1, "grad_norm": 5.727304199814848, "learning_rate": 4.573138027928591e-06, "loss": 0.116, "step": 25960 }, { "epoch": 2.1, "grad_norm": 6.746801678257794, "learning_rate": 4.558201676427404e-06, "loss": 0.1372, "step": 25980 }, { "epoch": 2.1, "grad_norm": 4.725263735560743, "learning_rate": 4.543282563821544e-06, "loss": 0.1331, "step": 26000 }, { "epoch": 2.1, "grad_norm": 5.698041599538245, "learning_rate": 4.528380737364105e-06, "loss": 0.1308, "step": 26020 }, { "epoch": 2.1, "grad_norm": 5.36845229792286, "learning_rate": 4.513496244253417e-06, "loss": 0.1137, "step": 26040 }, { "epoch": 2.1, "grad_norm": 6.544772372677009, "learning_rate": 4.498629131632924e-06, "loss": 0.1056, "step": 26060 }, { "epoch": 2.11, "grad_norm": 7.290753756096228, "learning_rate": 4.483779446591005e-06, "loss": 0.1094, "step": 26080 }, { "epoch": 2.11, "grad_norm": 8.144050682764536, "learning_rate": 4.4689472361608545e-06, "loss": 0.143, "step": 26100 }, { "epoch": 2.11, "grad_norm": 8.215468763372733, "learning_rate": 4.454132547320319e-06, "loss": 0.1114, "step": 26120 }, { "epoch": 2.11, "grad_norm": 7.589002593072822, "learning_rate": 4.439335426991738e-06, "loss": 0.1458, "step": 26140 }, { "epoch": 2.11, "grad_norm": 6.393883058835871, "learning_rate": 4.42455592204182e-06, "loss": 0.1242, "step": 26160 }, { "epoch": 2.11, "grad_norm": 4.988324839521765, "learning_rate": 4.409794079281468e-06, "loss": 0.1282, "step": 26180 }, { "epoch": 2.11, "grad_norm": 7.269064468574413, "learning_rate": 4.395049945465658e-06, "loss": 0.1258, "step": 26200 }, { "epoch": 2.12, "grad_norm": 7.401597247566605, "learning_rate": 4.380323567293261e-06, "loss": 0.1179, "step": 26220 }, { "epoch": 2.12, "grad_norm": 5.335445264059175, "learning_rate": 4.365614991406925e-06, "loss": 0.1232, "step": 26240 }, { "epoch": 2.12, "grad_norm": 4.539039808732008, "learning_rate": 4.3509242643929e-06, "loss": 0.115, "step": 26260 }, { "epoch": 2.12, "grad_norm": 7.941645943720367, "learning_rate": 4.3362514327809156e-06, "loss": 0.1308, "step": 26280 }, { "epoch": 2.12, "grad_norm": 5.886590117104053, "learning_rate": 4.321596543044017e-06, "loss": 0.1237, "step": 26300 }, { "epoch": 2.12, "grad_norm": 3.3739418914259254, "learning_rate": 4.306959641598416e-06, "loss": 0.1164, "step": 26320 }, { "epoch": 2.13, "grad_norm": 8.292547386845971, "learning_rate": 4.292340774803359e-06, "loss": 0.1231, "step": 26340 }, { "epoch": 2.13, "grad_norm": 2.7709714429657937, "learning_rate": 4.277739988960969e-06, "loss": 0.0971, "step": 26360 }, { "epoch": 2.13, "grad_norm": 7.052957214874731, "learning_rate": 4.263157330316105e-06, "loss": 0.1435, "step": 26380 }, { "epoch": 2.13, "grad_norm": 5.680082745553791, "learning_rate": 4.248592845056201e-06, "loss": 0.1457, "step": 26400 }, { "epoch": 2.13, "grad_norm": 5.633045122948842, "learning_rate": 4.234046579311143e-06, "loss": 0.1176, "step": 26420 }, { "epoch": 2.13, "grad_norm": 4.486149134571564, "learning_rate": 4.219518579153098e-06, "loss": 0.115, "step": 26440 }, { "epoch": 2.14, "grad_norm": 8.153188781371831, "learning_rate": 4.205008890596397e-06, "loss": 0.1387, "step": 26460 }, { "epoch": 2.14, "grad_norm": 8.77221692937179, "learning_rate": 4.190517559597355e-06, "loss": 0.1268, "step": 26480 }, { "epoch": 2.14, "grad_norm": 4.636203701912266, "learning_rate": 4.176044632054157e-06, "loss": 0.1387, "step": 26500 }, { "epoch": 2.14, "grad_norm": 5.701122695381408, "learning_rate": 4.161590153806698e-06, "loss": 0.1189, "step": 26520 }, { "epoch": 2.14, "grad_norm": 5.713259202853844, "learning_rate": 4.147154170636426e-06, "loss": 0.1321, "step": 26540 }, { "epoch": 2.14, "grad_norm": 3.4565496939054685, "learning_rate": 4.132736728266227e-06, "loss": 0.1009, "step": 26560 }, { "epoch": 2.15, "grad_norm": 2.531735314833725, "learning_rate": 4.118337872360249e-06, "loss": 0.1087, "step": 26580 }, { "epoch": 2.15, "grad_norm": 3.639938355227791, "learning_rate": 4.103957648523783e-06, "loss": 0.1269, "step": 26600 }, { "epoch": 2.15, "grad_norm": 4.32306427607865, "learning_rate": 4.089596102303094e-06, "loss": 0.1226, "step": 26620 }, { "epoch": 2.15, "grad_norm": 5.21603012020097, "learning_rate": 4.075253279185303e-06, "loss": 0.1189, "step": 26640 }, { "epoch": 2.15, "grad_norm": 5.90185997303879, "learning_rate": 4.0609292245982175e-06, "loss": 0.1022, "step": 26660 }, { "epoch": 2.15, "grad_norm": 8.53068792147352, "learning_rate": 4.04662398391021e-06, "loss": 0.1676, "step": 26680 }, { "epoch": 2.16, "grad_norm": 8.980817909969629, "learning_rate": 4.032337602430062e-06, "loss": 0.1267, "step": 26700 }, { "epoch": 2.16, "grad_norm": 7.293001703025444, "learning_rate": 4.0180701254068135e-06, "loss": 0.1334, "step": 26720 }, { "epoch": 2.16, "grad_norm": 10.524039950043766, "learning_rate": 4.003821598029641e-06, "loss": 0.1314, "step": 26740 }, { "epoch": 2.16, "grad_norm": 5.740696221638179, "learning_rate": 3.98959206542769e-06, "loss": 0.1211, "step": 26760 }, { "epoch": 2.16, "grad_norm": 5.116715224508855, "learning_rate": 3.975381572669955e-06, "loss": 0.1198, "step": 26780 }, { "epoch": 2.16, "grad_norm": 3.602629331852247, "learning_rate": 3.961190164765121e-06, "loss": 0.1193, "step": 26800 }, { "epoch": 2.16, "grad_norm": 4.821156036909256, "learning_rate": 3.94701788666143e-06, "loss": 0.1547, "step": 26820 }, { "epoch": 2.17, "grad_norm": 5.512641942971287, "learning_rate": 3.932864783246522e-06, "loss": 0.1358, "step": 26840 }, { "epoch": 2.17, "grad_norm": 10.076917851032867, "learning_rate": 3.9187308993473214e-06, "loss": 0.1287, "step": 26860 }, { "epoch": 2.17, "grad_norm": 5.147003761445771, "learning_rate": 3.9046162797298656e-06, "loss": 0.1375, "step": 26880 }, { "epoch": 2.17, "grad_norm": 3.9350970974373385, "learning_rate": 3.8905209690991854e-06, "loss": 0.1081, "step": 26900 }, { "epoch": 2.17, "grad_norm": 7.846989787062407, "learning_rate": 3.8764450120991554e-06, "loss": 0.1283, "step": 26920 }, { "epoch": 2.17, "grad_norm": 6.025181064021898, "learning_rate": 3.862388453312338e-06, "loss": 0.1385, "step": 26940 }, { "epoch": 2.18, "grad_norm": 6.371635895752135, "learning_rate": 3.848351337259878e-06, "loss": 0.123, "step": 26960 }, { "epoch": 2.18, "grad_norm": 7.997494305164138, "learning_rate": 3.8343337084013155e-06, "loss": 0.1363, "step": 26980 }, { "epoch": 2.18, "grad_norm": 8.394624406910976, "learning_rate": 3.82033561113449e-06, "loss": 0.1383, "step": 27000 }, { "epoch": 2.18, "grad_norm": 5.705894153275095, "learning_rate": 3.8063570897953637e-06, "loss": 0.1245, "step": 27020 }, { "epoch": 2.18, "grad_norm": 7.461343963895418, "learning_rate": 3.792398188657907e-06, "loss": 0.1203, "step": 27040 }, { "epoch": 2.18, "grad_norm": 4.306592350211294, "learning_rate": 3.778458951933937e-06, "loss": 0.1074, "step": 27060 }, { "epoch": 2.19, "grad_norm": 8.397226475187265, "learning_rate": 3.7645394237729973e-06, "loss": 0.1374, "step": 27080 }, { "epoch": 2.19, "grad_norm": 4.981183504165737, "learning_rate": 3.7506396482622088e-06, "loss": 0.1239, "step": 27100 }, { "epoch": 2.19, "grad_norm": 6.457305366988544, "learning_rate": 3.7367596694261195e-06, "loss": 0.1034, "step": 27120 }, { "epoch": 2.19, "grad_norm": 7.391037911298385, "learning_rate": 3.72289953122659e-06, "loss": 0.1243, "step": 27140 }, { "epoch": 2.19, "grad_norm": 4.8231637890027, "learning_rate": 3.709059277562625e-06, "loss": 0.1347, "step": 27160 }, { "epoch": 2.19, "grad_norm": 6.7727869902812285, "learning_rate": 3.6952389522702662e-06, "loss": 0.1392, "step": 27180 }, { "epoch": 2.2, "grad_norm": 5.219332177131873, "learning_rate": 3.6814385991224144e-06, "loss": 0.1439, "step": 27200 }, { "epoch": 2.2, "grad_norm": 8.34825349919469, "learning_rate": 3.6676582618287414e-06, "loss": 0.1265, "step": 27220 }, { "epoch": 2.2, "grad_norm": 5.948357863578263, "learning_rate": 3.6538979840354955e-06, "loss": 0.1234, "step": 27240 }, { "epoch": 2.2, "grad_norm": 6.2425581305214495, "learning_rate": 3.640157809325408e-06, "loss": 0.128, "step": 27260 }, { "epoch": 2.2, "grad_norm": 8.50504377727242, "learning_rate": 3.626437781217535e-06, "loss": 0.1223, "step": 27280 }, { "epoch": 2.2, "grad_norm": 4.36693745615525, "learning_rate": 3.6127379431671135e-06, "loss": 0.1225, "step": 27300 }, { "epoch": 2.21, "grad_norm": 6.291226942178153, "learning_rate": 3.5990583385654473e-06, "loss": 0.13, "step": 27320 }, { "epoch": 2.21, "grad_norm": 3.743285844206663, "learning_rate": 3.5853990107397414e-06, "loss": 0.1136, "step": 27340 }, { "epoch": 2.21, "grad_norm": 7.063395755642613, "learning_rate": 3.571760002952988e-06, "loss": 0.1288, "step": 27360 }, { "epoch": 2.21, "grad_norm": 10.67822112681253, "learning_rate": 3.558141358403814e-06, "loss": 0.1325, "step": 27380 }, { "epoch": 2.21, "grad_norm": 6.014674678893275, "learning_rate": 3.544543120226356e-06, "loss": 0.1193, "step": 27400 }, { "epoch": 2.21, "grad_norm": 6.383024363929499, "learning_rate": 3.5309653314901092e-06, "loss": 0.1314, "step": 27420 }, { "epoch": 2.22, "grad_norm": 8.488364389980548, "learning_rate": 3.5174080351998087e-06, "loss": 0.1294, "step": 27440 }, { "epoch": 2.22, "grad_norm": 7.5571459268249, "learning_rate": 3.503871274295283e-06, "loss": 0.1094, "step": 27460 }, { "epoch": 2.22, "grad_norm": 4.855325366824, "learning_rate": 3.490355091651309e-06, "loss": 0.121, "step": 27480 }, { "epoch": 2.22, "grad_norm": 2.8339656984330315, "learning_rate": 3.476859530077501e-06, "loss": 0.1168, "step": 27500 }, { "epoch": 2.22, "grad_norm": 3.880464576947814, "learning_rate": 3.4633846323181476e-06, "loss": 0.1183, "step": 27520 }, { "epoch": 2.22, "grad_norm": 7.142186901799362, "learning_rate": 3.4499304410521006e-06, "loss": 0.127, "step": 27540 }, { "epoch": 2.22, "grad_norm": 5.625606419405784, "learning_rate": 3.436496998892617e-06, "loss": 0.1167, "step": 27560 }, { "epoch": 2.23, "grad_norm": 5.769044931793202, "learning_rate": 3.4230843483872455e-06, "loss": 0.1237, "step": 27580 }, { "epoch": 2.23, "grad_norm": 5.109397211522344, "learning_rate": 3.409692532017673e-06, "loss": 0.1295, "step": 27600 }, { "epoch": 2.23, "grad_norm": 7.853734200980971, "learning_rate": 3.3963215921996088e-06, "loss": 0.1474, "step": 27620 }, { "epoch": 2.23, "grad_norm": 5.382800629579164, "learning_rate": 3.382971571282625e-06, "loss": 0.1245, "step": 27640 }, { "epoch": 2.23, "grad_norm": 7.162330942339086, "learning_rate": 3.369642511550058e-06, "loss": 0.1416, "step": 27660 }, { "epoch": 2.23, "grad_norm": 8.835213467102951, "learning_rate": 3.356334455218841e-06, "loss": 0.1257, "step": 27680 }, { "epoch": 2.24, "grad_norm": 5.964761250043419, "learning_rate": 3.343047444439381e-06, "loss": 0.1267, "step": 27700 }, { "epoch": 2.24, "grad_norm": 5.096884669815704, "learning_rate": 3.3297815212954384e-06, "loss": 0.119, "step": 27720 }, { "epoch": 2.24, "grad_norm": 8.767442157903723, "learning_rate": 3.31653672780397e-06, "loss": 0.1393, "step": 27740 }, { "epoch": 2.24, "grad_norm": 4.622895870474268, "learning_rate": 3.303313105915024e-06, "loss": 0.1315, "step": 27760 }, { "epoch": 2.24, "grad_norm": 7.017417220010313, "learning_rate": 3.2901106975115764e-06, "loss": 0.1308, "step": 27780 }, { "epoch": 2.24, "grad_norm": 6.904726438634829, "learning_rate": 3.2769295444094277e-06, "loss": 0.1326, "step": 27800 }, { "epoch": 2.25, "grad_norm": 4.224265103502957, "learning_rate": 3.2637696883570457e-06, "loss": 0.1316, "step": 27820 }, { "epoch": 2.25, "grad_norm": 8.01349169802627, "learning_rate": 3.2506311710354525e-06, "loss": 0.1206, "step": 27840 }, { "epoch": 2.25, "grad_norm": 7.347025428961614, "learning_rate": 3.2375140340580847e-06, "loss": 0.1208, "step": 27860 }, { "epoch": 2.25, "grad_norm": 7.678434159075267, "learning_rate": 3.2244183189706516e-06, "loss": 0.119, "step": 27880 }, { "epoch": 2.25, "grad_norm": 3.6564444465772703, "learning_rate": 3.2113440672510245e-06, "loss": 0.1187, "step": 27900 }, { "epoch": 2.25, "grad_norm": 6.942445933674398, "learning_rate": 3.1982913203090847e-06, "loss": 0.1228, "step": 27920 }, { "epoch": 2.26, "grad_norm": 8.134118699891573, "learning_rate": 3.185260119486611e-06, "loss": 0.1338, "step": 27940 }, { "epoch": 2.26, "grad_norm": 6.289165917725007, "learning_rate": 3.1722505060571298e-06, "loss": 0.131, "step": 27960 }, { "epoch": 2.26, "grad_norm": 7.956512748892634, "learning_rate": 3.1592625212258044e-06, "loss": 0.1257, "step": 27980 }, { "epoch": 2.26, "grad_norm": 7.91705982503564, "learning_rate": 3.1462962061292833e-06, "loss": 0.134, "step": 28000 }, { "epoch": 2.26, "grad_norm": 5.113434558315745, "learning_rate": 3.1333516018355874e-06, "loss": 0.1342, "step": 28020 }, { "epoch": 2.26, "grad_norm": 4.294035149979876, "learning_rate": 3.120428749343978e-06, "loss": 0.1055, "step": 28040 }, { "epoch": 2.27, "grad_norm": 5.284875652254274, "learning_rate": 3.1075276895848083e-06, "loss": 0.1522, "step": 28060 }, { "epoch": 2.27, "grad_norm": 4.564542339401451, "learning_rate": 3.094648463419418e-06, "loss": 0.108, "step": 28080 }, { "epoch": 2.27, "grad_norm": 3.330793144003339, "learning_rate": 3.081791111639994e-06, "loss": 0.1338, "step": 28100 }, { "epoch": 2.27, "grad_norm": 5.675958681844664, "learning_rate": 3.0689556749694386e-06, "loss": 0.1305, "step": 28120 }, { "epoch": 2.27, "grad_norm": 6.3779003053267855, "learning_rate": 3.0561421940612373e-06, "loss": 0.1402, "step": 28140 }, { "epoch": 2.27, "grad_norm": 7.164443210815206, "learning_rate": 3.043350709499347e-06, "loss": 0.1091, "step": 28160 }, { "epoch": 2.27, "grad_norm": 6.199435965200471, "learning_rate": 3.0305812617980413e-06, "loss": 0.114, "step": 28180 }, { "epoch": 2.28, "grad_norm": 6.507304269057784, "learning_rate": 3.0178338914018125e-06, "loss": 0.1146, "step": 28200 }, { "epoch": 2.28, "grad_norm": 7.487244249156909, "learning_rate": 3.0051086386852124e-06, "loss": 0.148, "step": 28220 }, { "epoch": 2.28, "grad_norm": 5.306922048688804, "learning_rate": 2.9924055439527522e-06, "loss": 0.1289, "step": 28240 }, { "epoch": 2.28, "grad_norm": 8.734149636515674, "learning_rate": 2.97972464743876e-06, "loss": 0.1325, "step": 28260 }, { "epoch": 2.28, "grad_norm": 8.860214166743736, "learning_rate": 2.9670659893072457e-06, "loss": 0.1228, "step": 28280 }, { "epoch": 2.28, "grad_norm": 7.3339474449802555, "learning_rate": 2.9544296096517992e-06, "loss": 0.1585, "step": 28300 }, { "epoch": 2.29, "grad_norm": 6.384028604887931, "learning_rate": 2.9418155484954342e-06, "loss": 0.1074, "step": 28320 }, { "epoch": 2.29, "grad_norm": 15.187132277569924, "learning_rate": 2.9292238457904887e-06, "loss": 0.1286, "step": 28340 }, { "epoch": 2.29, "grad_norm": 4.105032943983691, "learning_rate": 2.9166545414184704e-06, "loss": 0.1251, "step": 28360 }, { "epoch": 2.29, "grad_norm": 9.032081644910658, "learning_rate": 2.904107675189959e-06, "loss": 0.1529, "step": 28380 }, { "epoch": 2.29, "grad_norm": 5.9665443379455345, "learning_rate": 2.8915832868444547e-06, "loss": 0.1113, "step": 28400 }, { "epoch": 2.29, "grad_norm": 8.228193652094436, "learning_rate": 2.879081416050271e-06, "loss": 0.1236, "step": 28420 }, { "epoch": 2.3, "grad_norm": 9.204936602123944, "learning_rate": 2.866602102404405e-06, "loss": 0.1218, "step": 28440 }, { "epoch": 2.3, "grad_norm": 6.684327130460606, "learning_rate": 2.854145385432395e-06, "loss": 0.125, "step": 28460 }, { "epoch": 2.3, "grad_norm": 5.242441458313832, "learning_rate": 2.8417113045882273e-06, "loss": 0.1056, "step": 28480 }, { "epoch": 2.3, "grad_norm": 6.908042709563139, "learning_rate": 2.829299899254175e-06, "loss": 0.1338, "step": 28500 }, { "epoch": 2.3, "grad_norm": 7.139422359234334, "learning_rate": 2.816911208740706e-06, "loss": 0.1222, "step": 28520 }, { "epoch": 2.3, "grad_norm": 4.282490126046702, "learning_rate": 2.8045452722863335e-06, "loss": 0.1112, "step": 28540 }, { "epoch": 2.31, "grad_norm": 4.896308691877213, "learning_rate": 2.7922021290575154e-06, "loss": 0.1306, "step": 28560 }, { "epoch": 2.31, "grad_norm": 5.519497039105762, "learning_rate": 2.7798818181484986e-06, "loss": 0.115, "step": 28580 }, { "epoch": 2.31, "grad_norm": 6.446748774629239, "learning_rate": 2.76758437858123e-06, "loss": 0.1112, "step": 28600 }, { "epoch": 2.31, "grad_norm": 6.647073787412228, "learning_rate": 2.7553098493052043e-06, "loss": 0.1295, "step": 28620 }, { "epoch": 2.31, "grad_norm": 8.347887934752801, "learning_rate": 2.743058269197361e-06, "loss": 0.152, "step": 28640 }, { "epoch": 2.31, "grad_norm": 6.952959037257396, "learning_rate": 2.7308296770619526e-06, "loss": 0.1085, "step": 28660 }, { "epoch": 2.32, "grad_norm": 6.952771124246241, "learning_rate": 2.718624111630414e-06, "loss": 0.145, "step": 28680 }, { "epoch": 2.32, "grad_norm": 7.884357282124984, "learning_rate": 2.706441611561259e-06, "loss": 0.1288, "step": 28700 }, { "epoch": 2.32, "grad_norm": 6.953856834730615, "learning_rate": 2.6942822154399364e-06, "loss": 0.1175, "step": 28720 }, { "epoch": 2.32, "grad_norm": 5.6769079152108155, "learning_rate": 2.682145961778729e-06, "loss": 0.1161, "step": 28740 }, { "epoch": 2.32, "grad_norm": 7.772646149653579, "learning_rate": 2.6700328890166082e-06, "loss": 0.1373, "step": 28760 }, { "epoch": 2.32, "grad_norm": 6.269489266798685, "learning_rate": 2.6579430355191392e-06, "loss": 0.1318, "step": 28780 }, { "epoch": 2.32, "grad_norm": 8.611301324309055, "learning_rate": 2.6458764395783314e-06, "loss": 0.1254, "step": 28800 }, { "epoch": 2.33, "grad_norm": 3.918558002252436, "learning_rate": 2.6338331394125386e-06, "loss": 0.1299, "step": 28820 }, { "epoch": 2.33, "grad_norm": 6.545080990679989, "learning_rate": 2.6218131731663343e-06, "loss": 0.1623, "step": 28840 }, { "epoch": 2.33, "grad_norm": 7.750854937250521, "learning_rate": 2.6098165789103743e-06, "loss": 0.1421, "step": 28860 }, { "epoch": 2.33, "grad_norm": 7.169167134155887, "learning_rate": 2.5978433946413014e-06, "loss": 0.1273, "step": 28880 }, { "epoch": 2.33, "grad_norm": 5.786194372694674, "learning_rate": 2.5858936582816e-06, "loss": 0.1222, "step": 28900 }, { "epoch": 2.33, "grad_norm": 3.8269959322287512, "learning_rate": 2.5739674076795024e-06, "loss": 0.1225, "step": 28920 }, { "epoch": 2.34, "grad_norm": 7.014129275324685, "learning_rate": 2.5620646806088394e-06, "loss": 0.1101, "step": 28940 }, { "epoch": 2.34, "grad_norm": 5.8000351345708125, "learning_rate": 2.550185514768949e-06, "loss": 0.1191, "step": 28960 }, { "epoch": 2.34, "grad_norm": 2.8990500900399336, "learning_rate": 2.5383299477845387e-06, "loss": 0.1147, "step": 28980 }, { "epoch": 2.34, "grad_norm": 11.088067633066151, "learning_rate": 2.5264980172055704e-06, "loss": 0.1125, "step": 29000 }, { "epoch": 2.34, "grad_norm": 9.413832741620547, "learning_rate": 2.5146897605071486e-06, "loss": 0.1279, "step": 29020 }, { "epoch": 2.34, "grad_norm": 6.830754173272787, "learning_rate": 2.5029052150893846e-06, "loss": 0.1346, "step": 29040 }, { "epoch": 2.35, "grad_norm": 5.406928753001935, "learning_rate": 2.4911444182773035e-06, "loss": 0.1308, "step": 29060 }, { "epoch": 2.35, "grad_norm": 5.562205789050428, "learning_rate": 2.479407407320698e-06, "loss": 0.1593, "step": 29080 }, { "epoch": 2.35, "grad_norm": 5.745666197553677, "learning_rate": 2.467694219394037e-06, "loss": 0.1382, "step": 29100 }, { "epoch": 2.35, "grad_norm": 6.587771050741836, "learning_rate": 2.4560048915963234e-06, "loss": 0.1185, "step": 29120 }, { "epoch": 2.35, "grad_norm": 3.7207574635139715, "learning_rate": 2.4443394609509987e-06, "loss": 0.1354, "step": 29140 }, { "epoch": 2.35, "grad_norm": 5.410069874648939, "learning_rate": 2.4326979644058053e-06, "loss": 0.1055, "step": 29160 }, { "epoch": 2.36, "grad_norm": 4.629335999961238, "learning_rate": 2.4210804388326863e-06, "loss": 0.1279, "step": 29180 }, { "epoch": 2.36, "grad_norm": 6.075402127994961, "learning_rate": 2.4094869210276644e-06, "loss": 0.1282, "step": 29200 }, { "epoch": 2.36, "grad_norm": 8.077203635348965, "learning_rate": 2.397917447710712e-06, "loss": 0.1361, "step": 29220 }, { "epoch": 2.36, "grad_norm": 6.904012326173178, "learning_rate": 2.3863720555256586e-06, "loss": 0.1447, "step": 29240 }, { "epoch": 2.36, "grad_norm": 7.51187719216677, "learning_rate": 2.374850781040049e-06, "loss": 0.1198, "step": 29260 }, { "epoch": 2.36, "grad_norm": 8.843888028958192, "learning_rate": 2.3633536607450533e-06, "loss": 0.1078, "step": 29280 }, { "epoch": 2.37, "grad_norm": 4.724450312535336, "learning_rate": 2.3518807310553265e-06, "loss": 0.1312, "step": 29300 }, { "epoch": 2.37, "grad_norm": 3.901823297415003, "learning_rate": 2.3404320283089157e-06, "loss": 0.1236, "step": 29320 }, { "epoch": 2.37, "grad_norm": 4.94887940794774, "learning_rate": 2.3290075887671235e-06, "loss": 0.1214, "step": 29340 }, { "epoch": 2.37, "grad_norm": 7.183863507713724, "learning_rate": 2.3176074486144144e-06, "loss": 0.1096, "step": 29360 }, { "epoch": 2.37, "grad_norm": 6.719091788181481, "learning_rate": 2.306231643958282e-06, "loss": 0.1443, "step": 29380 }, { "epoch": 2.37, "grad_norm": 7.79866499602431, "learning_rate": 2.294880210829149e-06, "loss": 0.14, "step": 29400 }, { "epoch": 2.37, "grad_norm": 8.13986395794833, "learning_rate": 2.2835531851802426e-06, "loss": 0.1228, "step": 29420 }, { "epoch": 2.38, "grad_norm": 8.596704925219258, "learning_rate": 2.272250602887482e-06, "loss": 0.129, "step": 29440 }, { "epoch": 2.38, "grad_norm": 5.557622421227328, "learning_rate": 2.260972499749375e-06, "loss": 0.12, "step": 29460 }, { "epoch": 2.38, "grad_norm": 7.505791526088736, "learning_rate": 2.249718911486888e-06, "loss": 0.1222, "step": 29480 }, { "epoch": 2.38, "grad_norm": 3.236146156292273, "learning_rate": 2.2384898737433487e-06, "loss": 0.1134, "step": 29500 }, { "epoch": 2.38, "grad_norm": 7.749403407606816, "learning_rate": 2.227285422084322e-06, "loss": 0.1386, "step": 29520 }, { "epoch": 2.38, "grad_norm": 6.096181843986543, "learning_rate": 2.216105591997505e-06, "loss": 0.1073, "step": 29540 }, { "epoch": 2.39, "grad_norm": 6.994357423423653, "learning_rate": 2.204950418892607e-06, "loss": 0.1043, "step": 29560 }, { "epoch": 2.39, "grad_norm": 6.513300858530218, "learning_rate": 2.1938199381012448e-06, "loss": 0.1386, "step": 29580 }, { "epoch": 2.39, "grad_norm": 5.7220499492173165, "learning_rate": 2.1827141848768325e-06, "loss": 0.1225, "step": 29600 }, { "epoch": 2.39, "grad_norm": 8.694779892998747, "learning_rate": 2.1716331943944514e-06, "loss": 0.1251, "step": 29620 }, { "epoch": 2.39, "grad_norm": 2.990108235674315, "learning_rate": 2.1605770017507677e-06, "loss": 0.1299, "step": 29640 }, { "epoch": 2.39, "grad_norm": 6.089718154874702, "learning_rate": 2.1495456419638907e-06, "loss": 0.1412, "step": 29660 }, { "epoch": 2.4, "grad_norm": 5.762346695124691, "learning_rate": 2.1385391499732912e-06, "loss": 0.1278, "step": 29680 }, { "epoch": 2.4, "grad_norm": 7.010790274094211, "learning_rate": 2.127557560639665e-06, "loss": 0.1355, "step": 29700 }, { "epoch": 2.4, "grad_norm": 7.968815582603903, "learning_rate": 2.116600908744843e-06, "loss": 0.1278, "step": 29720 }, { "epoch": 2.4, "grad_norm": 6.934804031694609, "learning_rate": 2.105669228991664e-06, "loss": 0.1293, "step": 29740 }, { "epoch": 2.4, "grad_norm": 7.504583102851711, "learning_rate": 2.0947625560038785e-06, "loss": 0.1472, "step": 29760 }, { "epoch": 2.4, "grad_norm": 7.014433557829672, "learning_rate": 2.083880924326036e-06, "loss": 0.1134, "step": 29780 }, { "epoch": 2.41, "grad_norm": 4.792121445455869, "learning_rate": 2.0730243684233625e-06, "loss": 0.123, "step": 29800 }, { "epoch": 2.41, "grad_norm": 6.473022229582405, "learning_rate": 2.062192922681671e-06, "loss": 0.1183, "step": 29820 }, { "epoch": 2.41, "grad_norm": 7.8085710335306135, "learning_rate": 2.0513866214072425e-06, "loss": 0.1307, "step": 29840 }, { "epoch": 2.41, "grad_norm": 7.207969906268875, "learning_rate": 2.040605498826717e-06, "loss": 0.1242, "step": 29860 }, { "epoch": 2.41, "grad_norm": 4.522824787815607, "learning_rate": 2.0298495890869816e-06, "loss": 0.1183, "step": 29880 }, { "epoch": 2.41, "grad_norm": 5.9260976426535255, "learning_rate": 2.019118926255076e-06, "loss": 0.1224, "step": 29900 }, { "epoch": 2.42, "grad_norm": 8.095499799542615, "learning_rate": 2.008413544318068e-06, "loss": 0.1353, "step": 29920 }, { "epoch": 2.42, "grad_norm": 5.524789086368143, "learning_rate": 1.9977334771829614e-06, "loss": 0.1123, "step": 29940 }, { "epoch": 2.42, "grad_norm": 8.547211969905513, "learning_rate": 1.987078758676569e-06, "loss": 0.1143, "step": 29960 }, { "epoch": 2.42, "grad_norm": 5.226656652038737, "learning_rate": 1.9764494225454297e-06, "loss": 0.1193, "step": 29980 }, { "epoch": 2.42, "grad_norm": 6.434813596312415, "learning_rate": 1.9658455024556855e-06, "loss": 0.1276, "step": 30000 }, { "epoch": 2.42, "grad_norm": 4.3200453939525385, "learning_rate": 1.9552670319929717e-06, "loss": 0.1212, "step": 30020 }, { "epoch": 2.42, "grad_norm": 6.559567913618923, "learning_rate": 1.9447140446623284e-06, "loss": 0.1428, "step": 30040 }, { "epoch": 2.43, "grad_norm": 9.381816369635184, "learning_rate": 1.934186573888071e-06, "loss": 0.1327, "step": 30060 }, { "epoch": 2.43, "grad_norm": 14.290288356383638, "learning_rate": 1.923684653013711e-06, "loss": 0.1326, "step": 30080 }, { "epoch": 2.43, "grad_norm": 4.427293672239725, "learning_rate": 1.9132083153018223e-06, "loss": 0.0952, "step": 30100 }, { "epoch": 2.43, "grad_norm": 6.127755304416192, "learning_rate": 1.9027575939339587e-06, "loss": 0.1248, "step": 30120 }, { "epoch": 2.43, "grad_norm": 6.310289354657356, "learning_rate": 1.8923325220105349e-06, "loss": 0.1143, "step": 30140 }, { "epoch": 2.43, "grad_norm": 5.992058266411168, "learning_rate": 1.8819331325507292e-06, "loss": 0.1131, "step": 30160 }, { "epoch": 2.44, "grad_norm": 5.062852823654787, "learning_rate": 1.8715594584923772e-06, "loss": 0.1371, "step": 30180 }, { "epoch": 2.44, "grad_norm": 2.280657233995462, "learning_rate": 1.8612115326918611e-06, "loss": 0.1325, "step": 30200 }, { "epoch": 2.44, "grad_norm": 7.160906080742632, "learning_rate": 1.8508893879240192e-06, "loss": 0.1111, "step": 30220 }, { "epoch": 2.44, "grad_norm": 6.772265974235008, "learning_rate": 1.8405930568820237e-06, "loss": 0.1459, "step": 30240 }, { "epoch": 2.44, "grad_norm": 8.638939249832973, "learning_rate": 1.830322572177297e-06, "loss": 0.1158, "step": 30260 }, { "epoch": 2.44, "grad_norm": 8.806223697230502, "learning_rate": 1.8200779663393922e-06, "loss": 0.1181, "step": 30280 }, { "epoch": 2.45, "grad_norm": 4.033502703386666, "learning_rate": 1.8098592718159038e-06, "loss": 0.1165, "step": 30300 }, { "epoch": 2.45, "grad_norm": 9.444128676976248, "learning_rate": 1.7996665209723468e-06, "loss": 0.1148, "step": 30320 }, { "epoch": 2.45, "grad_norm": 7.415313197162458, "learning_rate": 1.7894997460920762e-06, "loss": 0.1176, "step": 30340 }, { "epoch": 2.45, "grad_norm": 7.176890097029754, "learning_rate": 1.7793589793761682e-06, "loss": 0.1329, "step": 30360 }, { "epoch": 2.45, "grad_norm": 3.9279082196733843, "learning_rate": 1.7692442529433225e-06, "loss": 0.1178, "step": 30380 }, { "epoch": 2.45, "grad_norm": 9.816455214014374, "learning_rate": 1.7591555988297663e-06, "loss": 0.1214, "step": 30400 }, { "epoch": 2.46, "grad_norm": 8.912967758423415, "learning_rate": 1.7490930489891434e-06, "loss": 0.133, "step": 30420 }, { "epoch": 2.46, "grad_norm": 6.811867220827171, "learning_rate": 1.739056635292423e-06, "loss": 0.1151, "step": 30440 }, { "epoch": 2.46, "grad_norm": 7.111981192976948, "learning_rate": 1.7290463895277887e-06, "loss": 0.1134, "step": 30460 }, { "epoch": 2.46, "grad_norm": 7.4620075517021665, "learning_rate": 1.719062343400548e-06, "loss": 0.1316, "step": 30480 }, { "epoch": 2.46, "grad_norm": 4.684146953424328, "learning_rate": 1.709104528533019e-06, "loss": 0.1305, "step": 30500 }, { "epoch": 2.46, "grad_norm": 6.327017830903305, "learning_rate": 1.6991729764644476e-06, "loss": 0.1399, "step": 30520 }, { "epoch": 2.47, "grad_norm": 6.740197021593106, "learning_rate": 1.6892677186508898e-06, "loss": 0.1111, "step": 30540 }, { "epoch": 2.47, "grad_norm": 8.0439618513283, "learning_rate": 1.6793887864651246e-06, "loss": 0.1243, "step": 30560 }, { "epoch": 2.47, "grad_norm": 5.085685290105466, "learning_rate": 1.6695362111965515e-06, "loss": 0.1384, "step": 30580 }, { "epoch": 2.47, "grad_norm": 7.875454359722385, "learning_rate": 1.6597100240510838e-06, "loss": 0.1323, "step": 30600 }, { "epoch": 2.47, "grad_norm": 5.333635361384662, "learning_rate": 1.6499102561510626e-06, "loss": 0.1202, "step": 30620 }, { "epoch": 2.47, "grad_norm": 4.867868929901582, "learning_rate": 1.6401369385351469e-06, "loss": 0.133, "step": 30640 }, { "epoch": 2.47, "grad_norm": 6.472647999014053, "learning_rate": 1.6303901021582264e-06, "loss": 0.1408, "step": 30660 }, { "epoch": 2.48, "grad_norm": 7.7675142810740585, "learning_rate": 1.620669777891306e-06, "loss": 0.1047, "step": 30680 }, { "epoch": 2.48, "grad_norm": 4.887028843641001, "learning_rate": 1.6109759965214319e-06, "loss": 0.1191, "step": 30700 }, { "epoch": 2.48, "grad_norm": 8.456861195605512, "learning_rate": 1.6013087887515735e-06, "loss": 0.1326, "step": 30720 }, { "epoch": 2.48, "grad_norm": 7.070870359552745, "learning_rate": 1.5916681852005378e-06, "loss": 0.1193, "step": 30740 }, { "epoch": 2.48, "grad_norm": 5.292783704387575, "learning_rate": 1.582054216402868e-06, "loss": 0.1312, "step": 30760 }, { "epoch": 2.48, "grad_norm": 4.631588547413747, "learning_rate": 1.572466912808742e-06, "loss": 0.1187, "step": 30780 }, { "epoch": 2.49, "grad_norm": 6.784570399650408, "learning_rate": 1.5629063047838916e-06, "loss": 0.1187, "step": 30800 }, { "epoch": 2.49, "grad_norm": 9.266122737079046, "learning_rate": 1.5533724226094857e-06, "loss": 0.1468, "step": 30820 }, { "epoch": 2.49, "grad_norm": 7.742854115631847, "learning_rate": 1.5438652964820526e-06, "loss": 0.1311, "step": 30840 }, { "epoch": 2.49, "grad_norm": 8.735765923043441, "learning_rate": 1.53438495651337e-06, "loss": 0.116, "step": 30860 }, { "epoch": 2.49, "grad_norm": 6.79139084831237, "learning_rate": 1.524931432730385e-06, "loss": 0.1184, "step": 30880 }, { "epoch": 2.49, "grad_norm": 6.034957773839951, "learning_rate": 1.5155047550750996e-06, "loss": 0.1323, "step": 30900 }, { "epoch": 2.5, "grad_norm": 6.008839136315207, "learning_rate": 1.5061049534044933e-06, "loss": 0.1271, "step": 30920 }, { "epoch": 2.5, "grad_norm": 5.507681943820568, "learning_rate": 1.496732057490423e-06, "loss": 0.1491, "step": 30940 }, { "epoch": 2.5, "grad_norm": 5.606945308575729, "learning_rate": 1.4873860970195197e-06, "loss": 0.1337, "step": 30960 }, { "epoch": 2.5, "grad_norm": 5.121597463130114, "learning_rate": 1.4780671015931124e-06, "loss": 0.138, "step": 30980 }, { "epoch": 2.5, "grad_norm": 6.576887363160123, "learning_rate": 1.4687751007271133e-06, "loss": 0.127, "step": 31000 }, { "epoch": 2.5, "grad_norm": 6.181317564129292, "learning_rate": 1.4595101238519446e-06, "loss": 0.125, "step": 31020 }, { "epoch": 2.51, "grad_norm": 4.639778034228489, "learning_rate": 1.4502722003124294e-06, "loss": 0.122, "step": 31040 }, { "epoch": 2.51, "grad_norm": 5.264822969566869, "learning_rate": 1.4410613593677095e-06, "loss": 0.1325, "step": 31060 }, { "epoch": 2.51, "grad_norm": 6.874620461824829, "learning_rate": 1.4318776301911433e-06, "loss": 0.114, "step": 31080 }, { "epoch": 2.51, "grad_norm": 6.441698205368128, "learning_rate": 1.4227210418702227e-06, "loss": 0.1249, "step": 31100 }, { "epoch": 2.51, "grad_norm": 4.740170195859825, "learning_rate": 1.4135916234064764e-06, "loss": 0.1271, "step": 31120 }, { "epoch": 2.51, "grad_norm": 5.096712339274361, "learning_rate": 1.4044894037153778e-06, "loss": 0.1239, "step": 31140 }, { "epoch": 2.52, "grad_norm": 3.9579478159775507, "learning_rate": 1.395414411626256e-06, "loss": 0.1444, "step": 31160 }, { "epoch": 2.52, "grad_norm": 7.130158547969986, "learning_rate": 1.3863666758821955e-06, "loss": 0.1319, "step": 31180 }, { "epoch": 2.52, "grad_norm": 7.798534147977922, "learning_rate": 1.3773462251399627e-06, "loss": 0.1336, "step": 31200 }, { "epoch": 2.52, "grad_norm": 5.667338088406301, "learning_rate": 1.3683530879698928e-06, "loss": 0.1287, "step": 31220 }, { "epoch": 2.52, "grad_norm": 7.830630573395859, "learning_rate": 1.3593872928558236e-06, "loss": 0.1109, "step": 31240 }, { "epoch": 2.52, "grad_norm": 5.645347583702913, "learning_rate": 1.3504488681949792e-06, "loss": 0.1157, "step": 31260 }, { "epoch": 2.52, "grad_norm": 5.457235541373335, "learning_rate": 1.3415378422979085e-06, "loss": 0.1053, "step": 31280 }, { "epoch": 2.53, "grad_norm": 6.735796643374742, "learning_rate": 1.3326542433883683e-06, "loss": 0.1283, "step": 31300 }, { "epoch": 2.53, "grad_norm": 10.785574667612503, "learning_rate": 1.3237980996032517e-06, "loss": 0.1268, "step": 31320 }, { "epoch": 2.53, "grad_norm": 4.838773077887822, "learning_rate": 1.3149694389924962e-06, "loss": 0.1237, "step": 31340 }, { "epoch": 2.53, "grad_norm": 10.437919197116221, "learning_rate": 1.306168289518985e-06, "loss": 0.1446, "step": 31360 }, { "epoch": 2.53, "grad_norm": 5.243182160432772, "learning_rate": 1.2973946790584738e-06, "loss": 0.1292, "step": 31380 }, { "epoch": 2.53, "grad_norm": 5.252728615986934, "learning_rate": 1.288648635399485e-06, "loss": 0.1278, "step": 31400 }, { "epoch": 2.54, "grad_norm": 6.405933204819527, "learning_rate": 1.279930186243242e-06, "loss": 0.1543, "step": 31420 }, { "epoch": 2.54, "grad_norm": 4.189951392645205, "learning_rate": 1.2712393592035532e-06, "loss": 0.116, "step": 31440 }, { "epoch": 2.54, "grad_norm": 5.816130458014046, "learning_rate": 1.2625761818067537e-06, "loss": 0.1159, "step": 31460 }, { "epoch": 2.54, "grad_norm": 5.249036064665044, "learning_rate": 1.2539406814915936e-06, "loss": 0.1222, "step": 31480 }, { "epoch": 2.54, "grad_norm": 6.908118362057733, "learning_rate": 1.245332885609168e-06, "loss": 0.09, "step": 31500 }, { "epoch": 2.54, "grad_norm": 6.29663115755768, "learning_rate": 1.2367528214228266e-06, "loss": 0.1069, "step": 31520 }, { "epoch": 2.55, "grad_norm": 7.868646389837377, "learning_rate": 1.2282005161080743e-06, "loss": 0.1311, "step": 31540 }, { "epoch": 2.55, "grad_norm": 5.394022968055218, "learning_rate": 1.2196759967525055e-06, "loss": 0.119, "step": 31560 }, { "epoch": 2.55, "grad_norm": 7.543821866405037, "learning_rate": 1.2111792903557062e-06, "loss": 0.1291, "step": 31580 }, { "epoch": 2.55, "grad_norm": 4.893091914140241, "learning_rate": 1.2027104238291709e-06, "loss": 0.1155, "step": 31600 }, { "epoch": 2.55, "grad_norm": 6.241902102557373, "learning_rate": 1.1942694239962118e-06, "loss": 0.1346, "step": 31620 }, { "epoch": 2.55, "grad_norm": 3.289932679528205, "learning_rate": 1.1858563175918899e-06, "loss": 0.1113, "step": 31640 }, { "epoch": 2.56, "grad_norm": 7.794821795664855, "learning_rate": 1.1774711312629082e-06, "loss": 0.1403, "step": 31660 }, { "epoch": 2.56, "grad_norm": 5.785990652921361, "learning_rate": 1.1691138915675485e-06, "loss": 0.1268, "step": 31680 }, { "epoch": 2.56, "grad_norm": 5.403410413891596, "learning_rate": 1.16078462497557e-06, "loss": 0.1232, "step": 31700 }, { "epoch": 2.56, "grad_norm": 8.347697834916307, "learning_rate": 1.1524833578681385e-06, "loss": 0.1425, "step": 31720 }, { "epoch": 2.56, "grad_norm": 4.453693554370925, "learning_rate": 1.1442101165377383e-06, "loss": 0.1421, "step": 31740 }, { "epoch": 2.56, "grad_norm": 6.738466935847338, "learning_rate": 1.1359649271880802e-06, "loss": 0.1196, "step": 31760 }, { "epoch": 2.57, "grad_norm": 10.80798563127106, "learning_rate": 1.1277478159340374e-06, "loss": 0.1245, "step": 31780 }, { "epoch": 2.57, "grad_norm": 3.961371824991924, "learning_rate": 1.1195588088015385e-06, "loss": 0.1258, "step": 31800 }, { "epoch": 2.57, "grad_norm": 7.695673869211814, "learning_rate": 1.1113979317275137e-06, "loss": 0.0902, "step": 31820 }, { "epoch": 2.57, "grad_norm": 6.261727888132758, "learning_rate": 1.1032652105597828e-06, "loss": 0.1345, "step": 31840 }, { "epoch": 2.57, "grad_norm": 5.69396571599717, "learning_rate": 1.0951606710569984e-06, "loss": 0.1105, "step": 31860 }, { "epoch": 2.57, "grad_norm": 4.818599056348715, "learning_rate": 1.087084338888547e-06, "loss": 0.1314, "step": 31880 }, { "epoch": 2.58, "grad_norm": 4.8622912521467825, "learning_rate": 1.079036239634477e-06, "loss": 0.1279, "step": 31900 }, { "epoch": 2.58, "grad_norm": 10.968936755554902, "learning_rate": 1.0710163987854195e-06, "loss": 0.1107, "step": 31920 }, { "epoch": 2.58, "grad_norm": 3.9512346293830167, "learning_rate": 1.063024841742493e-06, "loss": 0.117, "step": 31940 }, { "epoch": 2.58, "grad_norm": 4.5457619684470165, "learning_rate": 1.0550615938172445e-06, "loss": 0.1263, "step": 31960 }, { "epoch": 2.58, "grad_norm": 9.36688206471384, "learning_rate": 1.0471266802315478e-06, "loss": 0.1525, "step": 31980 }, { "epoch": 2.58, "grad_norm": 5.375309247723985, "learning_rate": 1.0392201261175393e-06, "loss": 0.1275, "step": 32000 }, { "epoch": 2.58, "grad_norm": 7.1181471143173525, "learning_rate": 1.0313419565175328e-06, "loss": 0.1129, "step": 32020 }, { "epoch": 2.59, "grad_norm": 9.45812150042991, "learning_rate": 1.0234921963839421e-06, "loss": 0.1228, "step": 32040 }, { "epoch": 2.59, "grad_norm": 14.020876441802383, "learning_rate": 1.015670870579192e-06, "loss": 0.1339, "step": 32060 }, { "epoch": 2.59, "grad_norm": 4.674658517044487, "learning_rate": 1.0078780038756562e-06, "loss": 0.1208, "step": 32080 }, { "epoch": 2.59, "grad_norm": 6.321401834623801, "learning_rate": 1.0001136209555692e-06, "loss": 0.1055, "step": 32100 }, { "epoch": 2.59, "grad_norm": 6.325753075922191, "learning_rate": 9.923777464109414e-07, "loss": 0.1386, "step": 32120 }, { "epoch": 2.59, "grad_norm": 7.348910981549994, "learning_rate": 9.84670404743501e-07, "loss": 0.1237, "step": 32140 }, { "epoch": 2.6, "grad_norm": 7.112253935031915, "learning_rate": 9.769916203645922e-07, "loss": 0.1157, "step": 32160 }, { "epoch": 2.6, "grad_norm": 12.78737115133185, "learning_rate": 9.693414175951228e-07, "loss": 0.1477, "step": 32180 }, { "epoch": 2.6, "grad_norm": 4.6369345019438635, "learning_rate": 9.617198206654616e-07, "loss": 0.1149, "step": 32200 }, { "epoch": 2.6, "grad_norm": 5.264226620233943, "learning_rate": 9.54126853715385e-07, "loss": 0.1134, "step": 32220 }, { "epoch": 2.6, "grad_norm": 5.914163767503201, "learning_rate": 9.465625407939831e-07, "loss": 0.1229, "step": 32240 }, { "epoch": 2.6, "grad_norm": 6.7503348373439795, "learning_rate": 9.390269058595956e-07, "loss": 0.1254, "step": 32260 }, { "epoch": 2.61, "grad_norm": 7.892316399010486, "learning_rate": 9.31519972779728e-07, "loss": 0.1328, "step": 32280 }, { "epoch": 2.61, "grad_norm": 6.117153201731752, "learning_rate": 9.24041765330978e-07, "loss": 0.1207, "step": 32300 }, { "epoch": 2.61, "grad_norm": 7.983692475949779, "learning_rate": 9.16592307198966e-07, "loss": 0.1244, "step": 32320 }, { "epoch": 2.61, "grad_norm": 3.955244860490069, "learning_rate": 9.09171621978246e-07, "loss": 0.1235, "step": 32340 }, { "epoch": 2.61, "grad_norm": 6.203165279997527, "learning_rate": 9.017797331722508e-07, "loss": 0.1136, "step": 32360 }, { "epoch": 2.61, "grad_norm": 8.554472118435926, "learning_rate": 8.944166641931964e-07, "loss": 0.1395, "step": 32380 }, { "epoch": 2.62, "grad_norm": 7.069745423563985, "learning_rate": 8.870824383620293e-07, "loss": 0.1463, "step": 32400 }, { "epoch": 2.62, "grad_norm": 4.888268046020604, "learning_rate": 8.79777078908329e-07, "loss": 0.1139, "step": 32420 }, { "epoch": 2.62, "grad_norm": 9.201663356182284, "learning_rate": 8.72500608970257e-07, "loss": 0.1362, "step": 32440 }, { "epoch": 2.62, "grad_norm": 7.810213776811871, "learning_rate": 8.652530515944686e-07, "loss": 0.1346, "step": 32460 }, { "epoch": 2.62, "grad_norm": 7.097375691001481, "learning_rate": 8.580344297360476e-07, "loss": 0.1294, "step": 32480 }, { "epoch": 2.62, "grad_norm": 6.684772256011875, "learning_rate": 8.5084476625843e-07, "loss": 0.1026, "step": 32500 }, { "epoch": 2.63, "grad_norm": 5.1644648916007885, "learning_rate": 8.436840839333276e-07, "loss": 0.1127, "step": 32520 }, { "epoch": 2.63, "grad_norm": 4.437527508717552, "learning_rate": 8.365524054406686e-07, "loss": 0.1083, "step": 32540 }, { "epoch": 2.63, "grad_norm": 6.123491373917768, "learning_rate": 8.294497533685112e-07, "loss": 0.119, "step": 32560 }, { "epoch": 2.63, "grad_norm": 8.449303628792341, "learning_rate": 8.22376150212982e-07, "loss": 0.139, "step": 32580 }, { "epoch": 2.63, "grad_norm": 7.346546436071408, "learning_rate": 8.153316183781996e-07, "loss": 0.1433, "step": 32600 }, { "epoch": 2.63, "grad_norm": 5.912180027761677, "learning_rate": 8.083161801762093e-07, "loss": 0.1059, "step": 32620 }, { "epoch": 2.63, "grad_norm": 3.9692106204923294, "learning_rate": 8.01329857826901e-07, "loss": 0.1121, "step": 32640 }, { "epoch": 2.64, "grad_norm": 8.800207777171327, "learning_rate": 7.943726734579543e-07, "loss": 0.1255, "step": 32660 }, { "epoch": 2.64, "grad_norm": 8.29049546550003, "learning_rate": 7.87444649104758e-07, "loss": 0.1439, "step": 32680 }, { "epoch": 2.64, "grad_norm": 9.026854399058228, "learning_rate": 7.805458067103398e-07, "loss": 0.1216, "step": 32700 }, { "epoch": 2.64, "grad_norm": 3.893497004487122, "learning_rate": 7.736761681253032e-07, "loss": 0.0998, "step": 32720 }, { "epoch": 2.64, "grad_norm": 4.278050473151302, "learning_rate": 7.668357551077522e-07, "loss": 0.128, "step": 32740 }, { "epoch": 2.64, "grad_norm": 7.00599686014735, "learning_rate": 7.600245893232301e-07, "loss": 0.1261, "step": 32760 }, { "epoch": 2.65, "grad_norm": 4.65856773500985, "learning_rate": 7.532426923446387e-07, "loss": 0.1163, "step": 32780 }, { "epoch": 2.65, "grad_norm": 6.619050719550238, "learning_rate": 7.464900856521835e-07, "loss": 0.1424, "step": 32800 }, { "epoch": 2.65, "grad_norm": 1.5085381851749664, "learning_rate": 7.397667906332945e-07, "loss": 0.1137, "step": 32820 }, { "epoch": 2.65, "grad_norm": 7.756973328778995, "learning_rate": 7.330728285825665e-07, "loss": 0.1296, "step": 32840 }, { "epoch": 2.65, "grad_norm": 4.908229355717314, "learning_rate": 7.26408220701689e-07, "loss": 0.1155, "step": 32860 }, { "epoch": 2.65, "grad_norm": 7.794041915847408, "learning_rate": 7.197729880993726e-07, "loss": 0.1328, "step": 32880 }, { "epoch": 2.66, "grad_norm": 4.871517013111337, "learning_rate": 7.131671517912967e-07, "loss": 0.1097, "step": 32900 }, { "epoch": 2.66, "grad_norm": 9.45506178506386, "learning_rate": 7.065907327000274e-07, "loss": 0.1292, "step": 32920 }, { "epoch": 2.66, "grad_norm": 7.168635814924726, "learning_rate": 7.000437516549624e-07, "loss": 0.1255, "step": 32940 }, { "epoch": 2.66, "grad_norm": 5.582074645015892, "learning_rate": 6.935262293922558e-07, "loss": 0.1203, "step": 32960 }, { "epoch": 2.66, "grad_norm": 4.451450822901556, "learning_rate": 6.870381865547634e-07, "loss": 0.0995, "step": 32980 }, { "epoch": 2.66, "grad_norm": 5.805931071051967, "learning_rate": 6.805796436919659e-07, "loss": 0.1264, "step": 33000 }, { "epoch": 2.67, "grad_norm": 4.495906027181732, "learning_rate": 6.741506212599129e-07, "loss": 0.1418, "step": 33020 }, { "epoch": 2.67, "grad_norm": 5.808603272625852, "learning_rate": 6.677511396211512e-07, "loss": 0.1256, "step": 33040 }, { "epoch": 2.67, "grad_norm": 6.626750211029159, "learning_rate": 6.613812190446662e-07, "loss": 0.112, "step": 33060 }, { "epoch": 2.67, "grad_norm": 8.567619185674758, "learning_rate": 6.550408797058146e-07, "loss": 0.1366, "step": 33080 }, { "epoch": 2.67, "grad_norm": 6.01916716685692, "learning_rate": 6.487301416862581e-07, "loss": 0.1215, "step": 33100 }, { "epoch": 2.67, "grad_norm": 8.962014889323987, "learning_rate": 6.424490249739089e-07, "loss": 0.1345, "step": 33120 }, { "epoch": 2.68, "grad_norm": 7.757310680128591, "learning_rate": 6.361975494628508e-07, "loss": 0.1175, "step": 33140 }, { "epoch": 2.68, "grad_norm": 3.060716849489914, "learning_rate": 6.299757349532947e-07, "loss": 0.1133, "step": 33160 }, { "epoch": 2.68, "grad_norm": 4.468002363499739, "learning_rate": 6.237836011514977e-07, "loss": 0.1077, "step": 33180 }, { "epoch": 2.68, "grad_norm": 7.185375662627942, "learning_rate": 6.176211676697205e-07, "loss": 0.1222, "step": 33200 }, { "epoch": 2.68, "grad_norm": 8.84170562515041, "learning_rate": 6.114884540261428e-07, "loss": 0.1218, "step": 33220 }, { "epoch": 2.68, "grad_norm": 5.538710051650554, "learning_rate": 6.053854796448217e-07, "loss": 0.1435, "step": 33240 }, { "epoch": 2.68, "grad_norm": 6.78666253025451, "learning_rate": 5.993122638556182e-07, "loss": 0.14, "step": 33260 }, { "epoch": 2.69, "grad_norm": 2.643639373268369, "learning_rate": 5.932688258941388e-07, "loss": 0.1263, "step": 33280 }, { "epoch": 2.69, "grad_norm": 6.149870224490609, "learning_rate": 5.872551849016762e-07, "loss": 0.1216, "step": 33300 }, { "epoch": 2.69, "grad_norm": 6.94433119957035, "learning_rate": 5.81271359925147e-07, "loss": 0.1172, "step": 33320 }, { "epoch": 2.69, "grad_norm": 4.4867487881661425, "learning_rate": 5.753173699170339e-07, "loss": 0.1148, "step": 33340 }, { "epoch": 2.69, "grad_norm": 11.906934983781838, "learning_rate": 5.693932337353199e-07, "loss": 0.1493, "step": 33360 }, { "epoch": 2.69, "grad_norm": 4.795102267164223, "learning_rate": 5.63498970143438e-07, "loss": 0.1227, "step": 33380 }, { "epoch": 2.7, "grad_norm": 6.840906676047454, "learning_rate": 5.576345978101983e-07, "loss": 0.1264, "step": 33400 }, { "epoch": 2.7, "grad_norm": 3.1992782195995098, "learning_rate": 5.51800135309745e-07, "loss": 0.1169, "step": 33420 }, { "epoch": 2.7, "grad_norm": 7.471822577483658, "learning_rate": 5.459956011214832e-07, "loss": 0.1231, "step": 33440 }, { "epoch": 2.7, "grad_norm": 8.847765762291347, "learning_rate": 5.4022101363003e-07, "loss": 0.1324, "step": 33460 }, { "epoch": 2.7, "grad_norm": 6.353919085906283, "learning_rate": 5.344763911251533e-07, "loss": 0.1177, "step": 33480 }, { "epoch": 2.7, "grad_norm": 6.427326348242822, "learning_rate": 5.287617518017097e-07, "loss": 0.1214, "step": 33500 }, { "epoch": 2.71, "grad_norm": 5.588241336807433, "learning_rate": 5.230771137595928e-07, "loss": 0.134, "step": 33520 }, { "epoch": 2.71, "grad_norm": 6.541581301122035, "learning_rate": 5.174224950036734e-07, "loss": 0.1271, "step": 33540 }, { "epoch": 2.71, "grad_norm": 7.835651730986799, "learning_rate": 5.117979134437436e-07, "loss": 0.1251, "step": 33560 }, { "epoch": 2.71, "grad_norm": 6.586428659028256, "learning_rate": 5.062033868944559e-07, "loss": 0.1384, "step": 33580 }, { "epoch": 2.71, "grad_norm": 6.65254903712902, "learning_rate": 5.006389330752748e-07, "loss": 0.1279, "step": 33600 }, { "epoch": 2.71, "grad_norm": 3.2552717441740513, "learning_rate": 4.951045696104105e-07, "loss": 0.129, "step": 33620 }, { "epoch": 2.72, "grad_norm": 8.672024562336388, "learning_rate": 4.896003140287725e-07, "loss": 0.1319, "step": 33640 }, { "epoch": 2.72, "grad_norm": 7.009024388749127, "learning_rate": 4.841261837639114e-07, "loss": 0.1225, "step": 33660 }, { "epoch": 2.72, "grad_norm": 4.9749594221904365, "learning_rate": 4.786821961539554e-07, "loss": 0.135, "step": 33680 }, { "epoch": 2.72, "grad_norm": 3.4497092082994723, "learning_rate": 4.732683684415701e-07, "loss": 0.1195, "step": 33700 }, { "epoch": 2.72, "grad_norm": 5.716793204965511, "learning_rate": 4.6788471777389074e-07, "loss": 0.1057, "step": 33720 }, { "epoch": 2.72, "grad_norm": 5.895058337206874, "learning_rate": 4.625312612024753e-07, "loss": 0.1245, "step": 33740 }, { "epoch": 2.73, "grad_norm": 8.364038382355943, "learning_rate": 4.572080156832478e-07, "loss": 0.131, "step": 33760 }, { "epoch": 2.73, "grad_norm": 5.976339737474339, "learning_rate": 4.519149980764483e-07, "loss": 0.1053, "step": 33780 }, { "epoch": 2.73, "grad_norm": 6.840809647676377, "learning_rate": 4.4665222514657086e-07, "loss": 0.1416, "step": 33800 }, { "epoch": 2.73, "grad_norm": 8.911435407961696, "learning_rate": 4.414197135623202e-07, "loss": 0.1273, "step": 33820 }, { "epoch": 2.73, "grad_norm": 7.144991751920346, "learning_rate": 4.3621747989655514e-07, "loss": 0.1287, "step": 33840 }, { "epoch": 2.73, "grad_norm": 7.063737338892995, "learning_rate": 4.310455406262318e-07, "loss": 0.1322, "step": 33860 }, { "epoch": 2.73, "grad_norm": 4.395930983677614, "learning_rate": 4.259039121323593e-07, "loss": 0.1266, "step": 33880 }, { "epoch": 2.74, "grad_norm": 6.400149236369628, "learning_rate": 4.207926106999389e-07, "loss": 0.1298, "step": 33900 }, { "epoch": 2.74, "grad_norm": 7.045897287010474, "learning_rate": 4.15711652517926e-07, "loss": 0.1109, "step": 33920 }, { "epoch": 2.74, "grad_norm": 3.5434837766081912, "learning_rate": 4.1066105367916013e-07, "loss": 0.1156, "step": 33940 }, { "epoch": 2.74, "grad_norm": 5.427032919544017, "learning_rate": 4.0564083018033095e-07, "loss": 0.1203, "step": 33960 }, { "epoch": 2.74, "grad_norm": 4.383982782636372, "learning_rate": 4.006509979219189e-07, "loss": 0.1159, "step": 33980 }, { "epoch": 2.74, "grad_norm": 6.240901580626192, "learning_rate": 3.956915727081458e-07, "loss": 0.1133, "step": 34000 }, { "epoch": 2.75, "grad_norm": 4.661521436665521, "learning_rate": 3.907625702469276e-07, "loss": 0.1218, "step": 34020 }, { "epoch": 2.75, "grad_norm": 7.8539517994706465, "learning_rate": 3.8586400614981944e-07, "loss": 0.1314, "step": 34040 }, { "epoch": 2.75, "grad_norm": 5.8003317507871435, "learning_rate": 3.8099589593197523e-07, "loss": 0.1214, "step": 34060 }, { "epoch": 2.75, "grad_norm": 6.436400384411093, "learning_rate": 3.761582550120848e-07, "loss": 0.131, "step": 34080 }, { "epoch": 2.75, "grad_norm": 7.906832345617252, "learning_rate": 3.7135109871234027e-07, "loss": 0.117, "step": 34100 }, { "epoch": 2.75, "grad_norm": 6.917382842830597, "learning_rate": 3.665744422583762e-07, "loss": 0.1048, "step": 34120 }, { "epoch": 2.76, "grad_norm": 5.02214581863741, "learning_rate": 3.6182830077922635e-07, "loss": 0.1022, "step": 34140 }, { "epoch": 2.76, "grad_norm": 2.7231931663248137, "learning_rate": 3.5711268930727487e-07, "loss": 0.1011, "step": 34160 }, { "epoch": 2.76, "grad_norm": 4.680945123948578, "learning_rate": 3.5242762277820955e-07, "loss": 0.1196, "step": 34180 }, { "epoch": 2.76, "grad_norm": 5.299826910683322, "learning_rate": 3.477731160309729e-07, "loss": 0.1233, "step": 34200 }, { "epoch": 2.76, "grad_norm": 5.990753360302409, "learning_rate": 3.4314918380771585e-07, "loss": 0.143, "step": 34220 }, { "epoch": 2.76, "grad_norm": 8.32297859280258, "learning_rate": 3.3855584075375184e-07, "loss": 0.1049, "step": 34240 }, { "epoch": 2.77, "grad_norm": 6.493463310543985, "learning_rate": 3.339931014175062e-07, "loss": 0.1208, "step": 34260 }, { "epoch": 2.77, "grad_norm": 8.535264408639643, "learning_rate": 3.294609802504803e-07, "loss": 0.1449, "step": 34280 }, { "epoch": 2.77, "grad_norm": 9.463293536636538, "learning_rate": 3.2495949160719146e-07, "loss": 0.1341, "step": 34300 }, { "epoch": 2.77, "grad_norm": 12.26352614407162, "learning_rate": 3.204886497451413e-07, "loss": 0.1271, "step": 34320 }, { "epoch": 2.77, "grad_norm": 13.159598533744711, "learning_rate": 3.160484688247596e-07, "loss": 0.0999, "step": 34340 }, { "epoch": 2.77, "grad_norm": 6.504637604338118, "learning_rate": 3.1163896290936843e-07, "loss": 0.1315, "step": 34360 }, { "epoch": 2.78, "grad_norm": 6.898436670530103, "learning_rate": 3.0726014596513015e-07, "loss": 0.1387, "step": 34380 }, { "epoch": 2.78, "grad_norm": 6.931111736655661, "learning_rate": 3.0291203186100604e-07, "loss": 0.1333, "step": 34400 }, { "epoch": 2.78, "grad_norm": 3.5685122435783176, "learning_rate": 2.9859463436871694e-07, "loss": 0.139, "step": 34420 }, { "epoch": 2.78, "grad_norm": 6.157402434170446, "learning_rate": 2.9430796716269023e-07, "loss": 0.1241, "step": 34440 }, { "epoch": 2.78, "grad_norm": 6.230741814157244, "learning_rate": 2.9005204382002645e-07, "loss": 0.1227, "step": 34460 }, { "epoch": 2.78, "grad_norm": 4.179842880070398, "learning_rate": 2.858268778204459e-07, "loss": 0.1523, "step": 34480 }, { "epoch": 2.78, "grad_norm": 8.152978701465065, "learning_rate": 2.816324825462589e-07, "loss": 0.1236, "step": 34500 }, { "epoch": 2.79, "grad_norm": 7.081448412654981, "learning_rate": 2.7746887128230797e-07, "loss": 0.1388, "step": 34520 }, { "epoch": 2.79, "grad_norm": 9.811309159054707, "learning_rate": 2.733360572159421e-07, "loss": 0.1496, "step": 34540 }, { "epoch": 2.79, "grad_norm": 4.316826460739263, "learning_rate": 2.6923405343695917e-07, "loss": 0.1235, "step": 34560 }, { "epoch": 2.79, "grad_norm": 4.537044424510889, "learning_rate": 2.651628729375774e-07, "loss": 0.1222, "step": 34580 }, { "epoch": 2.79, "grad_norm": 6.43738948393834, "learning_rate": 2.6112252861238805e-07, "loss": 0.1325, "step": 34600 }, { "epoch": 2.79, "grad_norm": 4.7672870676851185, "learning_rate": 2.571130332583106e-07, "loss": 0.1181, "step": 34620 }, { "epoch": 2.8, "grad_norm": 6.84589397692235, "learning_rate": 2.5313439957456665e-07, "loss": 0.1281, "step": 34640 }, { "epoch": 2.8, "grad_norm": 6.228317492046972, "learning_rate": 2.491866401626193e-07, "loss": 0.114, "step": 34660 }, { "epoch": 2.8, "grad_norm": 5.837530896515532, "learning_rate": 2.452697675261519e-07, "loss": 0.13, "step": 34680 }, { "epoch": 2.8, "grad_norm": 5.862731785858869, "learning_rate": 2.4138379407101673e-07, "loss": 0.1025, "step": 34700 }, { "epoch": 2.8, "grad_norm": 4.790427780322071, "learning_rate": 2.3752873210520103e-07, "loss": 0.1323, "step": 34720 }, { "epoch": 2.8, "grad_norm": 8.260268323479504, "learning_rate": 2.3370459383878448e-07, "loss": 0.1243, "step": 34740 }, { "epoch": 2.81, "grad_norm": 5.565164335143073, "learning_rate": 2.2991139138390613e-07, "loss": 0.1332, "step": 34760 }, { "epoch": 2.81, "grad_norm": 8.448461123857298, "learning_rate": 2.261491367547198e-07, "loss": 0.1361, "step": 34780 }, { "epoch": 2.81, "grad_norm": 3.87846146076151, "learning_rate": 2.2241784186735862e-07, "loss": 0.1081, "step": 34800 }, { "epoch": 2.81, "grad_norm": 7.16672039751123, "learning_rate": 2.1871751853989963e-07, "loss": 0.1472, "step": 34820 }, { "epoch": 2.81, "grad_norm": 5.60534693356312, "learning_rate": 2.1504817849232026e-07, "loss": 0.1156, "step": 34840 }, { "epoch": 2.81, "grad_norm": 3.337052384552651, "learning_rate": 2.1140983334647082e-07, "loss": 0.1199, "step": 34860 }, { "epoch": 2.82, "grad_norm": 4.2136706256661265, "learning_rate": 2.0780249462602656e-07, "loss": 0.1315, "step": 34880 }, { "epoch": 2.82, "grad_norm": 10.193194523779548, "learning_rate": 2.042261737564589e-07, "loss": 0.1186, "step": 34900 }, { "epoch": 2.82, "grad_norm": 6.210252025852846, "learning_rate": 2.0068088206499553e-07, "loss": 0.1217, "step": 34920 }, { "epoch": 2.82, "grad_norm": 5.909774778047476, "learning_rate": 1.9716663078058802e-07, "loss": 0.1183, "step": 34940 }, { "epoch": 2.82, "grad_norm": 5.424779284509604, "learning_rate": 1.9368343103387214e-07, "loss": 0.1312, "step": 34960 }, { "epoch": 2.82, "grad_norm": 7.1959892228084525, "learning_rate": 1.9023129385713642e-07, "loss": 0.1334, "step": 34980 }, { "epoch": 2.83, "grad_norm": 7.107631997872955, "learning_rate": 1.868102301842837e-07, "loss": 0.1405, "step": 35000 }, { "epoch": 2.83, "grad_norm": 6.6902155131251595, "learning_rate": 1.8342025085079856e-07, "loss": 0.1304, "step": 35020 }, { "epoch": 2.83, "grad_norm": 6.016665836802643, "learning_rate": 1.800613665937143e-07, "loss": 0.1065, "step": 35040 }, { "epoch": 2.83, "grad_norm": 4.36999246941448, "learning_rate": 1.767335880515717e-07, "loss": 0.1106, "step": 35060 }, { "epoch": 2.83, "grad_norm": 10.142374084759215, "learning_rate": 1.7343692576440018e-07, "loss": 0.1283, "step": 35080 }, { "epoch": 2.83, "grad_norm": 6.058321741959696, "learning_rate": 1.7017139017366685e-07, "loss": 0.1449, "step": 35100 }, { "epoch": 2.83, "grad_norm": 6.673556613953213, "learning_rate": 1.6693699162225637e-07, "loss": 0.1209, "step": 35120 }, { "epoch": 2.84, "grad_norm": 5.899325716960789, "learning_rate": 1.637337403544288e-07, "loss": 0.1015, "step": 35140 }, { "epoch": 2.84, "grad_norm": 4.531652814126784, "learning_rate": 1.605616465157986e-07, "loss": 0.1244, "step": 35160 }, { "epoch": 2.84, "grad_norm": 6.297185771836459, "learning_rate": 1.5742072015329005e-07, "loss": 0.1209, "step": 35180 }, { "epoch": 2.84, "grad_norm": 8.519773036458979, "learning_rate": 1.5431097121511313e-07, "loss": 0.1206, "step": 35200 }, { "epoch": 2.84, "grad_norm": 3.498019339209741, "learning_rate": 1.5123240955073208e-07, "loss": 0.1356, "step": 35220 }, { "epoch": 2.84, "grad_norm": 5.409886895370653, "learning_rate": 1.4818504491082678e-07, "loss": 0.1291, "step": 35240 }, { "epoch": 2.85, "grad_norm": 4.691147262704599, "learning_rate": 1.451688869472727e-07, "loss": 0.1118, "step": 35260 }, { "epoch": 2.85, "grad_norm": 4.329237371509222, "learning_rate": 1.421839452131032e-07, "loss": 0.1207, "step": 35280 }, { "epoch": 2.85, "grad_norm": 7.4109464152710975, "learning_rate": 1.392302291624794e-07, "loss": 0.1254, "step": 35300 }, { "epoch": 2.85, "grad_norm": 9.46641103730691, "learning_rate": 1.363077481506626e-07, "loss": 0.1268, "step": 35320 }, { "epoch": 2.85, "grad_norm": 17.777256629510763, "learning_rate": 1.334165114339843e-07, "loss": 0.1187, "step": 35340 }, { "epoch": 2.85, "grad_norm": 6.672583084589652, "learning_rate": 1.3055652816981725e-07, "loss": 0.1344, "step": 35360 }, { "epoch": 2.86, "grad_norm": 7.46361259716019, "learning_rate": 1.2772780741654212e-07, "loss": 0.122, "step": 35380 }, { "epoch": 2.86, "grad_norm": 7.962744510555444, "learning_rate": 1.249303581335288e-07, "loss": 0.1482, "step": 35400 }, { "epoch": 2.86, "grad_norm": 4.639312866896684, "learning_rate": 1.2216418918109295e-07, "loss": 0.1118, "step": 35420 }, { "epoch": 2.86, "grad_norm": 6.374488361247182, "learning_rate": 1.1942930932048377e-07, "loss": 0.1237, "step": 35440 }, { "epoch": 2.86, "grad_norm": 11.720032481906914, "learning_rate": 1.167257272138442e-07, "loss": 0.148, "step": 35460 }, { "epoch": 2.86, "grad_norm": 5.531369137962614, "learning_rate": 1.1405345142419078e-07, "loss": 0.1061, "step": 35480 }, { "epoch": 2.87, "grad_norm": 5.2051559929095745, "learning_rate": 1.1141249041538487e-07, "loss": 0.1201, "step": 35500 }, { "epoch": 2.87, "grad_norm": 5.370136100706602, "learning_rate": 1.0880285255210267e-07, "loss": 0.1233, "step": 35520 }, { "epoch": 2.87, "grad_norm": 7.384789645144088, "learning_rate": 1.062245460998107e-07, "loss": 0.1277, "step": 35540 }, { "epoch": 2.87, "grad_norm": 5.724044933363986, "learning_rate": 1.0367757922474377e-07, "loss": 0.1179, "step": 35560 }, { "epoch": 2.87, "grad_norm": 3.4770635008127306, "learning_rate": 1.0116195999387046e-07, "loss": 0.101, "step": 35580 }, { "epoch": 2.87, "grad_norm": 6.895352564837291, "learning_rate": 9.86776963748753e-08, "loss": 0.1301, "step": 35600 }, { "epoch": 2.88, "grad_norm": 6.436488396131478, "learning_rate": 9.622479623613118e-08, "loss": 0.1209, "step": 35620 }, { "epoch": 2.88, "grad_norm": 8.539520837710704, "learning_rate": 9.380326734667035e-08, "loss": 0.1391, "step": 35640 }, { "epoch": 2.88, "grad_norm": 3.645902680237058, "learning_rate": 9.141311737616782e-08, "loss": 0.1338, "step": 35660 }, { "epoch": 2.88, "grad_norm": 3.4959808160879655, "learning_rate": 8.905435389490916e-08, "loss": 0.1223, "step": 35680 }, { "epoch": 2.88, "grad_norm": 8.457614460981622, "learning_rate": 8.672698437377388e-08, "loss": 0.1184, "step": 35700 }, { "epoch": 2.88, "grad_norm": 7.860398333990087, "learning_rate": 8.443101618420206e-08, "loss": 0.15, "step": 35720 }, { "epoch": 2.89, "grad_norm": 4.612908925690368, "learning_rate": 8.216645659818111e-08, "loss": 0.123, "step": 35740 }, { "epoch": 2.89, "grad_norm": 5.531335862349993, "learning_rate": 7.9933312788219e-08, "loss": 0.1188, "step": 35760 }, { "epoch": 2.89, "grad_norm": 7.509422486877788, "learning_rate": 7.773159182731779e-08, "loss": 0.1188, "step": 35780 }, { "epoch": 2.89, "grad_norm": 5.207845659948716, "learning_rate": 7.556130068895684e-08, "loss": 0.1118, "step": 35800 }, { "epoch": 2.89, "grad_norm": 5.7390201059173025, "learning_rate": 7.342244624706626e-08, "loss": 0.1101, "step": 35820 }, { "epoch": 2.89, "grad_norm": 11.666954545601891, "learning_rate": 7.131503527601015e-08, "loss": 0.1285, "step": 35840 }, { "epoch": 2.89, "grad_norm": 7.3196376298105275, "learning_rate": 6.923907445055786e-08, "loss": 0.1322, "step": 35860 }, { "epoch": 2.9, "grad_norm": 4.918596417017865, "learning_rate": 6.719457034587279e-08, "loss": 0.1232, "step": 35880 }, { "epoch": 2.9, "grad_norm": 3.959972858528604, "learning_rate": 6.518152943748027e-08, "loss": 0.1063, "step": 35900 }, { "epoch": 2.9, "grad_norm": 2.397152833117293, "learning_rate": 6.31999581012564e-08, "loss": 0.0965, "step": 35920 }, { "epoch": 2.9, "grad_norm": 5.681186399533992, "learning_rate": 6.124986261340256e-08, "loss": 0.1264, "step": 35940 }, { "epoch": 2.9, "grad_norm": 7.548889270450938, "learning_rate": 5.933124915042872e-08, "loss": 0.1369, "step": 35960 }, { "epoch": 2.9, "grad_norm": 8.380126295207777, "learning_rate": 5.744412378913017e-08, "loss": 0.1386, "step": 35980 }, { "epoch": 2.91, "grad_norm": 5.782865391396359, "learning_rate": 5.558849250657085e-08, "loss": 0.1222, "step": 36000 }, { "epoch": 2.91, "grad_norm": 4.153375305486208, "learning_rate": 5.376436118006558e-08, "loss": 0.1358, "step": 36020 }, { "epoch": 2.91, "grad_norm": 6.065191556290861, "learning_rate": 5.197173558715897e-08, "loss": 0.1126, "step": 36040 }, { "epoch": 2.91, "grad_norm": 8.79953932856826, "learning_rate": 5.021062140560655e-08, "loss": 0.1371, "step": 36060 }, { "epoch": 2.91, "grad_norm": 8.306374494252564, "learning_rate": 4.848102421336036e-08, "loss": 0.1119, "step": 36080 }, { "epoch": 2.91, "grad_norm": 7.727234659764161, "learning_rate": 4.67829494885478e-08, "loss": 0.1344, "step": 36100 }, { "epoch": 2.92, "grad_norm": 8.737522929629472, "learning_rate": 4.511640260945618e-08, "loss": 0.1287, "step": 36120 }, { "epoch": 2.92, "grad_norm": 5.6267056740106245, "learning_rate": 4.3481388854514844e-08, "loss": 0.1358, "step": 36140 }, { "epoch": 2.92, "grad_norm": 6.566053899585036, "learning_rate": 4.1877913402279745e-08, "loss": 0.1152, "step": 36160 }, { "epoch": 2.92, "grad_norm": 7.0622715195688475, "learning_rate": 4.030598133141448e-08, "loss": 0.1289, "step": 36180 }, { "epoch": 2.92, "grad_norm": 4.972699784704855, "learning_rate": 3.876559762067592e-08, "loss": 0.1305, "step": 36200 }, { "epoch": 2.92, "grad_norm": 5.014587294996044, "learning_rate": 3.725676714889973e-08, "loss": 0.1171, "step": 36220 }, { "epoch": 2.93, "grad_norm": 6.830982858472428, "learning_rate": 3.577949469498266e-08, "loss": 0.122, "step": 36240 }, { "epoch": 2.93, "grad_norm": 7.525834158354324, "learning_rate": 3.433378493786696e-08, "loss": 0.1361, "step": 36260 }, { "epoch": 2.93, "grad_norm": 4.280769488246129, "learning_rate": 3.2919642456529296e-08, "loss": 0.1063, "step": 36280 }, { "epoch": 2.93, "grad_norm": 6.3496033672303165, "learning_rate": 3.153707172996076e-08, "loss": 0.1245, "step": 36300 }, { "epoch": 2.93, "grad_norm": 7.793461421221783, "learning_rate": 3.018607713715799e-08, "loss": 0.1275, "step": 36320 }, { "epoch": 2.93, "grad_norm": 6.56059513602374, "learning_rate": 2.8866662957107676e-08, "loss": 0.1263, "step": 36340 }, { "epoch": 2.94, "grad_norm": 5.638965309681728, "learning_rate": 2.7578833368769814e-08, "loss": 0.1179, "step": 36360 }, { "epoch": 2.94, "grad_norm": 4.373394916922692, "learning_rate": 2.6322592451068903e-08, "loss": 0.1308, "step": 36380 }, { "epoch": 2.94, "grad_norm": 4.544552554021107, "learning_rate": 2.509794418287615e-08, "loss": 0.1077, "step": 36400 }, { "epoch": 2.94, "grad_norm": 4.809466309947855, "learning_rate": 2.3904892443005044e-08, "loss": 0.1235, "step": 36420 }, { "epoch": 2.94, "grad_norm": 8.925168366540419, "learning_rate": 2.2743441010190245e-08, "loss": 0.1077, "step": 36440 }, { "epoch": 2.94, "grad_norm": 3.6068744667487773, "learning_rate": 2.1613593563077622e-08, "loss": 0.1163, "step": 36460 }, { "epoch": 2.94, "grad_norm": 6.33321976141573, "learning_rate": 2.0515353680218668e-08, "loss": 0.1416, "step": 36480 }, { "epoch": 2.95, "grad_norm": 5.509014891195261, "learning_rate": 1.9448724840052763e-08, "loss": 0.1, "step": 36500 }, { "epoch": 2.95, "grad_norm": 1.4435415604636703, "learning_rate": 1.8413710420894955e-08, "loss": 0.1467, "step": 36520 }, { "epoch": 2.95, "grad_norm": 6.837090583552706, "learning_rate": 1.7410313700933742e-08, "loss": 0.1193, "step": 36540 }, { "epoch": 2.95, "grad_norm": 10.98917619333693, "learning_rate": 1.6438537858213306e-08, "loss": 0.1204, "step": 36560 }, { "epoch": 2.95, "grad_norm": 7.2427408116065655, "learning_rate": 1.549838597062241e-08, "loss": 0.1235, "step": 36580 }, { "epoch": 2.95, "grad_norm": 9.665591169138377, "learning_rate": 1.4589861015893297e-08, "loss": 0.128, "step": 36600 }, { "epoch": 2.96, "grad_norm": 6.059247274956065, "learning_rate": 1.3712965871581705e-08, "loss": 0.1335, "step": 36620 }, { "epoch": 2.96, "grad_norm": 6.0881631916846635, "learning_rate": 1.2867703315064636e-08, "loss": 0.1518, "step": 36640 }, { "epoch": 2.96, "grad_norm": 5.282203362544957, "learning_rate": 1.2054076023531496e-08, "loss": 0.0985, "step": 36660 }, { "epoch": 2.96, "grad_norm": 7.347100714977006, "learning_rate": 1.1272086573970748e-08, "loss": 0.117, "step": 36680 }, { "epoch": 2.96, "grad_norm": 4.938935469550873, "learning_rate": 1.0521737443166604e-08, "loss": 0.1021, "step": 36700 }, { "epoch": 2.96, "grad_norm": 6.385136642248586, "learning_rate": 9.803031007687918e-09, "loss": 0.137, "step": 36720 }, { "epoch": 2.97, "grad_norm": 5.224078536116553, "learning_rate": 9.115969543884837e-09, "loss": 0.121, "step": 36740 }, { "epoch": 2.97, "grad_norm": 6.548833367504612, "learning_rate": 8.460555227875514e-09, "loss": 0.1193, "step": 36760 }, { "epoch": 2.97, "grad_norm": 7.489873534740506, "learning_rate": 7.836790135544969e-09, "loss": 0.1314, "step": 36780 }, { "epoch": 2.97, "grad_norm": 10.381538628706075, "learning_rate": 7.244676242533998e-09, "loss": 0.1447, "step": 36800 }, { "epoch": 2.97, "grad_norm": 5.100756734916774, "learning_rate": 6.684215424238067e-09, "loss": 0.1316, "step": 36820 }, { "epoch": 2.97, "grad_norm": 7.706831777227298, "learning_rate": 6.155409455796202e-09, "loss": 0.095, "step": 36840 }, { "epoch": 2.98, "grad_norm": 6.937329539361333, "learning_rate": 5.658260012086559e-09, "loss": 0.12, "step": 36860 }, { "epoch": 2.98, "grad_norm": 7.4927831896270565, "learning_rate": 5.192768667723085e-09, "loss": 0.1258, "step": 36880 }, { "epoch": 2.98, "grad_norm": 6.263280263327071, "learning_rate": 4.75893689705219e-09, "loss": 0.1275, "step": 36900 }, { "epoch": 2.98, "grad_norm": 8.70846552649615, "learning_rate": 4.356766074139427e-09, "loss": 0.1263, "step": 36920 }, { "epoch": 2.98, "grad_norm": 6.562570335374569, "learning_rate": 3.986257472777263e-09, "loss": 0.1388, "step": 36940 }, { "epoch": 2.98, "grad_norm": 4.5248888286612265, "learning_rate": 3.647412266471757e-09, "loss": 0.113, "step": 36960 }, { "epoch": 2.99, "grad_norm": 4.624438483715926, "learning_rate": 3.340231528441448e-09, "loss": 0.1267, "step": 36980 }, { "epoch": 2.99, "grad_norm": 4.769546846181503, "learning_rate": 3.064716231616248e-09, "loss": 0.1058, "step": 37000 }, { "epoch": 2.99, "grad_norm": 7.257785431613275, "learning_rate": 2.82086724863078e-09, "loss": 0.1077, "step": 37020 }, { "epoch": 2.99, "grad_norm": 5.537462786340638, "learning_rate": 2.6086853518243772e-09, "loss": 0.1274, "step": 37040 }, { "epoch": 2.99, "grad_norm": 4.591382305296377, "learning_rate": 2.428171213238863e-09, "loss": 0.1288, "step": 37060 }, { "epoch": 2.99, "grad_norm": 3.1613016021607647, "learning_rate": 2.2793254046130027e-09, "loss": 0.1174, "step": 37080 }, { "epoch": 2.99, "grad_norm": 5.955705859215525, "learning_rate": 2.1621483973836107e-09, "loss": 0.1072, "step": 37100 }, { "epoch": 3.0, "grad_norm": 7.465708093192746, "learning_rate": 2.076640562685552e-09, "loss": 0.1386, "step": 37120 }, { "epoch": 3.0, "grad_norm": 5.477348499788448, "learning_rate": 2.0228021713439704e-09, "loss": 0.1072, "step": 37140 }, { "epoch": 3.0, "grad_norm": 7.907133151097777, "learning_rate": 2.0006333938820607e-09, "loss": 0.1396, "step": 37160 } ], "logging_steps": 20, "max_steps": 37164, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 12, "trial_name": null, "trial_params": null }