{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0090206445623877, "eval_steps": 500, "global_step": 12500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 7.78010667114874, "learning_rate": 4.0236686390532546e-07, "loss": 0.5346, "step": 20 }, { "epoch": 0.0, "grad_norm": 6.216957639101055, "learning_rate": 6.153846153846155e-07, "loss": 0.5043, "step": 40 }, { "epoch": 0.0, "grad_norm": 7.990357548396736, "learning_rate": 8.284023668639055e-07, "loss": 0.5017, "step": 60 }, { "epoch": 0.01, "grad_norm": 8.12688107356609, "learning_rate": 1.0414201183431955e-06, "loss": 0.4952, "step": 80 }, { "epoch": 0.01, "grad_norm": 6.530843475685683, "learning_rate": 1.2544378698224851e-06, "loss": 0.5621, "step": 100 }, { "epoch": 0.01, "grad_norm": 6.308467672405027, "learning_rate": 1.4674556213017752e-06, "loss": 0.4549, "step": 120 }, { "epoch": 0.01, "grad_norm": 7.243052870190241, "learning_rate": 1.6804733727810652e-06, "loss": 0.4466, "step": 140 }, { "epoch": 0.01, "grad_norm": 10.219726515841495, "learning_rate": 1.8934911242603552e-06, "loss": 0.3893, "step": 160 }, { "epoch": 0.01, "grad_norm": 6.627649317339657, "learning_rate": 2.106508875739645e-06, "loss": 0.4179, "step": 180 }, { "epoch": 0.02, "grad_norm": 6.44244224679364, "learning_rate": 2.319526627218935e-06, "loss": 0.4226, "step": 200 }, { "epoch": 0.02, "grad_norm": 6.612950721768246, "learning_rate": 2.532544378698225e-06, "loss": 0.3795, "step": 220 }, { "epoch": 0.02, "grad_norm": 6.283517912051673, "learning_rate": 2.7455621301775153e-06, "loss": 0.4276, "step": 240 }, { "epoch": 0.02, "grad_norm": 7.268987062349035, "learning_rate": 2.958579881656805e-06, "loss": 0.3619, "step": 260 }, { "epoch": 0.02, "grad_norm": 8.340583800596072, "learning_rate": 3.171597633136095e-06, "loss": 0.4244, "step": 280 }, { "epoch": 0.02, "grad_norm": 6.75600646477272, "learning_rate": 3.384615384615385e-06, "loss": 0.3852, "step": 300 }, { "epoch": 0.03, "grad_norm": 5.647054711391784, "learning_rate": 3.597633136094675e-06, "loss": 0.3809, "step": 320 }, { "epoch": 0.03, "grad_norm": 7.253045067435066, "learning_rate": 3.8106508875739652e-06, "loss": 0.3858, "step": 340 }, { "epoch": 0.03, "grad_norm": 7.301184351749545, "learning_rate": 4.023668639053255e-06, "loss": 0.3549, "step": 360 }, { "epoch": 0.03, "grad_norm": 7.59195138486003, "learning_rate": 4.236686390532545e-06, "loss": 0.4048, "step": 380 }, { "epoch": 0.03, "grad_norm": 10.124611929532549, "learning_rate": 4.449704142011835e-06, "loss": 0.3646, "step": 400 }, { "epoch": 0.03, "grad_norm": 7.744526068197853, "learning_rate": 4.662721893491124e-06, "loss": 0.3677, "step": 420 }, { "epoch": 0.04, "grad_norm": 8.246611853098463, "learning_rate": 4.875739644970415e-06, "loss": 0.3573, "step": 440 }, { "epoch": 0.04, "grad_norm": 7.675393597081337, "learning_rate": 5.088757396449705e-06, "loss": 0.3714, "step": 460 }, { "epoch": 0.04, "grad_norm": 7.407374558199348, "learning_rate": 5.301775147928995e-06, "loss": 0.3762, "step": 480 }, { "epoch": 0.04, "grad_norm": 7.5336943019480875, "learning_rate": 5.514792899408284e-06, "loss": 0.3423, "step": 500 }, { "epoch": 0.04, "grad_norm": 6.789944607897793, "learning_rate": 5.727810650887574e-06, "loss": 0.3382, "step": 520 }, { "epoch": 0.04, "grad_norm": 6.2048712513712765, "learning_rate": 5.940828402366864e-06, "loss": 0.355, "step": 540 }, { "epoch": 0.05, "grad_norm": 7.7676635768181255, "learning_rate": 6.153846153846153e-06, "loss": 0.3481, "step": 560 }, { "epoch": 0.05, "grad_norm": 5.33865756273462, "learning_rate": 6.366863905325444e-06, "loss": 0.3486, "step": 580 }, { "epoch": 0.05, "grad_norm": 7.215416340807466, "learning_rate": 6.579881656804735e-06, "loss": 0.3378, "step": 600 }, { "epoch": 0.05, "grad_norm": 6.563753135562715, "learning_rate": 6.792899408284025e-06, "loss": 0.3126, "step": 620 }, { "epoch": 0.05, "grad_norm": 8.45042187241426, "learning_rate": 7.005917159763315e-06, "loss": 0.3231, "step": 640 }, { "epoch": 0.05, "grad_norm": 6.734275587016377, "learning_rate": 7.218934911242604e-06, "loss": 0.3533, "step": 660 }, { "epoch": 0.05, "grad_norm": 6.417837093997314, "learning_rate": 7.431952662721894e-06, "loss": 0.3335, "step": 680 }, { "epoch": 0.06, "grad_norm": 8.615382928114506, "learning_rate": 7.644970414201183e-06, "loss": 0.3495, "step": 700 }, { "epoch": 0.06, "grad_norm": 6.6395133490495395, "learning_rate": 7.857988165680473e-06, "loss": 0.3371, "step": 720 }, { "epoch": 0.06, "grad_norm": 9.710125890008053, "learning_rate": 8.071005917159764e-06, "loss": 0.3619, "step": 740 }, { "epoch": 0.06, "grad_norm": 8.21056721557438, "learning_rate": 8.284023668639054e-06, "loss": 0.3479, "step": 760 }, { "epoch": 0.06, "grad_norm": 7.0850591314913185, "learning_rate": 8.497041420118344e-06, "loss": 0.3217, "step": 780 }, { "epoch": 0.06, "grad_norm": 8.709374421841343, "learning_rate": 8.710059171597634e-06, "loss": 0.345, "step": 800 }, { "epoch": 0.07, "grad_norm": 8.703998514988717, "learning_rate": 8.923076923076925e-06, "loss": 0.3544, "step": 820 }, { "epoch": 0.07, "grad_norm": 8.233699532008588, "learning_rate": 9.136094674556215e-06, "loss": 0.3449, "step": 840 }, { "epoch": 0.07, "grad_norm": 6.360611479052244, "learning_rate": 9.349112426035503e-06, "loss": 0.3709, "step": 860 }, { "epoch": 0.07, "grad_norm": 5.21422760141616, "learning_rate": 9.562130177514794e-06, "loss": 0.3031, "step": 880 }, { "epoch": 0.07, "grad_norm": 10.285280127672143, "learning_rate": 9.775147928994084e-06, "loss": 0.314, "step": 900 }, { "epoch": 0.07, "grad_norm": 8.561551897783339, "learning_rate": 9.988165680473372e-06, "loss": 0.34, "step": 920 }, { "epoch": 0.08, "grad_norm": 8.337797520489195, "learning_rate": 1.0201183431952664e-05, "loss": 0.3324, "step": 940 }, { "epoch": 0.08, "grad_norm": 9.531967986532953, "learning_rate": 1.0414201183431953e-05, "loss": 0.359, "step": 960 }, { "epoch": 0.08, "grad_norm": 6.913190435381454, "learning_rate": 1.0627218934911243e-05, "loss": 0.3715, "step": 980 }, { "epoch": 0.08, "grad_norm": 8.525404719704843, "learning_rate": 1.0840236686390533e-05, "loss": 0.2926, "step": 1000 }, { "epoch": 0.08, "grad_norm": 8.186212103501571, "learning_rate": 1.1053254437869825e-05, "loss": 0.351, "step": 1020 }, { "epoch": 0.08, "grad_norm": 5.968895396114415, "learning_rate": 1.1266272189349114e-05, "loss": 0.3325, "step": 1040 }, { "epoch": 0.09, "grad_norm": 9.069989076248202, "learning_rate": 1.1479289940828404e-05, "loss": 0.3075, "step": 1060 }, { "epoch": 0.09, "grad_norm": 6.617321242745247, "learning_rate": 1.1692307692307694e-05, "loss": 0.3321, "step": 1080 }, { "epoch": 0.09, "grad_norm": 8.096724387107496, "learning_rate": 1.1905325443786983e-05, "loss": 0.3508, "step": 1100 }, { "epoch": 0.09, "grad_norm": 7.972357677530315, "learning_rate": 1.2118343195266273e-05, "loss": 0.3031, "step": 1120 }, { "epoch": 0.09, "grad_norm": 7.82798416711515, "learning_rate": 1.2331360946745563e-05, "loss": 0.2912, "step": 1140 }, { "epoch": 0.09, "grad_norm": 10.732472169183794, "learning_rate": 1.2544378698224854e-05, "loss": 0.2956, "step": 1160 }, { "epoch": 0.1, "grad_norm": 7.4415223600685625, "learning_rate": 1.2757396449704142e-05, "loss": 0.3234, "step": 1180 }, { "epoch": 0.1, "grad_norm": 7.006979596197168, "learning_rate": 1.2970414201183432e-05, "loss": 0.3271, "step": 1200 }, { "epoch": 0.1, "grad_norm": 8.00254256852378, "learning_rate": 1.3183431952662723e-05, "loss": 0.3165, "step": 1220 }, { "epoch": 0.1, "grad_norm": 14.742499561979336, "learning_rate": 1.3396449704142011e-05, "loss": 0.3015, "step": 1240 }, { "epoch": 0.1, "grad_norm": 7.14191343340385, "learning_rate": 1.3609467455621301e-05, "loss": 0.3273, "step": 1260 }, { "epoch": 0.1, "grad_norm": 5.720523746760943, "learning_rate": 1.3822485207100593e-05, "loss": 0.3073, "step": 1280 }, { "epoch": 0.1, "grad_norm": 29.467573544915435, "learning_rate": 1.4035502958579883e-05, "loss": 0.2779, "step": 1300 }, { "epoch": 0.11, "grad_norm": 6.058319538140977, "learning_rate": 1.4248520710059172e-05, "loss": 0.3176, "step": 1320 }, { "epoch": 0.11, "grad_norm": 5.979724640901607, "learning_rate": 1.4461538461538462e-05, "loss": 0.2919, "step": 1340 }, { "epoch": 0.11, "grad_norm": 9.296561099611436, "learning_rate": 1.4674556213017754e-05, "loss": 0.3073, "step": 1360 }, { "epoch": 0.11, "grad_norm": 6.766863350601436, "learning_rate": 1.4887573964497044e-05, "loss": 0.3449, "step": 1380 }, { "epoch": 0.11, "grad_norm": 9.0738349271763, "learning_rate": 1.5100591715976333e-05, "loss": 0.3282, "step": 1400 }, { "epoch": 0.11, "grad_norm": 5.238889472577541, "learning_rate": 1.5313609467455623e-05, "loss": 0.2766, "step": 1420 }, { "epoch": 0.12, "grad_norm": 6.068739678701155, "learning_rate": 1.5526627218934912e-05, "loss": 0.3527, "step": 1440 }, { "epoch": 0.12, "grad_norm": 8.761831948666881, "learning_rate": 1.5739644970414204e-05, "loss": 0.309, "step": 1460 }, { "epoch": 0.12, "grad_norm": 5.696276254834093, "learning_rate": 1.5952662721893492e-05, "loss": 0.3334, "step": 1480 }, { "epoch": 0.12, "grad_norm": 5.698639074786848, "learning_rate": 1.616568047337278e-05, "loss": 0.306, "step": 1500 }, { "epoch": 0.12, "grad_norm": 6.244148261549131, "learning_rate": 1.6378698224852073e-05, "loss": 0.3, "step": 1520 }, { "epoch": 0.12, "grad_norm": 6.948838050368144, "learning_rate": 1.659171597633136e-05, "loss": 0.3348, "step": 1540 }, { "epoch": 0.13, "grad_norm": 6.672363835099913, "learning_rate": 1.6804733727810653e-05, "loss": 0.3534, "step": 1560 }, { "epoch": 0.13, "grad_norm": 50.29837243421749, "learning_rate": 1.7017751479289942e-05, "loss": 0.3203, "step": 1580 }, { "epoch": 0.13, "grad_norm": 7.03929850767991, "learning_rate": 1.723076923076923e-05, "loss": 0.297, "step": 1600 }, { "epoch": 0.13, "grad_norm": 7.19469430195841, "learning_rate": 1.7443786982248522e-05, "loss": 0.2978, "step": 1620 }, { "epoch": 0.13, "grad_norm": 7.055651559785636, "learning_rate": 1.765680473372781e-05, "loss": 0.3313, "step": 1640 }, { "epoch": 0.13, "grad_norm": 7.162934891578056, "learning_rate": 1.78698224852071e-05, "loss": 0.3188, "step": 1660 }, { "epoch": 0.14, "grad_norm": 7.176153099629785, "learning_rate": 1.808284023668639e-05, "loss": 0.3297, "step": 1680 }, { "epoch": 0.14, "grad_norm": 6.367781797418254, "learning_rate": 1.8295857988165683e-05, "loss": 0.3042, "step": 1700 }, { "epoch": 0.14, "grad_norm": 6.373173769997794, "learning_rate": 1.8508875739644975e-05, "loss": 0.3266, "step": 1720 }, { "epoch": 0.14, "grad_norm": 18.577261761115807, "learning_rate": 1.8721893491124264e-05, "loss": 0.2736, "step": 1740 }, { "epoch": 0.14, "grad_norm": 4.994768648489008, "learning_rate": 1.8934911242603552e-05, "loss": 0.3012, "step": 1760 }, { "epoch": 0.14, "grad_norm": 6.301807374403158, "learning_rate": 1.9147928994082844e-05, "loss": 0.3394, "step": 1780 }, { "epoch": 0.15, "grad_norm": 7.456967816650055, "learning_rate": 1.9360946745562133e-05, "loss": 0.3329, "step": 1800 }, { "epoch": 0.15, "grad_norm": 6.0971107018925625, "learning_rate": 1.957396449704142e-05, "loss": 0.3523, "step": 1820 }, { "epoch": 0.15, "grad_norm": 8.10379030061416, "learning_rate": 1.9786982248520713e-05, "loss": 0.3013, "step": 1840 }, { "epoch": 0.15, "grad_norm": 7.024454181142386, "learning_rate": 1.9999999960412883e-05, "loss": 0.2762, "step": 1860 }, { "epoch": 0.15, "grad_norm": 8.232339625552946, "learning_rate": 1.9999982542086008e-05, "loss": 0.3448, "step": 1880 }, { "epoch": 0.15, "grad_norm": 7.602098687169412, "learning_rate": 1.9999933454128334e-05, "loss": 0.3398, "step": 1900 }, { "epoch": 0.15, "grad_norm": 13.60330741158854, "learning_rate": 1.9999852696695326e-05, "loss": 0.3449, "step": 1920 }, { "epoch": 0.16, "grad_norm": 6.868480851347135, "learning_rate": 1.9999740270042764e-05, "loss": 0.3047, "step": 1940 }, { "epoch": 0.16, "grad_norm": 8.419546962701075, "learning_rate": 1.9999596174526744e-05, "loss": 0.296, "step": 1960 }, { "epoch": 0.16, "grad_norm": 8.258953437915027, "learning_rate": 1.9999420410603655e-05, "loss": 0.3145, "step": 1980 }, { "epoch": 0.16, "grad_norm": 8.591067141592344, "learning_rate": 1.9999212978830192e-05, "loss": 0.2967, "step": 2000 }, { "epoch": 0.16, "grad_norm": 7.788694443802249, "learning_rate": 1.9998973879863347e-05, "loss": 0.2922, "step": 2020 }, { "epoch": 0.16, "grad_norm": 8.894924525369044, "learning_rate": 1.999870311446042e-05, "loss": 0.2909, "step": 2040 }, { "epoch": 0.17, "grad_norm": 7.71785690762546, "learning_rate": 1.9998400683478994e-05, "loss": 0.3185, "step": 2060 }, { "epoch": 0.17, "grad_norm": 6.423565941270898, "learning_rate": 1.9998066587876964e-05, "loss": 0.331, "step": 2080 }, { "epoch": 0.17, "grad_norm": 9.504973464920754, "learning_rate": 1.9997700828712502e-05, "loss": 0.3163, "step": 2100 }, { "epoch": 0.17, "grad_norm": 5.0751162961967555, "learning_rate": 1.999730340714407e-05, "loss": 0.2935, "step": 2120 }, { "epoch": 0.17, "grad_norm": 6.788588249402219, "learning_rate": 1.9996874324430414e-05, "loss": 0.304, "step": 2140 }, { "epoch": 0.17, "grad_norm": 6.982728358658279, "learning_rate": 1.9996413581930564e-05, "loss": 0.3254, "step": 2160 }, { "epoch": 0.18, "grad_norm": 7.39883321894128, "learning_rate": 1.9995921181103827e-05, "loss": 0.3238, "step": 2180 }, { "epoch": 0.18, "grad_norm": 6.349466388539233, "learning_rate": 1.999539712350977e-05, "loss": 0.3138, "step": 2200 }, { "epoch": 0.18, "grad_norm": 8.460848680137447, "learning_rate": 1.9994841410808238e-05, "loss": 0.2951, "step": 2220 }, { "epoch": 0.18, "grad_norm": 12.228980891015102, "learning_rate": 1.999425404475933e-05, "loss": 0.313, "step": 2240 }, { "epoch": 0.18, "grad_norm": 10.128152992141587, "learning_rate": 1.99936350272234e-05, "loss": 0.3209, "step": 2260 }, { "epoch": 0.18, "grad_norm": 7.366413095980619, "learning_rate": 1.999298436016105e-05, "loss": 0.3508, "step": 2280 }, { "epoch": 0.19, "grad_norm": 5.935510884433497, "learning_rate": 1.9992302045633138e-05, "loss": 0.3087, "step": 2300 }, { "epoch": 0.19, "grad_norm": 9.917097921103624, "learning_rate": 1.9991588085800745e-05, "loss": 0.3272, "step": 2320 }, { "epoch": 0.19, "grad_norm": 8.158864972330328, "learning_rate": 1.9990842482925183e-05, "loss": 0.3097, "step": 2340 }, { "epoch": 0.19, "grad_norm": 5.660258353439845, "learning_rate": 1.999006523936799e-05, "loss": 0.3194, "step": 2360 }, { "epoch": 0.19, "grad_norm": 6.343908148236521, "learning_rate": 1.9989256357590915e-05, "loss": 0.3144, "step": 2380 }, { "epoch": 0.19, "grad_norm": 6.935522124005399, "learning_rate": 1.9988415840155925e-05, "loss": 0.316, "step": 2400 }, { "epoch": 0.2, "grad_norm": 6.118420550913593, "learning_rate": 1.9987543689725172e-05, "loss": 0.2935, "step": 2420 }, { "epoch": 0.2, "grad_norm": 5.852760915435719, "learning_rate": 1.998663990906101e-05, "loss": 0.2982, "step": 2440 }, { "epoch": 0.2, "grad_norm": 10.637858430267903, "learning_rate": 1.9985704501025967e-05, "loss": 0.3263, "step": 2460 }, { "epoch": 0.2, "grad_norm": 5.969298652078407, "learning_rate": 1.9984737468582746e-05, "loss": 0.2785, "step": 2480 }, { "epoch": 0.2, "grad_norm": 7.290609868079292, "learning_rate": 1.998373881479422e-05, "loss": 0.2902, "step": 2500 }, { "epoch": 0.2, "grad_norm": 8.419267863068479, "learning_rate": 1.9982708542823405e-05, "loss": 0.2854, "step": 2520 }, { "epoch": 0.21, "grad_norm": 8.123890492905641, "learning_rate": 1.9981646655933466e-05, "loss": 0.2981, "step": 2540 }, { "epoch": 0.21, "grad_norm": 5.10058603098674, "learning_rate": 1.998055315748771e-05, "loss": 0.2792, "step": 2560 }, { "epoch": 0.21, "grad_norm": 5.6118366078785105, "learning_rate": 1.997942805094955e-05, "loss": 0.2905, "step": 2580 }, { "epoch": 0.21, "grad_norm": 11.075016161215812, "learning_rate": 1.997827133988252e-05, "loss": 0.2902, "step": 2600 }, { "epoch": 0.21, "grad_norm": 8.887207232453743, "learning_rate": 1.997708302795026e-05, "loss": 0.3155, "step": 2620 }, { "epoch": 0.21, "grad_norm": 8.39711790141671, "learning_rate": 1.997586311891649e-05, "loss": 0.286, "step": 2640 }, { "epoch": 0.21, "grad_norm": 5.439201702560111, "learning_rate": 1.9974611616645007e-05, "loss": 0.2933, "step": 2660 }, { "epoch": 0.22, "grad_norm": 7.182735290178756, "learning_rate": 1.9973328525099675e-05, "loss": 0.3267, "step": 2680 }, { "epoch": 0.22, "grad_norm": 8.170254081594555, "learning_rate": 1.997201384834442e-05, "loss": 0.2967, "step": 2700 }, { "epoch": 0.22, "grad_norm": 6.357829888020736, "learning_rate": 1.997066759054319e-05, "loss": 0.3109, "step": 2720 }, { "epoch": 0.22, "grad_norm": 6.180030398494684, "learning_rate": 1.996928975595997e-05, "loss": 0.3054, "step": 2740 }, { "epoch": 0.22, "grad_norm": 7.812651853992933, "learning_rate": 1.996788034895875e-05, "loss": 0.2852, "step": 2760 }, { "epoch": 0.22, "grad_norm": 6.492759192826664, "learning_rate": 1.9966439374003538e-05, "loss": 0.305, "step": 2780 }, { "epoch": 0.23, "grad_norm": 11.941328303638311, "learning_rate": 1.99649668356583e-05, "loss": 0.2922, "step": 2800 }, { "epoch": 0.23, "grad_norm": 8.314661648415811, "learning_rate": 1.9963462738586993e-05, "loss": 0.3102, "step": 2820 }, { "epoch": 0.23, "grad_norm": 7.301474320450444, "learning_rate": 1.996192708755351e-05, "loss": 0.2964, "step": 2840 }, { "epoch": 0.23, "grad_norm": 8.797389284755965, "learning_rate": 1.996035988742171e-05, "loss": 0.2656, "step": 2860 }, { "epoch": 0.23, "grad_norm": 6.614687108304631, "learning_rate": 1.9958761143155357e-05, "loss": 0.2927, "step": 2880 }, { "epoch": 0.23, "grad_norm": 7.623906291629947, "learning_rate": 1.995713085981813e-05, "loss": 0.2788, "step": 2900 }, { "epoch": 0.24, "grad_norm": 5.684942655651583, "learning_rate": 1.9955469042573605e-05, "loss": 0.3051, "step": 2920 }, { "epoch": 0.24, "grad_norm": 8.632124459996572, "learning_rate": 1.9953775696685223e-05, "loss": 0.3002, "step": 2940 }, { "epoch": 0.24, "grad_norm": 4.942563514745161, "learning_rate": 1.99520508275163e-05, "loss": 0.2862, "step": 2960 }, { "epoch": 0.24, "grad_norm": 6.712590574651518, "learning_rate": 1.995029444052999e-05, "loss": 0.2938, "step": 2980 }, { "epoch": 0.24, "grad_norm": 7.09759309831908, "learning_rate": 1.9948506541289266e-05, "loss": 0.3054, "step": 3000 }, { "epoch": 0.24, "grad_norm": 9.042408105595795, "learning_rate": 1.994668713545692e-05, "loss": 0.3041, "step": 3020 }, { "epoch": 0.25, "grad_norm": 6.843804903550461, "learning_rate": 1.994483622879553e-05, "loss": 0.2958, "step": 3040 }, { "epoch": 0.25, "grad_norm": 8.109882081629157, "learning_rate": 1.9942953827167443e-05, "loss": 0.3115, "step": 3060 }, { "epoch": 0.25, "grad_norm": 6.519937602019556, "learning_rate": 1.994103993653476e-05, "loss": 0.2873, "step": 3080 }, { "epoch": 0.25, "grad_norm": 43.825640526729615, "learning_rate": 1.9939094562959324e-05, "loss": 0.3084, "step": 3100 }, { "epoch": 0.25, "grad_norm": 6.740302289754918, "learning_rate": 1.993711771260268e-05, "loss": 0.2898, "step": 3120 }, { "epoch": 0.25, "grad_norm": 8.585263920916868, "learning_rate": 1.993510939172609e-05, "loss": 0.303, "step": 3140 }, { "epoch": 0.26, "grad_norm": 7.715090235382078, "learning_rate": 1.9933069606690468e-05, "loss": 0.3102, "step": 3160 }, { "epoch": 0.26, "grad_norm": 9.780531981807941, "learning_rate": 1.99309983639564e-05, "loss": 0.3077, "step": 3180 }, { "epoch": 0.26, "grad_norm": 4.606128027451412, "learning_rate": 1.99288956700841e-05, "loss": 0.3131, "step": 3200 }, { "epoch": 0.26, "grad_norm": 6.5456142622794875, "learning_rate": 1.9926761531733403e-05, "loss": 0.2899, "step": 3220 }, { "epoch": 0.26, "grad_norm": 6.881966685047346, "learning_rate": 1.9924595955663732e-05, "loss": 0.2834, "step": 3240 }, { "epoch": 0.26, "grad_norm": 6.086009895569889, "learning_rate": 1.9922398948734088e-05, "loss": 0.2887, "step": 3260 }, { "epoch": 0.26, "grad_norm": 5.329585705771699, "learning_rate": 1.992017051790301e-05, "loss": 0.2888, "step": 3280 }, { "epoch": 0.27, "grad_norm": 9.3500534790468, "learning_rate": 1.991791067022858e-05, "loss": 0.3168, "step": 3300 }, { "epoch": 0.27, "grad_norm": 6.741688450171789, "learning_rate": 1.9915619412868387e-05, "loss": 0.2703, "step": 3320 }, { "epoch": 0.27, "grad_norm": 6.86462812934889, "learning_rate": 1.9913296753079484e-05, "loss": 0.3141, "step": 3340 }, { "epoch": 0.27, "grad_norm": 6.6699035733643495, "learning_rate": 1.9910942698218404e-05, "loss": 0.2922, "step": 3360 }, { "epoch": 0.27, "grad_norm": 8.43685481112505, "learning_rate": 1.990855725574111e-05, "loss": 0.2836, "step": 3380 }, { "epoch": 0.27, "grad_norm": 10.06638161800925, "learning_rate": 1.990614043320298e-05, "loss": 0.2949, "step": 3400 }, { "epoch": 0.28, "grad_norm": 8.362504433942911, "learning_rate": 1.9903692238258783e-05, "loss": 0.2897, "step": 3420 }, { "epoch": 0.28, "grad_norm": 6.788699791177713, "learning_rate": 1.9901212678662646e-05, "loss": 0.2907, "step": 3440 }, { "epoch": 0.28, "grad_norm": 6.40238575575375, "learning_rate": 1.989870176226804e-05, "loss": 0.2609, "step": 3460 }, { "epoch": 0.28, "grad_norm": 5.227206933131435, "learning_rate": 1.9896159497027758e-05, "loss": 0.3162, "step": 3480 }, { "epoch": 0.28, "grad_norm": 6.191089860311128, "learning_rate": 1.9893585890993877e-05, "loss": 0.2998, "step": 3500 }, { "epoch": 0.28, "grad_norm": 8.156421644955156, "learning_rate": 1.9890980952317745e-05, "loss": 0.2683, "step": 3520 }, { "epoch": 0.29, "grad_norm": 7.62639892752842, "learning_rate": 1.9888344689249945e-05, "loss": 0.3138, "step": 3540 }, { "epoch": 0.29, "grad_norm": 8.10913885283575, "learning_rate": 1.9885677110140272e-05, "loss": 0.3098, "step": 3560 }, { "epoch": 0.29, "grad_norm": 5.974197538110473, "learning_rate": 1.988297822343771e-05, "loss": 0.2879, "step": 3580 }, { "epoch": 0.29, "grad_norm": 8.13170124417466, "learning_rate": 1.9880248037690406e-05, "loss": 0.2741, "step": 3600 }, { "epoch": 0.29, "grad_norm": 5.373939941911109, "learning_rate": 1.9877486561545635e-05, "loss": 0.2818, "step": 3620 }, { "epoch": 0.29, "grad_norm": 6.876975035910139, "learning_rate": 1.9874693803749786e-05, "loss": 0.2872, "step": 3640 }, { "epoch": 0.3, "grad_norm": 11.88859663115872, "learning_rate": 1.987186977314831e-05, "loss": 0.2787, "step": 3660 }, { "epoch": 0.3, "grad_norm": 5.296482127875842, "learning_rate": 1.9869014478685726e-05, "loss": 0.3125, "step": 3680 }, { "epoch": 0.3, "grad_norm": 10.902431223896663, "learning_rate": 1.986612792940556e-05, "loss": 0.2696, "step": 3700 }, { "epoch": 0.3, "grad_norm": 7.957172435618448, "learning_rate": 1.986321013445034e-05, "loss": 0.2846, "step": 3720 }, { "epoch": 0.3, "grad_norm": 5.49530713404051, "learning_rate": 1.9860261103061555e-05, "loss": 0.2904, "step": 3740 }, { "epoch": 0.3, "grad_norm": 6.7681775640908315, "learning_rate": 1.985728084457963e-05, "loss": 0.2907, "step": 3760 }, { "epoch": 0.31, "grad_norm": 11.417291183282801, "learning_rate": 1.9854269368443898e-05, "loss": 0.3124, "step": 3780 }, { "epoch": 0.31, "grad_norm": 9.165271676007183, "learning_rate": 1.985122668419255e-05, "loss": 0.2938, "step": 3800 }, { "epoch": 0.31, "grad_norm": 9.710590629489802, "learning_rate": 1.984815280146265e-05, "loss": 0.2805, "step": 3820 }, { "epoch": 0.31, "grad_norm": 10.32416184835814, "learning_rate": 1.9845047729990052e-05, "loss": 0.2939, "step": 3840 }, { "epoch": 0.31, "grad_norm": 6.123004510419631, "learning_rate": 1.984191147960941e-05, "loss": 0.3217, "step": 3860 }, { "epoch": 0.31, "grad_norm": 8.419418288045916, "learning_rate": 1.9838744060254113e-05, "loss": 0.2466, "step": 3880 }, { "epoch": 0.31, "grad_norm": 8.941869987837809, "learning_rate": 1.9835545481956295e-05, "loss": 0.3091, "step": 3900 }, { "epoch": 0.32, "grad_norm": 6.854852736746462, "learning_rate": 1.983231575484676e-05, "loss": 0.3094, "step": 3920 }, { "epoch": 0.32, "grad_norm": 10.162127205743055, "learning_rate": 1.9829054889154978e-05, "loss": 0.2988, "step": 3940 }, { "epoch": 0.32, "grad_norm": 6.1276753090877385, "learning_rate": 1.982576289520904e-05, "loss": 0.2875, "step": 3960 }, { "epoch": 0.32, "grad_norm": 6.806977159453115, "learning_rate": 1.982243978343562e-05, "loss": 0.2943, "step": 3980 }, { "epoch": 0.32, "grad_norm": 6.9055487505442015, "learning_rate": 1.9819085564359977e-05, "loss": 0.2911, "step": 4000 }, { "epoch": 0.32, "grad_norm": 7.466453294884225, "learning_rate": 1.9815700248605875e-05, "loss": 0.2902, "step": 4020 }, { "epoch": 0.33, "grad_norm": 5.488844395318609, "learning_rate": 1.9812283846895572e-05, "loss": 0.2773, "step": 4040 }, { "epoch": 0.33, "grad_norm": 5.492586688406755, "learning_rate": 1.9808836370049786e-05, "loss": 0.2942, "step": 4060 }, { "epoch": 0.33, "grad_norm": 6.870365314571275, "learning_rate": 1.980535782898766e-05, "loss": 0.3134, "step": 4080 }, { "epoch": 0.33, "grad_norm": 6.474349542297636, "learning_rate": 1.9801848234726733e-05, "loss": 0.278, "step": 4100 }, { "epoch": 0.33, "grad_norm": 7.02153354250866, "learning_rate": 1.9798307598382887e-05, "loss": 0.3008, "step": 4120 }, { "epoch": 0.33, "grad_norm": 6.5342549251431725, "learning_rate": 1.9794735931170323e-05, "loss": 0.2588, "step": 4140 }, { "epoch": 0.34, "grad_norm": 7.235161691162515, "learning_rate": 1.9791133244401536e-05, "loss": 0.2892, "step": 4160 }, { "epoch": 0.34, "grad_norm": 6.613883714897734, "learning_rate": 1.978749954948726e-05, "loss": 0.3042, "step": 4180 }, { "epoch": 0.34, "grad_norm": 5.588985182579549, "learning_rate": 1.978383485793645e-05, "loss": 0.2895, "step": 4200 }, { "epoch": 0.34, "grad_norm": 41.788686405813685, "learning_rate": 1.9780139181356223e-05, "loss": 0.2967, "step": 4220 }, { "epoch": 0.34, "grad_norm": 8.000329487691184, "learning_rate": 1.9776412531451845e-05, "loss": 0.3068, "step": 4240 }, { "epoch": 0.34, "grad_norm": 8.858664509374336, "learning_rate": 1.977265492002667e-05, "loss": 0.2904, "step": 4260 }, { "epoch": 0.35, "grad_norm": 6.412322055660321, "learning_rate": 1.9768866358982138e-05, "loss": 0.302, "step": 4280 }, { "epoch": 0.35, "grad_norm": 6.15402072878952, "learning_rate": 1.9765046860317697e-05, "loss": 0.2753, "step": 4300 }, { "epoch": 0.35, "grad_norm": 7.363823390602094, "learning_rate": 1.9761196436130792e-05, "loss": 0.3077, "step": 4320 }, { "epoch": 0.35, "grad_norm": 5.820012641709484, "learning_rate": 1.9757315098616813e-05, "loss": 0.3024, "step": 4340 }, { "epoch": 0.35, "grad_norm": 5.291771334516593, "learning_rate": 1.975340286006906e-05, "loss": 0.2732, "step": 4360 }, { "epoch": 0.35, "grad_norm": 5.880570388428466, "learning_rate": 1.9749459732878716e-05, "loss": 0.2491, "step": 4380 }, { "epoch": 0.36, "grad_norm": 5.655270317760537, "learning_rate": 1.9745485729534788e-05, "loss": 0.2803, "step": 4400 }, { "epoch": 0.36, "grad_norm": 6.013092379821028, "learning_rate": 1.974148086262408e-05, "loss": 0.2803, "step": 4420 }, { "epoch": 0.36, "grad_norm": 5.7211652147787975, "learning_rate": 1.9737445144831136e-05, "loss": 0.2637, "step": 4440 }, { "epoch": 0.36, "grad_norm": 8.131187547800137, "learning_rate": 1.973337858893824e-05, "loss": 0.3255, "step": 4460 }, { "epoch": 0.36, "grad_norm": 7.137552013307909, "learning_rate": 1.972928120782533e-05, "loss": 0.2668, "step": 4480 }, { "epoch": 0.36, "grad_norm": 5.150009725617049, "learning_rate": 1.972515301446998e-05, "loss": 0.2854, "step": 4500 }, { "epoch": 0.36, "grad_norm": 6.331424258094408, "learning_rate": 1.972099402194736e-05, "loss": 0.2866, "step": 4520 }, { "epoch": 0.37, "grad_norm": 8.392412446366174, "learning_rate": 1.9716804243430176e-05, "loss": 0.2616, "step": 4540 }, { "epoch": 0.37, "grad_norm": 7.299549339702017, "learning_rate": 1.971258369218867e-05, "loss": 0.2983, "step": 4560 }, { "epoch": 0.37, "grad_norm": 6.2169745129545575, "learning_rate": 1.970833238159051e-05, "loss": 0.276, "step": 4580 }, { "epoch": 0.37, "grad_norm": 7.506632605972847, "learning_rate": 1.9704050325100827e-05, "loss": 0.2951, "step": 4600 }, { "epoch": 0.37, "grad_norm": 6.66483208527068, "learning_rate": 1.969973753628211e-05, "loss": 0.2784, "step": 4620 }, { "epoch": 0.37, "grad_norm": 7.211925104193477, "learning_rate": 1.9695394028794195e-05, "loss": 0.2729, "step": 4640 }, { "epoch": 0.38, "grad_norm": 4.20991234694906, "learning_rate": 1.9691019816394204e-05, "loss": 0.3152, "step": 4660 }, { "epoch": 0.38, "grad_norm": 5.975539574939649, "learning_rate": 1.9686614912936516e-05, "loss": 0.2747, "step": 4680 }, { "epoch": 0.38, "grad_norm": 6.135748637813934, "learning_rate": 1.968217933237272e-05, "loss": 0.3028, "step": 4700 }, { "epoch": 0.38, "grad_norm": 8.994912298940163, "learning_rate": 1.9677713088751562e-05, "loss": 0.3043, "step": 4720 }, { "epoch": 0.38, "grad_norm": 7.649871286543558, "learning_rate": 1.967321619621892e-05, "loss": 0.2577, "step": 4740 }, { "epoch": 0.38, "grad_norm": 6.035703921853307, "learning_rate": 1.9668688669017722e-05, "loss": 0.2596, "step": 4760 }, { "epoch": 0.39, "grad_norm": 5.4070428696843615, "learning_rate": 1.9664130521487946e-05, "loss": 0.2885, "step": 4780 }, { "epoch": 0.39, "grad_norm": 9.68037240943506, "learning_rate": 1.9659541768066545e-05, "loss": 0.2739, "step": 4800 }, { "epoch": 0.39, "grad_norm": 7.032775442165197, "learning_rate": 1.965492242328741e-05, "loss": 0.2832, "step": 4820 }, { "epoch": 0.39, "grad_norm": 7.038266627020968, "learning_rate": 1.9650272501781326e-05, "loss": 0.3053, "step": 4840 }, { "epoch": 0.39, "grad_norm": 4.469246363249616, "learning_rate": 1.9645592018275917e-05, "loss": 0.2922, "step": 4860 }, { "epoch": 0.39, "grad_norm": 9.997476259295432, "learning_rate": 1.964088098759561e-05, "loss": 0.3029, "step": 4880 }, { "epoch": 0.4, "grad_norm": 4.1359256203786705, "learning_rate": 1.9636139424661588e-05, "loss": 0.2885, "step": 4900 }, { "epoch": 0.4, "grad_norm": 6.857022727186512, "learning_rate": 1.9631367344491735e-05, "loss": 0.263, "step": 4920 }, { "epoch": 0.4, "grad_norm": 6.720261230840821, "learning_rate": 1.9626564762200583e-05, "loss": 0.3083, "step": 4940 }, { "epoch": 0.4, "grad_norm": 7.113731977499931, "learning_rate": 1.9621731692999284e-05, "loss": 0.2789, "step": 4960 }, { "epoch": 0.4, "grad_norm": 6.665634774774537, "learning_rate": 1.961686815219555e-05, "loss": 0.2591, "step": 4980 }, { "epoch": 0.4, "grad_norm": 4.599220599612653, "learning_rate": 1.9611974155193597e-05, "loss": 0.2753, "step": 5000 }, { "epoch": 0.41, "grad_norm": 6.564880520618788, "learning_rate": 1.960704971749411e-05, "loss": 0.2805, "step": 5020 }, { "epoch": 0.41, "grad_norm": 7.418663470463415, "learning_rate": 1.9602094854694194e-05, "loss": 0.2782, "step": 5040 }, { "epoch": 0.41, "grad_norm": 5.491847672130194, "learning_rate": 1.9597109582487313e-05, "loss": 0.2702, "step": 5060 }, { "epoch": 0.41, "grad_norm": 12.43956214256869, "learning_rate": 1.9592093916663242e-05, "loss": 0.2972, "step": 5080 }, { "epoch": 0.41, "grad_norm": 8.800236820155485, "learning_rate": 1.958704787310803e-05, "loss": 0.2725, "step": 5100 }, { "epoch": 0.41, "grad_norm": 6.644759742176537, "learning_rate": 1.9581971467803934e-05, "loss": 0.289, "step": 5120 }, { "epoch": 0.41, "grad_norm": 5.009336147526538, "learning_rate": 1.9576864716829377e-05, "loss": 0.2969, "step": 5140 }, { "epoch": 0.42, "grad_norm": 5.803503477935393, "learning_rate": 1.95717276363589e-05, "loss": 0.2774, "step": 5160 }, { "epoch": 0.42, "grad_norm": 5.894993950320594, "learning_rate": 1.95665602426631e-05, "loss": 0.2273, "step": 5180 }, { "epoch": 0.42, "grad_norm": 6.96986305003759, "learning_rate": 1.956136255210859e-05, "loss": 0.2736, "step": 5200 }, { "epoch": 0.42, "grad_norm": 9.605041419937288, "learning_rate": 1.955613458115793e-05, "loss": 0.2907, "step": 5220 }, { "epoch": 0.42, "grad_norm": 6.647244804794919, "learning_rate": 1.9550876346369615e-05, "loss": 0.261, "step": 5240 }, { "epoch": 0.42, "grad_norm": 5.607026269896423, "learning_rate": 1.9545587864397955e-05, "loss": 0.3143, "step": 5260 }, { "epoch": 0.43, "grad_norm": 13.142033450455475, "learning_rate": 1.954026915199309e-05, "loss": 0.2434, "step": 5280 }, { "epoch": 0.43, "grad_norm": 3.248788167531875, "learning_rate": 1.9534920226000902e-05, "loss": 0.2705, "step": 5300 }, { "epoch": 0.43, "grad_norm": 13.164180730181236, "learning_rate": 1.9529541103362962e-05, "loss": 0.2862, "step": 5320 }, { "epoch": 0.43, "grad_norm": 5.831970091880435, "learning_rate": 1.9524131801116487e-05, "loss": 0.3054, "step": 5340 }, { "epoch": 0.43, "grad_norm": 5.446448668681817, "learning_rate": 1.951869233639428e-05, "loss": 0.2671, "step": 5360 }, { "epoch": 0.43, "grad_norm": 6.302315229032403, "learning_rate": 1.951322272642468e-05, "loss": 0.2765, "step": 5380 }, { "epoch": 0.44, "grad_norm": 11.243376074569383, "learning_rate": 1.9507722988531502e-05, "loss": 0.2582, "step": 5400 }, { "epoch": 0.44, "grad_norm": 7.673489589122099, "learning_rate": 1.9502193140133983e-05, "loss": 0.3143, "step": 5420 }, { "epoch": 0.44, "grad_norm": 7.9949329541838265, "learning_rate": 1.9496633198746736e-05, "loss": 0.2862, "step": 5440 }, { "epoch": 0.44, "grad_norm": 4.907903307657898, "learning_rate": 1.9491043181979677e-05, "loss": 0.2926, "step": 5460 }, { "epoch": 0.44, "grad_norm": 8.67569902665337, "learning_rate": 1.9485423107537986e-05, "loss": 0.2741, "step": 5480 }, { "epoch": 0.44, "grad_norm": 7.103858232561379, "learning_rate": 1.9479772993222038e-05, "loss": 0.2767, "step": 5500 }, { "epoch": 0.45, "grad_norm": 8.73970254143099, "learning_rate": 1.947409285692736e-05, "loss": 0.232, "step": 5520 }, { "epoch": 0.45, "grad_norm": 5.799420807956918, "learning_rate": 1.946838271664457e-05, "loss": 0.286, "step": 5540 }, { "epoch": 0.45, "grad_norm": 5.575038878985263, "learning_rate": 1.9462642590459306e-05, "loss": 0.2361, "step": 5560 }, { "epoch": 0.45, "grad_norm": 5.718902752613272, "learning_rate": 1.9456872496552184e-05, "loss": 0.2781, "step": 5580 }, { "epoch": 0.45, "grad_norm": 6.156497233891847, "learning_rate": 1.9451072453198742e-05, "loss": 0.2798, "step": 5600 }, { "epoch": 0.45, "grad_norm": 8.158434812741351, "learning_rate": 1.9445242478769374e-05, "loss": 0.2629, "step": 5620 }, { "epoch": 0.46, "grad_norm": 4.100053898526847, "learning_rate": 1.9439382591729265e-05, "loss": 0.2616, "step": 5640 }, { "epoch": 0.46, "grad_norm": 10.398486291325238, "learning_rate": 1.9433492810638355e-05, "loss": 0.281, "step": 5660 }, { "epoch": 0.46, "grad_norm": 7.054531497011973, "learning_rate": 1.942757315415126e-05, "loss": 0.2899, "step": 5680 }, { "epoch": 0.46, "grad_norm": 5.42866324225203, "learning_rate": 1.9421623641017218e-05, "loss": 0.3102, "step": 5700 }, { "epoch": 0.46, "grad_norm": 7.133905299895763, "learning_rate": 1.941564429008004e-05, "loss": 0.2616, "step": 5720 }, { "epoch": 0.46, "grad_norm": 6.043327611859842, "learning_rate": 1.9409635120278035e-05, "loss": 0.2614, "step": 5740 }, { "epoch": 0.46, "grad_norm": 7.125098244508386, "learning_rate": 1.9403596150643957e-05, "loss": 0.2732, "step": 5760 }, { "epoch": 0.47, "grad_norm": 7.809924868565428, "learning_rate": 1.9397527400304944e-05, "loss": 0.2537, "step": 5780 }, { "epoch": 0.47, "grad_norm": 6.721114625225486, "learning_rate": 1.9391428888482466e-05, "loss": 0.2935, "step": 5800 }, { "epoch": 0.47, "grad_norm": 5.41764363012664, "learning_rate": 1.9385300634492244e-05, "loss": 0.2644, "step": 5820 }, { "epoch": 0.47, "grad_norm": 6.577839969793495, "learning_rate": 1.937914265774421e-05, "loss": 0.2822, "step": 5840 }, { "epoch": 0.47, "grad_norm": 5.446653006796048, "learning_rate": 1.9372954977742437e-05, "loss": 0.2767, "step": 5860 }, { "epoch": 0.47, "grad_norm": 6.276051357995197, "learning_rate": 1.9366737614085067e-05, "loss": 0.2693, "step": 5880 }, { "epoch": 0.48, "grad_norm": 5.1174083399984935, "learning_rate": 1.9360490586464265e-05, "loss": 0.2968, "step": 5900 }, { "epoch": 0.48, "grad_norm": 8.060800772518713, "learning_rate": 1.9354213914666154e-05, "loss": 0.3042, "step": 5920 }, { "epoch": 0.48, "grad_norm": 6.946903213944759, "learning_rate": 1.934790761857074e-05, "loss": 0.2896, "step": 5940 }, { "epoch": 0.48, "grad_norm": 6.9058705221323855, "learning_rate": 1.934157171815187e-05, "loss": 0.2697, "step": 5960 }, { "epoch": 0.48, "grad_norm": 6.140873173559298, "learning_rate": 1.9335206233477138e-05, "loss": 0.3012, "step": 5980 }, { "epoch": 0.48, "grad_norm": 6.2613108710926415, "learning_rate": 1.9328811184707857e-05, "loss": 0.2616, "step": 6000 }, { "epoch": 0.49, "grad_norm": 7.103058610195544, "learning_rate": 1.932238659209897e-05, "loss": 0.2476, "step": 6020 }, { "epoch": 0.49, "grad_norm": 7.721454841424903, "learning_rate": 1.9315932475998994e-05, "loss": 0.2772, "step": 6040 }, { "epoch": 0.49, "grad_norm": 7.2979455852743245, "learning_rate": 1.930944885684996e-05, "loss": 0.2463, "step": 6060 }, { "epoch": 0.49, "grad_norm": 5.194835577851161, "learning_rate": 1.9302935755187335e-05, "loss": 0.2595, "step": 6080 }, { "epoch": 0.49, "grad_norm": 8.826615087967348, "learning_rate": 1.9296393191639976e-05, "loss": 0.2728, "step": 6100 }, { "epoch": 0.49, "grad_norm": 4.8811686686091, "learning_rate": 1.9289821186930038e-05, "loss": 0.2998, "step": 6120 }, { "epoch": 0.5, "grad_norm": 6.458493860362177, "learning_rate": 1.9283219761872943e-05, "loss": 0.2608, "step": 6140 }, { "epoch": 0.5, "grad_norm": 7.035199086314618, "learning_rate": 1.9276588937377293e-05, "loss": 0.2789, "step": 6160 }, { "epoch": 0.5, "grad_norm": 5.133893925330738, "learning_rate": 1.9269928734444792e-05, "loss": 0.2858, "step": 6180 }, { "epoch": 0.5, "grad_norm": 6.888562228890644, "learning_rate": 1.9263239174170203e-05, "loss": 0.263, "step": 6200 }, { "epoch": 0.5, "grad_norm": 7.247178153358964, "learning_rate": 1.9256520277741276e-05, "loss": 0.2887, "step": 6220 }, { "epoch": 0.5, "grad_norm": 5.8972079378636755, "learning_rate": 1.9249772066438676e-05, "loss": 0.2693, "step": 6240 }, { "epoch": 0.51, "grad_norm": 4.962455745470868, "learning_rate": 1.924299456163591e-05, "loss": 0.2749, "step": 6260 }, { "epoch": 0.51, "grad_norm": 7.258216936978947, "learning_rate": 1.9236187784799267e-05, "loss": 0.2957, "step": 6280 }, { "epoch": 0.51, "grad_norm": 6.194723517380506, "learning_rate": 1.9229351757487757e-05, "loss": 0.2773, "step": 6300 }, { "epoch": 0.51, "grad_norm": 6.5263758820129505, "learning_rate": 1.9222486501353027e-05, "loss": 0.293, "step": 6320 }, { "epoch": 0.51, "grad_norm": 6.366073941639377, "learning_rate": 1.9215592038139296e-05, "loss": 0.2755, "step": 6340 }, { "epoch": 0.51, "grad_norm": 116.03864354978906, "learning_rate": 1.9208668389683308e-05, "loss": 0.251, "step": 6360 }, { "epoch": 0.52, "grad_norm": 5.264420821601751, "learning_rate": 1.9201715577914223e-05, "loss": 0.2845, "step": 6380 }, { "epoch": 0.52, "grad_norm": 7.40581166704595, "learning_rate": 1.9194733624853584e-05, "loss": 0.2632, "step": 6400 }, { "epoch": 0.52, "grad_norm": 5.941729193844859, "learning_rate": 1.918772255261523e-05, "loss": 0.2597, "step": 6420 }, { "epoch": 0.52, "grad_norm": 6.312855927658963, "learning_rate": 1.9180682383405227e-05, "loss": 0.2692, "step": 6440 }, { "epoch": 0.52, "grad_norm": 6.071350558765477, "learning_rate": 1.9173613139521798e-05, "loss": 0.2731, "step": 6460 }, { "epoch": 0.52, "grad_norm": 8.993502072541519, "learning_rate": 1.9166514843355254e-05, "loss": 0.2548, "step": 6480 }, { "epoch": 0.52, "grad_norm": 7.005495911170442, "learning_rate": 1.9159387517387924e-05, "loss": 0.2612, "step": 6500 }, { "epoch": 0.53, "grad_norm": 7.665844444744844, "learning_rate": 1.915223118419409e-05, "loss": 0.2501, "step": 6520 }, { "epoch": 0.53, "grad_norm": 6.458334088069092, "learning_rate": 1.9145045866439892e-05, "loss": 0.2762, "step": 6540 }, { "epoch": 0.53, "grad_norm": 6.819348453934704, "learning_rate": 1.9137831586883288e-05, "loss": 0.2826, "step": 6560 }, { "epoch": 0.53, "grad_norm": 5.080353890354994, "learning_rate": 1.9130588368373958e-05, "loss": 0.2738, "step": 6580 }, { "epoch": 0.53, "grad_norm": 5.836344060052037, "learning_rate": 1.912331623385324e-05, "loss": 0.2586, "step": 6600 }, { "epoch": 0.53, "grad_norm": 8.357013119166787, "learning_rate": 1.9116015206354067e-05, "loss": 0.3174, "step": 6620 }, { "epoch": 0.54, "grad_norm": 4.307353833021694, "learning_rate": 1.9108685309000866e-05, "loss": 0.2721, "step": 6640 }, { "epoch": 0.54, "grad_norm": 7.002584797605542, "learning_rate": 1.9101326565009517e-05, "loss": 0.2581, "step": 6660 }, { "epoch": 0.54, "grad_norm": 7.299065402050334, "learning_rate": 1.909393899768726e-05, "loss": 0.2933, "step": 6680 }, { "epoch": 0.54, "grad_norm": 6.57009941780665, "learning_rate": 1.9086522630432638e-05, "loss": 0.2843, "step": 6700 }, { "epoch": 0.54, "grad_norm": 6.857683881351832, "learning_rate": 1.907907748673539e-05, "loss": 0.273, "step": 6720 }, { "epoch": 0.54, "grad_norm": 5.553536898181894, "learning_rate": 1.9071603590176417e-05, "loss": 0.2623, "step": 6740 }, { "epoch": 0.55, "grad_norm": 6.30566096206076, "learning_rate": 1.906410096442768e-05, "loss": 0.2366, "step": 6760 }, { "epoch": 0.55, "grad_norm": 10.352010603508685, "learning_rate": 1.9056569633252136e-05, "loss": 0.2546, "step": 6780 }, { "epoch": 0.55, "grad_norm": 7.80077709337333, "learning_rate": 1.9049009620503663e-05, "loss": 0.2763, "step": 6800 }, { "epoch": 0.55, "grad_norm": 8.03821543687826, "learning_rate": 1.9041420950126976e-05, "loss": 0.2486, "step": 6820 }, { "epoch": 0.55, "grad_norm": 5.390540434685423, "learning_rate": 1.9033803646157558e-05, "loss": 0.2964, "step": 6840 }, { "epoch": 0.55, "grad_norm": 7.430208109717449, "learning_rate": 1.9026157732721585e-05, "loss": 0.2681, "step": 6860 }, { "epoch": 0.56, "grad_norm": 7.857246125768761, "learning_rate": 1.9018483234035845e-05, "loss": 0.2719, "step": 6880 }, { "epoch": 0.56, "grad_norm": 5.565927371122475, "learning_rate": 1.901078017440767e-05, "loss": 0.2703, "step": 6900 }, { "epoch": 0.56, "grad_norm": 6.9976710123560375, "learning_rate": 1.9003048578234843e-05, "loss": 0.2566, "step": 6920 }, { "epoch": 0.56, "grad_norm": 6.478229745256355, "learning_rate": 1.899528847000554e-05, "loss": 0.253, "step": 6940 }, { "epoch": 0.56, "grad_norm": 5.116336291620225, "learning_rate": 1.898749987429823e-05, "loss": 0.2529, "step": 6960 }, { "epoch": 0.56, "grad_norm": 5.587216274900646, "learning_rate": 1.8979682815781627e-05, "loss": 0.2848, "step": 6980 }, { "epoch": 0.57, "grad_norm": 5.679239499396368, "learning_rate": 1.8971837319214586e-05, "loss": 0.2435, "step": 7000 }, { "epoch": 0.57, "grad_norm": 6.1169436663169074, "learning_rate": 1.8963963409446022e-05, "loss": 0.2793, "step": 7020 }, { "epoch": 0.57, "grad_norm": 5.802383972086084, "learning_rate": 1.8956061111414865e-05, "loss": 0.2717, "step": 7040 }, { "epoch": 0.57, "grad_norm": 6.507485407821351, "learning_rate": 1.8948130450149942e-05, "loss": 0.3011, "step": 7060 }, { "epoch": 0.57, "grad_norm": 5.9621692749685415, "learning_rate": 1.8940171450769924e-05, "loss": 0.3076, "step": 7080 }, { "epoch": 0.57, "grad_norm": 54.193219403324576, "learning_rate": 1.8932184138483223e-05, "loss": 0.2629, "step": 7100 }, { "epoch": 0.57, "grad_norm": 7.104622864455409, "learning_rate": 1.8924168538587956e-05, "loss": 0.2714, "step": 7120 }, { "epoch": 0.58, "grad_norm": 7.0480173880874615, "learning_rate": 1.8916124676471797e-05, "loss": 0.2736, "step": 7140 }, { "epoch": 0.58, "grad_norm": 8.412429750327739, "learning_rate": 1.8908052577611958e-05, "loss": 0.2644, "step": 7160 }, { "epoch": 0.58, "grad_norm": 8.78692031185526, "learning_rate": 1.8899952267575083e-05, "loss": 0.2402, "step": 7180 }, { "epoch": 0.58, "grad_norm": 6.684638721458758, "learning_rate": 1.889182377201716e-05, "loss": 0.2542, "step": 7200 }, { "epoch": 0.58, "grad_norm": 6.010627553359556, "learning_rate": 1.8883667116683457e-05, "loss": 0.2838, "step": 7220 }, { "epoch": 0.58, "grad_norm": 7.414666524064716, "learning_rate": 1.887548232740843e-05, "loss": 0.2851, "step": 7240 }, { "epoch": 0.59, "grad_norm": 8.34076720224061, "learning_rate": 1.886726943011564e-05, "loss": 0.2516, "step": 7260 }, { "epoch": 0.59, "grad_norm": 6.094640422225146, "learning_rate": 1.885902845081767e-05, "loss": 0.2313, "step": 7280 }, { "epoch": 0.59, "grad_norm": 4.148097252407384, "learning_rate": 1.8850759415616066e-05, "loss": 0.2689, "step": 7300 }, { "epoch": 0.59, "grad_norm": 7.561235558598041, "learning_rate": 1.8842462350701212e-05, "loss": 0.2983, "step": 7320 }, { "epoch": 0.59, "grad_norm": 7.87091273248736, "learning_rate": 1.883413728235228e-05, "loss": 0.2386, "step": 7340 }, { "epoch": 0.59, "grad_norm": 6.197625708495748, "learning_rate": 1.8825784236937146e-05, "loss": 0.282, "step": 7360 }, { "epoch": 0.6, "grad_norm": 9.524108011926973, "learning_rate": 1.8817403240912283e-05, "loss": 0.2776, "step": 7380 }, { "epoch": 0.6, "grad_norm": 5.147943823246307, "learning_rate": 1.8808994320822693e-05, "loss": 0.2625, "step": 7400 }, { "epoch": 0.6, "grad_norm": 5.764109011612628, "learning_rate": 1.8800557503301827e-05, "loss": 0.2859, "step": 7420 }, { "epoch": 0.6, "grad_norm": 7.7970156609149335, "learning_rate": 1.8792092815071498e-05, "loss": 0.2589, "step": 7440 }, { "epoch": 0.6, "grad_norm": 6.220123292737489, "learning_rate": 1.8783600282941782e-05, "loss": 0.269, "step": 7460 }, { "epoch": 0.6, "grad_norm": 6.875941264134116, "learning_rate": 1.877507993381096e-05, "loss": 0.2624, "step": 7480 }, { "epoch": 0.61, "grad_norm": 5.721394912188018, "learning_rate": 1.8766531794665402e-05, "loss": 0.2571, "step": 7500 }, { "epoch": 0.61, "grad_norm": 6.99318335916291, "learning_rate": 1.8757955892579504e-05, "loss": 0.26, "step": 7520 }, { "epoch": 0.61, "grad_norm": 6.692727585899676, "learning_rate": 1.87493522547156e-05, "loss": 0.2635, "step": 7540 }, { "epoch": 0.61, "grad_norm": 6.5007755110350525, "learning_rate": 1.874072090832386e-05, "loss": 0.2754, "step": 7560 }, { "epoch": 0.61, "grad_norm": 7.775379340923738, "learning_rate": 1.873206188074223e-05, "loss": 0.2708, "step": 7580 }, { "epoch": 0.61, "grad_norm": 4.970941791912674, "learning_rate": 1.872337519939631e-05, "loss": 0.2592, "step": 7600 }, { "epoch": 0.62, "grad_norm": 7.276189396167904, "learning_rate": 1.8714660891799302e-05, "loss": 0.2648, "step": 7620 }, { "epoch": 0.62, "grad_norm": 6.724776718800752, "learning_rate": 1.870591898555191e-05, "loss": 0.2606, "step": 7640 }, { "epoch": 0.62, "grad_norm": 7.735768695454274, "learning_rate": 1.8697149508342237e-05, "loss": 0.2511, "step": 7660 }, { "epoch": 0.62, "grad_norm": 5.05914779633595, "learning_rate": 1.868835248794573e-05, "loss": 0.2609, "step": 7680 }, { "epoch": 0.62, "grad_norm": 8.031314122281715, "learning_rate": 1.8679527952225054e-05, "loss": 0.2718, "step": 7700 }, { "epoch": 0.62, "grad_norm": 5.619780768194464, "learning_rate": 1.867067592913004e-05, "loss": 0.2717, "step": 7720 }, { "epoch": 0.62, "grad_norm": 7.595427904662886, "learning_rate": 1.8661796446697557e-05, "loss": 0.2536, "step": 7740 }, { "epoch": 0.63, "grad_norm": 5.462276616537402, "learning_rate": 1.8652889533051473e-05, "loss": 0.2674, "step": 7760 }, { "epoch": 0.63, "grad_norm": 5.302383844019715, "learning_rate": 1.864395521640252e-05, "loss": 0.2856, "step": 7780 }, { "epoch": 0.63, "grad_norm": 4.703940083284321, "learning_rate": 1.8634993525048227e-05, "loss": 0.2609, "step": 7800 }, { "epoch": 0.63, "grad_norm": 6.585961827134786, "learning_rate": 1.862600448737283e-05, "loss": 0.265, "step": 7820 }, { "epoch": 0.63, "grad_norm": 7.27689896277283, "learning_rate": 1.861698813184717e-05, "loss": 0.3018, "step": 7840 }, { "epoch": 0.63, "grad_norm": 6.231232809733686, "learning_rate": 1.860794448702863e-05, "loss": 0.2268, "step": 7860 }, { "epoch": 0.64, "grad_norm": 7.794911353272152, "learning_rate": 1.8598873581561e-05, "loss": 0.2632, "step": 7880 }, { "epoch": 0.64, "grad_norm": 6.977335614708055, "learning_rate": 1.8589775444174436e-05, "loss": 0.3097, "step": 7900 }, { "epoch": 0.64, "grad_norm": 7.607942857642037, "learning_rate": 1.858065010368533e-05, "loss": 0.2658, "step": 7920 }, { "epoch": 0.64, "grad_norm": 6.109669397778123, "learning_rate": 1.857149758899624e-05, "loss": 0.2613, "step": 7940 }, { "epoch": 0.64, "grad_norm": 6.142102090556645, "learning_rate": 1.8562317929095796e-05, "loss": 0.2769, "step": 7960 }, { "epoch": 0.64, "grad_norm": 3.98370343700879, "learning_rate": 1.8553111153058593e-05, "loss": 0.2642, "step": 7980 }, { "epoch": 0.65, "grad_norm": 6.375900504146025, "learning_rate": 1.8543877290045122e-05, "loss": 0.2646, "step": 8000 }, { "epoch": 0.65, "grad_norm": 7.277577534154136, "learning_rate": 1.853461636930166e-05, "loss": 0.2806, "step": 8020 }, { "epoch": 0.65, "grad_norm": 6.81435963858201, "learning_rate": 1.852532842016019e-05, "loss": 0.2536, "step": 8040 }, { "epoch": 0.65, "grad_norm": 5.854006003712663, "learning_rate": 1.851601347203829e-05, "loss": 0.2447, "step": 8060 }, { "epoch": 0.65, "grad_norm": 7.787886275359923, "learning_rate": 1.8506671554439064e-05, "loss": 0.2663, "step": 8080 }, { "epoch": 0.65, "grad_norm": 9.21089898409568, "learning_rate": 1.849730269695103e-05, "loss": 0.2601, "step": 8100 }, { "epoch": 0.66, "grad_norm": 8.590889527489873, "learning_rate": 1.8487906929248028e-05, "loss": 0.2531, "step": 8120 }, { "epoch": 0.66, "grad_norm": 5.551684548356732, "learning_rate": 1.8478484281089143e-05, "loss": 0.2605, "step": 8140 }, { "epoch": 0.66, "grad_norm": 4.227848217032472, "learning_rate": 1.8469034782318585e-05, "loss": 0.2728, "step": 8160 }, { "epoch": 0.66, "grad_norm": 6.015758166139706, "learning_rate": 1.8459558462865613e-05, "loss": 0.2883, "step": 8180 }, { "epoch": 0.66, "grad_norm": 6.568658173678755, "learning_rate": 1.845005535274444e-05, "loss": 0.2454, "step": 8200 }, { "epoch": 0.66, "grad_norm": 5.985752674217696, "learning_rate": 1.844052548205412e-05, "loss": 0.2442, "step": 8220 }, { "epoch": 0.67, "grad_norm": 6.3219678524060425, "learning_rate": 1.843096888097848e-05, "loss": 0.2912, "step": 8240 }, { "epoch": 0.67, "grad_norm": 4.096257730243316, "learning_rate": 1.8421385579785997e-05, "loss": 0.2636, "step": 8260 }, { "epoch": 0.67, "grad_norm": 6.396648972118899, "learning_rate": 1.8411775608829722e-05, "loss": 0.2324, "step": 8280 }, { "epoch": 0.67, "grad_norm": 4.782379216505, "learning_rate": 1.8402138998547174e-05, "loss": 0.2675, "step": 8300 }, { "epoch": 0.67, "grad_norm": 8.676707198167653, "learning_rate": 1.839247577946025e-05, "loss": 0.2843, "step": 8320 }, { "epoch": 0.67, "grad_norm": 5.39138478992206, "learning_rate": 1.8382785982175118e-05, "loss": 0.2742, "step": 8340 }, { "epoch": 0.67, "grad_norm": 5.818443622984385, "learning_rate": 1.8373069637382136e-05, "loss": 0.26, "step": 8360 }, { "epoch": 0.68, "grad_norm": 8.95366226368456, "learning_rate": 1.8363326775855737e-05, "loss": 0.2687, "step": 8380 }, { "epoch": 0.68, "grad_norm": 7.96756080281063, "learning_rate": 1.8353557428454346e-05, "loss": 0.2425, "step": 8400 }, { "epoch": 0.68, "grad_norm": 6.577104865413394, "learning_rate": 1.8343761626120272e-05, "loss": 0.2688, "step": 8420 }, { "epoch": 0.68, "grad_norm": 6.6269283727065, "learning_rate": 1.8333939399879617e-05, "loss": 0.2808, "step": 8440 }, { "epoch": 0.68, "grad_norm": 7.016348140974161, "learning_rate": 1.8324090780842173e-05, "loss": 0.2511, "step": 8460 }, { "epoch": 0.68, "grad_norm": 7.5454363034081116, "learning_rate": 1.831421580020133e-05, "loss": 0.252, "step": 8480 }, { "epoch": 0.69, "grad_norm": 5.837760589468463, "learning_rate": 1.830431448923396e-05, "loss": 0.2728, "step": 8500 }, { "epoch": 0.69, "grad_norm": 6.154380243306325, "learning_rate": 1.8294386879300353e-05, "loss": 0.2867, "step": 8520 }, { "epoch": 0.69, "grad_norm": 7.71122937485844, "learning_rate": 1.8284433001844073e-05, "loss": 0.2302, "step": 8540 }, { "epoch": 0.69, "grad_norm": 6.86335128201322, "learning_rate": 1.8274452888391894e-05, "loss": 0.2586, "step": 8560 }, { "epoch": 0.69, "grad_norm": 5.661853354206643, "learning_rate": 1.8264446570553682e-05, "loss": 0.2505, "step": 8580 }, { "epoch": 0.69, "grad_norm": 5.982364804963667, "learning_rate": 1.82544140800223e-05, "loss": 0.2673, "step": 8600 }, { "epoch": 0.7, "grad_norm": 4.739028708176796, "learning_rate": 1.824435544857351e-05, "loss": 0.2678, "step": 8620 }, { "epoch": 0.7, "grad_norm": 4.91420533377473, "learning_rate": 1.823427070806587e-05, "loss": 0.2559, "step": 8640 }, { "epoch": 0.7, "grad_norm": 5.618249360419533, "learning_rate": 1.8224159890440623e-05, "loss": 0.2493, "step": 8660 }, { "epoch": 0.7, "grad_norm": 5.896677808188606, "learning_rate": 1.821402302772162e-05, "loss": 0.2585, "step": 8680 }, { "epoch": 0.7, "grad_norm": 6.073985124124518, "learning_rate": 1.82038601520152e-05, "loss": 0.2452, "step": 8700 }, { "epoch": 0.7, "grad_norm": 7.1459209410818, "learning_rate": 1.819367129551008e-05, "loss": 0.2592, "step": 8720 }, { "epoch": 0.71, "grad_norm": 6.390094315335785, "learning_rate": 1.8183456490477287e-05, "loss": 0.2461, "step": 8740 }, { "epoch": 0.71, "grad_norm": 5.294426005863845, "learning_rate": 1.8173215769270015e-05, "loss": 0.2685, "step": 8760 }, { "epoch": 0.71, "grad_norm": 5.276924483715485, "learning_rate": 1.8162949164323554e-05, "loss": 0.2615, "step": 8780 }, { "epoch": 0.71, "grad_norm": 7.331765382932756, "learning_rate": 1.8152656708155173e-05, "loss": 0.2828, "step": 8800 }, { "epoch": 0.71, "grad_norm": 5.361402122667844, "learning_rate": 1.8142338433364012e-05, "loss": 0.2849, "step": 8820 }, { "epoch": 0.71, "grad_norm": 6.712375473487036, "learning_rate": 1.8131994372630995e-05, "loss": 0.2716, "step": 8840 }, { "epoch": 0.72, "grad_norm": 8.103353922148388, "learning_rate": 1.812162455871872e-05, "loss": 0.2703, "step": 8860 }, { "epoch": 0.72, "grad_norm": 4.585974100152074, "learning_rate": 1.8111229024471334e-05, "loss": 0.2386, "step": 8880 }, { "epoch": 0.72, "grad_norm": 6.8332489132512375, "learning_rate": 1.8100807802814467e-05, "loss": 0.2935, "step": 8900 }, { "epoch": 0.72, "grad_norm": 5.556964992180211, "learning_rate": 1.80903609267551e-05, "loss": 0.2404, "step": 8920 }, { "epoch": 0.72, "grad_norm": 6.524527124099894, "learning_rate": 1.8079888429381472e-05, "loss": 0.2477, "step": 8940 }, { "epoch": 0.72, "grad_norm": 6.394125877212817, "learning_rate": 1.8069390343862972e-05, "loss": 0.2585, "step": 8960 }, { "epoch": 0.72, "grad_norm": 7.212304875264878, "learning_rate": 1.805886670345003e-05, "loss": 0.2514, "step": 8980 }, { "epoch": 0.73, "grad_norm": 5.915336602662839, "learning_rate": 1.8048317541474015e-05, "loss": 0.2554, "step": 9000 }, { "epoch": 0.73, "grad_norm": 6.204874325324116, "learning_rate": 1.803774289134714e-05, "loss": 0.2663, "step": 9020 }, { "epoch": 0.73, "grad_norm": 4.9458264028130525, "learning_rate": 1.8027142786562334e-05, "loss": 0.2374, "step": 9040 }, { "epoch": 0.73, "grad_norm": 5.66437734846908, "learning_rate": 1.8016517260693152e-05, "loss": 0.2173, "step": 9060 }, { "epoch": 0.73, "grad_norm": 8.8145498502476, "learning_rate": 1.800586634739367e-05, "loss": 0.2672, "step": 9080 }, { "epoch": 0.73, "grad_norm": 5.225621616310874, "learning_rate": 1.799519008039837e-05, "loss": 0.263, "step": 9100 }, { "epoch": 0.74, "grad_norm": 6.749141497235558, "learning_rate": 1.7984488493522033e-05, "loss": 0.294, "step": 9120 }, { "epoch": 0.74, "grad_norm": 6.5925500148457115, "learning_rate": 1.7973761620659645e-05, "loss": 0.2549, "step": 9140 }, { "epoch": 0.74, "grad_norm": 3.6612011894705097, "learning_rate": 1.7963009495786262e-05, "loss": 0.274, "step": 9160 }, { "epoch": 0.74, "grad_norm": 7.730637018917763, "learning_rate": 1.795223215295694e-05, "loss": 0.2476, "step": 9180 }, { "epoch": 0.74, "grad_norm": 5.253387992852078, "learning_rate": 1.7941429626306597e-05, "loss": 0.2557, "step": 9200 }, { "epoch": 0.74, "grad_norm": 6.185451592355014, "learning_rate": 1.7930601950049918e-05, "loss": 0.2414, "step": 9220 }, { "epoch": 0.75, "grad_norm": 8.613330410148825, "learning_rate": 1.7919749158481238e-05, "loss": 0.252, "step": 9240 }, { "epoch": 0.75, "grad_norm": 4.082779957130279, "learning_rate": 1.7908871285974452e-05, "loss": 0.246, "step": 9260 }, { "epoch": 0.75, "grad_norm": 5.080789002249157, "learning_rate": 1.789796836698288e-05, "loss": 0.2241, "step": 9280 }, { "epoch": 0.75, "grad_norm": 5.616004872409631, "learning_rate": 1.788704043603918e-05, "loss": 0.2635, "step": 9300 }, { "epoch": 0.75, "grad_norm": 4.6896605535132005, "learning_rate": 1.787608752775523e-05, "loss": 0.2496, "step": 9320 }, { "epoch": 0.75, "grad_norm": 6.020003669712304, "learning_rate": 1.786510967682201e-05, "loss": 0.2742, "step": 9340 }, { "epoch": 0.76, "grad_norm": 4.869330214670387, "learning_rate": 1.7854106918009516e-05, "loss": 0.2554, "step": 9360 }, { "epoch": 0.76, "grad_norm": 5.463125770044224, "learning_rate": 1.7843079286166613e-05, "loss": 0.256, "step": 9380 }, { "epoch": 0.76, "grad_norm": 12.859151326084799, "learning_rate": 1.7832026816220964e-05, "loss": 0.3044, "step": 9400 }, { "epoch": 0.76, "grad_norm": 7.462079888408213, "learning_rate": 1.7820949543178893e-05, "loss": 0.2603, "step": 9420 }, { "epoch": 0.76, "grad_norm": 6.251675190537996, "learning_rate": 1.7809847502125287e-05, "loss": 0.2524, "step": 9440 }, { "epoch": 0.76, "grad_norm": 20.20686096910179, "learning_rate": 1.779872072822348e-05, "loss": 0.2727, "step": 9460 }, { "epoch": 0.77, "grad_norm": 11.117280832355938, "learning_rate": 1.7787569256715128e-05, "loss": 0.2751, "step": 9480 }, { "epoch": 0.77, "grad_norm": 6.174365967852932, "learning_rate": 1.7776393122920136e-05, "loss": 0.2465, "step": 9500 }, { "epoch": 0.77, "grad_norm": 6.5845686642808205, "learning_rate": 1.7765192362236505e-05, "loss": 0.2637, "step": 9520 }, { "epoch": 0.77, "grad_norm": 9.227894944405277, "learning_rate": 1.775396701014024e-05, "loss": 0.2594, "step": 9540 }, { "epoch": 0.77, "grad_norm": 6.0294211980015255, "learning_rate": 1.7742717102185233e-05, "loss": 0.2506, "step": 9560 }, { "epoch": 0.77, "grad_norm": 6.611585459356701, "learning_rate": 1.7731442674003153e-05, "loss": 0.256, "step": 9580 }, { "epoch": 0.77, "grad_norm": 6.474013099428535, "learning_rate": 1.772014376130333e-05, "loss": 0.2509, "step": 9600 }, { "epoch": 0.78, "grad_norm": 4.050917561517386, "learning_rate": 1.7708820399872644e-05, "loss": 0.2597, "step": 9620 }, { "epoch": 0.78, "grad_norm": 7.523512541811629, "learning_rate": 1.7697472625575415e-05, "loss": 0.2617, "step": 9640 }, { "epoch": 0.78, "grad_norm": 4.674855993980255, "learning_rate": 1.768610047435328e-05, "loss": 0.2148, "step": 9660 }, { "epoch": 0.78, "grad_norm": 3.581193699152847, "learning_rate": 1.7674703982225084e-05, "loss": 0.2485, "step": 9680 }, { "epoch": 0.78, "grad_norm": 5.995347444394187, "learning_rate": 1.7663283185286778e-05, "loss": 0.2504, "step": 9700 }, { "epoch": 0.78, "grad_norm": 6.106039165812286, "learning_rate": 1.7651838119711278e-05, "loss": 0.2591, "step": 9720 }, { "epoch": 0.79, "grad_norm": 5.544368037680747, "learning_rate": 1.7640368821748374e-05, "loss": 0.2589, "step": 9740 }, { "epoch": 0.79, "grad_norm": 11.908781488384356, "learning_rate": 1.7628875327724604e-05, "loss": 0.24, "step": 9760 }, { "epoch": 0.79, "grad_norm": 5.2162186199664005, "learning_rate": 1.761735767404314e-05, "loss": 0.279, "step": 9780 }, { "epoch": 0.79, "grad_norm": 8.332009731717408, "learning_rate": 1.760581589718369e-05, "loss": 0.2523, "step": 9800 }, { "epoch": 0.79, "grad_norm": 6.811834460305066, "learning_rate": 1.759425003370234e-05, "loss": 0.2422, "step": 9820 }, { "epoch": 0.79, "grad_norm": 10.001650864708848, "learning_rate": 1.758266012023149e-05, "loss": 0.2415, "step": 9840 }, { "epoch": 0.8, "grad_norm": 14.181135321229519, "learning_rate": 1.7571046193479697e-05, "loss": 0.2439, "step": 9860 }, { "epoch": 0.8, "grad_norm": 5.304371617930666, "learning_rate": 1.7559408290231582e-05, "loss": 0.2883, "step": 9880 }, { "epoch": 0.8, "grad_norm": 10.159891549680514, "learning_rate": 1.754774644734771e-05, "loss": 0.2402, "step": 9900 }, { "epoch": 0.8, "grad_norm": 21.596871665189294, "learning_rate": 1.753606070176446e-05, "loss": 0.2646, "step": 9920 }, { "epoch": 0.8, "grad_norm": 3.6266946448855064, "learning_rate": 1.752435109049392e-05, "loss": 0.2463, "step": 9940 }, { "epoch": 0.8, "grad_norm": 7.461139967802549, "learning_rate": 1.7512617650623776e-05, "loss": 0.2343, "step": 9960 }, { "epoch": 0.81, "grad_norm": 5.8844648373593955, "learning_rate": 1.7500860419317183e-05, "loss": 0.251, "step": 9980 }, { "epoch": 0.81, "grad_norm": 9.038354738793856, "learning_rate": 1.7489079433812638e-05, "loss": 0.2494, "step": 10000 }, { "epoch": 0.81, "grad_norm": 8.591404154257724, "learning_rate": 1.7477274731423892e-05, "loss": 0.2374, "step": 10020 }, { "epoch": 0.81, "grad_norm": 5.9870710947999815, "learning_rate": 1.7465446349539797e-05, "loss": 0.2206, "step": 10040 }, { "epoch": 0.81, "grad_norm": 6.228813578147013, "learning_rate": 1.7453594325624224e-05, "loss": 0.2462, "step": 10060 }, { "epoch": 0.81, "grad_norm": 5.257017078287017, "learning_rate": 1.7441718697215904e-05, "loss": 0.2409, "step": 10080 }, { "epoch": 0.82, "grad_norm": 6.952956019716318, "learning_rate": 1.742981950192835e-05, "loss": 0.2521, "step": 10100 }, { "epoch": 0.82, "grad_norm": 5.5548892299756805, "learning_rate": 1.7417896777449706e-05, "loss": 0.2647, "step": 10120 }, { "epoch": 0.82, "grad_norm": 5.73273030739662, "learning_rate": 1.7405950561542636e-05, "loss": 0.2473, "step": 10140 }, { "epoch": 0.82, "grad_norm": 5.8226292447674775, "learning_rate": 1.7393980892044222e-05, "loss": 0.2799, "step": 10160 }, { "epoch": 0.82, "grad_norm": 6.573153903103647, "learning_rate": 1.738198780686582e-05, "loss": 0.2391, "step": 10180 }, { "epoch": 0.82, "grad_norm": 6.2081294015592094, "learning_rate": 1.7369971343992953e-05, "loss": 0.2441, "step": 10200 }, { "epoch": 0.82, "grad_norm": 7.239395541675969, "learning_rate": 1.735793154148519e-05, "loss": 0.2467, "step": 10220 }, { "epoch": 0.83, "grad_norm": 6.574019720880623, "learning_rate": 1.7345868437476016e-05, "loss": 0.2742, "step": 10240 }, { "epoch": 0.83, "grad_norm": 3.932079883792344, "learning_rate": 1.733378207017273e-05, "loss": 0.2799, "step": 10260 }, { "epoch": 0.83, "grad_norm": 7.965596611059161, "learning_rate": 1.7321672477856297e-05, "loss": 0.268, "step": 10280 }, { "epoch": 0.83, "grad_norm": 6.637332593742831, "learning_rate": 1.730953969888126e-05, "loss": 0.281, "step": 10300 }, { "epoch": 0.83, "grad_norm": 4.598400020154981, "learning_rate": 1.729738377167559e-05, "loss": 0.2688, "step": 10320 }, { "epoch": 0.83, "grad_norm": 10.008276375495472, "learning_rate": 1.728520473474057e-05, "loss": 0.2424, "step": 10340 }, { "epoch": 0.84, "grad_norm": 9.609588968019253, "learning_rate": 1.7273002626650693e-05, "loss": 0.2562, "step": 10360 }, { "epoch": 0.84, "grad_norm": 6.246946580790647, "learning_rate": 1.726077748605352e-05, "loss": 0.2536, "step": 10380 }, { "epoch": 0.84, "grad_norm": 5.207954250527354, "learning_rate": 1.724852935166955e-05, "loss": 0.2803, "step": 10400 }, { "epoch": 0.84, "grad_norm": 6.83554630577102, "learning_rate": 1.723625826229212e-05, "loss": 0.2366, "step": 10420 }, { "epoch": 0.84, "grad_norm": 5.2741649888827, "learning_rate": 1.7223964256787275e-05, "loss": 0.2589, "step": 10440 }, { "epoch": 0.84, "grad_norm": 4.504793580943435, "learning_rate": 1.7211647374093644e-05, "loss": 0.2654, "step": 10460 }, { "epoch": 0.85, "grad_norm": 5.074320615196733, "learning_rate": 1.71993076532223e-05, "loss": 0.2531, "step": 10480 }, { "epoch": 0.85, "grad_norm": 7.4921309833960645, "learning_rate": 1.7186945133256663e-05, "loss": 0.2452, "step": 10500 }, { "epoch": 0.85, "grad_norm": 4.773435701909952, "learning_rate": 1.7174559853352366e-05, "loss": 0.2786, "step": 10520 }, { "epoch": 0.85, "grad_norm": 5.190944401366304, "learning_rate": 1.7162151852737114e-05, "loss": 0.2082, "step": 10540 }, { "epoch": 0.85, "grad_norm": 6.8860794956428215, "learning_rate": 1.7149721170710597e-05, "loss": 0.2593, "step": 10560 }, { "epoch": 0.85, "grad_norm": 5.315969613200098, "learning_rate": 1.7137267846644324e-05, "loss": 0.2451, "step": 10580 }, { "epoch": 0.86, "grad_norm": 8.924983723943493, "learning_rate": 1.712479191998153e-05, "loss": 0.2487, "step": 10600 }, { "epoch": 0.86, "grad_norm": 4.785603454868163, "learning_rate": 1.711229343023703e-05, "loss": 0.275, "step": 10620 }, { "epoch": 0.86, "grad_norm": 4.5511584473505895, "learning_rate": 1.709977241699711e-05, "loss": 0.2438, "step": 10640 }, { "epoch": 0.86, "grad_norm": 6.601440573023448, "learning_rate": 1.7087228919919395e-05, "loss": 0.2682, "step": 10660 }, { "epoch": 0.86, "grad_norm": 8.06521205975687, "learning_rate": 1.7074662978732713e-05, "loss": 0.2672, "step": 10680 }, { "epoch": 0.86, "grad_norm": 5.877886448612562, "learning_rate": 1.7062074633236992e-05, "loss": 0.2415, "step": 10700 }, { "epoch": 0.87, "grad_norm": 6.00267509589556, "learning_rate": 1.704946392330311e-05, "loss": 0.245, "step": 10720 }, { "epoch": 0.87, "grad_norm": 18.727472632503616, "learning_rate": 1.703683088887278e-05, "loss": 0.2527, "step": 10740 }, { "epoch": 0.87, "grad_norm": 8.42578939933542, "learning_rate": 1.7024175569958435e-05, "loss": 0.2447, "step": 10760 }, { "epoch": 0.87, "grad_norm": 15.871158165018187, "learning_rate": 1.7011498006643075e-05, "loss": 0.2611, "step": 10780 }, { "epoch": 0.87, "grad_norm": 4.623538224443551, "learning_rate": 1.6998798239080167e-05, "loss": 0.2521, "step": 10800 }, { "epoch": 0.87, "grad_norm": 6.908983060916792, "learning_rate": 1.698607630749349e-05, "loss": 0.2298, "step": 10820 }, { "epoch": 0.88, "grad_norm": 6.502465294111384, "learning_rate": 1.6973332252177036e-05, "loss": 0.2498, "step": 10840 }, { "epoch": 0.88, "grad_norm": 4.978479228853818, "learning_rate": 1.6960566113494865e-05, "loss": 0.252, "step": 10860 }, { "epoch": 0.88, "grad_norm": 5.650381173298351, "learning_rate": 1.694777793188098e-05, "loss": 0.2288, "step": 10880 }, { "epoch": 0.88, "grad_norm": 7.073746360539243, "learning_rate": 1.6934967747839202e-05, "loss": 0.2519, "step": 10900 }, { "epoch": 0.88, "grad_norm": 5.927901369661737, "learning_rate": 1.6922135601943037e-05, "loss": 0.265, "step": 10920 }, { "epoch": 0.88, "grad_norm": 5.53567758715019, "learning_rate": 1.690928153483555e-05, "loss": 0.25, "step": 10940 }, { "epoch": 0.88, "grad_norm": 7.570944618942586, "learning_rate": 1.6896405587229247e-05, "loss": 0.2549, "step": 10960 }, { "epoch": 0.89, "grad_norm": 7.379565103013804, "learning_rate": 1.6883507799905922e-05, "loss": 0.2363, "step": 10980 }, { "epoch": 0.89, "grad_norm": 9.023229502472875, "learning_rate": 1.6870588213716555e-05, "loss": 0.2832, "step": 11000 }, { "epoch": 0.89, "grad_norm": 5.6792260655491855, "learning_rate": 1.6857646869581153e-05, "loss": 0.228, "step": 11020 }, { "epoch": 0.89, "grad_norm": 7.456793627942026, "learning_rate": 1.6844683808488647e-05, "loss": 0.2494, "step": 11040 }, { "epoch": 0.89, "grad_norm": 4.8011477449229885, "learning_rate": 1.6831699071496758e-05, "loss": 0.2634, "step": 11060 }, { "epoch": 0.89, "grad_norm": 6.58057290965885, "learning_rate": 1.681869269973184e-05, "loss": 0.2577, "step": 11080 }, { "epoch": 0.9, "grad_norm": 5.68008811828603, "learning_rate": 1.68056647343888e-05, "loss": 0.2297, "step": 11100 }, { "epoch": 0.9, "grad_norm": 6.528010244716758, "learning_rate": 1.6792615216730907e-05, "loss": 0.2196, "step": 11120 }, { "epoch": 0.9, "grad_norm": 5.853566456861371, "learning_rate": 1.6779544188089715e-05, "loss": 0.2629, "step": 11140 }, { "epoch": 0.9, "grad_norm": 10.986926893405414, "learning_rate": 1.67664516898649e-05, "loss": 0.2302, "step": 11160 }, { "epoch": 0.9, "grad_norm": 7.730824034913035, "learning_rate": 1.6753337763524137e-05, "loss": 0.2336, "step": 11180 }, { "epoch": 0.9, "grad_norm": 7.922173067463235, "learning_rate": 1.6740202450602976e-05, "loss": 0.2686, "step": 11200 }, { "epoch": 0.91, "grad_norm": 5.406865255814246, "learning_rate": 1.67270457927047e-05, "loss": 0.226, "step": 11220 }, { "epoch": 0.91, "grad_norm": 6.843481049848729, "learning_rate": 1.6713867831500195e-05, "loss": 0.2586, "step": 11240 }, { "epoch": 0.91, "grad_norm": 5.49549924323287, "learning_rate": 1.670066860872783e-05, "loss": 0.2627, "step": 11260 }, { "epoch": 0.91, "grad_norm": 6.183808429627808, "learning_rate": 1.6687448166193306e-05, "loss": 0.2749, "step": 11280 }, { "epoch": 0.91, "grad_norm": 4.378810204329709, "learning_rate": 1.667420654576954e-05, "loss": 0.2558, "step": 11300 }, { "epoch": 0.91, "grad_norm": 6.028002244995752, "learning_rate": 1.666094378939652e-05, "loss": 0.2554, "step": 11320 }, { "epoch": 0.92, "grad_norm": 7.776788987779546, "learning_rate": 1.664765993908118e-05, "loss": 0.2326, "step": 11340 }, { "epoch": 0.92, "grad_norm": 7.503277380435426, "learning_rate": 1.663435503689726e-05, "loss": 0.2707, "step": 11360 }, { "epoch": 0.92, "grad_norm": 6.303861845235693, "learning_rate": 1.6621029124985195e-05, "loss": 0.2435, "step": 11380 }, { "epoch": 0.92, "grad_norm": 7.213728574312154, "learning_rate": 1.6607682245551935e-05, "loss": 0.2514, "step": 11400 }, { "epoch": 0.92, "grad_norm": 5.2552293437415525, "learning_rate": 1.6594314440870864e-05, "loss": 0.2397, "step": 11420 }, { "epoch": 0.92, "grad_norm": 6.538249814157013, "learning_rate": 1.6580925753281634e-05, "loss": 0.2655, "step": 11440 }, { "epoch": 0.93, "grad_norm": 5.2378821622768905, "learning_rate": 1.6567516225190035e-05, "loss": 0.2607, "step": 11460 }, { "epoch": 0.93, "grad_norm": 5.674850314010563, "learning_rate": 1.655408589906787e-05, "loss": 0.2723, "step": 11480 }, { "epoch": 0.93, "grad_norm": 7.192949169932349, "learning_rate": 1.654063481745281e-05, "loss": 0.2561, "step": 11500 }, { "epoch": 0.93, "grad_norm": 13.135993930717675, "learning_rate": 1.652716302294828e-05, "loss": 0.2382, "step": 11520 }, { "epoch": 0.93, "grad_norm": 4.887607996691356, "learning_rate": 1.651367055822329e-05, "loss": 0.2863, "step": 11540 }, { "epoch": 0.93, "grad_norm": 7.367579978609729, "learning_rate": 1.6500157466012324e-05, "loss": 0.2379, "step": 11560 }, { "epoch": 0.93, "grad_norm": 8.199270857981157, "learning_rate": 1.6486623789115205e-05, "loss": 0.2432, "step": 11580 }, { "epoch": 0.94, "grad_norm": 6.243091274334211, "learning_rate": 1.6473069570396942e-05, "loss": 0.2635, "step": 11600 }, { "epoch": 0.94, "grad_norm": 5.6352137765892545, "learning_rate": 1.6459494852787622e-05, "loss": 0.2292, "step": 11620 }, { "epoch": 0.94, "grad_norm": 5.2104929401235305, "learning_rate": 1.6445899679282248e-05, "loss": 0.2545, "step": 11640 }, { "epoch": 0.94, "grad_norm": 5.635847694521193, "learning_rate": 1.6432284092940607e-05, "loss": 0.247, "step": 11660 }, { "epoch": 0.94, "grad_norm": 5.853851889115171, "learning_rate": 1.6418648136887152e-05, "loss": 0.2323, "step": 11680 }, { "epoch": 0.94, "grad_norm": 4.98208977143132, "learning_rate": 1.6404991854310846e-05, "loss": 0.238, "step": 11700 }, { "epoch": 0.95, "grad_norm": 5.560280174770714, "learning_rate": 1.6391315288465027e-05, "loss": 0.2589, "step": 11720 }, { "epoch": 0.95, "grad_norm": 11.332988584174231, "learning_rate": 1.637761848266729e-05, "loss": 0.2437, "step": 11740 }, { "epoch": 0.95, "grad_norm": 13.079688339953384, "learning_rate": 1.6363901480299323e-05, "loss": 0.2489, "step": 11760 }, { "epoch": 0.95, "grad_norm": 6.852537601204953, "learning_rate": 1.6350164324806787e-05, "loss": 0.218, "step": 11780 }, { "epoch": 0.95, "grad_norm": 6.384240727219325, "learning_rate": 1.633640705969917e-05, "loss": 0.2419, "step": 11800 }, { "epoch": 0.95, "grad_norm": 4.348764283501352, "learning_rate": 1.6322629728549665e-05, "loss": 0.2037, "step": 11820 }, { "epoch": 0.96, "grad_norm": 5.096264739138052, "learning_rate": 1.6308832374994997e-05, "loss": 0.2502, "step": 11840 }, { "epoch": 0.96, "grad_norm": 4.471177088927129, "learning_rate": 1.6295015042735336e-05, "loss": 0.2435, "step": 11860 }, { "epoch": 0.96, "grad_norm": 7.886308089698534, "learning_rate": 1.6281177775534106e-05, "loss": 0.2367, "step": 11880 }, { "epoch": 0.96, "grad_norm": 5.0872043608074335, "learning_rate": 1.6267320617217886e-05, "loss": 0.2618, "step": 11900 }, { "epoch": 0.96, "grad_norm": 7.332403239597943, "learning_rate": 1.6253443611676247e-05, "loss": 0.2377, "step": 11920 }, { "epoch": 0.96, "grad_norm": 5.2156408493688, "learning_rate": 1.6239546802861628e-05, "loss": 0.2588, "step": 11940 }, { "epoch": 0.97, "grad_norm": 14.389605988283588, "learning_rate": 1.6225630234789186e-05, "loss": 0.2359, "step": 11960 }, { "epoch": 0.97, "grad_norm": 6.61108607154756, "learning_rate": 1.621169395153666e-05, "loss": 0.2454, "step": 11980 }, { "epoch": 0.97, "grad_norm": 5.92623925749379, "learning_rate": 1.6197737997244242e-05, "loss": 0.2504, "step": 12000 }, { "epoch": 0.97, "grad_norm": 6.729876438497323, "learning_rate": 1.6183762416114417e-05, "loss": 0.231, "step": 12020 }, { "epoch": 0.97, "grad_norm": 4.91119912664639, "learning_rate": 1.6169767252411843e-05, "loss": 0.2732, "step": 12040 }, { "epoch": 0.97, "grad_norm": 7.372474108547359, "learning_rate": 1.615575255046319e-05, "loss": 0.2396, "step": 12060 }, { "epoch": 0.98, "grad_norm": 4.844310112839635, "learning_rate": 1.6141718354657023e-05, "loss": 0.2682, "step": 12080 }, { "epoch": 0.98, "grad_norm": 7.827541428550464, "learning_rate": 1.6127664709443642e-05, "loss": 0.2351, "step": 12100 }, { "epoch": 0.98, "grad_norm": 6.394194783450918, "learning_rate": 1.6113591659334952e-05, "loss": 0.277, "step": 12120 }, { "epoch": 0.98, "grad_norm": 6.728544539125102, "learning_rate": 1.609949924890432e-05, "loss": 0.2517, "step": 12140 }, { "epoch": 0.98, "grad_norm": 4.095514979882195, "learning_rate": 1.6085387522786432e-05, "loss": 0.2317, "step": 12160 }, { "epoch": 0.98, "grad_norm": 6.899190893971197, "learning_rate": 1.6071256525677144e-05, "loss": 0.239, "step": 12180 }, { "epoch": 0.98, "grad_norm": 5.002813882583922, "learning_rate": 1.6057106302333366e-05, "loss": 0.2411, "step": 12200 }, { "epoch": 0.99, "grad_norm": 6.7562128367712, "learning_rate": 1.6042936897572883e-05, "loss": 0.2347, "step": 12220 }, { "epoch": 0.99, "grad_norm": 9.896004658604653, "learning_rate": 1.6028748356274247e-05, "loss": 0.2526, "step": 12240 }, { "epoch": 0.99, "grad_norm": 7.972800268940516, "learning_rate": 1.6014540723376623e-05, "loss": 0.2505, "step": 12260 }, { "epoch": 0.99, "grad_norm": 5.170343546862058, "learning_rate": 1.600031404387963e-05, "loss": 0.2478, "step": 12280 }, { "epoch": 0.99, "grad_norm": 6.356344714814083, "learning_rate": 1.5986068362843224e-05, "loss": 0.2767, "step": 12300 }, { "epoch": 0.99, "grad_norm": 6.20794198597022, "learning_rate": 1.5971803725387544e-05, "loss": 0.2533, "step": 12320 }, { "epoch": 1.0, "grad_norm": 7.368279449995274, "learning_rate": 1.5957520176692766e-05, "loss": 0.2706, "step": 12340 }, { "epoch": 1.0, "grad_norm": 9.218421438795374, "learning_rate": 1.594321776199896e-05, "loss": 0.2447, "step": 12360 }, { "epoch": 1.0, "grad_norm": 5.4653346268657845, "learning_rate": 1.592889652660596e-05, "loss": 0.2339, "step": 12380 }, { "epoch": 1.0, "grad_norm": 6.741041667370887, "learning_rate": 1.5914556515873197e-05, "loss": 0.1749, "step": 12400 }, { "epoch": 1.0, "grad_norm": 4.207049838195936, "learning_rate": 1.590019777521959e-05, "loss": 0.1849, "step": 12420 }, { "epoch": 1.0, "grad_norm": 7.1496607666636285, "learning_rate": 1.588582035012336e-05, "loss": 0.1743, "step": 12440 }, { "epoch": 1.01, "grad_norm": 7.5265979882421865, "learning_rate": 1.587142428612191e-05, "loss": 0.1868, "step": 12460 }, { "epoch": 1.01, "grad_norm": 5.651063343012383, "learning_rate": 1.5857009628811692e-05, "loss": 0.1983, "step": 12480 }, { "epoch": 1.01, "grad_norm": 9.202976607727676, "learning_rate": 1.5842576423848034e-05, "loss": 0.1917, "step": 12500 } ], "logging_steps": 20, "max_steps": 37164, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 12, "trial_name": null, "trial_params": null }