{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.22983058457141942, "eval_steps": 500, "global_step": 258000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 227.0, "learning_rate": 4.4539461963299484e-07, "loss": 3.0081, "step": 100 }, { "epoch": 0.0, "grad_norm": 28.0, "learning_rate": 8.907892392659897e-07, "loss": 3.5571, "step": 200 }, { "epoch": 0.0, "grad_norm": 482.0, "learning_rate": 1.3361838588989846e-06, "loss": 2.7953, "step": 300 }, { "epoch": 0.0, "grad_norm": 82.0, "learning_rate": 1.7815784785319793e-06, "loss": 2.6242, "step": 400 }, { "epoch": 0.0, "grad_norm": 137.0, "learning_rate": 2.226973098164974e-06, "loss": 1.8696, "step": 500 }, { "epoch": 0.0, "grad_norm": 60.25, "learning_rate": 2.672367717797969e-06, "loss": 1.5141, "step": 600 }, { "epoch": 0.0, "grad_norm": 494.0, "learning_rate": 3.1177623374309637e-06, "loss": 1.5096, "step": 700 }, { "epoch": 0.0, "grad_norm": 27.625, "learning_rate": 3.5631569570639587e-06, "loss": 1.5056, "step": 800 }, { "epoch": 0.0, "grad_norm": 12.9375, "learning_rate": 4.008551576696954e-06, "loss": 1.3766, "step": 900 }, { "epoch": 0.0, "grad_norm": 56.75, "learning_rate": 4.453946196329948e-06, "loss": 1.2224, "step": 1000 }, { "epoch": 0.0, "grad_norm": 17.125, "learning_rate": 4.899340815962943e-06, "loss": 1.4877, "step": 1100 }, { "epoch": 0.0, "grad_norm": 24.125, "learning_rate": 5.344735435595938e-06, "loss": 1.4661, "step": 1200 }, { "epoch": 0.0, "grad_norm": 55.75, "learning_rate": 5.790130055228933e-06, "loss": 1.4322, "step": 1300 }, { "epoch": 0.0, "grad_norm": 54.25, "learning_rate": 6.235524674861927e-06, "loss": 1.6576, "step": 1400 }, { "epoch": 0.0, "grad_norm": 106.0, "learning_rate": 6.680919294494922e-06, "loss": 1.2569, "step": 1500 }, { "epoch": 0.0, "grad_norm": 49.0, "learning_rate": 7.126313914127917e-06, "loss": 1.3449, "step": 1600 }, { "epoch": 0.0, "grad_norm": 41.75, "learning_rate": 7.571708533760913e-06, "loss": 1.3496, "step": 1700 }, { "epoch": 0.0, "grad_norm": 126.5, "learning_rate": 8.017103153393907e-06, "loss": 1.4364, "step": 1800 }, { "epoch": 0.0, "grad_norm": 23.25, "learning_rate": 8.462497773026902e-06, "loss": 1.4653, "step": 1900 }, { "epoch": 0.0, "grad_norm": 81.0, "learning_rate": 8.907892392659896e-06, "loss": 1.3839, "step": 2000 }, { "epoch": 0.0, "grad_norm": 39.25, "learning_rate": 9.353287012292893e-06, "loss": 1.3782, "step": 2100 }, { "epoch": 0.0, "grad_norm": 30.5, "learning_rate": 9.798681631925886e-06, "loss": 1.3509, "step": 2200 }, { "epoch": 0.0, "grad_norm": 77.5, "learning_rate": 1.0244076251558882e-05, "loss": 1.3643, "step": 2300 }, { "epoch": 0.0, "grad_norm": 46.75, "learning_rate": 1.0689470871191876e-05, "loss": 1.3291, "step": 2400 }, { "epoch": 0.0, "grad_norm": 32.5, "learning_rate": 1.1134865490824871e-05, "loss": 1.2246, "step": 2500 }, { "epoch": 0.0, "grad_norm": 40.25, "learning_rate": 1.1580260110457866e-05, "loss": 1.5088, "step": 2600 }, { "epoch": 0.0, "grad_norm": 24.125, "learning_rate": 1.2025654730090862e-05, "loss": 1.3889, "step": 2700 }, { "epoch": 0.0, "grad_norm": 88.0, "learning_rate": 1.2471049349723855e-05, "loss": 1.5065, "step": 2800 }, { "epoch": 0.0, "grad_norm": 30.875, "learning_rate": 1.2916443969356851e-05, "loss": 1.5287, "step": 2900 }, { "epoch": 0.0, "grad_norm": 23.0, "learning_rate": 1.3361838588989844e-05, "loss": 1.4401, "step": 3000 }, { "epoch": 0.0, "grad_norm": 61.5, "learning_rate": 1.3807233208622842e-05, "loss": 1.306, "step": 3100 }, { "epoch": 0.0, "grad_norm": 248.0, "learning_rate": 1.4252627828255835e-05, "loss": 1.5073, "step": 3200 }, { "epoch": 0.0, "grad_norm": 60.25, "learning_rate": 1.469802244788883e-05, "loss": 1.2386, "step": 3300 }, { "epoch": 0.0, "grad_norm": 58.0, "learning_rate": 1.5143417067521826e-05, "loss": 1.5033, "step": 3400 }, { "epoch": 0.0, "grad_norm": 143.0, "learning_rate": 1.558881168715482e-05, "loss": 1.3654, "step": 3500 }, { "epoch": 0.0, "grad_norm": 37.25, "learning_rate": 1.6034206306787815e-05, "loss": 1.483, "step": 3600 }, { "epoch": 0.0, "grad_norm": 134.0, "learning_rate": 1.647960092642081e-05, "loss": 1.1818, "step": 3700 }, { "epoch": 0.0, "grad_norm": 80.5, "learning_rate": 1.6924995546053804e-05, "loss": 1.335, "step": 3800 }, { "epoch": 0.0, "grad_norm": 120.0, "learning_rate": 1.7370390165686802e-05, "loss": 1.3798, "step": 3900 }, { "epoch": 0.0, "grad_norm": 20.25, "learning_rate": 1.7815784785319793e-05, "loss": 1.3843, "step": 4000 }, { "epoch": 0.0, "grad_norm": 41.25, "learning_rate": 1.8261179404952788e-05, "loss": 1.2019, "step": 4100 }, { "epoch": 0.0, "grad_norm": 115.5, "learning_rate": 1.8706574024585786e-05, "loss": 1.3815, "step": 4200 }, { "epoch": 0.0, "grad_norm": 43.75, "learning_rate": 1.915196864421878e-05, "loss": 1.3856, "step": 4300 }, { "epoch": 0.0, "grad_norm": 14.375, "learning_rate": 1.959736326385177e-05, "loss": 1.3999, "step": 4400 }, { "epoch": 0.0, "grad_norm": 59.5, "learning_rate": 2.004275788348477e-05, "loss": 1.5308, "step": 4500 }, { "epoch": 0.0, "grad_norm": 64.5, "learning_rate": 2.0488152503117764e-05, "loss": 1.4645, "step": 4600 }, { "epoch": 0.0, "grad_norm": 57.0, "learning_rate": 2.093354712275076e-05, "loss": 1.3967, "step": 4700 }, { "epoch": 0.0, "grad_norm": 253.0, "learning_rate": 2.1378941742383753e-05, "loss": 1.3877, "step": 4800 }, { "epoch": 0.0, "grad_norm": 128.0, "learning_rate": 2.1824336362016748e-05, "loss": 1.2397, "step": 4900 }, { "epoch": 0.0, "grad_norm": 36.25, "learning_rate": 2.2269730981649742e-05, "loss": 1.4018, "step": 5000 }, { "epoch": 0.0, "grad_norm": 0.263671875, "learning_rate": 2.271512560128274e-05, "loss": 1.4637, "step": 5100 }, { "epoch": 0.0, "grad_norm": 28.375, "learning_rate": 2.316052022091573e-05, "loss": 1.4674, "step": 5200 }, { "epoch": 0.0, "grad_norm": 0.1279296875, "learning_rate": 2.3605914840548726e-05, "loss": 1.1861, "step": 5300 }, { "epoch": 0.0, "grad_norm": 52.25, "learning_rate": 2.4051309460181724e-05, "loss": 1.3685, "step": 5400 }, { "epoch": 0.0, "grad_norm": 83.0, "learning_rate": 2.449670407981472e-05, "loss": 1.5418, "step": 5500 }, { "epoch": 0.0, "grad_norm": 22.625, "learning_rate": 2.494209869944771e-05, "loss": 1.3366, "step": 5600 }, { "epoch": 0.01, "grad_norm": 41.25, "learning_rate": 2.5387493319080707e-05, "loss": 1.4138, "step": 5700 }, { "epoch": 0.01, "grad_norm": 38.25, "learning_rate": 2.5832887938713702e-05, "loss": 1.5245, "step": 5800 }, { "epoch": 0.01, "grad_norm": 32.75, "learning_rate": 2.6278282558346697e-05, "loss": 1.6365, "step": 5900 }, { "epoch": 0.01, "grad_norm": 27.0, "learning_rate": 2.6723677177979688e-05, "loss": 1.4085, "step": 6000 }, { "epoch": 0.01, "grad_norm": 51.25, "learning_rate": 2.716907179761269e-05, "loss": 1.4528, "step": 6100 }, { "epoch": 0.01, "grad_norm": 56.75, "learning_rate": 2.7614466417245684e-05, "loss": 1.4637, "step": 6200 }, { "epoch": 0.01, "grad_norm": 23.5, "learning_rate": 2.8059861036878675e-05, "loss": 1.3511, "step": 6300 }, { "epoch": 0.01, "grad_norm": 21.75, "learning_rate": 2.850525565651167e-05, "loss": 1.2612, "step": 6400 }, { "epoch": 0.01, "grad_norm": 37.75, "learning_rate": 2.8950650276144664e-05, "loss": 1.272, "step": 6500 }, { "epoch": 0.01, "grad_norm": 34.5, "learning_rate": 2.939604489577766e-05, "loss": 1.2663, "step": 6600 }, { "epoch": 0.01, "grad_norm": 81.0, "learning_rate": 2.9841439515410657e-05, "loss": 1.2861, "step": 6700 }, { "epoch": 0.01, "grad_norm": 32.25, "learning_rate": 3.028683413504365e-05, "loss": 1.3541, "step": 6800 }, { "epoch": 0.01, "grad_norm": 72.0, "learning_rate": 3.073222875467664e-05, "loss": 1.3616, "step": 6900 }, { "epoch": 0.01, "grad_norm": 25.5, "learning_rate": 3.117762337430964e-05, "loss": 1.4044, "step": 7000 }, { "epoch": 0.01, "grad_norm": 89.5, "learning_rate": 3.162301799394263e-05, "loss": 1.4566, "step": 7100 }, { "epoch": 0.01, "grad_norm": 94.5, "learning_rate": 3.206841261357563e-05, "loss": 1.5359, "step": 7200 }, { "epoch": 0.01, "grad_norm": 14.125, "learning_rate": 3.251380723320863e-05, "loss": 1.282, "step": 7300 }, { "epoch": 0.01, "grad_norm": 110.0, "learning_rate": 3.295920185284162e-05, "loss": 1.4597, "step": 7400 }, { "epoch": 0.01, "grad_norm": 67.0, "learning_rate": 3.3404596472474617e-05, "loss": 1.3946, "step": 7500 }, { "epoch": 0.01, "grad_norm": 173.0, "learning_rate": 3.384999109210761e-05, "loss": 1.4038, "step": 7600 }, { "epoch": 0.01, "grad_norm": 8.875, "learning_rate": 3.42953857117406e-05, "loss": 1.2368, "step": 7700 }, { "epoch": 0.01, "grad_norm": 48.5, "learning_rate": 3.4740780331373604e-05, "loss": 1.4606, "step": 7800 }, { "epoch": 0.01, "grad_norm": 45.5, "learning_rate": 3.5186174951006595e-05, "loss": 1.3372, "step": 7900 }, { "epoch": 0.01, "grad_norm": 50.5, "learning_rate": 3.5631569570639586e-05, "loss": 1.3796, "step": 8000 }, { "epoch": 0.01, "grad_norm": 19.5, "learning_rate": 3.6076964190272584e-05, "loss": 1.2696, "step": 8100 }, { "epoch": 0.01, "grad_norm": 73.0, "learning_rate": 3.6522358809905575e-05, "loss": 1.2903, "step": 8200 }, { "epoch": 0.01, "grad_norm": 164.0, "learning_rate": 3.696775342953857e-05, "loss": 1.3901, "step": 8300 }, { "epoch": 0.01, "grad_norm": 77.5, "learning_rate": 3.741314804917157e-05, "loss": 1.293, "step": 8400 }, { "epoch": 0.01, "grad_norm": 14.9375, "learning_rate": 3.785854266880456e-05, "loss": 1.4207, "step": 8500 }, { "epoch": 0.01, "grad_norm": 102.5, "learning_rate": 3.830393728843756e-05, "loss": 1.4051, "step": 8600 }, { "epoch": 0.01, "grad_norm": 51.75, "learning_rate": 3.874933190807055e-05, "loss": 1.2428, "step": 8700 }, { "epoch": 0.01, "grad_norm": 37.5, "learning_rate": 3.919472652770354e-05, "loss": 1.3791, "step": 8800 }, { "epoch": 0.01, "grad_norm": 46.25, "learning_rate": 3.964012114733654e-05, "loss": 1.4908, "step": 8900 }, { "epoch": 0.01, "grad_norm": 82.0, "learning_rate": 4.008551576696954e-05, "loss": 1.3, "step": 9000 }, { "epoch": 0.01, "grad_norm": 33.5, "learning_rate": 4.0530910386602536e-05, "loss": 1.4543, "step": 9100 }, { "epoch": 0.01, "grad_norm": 73.0, "learning_rate": 4.097630500623553e-05, "loss": 1.467, "step": 9200 }, { "epoch": 0.01, "grad_norm": 28.125, "learning_rate": 4.142169962586852e-05, "loss": 1.4087, "step": 9300 }, { "epoch": 0.01, "grad_norm": 1656.0, "learning_rate": 4.186709424550152e-05, "loss": 1.2681, "step": 9400 }, { "epoch": 0.01, "grad_norm": 50.25, "learning_rate": 4.231248886513451e-05, "loss": 1.2393, "step": 9500 }, { "epoch": 0.01, "grad_norm": 71.5, "learning_rate": 4.2757883484767506e-05, "loss": 1.3523, "step": 9600 }, { "epoch": 0.01, "grad_norm": 89.5, "learning_rate": 4.3203278104400504e-05, "loss": 1.5014, "step": 9700 }, { "epoch": 0.01, "grad_norm": 20.625, "learning_rate": 4.3648672724033495e-05, "loss": 1.5627, "step": 9800 }, { "epoch": 0.01, "grad_norm": 105.5, "learning_rate": 4.409406734366649e-05, "loss": 1.3037, "step": 9900 }, { "epoch": 0.01, "grad_norm": 42.75, "learning_rate": 4.4539461963299484e-05, "loss": 0.9767, "step": 10000 }, { "epoch": 0.01, "grad_norm": 88.0, "learning_rate": 4.498485658293248e-05, "loss": 1.3604, "step": 10100 }, { "epoch": 0.01, "grad_norm": 47.5, "learning_rate": 4.543025120256548e-05, "loss": 1.3567, "step": 10200 }, { "epoch": 0.01, "grad_norm": 108.0, "learning_rate": 4.587564582219847e-05, "loss": 1.3405, "step": 10300 }, { "epoch": 0.01, "grad_norm": 16.875, "learning_rate": 4.632104044183146e-05, "loss": 1.2921, "step": 10400 }, { "epoch": 0.01, "grad_norm": 102.0, "learning_rate": 4.676643506146446e-05, "loss": 1.5022, "step": 10500 }, { "epoch": 0.01, "grad_norm": 36.0, "learning_rate": 4.721182968109745e-05, "loss": 1.2794, "step": 10600 }, { "epoch": 0.01, "grad_norm": 107.0, "learning_rate": 4.765722430073045e-05, "loss": 1.1861, "step": 10700 }, { "epoch": 0.01, "grad_norm": 50.0, "learning_rate": 4.810261892036345e-05, "loss": 1.388, "step": 10800 }, { "epoch": 0.01, "grad_norm": 194.0, "learning_rate": 4.854801353999644e-05, "loss": 1.6325, "step": 10900 }, { "epoch": 0.01, "grad_norm": 10.75, "learning_rate": 4.899340815962944e-05, "loss": 1.3704, "step": 11000 }, { "epoch": 0.01, "grad_norm": 20.0, "learning_rate": 4.943880277926243e-05, "loss": 1.4644, "step": 11100 }, { "epoch": 0.01, "grad_norm": 94.5, "learning_rate": 4.988419739889542e-05, "loss": 1.4386, "step": 11200 }, { "epoch": 0.01, "grad_norm": 2.953125, "learning_rate": 4.999667068583872e-05, "loss": 1.3037, "step": 11300 }, { "epoch": 0.01, "grad_norm": 67.5, "learning_rate": 4.99921716126478e-05, "loss": 1.2846, "step": 11400 }, { "epoch": 0.01, "grad_norm": 122.0, "learning_rate": 4.998767253945688e-05, "loss": 1.3673, "step": 11500 }, { "epoch": 0.01, "grad_norm": 752.0, "learning_rate": 4.9983173466265954e-05, "loss": 1.5056, "step": 11600 }, { "epoch": 0.01, "grad_norm": 520.0, "learning_rate": 4.997867439307503e-05, "loss": 1.3553, "step": 11700 }, { "epoch": 0.01, "grad_norm": 104.0, "learning_rate": 4.997417531988411e-05, "loss": 1.2198, "step": 11800 }, { "epoch": 0.01, "grad_norm": 45.0, "learning_rate": 4.996967624669318e-05, "loss": 1.1364, "step": 11900 }, { "epoch": 0.01, "grad_norm": 51.75, "learning_rate": 4.9965177173502256e-05, "loss": 1.221, "step": 12000 }, { "epoch": 0.01, "grad_norm": 46.5, "learning_rate": 4.996067810031134e-05, "loss": 1.3872, "step": 12100 }, { "epoch": 0.01, "grad_norm": 258.0, "learning_rate": 4.995617902712041e-05, "loss": 1.3035, "step": 12200 }, { "epoch": 0.01, "grad_norm": 35.5, "learning_rate": 4.9951679953929495e-05, "loss": 1.4527, "step": 12300 }, { "epoch": 0.01, "grad_norm": 21.625, "learning_rate": 4.994718088073857e-05, "loss": 1.2409, "step": 12400 }, { "epoch": 0.01, "grad_norm": 20.625, "learning_rate": 4.9942681807547646e-05, "loss": 1.1766, "step": 12500 }, { "epoch": 0.01, "grad_norm": 75.5, "learning_rate": 4.993818273435673e-05, "loss": 1.4129, "step": 12600 }, { "epoch": 0.01, "grad_norm": 93.0, "learning_rate": 4.9933683661165803e-05, "loss": 1.2777, "step": 12700 }, { "epoch": 0.01, "grad_norm": 109.0, "learning_rate": 4.992918458797488e-05, "loss": 1.2517, "step": 12800 }, { "epoch": 0.01, "grad_norm": 0.0133056640625, "learning_rate": 4.992468551478396e-05, "loss": 1.3276, "step": 12900 }, { "epoch": 0.01, "grad_norm": 25.5, "learning_rate": 4.9920186441593036e-05, "loss": 1.5745, "step": 13000 }, { "epoch": 0.01, "grad_norm": 31.25, "learning_rate": 4.991568736840211e-05, "loss": 1.3824, "step": 13100 }, { "epoch": 0.01, "grad_norm": 73.5, "learning_rate": 4.991118829521119e-05, "loss": 1.3494, "step": 13200 }, { "epoch": 0.01, "grad_norm": 2480.0, "learning_rate": 4.990668922202026e-05, "loss": 1.3827, "step": 13300 }, { "epoch": 0.01, "grad_norm": 57.75, "learning_rate": 4.9902190148829344e-05, "loss": 1.3879, "step": 13400 }, { "epoch": 0.01, "grad_norm": 26.75, "learning_rate": 4.989769107563842e-05, "loss": 1.3366, "step": 13500 }, { "epoch": 0.01, "grad_norm": 40.0, "learning_rate": 4.9893192002447495e-05, "loss": 1.4362, "step": 13600 }, { "epoch": 0.01, "grad_norm": 38.5, "learning_rate": 4.988869292925658e-05, "loss": 1.4034, "step": 13700 }, { "epoch": 0.01, "grad_norm": 54.5, "learning_rate": 4.988419385606565e-05, "loss": 1.2769, "step": 13800 }, { "epoch": 0.01, "grad_norm": 22.875, "learning_rate": 4.987969478287473e-05, "loss": 1.2381, "step": 13900 }, { "epoch": 0.01, "grad_norm": 23.875, "learning_rate": 4.987519570968381e-05, "loss": 1.4296, "step": 14000 }, { "epoch": 0.01, "grad_norm": 37.75, "learning_rate": 4.9870696636492885e-05, "loss": 1.3941, "step": 14100 }, { "epoch": 0.01, "grad_norm": 112.0, "learning_rate": 4.986619756330197e-05, "loss": 1.1393, "step": 14200 }, { "epoch": 0.01, "grad_norm": 600.0, "learning_rate": 4.986169849011104e-05, "loss": 1.4978, "step": 14300 }, { "epoch": 0.01, "grad_norm": 42.25, "learning_rate": 4.985719941692012e-05, "loss": 1.558, "step": 14400 }, { "epoch": 0.01, "grad_norm": 28.125, "learning_rate": 4.9852700343729193e-05, "loss": 1.5553, "step": 14500 }, { "epoch": 0.01, "grad_norm": 20.5, "learning_rate": 4.984820127053827e-05, "loss": 1.1859, "step": 14600 }, { "epoch": 0.01, "grad_norm": 33.75, "learning_rate": 4.9843702197347344e-05, "loss": 1.288, "step": 14700 }, { "epoch": 0.01, "grad_norm": 55.5, "learning_rate": 4.9839203124156426e-05, "loss": 1.3456, "step": 14800 }, { "epoch": 0.01, "grad_norm": 58.75, "learning_rate": 4.98347040509655e-05, "loss": 1.4251, "step": 14900 }, { "epoch": 0.01, "grad_norm": 49.0, "learning_rate": 4.9830204977774584e-05, "loss": 1.5009, "step": 15000 }, { "epoch": 0.01, "grad_norm": 62.5, "learning_rate": 4.982570590458366e-05, "loss": 1.44, "step": 15100 }, { "epoch": 0.01, "grad_norm": 155.0, "learning_rate": 4.9821206831392734e-05, "loss": 1.4204, "step": 15200 }, { "epoch": 0.01, "grad_norm": 37.25, "learning_rate": 4.9816707758201816e-05, "loss": 1.4253, "step": 15300 }, { "epoch": 0.01, "grad_norm": 98.0, "learning_rate": 4.981220868501089e-05, "loss": 1.5352, "step": 15400 }, { "epoch": 0.01, "grad_norm": 92.0, "learning_rate": 4.980770961181997e-05, "loss": 1.46, "step": 15500 }, { "epoch": 0.01, "grad_norm": 51.25, "learning_rate": 4.980321053862905e-05, "loss": 1.2383, "step": 15600 }, { "epoch": 0.01, "grad_norm": 76.5, "learning_rate": 4.9798711465438125e-05, "loss": 1.4453, "step": 15700 }, { "epoch": 0.01, "grad_norm": 115.0, "learning_rate": 4.97942123922472e-05, "loss": 1.2353, "step": 15800 }, { "epoch": 0.01, "grad_norm": 3.3125, "learning_rate": 4.9789713319056275e-05, "loss": 1.4104, "step": 15900 }, { "epoch": 0.01, "grad_norm": 50.5, "learning_rate": 4.978521424586535e-05, "loss": 1.3855, "step": 16000 }, { "epoch": 0.01, "grad_norm": 28.5, "learning_rate": 4.978071517267443e-05, "loss": 1.2479, "step": 16100 }, { "epoch": 0.01, "grad_norm": 73.5, "learning_rate": 4.977621609948351e-05, "loss": 1.5248, "step": 16200 }, { "epoch": 0.01, "grad_norm": 62.25, "learning_rate": 4.977171702629258e-05, "loss": 1.2381, "step": 16300 }, { "epoch": 0.01, "grad_norm": 17.75, "learning_rate": 4.9767217953101666e-05, "loss": 1.1687, "step": 16400 }, { "epoch": 0.01, "grad_norm": 35.75, "learning_rate": 4.976271887991074e-05, "loss": 1.3555, "step": 16500 }, { "epoch": 0.01, "grad_norm": 274.0, "learning_rate": 4.9758219806719816e-05, "loss": 1.3997, "step": 16600 }, { "epoch": 0.01, "grad_norm": 70.5, "learning_rate": 4.97537207335289e-05, "loss": 1.4042, "step": 16700 }, { "epoch": 0.01, "grad_norm": 21.0, "learning_rate": 4.9749221660337974e-05, "loss": 1.462, "step": 16800 }, { "epoch": 0.02, "grad_norm": 2.515625, "learning_rate": 4.974472258714705e-05, "loss": 1.3717, "step": 16900 }, { "epoch": 0.02, "grad_norm": 84.0, "learning_rate": 4.974022351395613e-05, "loss": 1.391, "step": 17000 }, { "epoch": 0.02, "grad_norm": 49.0, "learning_rate": 4.97357244407652e-05, "loss": 1.4007, "step": 17100 }, { "epoch": 0.02, "grad_norm": 29.25, "learning_rate": 4.973122536757428e-05, "loss": 1.217, "step": 17200 }, { "epoch": 0.02, "grad_norm": 46.5, "learning_rate": 4.972672629438336e-05, "loss": 1.2027, "step": 17300 }, { "epoch": 0.02, "grad_norm": 7.9375, "learning_rate": 4.972222722119243e-05, "loss": 1.4868, "step": 17400 }, { "epoch": 0.02, "grad_norm": 35.25, "learning_rate": 4.9717728148001515e-05, "loss": 1.3056, "step": 17500 }, { "epoch": 0.02, "grad_norm": 138.0, "learning_rate": 4.971322907481059e-05, "loss": 1.3636, "step": 17600 }, { "epoch": 0.02, "grad_norm": 77.0, "learning_rate": 4.970873000161967e-05, "loss": 1.5884, "step": 17700 }, { "epoch": 0.02, "grad_norm": 36.25, "learning_rate": 4.970423092842875e-05, "loss": 1.3438, "step": 17800 }, { "epoch": 0.02, "grad_norm": 108.5, "learning_rate": 4.969973185523782e-05, "loss": 1.2762, "step": 17900 }, { "epoch": 0.02, "grad_norm": 38.25, "learning_rate": 4.9695232782046905e-05, "loss": 1.3057, "step": 18000 }, { "epoch": 0.02, "grad_norm": 204.0, "learning_rate": 4.969073370885598e-05, "loss": 1.376, "step": 18100 }, { "epoch": 0.02, "grad_norm": 92.5, "learning_rate": 4.9686234635665055e-05, "loss": 1.2902, "step": 18200 }, { "epoch": 0.02, "grad_norm": 29.5, "learning_rate": 4.968173556247414e-05, "loss": 1.3195, "step": 18300 }, { "epoch": 0.02, "grad_norm": 38.5, "learning_rate": 4.9677236489283206e-05, "loss": 1.3055, "step": 18400 }, { "epoch": 0.02, "grad_norm": 25.5, "learning_rate": 4.967273741609229e-05, "loss": 1.4142, "step": 18500 }, { "epoch": 0.02, "grad_norm": 59.5, "learning_rate": 4.9668238342901364e-05, "loss": 1.4916, "step": 18600 }, { "epoch": 0.02, "grad_norm": 72.0, "learning_rate": 4.966373926971044e-05, "loss": 1.4982, "step": 18700 }, { "epoch": 0.02, "grad_norm": 98.0, "learning_rate": 4.965924019651952e-05, "loss": 1.1261, "step": 18800 }, { "epoch": 0.02, "grad_norm": 33.75, "learning_rate": 4.9654741123328596e-05, "loss": 1.2093, "step": 18900 }, { "epoch": 0.02, "grad_norm": 63.25, "learning_rate": 4.965024205013767e-05, "loss": 1.1638, "step": 19000 }, { "epoch": 0.02, "grad_norm": 29.25, "learning_rate": 4.9645742976946754e-05, "loss": 1.2345, "step": 19100 }, { "epoch": 0.02, "grad_norm": 131.0, "learning_rate": 4.964124390375583e-05, "loss": 1.3946, "step": 19200 }, { "epoch": 0.02, "grad_norm": 42.25, "learning_rate": 4.9636744830564905e-05, "loss": 1.2116, "step": 19300 }, { "epoch": 0.02, "grad_norm": 20.375, "learning_rate": 4.963224575737399e-05, "loss": 1.1863, "step": 19400 }, { "epoch": 0.02, "grad_norm": 32.5, "learning_rate": 4.962774668418306e-05, "loss": 1.4393, "step": 19500 }, { "epoch": 0.02, "grad_norm": 548.0, "learning_rate": 4.962324761099214e-05, "loss": 1.168, "step": 19600 }, { "epoch": 0.02, "grad_norm": 27.25, "learning_rate": 4.961874853780121e-05, "loss": 1.4497, "step": 19700 }, { "epoch": 0.02, "grad_norm": 5.125, "learning_rate": 4.961424946461029e-05, "loss": 1.2966, "step": 19800 }, { "epoch": 0.02, "grad_norm": 77.0, "learning_rate": 4.960975039141937e-05, "loss": 1.2388, "step": 19900 }, { "epoch": 0.02, "grad_norm": 46.25, "learning_rate": 4.9605251318228445e-05, "loss": 1.4248, "step": 20000 }, { "epoch": 0.02, "grad_norm": 56.5, "learning_rate": 4.960075224503752e-05, "loss": 1.2601, "step": 20100 }, { "epoch": 0.02, "grad_norm": 0.56640625, "learning_rate": 4.95962531718466e-05, "loss": 1.3359, "step": 20200 }, { "epoch": 0.02, "grad_norm": 17.25, "learning_rate": 4.959175409865568e-05, "loss": 1.4966, "step": 20300 }, { "epoch": 0.02, "grad_norm": 50.0, "learning_rate": 4.958725502546476e-05, "loss": 1.1853, "step": 20400 }, { "epoch": 0.02, "grad_norm": 35.5, "learning_rate": 4.9582755952273836e-05, "loss": 1.44, "step": 20500 }, { "epoch": 0.02, "grad_norm": 91.0, "learning_rate": 4.957825687908291e-05, "loss": 1.3369, "step": 20600 }, { "epoch": 0.02, "grad_norm": 41.25, "learning_rate": 4.957375780589199e-05, "loss": 1.1995, "step": 20700 }, { "epoch": 0.02, "grad_norm": 38.0, "learning_rate": 4.956925873270107e-05, "loss": 1.187, "step": 20800 }, { "epoch": 0.02, "grad_norm": 19.875, "learning_rate": 4.9564759659510144e-05, "loss": 1.2396, "step": 20900 }, { "epoch": 0.02, "grad_norm": 88.0, "learning_rate": 4.956026058631922e-05, "loss": 1.3147, "step": 21000 }, { "epoch": 0.02, "grad_norm": 2.171875, "learning_rate": 4.9555761513128295e-05, "loss": 1.418, "step": 21100 }, { "epoch": 0.02, "grad_norm": 25.875, "learning_rate": 4.955126243993738e-05, "loss": 1.3389, "step": 21200 }, { "epoch": 0.02, "grad_norm": 79.0, "learning_rate": 4.954676336674645e-05, "loss": 1.5175, "step": 21300 }, { "epoch": 0.02, "grad_norm": 0.265625, "learning_rate": 4.954226429355553e-05, "loss": 1.3892, "step": 21400 }, { "epoch": 0.02, "grad_norm": 199.0, "learning_rate": 4.953776522036461e-05, "loss": 1.2962, "step": 21500 }, { "epoch": 0.02, "grad_norm": 19.25, "learning_rate": 4.9533266147173685e-05, "loss": 1.2883, "step": 21600 }, { "epoch": 0.02, "grad_norm": 25.125, "learning_rate": 4.952876707398276e-05, "loss": 1.3622, "step": 21700 }, { "epoch": 0.02, "grad_norm": 25.5, "learning_rate": 4.952426800079184e-05, "loss": 1.2029, "step": 21800 }, { "epoch": 0.02, "grad_norm": 100.5, "learning_rate": 4.951976892760092e-05, "loss": 1.3844, "step": 21900 }, { "epoch": 0.02, "grad_norm": 78.0, "learning_rate": 4.951526985440999e-05, "loss": 1.4038, "step": 22000 }, { "epoch": 0.02, "grad_norm": 26.0, "learning_rate": 4.9510770781219075e-05, "loss": 1.102, "step": 22100 }, { "epoch": 0.02, "grad_norm": 17.625, "learning_rate": 4.950627170802815e-05, "loss": 1.1101, "step": 22200 }, { "epoch": 0.02, "grad_norm": 23.0, "learning_rate": 4.9501772634837226e-05, "loss": 1.3243, "step": 22300 }, { "epoch": 0.02, "grad_norm": 89.0, "learning_rate": 4.94972735616463e-05, "loss": 1.2503, "step": 22400 }, { "epoch": 0.02, "grad_norm": 35.25, "learning_rate": 4.9492774488455376e-05, "loss": 1.3802, "step": 22500 }, { "epoch": 0.02, "grad_norm": 78.5, "learning_rate": 4.948827541526446e-05, "loss": 1.497, "step": 22600 }, { "epoch": 0.02, "grad_norm": 73.5, "learning_rate": 4.9483776342073534e-05, "loss": 1.3293, "step": 22700 }, { "epoch": 0.02, "grad_norm": 27.25, "learning_rate": 4.947927726888261e-05, "loss": 1.3108, "step": 22800 }, { "epoch": 0.02, "grad_norm": 24.625, "learning_rate": 4.947477819569169e-05, "loss": 1.2249, "step": 22900 }, { "epoch": 0.02, "grad_norm": 63.75, "learning_rate": 4.947027912250077e-05, "loss": 1.1269, "step": 23000 }, { "epoch": 0.02, "grad_norm": 55.0, "learning_rate": 4.946578004930985e-05, "loss": 1.4874, "step": 23100 }, { "epoch": 0.02, "grad_norm": 30.0, "learning_rate": 4.9461280976118924e-05, "loss": 1.2414, "step": 23200 }, { "epoch": 0.02, "grad_norm": 98.0, "learning_rate": 4.9456781902928e-05, "loss": 1.3859, "step": 23300 }, { "epoch": 0.02, "grad_norm": 3.78125, "learning_rate": 4.945228282973708e-05, "loss": 1.3174, "step": 23400 }, { "epoch": 0.02, "grad_norm": 22.125, "learning_rate": 4.944778375654616e-05, "loss": 1.3358, "step": 23500 }, { "epoch": 0.02, "grad_norm": 36.25, "learning_rate": 4.9443284683355225e-05, "loss": 1.4264, "step": 23600 }, { "epoch": 0.02, "grad_norm": 60.75, "learning_rate": 4.943878561016431e-05, "loss": 1.2525, "step": 23700 }, { "epoch": 0.02, "grad_norm": 19.75, "learning_rate": 4.943428653697338e-05, "loss": 1.4347, "step": 23800 }, { "epoch": 0.02, "grad_norm": 66.5, "learning_rate": 4.9429787463782465e-05, "loss": 1.4323, "step": 23900 }, { "epoch": 0.02, "grad_norm": 44.75, "learning_rate": 4.942528839059154e-05, "loss": 1.3538, "step": 24000 }, { "epoch": 0.02, "grad_norm": 104.5, "learning_rate": 4.9420789317400616e-05, "loss": 1.3535, "step": 24100 }, { "epoch": 0.02, "grad_norm": 41.75, "learning_rate": 4.94162902442097e-05, "loss": 1.3764, "step": 24200 }, { "epoch": 0.02, "grad_norm": 30.0, "learning_rate": 4.941179117101877e-05, "loss": 1.3381, "step": 24300 }, { "epoch": 0.02, "grad_norm": 32.5, "learning_rate": 4.940729209782785e-05, "loss": 1.3998, "step": 24400 }, { "epoch": 0.02, "grad_norm": 17.875, "learning_rate": 4.940279302463693e-05, "loss": 1.2407, "step": 24500 }, { "epoch": 0.02, "grad_norm": 92.0, "learning_rate": 4.9398293951446006e-05, "loss": 1.1868, "step": 24600 }, { "epoch": 0.02, "grad_norm": 77.0, "learning_rate": 4.939379487825508e-05, "loss": 1.3361, "step": 24700 }, { "epoch": 0.02, "grad_norm": 12.5, "learning_rate": 4.9389295805064163e-05, "loss": 1.1944, "step": 24800 }, { "epoch": 0.02, "grad_norm": 342.0, "learning_rate": 4.938479673187323e-05, "loss": 1.4354, "step": 24900 }, { "epoch": 0.02, "grad_norm": 282.0, "learning_rate": 4.9380297658682314e-05, "loss": 1.2188, "step": 25000 }, { "epoch": 0.02, "grad_norm": 1.8828125, "learning_rate": 4.937579858549139e-05, "loss": 1.2472, "step": 25100 }, { "epoch": 0.02, "grad_norm": 36.0, "learning_rate": 4.9371299512300465e-05, "loss": 1.0704, "step": 25200 }, { "epoch": 0.02, "grad_norm": 175.0, "learning_rate": 4.936680043910955e-05, "loss": 1.2283, "step": 25300 }, { "epoch": 0.02, "grad_norm": 42.25, "learning_rate": 4.936230136591862e-05, "loss": 1.4128, "step": 25400 }, { "epoch": 0.02, "grad_norm": 34.5, "learning_rate": 4.93578022927277e-05, "loss": 1.4462, "step": 25500 }, { "epoch": 0.02, "grad_norm": 54.0, "learning_rate": 4.935330321953678e-05, "loss": 1.2386, "step": 25600 }, { "epoch": 0.02, "grad_norm": 87.5, "learning_rate": 4.9348804146345855e-05, "loss": 1.2673, "step": 25700 }, { "epoch": 0.02, "grad_norm": 324.0, "learning_rate": 4.934430507315494e-05, "loss": 1.2364, "step": 25800 }, { "epoch": 0.02, "grad_norm": 23.5, "learning_rate": 4.933980599996401e-05, "loss": 1.3105, "step": 25900 }, { "epoch": 0.02, "grad_norm": 89.5, "learning_rate": 4.933530692677309e-05, "loss": 1.2263, "step": 26000 }, { "epoch": 0.02, "grad_norm": 12.75, "learning_rate": 4.933080785358217e-05, "loss": 1.2275, "step": 26100 }, { "epoch": 0.02, "grad_norm": 100.0, "learning_rate": 4.932630878039124e-05, "loss": 1.1758, "step": 26200 }, { "epoch": 0.02, "grad_norm": 36.25, "learning_rate": 4.9321809707200314e-05, "loss": 1.3481, "step": 26300 }, { "epoch": 0.02, "grad_norm": 34.75, "learning_rate": 4.9317310634009396e-05, "loss": 1.2371, "step": 26400 }, { "epoch": 0.02, "grad_norm": 107.0, "learning_rate": 4.931281156081847e-05, "loss": 1.4179, "step": 26500 }, { "epoch": 0.02, "grad_norm": 30.75, "learning_rate": 4.9308312487627553e-05, "loss": 1.1881, "step": 26600 }, { "epoch": 0.02, "grad_norm": 70.0, "learning_rate": 4.930381341443663e-05, "loss": 1.3085, "step": 26700 }, { "epoch": 0.02, "grad_norm": 12.5625, "learning_rate": 4.9299314341245704e-05, "loss": 1.2098, "step": 26800 }, { "epoch": 0.02, "grad_norm": 121.0, "learning_rate": 4.9294815268054786e-05, "loss": 1.2776, "step": 26900 }, { "epoch": 0.02, "grad_norm": 20.375, "learning_rate": 4.929031619486386e-05, "loss": 1.2937, "step": 27000 }, { "epoch": 0.02, "grad_norm": 10.0, "learning_rate": 4.928581712167294e-05, "loss": 1.3407, "step": 27100 }, { "epoch": 0.02, "grad_norm": 21.25, "learning_rate": 4.928131804848202e-05, "loss": 1.3125, "step": 27200 }, { "epoch": 0.02, "grad_norm": 94.0, "learning_rate": 4.9276818975291094e-05, "loss": 1.1974, "step": 27300 }, { "epoch": 0.02, "grad_norm": 170.0, "learning_rate": 4.927231990210017e-05, "loss": 1.2744, "step": 27400 }, { "epoch": 0.02, "grad_norm": 324.0, "learning_rate": 4.9267820828909245e-05, "loss": 1.1951, "step": 27500 }, { "epoch": 0.02, "grad_norm": 34.75, "learning_rate": 4.926332175571832e-05, "loss": 1.2403, "step": 27600 }, { "epoch": 0.02, "grad_norm": 62.25, "learning_rate": 4.92588226825274e-05, "loss": 1.2114, "step": 27700 }, { "epoch": 0.02, "grad_norm": 86.5, "learning_rate": 4.925432360933648e-05, "loss": 1.3519, "step": 27800 }, { "epoch": 0.02, "grad_norm": 32.0, "learning_rate": 4.924982453614555e-05, "loss": 1.2799, "step": 27900 }, { "epoch": 0.02, "grad_norm": 234.0, "learning_rate": 4.9245325462954635e-05, "loss": 1.1573, "step": 28000 }, { "epoch": 0.03, "grad_norm": 193.0, "learning_rate": 4.924082638976371e-05, "loss": 1.3123, "step": 28100 }, { "epoch": 0.03, "grad_norm": 77.5, "learning_rate": 4.9236327316572786e-05, "loss": 1.4343, "step": 28200 }, { "epoch": 0.03, "grad_norm": 0.00811767578125, "learning_rate": 4.923182824338187e-05, "loss": 1.2112, "step": 28300 }, { "epoch": 0.03, "grad_norm": 43.0, "learning_rate": 4.9227329170190943e-05, "loss": 1.404, "step": 28400 }, { "epoch": 0.03, "grad_norm": 37.0, "learning_rate": 4.9222830097000026e-05, "loss": 1.3227, "step": 28500 }, { "epoch": 0.03, "grad_norm": 756.0, "learning_rate": 4.92183310238091e-05, "loss": 1.4063, "step": 28600 }, { "epoch": 0.03, "grad_norm": 27.0, "learning_rate": 4.9213831950618176e-05, "loss": 1.328, "step": 28700 }, { "epoch": 0.03, "grad_norm": 32.0, "learning_rate": 4.920933287742725e-05, "loss": 1.3589, "step": 28800 }, { "epoch": 0.03, "grad_norm": 60.25, "learning_rate": 4.920483380423633e-05, "loss": 1.2317, "step": 28900 }, { "epoch": 0.03, "grad_norm": 6.78125, "learning_rate": 4.92003347310454e-05, "loss": 1.2723, "step": 29000 }, { "epoch": 0.03, "grad_norm": 332.0, "learning_rate": 4.9195835657854484e-05, "loss": 1.1732, "step": 29100 }, { "epoch": 0.03, "grad_norm": 19.125, "learning_rate": 4.919133658466356e-05, "loss": 1.3209, "step": 29200 }, { "epoch": 0.03, "grad_norm": 18.0, "learning_rate": 4.918683751147264e-05, "loss": 1.0832, "step": 29300 }, { "epoch": 0.03, "grad_norm": 101.5, "learning_rate": 4.918233843828172e-05, "loss": 1.2793, "step": 29400 }, { "epoch": 0.03, "grad_norm": 0.00098419189453125, "learning_rate": 4.917783936509079e-05, "loss": 1.2659, "step": 29500 }, { "epoch": 0.03, "grad_norm": 19.125, "learning_rate": 4.9173340291899875e-05, "loss": 1.429, "step": 29600 }, { "epoch": 0.03, "grad_norm": 18.0, "learning_rate": 4.916884121870895e-05, "loss": 1.3748, "step": 29700 }, { "epoch": 0.03, "grad_norm": 11.8125, "learning_rate": 4.9164342145518025e-05, "loss": 1.404, "step": 29800 }, { "epoch": 0.03, "grad_norm": 39.25, "learning_rate": 4.915984307232711e-05, "loss": 1.396, "step": 29900 }, { "epoch": 0.03, "grad_norm": 46.25, "learning_rate": 4.915534399913618e-05, "loss": 1.3234, "step": 30000 }, { "epoch": 0.03, "grad_norm": 33.5, "learning_rate": 4.915084492594526e-05, "loss": 1.2151, "step": 30100 }, { "epoch": 0.03, "grad_norm": 96.0, "learning_rate": 4.9146345852754333e-05, "loss": 1.2557, "step": 30200 }, { "epoch": 0.03, "grad_norm": 31.5, "learning_rate": 4.914184677956341e-05, "loss": 1.3144, "step": 30300 }, { "epoch": 0.03, "grad_norm": 41.5, "learning_rate": 4.913734770637249e-05, "loss": 1.3245, "step": 30400 }, { "epoch": 0.03, "grad_norm": 40.25, "learning_rate": 4.9132848633181566e-05, "loss": 1.2695, "step": 30500 }, { "epoch": 0.03, "grad_norm": 2.15625, "learning_rate": 4.912834955999064e-05, "loss": 1.2701, "step": 30600 }, { "epoch": 0.03, "grad_norm": 0.060546875, "learning_rate": 4.9123850486799724e-05, "loss": 1.2759, "step": 30700 }, { "epoch": 0.03, "grad_norm": 34.5, "learning_rate": 4.91193514136088e-05, "loss": 1.1844, "step": 30800 }, { "epoch": 0.03, "grad_norm": 146.0, "learning_rate": 4.9114852340417874e-05, "loss": 1.3678, "step": 30900 }, { "epoch": 0.03, "grad_norm": 25.875, "learning_rate": 4.9110353267226956e-05, "loss": 1.2937, "step": 31000 }, { "epoch": 0.03, "grad_norm": 26.875, "learning_rate": 4.910585419403603e-05, "loss": 1.2822, "step": 31100 }, { "epoch": 0.03, "grad_norm": 75.5, "learning_rate": 4.9101355120845114e-05, "loss": 1.299, "step": 31200 }, { "epoch": 0.03, "grad_norm": 13.5625, "learning_rate": 4.909685604765419e-05, "loss": 1.2342, "step": 31300 }, { "epoch": 0.03, "grad_norm": 3.3125, "learning_rate": 4.909235697446326e-05, "loss": 1.2827, "step": 31400 }, { "epoch": 0.03, "grad_norm": 116.0, "learning_rate": 4.908785790127234e-05, "loss": 1.13, "step": 31500 }, { "epoch": 0.03, "grad_norm": 32.25, "learning_rate": 4.9083358828081415e-05, "loss": 1.1681, "step": 31600 }, { "epoch": 0.03, "grad_norm": 19.875, "learning_rate": 4.907885975489049e-05, "loss": 1.1674, "step": 31700 }, { "epoch": 0.03, "grad_norm": 55.75, "learning_rate": 4.907436068169957e-05, "loss": 1.3497, "step": 31800 }, { "epoch": 0.03, "grad_norm": 34.5, "learning_rate": 4.906986160850865e-05, "loss": 1.2089, "step": 31900 }, { "epoch": 0.03, "grad_norm": 57.75, "learning_rate": 4.906536253531773e-05, "loss": 1.2747, "step": 32000 }, { "epoch": 0.03, "grad_norm": 76.5, "learning_rate": 4.9060863462126806e-05, "loss": 1.3811, "step": 32100 }, { "epoch": 0.03, "grad_norm": 0.2060546875, "learning_rate": 4.905636438893588e-05, "loss": 1.2696, "step": 32200 }, { "epoch": 0.03, "grad_norm": 116.0, "learning_rate": 4.905186531574496e-05, "loss": 1.2768, "step": 32300 }, { "epoch": 0.03, "grad_norm": 11.75, "learning_rate": 4.904736624255404e-05, "loss": 1.2377, "step": 32400 }, { "epoch": 0.03, "grad_norm": 135.0, "learning_rate": 4.9042867169363114e-05, "loss": 1.1929, "step": 32500 }, { "epoch": 0.03, "grad_norm": 22.75, "learning_rate": 4.9038368096172196e-05, "loss": 1.2497, "step": 32600 }, { "epoch": 0.03, "grad_norm": 20.375, "learning_rate": 4.9033869022981264e-05, "loss": 1.3461, "step": 32700 }, { "epoch": 0.03, "grad_norm": 20.375, "learning_rate": 4.9029369949790346e-05, "loss": 1.2957, "step": 32800 }, { "epoch": 0.03, "grad_norm": 83.5, "learning_rate": 4.902487087659942e-05, "loss": 1.4008, "step": 32900 }, { "epoch": 0.03, "grad_norm": 42.25, "learning_rate": 4.90203718034085e-05, "loss": 1.2242, "step": 33000 }, { "epoch": 0.03, "grad_norm": 165.0, "learning_rate": 4.901587273021758e-05, "loss": 1.3613, "step": 33100 }, { "epoch": 0.03, "grad_norm": 0.1533203125, "learning_rate": 4.9011373657026655e-05, "loss": 1.1811, "step": 33200 }, { "epoch": 0.03, "grad_norm": 49.25, "learning_rate": 4.900687458383573e-05, "loss": 1.3841, "step": 33300 }, { "epoch": 0.03, "grad_norm": 0.64453125, "learning_rate": 4.900237551064481e-05, "loss": 1.3316, "step": 33400 }, { "epoch": 0.03, "grad_norm": 20.0, "learning_rate": 4.899787643745389e-05, "loss": 1.2325, "step": 33500 }, { "epoch": 0.03, "grad_norm": 34.75, "learning_rate": 4.899337736426296e-05, "loss": 1.1637, "step": 33600 }, { "epoch": 0.03, "grad_norm": 77.5, "learning_rate": 4.8988878291072045e-05, "loss": 1.332, "step": 33700 }, { "epoch": 0.03, "grad_norm": 15.875, "learning_rate": 4.898437921788112e-05, "loss": 1.1109, "step": 33800 }, { "epoch": 0.03, "grad_norm": 25.125, "learning_rate": 4.8979880144690196e-05, "loss": 1.1916, "step": 33900 }, { "epoch": 0.03, "grad_norm": 88.0, "learning_rate": 4.897538107149927e-05, "loss": 1.2063, "step": 34000 }, { "epoch": 0.03, "grad_norm": 89.0, "learning_rate": 4.8970881998308346e-05, "loss": 1.2242, "step": 34100 }, { "epoch": 0.03, "grad_norm": 27.125, "learning_rate": 4.896638292511743e-05, "loss": 1.1195, "step": 34200 }, { "epoch": 0.03, "grad_norm": 38.0, "learning_rate": 4.8961883851926504e-05, "loss": 1.1744, "step": 34300 }, { "epoch": 0.03, "grad_norm": 66.0, "learning_rate": 4.895738477873558e-05, "loss": 1.4476, "step": 34400 }, { "epoch": 0.03, "grad_norm": 17.0, "learning_rate": 4.895288570554466e-05, "loss": 1.2077, "step": 34500 }, { "epoch": 0.03, "grad_norm": 153.0, "learning_rate": 4.8948386632353736e-05, "loss": 1.0947, "step": 34600 }, { "epoch": 0.03, "grad_norm": 21.625, "learning_rate": 4.894388755916282e-05, "loss": 1.2036, "step": 34700 }, { "epoch": 0.03, "grad_norm": 1.3671875, "learning_rate": 4.8939388485971894e-05, "loss": 1.2348, "step": 34800 }, { "epoch": 0.03, "grad_norm": 24.625, "learning_rate": 4.893488941278097e-05, "loss": 1.2718, "step": 34900 }, { "epoch": 0.03, "grad_norm": 73.5, "learning_rate": 4.893039033959005e-05, "loss": 1.2822, "step": 35000 }, { "epoch": 0.03, "grad_norm": 2.015625, "learning_rate": 4.892589126639913e-05, "loss": 1.3972, "step": 35100 }, { "epoch": 0.03, "grad_norm": 65.5, "learning_rate": 4.89213921932082e-05, "loss": 1.3597, "step": 35200 }, { "epoch": 0.03, "grad_norm": 44.75, "learning_rate": 4.891689312001728e-05, "loss": 1.3293, "step": 35300 }, { "epoch": 0.03, "grad_norm": 29.75, "learning_rate": 4.891239404682635e-05, "loss": 1.3688, "step": 35400 }, { "epoch": 0.03, "grad_norm": 49.5, "learning_rate": 4.8907894973635435e-05, "loss": 1.4431, "step": 35500 }, { "epoch": 0.03, "grad_norm": 28.875, "learning_rate": 4.890339590044451e-05, "loss": 1.3478, "step": 35600 }, { "epoch": 0.03, "grad_norm": 23.625, "learning_rate": 4.8898896827253586e-05, "loss": 1.3951, "step": 35700 }, { "epoch": 0.03, "grad_norm": 88.5, "learning_rate": 4.889439775406267e-05, "loss": 1.3132, "step": 35800 }, { "epoch": 0.03, "grad_norm": 5.8125, "learning_rate": 4.888989868087174e-05, "loss": 1.2843, "step": 35900 }, { "epoch": 0.03, "grad_norm": 71.0, "learning_rate": 4.888539960768082e-05, "loss": 1.3047, "step": 36000 }, { "epoch": 0.03, "grad_norm": 41.5, "learning_rate": 4.88809005344899e-05, "loss": 1.3439, "step": 36100 }, { "epoch": 0.03, "grad_norm": 24.5, "learning_rate": 4.8876401461298976e-05, "loss": 1.2515, "step": 36200 }, { "epoch": 0.03, "grad_norm": 30.75, "learning_rate": 4.887190238810805e-05, "loss": 1.3564, "step": 36300 }, { "epoch": 0.03, "grad_norm": 20.625, "learning_rate": 4.886740331491713e-05, "loss": 1.1833, "step": 36400 }, { "epoch": 0.03, "grad_norm": 22.625, "learning_rate": 4.886290424172621e-05, "loss": 1.3388, "step": 36500 }, { "epoch": 0.03, "grad_norm": 41.0, "learning_rate": 4.8858405168535284e-05, "loss": 1.2338, "step": 36600 }, { "epoch": 0.03, "grad_norm": 111.5, "learning_rate": 4.885390609534436e-05, "loss": 1.3098, "step": 36700 }, { "epoch": 0.03, "grad_norm": 406.0, "learning_rate": 4.8849407022153435e-05, "loss": 1.2323, "step": 36800 }, { "epoch": 0.03, "grad_norm": 32.25, "learning_rate": 4.884490794896252e-05, "loss": 1.2944, "step": 36900 }, { "epoch": 0.03, "grad_norm": 47.5, "learning_rate": 4.884040887577159e-05, "loss": 1.2743, "step": 37000 }, { "epoch": 0.03, "grad_norm": 39.25, "learning_rate": 4.883590980258067e-05, "loss": 1.2856, "step": 37100 }, { "epoch": 0.03, "grad_norm": 88.0, "learning_rate": 4.883141072938975e-05, "loss": 1.179, "step": 37200 }, { "epoch": 0.03, "grad_norm": 32.25, "learning_rate": 4.8826911656198825e-05, "loss": 1.2453, "step": 37300 }, { "epoch": 0.03, "grad_norm": 76.5, "learning_rate": 4.882241258300791e-05, "loss": 1.3155, "step": 37400 }, { "epoch": 0.03, "grad_norm": 28.375, "learning_rate": 4.881791350981698e-05, "loss": 1.2103, "step": 37500 }, { "epoch": 0.03, "grad_norm": 71.0, "learning_rate": 4.881341443662606e-05, "loss": 1.2651, "step": 37600 }, { "epoch": 0.03, "grad_norm": 17.25, "learning_rate": 4.880891536343514e-05, "loss": 1.5132, "step": 37700 }, { "epoch": 0.03, "grad_norm": 86.5, "learning_rate": 4.8804416290244215e-05, "loss": 1.3271, "step": 37800 }, { "epoch": 0.03, "grad_norm": 123.5, "learning_rate": 4.8799917217053284e-05, "loss": 1.3711, "step": 37900 }, { "epoch": 0.03, "grad_norm": 148.0, "learning_rate": 4.8795418143862366e-05, "loss": 1.2331, "step": 38000 }, { "epoch": 0.03, "grad_norm": 24.25, "learning_rate": 4.879091907067144e-05, "loss": 1.1995, "step": 38100 }, { "epoch": 0.03, "grad_norm": 0.21484375, "learning_rate": 4.878641999748052e-05, "loss": 1.3692, "step": 38200 }, { "epoch": 0.03, "grad_norm": 70.0, "learning_rate": 4.87819209242896e-05, "loss": 1.2408, "step": 38300 }, { "epoch": 0.03, "grad_norm": 17.0, "learning_rate": 4.8777421851098674e-05, "loss": 1.2109, "step": 38400 }, { "epoch": 0.03, "grad_norm": 49.0, "learning_rate": 4.8772922777907756e-05, "loss": 1.2887, "step": 38500 }, { "epoch": 0.03, "grad_norm": 11.25, "learning_rate": 4.876842370471683e-05, "loss": 1.3198, "step": 38600 }, { "epoch": 0.03, "grad_norm": 19.75, "learning_rate": 4.876392463152591e-05, "loss": 1.359, "step": 38700 }, { "epoch": 0.03, "grad_norm": 120.0, "learning_rate": 4.875942555833499e-05, "loss": 1.3114, "step": 38800 }, { "epoch": 0.03, "grad_norm": 25.875, "learning_rate": 4.8754926485144064e-05, "loss": 1.2283, "step": 38900 }, { "epoch": 0.03, "grad_norm": 0.047607421875, "learning_rate": 4.875042741195314e-05, "loss": 1.1942, "step": 39000 }, { "epoch": 0.03, "grad_norm": 16.125, "learning_rate": 4.874592833876222e-05, "loss": 1.2641, "step": 39100 }, { "epoch": 0.03, "grad_norm": 56.5, "learning_rate": 4.874142926557129e-05, "loss": 1.0726, "step": 39200 }, { "epoch": 0.04, "grad_norm": 169.0, "learning_rate": 4.873693019238037e-05, "loss": 1.1969, "step": 39300 }, { "epoch": 0.04, "grad_norm": 54.25, "learning_rate": 4.873243111918945e-05, "loss": 1.1107, "step": 39400 }, { "epoch": 0.04, "grad_norm": 62.0, "learning_rate": 4.872793204599852e-05, "loss": 1.4246, "step": 39500 }, { "epoch": 0.04, "grad_norm": 18.625, "learning_rate": 4.8723432972807605e-05, "loss": 1.2903, "step": 39600 }, { "epoch": 0.04, "grad_norm": 0.028564453125, "learning_rate": 4.871893389961668e-05, "loss": 1.3936, "step": 39700 }, { "epoch": 0.04, "grad_norm": 170.0, "learning_rate": 4.8714434826425756e-05, "loss": 1.1697, "step": 39800 }, { "epoch": 0.04, "grad_norm": 25.0, "learning_rate": 4.870993575323484e-05, "loss": 1.1897, "step": 39900 }, { "epoch": 0.04, "grad_norm": 28.625, "learning_rate": 4.870543668004391e-05, "loss": 1.3424, "step": 40000 }, { "epoch": 0.04, "grad_norm": 52.75, "learning_rate": 4.8700937606852995e-05, "loss": 1.2875, "step": 40100 }, { "epoch": 0.04, "grad_norm": 0.004669189453125, "learning_rate": 4.869643853366207e-05, "loss": 1.2613, "step": 40200 }, { "epoch": 0.04, "grad_norm": 73.0, "learning_rate": 4.8691939460471146e-05, "loss": 1.376, "step": 40300 }, { "epoch": 0.04, "grad_norm": 8.5625, "learning_rate": 4.868744038728022e-05, "loss": 1.2863, "step": 40400 }, { "epoch": 0.04, "grad_norm": 25.25, "learning_rate": 4.86829413140893e-05, "loss": 1.4402, "step": 40500 }, { "epoch": 0.04, "grad_norm": 46.25, "learning_rate": 4.867844224089837e-05, "loss": 1.3281, "step": 40600 }, { "epoch": 0.04, "grad_norm": 140.0, "learning_rate": 4.8673943167707454e-05, "loss": 1.3408, "step": 40700 }, { "epoch": 0.04, "grad_norm": 36.0, "learning_rate": 4.866944409451653e-05, "loss": 1.2578, "step": 40800 }, { "epoch": 0.04, "grad_norm": 49.5, "learning_rate": 4.866494502132561e-05, "loss": 1.3747, "step": 40900 }, { "epoch": 0.04, "grad_norm": 48.25, "learning_rate": 4.866044594813469e-05, "loss": 1.2134, "step": 41000 }, { "epoch": 0.04, "grad_norm": 52.5, "learning_rate": 4.865594687494376e-05, "loss": 1.2627, "step": 41100 }, { "epoch": 0.04, "grad_norm": 61.5, "learning_rate": 4.8651447801752844e-05, "loss": 1.2865, "step": 41200 }, { "epoch": 0.04, "grad_norm": 84.5, "learning_rate": 4.864694872856192e-05, "loss": 1.2833, "step": 41300 }, { "epoch": 0.04, "grad_norm": 22.75, "learning_rate": 4.8642449655370995e-05, "loss": 1.3422, "step": 41400 }, { "epoch": 0.04, "grad_norm": 43.0, "learning_rate": 4.863795058218008e-05, "loss": 1.287, "step": 41500 }, { "epoch": 0.04, "grad_norm": 18.5, "learning_rate": 4.863345150898915e-05, "loss": 1.2372, "step": 41600 }, { "epoch": 0.04, "grad_norm": 78.0, "learning_rate": 4.862895243579823e-05, "loss": 1.3556, "step": 41700 }, { "epoch": 0.04, "grad_norm": 128.0, "learning_rate": 4.86244533626073e-05, "loss": 1.4245, "step": 41800 }, { "epoch": 0.04, "grad_norm": 162.0, "learning_rate": 4.861995428941638e-05, "loss": 1.2928, "step": 41900 }, { "epoch": 0.04, "grad_norm": 107.5, "learning_rate": 4.861545521622546e-05, "loss": 1.2586, "step": 42000 }, { "epoch": 0.04, "grad_norm": 5.0, "learning_rate": 4.8610956143034536e-05, "loss": 1.2892, "step": 42100 }, { "epoch": 0.04, "grad_norm": 73.0, "learning_rate": 4.860645706984361e-05, "loss": 1.1325, "step": 42200 }, { "epoch": 0.04, "grad_norm": 5.5, "learning_rate": 4.8601957996652693e-05, "loss": 1.2525, "step": 42300 }, { "epoch": 0.04, "grad_norm": 58.25, "learning_rate": 4.859745892346177e-05, "loss": 1.0027, "step": 42400 }, { "epoch": 0.04, "grad_norm": 44.0, "learning_rate": 4.8592959850270844e-05, "loss": 1.3387, "step": 42500 }, { "epoch": 0.04, "grad_norm": 0.005401611328125, "learning_rate": 4.8588460777079926e-05, "loss": 1.4061, "step": 42600 }, { "epoch": 0.04, "grad_norm": 47.0, "learning_rate": 4.8583961703889e-05, "loss": 1.2295, "step": 42700 }, { "epoch": 0.04, "grad_norm": 23.375, "learning_rate": 4.8579462630698084e-05, "loss": 1.3215, "step": 42800 }, { "epoch": 0.04, "grad_norm": 78.0, "learning_rate": 4.857496355750716e-05, "loss": 1.115, "step": 42900 }, { "epoch": 0.04, "grad_norm": 25.75, "learning_rate": 4.857046448431623e-05, "loss": 1.3653, "step": 43000 }, { "epoch": 0.04, "grad_norm": 39.5, "learning_rate": 4.856596541112531e-05, "loss": 1.1244, "step": 43100 }, { "epoch": 0.04, "grad_norm": 29.0, "learning_rate": 4.8561466337934385e-05, "loss": 1.2352, "step": 43200 }, { "epoch": 0.04, "grad_norm": 119.0, "learning_rate": 4.855696726474346e-05, "loss": 1.3514, "step": 43300 }, { "epoch": 0.04, "grad_norm": 8.0, "learning_rate": 4.855246819155254e-05, "loss": 1.2969, "step": 43400 }, { "epoch": 0.04, "grad_norm": 49.75, "learning_rate": 4.854796911836162e-05, "loss": 1.3222, "step": 43500 }, { "epoch": 0.04, "grad_norm": 36.75, "learning_rate": 4.85434700451707e-05, "loss": 1.3191, "step": 43600 }, { "epoch": 0.04, "grad_norm": 374.0, "learning_rate": 4.8538970971979775e-05, "loss": 1.3238, "step": 43700 }, { "epoch": 0.04, "grad_norm": 56.0, "learning_rate": 4.853447189878885e-05, "loss": 1.4575, "step": 43800 }, { "epoch": 0.04, "grad_norm": 35.75, "learning_rate": 4.852997282559793e-05, "loss": 1.3157, "step": 43900 }, { "epoch": 0.04, "grad_norm": 23.125, "learning_rate": 4.852547375240701e-05, "loss": 1.1149, "step": 44000 }, { "epoch": 0.04, "grad_norm": 292.0, "learning_rate": 4.8520974679216083e-05, "loss": 1.1264, "step": 44100 }, { "epoch": 0.04, "grad_norm": 29.375, "learning_rate": 4.8516475606025166e-05, "loss": 1.1437, "step": 44200 }, { "epoch": 0.04, "grad_norm": 0.058349609375, "learning_rate": 4.8511976532834234e-05, "loss": 1.1619, "step": 44300 }, { "epoch": 0.04, "grad_norm": 29.0, "learning_rate": 4.8507477459643316e-05, "loss": 1.2967, "step": 44400 }, { "epoch": 0.04, "grad_norm": 13.8125, "learning_rate": 4.850297838645239e-05, "loss": 1.2666, "step": 44500 }, { "epoch": 0.04, "grad_norm": 28.5, "learning_rate": 4.849847931326147e-05, "loss": 1.3183, "step": 44600 }, { "epoch": 0.04, "grad_norm": 12.75, "learning_rate": 4.849398024007055e-05, "loss": 1.0624, "step": 44700 }, { "epoch": 0.04, "grad_norm": 36.0, "learning_rate": 4.8489481166879624e-05, "loss": 1.2649, "step": 44800 }, { "epoch": 0.04, "grad_norm": 59.0, "learning_rate": 4.84849820936887e-05, "loss": 1.2167, "step": 44900 }, { "epoch": 0.04, "grad_norm": 24.5, "learning_rate": 4.848048302049778e-05, "loss": 1.1337, "step": 45000 }, { "epoch": 0.04, "grad_norm": 70.5, "learning_rate": 4.847598394730686e-05, "loss": 1.4822, "step": 45100 }, { "epoch": 0.04, "grad_norm": 0.004852294921875, "learning_rate": 4.847148487411593e-05, "loss": 1.1802, "step": 45200 }, { "epoch": 0.04, "grad_norm": 42.5, "learning_rate": 4.8466985800925015e-05, "loss": 1.2666, "step": 45300 }, { "epoch": 0.04, "grad_norm": 11.0, "learning_rate": 4.846248672773409e-05, "loss": 1.1953, "step": 45400 }, { "epoch": 0.04, "grad_norm": 26.625, "learning_rate": 4.845798765454317e-05, "loss": 1.2385, "step": 45500 }, { "epoch": 0.04, "grad_norm": 51.25, "learning_rate": 4.845348858135224e-05, "loss": 1.2195, "step": 45600 }, { "epoch": 0.04, "grad_norm": 33.75, "learning_rate": 4.8448989508161316e-05, "loss": 1.1949, "step": 45700 }, { "epoch": 0.04, "grad_norm": 70.5, "learning_rate": 4.84444904349704e-05, "loss": 1.2288, "step": 45800 }, { "epoch": 0.04, "grad_norm": 2.703125, "learning_rate": 4.8439991361779473e-05, "loss": 1.2516, "step": 45900 }, { "epoch": 0.04, "grad_norm": 154.0, "learning_rate": 4.843549228858855e-05, "loss": 1.3733, "step": 46000 }, { "epoch": 0.04, "grad_norm": 17.25, "learning_rate": 4.843099321539763e-05, "loss": 1.2666, "step": 46100 }, { "epoch": 0.04, "grad_norm": 21.375, "learning_rate": 4.8426494142206706e-05, "loss": 1.2936, "step": 46200 }, { "epoch": 0.04, "grad_norm": 25.875, "learning_rate": 4.842199506901579e-05, "loss": 1.4903, "step": 46300 }, { "epoch": 0.04, "grad_norm": 111.5, "learning_rate": 4.8417495995824864e-05, "loss": 1.2341, "step": 46400 }, { "epoch": 0.04, "grad_norm": 30.0, "learning_rate": 4.841299692263394e-05, "loss": 1.2469, "step": 46500 }, { "epoch": 0.04, "grad_norm": 10.125, "learning_rate": 4.840849784944302e-05, "loss": 1.2281, "step": 46600 }, { "epoch": 0.04, "grad_norm": 0.02490234375, "learning_rate": 4.8403998776252097e-05, "loss": 1.3978, "step": 46700 }, { "epoch": 0.04, "grad_norm": 104.0, "learning_rate": 4.839949970306117e-05, "loss": 1.3517, "step": 46800 }, { "epoch": 0.04, "grad_norm": 24.0, "learning_rate": 4.839500062987025e-05, "loss": 1.3805, "step": 46900 }, { "epoch": 0.04, "grad_norm": 14.5625, "learning_rate": 4.839050155667932e-05, "loss": 1.1528, "step": 47000 }, { "epoch": 0.04, "grad_norm": 56.25, "learning_rate": 4.8386002483488405e-05, "loss": 1.2795, "step": 47100 }, { "epoch": 0.04, "grad_norm": 0.46484375, "learning_rate": 4.838150341029748e-05, "loss": 1.3025, "step": 47200 }, { "epoch": 0.04, "grad_norm": 60.5, "learning_rate": 4.8377004337106555e-05, "loss": 1.3028, "step": 47300 }, { "epoch": 0.04, "grad_norm": 24.5, "learning_rate": 4.837250526391564e-05, "loss": 1.1532, "step": 47400 }, { "epoch": 0.04, "grad_norm": 165.0, "learning_rate": 4.836800619072471e-05, "loss": 1.3127, "step": 47500 }, { "epoch": 0.04, "grad_norm": 92.5, "learning_rate": 4.836350711753379e-05, "loss": 1.278, "step": 47600 }, { "epoch": 0.04, "grad_norm": 0.00176239013671875, "learning_rate": 4.835900804434287e-05, "loss": 1.0553, "step": 47700 }, { "epoch": 0.04, "grad_norm": 27.0, "learning_rate": 4.8354508971151946e-05, "loss": 1.25, "step": 47800 }, { "epoch": 0.04, "grad_norm": 0.035888671875, "learning_rate": 4.835000989796102e-05, "loss": 1.1231, "step": 47900 }, { "epoch": 0.04, "grad_norm": 39.5, "learning_rate": 4.83455108247701e-05, "loss": 1.311, "step": 48000 }, { "epoch": 0.04, "grad_norm": 40.25, "learning_rate": 4.834101175157918e-05, "loss": 1.2455, "step": 48100 }, { "epoch": 0.04, "grad_norm": 31.125, "learning_rate": 4.8336512678388254e-05, "loss": 1.3419, "step": 48200 }, { "epoch": 0.04, "grad_norm": 59.25, "learning_rate": 4.833201360519733e-05, "loss": 1.2595, "step": 48300 }, { "epoch": 0.04, "grad_norm": 52.5, "learning_rate": 4.8327514532006404e-05, "loss": 1.1156, "step": 48400 }, { "epoch": 0.04, "grad_norm": 88.0, "learning_rate": 4.8323015458815487e-05, "loss": 1.065, "step": 48500 }, { "epoch": 0.04, "grad_norm": 24.75, "learning_rate": 4.831851638562456e-05, "loss": 1.3643, "step": 48600 }, { "epoch": 0.04, "grad_norm": 75.0, "learning_rate": 4.831401731243364e-05, "loss": 1.2967, "step": 48700 }, { "epoch": 0.04, "grad_norm": 0.002349853515625, "learning_rate": 4.830951823924272e-05, "loss": 1.1988, "step": 48800 }, { "epoch": 0.04, "grad_norm": 0.0308837890625, "learning_rate": 4.8305019166051795e-05, "loss": 1.3315, "step": 48900 }, { "epoch": 0.04, "grad_norm": 16.5, "learning_rate": 4.830052009286088e-05, "loss": 1.2752, "step": 49000 }, { "epoch": 0.04, "grad_norm": 24.875, "learning_rate": 4.829602101966995e-05, "loss": 1.2389, "step": 49100 }, { "epoch": 0.04, "grad_norm": 1.59375, "learning_rate": 4.829152194647903e-05, "loss": 1.3067, "step": 49200 }, { "epoch": 0.04, "grad_norm": 23.5, "learning_rate": 4.828702287328811e-05, "loss": 1.0593, "step": 49300 }, { "epoch": 0.04, "grad_norm": 0.75390625, "learning_rate": 4.8282523800097185e-05, "loss": 1.2773, "step": 49400 }, { "epoch": 0.04, "grad_norm": 14.9375, "learning_rate": 4.827802472690626e-05, "loss": 1.2028, "step": 49500 }, { "epoch": 0.04, "grad_norm": 79.0, "learning_rate": 4.8273525653715336e-05, "loss": 1.306, "step": 49600 }, { "epoch": 0.04, "grad_norm": 0.0810546875, "learning_rate": 4.826902658052441e-05, "loss": 1.3618, "step": 49700 }, { "epoch": 0.04, "grad_norm": 16.125, "learning_rate": 4.826452750733349e-05, "loss": 1.1252, "step": 49800 }, { "epoch": 0.04, "grad_norm": 14.875, "learning_rate": 4.826002843414257e-05, "loss": 1.2516, "step": 49900 }, { "epoch": 0.04, "grad_norm": 0.07666015625, "learning_rate": 4.8255529360951644e-05, "loss": 1.0725, "step": 50000 }, { "epoch": 0.04, "grad_norm": 9.4375, "learning_rate": 4.8251030287760726e-05, "loss": 1.3507, "step": 50100 }, { "epoch": 0.04, "grad_norm": 28.5, "learning_rate": 4.82465312145698e-05, "loss": 1.1303, "step": 50200 }, { "epoch": 0.04, "grad_norm": 86.0, "learning_rate": 4.8242032141378877e-05, "loss": 1.2811, "step": 50300 }, { "epoch": 0.04, "grad_norm": 77.5, "learning_rate": 4.823753306818796e-05, "loss": 1.2733, "step": 50400 }, { "epoch": 0.04, "grad_norm": 29.5, "learning_rate": 4.8233033994997034e-05, "loss": 1.3248, "step": 50500 }, { "epoch": 0.05, "grad_norm": 73.0, "learning_rate": 4.822853492180611e-05, "loss": 1.1872, "step": 50600 }, { "epoch": 0.05, "grad_norm": 0.17578125, "learning_rate": 4.822403584861519e-05, "loss": 1.3451, "step": 50700 }, { "epoch": 0.05, "grad_norm": 1.96875, "learning_rate": 4.821953677542426e-05, "loss": 1.2303, "step": 50800 }, { "epoch": 0.05, "grad_norm": 0.04931640625, "learning_rate": 4.821503770223334e-05, "loss": 1.2957, "step": 50900 }, { "epoch": 0.05, "grad_norm": 15.5625, "learning_rate": 4.821053862904242e-05, "loss": 1.2168, "step": 51000 }, { "epoch": 0.05, "grad_norm": 15.25, "learning_rate": 4.820603955585149e-05, "loss": 1.2688, "step": 51100 }, { "epoch": 0.05, "grad_norm": 6.65625, "learning_rate": 4.8201540482660575e-05, "loss": 1.2357, "step": 51200 }, { "epoch": 0.05, "grad_norm": 25.125, "learning_rate": 4.819704140946965e-05, "loss": 1.3002, "step": 51300 }, { "epoch": 0.05, "grad_norm": 0.01214599609375, "learning_rate": 4.8192542336278726e-05, "loss": 1.3091, "step": 51400 }, { "epoch": 0.05, "grad_norm": 15.875, "learning_rate": 4.818804326308781e-05, "loss": 1.1105, "step": 51500 }, { "epoch": 0.05, "grad_norm": 44.75, "learning_rate": 4.818354418989688e-05, "loss": 1.1362, "step": 51600 }, { "epoch": 0.05, "grad_norm": 290.0, "learning_rate": 4.8179045116705965e-05, "loss": 1.2163, "step": 51700 }, { "epoch": 0.05, "grad_norm": 54.75, "learning_rate": 4.817454604351504e-05, "loss": 1.1851, "step": 51800 }, { "epoch": 0.05, "grad_norm": 19.875, "learning_rate": 4.8170046970324116e-05, "loss": 1.1474, "step": 51900 }, { "epoch": 0.05, "grad_norm": 27.75, "learning_rate": 4.81655478971332e-05, "loss": 1.2484, "step": 52000 }, { "epoch": 0.05, "grad_norm": 24.375, "learning_rate": 4.8161048823942266e-05, "loss": 1.3117, "step": 52100 }, { "epoch": 0.05, "grad_norm": 41.75, "learning_rate": 4.815654975075135e-05, "loss": 1.227, "step": 52200 }, { "epoch": 0.05, "grad_norm": 99.5, "learning_rate": 4.8152050677560424e-05, "loss": 1.214, "step": 52300 }, { "epoch": 0.05, "grad_norm": 156.0, "learning_rate": 4.81475516043695e-05, "loss": 1.2802, "step": 52400 }, { "epoch": 0.05, "grad_norm": 282.0, "learning_rate": 4.814305253117858e-05, "loss": 1.2427, "step": 52500 }, { "epoch": 0.05, "grad_norm": 1.4609375, "learning_rate": 4.813855345798766e-05, "loss": 1.2021, "step": 52600 }, { "epoch": 0.05, "grad_norm": 35.0, "learning_rate": 4.813405438479673e-05, "loss": 1.1962, "step": 52700 }, { "epoch": 0.05, "grad_norm": 20.875, "learning_rate": 4.8129555311605814e-05, "loss": 1.2807, "step": 52800 }, { "epoch": 0.05, "grad_norm": 178.0, "learning_rate": 4.812505623841489e-05, "loss": 1.1017, "step": 52900 }, { "epoch": 0.05, "grad_norm": 1.5859375, "learning_rate": 4.8120557165223965e-05, "loss": 1.1054, "step": 53000 }, { "epoch": 0.05, "grad_norm": 15.875, "learning_rate": 4.811605809203305e-05, "loss": 1.2859, "step": 53100 }, { "epoch": 0.05, "grad_norm": 13.25, "learning_rate": 4.811155901884212e-05, "loss": 1.3019, "step": 53200 }, { "epoch": 0.05, "grad_norm": 46.75, "learning_rate": 4.81070599456512e-05, "loss": 1.1397, "step": 53300 }, { "epoch": 0.05, "grad_norm": 229.0, "learning_rate": 4.810256087246027e-05, "loss": 1.1054, "step": 53400 }, { "epoch": 0.05, "grad_norm": 78.0, "learning_rate": 4.809806179926935e-05, "loss": 1.236, "step": 53500 }, { "epoch": 0.05, "grad_norm": 0.07421875, "learning_rate": 4.809356272607843e-05, "loss": 1.3154, "step": 53600 }, { "epoch": 0.05, "grad_norm": 71.5, "learning_rate": 4.8089063652887506e-05, "loss": 1.4473, "step": 53700 }, { "epoch": 0.05, "grad_norm": 26.0, "learning_rate": 4.808456457969658e-05, "loss": 1.256, "step": 53800 }, { "epoch": 0.05, "grad_norm": 75.0, "learning_rate": 4.808006550650566e-05, "loss": 1.241, "step": 53900 }, { "epoch": 0.05, "grad_norm": 27.25, "learning_rate": 4.807556643331474e-05, "loss": 1.2908, "step": 54000 }, { "epoch": 0.05, "grad_norm": 53.25, "learning_rate": 4.8071067360123814e-05, "loss": 1.3112, "step": 54100 }, { "epoch": 0.05, "grad_norm": 140.0, "learning_rate": 4.8066568286932896e-05, "loss": 1.3015, "step": 54200 }, { "epoch": 0.05, "grad_norm": 44.25, "learning_rate": 4.806206921374197e-05, "loss": 1.2982, "step": 54300 }, { "epoch": 0.05, "grad_norm": 15.9375, "learning_rate": 4.8057570140551054e-05, "loss": 1.2639, "step": 54400 }, { "epoch": 0.05, "grad_norm": 0.09716796875, "learning_rate": 4.805307106736013e-05, "loss": 1.3295, "step": 54500 }, { "epoch": 0.05, "grad_norm": 34.25, "learning_rate": 4.8048571994169204e-05, "loss": 1.2623, "step": 54600 }, { "epoch": 0.05, "grad_norm": 16.125, "learning_rate": 4.804407292097828e-05, "loss": 1.4758, "step": 54700 }, { "epoch": 0.05, "grad_norm": 18.875, "learning_rate": 4.8039573847787355e-05, "loss": 1.2223, "step": 54800 }, { "epoch": 0.05, "grad_norm": 4.0, "learning_rate": 4.803507477459643e-05, "loss": 1.1929, "step": 54900 }, { "epoch": 0.05, "grad_norm": 394.0, "learning_rate": 4.803057570140551e-05, "loss": 1.0864, "step": 55000 }, { "epoch": 0.05, "grad_norm": 82.0, "learning_rate": 4.802607662821459e-05, "loss": 1.2047, "step": 55100 }, { "epoch": 0.05, "grad_norm": 72.0, "learning_rate": 4.802157755502367e-05, "loss": 1.2877, "step": 55200 }, { "epoch": 0.05, "grad_norm": 0.1787109375, "learning_rate": 4.8017078481832745e-05, "loss": 1.2841, "step": 55300 }, { "epoch": 0.05, "grad_norm": 62.0, "learning_rate": 4.801257940864182e-05, "loss": 1.1483, "step": 55400 }, { "epoch": 0.05, "grad_norm": 64.5, "learning_rate": 4.80080803354509e-05, "loss": 1.4654, "step": 55500 }, { "epoch": 0.05, "grad_norm": 80.5, "learning_rate": 4.800358126225998e-05, "loss": 1.2821, "step": 55600 }, { "epoch": 0.05, "grad_norm": 32.25, "learning_rate": 4.799908218906905e-05, "loss": 1.4356, "step": 55700 }, { "epoch": 0.05, "grad_norm": 105.0, "learning_rate": 4.7994583115878135e-05, "loss": 1.2366, "step": 55800 }, { "epoch": 0.05, "grad_norm": 26.125, "learning_rate": 4.799008404268721e-05, "loss": 1.181, "step": 55900 }, { "epoch": 0.05, "grad_norm": 33.75, "learning_rate": 4.7985584969496286e-05, "loss": 1.0998, "step": 56000 }, { "epoch": 0.05, "grad_norm": 78.5, "learning_rate": 4.798108589630536e-05, "loss": 1.2721, "step": 56100 }, { "epoch": 0.05, "grad_norm": 22.5, "learning_rate": 4.797658682311444e-05, "loss": 1.3371, "step": 56200 }, { "epoch": 0.05, "grad_norm": 36.0, "learning_rate": 4.797208774992352e-05, "loss": 1.2851, "step": 56300 }, { "epoch": 0.05, "grad_norm": 16.875, "learning_rate": 4.7967588676732594e-05, "loss": 1.2774, "step": 56400 }, { "epoch": 0.05, "grad_norm": 29.5, "learning_rate": 4.796308960354167e-05, "loss": 1.2053, "step": 56500 }, { "epoch": 0.05, "grad_norm": 21.625, "learning_rate": 4.795859053035075e-05, "loss": 1.0901, "step": 56600 }, { "epoch": 0.05, "grad_norm": 0.007293701171875, "learning_rate": 4.795409145715983e-05, "loss": 1.0744, "step": 56700 }, { "epoch": 0.05, "grad_norm": 65.0, "learning_rate": 4.79495923839689e-05, "loss": 1.2934, "step": 56800 }, { "epoch": 0.05, "grad_norm": 58.75, "learning_rate": 4.7945093310777984e-05, "loss": 1.2791, "step": 56900 }, { "epoch": 0.05, "grad_norm": 3.046875, "learning_rate": 4.794059423758706e-05, "loss": 1.2051, "step": 57000 }, { "epoch": 0.05, "grad_norm": 15.125, "learning_rate": 4.793609516439614e-05, "loss": 1.2482, "step": 57100 }, { "epoch": 0.05, "grad_norm": 32.75, "learning_rate": 4.793159609120522e-05, "loss": 1.2103, "step": 57200 }, { "epoch": 0.05, "grad_norm": 33.0, "learning_rate": 4.7927097018014286e-05, "loss": 1.1886, "step": 57300 }, { "epoch": 0.05, "grad_norm": 26.125, "learning_rate": 4.792259794482337e-05, "loss": 1.2094, "step": 57400 }, { "epoch": 0.05, "grad_norm": 23.75, "learning_rate": 4.791809887163244e-05, "loss": 1.2617, "step": 57500 }, { "epoch": 0.05, "grad_norm": 36.75, "learning_rate": 4.791359979844152e-05, "loss": 1.3948, "step": 57600 }, { "epoch": 0.05, "grad_norm": 217.0, "learning_rate": 4.79091007252506e-05, "loss": 1.3061, "step": 57700 }, { "epoch": 0.05, "grad_norm": 43.0, "learning_rate": 4.7904601652059676e-05, "loss": 1.2947, "step": 57800 }, { "epoch": 0.05, "grad_norm": 22.25, "learning_rate": 4.790010257886876e-05, "loss": 1.162, "step": 57900 }, { "epoch": 0.05, "grad_norm": 32.75, "learning_rate": 4.7895603505677834e-05, "loss": 1.212, "step": 58000 }, { "epoch": 0.05, "grad_norm": 24.5, "learning_rate": 4.789110443248691e-05, "loss": 1.2869, "step": 58100 }, { "epoch": 0.05, "grad_norm": 30.0, "learning_rate": 4.788660535929599e-05, "loss": 1.378, "step": 58200 }, { "epoch": 0.05, "grad_norm": 2544.0, "learning_rate": 4.7882106286105066e-05, "loss": 1.4909, "step": 58300 }, { "epoch": 0.05, "grad_norm": 37.5, "learning_rate": 4.787760721291414e-05, "loss": 1.0799, "step": 58400 }, { "epoch": 0.05, "grad_norm": 16.375, "learning_rate": 4.7873108139723224e-05, "loss": 1.1447, "step": 58500 }, { "epoch": 0.05, "grad_norm": 49.25, "learning_rate": 4.786860906653229e-05, "loss": 1.3131, "step": 58600 }, { "epoch": 0.05, "grad_norm": 28.0, "learning_rate": 4.7864109993341374e-05, "loss": 1.3194, "step": 58700 }, { "epoch": 0.05, "grad_norm": 17.5, "learning_rate": 4.785961092015045e-05, "loss": 1.2322, "step": 58800 }, { "epoch": 0.05, "grad_norm": 22.625, "learning_rate": 4.7855111846959525e-05, "loss": 1.3211, "step": 58900 }, { "epoch": 0.05, "grad_norm": 1.2109375, "learning_rate": 4.785061277376861e-05, "loss": 1.0975, "step": 59000 }, { "epoch": 0.05, "grad_norm": 24.375, "learning_rate": 4.784611370057768e-05, "loss": 1.1601, "step": 59100 }, { "epoch": 0.05, "grad_norm": 191.0, "learning_rate": 4.784161462738676e-05, "loss": 1.2428, "step": 59200 }, { "epoch": 0.05, "grad_norm": 0.0031280517578125, "learning_rate": 4.783711555419584e-05, "loss": 1.0643, "step": 59300 }, { "epoch": 0.05, "grad_norm": 62.75, "learning_rate": 4.7832616481004915e-05, "loss": 1.2144, "step": 59400 }, { "epoch": 0.05, "grad_norm": 39.25, "learning_rate": 4.782811740781399e-05, "loss": 1.3446, "step": 59500 }, { "epoch": 0.05, "grad_norm": 26.875, "learning_rate": 4.782361833462307e-05, "loss": 1.2985, "step": 59600 }, { "epoch": 0.05, "grad_norm": 56.75, "learning_rate": 4.781911926143215e-05, "loss": 1.1766, "step": 59700 }, { "epoch": 0.05, "grad_norm": 144.0, "learning_rate": 4.781462018824123e-05, "loss": 1.2657, "step": 59800 }, { "epoch": 0.05, "grad_norm": 53.0, "learning_rate": 4.78101211150503e-05, "loss": 1.1534, "step": 59900 }, { "epoch": 0.05, "grad_norm": 0.32421875, "learning_rate": 4.7805622041859374e-05, "loss": 1.206, "step": 60000 }, { "epoch": 0.05, "grad_norm": 0.080078125, "learning_rate": 4.7801122968668456e-05, "loss": 1.2317, "step": 60100 }, { "epoch": 0.05, "grad_norm": 39.75, "learning_rate": 4.779662389547753e-05, "loss": 1.2616, "step": 60200 }, { "epoch": 0.05, "grad_norm": 234.0, "learning_rate": 4.779212482228661e-05, "loss": 1.1645, "step": 60300 }, { "epoch": 0.05, "grad_norm": 120.0, "learning_rate": 4.778762574909569e-05, "loss": 1.3616, "step": 60400 }, { "epoch": 0.05, "grad_norm": 0.0004711151123046875, "learning_rate": 4.7783126675904764e-05, "loss": 1.3646, "step": 60500 }, { "epoch": 0.05, "grad_norm": 155.0, "learning_rate": 4.7778627602713847e-05, "loss": 1.2605, "step": 60600 }, { "epoch": 0.05, "grad_norm": 25.625, "learning_rate": 4.777412852952292e-05, "loss": 1.2399, "step": 60700 }, { "epoch": 0.05, "grad_norm": 19.875, "learning_rate": 4.7769629456332e-05, "loss": 1.3913, "step": 60800 }, { "epoch": 0.05, "grad_norm": 13.5625, "learning_rate": 4.776513038314108e-05, "loss": 1.0235, "step": 60900 }, { "epoch": 0.05, "grad_norm": 17.5, "learning_rate": 4.7760631309950155e-05, "loss": 1.3921, "step": 61000 }, { "epoch": 0.05, "grad_norm": 20.875, "learning_rate": 4.775613223675923e-05, "loss": 1.3424, "step": 61100 }, { "epoch": 0.05, "grad_norm": 30.5, "learning_rate": 4.7751633163568305e-05, "loss": 1.1381, "step": 61200 }, { "epoch": 0.05, "grad_norm": 6.53125, "learning_rate": 4.774713409037738e-05, "loss": 1.256, "step": 61300 }, { "epoch": 0.05, "grad_norm": 23.875, "learning_rate": 4.774263501718646e-05, "loss": 1.2608, "step": 61400 }, { "epoch": 0.05, "grad_norm": 6.125, "learning_rate": 4.773813594399554e-05, "loss": 1.299, "step": 61500 }, { "epoch": 0.05, "grad_norm": 85.0, "learning_rate": 4.7733636870804613e-05, "loss": 1.4367, "step": 61600 }, { "epoch": 0.05, "grad_norm": 48.5, "learning_rate": 4.7729137797613696e-05, "loss": 1.2704, "step": 61700 }, { "epoch": 0.06, "grad_norm": 0.0003337860107421875, "learning_rate": 4.772463872442277e-05, "loss": 1.1553, "step": 61800 }, { "epoch": 0.06, "grad_norm": 0.1396484375, "learning_rate": 4.7720139651231846e-05, "loss": 1.1049, "step": 61900 }, { "epoch": 0.06, "grad_norm": 41.25, "learning_rate": 4.771564057804093e-05, "loss": 1.2024, "step": 62000 }, { "epoch": 0.06, "grad_norm": 56.75, "learning_rate": 4.7711141504850004e-05, "loss": 1.0926, "step": 62100 }, { "epoch": 0.06, "grad_norm": 23.25, "learning_rate": 4.770664243165908e-05, "loss": 1.2144, "step": 62200 }, { "epoch": 0.06, "grad_norm": 0.361328125, "learning_rate": 4.770214335846816e-05, "loss": 1.2954, "step": 62300 }, { "epoch": 0.06, "grad_norm": 97.5, "learning_rate": 4.7697644285277237e-05, "loss": 1.3381, "step": 62400 }, { "epoch": 0.06, "grad_norm": 75.0, "learning_rate": 4.769314521208631e-05, "loss": 1.1606, "step": 62500 }, { "epoch": 0.06, "grad_norm": 33.0, "learning_rate": 4.768864613889539e-05, "loss": 1.3188, "step": 62600 }, { "epoch": 0.06, "grad_norm": 23.5, "learning_rate": 4.768414706570446e-05, "loss": 1.2255, "step": 62700 }, { "epoch": 0.06, "grad_norm": 11.25, "learning_rate": 4.7679647992513545e-05, "loss": 1.2122, "step": 62800 }, { "epoch": 0.06, "grad_norm": 52.75, "learning_rate": 4.767514891932262e-05, "loss": 1.2049, "step": 62900 }, { "epoch": 0.06, "grad_norm": 0.1484375, "learning_rate": 4.7670649846131695e-05, "loss": 1.1614, "step": 63000 }, { "epoch": 0.06, "grad_norm": 1104.0, "learning_rate": 4.766615077294078e-05, "loss": 1.1922, "step": 63100 }, { "epoch": 0.06, "grad_norm": 24.875, "learning_rate": 4.766165169974985e-05, "loss": 1.2415, "step": 63200 }, { "epoch": 0.06, "grad_norm": 112.0, "learning_rate": 4.7657152626558935e-05, "loss": 1.2242, "step": 63300 }, { "epoch": 0.06, "grad_norm": 175.0, "learning_rate": 4.765265355336801e-05, "loss": 1.0412, "step": 63400 }, { "epoch": 0.06, "grad_norm": 54.75, "learning_rate": 4.7648154480177086e-05, "loss": 1.3835, "step": 63500 }, { "epoch": 0.06, "grad_norm": 0.005096435546875, "learning_rate": 4.764365540698617e-05, "loss": 1.1711, "step": 63600 }, { "epoch": 0.06, "grad_norm": 56.25, "learning_rate": 4.763915633379524e-05, "loss": 1.2561, "step": 63700 }, { "epoch": 0.06, "grad_norm": 21.625, "learning_rate": 4.763465726060432e-05, "loss": 1.1166, "step": 63800 }, { "epoch": 0.06, "grad_norm": 19.375, "learning_rate": 4.7630158187413394e-05, "loss": 1.3502, "step": 63900 }, { "epoch": 0.06, "grad_norm": 28.25, "learning_rate": 4.762565911422247e-05, "loss": 1.1815, "step": 64000 }, { "epoch": 0.06, "grad_norm": 35.75, "learning_rate": 4.762116004103155e-05, "loss": 1.2885, "step": 64100 }, { "epoch": 0.06, "grad_norm": 162.0, "learning_rate": 4.7616660967840627e-05, "loss": 1.2147, "step": 64200 }, { "epoch": 0.06, "grad_norm": 17.75, "learning_rate": 4.76121618946497e-05, "loss": 1.3315, "step": 64300 }, { "epoch": 0.06, "grad_norm": 15.875, "learning_rate": 4.7607662821458784e-05, "loss": 1.3114, "step": 64400 }, { "epoch": 0.06, "grad_norm": 51.5, "learning_rate": 4.760316374826786e-05, "loss": 1.3619, "step": 64500 }, { "epoch": 0.06, "grad_norm": 21.75, "learning_rate": 4.7598664675076935e-05, "loss": 1.2124, "step": 64600 }, { "epoch": 0.06, "grad_norm": 35.0, "learning_rate": 4.759416560188602e-05, "loss": 1.2272, "step": 64700 }, { "epoch": 0.06, "grad_norm": 92.0, "learning_rate": 4.758966652869509e-05, "loss": 1.2565, "step": 64800 }, { "epoch": 0.06, "grad_norm": 27.5, "learning_rate": 4.758516745550417e-05, "loss": 1.0057, "step": 64900 }, { "epoch": 0.06, "grad_norm": 284.0, "learning_rate": 4.758066838231325e-05, "loss": 1.3151, "step": 65000 }, { "epoch": 0.06, "grad_norm": 60.0, "learning_rate": 4.757616930912232e-05, "loss": 1.3785, "step": 65100 }, { "epoch": 0.06, "grad_norm": 0.1201171875, "learning_rate": 4.75716702359314e-05, "loss": 1.1252, "step": 65200 }, { "epoch": 0.06, "grad_norm": 28.125, "learning_rate": 4.7567171162740476e-05, "loss": 1.2642, "step": 65300 }, { "epoch": 0.06, "grad_norm": 21.75, "learning_rate": 4.756267208954955e-05, "loss": 1.1487, "step": 65400 }, { "epoch": 0.06, "grad_norm": 19.875, "learning_rate": 4.755817301635863e-05, "loss": 1.1986, "step": 65500 }, { "epoch": 0.06, "grad_norm": 41.75, "learning_rate": 4.755367394316771e-05, "loss": 1.3195, "step": 65600 }, { "epoch": 0.06, "grad_norm": 35.75, "learning_rate": 4.7549174869976784e-05, "loss": 1.1207, "step": 65700 }, { "epoch": 0.06, "grad_norm": 36.75, "learning_rate": 4.7544675796785866e-05, "loss": 1.2285, "step": 65800 }, { "epoch": 0.06, "grad_norm": 115.5, "learning_rate": 4.754017672359494e-05, "loss": 1.3438, "step": 65900 }, { "epoch": 0.06, "grad_norm": 31.625, "learning_rate": 4.753567765040402e-05, "loss": 1.1679, "step": 66000 }, { "epoch": 0.06, "grad_norm": 17.5, "learning_rate": 4.75311785772131e-05, "loss": 1.3672, "step": 66100 }, { "epoch": 0.06, "grad_norm": 115.0, "learning_rate": 4.7526679504022174e-05, "loss": 1.084, "step": 66200 }, { "epoch": 0.06, "grad_norm": 51.25, "learning_rate": 4.7522180430831256e-05, "loss": 1.3214, "step": 66300 }, { "epoch": 0.06, "grad_norm": 48.75, "learning_rate": 4.7517681357640325e-05, "loss": 1.2078, "step": 66400 }, { "epoch": 0.06, "grad_norm": 50.5, "learning_rate": 4.751318228444941e-05, "loss": 1.2239, "step": 66500 }, { "epoch": 0.06, "grad_norm": 32.75, "learning_rate": 4.750868321125848e-05, "loss": 1.2264, "step": 66600 }, { "epoch": 0.06, "grad_norm": 119.5, "learning_rate": 4.750418413806756e-05, "loss": 1.1354, "step": 66700 }, { "epoch": 0.06, "grad_norm": 17.0, "learning_rate": 4.749968506487664e-05, "loss": 1.2544, "step": 66800 }, { "epoch": 0.06, "grad_norm": 22.375, "learning_rate": 4.7495185991685715e-05, "loss": 1.3681, "step": 66900 }, { "epoch": 0.06, "grad_norm": 17.5, "learning_rate": 4.749068691849479e-05, "loss": 1.1801, "step": 67000 }, { "epoch": 0.06, "grad_norm": 21.375, "learning_rate": 4.748618784530387e-05, "loss": 1.2174, "step": 67100 }, { "epoch": 0.06, "grad_norm": 7.5, "learning_rate": 4.748168877211295e-05, "loss": 1.3273, "step": 67200 }, { "epoch": 0.06, "grad_norm": 110.0, "learning_rate": 4.747718969892202e-05, "loss": 1.3135, "step": 67300 }, { "epoch": 0.06, "grad_norm": 15.9375, "learning_rate": 4.7472690625731105e-05, "loss": 1.2495, "step": 67400 }, { "epoch": 0.06, "grad_norm": 23.375, "learning_rate": 4.746819155254018e-05, "loss": 1.1619, "step": 67500 }, { "epoch": 0.06, "grad_norm": 18.0, "learning_rate": 4.7463692479349256e-05, "loss": 1.3114, "step": 67600 }, { "epoch": 0.06, "grad_norm": 54.25, "learning_rate": 4.745919340615833e-05, "loss": 1.2257, "step": 67700 }, { "epoch": 0.06, "grad_norm": 30.125, "learning_rate": 4.7454694332967407e-05, "loss": 1.224, "step": 67800 }, { "epoch": 0.06, "grad_norm": 44.0, "learning_rate": 4.745019525977649e-05, "loss": 1.1534, "step": 67900 }, { "epoch": 0.06, "grad_norm": 16.25, "learning_rate": 4.7445696186585564e-05, "loss": 1.2763, "step": 68000 }, { "epoch": 0.06, "grad_norm": 49.75, "learning_rate": 4.744119711339464e-05, "loss": 1.2954, "step": 68100 }, { "epoch": 0.06, "grad_norm": 41.25, "learning_rate": 4.743669804020372e-05, "loss": 1.3419, "step": 68200 }, { "epoch": 0.06, "grad_norm": 35.5, "learning_rate": 4.74321989670128e-05, "loss": 1.1872, "step": 68300 }, { "epoch": 0.06, "grad_norm": 41.75, "learning_rate": 4.742769989382187e-05, "loss": 1.214, "step": 68400 }, { "epoch": 0.06, "grad_norm": 36.75, "learning_rate": 4.7423200820630954e-05, "loss": 1.2048, "step": 68500 }, { "epoch": 0.06, "grad_norm": 45.0, "learning_rate": 4.741870174744003e-05, "loss": 0.9973, "step": 68600 }, { "epoch": 0.06, "grad_norm": 23.75, "learning_rate": 4.741420267424911e-05, "loss": 1.2558, "step": 68700 }, { "epoch": 0.06, "grad_norm": 59.75, "learning_rate": 4.740970360105819e-05, "loss": 1.2732, "step": 68800 }, { "epoch": 0.06, "grad_norm": 41.25, "learning_rate": 4.740520452786726e-05, "loss": 1.2606, "step": 68900 }, { "epoch": 0.06, "grad_norm": 79.0, "learning_rate": 4.740070545467634e-05, "loss": 1.2146, "step": 69000 }, { "epoch": 0.06, "grad_norm": 14.0625, "learning_rate": 4.739620638148541e-05, "loss": 1.2459, "step": 69100 }, { "epoch": 0.06, "grad_norm": 12.1875, "learning_rate": 4.7391707308294495e-05, "loss": 1.2515, "step": 69200 }, { "epoch": 0.06, "grad_norm": 25.25, "learning_rate": 4.738720823510357e-05, "loss": 1.3226, "step": 69300 }, { "epoch": 0.06, "grad_norm": 10.5625, "learning_rate": 4.7382709161912646e-05, "loss": 1.3906, "step": 69400 }, { "epoch": 0.06, "grad_norm": 75.0, "learning_rate": 4.737821008872173e-05, "loss": 1.1994, "step": 69500 }, { "epoch": 0.06, "grad_norm": 16.25, "learning_rate": 4.73737110155308e-05, "loss": 1.3124, "step": 69600 }, { "epoch": 0.06, "grad_norm": 50.25, "learning_rate": 4.736921194233988e-05, "loss": 1.2104, "step": 69700 }, { "epoch": 0.06, "grad_norm": 38.25, "learning_rate": 4.736471286914896e-05, "loss": 1.3131, "step": 69800 }, { "epoch": 0.06, "grad_norm": 63.0, "learning_rate": 4.7360213795958036e-05, "loss": 1.3858, "step": 69900 }, { "epoch": 0.06, "grad_norm": 21.5, "learning_rate": 4.735571472276711e-05, "loss": 1.3075, "step": 70000 }, { "epoch": 0.06, "grad_norm": 32.25, "learning_rate": 4.7351215649576194e-05, "loss": 1.2681, "step": 70100 }, { "epoch": 0.06, "grad_norm": 160.0, "learning_rate": 4.734671657638527e-05, "loss": 1.123, "step": 70200 }, { "epoch": 0.06, "grad_norm": 20.125, "learning_rate": 4.7342217503194344e-05, "loss": 1.29, "step": 70300 }, { "epoch": 0.06, "grad_norm": 46.75, "learning_rate": 4.733771843000342e-05, "loss": 1.2494, "step": 70400 }, { "epoch": 0.06, "grad_norm": 65.0, "learning_rate": 4.7333219356812495e-05, "loss": 1.106, "step": 70500 }, { "epoch": 0.06, "grad_norm": 61.0, "learning_rate": 4.732872028362158e-05, "loss": 1.3759, "step": 70600 }, { "epoch": 0.06, "grad_norm": 56.0, "learning_rate": 4.732422121043065e-05, "loss": 1.2734, "step": 70700 }, { "epoch": 0.06, "grad_norm": 80.5, "learning_rate": 4.731972213723973e-05, "loss": 1.1665, "step": 70800 }, { "epoch": 0.06, "grad_norm": 36.25, "learning_rate": 4.731522306404881e-05, "loss": 1.2487, "step": 70900 }, { "epoch": 0.06, "grad_norm": 99.0, "learning_rate": 4.7310723990857885e-05, "loss": 1.2052, "step": 71000 }, { "epoch": 0.06, "grad_norm": 9.875, "learning_rate": 4.730622491766696e-05, "loss": 1.1455, "step": 71100 }, { "epoch": 0.06, "grad_norm": 16.25, "learning_rate": 4.730172584447604e-05, "loss": 1.0591, "step": 71200 }, { "epoch": 0.06, "grad_norm": 13.9375, "learning_rate": 4.729722677128512e-05, "loss": 1.0799, "step": 71300 }, { "epoch": 0.06, "grad_norm": 46.25, "learning_rate": 4.72927276980942e-05, "loss": 1.3123, "step": 71400 }, { "epoch": 0.06, "grad_norm": 63.5, "learning_rate": 4.7288228624903275e-05, "loss": 1.2431, "step": 71500 }, { "epoch": 0.06, "grad_norm": 14.125, "learning_rate": 4.7283729551712344e-05, "loss": 1.2255, "step": 71600 }, { "epoch": 0.06, "grad_norm": 53.0, "learning_rate": 4.7279230478521426e-05, "loss": 1.1026, "step": 71700 }, { "epoch": 0.06, "grad_norm": 47.5, "learning_rate": 4.72747314053305e-05, "loss": 1.2817, "step": 71800 }, { "epoch": 0.06, "grad_norm": 49.25, "learning_rate": 4.727023233213958e-05, "loss": 1.26, "step": 71900 }, { "epoch": 0.06, "grad_norm": 17.375, "learning_rate": 4.726573325894866e-05, "loss": 1.1483, "step": 72000 }, { "epoch": 0.06, "grad_norm": 123.0, "learning_rate": 4.7261234185757734e-05, "loss": 1.3744, "step": 72100 }, { "epoch": 0.06, "grad_norm": 33.0, "learning_rate": 4.7256735112566816e-05, "loss": 1.2989, "step": 72200 }, { "epoch": 0.06, "grad_norm": 29.0, "learning_rate": 4.725223603937589e-05, "loss": 1.0816, "step": 72300 }, { "epoch": 0.06, "grad_norm": 26.5, "learning_rate": 4.724773696618497e-05, "loss": 1.2363, "step": 72400 }, { "epoch": 0.06, "grad_norm": 12.25, "learning_rate": 4.724323789299405e-05, "loss": 1.2743, "step": 72500 }, { "epoch": 0.06, "grad_norm": 16.0, "learning_rate": 4.7238738819803124e-05, "loss": 1.2664, "step": 72600 }, { "epoch": 0.06, "grad_norm": 28.25, "learning_rate": 4.72342397466122e-05, "loss": 1.4499, "step": 72700 }, { "epoch": 0.06, "grad_norm": 91.5, "learning_rate": 4.722974067342128e-05, "loss": 1.1211, "step": 72800 }, { "epoch": 0.06, "grad_norm": 31.875, "learning_rate": 4.722524160023035e-05, "loss": 1.1971, "step": 72900 }, { "epoch": 0.07, "grad_norm": 0.012939453125, "learning_rate": 4.722074252703943e-05, "loss": 1.1434, "step": 73000 }, { "epoch": 0.07, "grad_norm": 0.0859375, "learning_rate": 4.721624345384851e-05, "loss": 1.1664, "step": 73100 }, { "epoch": 0.07, "grad_norm": 39.0, "learning_rate": 4.721174438065758e-05, "loss": 1.1764, "step": 73200 }, { "epoch": 0.07, "grad_norm": 17.125, "learning_rate": 4.7207245307466665e-05, "loss": 1.2504, "step": 73300 }, { "epoch": 0.07, "grad_norm": 89.0, "learning_rate": 4.720274623427574e-05, "loss": 1.2143, "step": 73400 }, { "epoch": 0.07, "grad_norm": 20.375, "learning_rate": 4.7198247161084816e-05, "loss": 1.1952, "step": 73500 }, { "epoch": 0.07, "grad_norm": 103.0, "learning_rate": 4.71937480878939e-05, "loss": 1.2222, "step": 73600 }, { "epoch": 0.07, "grad_norm": 179.0, "learning_rate": 4.7189249014702974e-05, "loss": 1.2778, "step": 73700 }, { "epoch": 0.07, "grad_norm": 0.5546875, "learning_rate": 4.718474994151205e-05, "loss": 1.3061, "step": 73800 }, { "epoch": 0.07, "grad_norm": 149.0, "learning_rate": 4.718025086832113e-05, "loss": 1.2367, "step": 73900 }, { "epoch": 0.07, "grad_norm": 31.375, "learning_rate": 4.7175751795130206e-05, "loss": 1.226, "step": 74000 }, { "epoch": 0.07, "grad_norm": 25.5, "learning_rate": 4.717125272193929e-05, "loss": 1.1026, "step": 74100 }, { "epoch": 0.07, "grad_norm": 47.25, "learning_rate": 4.716675364874836e-05, "loss": 1.2909, "step": 74200 }, { "epoch": 0.07, "grad_norm": 8.1875, "learning_rate": 4.716225457555743e-05, "loss": 1.142, "step": 74300 }, { "epoch": 0.07, "grad_norm": 27.125, "learning_rate": 4.7157755502366514e-05, "loss": 1.134, "step": 74400 }, { "epoch": 0.07, "grad_norm": 39.25, "learning_rate": 4.715325642917559e-05, "loss": 1.2291, "step": 74500 }, { "epoch": 0.07, "grad_norm": 26.25, "learning_rate": 4.7148757355984665e-05, "loss": 1.3683, "step": 74600 }, { "epoch": 0.07, "grad_norm": 18.625, "learning_rate": 4.714425828279375e-05, "loss": 1.2623, "step": 74700 }, { "epoch": 0.07, "grad_norm": 57.0, "learning_rate": 4.713975920960282e-05, "loss": 1.1912, "step": 74800 }, { "epoch": 0.07, "grad_norm": 35.75, "learning_rate": 4.7135260136411905e-05, "loss": 1.0889, "step": 74900 }, { "epoch": 0.07, "grad_norm": 52.25, "learning_rate": 4.713076106322098e-05, "loss": 1.1153, "step": 75000 }, { "epoch": 0.07, "grad_norm": 0.000835418701171875, "learning_rate": 4.7126261990030055e-05, "loss": 1.361, "step": 75100 }, { "epoch": 0.07, "grad_norm": 83.5, "learning_rate": 4.712176291683914e-05, "loss": 1.2175, "step": 75200 }, { "epoch": 0.07, "grad_norm": 34.25, "learning_rate": 4.711726384364821e-05, "loss": 1.1457, "step": 75300 }, { "epoch": 0.07, "grad_norm": 47.75, "learning_rate": 4.711276477045729e-05, "loss": 1.2189, "step": 75400 }, { "epoch": 0.07, "grad_norm": 15.1875, "learning_rate": 4.7108265697266364e-05, "loss": 1.1413, "step": 75500 }, { "epoch": 0.07, "grad_norm": 0.006591796875, "learning_rate": 4.710376662407544e-05, "loss": 1.1837, "step": 75600 }, { "epoch": 0.07, "grad_norm": 31.375, "learning_rate": 4.709926755088452e-05, "loss": 1.2907, "step": 75700 }, { "epoch": 0.07, "grad_norm": 41.75, "learning_rate": 4.7094768477693596e-05, "loss": 1.1726, "step": 75800 }, { "epoch": 0.07, "grad_norm": 125.5, "learning_rate": 4.709026940450267e-05, "loss": 1.3122, "step": 75900 }, { "epoch": 0.07, "grad_norm": 135.0, "learning_rate": 4.7085770331311754e-05, "loss": 1.2983, "step": 76000 }, { "epoch": 0.07, "grad_norm": 222.0, "learning_rate": 4.708127125812083e-05, "loss": 1.2188, "step": 76100 }, { "epoch": 0.07, "grad_norm": 44.0, "learning_rate": 4.7076772184929904e-05, "loss": 1.0877, "step": 76200 }, { "epoch": 0.07, "grad_norm": 34.75, "learning_rate": 4.7072273111738987e-05, "loss": 1.2161, "step": 76300 }, { "epoch": 0.07, "grad_norm": 22.0, "learning_rate": 4.706777403854806e-05, "loss": 1.3827, "step": 76400 }, { "epoch": 0.07, "grad_norm": 91.0, "learning_rate": 4.706327496535714e-05, "loss": 1.2608, "step": 76500 }, { "epoch": 0.07, "grad_norm": 20.0, "learning_rate": 4.705877589216622e-05, "loss": 1.362, "step": 76600 }, { "epoch": 0.07, "grad_norm": 32.75, "learning_rate": 4.7054276818975295e-05, "loss": 1.2003, "step": 76700 }, { "epoch": 0.07, "grad_norm": 3.15625, "learning_rate": 4.704977774578437e-05, "loss": 1.3207, "step": 76800 }, { "epoch": 0.07, "grad_norm": 13.5625, "learning_rate": 4.7045278672593445e-05, "loss": 1.5494, "step": 76900 }, { "epoch": 0.07, "grad_norm": 20.25, "learning_rate": 4.704077959940252e-05, "loss": 1.1945, "step": 77000 }, { "epoch": 0.07, "grad_norm": 24.5, "learning_rate": 4.70362805262116e-05, "loss": 1.205, "step": 77100 }, { "epoch": 0.07, "grad_norm": 52.25, "learning_rate": 4.703178145302068e-05, "loss": 1.1373, "step": 77200 }, { "epoch": 0.07, "grad_norm": 22.375, "learning_rate": 4.7027282379829754e-05, "loss": 1.2527, "step": 77300 }, { "epoch": 0.07, "grad_norm": 24.625, "learning_rate": 4.7022783306638836e-05, "loss": 1.1378, "step": 77400 }, { "epoch": 0.07, "grad_norm": 56.0, "learning_rate": 4.701828423344791e-05, "loss": 1.1482, "step": 77500 }, { "epoch": 0.07, "grad_norm": 50.5, "learning_rate": 4.701378516025699e-05, "loss": 1.2645, "step": 77600 }, { "epoch": 0.07, "grad_norm": 144.0, "learning_rate": 4.700928608706607e-05, "loss": 1.4047, "step": 77700 }, { "epoch": 0.07, "grad_norm": 0.1494140625, "learning_rate": 4.7004787013875144e-05, "loss": 1.1314, "step": 77800 }, { "epoch": 0.07, "grad_norm": 149.0, "learning_rate": 4.7000287940684226e-05, "loss": 1.1793, "step": 77900 }, { "epoch": 0.07, "grad_norm": 0.5546875, "learning_rate": 4.69957888674933e-05, "loss": 1.2604, "step": 78000 }, { "epoch": 0.07, "grad_norm": 126.0, "learning_rate": 4.6991289794302377e-05, "loss": 0.9881, "step": 78100 }, { "epoch": 0.07, "grad_norm": 25.5, "learning_rate": 4.698679072111145e-05, "loss": 1.2622, "step": 78200 }, { "epoch": 0.07, "grad_norm": 35.25, "learning_rate": 4.698229164792053e-05, "loss": 1.2399, "step": 78300 }, { "epoch": 0.07, "grad_norm": 13.25, "learning_rate": 4.697779257472961e-05, "loss": 1.1963, "step": 78400 }, { "epoch": 0.07, "grad_norm": 22.75, "learning_rate": 4.6973293501538685e-05, "loss": 1.3438, "step": 78500 }, { "epoch": 0.07, "grad_norm": 44.0, "learning_rate": 4.696879442834776e-05, "loss": 1.1871, "step": 78600 }, { "epoch": 0.07, "grad_norm": 0.45703125, "learning_rate": 4.696429535515684e-05, "loss": 1.2272, "step": 78700 }, { "epoch": 0.07, "grad_norm": 13.9375, "learning_rate": 4.695979628196592e-05, "loss": 1.4138, "step": 78800 }, { "epoch": 0.07, "grad_norm": 31.75, "learning_rate": 4.695529720877499e-05, "loss": 1.2585, "step": 78900 }, { "epoch": 0.07, "grad_norm": 62.5, "learning_rate": 4.6950798135584075e-05, "loss": 1.1439, "step": 79000 }, { "epoch": 0.07, "grad_norm": 22.125, "learning_rate": 4.694629906239315e-05, "loss": 1.4061, "step": 79100 }, { "epoch": 0.07, "grad_norm": 23.75, "learning_rate": 4.6941799989202226e-05, "loss": 1.2456, "step": 79200 }, { "epoch": 0.07, "grad_norm": 12.625, "learning_rate": 4.693730091601131e-05, "loss": 1.1871, "step": 79300 }, { "epoch": 0.07, "grad_norm": 195.0, "learning_rate": 4.6932801842820376e-05, "loss": 1.3179, "step": 79400 }, { "epoch": 0.07, "grad_norm": 36.75, "learning_rate": 4.692830276962946e-05, "loss": 1.1434, "step": 79500 }, { "epoch": 0.07, "grad_norm": 2.625, "learning_rate": 4.6923803696438534e-05, "loss": 1.1249, "step": 79600 }, { "epoch": 0.07, "grad_norm": 87.5, "learning_rate": 4.691930462324761e-05, "loss": 1.2842, "step": 79700 }, { "epoch": 0.07, "grad_norm": 33.0, "learning_rate": 4.691480555005669e-05, "loss": 1.2856, "step": 79800 }, { "epoch": 0.07, "grad_norm": 33.75, "learning_rate": 4.6910306476865767e-05, "loss": 1.3099, "step": 79900 }, { "epoch": 0.07, "grad_norm": 22.125, "learning_rate": 4.690580740367484e-05, "loss": 1.2222, "step": 80000 }, { "epoch": 0.07, "grad_norm": 73.0, "learning_rate": 4.6901308330483924e-05, "loss": 1.2007, "step": 80100 }, { "epoch": 0.07, "grad_norm": 56.25, "learning_rate": 4.6896809257293e-05, "loss": 1.1251, "step": 80200 }, { "epoch": 0.07, "grad_norm": 61.0, "learning_rate": 4.689231018410208e-05, "loss": 1.2048, "step": 80300 }, { "epoch": 0.07, "grad_norm": 50.5, "learning_rate": 4.688781111091116e-05, "loss": 1.2897, "step": 80400 }, { "epoch": 0.07, "grad_norm": 29.5, "learning_rate": 4.688331203772023e-05, "loss": 1.2261, "step": 80500 }, { "epoch": 0.07, "grad_norm": 26.75, "learning_rate": 4.6878812964529314e-05, "loss": 1.2943, "step": 80600 }, { "epoch": 0.07, "grad_norm": 0.85546875, "learning_rate": 4.687431389133838e-05, "loss": 1.2239, "step": 80700 }, { "epoch": 0.07, "grad_norm": 416.0, "learning_rate": 4.6869814818147465e-05, "loss": 1.2365, "step": 80800 }, { "epoch": 0.07, "grad_norm": 13.8125, "learning_rate": 4.686531574495654e-05, "loss": 1.3162, "step": 80900 }, { "epoch": 0.07, "grad_norm": 72.5, "learning_rate": 4.6860816671765616e-05, "loss": 1.1817, "step": 81000 }, { "epoch": 0.07, "grad_norm": 84.5, "learning_rate": 4.68563175985747e-05, "loss": 1.269, "step": 81100 }, { "epoch": 0.07, "grad_norm": 36.5, "learning_rate": 4.685181852538377e-05, "loss": 1.2479, "step": 81200 }, { "epoch": 0.07, "grad_norm": 66.5, "learning_rate": 4.684731945219285e-05, "loss": 1.1251, "step": 81300 }, { "epoch": 0.07, "grad_norm": 0.06884765625, "learning_rate": 4.684282037900193e-05, "loss": 1.0727, "step": 81400 }, { "epoch": 0.07, "grad_norm": 81.5, "learning_rate": 4.6838321305811006e-05, "loss": 1.3266, "step": 81500 }, { "epoch": 0.07, "grad_norm": 57.5, "learning_rate": 4.683382223262008e-05, "loss": 1.1613, "step": 81600 }, { "epoch": 0.07, "grad_norm": 324.0, "learning_rate": 4.682932315942916e-05, "loss": 1.1394, "step": 81700 }, { "epoch": 0.07, "grad_norm": 38.5, "learning_rate": 4.682482408623824e-05, "loss": 1.1932, "step": 81800 }, { "epoch": 0.07, "grad_norm": 42.75, "learning_rate": 4.6820325013047314e-05, "loss": 1.2507, "step": 81900 }, { "epoch": 0.07, "grad_norm": 11.0625, "learning_rate": 4.681582593985639e-05, "loss": 1.1607, "step": 82000 }, { "epoch": 0.07, "grad_norm": 5.875, "learning_rate": 4.6811326866665465e-05, "loss": 1.2099, "step": 82100 }, { "epoch": 0.07, "grad_norm": 20.0, "learning_rate": 4.680682779347455e-05, "loss": 1.3643, "step": 82200 }, { "epoch": 0.07, "grad_norm": 20.5, "learning_rate": 4.680232872028362e-05, "loss": 1.3056, "step": 82300 }, { "epoch": 0.07, "grad_norm": 27.625, "learning_rate": 4.67978296470927e-05, "loss": 1.3067, "step": 82400 }, { "epoch": 0.07, "grad_norm": 11.4375, "learning_rate": 4.679333057390178e-05, "loss": 1.3445, "step": 82500 }, { "epoch": 0.07, "grad_norm": 45.0, "learning_rate": 4.6788831500710855e-05, "loss": 1.3665, "step": 82600 }, { "epoch": 0.07, "grad_norm": 16.75, "learning_rate": 4.678433242751993e-05, "loss": 1.2115, "step": 82700 }, { "epoch": 0.07, "grad_norm": 16.875, "learning_rate": 4.677983335432901e-05, "loss": 1.1534, "step": 82800 }, { "epoch": 0.07, "grad_norm": 46.5, "learning_rate": 4.677533428113809e-05, "loss": 1.3426, "step": 82900 }, { "epoch": 0.07, "grad_norm": 31.625, "learning_rate": 4.677083520794717e-05, "loss": 1.2648, "step": 83000 }, { "epoch": 0.07, "grad_norm": 17.5, "learning_rate": 4.6766336134756245e-05, "loss": 1.201, "step": 83100 }, { "epoch": 0.07, "grad_norm": 28.5, "learning_rate": 4.676183706156532e-05, "loss": 1.0442, "step": 83200 }, { "epoch": 0.07, "grad_norm": 11.3125, "learning_rate": 4.6757337988374396e-05, "loss": 1.1157, "step": 83300 }, { "epoch": 0.07, "grad_norm": 10.875, "learning_rate": 4.675283891518347e-05, "loss": 1.3605, "step": 83400 }, { "epoch": 0.07, "grad_norm": 34.75, "learning_rate": 4.674833984199255e-05, "loss": 1.4955, "step": 83500 }, { "epoch": 0.07, "grad_norm": 18.5, "learning_rate": 4.674384076880163e-05, "loss": 1.2556, "step": 83600 }, { "epoch": 0.07, "grad_norm": 59.25, "learning_rate": 4.6739341695610704e-05, "loss": 1.399, "step": 83700 }, { "epoch": 0.07, "grad_norm": 3232.0, "learning_rate": 4.6734842622419786e-05, "loss": 1.2609, "step": 83800 }, { "epoch": 0.07, "grad_norm": 80.5, "learning_rate": 4.673034354922886e-05, "loss": 1.156, "step": 83900 }, { "epoch": 0.07, "grad_norm": 38.0, "learning_rate": 4.672584447603794e-05, "loss": 1.2504, "step": 84000 }, { "epoch": 0.07, "grad_norm": 416.0, "learning_rate": 4.672134540284702e-05, "loss": 1.3409, "step": 84100 }, { "epoch": 0.08, "grad_norm": 37.0, "learning_rate": 4.6716846329656094e-05, "loss": 1.2077, "step": 84200 }, { "epoch": 0.08, "grad_norm": 34.75, "learning_rate": 4.671234725646517e-05, "loss": 1.1843, "step": 84300 }, { "epoch": 0.08, "grad_norm": 78.0, "learning_rate": 4.670784818327425e-05, "loss": 1.2665, "step": 84400 }, { "epoch": 0.08, "grad_norm": 16.875, "learning_rate": 4.670334911008333e-05, "loss": 1.1045, "step": 84500 }, { "epoch": 0.08, "grad_norm": 42.75, "learning_rate": 4.66988500368924e-05, "loss": 1.2778, "step": 84600 }, { "epoch": 0.08, "grad_norm": 32.5, "learning_rate": 4.669435096370148e-05, "loss": 1.0987, "step": 84700 }, { "epoch": 0.08, "grad_norm": 50.5, "learning_rate": 4.668985189051055e-05, "loss": 1.2783, "step": 84800 }, { "epoch": 0.08, "grad_norm": 884.0, "learning_rate": 4.6685352817319635e-05, "loss": 1.1679, "step": 84900 }, { "epoch": 0.08, "grad_norm": 24.625, "learning_rate": 4.668085374412871e-05, "loss": 1.0179, "step": 85000 }, { "epoch": 0.08, "grad_norm": 49.75, "learning_rate": 4.6676354670937786e-05, "loss": 1.0883, "step": 85100 }, { "epoch": 0.08, "grad_norm": 30.75, "learning_rate": 4.667185559774687e-05, "loss": 1.1631, "step": 85200 }, { "epoch": 0.08, "grad_norm": 34.75, "learning_rate": 4.666735652455594e-05, "loss": 0.9213, "step": 85300 }, { "epoch": 0.08, "grad_norm": 61.75, "learning_rate": 4.666285745136502e-05, "loss": 1.2807, "step": 85400 }, { "epoch": 0.08, "grad_norm": 30.625, "learning_rate": 4.66583583781741e-05, "loss": 1.1432, "step": 85500 }, { "epoch": 0.08, "grad_norm": 24.25, "learning_rate": 4.6653859304983176e-05, "loss": 1.3741, "step": 85600 }, { "epoch": 0.08, "grad_norm": 25.125, "learning_rate": 4.664936023179226e-05, "loss": 1.1954, "step": 85700 }, { "epoch": 0.08, "grad_norm": 30.75, "learning_rate": 4.6644861158601334e-05, "loss": 1.2219, "step": 85800 }, { "epoch": 0.08, "grad_norm": 0.00439453125, "learning_rate": 4.66403620854104e-05, "loss": 1.233, "step": 85900 }, { "epoch": 0.08, "grad_norm": 60.0, "learning_rate": 4.6635863012219484e-05, "loss": 1.2093, "step": 86000 }, { "epoch": 0.08, "grad_norm": 10.4375, "learning_rate": 4.663136393902856e-05, "loss": 1.145, "step": 86100 }, { "epoch": 0.08, "grad_norm": 21.625, "learning_rate": 4.662686486583764e-05, "loss": 1.2302, "step": 86200 }, { "epoch": 0.08, "grad_norm": 54.75, "learning_rate": 4.662236579264672e-05, "loss": 1.2073, "step": 86300 }, { "epoch": 0.08, "grad_norm": 23.875, "learning_rate": 4.661786671945579e-05, "loss": 1.0794, "step": 86400 }, { "epoch": 0.08, "grad_norm": 4.34375, "learning_rate": 4.6613367646264875e-05, "loss": 1.1232, "step": 86500 }, { "epoch": 0.08, "grad_norm": 10.625, "learning_rate": 4.660886857307395e-05, "loss": 1.3566, "step": 86600 }, { "epoch": 0.08, "grad_norm": 86.5, "learning_rate": 4.6604369499883025e-05, "loss": 1.1867, "step": 86700 }, { "epoch": 0.08, "grad_norm": 20.375, "learning_rate": 4.659987042669211e-05, "loss": 1.092, "step": 86800 }, { "epoch": 0.08, "grad_norm": 18.5, "learning_rate": 4.659537135350118e-05, "loss": 1.175, "step": 86900 }, { "epoch": 0.08, "grad_norm": 17.5, "learning_rate": 4.659087228031026e-05, "loss": 1.2469, "step": 87000 }, { "epoch": 0.08, "grad_norm": 17.25, "learning_rate": 4.658637320711934e-05, "loss": 1.1808, "step": 87100 }, { "epoch": 0.08, "grad_norm": 14.9375, "learning_rate": 4.658187413392841e-05, "loss": 1.1264, "step": 87200 }, { "epoch": 0.08, "grad_norm": 39.0, "learning_rate": 4.657737506073749e-05, "loss": 1.156, "step": 87300 }, { "epoch": 0.08, "grad_norm": 15.75, "learning_rate": 4.6572875987546566e-05, "loss": 1.063, "step": 87400 }, { "epoch": 0.08, "grad_norm": 14.3125, "learning_rate": 4.656837691435564e-05, "loss": 1.1372, "step": 87500 }, { "epoch": 0.08, "grad_norm": 38.0, "learning_rate": 4.6563877841164724e-05, "loss": 1.1772, "step": 87600 }, { "epoch": 0.08, "grad_norm": 1.046875, "learning_rate": 4.65593787679738e-05, "loss": 1.1926, "step": 87700 }, { "epoch": 0.08, "grad_norm": 114.5, "learning_rate": 4.6554879694782874e-05, "loss": 1.2012, "step": 87800 }, { "epoch": 0.08, "grad_norm": 772.0, "learning_rate": 4.6550380621591956e-05, "loss": 1.1147, "step": 87900 }, { "epoch": 0.08, "grad_norm": 536.0, "learning_rate": 4.654588154840103e-05, "loss": 1.1648, "step": 88000 }, { "epoch": 0.08, "grad_norm": 200.0, "learning_rate": 4.654138247521011e-05, "loss": 1.1388, "step": 88100 }, { "epoch": 0.08, "grad_norm": 57.75, "learning_rate": 4.653688340201919e-05, "loss": 1.2505, "step": 88200 }, { "epoch": 0.08, "grad_norm": 43.0, "learning_rate": 4.6532384328828265e-05, "loss": 1.2581, "step": 88300 }, { "epoch": 0.08, "grad_norm": 2.71875, "learning_rate": 4.652788525563735e-05, "loss": 1.215, "step": 88400 }, { "epoch": 0.08, "grad_norm": 0.1474609375, "learning_rate": 4.6523386182446415e-05, "loss": 1.2729, "step": 88500 }, { "epoch": 0.08, "grad_norm": 0.259765625, "learning_rate": 4.651888710925549e-05, "loss": 1.1295, "step": 88600 }, { "epoch": 0.08, "grad_norm": 15.8125, "learning_rate": 4.651438803606457e-05, "loss": 1.2362, "step": 88700 }, { "epoch": 0.08, "grad_norm": 16.125, "learning_rate": 4.650988896287365e-05, "loss": 1.0345, "step": 88800 }, { "epoch": 0.08, "grad_norm": 0.022216796875, "learning_rate": 4.650538988968272e-05, "loss": 1.2219, "step": 88900 }, { "epoch": 0.08, "grad_norm": 36.25, "learning_rate": 4.6500890816491805e-05, "loss": 1.2903, "step": 89000 }, { "epoch": 0.08, "grad_norm": 30.25, "learning_rate": 4.649639174330088e-05, "loss": 1.2863, "step": 89100 }, { "epoch": 0.08, "grad_norm": 50.5, "learning_rate": 4.649189267010996e-05, "loss": 1.3294, "step": 89200 }, { "epoch": 0.08, "grad_norm": 32.25, "learning_rate": 4.648739359691904e-05, "loss": 1.3154, "step": 89300 }, { "epoch": 0.08, "grad_norm": 9.5625, "learning_rate": 4.6482894523728114e-05, "loss": 1.3416, "step": 89400 }, { "epoch": 0.08, "grad_norm": 34.0, "learning_rate": 4.6478395450537196e-05, "loss": 1.1502, "step": 89500 }, { "epoch": 0.08, "grad_norm": 67.0, "learning_rate": 4.647389637734627e-05, "loss": 1.0926, "step": 89600 }, { "epoch": 0.08, "grad_norm": 40.25, "learning_rate": 4.6469397304155346e-05, "loss": 1.3378, "step": 89700 }, { "epoch": 0.08, "grad_norm": 32.25, "learning_rate": 4.646489823096442e-05, "loss": 1.2702, "step": 89800 }, { "epoch": 0.08, "grad_norm": 45.25, "learning_rate": 4.64603991577735e-05, "loss": 1.4877, "step": 89900 }, { "epoch": 0.08, "grad_norm": 38.75, "learning_rate": 4.645590008458258e-05, "loss": 1.2217, "step": 90000 }, { "epoch": 0.08, "grad_norm": 0.0498046875, "learning_rate": 4.6451401011391655e-05, "loss": 1.1891, "step": 90100 }, { "epoch": 0.08, "grad_norm": 161.0, "learning_rate": 4.644690193820073e-05, "loss": 1.3031, "step": 90200 }, { "epoch": 0.08, "grad_norm": 16.25, "learning_rate": 4.644240286500981e-05, "loss": 1.3148, "step": 90300 }, { "epoch": 0.08, "grad_norm": 32.5, "learning_rate": 4.643790379181889e-05, "loss": 1.3879, "step": 90400 }, { "epoch": 0.08, "grad_norm": 201.0, "learning_rate": 4.643340471862796e-05, "loss": 1.1986, "step": 90500 }, { "epoch": 0.08, "grad_norm": 144.0, "learning_rate": 4.6428905645437045e-05, "loss": 1.1706, "step": 90600 }, { "epoch": 0.08, "grad_norm": 22.875, "learning_rate": 4.642440657224612e-05, "loss": 1.2711, "step": 90700 }, { "epoch": 0.08, "grad_norm": 19.25, "learning_rate": 4.6419907499055195e-05, "loss": 1.3133, "step": 90800 }, { "epoch": 0.08, "grad_norm": 91.0, "learning_rate": 4.641540842586428e-05, "loss": 1.2535, "step": 90900 }, { "epoch": 0.08, "grad_norm": 290.0, "learning_rate": 4.641090935267335e-05, "loss": 1.3282, "step": 91000 }, { "epoch": 0.08, "grad_norm": 1.109375, "learning_rate": 4.640641027948243e-05, "loss": 1.1775, "step": 91100 }, { "epoch": 0.08, "grad_norm": 254.0, "learning_rate": 4.6401911206291504e-05, "loss": 1.1602, "step": 91200 }, { "epoch": 0.08, "grad_norm": 21.625, "learning_rate": 4.639741213310058e-05, "loss": 1.1385, "step": 91300 }, { "epoch": 0.08, "grad_norm": 15.4375, "learning_rate": 4.639291305990966e-05, "loss": 1.0891, "step": 91400 }, { "epoch": 0.08, "grad_norm": 29.25, "learning_rate": 4.6388413986718736e-05, "loss": 1.1267, "step": 91500 }, { "epoch": 0.08, "grad_norm": 121.5, "learning_rate": 4.638391491352781e-05, "loss": 1.1393, "step": 91600 }, { "epoch": 0.08, "grad_norm": 26.375, "learning_rate": 4.6379415840336894e-05, "loss": 1.3078, "step": 91700 }, { "epoch": 0.08, "grad_norm": 2.546875, "learning_rate": 4.637491676714597e-05, "loss": 1.1165, "step": 91800 }, { "epoch": 0.08, "grad_norm": 744.0, "learning_rate": 4.637041769395505e-05, "loss": 1.2956, "step": 91900 }, { "epoch": 0.08, "grad_norm": 1.9296875, "learning_rate": 4.636591862076413e-05, "loss": 1.2069, "step": 92000 }, { "epoch": 0.08, "grad_norm": 36.25, "learning_rate": 4.63614195475732e-05, "loss": 1.2714, "step": 92100 }, { "epoch": 0.08, "grad_norm": 76.5, "learning_rate": 4.6356920474382284e-05, "loss": 1.1672, "step": 92200 }, { "epoch": 0.08, "grad_norm": 47.25, "learning_rate": 4.635242140119136e-05, "loss": 1.1454, "step": 92300 }, { "epoch": 0.08, "grad_norm": 19.875, "learning_rate": 4.6347922328000435e-05, "loss": 1.296, "step": 92400 }, { "epoch": 0.08, "grad_norm": 30.125, "learning_rate": 4.634342325480951e-05, "loss": 1.274, "step": 92500 }, { "epoch": 0.08, "grad_norm": 10.625, "learning_rate": 4.6338924181618585e-05, "loss": 1.16, "step": 92600 }, { "epoch": 0.08, "grad_norm": 17.25, "learning_rate": 4.633442510842767e-05, "loss": 1.3192, "step": 92700 }, { "epoch": 0.08, "grad_norm": 217.0, "learning_rate": 4.632992603523674e-05, "loss": 1.1032, "step": 92800 }, { "epoch": 0.08, "grad_norm": 204.0, "learning_rate": 4.632542696204582e-05, "loss": 1.0933, "step": 92900 }, { "epoch": 0.08, "grad_norm": 68.0, "learning_rate": 4.63209278888549e-05, "loss": 1.2312, "step": 93000 }, { "epoch": 0.08, "grad_norm": 33.25, "learning_rate": 4.6316428815663976e-05, "loss": 1.317, "step": 93100 }, { "epoch": 0.08, "grad_norm": 9.125, "learning_rate": 4.631192974247305e-05, "loss": 1.1449, "step": 93200 }, { "epoch": 0.08, "grad_norm": 150.0, "learning_rate": 4.630743066928213e-05, "loss": 1.0247, "step": 93300 }, { "epoch": 0.08, "grad_norm": 53.0, "learning_rate": 4.630293159609121e-05, "loss": 1.2245, "step": 93400 }, { "epoch": 0.08, "grad_norm": 56.0, "learning_rate": 4.6298432522900284e-05, "loss": 1.392, "step": 93500 }, { "epoch": 0.08, "grad_norm": 14.375, "learning_rate": 4.6293933449709366e-05, "loss": 0.9784, "step": 93600 }, { "epoch": 0.08, "grad_norm": 0.203125, "learning_rate": 4.6289434376518435e-05, "loss": 1.1816, "step": 93700 }, { "epoch": 0.08, "grad_norm": 22.75, "learning_rate": 4.628493530332752e-05, "loss": 1.1423, "step": 93800 }, { "epoch": 0.08, "grad_norm": 0.140625, "learning_rate": 4.628043623013659e-05, "loss": 1.264, "step": 93900 }, { "epoch": 0.08, "grad_norm": 15.375, "learning_rate": 4.627593715694567e-05, "loss": 1.1952, "step": 94000 }, { "epoch": 0.08, "grad_norm": 12.6875, "learning_rate": 4.627143808375475e-05, "loss": 1.1834, "step": 94100 }, { "epoch": 0.08, "grad_norm": 44.75, "learning_rate": 4.6266939010563825e-05, "loss": 1.1482, "step": 94200 }, { "epoch": 0.08, "grad_norm": 103.0, "learning_rate": 4.62624399373729e-05, "loss": 1.2201, "step": 94300 }, { "epoch": 0.08, "grad_norm": 68.0, "learning_rate": 4.625794086418198e-05, "loss": 1.0979, "step": 94400 }, { "epoch": 0.08, "grad_norm": 0.09765625, "learning_rate": 4.625344179099106e-05, "loss": 1.2729, "step": 94500 }, { "epoch": 0.08, "grad_norm": 0.26171875, "learning_rate": 4.624894271780014e-05, "loss": 1.2557, "step": 94600 }, { "epoch": 0.08, "grad_norm": 86.0, "learning_rate": 4.6244443644609215e-05, "loss": 1.2296, "step": 94700 }, { "epoch": 0.08, "grad_norm": 33.75, "learning_rate": 4.623994457141829e-05, "loss": 1.203, "step": 94800 }, { "epoch": 0.08, "grad_norm": 9.1875, "learning_rate": 4.623544549822737e-05, "loss": 1.1907, "step": 94900 }, { "epoch": 0.08, "grad_norm": 0.003448486328125, "learning_rate": 4.623094642503644e-05, "loss": 1.1294, "step": 95000 }, { "epoch": 0.08, "grad_norm": 32.75, "learning_rate": 4.622644735184552e-05, "loss": 1.4294, "step": 95100 }, { "epoch": 0.08, "grad_norm": 81.0, "learning_rate": 4.62219482786546e-05, "loss": 1.131, "step": 95200 }, { "epoch": 0.08, "grad_norm": 83.0, "learning_rate": 4.6217449205463674e-05, "loss": 1.2515, "step": 95300 }, { "epoch": 0.08, "grad_norm": 167.0, "learning_rate": 4.6212950132272756e-05, "loss": 1.2297, "step": 95400 }, { "epoch": 0.09, "grad_norm": 22.125, "learning_rate": 4.620845105908183e-05, "loss": 1.2981, "step": 95500 }, { "epoch": 0.09, "grad_norm": 59.5, "learning_rate": 4.620395198589091e-05, "loss": 1.2032, "step": 95600 }, { "epoch": 0.09, "grad_norm": 40.5, "learning_rate": 4.619945291269999e-05, "loss": 1.283, "step": 95700 }, { "epoch": 0.09, "grad_norm": 29.625, "learning_rate": 4.6194953839509064e-05, "loss": 1.3381, "step": 95800 }, { "epoch": 0.09, "grad_norm": 66.0, "learning_rate": 4.619045476631814e-05, "loss": 1.3305, "step": 95900 }, { "epoch": 0.09, "grad_norm": 48.0, "learning_rate": 4.618595569312722e-05, "loss": 1.15, "step": 96000 }, { "epoch": 0.09, "grad_norm": 19.875, "learning_rate": 4.61814566199363e-05, "loss": 1.3633, "step": 96100 }, { "epoch": 0.09, "grad_norm": 17.25, "learning_rate": 4.617695754674537e-05, "loss": 1.1605, "step": 96200 }, { "epoch": 0.09, "grad_norm": 27.5, "learning_rate": 4.617245847355445e-05, "loss": 1.2327, "step": 96300 }, { "epoch": 0.09, "grad_norm": 0.60546875, "learning_rate": 4.616795940036352e-05, "loss": 1.1414, "step": 96400 }, { "epoch": 0.09, "grad_norm": 16.5, "learning_rate": 4.6163460327172605e-05, "loss": 1.3517, "step": 96500 }, { "epoch": 0.09, "grad_norm": 17.125, "learning_rate": 4.615896125398168e-05, "loss": 1.2891, "step": 96600 }, { "epoch": 0.09, "grad_norm": 77.5, "learning_rate": 4.6154462180790756e-05, "loss": 1.2363, "step": 96700 }, { "epoch": 0.09, "grad_norm": 94.0, "learning_rate": 4.614996310759984e-05, "loss": 1.2241, "step": 96800 }, { "epoch": 0.09, "grad_norm": 0.1787109375, "learning_rate": 4.614546403440891e-05, "loss": 1.1567, "step": 96900 }, { "epoch": 0.09, "grad_norm": 2.875, "learning_rate": 4.614096496121799e-05, "loss": 1.2089, "step": 97000 }, { "epoch": 0.09, "grad_norm": 30.375, "learning_rate": 4.613646588802707e-05, "loss": 1.2015, "step": 97100 }, { "epoch": 0.09, "grad_norm": 15.6875, "learning_rate": 4.6131966814836146e-05, "loss": 1.1075, "step": 97200 }, { "epoch": 0.09, "grad_norm": 82.0, "learning_rate": 4.612746774164523e-05, "loss": 1.2898, "step": 97300 }, { "epoch": 0.09, "grad_norm": 175.0, "learning_rate": 4.61229686684543e-05, "loss": 1.3346, "step": 97400 }, { "epoch": 0.09, "grad_norm": 20.25, "learning_rate": 4.611846959526338e-05, "loss": 1.1985, "step": 97500 }, { "epoch": 0.09, "grad_norm": 0.796875, "learning_rate": 4.6113970522072454e-05, "loss": 1.1855, "step": 97600 }, { "epoch": 0.09, "grad_norm": 153.0, "learning_rate": 4.610947144888153e-05, "loss": 1.2736, "step": 97700 }, { "epoch": 0.09, "grad_norm": 26.5, "learning_rate": 4.610497237569061e-05, "loss": 1.0498, "step": 97800 }, { "epoch": 0.09, "grad_norm": 43.5, "learning_rate": 4.610047330249969e-05, "loss": 1.1019, "step": 97900 }, { "epoch": 0.09, "grad_norm": 60.75, "learning_rate": 4.609597422930876e-05, "loss": 1.2442, "step": 98000 }, { "epoch": 0.09, "grad_norm": 23.875, "learning_rate": 4.6091475156117844e-05, "loss": 1.3374, "step": 98100 }, { "epoch": 0.09, "grad_norm": 16.5, "learning_rate": 4.608697608292692e-05, "loss": 1.2757, "step": 98200 }, { "epoch": 0.09, "grad_norm": 67.5, "learning_rate": 4.6082477009735995e-05, "loss": 1.1568, "step": 98300 }, { "epoch": 0.09, "grad_norm": 35.75, "learning_rate": 4.607797793654508e-05, "loss": 1.2625, "step": 98400 }, { "epoch": 0.09, "grad_norm": 20.625, "learning_rate": 4.607347886335415e-05, "loss": 1.3875, "step": 98500 }, { "epoch": 0.09, "grad_norm": 141.0, "learning_rate": 4.606897979016323e-05, "loss": 1.0833, "step": 98600 }, { "epoch": 0.09, "grad_norm": 29.125, "learning_rate": 4.606448071697231e-05, "loss": 1.1732, "step": 98700 }, { "epoch": 0.09, "grad_norm": 46.5, "learning_rate": 4.6059981643781385e-05, "loss": 1.1168, "step": 98800 }, { "epoch": 0.09, "grad_norm": 36.0, "learning_rate": 4.605548257059046e-05, "loss": 1.3608, "step": 98900 }, { "epoch": 0.09, "grad_norm": 18.875, "learning_rate": 4.6050983497399536e-05, "loss": 1.3203, "step": 99000 }, { "epoch": 0.09, "grad_norm": 0.024169921875, "learning_rate": 4.604648442420861e-05, "loss": 1.2144, "step": 99100 }, { "epoch": 0.09, "grad_norm": 12.125, "learning_rate": 4.604198535101769e-05, "loss": 1.2136, "step": 99200 }, { "epoch": 0.09, "grad_norm": 33.75, "learning_rate": 4.603748627782677e-05, "loss": 1.2989, "step": 99300 }, { "epoch": 0.09, "grad_norm": 19.625, "learning_rate": 4.6032987204635844e-05, "loss": 1.0424, "step": 99400 }, { "epoch": 0.09, "grad_norm": 117.0, "learning_rate": 4.6028488131444926e-05, "loss": 1.3084, "step": 99500 }, { "epoch": 0.09, "grad_norm": 26.0, "learning_rate": 4.6023989058254e-05, "loss": 1.0954, "step": 99600 }, { "epoch": 0.09, "grad_norm": 41.25, "learning_rate": 4.601948998506308e-05, "loss": 1.1938, "step": 99700 }, { "epoch": 0.09, "grad_norm": 0.671875, "learning_rate": 4.601499091187216e-05, "loss": 1.1928, "step": 99800 }, { "epoch": 0.09, "grad_norm": 21.875, "learning_rate": 4.6010491838681234e-05, "loss": 1.3598, "step": 99900 }, { "epoch": 0.09, "grad_norm": 34.25, "learning_rate": 4.6005992765490316e-05, "loss": 1.2554, "step": 100000 }, { "epoch": 0.09, "grad_norm": 374.0, "learning_rate": 4.600149369229939e-05, "loss": 1.0904, "step": 100100 }, { "epoch": 0.09, "grad_norm": 15.25, "learning_rate": 4.599699461910846e-05, "loss": 1.2633, "step": 100200 }, { "epoch": 0.09, "grad_norm": 34.25, "learning_rate": 4.599249554591754e-05, "loss": 1.2126, "step": 100300 }, { "epoch": 0.09, "grad_norm": 68.0, "learning_rate": 4.598799647272662e-05, "loss": 1.177, "step": 100400 }, { "epoch": 0.09, "grad_norm": 15.9375, "learning_rate": 4.59834973995357e-05, "loss": 1.3549, "step": 100500 }, { "epoch": 0.09, "grad_norm": 199.0, "learning_rate": 4.5978998326344775e-05, "loss": 1.2707, "step": 100600 }, { "epoch": 0.09, "grad_norm": 59.5, "learning_rate": 4.597449925315385e-05, "loss": 1.3505, "step": 100700 }, { "epoch": 0.09, "grad_norm": 38.75, "learning_rate": 4.597000017996293e-05, "loss": 1.2006, "step": 100800 }, { "epoch": 0.09, "grad_norm": 12.5, "learning_rate": 4.596550110677201e-05, "loss": 1.2509, "step": 100900 }, { "epoch": 0.09, "grad_norm": 25.625, "learning_rate": 4.596100203358108e-05, "loss": 1.2555, "step": 101000 }, { "epoch": 0.09, "grad_norm": 23.125, "learning_rate": 4.5956502960390165e-05, "loss": 1.2871, "step": 101100 }, { "epoch": 0.09, "grad_norm": 18.625, "learning_rate": 4.595200388719924e-05, "loss": 1.0645, "step": 101200 }, { "epoch": 0.09, "grad_norm": 10.8125, "learning_rate": 4.5947504814008316e-05, "loss": 1.1701, "step": 101300 }, { "epoch": 0.09, "grad_norm": 46.5, "learning_rate": 4.59430057408174e-05, "loss": 1.3589, "step": 101400 }, { "epoch": 0.09, "grad_norm": 23.25, "learning_rate": 4.593850666762647e-05, "loss": 1.3453, "step": 101500 }, { "epoch": 0.09, "grad_norm": 7.96875, "learning_rate": 4.593400759443555e-05, "loss": 1.0873, "step": 101600 }, { "epoch": 0.09, "grad_norm": 12.125, "learning_rate": 4.5929508521244624e-05, "loss": 1.0792, "step": 101700 }, { "epoch": 0.09, "grad_norm": 64.5, "learning_rate": 4.59250094480537e-05, "loss": 1.2598, "step": 101800 }, { "epoch": 0.09, "grad_norm": 31.75, "learning_rate": 4.592051037486278e-05, "loss": 1.0844, "step": 101900 }, { "epoch": 0.09, "grad_norm": 129.0, "learning_rate": 4.591601130167186e-05, "loss": 1.2592, "step": 102000 }, { "epoch": 0.09, "grad_norm": 55.75, "learning_rate": 4.591151222848093e-05, "loss": 1.295, "step": 102100 }, { "epoch": 0.09, "grad_norm": 16.0, "learning_rate": 4.5907013155290015e-05, "loss": 1.1847, "step": 102200 }, { "epoch": 0.09, "grad_norm": 23.125, "learning_rate": 4.590251408209909e-05, "loss": 1.1945, "step": 102300 }, { "epoch": 0.09, "grad_norm": 0.18359375, "learning_rate": 4.5898015008908165e-05, "loss": 1.3027, "step": 102400 }, { "epoch": 0.09, "grad_norm": 55.5, "learning_rate": 4.589351593571725e-05, "loss": 1.2763, "step": 102500 }, { "epoch": 0.09, "grad_norm": 206.0, "learning_rate": 4.588901686252632e-05, "loss": 1.2135, "step": 102600 }, { "epoch": 0.09, "grad_norm": 125.5, "learning_rate": 4.5884517789335405e-05, "loss": 1.1985, "step": 102700 }, { "epoch": 0.09, "grad_norm": 15.8125, "learning_rate": 4.588001871614447e-05, "loss": 1.1805, "step": 102800 }, { "epoch": 0.09, "grad_norm": 25.25, "learning_rate": 4.587551964295355e-05, "loss": 1.2184, "step": 102900 }, { "epoch": 0.09, "grad_norm": 30.75, "learning_rate": 4.587102056976263e-05, "loss": 1.1989, "step": 103000 }, { "epoch": 0.09, "grad_norm": 0.05029296875, "learning_rate": 4.5866521496571706e-05, "loss": 1.0852, "step": 103100 }, { "epoch": 0.09, "grad_norm": 23.125, "learning_rate": 4.586202242338079e-05, "loss": 1.075, "step": 103200 }, { "epoch": 0.09, "grad_norm": 16.75, "learning_rate": 4.5857523350189864e-05, "loss": 1.3041, "step": 103300 }, { "epoch": 0.09, "grad_norm": 119.5, "learning_rate": 4.585302427699894e-05, "loss": 1.3158, "step": 103400 }, { "epoch": 0.09, "grad_norm": 21.625, "learning_rate": 4.584852520380802e-05, "loss": 1.13, "step": 103500 }, { "epoch": 0.09, "grad_norm": 82.5, "learning_rate": 4.5844026130617096e-05, "loss": 1.3399, "step": 103600 }, { "epoch": 0.09, "grad_norm": 0.0269775390625, "learning_rate": 4.583952705742617e-05, "loss": 1.1488, "step": 103700 }, { "epoch": 0.09, "grad_norm": 51.5, "learning_rate": 4.5835027984235254e-05, "loss": 1.1056, "step": 103800 }, { "epoch": 0.09, "grad_norm": 0.322265625, "learning_rate": 4.583052891104433e-05, "loss": 1.195, "step": 103900 }, { "epoch": 0.09, "grad_norm": 0.001495361328125, "learning_rate": 4.5826029837853405e-05, "loss": 1.3246, "step": 104000 }, { "epoch": 0.09, "grad_norm": 114.5, "learning_rate": 4.582153076466248e-05, "loss": 1.1865, "step": 104100 }, { "epoch": 0.09, "grad_norm": 0.06005859375, "learning_rate": 4.5817031691471555e-05, "loss": 1.2381, "step": 104200 }, { "epoch": 0.09, "grad_norm": 102.0, "learning_rate": 4.581253261828064e-05, "loss": 1.3197, "step": 104300 }, { "epoch": 0.09, "grad_norm": 23.625, "learning_rate": 4.580803354508971e-05, "loss": 1.1024, "step": 104400 }, { "epoch": 0.09, "grad_norm": 31.25, "learning_rate": 4.580353447189879e-05, "loss": 1.1392, "step": 104500 }, { "epoch": 0.09, "grad_norm": 30.625, "learning_rate": 4.579903539870787e-05, "loss": 1.2728, "step": 104600 }, { "epoch": 0.09, "grad_norm": 45.0, "learning_rate": 4.5794536325516945e-05, "loss": 1.2558, "step": 104700 }, { "epoch": 0.09, "grad_norm": 77.5, "learning_rate": 4.579003725232602e-05, "loss": 1.0428, "step": 104800 }, { "epoch": 0.09, "grad_norm": 113.5, "learning_rate": 4.57855381791351e-05, "loss": 1.1951, "step": 104900 }, { "epoch": 0.09, "grad_norm": 25.625, "learning_rate": 4.578103910594418e-05, "loss": 1.2916, "step": 105000 }, { "epoch": 0.09, "grad_norm": 0.302734375, "learning_rate": 4.5776540032753254e-05, "loss": 1.1258, "step": 105100 }, { "epoch": 0.09, "grad_norm": 15.125, "learning_rate": 4.5772040959562336e-05, "loss": 1.2375, "step": 105200 }, { "epoch": 0.09, "grad_norm": 4.5, "learning_rate": 4.576754188637141e-05, "loss": 1.2654, "step": 105300 }, { "epoch": 0.09, "grad_norm": 0.010986328125, "learning_rate": 4.5763042813180486e-05, "loss": 1.3326, "step": 105400 }, { "epoch": 0.09, "grad_norm": 8.8125, "learning_rate": 4.575854373998956e-05, "loss": 1.2025, "step": 105500 }, { "epoch": 0.09, "grad_norm": 35.0, "learning_rate": 4.575404466679864e-05, "loss": 1.2903, "step": 105600 }, { "epoch": 0.09, "grad_norm": 39.0, "learning_rate": 4.574954559360772e-05, "loss": 1.3111, "step": 105700 }, { "epoch": 0.09, "grad_norm": 0.005645751953125, "learning_rate": 4.5745046520416795e-05, "loss": 1.3519, "step": 105800 }, { "epoch": 0.09, "grad_norm": 26.5, "learning_rate": 4.574054744722587e-05, "loss": 1.2876, "step": 105900 }, { "epoch": 0.09, "grad_norm": 7.90625, "learning_rate": 4.573604837403495e-05, "loss": 1.234, "step": 106000 }, { "epoch": 0.09, "grad_norm": 0.005096435546875, "learning_rate": 4.573154930084403e-05, "loss": 1.395, "step": 106100 }, { "epoch": 0.09, "grad_norm": 56.5, "learning_rate": 4.572705022765311e-05, "loss": 1.3566, "step": 106200 }, { "epoch": 0.09, "grad_norm": 32.5, "learning_rate": 4.5722551154462185e-05, "loss": 1.2325, "step": 106300 }, { "epoch": 0.09, "grad_norm": 16.375, "learning_rate": 4.571805208127126e-05, "loss": 1.2097, "step": 106400 }, { "epoch": 0.09, "grad_norm": 19.25, "learning_rate": 4.571355300808034e-05, "loss": 1.0891, "step": 106500 }, { "epoch": 0.09, "grad_norm": 37.75, "learning_rate": 4.570905393488942e-05, "loss": 1.216, "step": 106600 }, { "epoch": 0.1, "grad_norm": 50.5, "learning_rate": 4.570455486169849e-05, "loss": 1.4524, "step": 106700 }, { "epoch": 0.1, "grad_norm": 0.2001953125, "learning_rate": 4.570005578850757e-05, "loss": 1.116, "step": 106800 }, { "epoch": 0.1, "grad_norm": 14.25, "learning_rate": 4.5695556715316644e-05, "loss": 1.1824, "step": 106900 }, { "epoch": 0.1, "grad_norm": 98.5, "learning_rate": 4.5691057642125726e-05, "loss": 1.2857, "step": 107000 }, { "epoch": 0.1, "grad_norm": 14.5625, "learning_rate": 4.56865585689348e-05, "loss": 1.0591, "step": 107100 }, { "epoch": 0.1, "grad_norm": 37.75, "learning_rate": 4.5682059495743876e-05, "loss": 1.3179, "step": 107200 }, { "epoch": 0.1, "grad_norm": 19.75, "learning_rate": 4.567756042255296e-05, "loss": 1.3088, "step": 107300 }, { "epoch": 0.1, "grad_norm": 11.5, "learning_rate": 4.5673061349362034e-05, "loss": 1.2068, "step": 107400 }, { "epoch": 0.1, "grad_norm": 15.9375, "learning_rate": 4.566856227617111e-05, "loss": 1.0826, "step": 107500 }, { "epoch": 0.1, "grad_norm": 60.75, "learning_rate": 4.566406320298019e-05, "loss": 1.2205, "step": 107600 }, { "epoch": 0.1, "grad_norm": 14.5625, "learning_rate": 4.565956412978927e-05, "loss": 1.1924, "step": 107700 }, { "epoch": 0.1, "grad_norm": 0.310546875, "learning_rate": 4.565506505659834e-05, "loss": 1.138, "step": 107800 }, { "epoch": 0.1, "grad_norm": 22.375, "learning_rate": 4.5650565983407424e-05, "loss": 1.3148, "step": 107900 }, { "epoch": 0.1, "grad_norm": 28.375, "learning_rate": 4.564606691021649e-05, "loss": 1.1837, "step": 108000 }, { "epoch": 0.1, "grad_norm": 41.75, "learning_rate": 4.5641567837025575e-05, "loss": 1.2787, "step": 108100 }, { "epoch": 0.1, "grad_norm": 53.5, "learning_rate": 4.563706876383465e-05, "loss": 1.2307, "step": 108200 }, { "epoch": 0.1, "grad_norm": 28.375, "learning_rate": 4.5632569690643725e-05, "loss": 1.1937, "step": 108300 }, { "epoch": 0.1, "grad_norm": 19.375, "learning_rate": 4.562807061745281e-05, "loss": 1.2141, "step": 108400 }, { "epoch": 0.1, "grad_norm": 45.25, "learning_rate": 4.562357154426188e-05, "loss": 1.3127, "step": 108500 }, { "epoch": 0.1, "grad_norm": 0.10205078125, "learning_rate": 4.561907247107096e-05, "loss": 1.1631, "step": 108600 }, { "epoch": 0.1, "grad_norm": 446.0, "learning_rate": 4.561457339788004e-05, "loss": 1.1566, "step": 108700 }, { "epoch": 0.1, "grad_norm": 208.0, "learning_rate": 4.5610074324689116e-05, "loss": 1.1789, "step": 108800 }, { "epoch": 0.1, "grad_norm": 32.25, "learning_rate": 4.56055752514982e-05, "loss": 1.2044, "step": 108900 }, { "epoch": 0.1, "grad_norm": 0.007415771484375, "learning_rate": 4.560107617830727e-05, "loss": 1.221, "step": 109000 }, { "epoch": 0.1, "grad_norm": 18.375, "learning_rate": 4.559657710511635e-05, "loss": 1.2534, "step": 109100 }, { "epoch": 0.1, "grad_norm": 0.52734375, "learning_rate": 4.559207803192543e-05, "loss": 1.3312, "step": 109200 }, { "epoch": 0.1, "grad_norm": 15.5, "learning_rate": 4.55875789587345e-05, "loss": 1.1578, "step": 109300 }, { "epoch": 0.1, "grad_norm": 33.75, "learning_rate": 4.558307988554358e-05, "loss": 1.2368, "step": 109400 }, { "epoch": 0.1, "grad_norm": 0.78515625, "learning_rate": 4.557858081235266e-05, "loss": 1.1862, "step": 109500 }, { "epoch": 0.1, "grad_norm": 13.9375, "learning_rate": 4.557408173916173e-05, "loss": 1.2698, "step": 109600 }, { "epoch": 0.1, "grad_norm": 63.25, "learning_rate": 4.5569582665970814e-05, "loss": 1.1454, "step": 109700 }, { "epoch": 0.1, "grad_norm": 32.0, "learning_rate": 4.556508359277989e-05, "loss": 1.1937, "step": 109800 }, { "epoch": 0.1, "grad_norm": 16.375, "learning_rate": 4.5560584519588965e-05, "loss": 1.2177, "step": 109900 }, { "epoch": 0.1, "grad_norm": 19.375, "learning_rate": 4.555608544639805e-05, "loss": 1.2745, "step": 110000 }, { "epoch": 0.1, "grad_norm": 54.75, "learning_rate": 4.555158637320712e-05, "loss": 1.2536, "step": 110100 }, { "epoch": 0.1, "grad_norm": 38.25, "learning_rate": 4.55470873000162e-05, "loss": 1.3106, "step": 110200 }, { "epoch": 0.1, "grad_norm": 151.0, "learning_rate": 4.554258822682528e-05, "loss": 1.2801, "step": 110300 }, { "epoch": 0.1, "grad_norm": 15.875, "learning_rate": 4.5538089153634355e-05, "loss": 1.2058, "step": 110400 }, { "epoch": 0.1, "grad_norm": 24.375, "learning_rate": 4.553359008044343e-05, "loss": 1.226, "step": 110500 }, { "epoch": 0.1, "grad_norm": 41.0, "learning_rate": 4.5529091007252506e-05, "loss": 1.1285, "step": 110600 }, { "epoch": 0.1, "grad_norm": 53.25, "learning_rate": 4.552459193406158e-05, "loss": 1.2774, "step": 110700 }, { "epoch": 0.1, "grad_norm": 47.25, "learning_rate": 4.552009286087066e-05, "loss": 1.1275, "step": 110800 }, { "epoch": 0.1, "grad_norm": 41.0, "learning_rate": 4.551559378767974e-05, "loss": 1.2186, "step": 110900 }, { "epoch": 0.1, "grad_norm": 141.0, "learning_rate": 4.5511094714488814e-05, "loss": 1.1936, "step": 111000 }, { "epoch": 0.1, "grad_norm": 17.25, "learning_rate": 4.5506595641297896e-05, "loss": 1.387, "step": 111100 }, { "epoch": 0.1, "grad_norm": 80.0, "learning_rate": 4.550209656810697e-05, "loss": 1.2142, "step": 111200 }, { "epoch": 0.1, "grad_norm": 68.5, "learning_rate": 4.549759749491605e-05, "loss": 1.2587, "step": 111300 }, { "epoch": 0.1, "grad_norm": 9.9375, "learning_rate": 4.549309842172513e-05, "loss": 1.3325, "step": 111400 }, { "epoch": 0.1, "grad_norm": 125.0, "learning_rate": 4.5488599348534204e-05, "loss": 1.2874, "step": 111500 }, { "epoch": 0.1, "grad_norm": 173.0, "learning_rate": 4.5484100275343286e-05, "loss": 1.1543, "step": 111600 }, { "epoch": 0.1, "grad_norm": 23.375, "learning_rate": 4.547960120215236e-05, "loss": 1.1861, "step": 111700 }, { "epoch": 0.1, "grad_norm": 34.25, "learning_rate": 4.547510212896144e-05, "loss": 1.1127, "step": 111800 }, { "epoch": 0.1, "grad_norm": 17.0, "learning_rate": 4.547060305577051e-05, "loss": 1.2207, "step": 111900 }, { "epoch": 0.1, "grad_norm": 0.5625, "learning_rate": 4.546610398257959e-05, "loss": 1.353, "step": 112000 }, { "epoch": 0.1, "grad_norm": 164.0, "learning_rate": 4.546160490938867e-05, "loss": 1.2219, "step": 112100 }, { "epoch": 0.1, "grad_norm": 61.75, "learning_rate": 4.5457105836197745e-05, "loss": 1.1616, "step": 112200 }, { "epoch": 0.1, "grad_norm": 37.5, "learning_rate": 4.545260676300682e-05, "loss": 1.1932, "step": 112300 }, { "epoch": 0.1, "grad_norm": 38.75, "learning_rate": 4.54481076898159e-05, "loss": 1.2167, "step": 112400 }, { "epoch": 0.1, "grad_norm": 536.0, "learning_rate": 4.544360861662498e-05, "loss": 1.2178, "step": 112500 }, { "epoch": 0.1, "grad_norm": 93.0, "learning_rate": 4.543910954343405e-05, "loss": 1.1768, "step": 112600 }, { "epoch": 0.1, "grad_norm": 22.375, "learning_rate": 4.5434610470243135e-05, "loss": 1.4186, "step": 112700 }, { "epoch": 0.1, "grad_norm": 28.75, "learning_rate": 4.543011139705221e-05, "loss": 1.1987, "step": 112800 }, { "epoch": 0.1, "grad_norm": 20.875, "learning_rate": 4.5425612323861286e-05, "loss": 1.216, "step": 112900 }, { "epoch": 0.1, "grad_norm": 58.5, "learning_rate": 4.542111325067037e-05, "loss": 1.2428, "step": 113000 }, { "epoch": 0.1, "grad_norm": 95.5, "learning_rate": 4.5416614177479443e-05, "loss": 1.1357, "step": 113100 }, { "epoch": 0.1, "grad_norm": 27.25, "learning_rate": 4.541211510428852e-05, "loss": 1.2233, "step": 113200 }, { "epoch": 0.1, "grad_norm": 29.5, "learning_rate": 4.5407616031097594e-05, "loss": 1.1413, "step": 113300 }, { "epoch": 0.1, "grad_norm": 56.75, "learning_rate": 4.540311695790667e-05, "loss": 1.0296, "step": 113400 }, { "epoch": 0.1, "grad_norm": 14.8125, "learning_rate": 4.539861788471575e-05, "loss": 1.136, "step": 113500 }, { "epoch": 0.1, "grad_norm": 0.07421875, "learning_rate": 4.539411881152483e-05, "loss": 1.2143, "step": 113600 }, { "epoch": 0.1, "grad_norm": 72.5, "learning_rate": 4.53896197383339e-05, "loss": 1.326, "step": 113700 }, { "epoch": 0.1, "grad_norm": 19.0, "learning_rate": 4.5385120665142984e-05, "loss": 1.2427, "step": 113800 }, { "epoch": 0.1, "grad_norm": 5.6875, "learning_rate": 4.538062159195206e-05, "loss": 1.2672, "step": 113900 }, { "epoch": 0.1, "grad_norm": 20.25, "learning_rate": 4.5376122518761135e-05, "loss": 1.2823, "step": 114000 }, { "epoch": 0.1, "grad_norm": 117.5, "learning_rate": 4.537162344557022e-05, "loss": 1.2756, "step": 114100 }, { "epoch": 0.1, "grad_norm": 1.140625, "learning_rate": 4.536712437237929e-05, "loss": 1.2124, "step": 114200 }, { "epoch": 0.1, "grad_norm": 19.375, "learning_rate": 4.5362625299188375e-05, "loss": 1.1537, "step": 114300 }, { "epoch": 0.1, "grad_norm": 12.0625, "learning_rate": 4.535812622599745e-05, "loss": 1.0495, "step": 114400 }, { "epoch": 0.1, "grad_norm": 18.75, "learning_rate": 4.535362715280652e-05, "loss": 1.2276, "step": 114500 }, { "epoch": 0.1, "grad_norm": 66.5, "learning_rate": 4.53491280796156e-05, "loss": 1.048, "step": 114600 }, { "epoch": 0.1, "grad_norm": 55.25, "learning_rate": 4.5344629006424676e-05, "loss": 1.3131, "step": 114700 }, { "epoch": 0.1, "grad_norm": 21.375, "learning_rate": 4.534012993323376e-05, "loss": 1.0428, "step": 114800 }, { "epoch": 0.1, "grad_norm": 37.5, "learning_rate": 4.5335630860042833e-05, "loss": 1.4023, "step": 114900 }, { "epoch": 0.1, "grad_norm": 185.0, "learning_rate": 4.533113178685191e-05, "loss": 1.2281, "step": 115000 }, { "epoch": 0.1, "grad_norm": 68.5, "learning_rate": 4.532663271366099e-05, "loss": 1.1706, "step": 115100 }, { "epoch": 0.1, "grad_norm": 29.0, "learning_rate": 4.5322133640470066e-05, "loss": 1.1157, "step": 115200 }, { "epoch": 0.1, "grad_norm": 24.375, "learning_rate": 4.531763456727914e-05, "loss": 1.3108, "step": 115300 }, { "epoch": 0.1, "grad_norm": 0.80859375, "learning_rate": 4.5313135494088224e-05, "loss": 1.1107, "step": 115400 }, { "epoch": 0.1, "grad_norm": 12.5625, "learning_rate": 4.53086364208973e-05, "loss": 1.09, "step": 115500 }, { "epoch": 0.1, "grad_norm": 0.40625, "learning_rate": 4.5304137347706374e-05, "loss": 1.1862, "step": 115600 }, { "epoch": 0.1, "grad_norm": 16.375, "learning_rate": 4.5299638274515456e-05, "loss": 1.2026, "step": 115700 }, { "epoch": 0.1, "grad_norm": 548.0, "learning_rate": 4.5295139201324525e-05, "loss": 1.209, "step": 115800 }, { "epoch": 0.1, "grad_norm": 0.1259765625, "learning_rate": 4.529064012813361e-05, "loss": 1.2564, "step": 115900 }, { "epoch": 0.1, "grad_norm": 43.0, "learning_rate": 4.528614105494268e-05, "loss": 1.1271, "step": 116000 }, { "epoch": 0.1, "grad_norm": 19.875, "learning_rate": 4.528164198175176e-05, "loss": 1.2558, "step": 116100 }, { "epoch": 0.1, "grad_norm": 30.5, "learning_rate": 4.527714290856084e-05, "loss": 1.1918, "step": 116200 }, { "epoch": 0.1, "grad_norm": 0.0177001953125, "learning_rate": 4.5272643835369915e-05, "loss": 1.1657, "step": 116300 }, { "epoch": 0.1, "grad_norm": 82.0, "learning_rate": 4.526814476217899e-05, "loss": 1.2569, "step": 116400 }, { "epoch": 0.1, "grad_norm": 13.8125, "learning_rate": 4.526364568898807e-05, "loss": 1.2252, "step": 116500 }, { "epoch": 0.1, "grad_norm": 0.004638671875, "learning_rate": 4.525914661579715e-05, "loss": 1.1186, "step": 116600 }, { "epoch": 0.1, "grad_norm": 42.75, "learning_rate": 4.5254647542606223e-05, "loss": 1.114, "step": 116700 }, { "epoch": 0.1, "grad_norm": 14.1875, "learning_rate": 4.5250148469415306e-05, "loss": 1.1557, "step": 116800 }, { "epoch": 0.1, "grad_norm": 144.0, "learning_rate": 4.524564939622438e-05, "loss": 1.1184, "step": 116900 }, { "epoch": 0.1, "grad_norm": 77.5, "learning_rate": 4.524115032303346e-05, "loss": 1.1887, "step": 117000 }, { "epoch": 0.1, "grad_norm": 10.375, "learning_rate": 4.523665124984253e-05, "loss": 1.3325, "step": 117100 }, { "epoch": 0.1, "grad_norm": 9.75, "learning_rate": 4.523215217665161e-05, "loss": 1.2805, "step": 117200 }, { "epoch": 0.1, "grad_norm": 7.5, "learning_rate": 4.522765310346069e-05, "loss": 1.2729, "step": 117300 }, { "epoch": 0.1, "grad_norm": 15.25, "learning_rate": 4.5223154030269764e-05, "loss": 1.1481, "step": 117400 }, { "epoch": 0.1, "grad_norm": 13.4375, "learning_rate": 4.5218654957078846e-05, "loss": 1.3263, "step": 117500 }, { "epoch": 0.1, "grad_norm": 53.25, "learning_rate": 4.521415588388792e-05, "loss": 1.1524, "step": 117600 }, { "epoch": 0.1, "grad_norm": 17.375, "learning_rate": 4.5209656810697e-05, "loss": 1.2387, "step": 117700 }, { "epoch": 0.1, "grad_norm": 117.5, "learning_rate": 4.520515773750608e-05, "loss": 1.259, "step": 117800 }, { "epoch": 0.11, "grad_norm": 12.6875, "learning_rate": 4.5200658664315155e-05, "loss": 1.2438, "step": 117900 }, { "epoch": 0.11, "grad_norm": 37.25, "learning_rate": 4.519615959112423e-05, "loss": 1.2608, "step": 118000 }, { "epoch": 0.11, "grad_norm": 19.875, "learning_rate": 4.519166051793331e-05, "loss": 1.2646, "step": 118100 }, { "epoch": 0.11, "grad_norm": 29.875, "learning_rate": 4.518716144474239e-05, "loss": 1.1934, "step": 118200 }, { "epoch": 0.11, "grad_norm": 11.125, "learning_rate": 4.518266237155146e-05, "loss": 1.2037, "step": 118300 }, { "epoch": 0.11, "grad_norm": 61.25, "learning_rate": 4.517816329836054e-05, "loss": 1.0696, "step": 118400 }, { "epoch": 0.11, "grad_norm": 31.75, "learning_rate": 4.5173664225169613e-05, "loss": 1.2027, "step": 118500 }, { "epoch": 0.11, "grad_norm": 149.0, "learning_rate": 4.5169165151978696e-05, "loss": 1.3522, "step": 118600 }, { "epoch": 0.11, "grad_norm": 12.4375, "learning_rate": 4.516466607878777e-05, "loss": 1.0956, "step": 118700 }, { "epoch": 0.11, "grad_norm": 45.75, "learning_rate": 4.5160167005596846e-05, "loss": 1.3911, "step": 118800 }, { "epoch": 0.11, "grad_norm": 68.0, "learning_rate": 4.515566793240593e-05, "loss": 1.1735, "step": 118900 }, { "epoch": 0.11, "grad_norm": 43.25, "learning_rate": 4.5151168859215004e-05, "loss": 1.2757, "step": 119000 }, { "epoch": 0.11, "grad_norm": 19.375, "learning_rate": 4.514666978602408e-05, "loss": 1.1572, "step": 119100 }, { "epoch": 0.11, "grad_norm": 108.0, "learning_rate": 4.514217071283316e-05, "loss": 1.2605, "step": 119200 }, { "epoch": 0.11, "grad_norm": 61.25, "learning_rate": 4.5137671639642236e-05, "loss": 1.1866, "step": 119300 }, { "epoch": 0.11, "grad_norm": 37.75, "learning_rate": 4.513317256645131e-05, "loss": 1.3153, "step": 119400 }, { "epoch": 0.11, "grad_norm": 0.8203125, "learning_rate": 4.5128673493260394e-05, "loss": 1.2142, "step": 119500 }, { "epoch": 0.11, "grad_norm": 14.875, "learning_rate": 4.512417442006947e-05, "loss": 1.1503, "step": 119600 }, { "epoch": 0.11, "grad_norm": 34.0, "learning_rate": 4.5119675346878545e-05, "loss": 1.1278, "step": 119700 }, { "epoch": 0.11, "grad_norm": 195.0, "learning_rate": 4.511517627368762e-05, "loss": 1.1959, "step": 119800 }, { "epoch": 0.11, "grad_norm": 247.0, "learning_rate": 4.5110677200496695e-05, "loss": 1.2427, "step": 119900 }, { "epoch": 0.11, "grad_norm": 47.0, "learning_rate": 4.510617812730578e-05, "loss": 1.3063, "step": 120000 }, { "epoch": 0.11, "grad_norm": 28.875, "learning_rate": 4.510167905411485e-05, "loss": 1.2894, "step": 120100 }, { "epoch": 0.11, "grad_norm": 36.75, "learning_rate": 4.5097179980923935e-05, "loss": 1.3332, "step": 120200 }, { "epoch": 0.11, "grad_norm": 11.375, "learning_rate": 4.509268090773301e-05, "loss": 1.1714, "step": 120300 }, { "epoch": 0.11, "grad_norm": 20.375, "learning_rate": 4.5088181834542086e-05, "loss": 1.3889, "step": 120400 }, { "epoch": 0.11, "grad_norm": 23.875, "learning_rate": 4.508368276135117e-05, "loss": 1.3379, "step": 120500 }, { "epoch": 0.11, "grad_norm": 92.5, "learning_rate": 4.507918368816024e-05, "loss": 1.1349, "step": 120600 }, { "epoch": 0.11, "grad_norm": 15.125, "learning_rate": 4.507468461496932e-05, "loss": 1.0149, "step": 120700 }, { "epoch": 0.11, "grad_norm": 33.75, "learning_rate": 4.50701855417784e-05, "loss": 1.2801, "step": 120800 }, { "epoch": 0.11, "grad_norm": 110.5, "learning_rate": 4.5065686468587476e-05, "loss": 1.0912, "step": 120900 }, { "epoch": 0.11, "grad_norm": 20.25, "learning_rate": 4.506118739539655e-05, "loss": 1.2497, "step": 121000 }, { "epoch": 0.11, "grad_norm": 23.5, "learning_rate": 4.5056688322205626e-05, "loss": 1.2293, "step": 121100 }, { "epoch": 0.11, "grad_norm": 39.0, "learning_rate": 4.50521892490147e-05, "loss": 1.2091, "step": 121200 }, { "epoch": 0.11, "grad_norm": 0.033447265625, "learning_rate": 4.5047690175823784e-05, "loss": 1.1963, "step": 121300 }, { "epoch": 0.11, "grad_norm": 58.75, "learning_rate": 4.504319110263286e-05, "loss": 1.1702, "step": 121400 }, { "epoch": 0.11, "grad_norm": 127.0, "learning_rate": 4.5038692029441935e-05, "loss": 1.2308, "step": 121500 }, { "epoch": 0.11, "grad_norm": 9.5625, "learning_rate": 4.503419295625102e-05, "loss": 1.3036, "step": 121600 }, { "epoch": 0.11, "grad_norm": 52.0, "learning_rate": 4.502969388306009e-05, "loss": 1.0735, "step": 121700 }, { "epoch": 0.11, "grad_norm": 60.0, "learning_rate": 4.502519480986917e-05, "loss": 1.2377, "step": 121800 }, { "epoch": 0.11, "grad_norm": 39.0, "learning_rate": 4.502069573667825e-05, "loss": 1.1758, "step": 121900 }, { "epoch": 0.11, "grad_norm": 31.125, "learning_rate": 4.5016196663487325e-05, "loss": 1.3948, "step": 122000 }, { "epoch": 0.11, "grad_norm": 27.625, "learning_rate": 4.50116975902964e-05, "loss": 1.1844, "step": 122100 }, { "epoch": 0.11, "grad_norm": 67.0, "learning_rate": 4.500719851710548e-05, "loss": 1.1769, "step": 122200 }, { "epoch": 0.11, "grad_norm": 31.5, "learning_rate": 4.500269944391455e-05, "loss": 1.2587, "step": 122300 }, { "epoch": 0.11, "grad_norm": 15.4375, "learning_rate": 4.499820037072363e-05, "loss": 1.2565, "step": 122400 }, { "epoch": 0.11, "grad_norm": 64.5, "learning_rate": 4.499370129753271e-05, "loss": 1.152, "step": 122500 }, { "epoch": 0.11, "grad_norm": 22.125, "learning_rate": 4.4989202224341784e-05, "loss": 1.1894, "step": 122600 }, { "epoch": 0.11, "grad_norm": 0.08544921875, "learning_rate": 4.4984703151150866e-05, "loss": 1.1433, "step": 122700 }, { "epoch": 0.11, "grad_norm": 138.0, "learning_rate": 4.498020407795994e-05, "loss": 1.1514, "step": 122800 }, { "epoch": 0.11, "grad_norm": 39.0, "learning_rate": 4.4975705004769016e-05, "loss": 1.1807, "step": 122900 }, { "epoch": 0.11, "grad_norm": 20.125, "learning_rate": 4.49712059315781e-05, "loss": 1.2844, "step": 123000 }, { "epoch": 0.11, "grad_norm": 9.9375, "learning_rate": 4.4966706858387174e-05, "loss": 1.1473, "step": 123100 }, { "epoch": 0.11, "grad_norm": 45.75, "learning_rate": 4.4962207785196256e-05, "loss": 1.175, "step": 123200 }, { "epoch": 0.11, "grad_norm": 16.875, "learning_rate": 4.495770871200533e-05, "loss": 1.3394, "step": 123300 }, { "epoch": 0.11, "grad_norm": 68.0, "learning_rate": 4.495320963881441e-05, "loss": 1.1536, "step": 123400 }, { "epoch": 0.11, "grad_norm": 15.4375, "learning_rate": 4.494871056562349e-05, "loss": 1.2805, "step": 123500 }, { "epoch": 0.11, "grad_norm": 0.2099609375, "learning_rate": 4.494421149243256e-05, "loss": 1.2155, "step": 123600 }, { "epoch": 0.11, "grad_norm": 21.125, "learning_rate": 4.493971241924164e-05, "loss": 1.2393, "step": 123700 }, { "epoch": 0.11, "grad_norm": 33.0, "learning_rate": 4.4935213346050715e-05, "loss": 1.2623, "step": 123800 }, { "epoch": 0.11, "grad_norm": 40.0, "learning_rate": 4.493071427285979e-05, "loss": 1.2894, "step": 123900 }, { "epoch": 0.11, "grad_norm": 14.0, "learning_rate": 4.492621519966887e-05, "loss": 1.273, "step": 124000 }, { "epoch": 0.11, "grad_norm": 13.1875, "learning_rate": 4.492171612647795e-05, "loss": 1.0763, "step": 124100 }, { "epoch": 0.11, "grad_norm": 18.125, "learning_rate": 4.491721705328702e-05, "loss": 1.2579, "step": 124200 }, { "epoch": 0.11, "grad_norm": 53.25, "learning_rate": 4.4912717980096105e-05, "loss": 1.4472, "step": 124300 }, { "epoch": 0.11, "grad_norm": 35.5, "learning_rate": 4.490821890690518e-05, "loss": 1.1988, "step": 124400 }, { "epoch": 0.11, "grad_norm": 35.75, "learning_rate": 4.4903719833714256e-05, "loss": 1.5536, "step": 124500 }, { "epoch": 0.11, "grad_norm": 19.875, "learning_rate": 4.489922076052334e-05, "loss": 1.1916, "step": 124600 }, { "epoch": 0.11, "grad_norm": 28.25, "learning_rate": 4.489472168733241e-05, "loss": 1.2056, "step": 124700 }, { "epoch": 0.11, "grad_norm": 22.25, "learning_rate": 4.489022261414149e-05, "loss": 1.1438, "step": 124800 }, { "epoch": 0.11, "grad_norm": 0.1484375, "learning_rate": 4.4885723540950564e-05, "loss": 1.2385, "step": 124900 }, { "epoch": 0.11, "grad_norm": 44.5, "learning_rate": 4.488122446775964e-05, "loss": 1.2607, "step": 125000 }, { "epoch": 0.11, "grad_norm": 26.625, "learning_rate": 4.487672539456872e-05, "loss": 1.2741, "step": 125100 }, { "epoch": 0.11, "grad_norm": 18.625, "learning_rate": 4.48722263213778e-05, "loss": 1.3419, "step": 125200 }, { "epoch": 0.11, "grad_norm": 13.5625, "learning_rate": 4.486772724818687e-05, "loss": 1.1937, "step": 125300 }, { "epoch": 0.11, "grad_norm": 48.5, "learning_rate": 4.4863228174995954e-05, "loss": 1.252, "step": 125400 }, { "epoch": 0.11, "grad_norm": 0.0089111328125, "learning_rate": 4.485872910180503e-05, "loss": 1.1599, "step": 125500 }, { "epoch": 0.11, "grad_norm": 23.625, "learning_rate": 4.4854230028614105e-05, "loss": 1.1953, "step": 125600 }, { "epoch": 0.11, "grad_norm": 41.5, "learning_rate": 4.484973095542319e-05, "loss": 1.0053, "step": 125700 }, { "epoch": 0.11, "grad_norm": 166.0, "learning_rate": 4.484523188223226e-05, "loss": 1.3735, "step": 125800 }, { "epoch": 0.11, "grad_norm": 161.0, "learning_rate": 4.4840732809041344e-05, "loss": 1.1931, "step": 125900 }, { "epoch": 0.11, "grad_norm": 48.0, "learning_rate": 4.483623373585042e-05, "loss": 1.3276, "step": 126000 }, { "epoch": 0.11, "grad_norm": 0.0015716552734375, "learning_rate": 4.4831734662659495e-05, "loss": 1.3037, "step": 126100 }, { "epoch": 0.11, "grad_norm": 0.376953125, "learning_rate": 4.482723558946857e-05, "loss": 1.2961, "step": 126200 }, { "epoch": 0.11, "grad_norm": 0.0458984375, "learning_rate": 4.4822736516277646e-05, "loss": 1.0874, "step": 126300 }, { "epoch": 0.11, "grad_norm": 23.5, "learning_rate": 4.481823744308673e-05, "loss": 1.1462, "step": 126400 }, { "epoch": 0.11, "grad_norm": 42.0, "learning_rate": 4.48137383698958e-05, "loss": 1.2447, "step": 126500 }, { "epoch": 0.11, "grad_norm": 185.0, "learning_rate": 4.480923929670488e-05, "loss": 1.2716, "step": 126600 }, { "epoch": 0.11, "grad_norm": 320.0, "learning_rate": 4.480474022351396e-05, "loss": 1.1433, "step": 126700 }, { "epoch": 0.11, "grad_norm": 75.0, "learning_rate": 4.4800241150323036e-05, "loss": 1.1822, "step": 126800 }, { "epoch": 0.11, "grad_norm": 10.6875, "learning_rate": 4.479574207713211e-05, "loss": 1.209, "step": 126900 }, { "epoch": 0.11, "grad_norm": 16.125, "learning_rate": 4.4791243003941193e-05, "loss": 1.2751, "step": 127000 }, { "epoch": 0.11, "grad_norm": 0.09033203125, "learning_rate": 4.478674393075027e-05, "loss": 1.3464, "step": 127100 }, { "epoch": 0.11, "grad_norm": 21.125, "learning_rate": 4.4782244857559344e-05, "loss": 1.1065, "step": 127200 }, { "epoch": 0.11, "grad_norm": 64.0, "learning_rate": 4.4777745784368426e-05, "loss": 1.1304, "step": 127300 }, { "epoch": 0.11, "grad_norm": 43.25, "learning_rate": 4.47732467111775e-05, "loss": 1.4005, "step": 127400 }, { "epoch": 0.11, "grad_norm": 28.125, "learning_rate": 4.476874763798658e-05, "loss": 1.2676, "step": 127500 }, { "epoch": 0.11, "grad_norm": 26.25, "learning_rate": 4.476424856479565e-05, "loss": 1.2897, "step": 127600 }, { "epoch": 0.11, "grad_norm": 24.5, "learning_rate": 4.475974949160473e-05, "loss": 1.162, "step": 127700 }, { "epoch": 0.11, "grad_norm": 18.625, "learning_rate": 4.475525041841381e-05, "loss": 1.1439, "step": 127800 }, { "epoch": 0.11, "grad_norm": 0.0322265625, "learning_rate": 4.4750751345222885e-05, "loss": 1.0786, "step": 127900 }, { "epoch": 0.11, "grad_norm": 26.25, "learning_rate": 4.474625227203196e-05, "loss": 1.1313, "step": 128000 }, { "epoch": 0.11, "grad_norm": 13.25, "learning_rate": 4.474175319884104e-05, "loss": 1.0268, "step": 128100 }, { "epoch": 0.11, "grad_norm": 8.625, "learning_rate": 4.473725412565012e-05, "loss": 1.3999, "step": 128200 }, { "epoch": 0.11, "grad_norm": 28.125, "learning_rate": 4.473275505245919e-05, "loss": 0.9953, "step": 128300 }, { "epoch": 0.11, "grad_norm": 22.625, "learning_rate": 4.4728255979268275e-05, "loss": 1.1821, "step": 128400 }, { "epoch": 0.11, "grad_norm": 133.0, "learning_rate": 4.472375690607735e-05, "loss": 1.1824, "step": 128500 }, { "epoch": 0.11, "grad_norm": 0.06884765625, "learning_rate": 4.471925783288643e-05, "loss": 1.1332, "step": 128600 }, { "epoch": 0.11, "grad_norm": 77.5, "learning_rate": 4.471475875969551e-05, "loss": 1.3097, "step": 128700 }, { "epoch": 0.11, "grad_norm": 200.0, "learning_rate": 4.471025968650458e-05, "loss": 1.1262, "step": 128800 }, { "epoch": 0.11, "grad_norm": 39.5, "learning_rate": 4.470576061331366e-05, "loss": 1.4088, "step": 128900 }, { "epoch": 0.11, "grad_norm": 31.625, "learning_rate": 4.4701261540122734e-05, "loss": 1.352, "step": 129000 }, { "epoch": 0.12, "grad_norm": 28.125, "learning_rate": 4.4696762466931816e-05, "loss": 1.2171, "step": 129100 }, { "epoch": 0.12, "grad_norm": 57.25, "learning_rate": 4.469226339374089e-05, "loss": 1.2184, "step": 129200 }, { "epoch": 0.12, "grad_norm": 18.0, "learning_rate": 4.468776432054997e-05, "loss": 1.2715, "step": 129300 }, { "epoch": 0.12, "grad_norm": 93.5, "learning_rate": 4.468326524735905e-05, "loss": 1.2901, "step": 129400 }, { "epoch": 0.12, "grad_norm": 48.75, "learning_rate": 4.4678766174168124e-05, "loss": 1.1037, "step": 129500 }, { "epoch": 0.12, "grad_norm": 49.25, "learning_rate": 4.46742671009772e-05, "loss": 1.1158, "step": 129600 }, { "epoch": 0.12, "grad_norm": 18.625, "learning_rate": 4.466976802778628e-05, "loss": 1.1503, "step": 129700 }, { "epoch": 0.12, "grad_norm": 39.5, "learning_rate": 4.466526895459536e-05, "loss": 1.2502, "step": 129800 }, { "epoch": 0.12, "grad_norm": 29.375, "learning_rate": 4.466076988140443e-05, "loss": 1.2901, "step": 129900 }, { "epoch": 0.12, "grad_norm": 27.5, "learning_rate": 4.4656270808213515e-05, "loss": 1.4238, "step": 130000 }, { "epoch": 0.12, "grad_norm": 55.75, "learning_rate": 4.465177173502258e-05, "loss": 1.1832, "step": 130100 }, { "epoch": 0.12, "grad_norm": 0.314453125, "learning_rate": 4.4647272661831665e-05, "loss": 1.25, "step": 130200 }, { "epoch": 0.12, "grad_norm": 9.4375, "learning_rate": 4.464277358864074e-05, "loss": 1.1483, "step": 130300 }, { "epoch": 0.12, "grad_norm": 102.5, "learning_rate": 4.4638274515449816e-05, "loss": 1.1954, "step": 130400 }, { "epoch": 0.12, "grad_norm": 46.25, "learning_rate": 4.46337754422589e-05, "loss": 1.2708, "step": 130500 }, { "epoch": 0.12, "grad_norm": 40.25, "learning_rate": 4.4629276369067973e-05, "loss": 1.3658, "step": 130600 }, { "epoch": 0.12, "grad_norm": 0.5625, "learning_rate": 4.462477729587705e-05, "loss": 1.1088, "step": 130700 }, { "epoch": 0.12, "grad_norm": 30.375, "learning_rate": 4.462027822268613e-05, "loss": 1.2477, "step": 130800 }, { "epoch": 0.12, "grad_norm": 44.25, "learning_rate": 4.4615779149495206e-05, "loss": 1.2206, "step": 130900 }, { "epoch": 0.12, "grad_norm": 1.125, "learning_rate": 4.461128007630428e-05, "loss": 1.1262, "step": 131000 }, { "epoch": 0.12, "grad_norm": 49.75, "learning_rate": 4.4606781003113364e-05, "loss": 1.1537, "step": 131100 }, { "epoch": 0.12, "grad_norm": 0.5546875, "learning_rate": 4.460228192992244e-05, "loss": 1.1545, "step": 131200 }, { "epoch": 0.12, "grad_norm": 42.25, "learning_rate": 4.459778285673152e-05, "loss": 1.2642, "step": 131300 }, { "epoch": 0.12, "grad_norm": 14.0, "learning_rate": 4.459328378354059e-05, "loss": 1.0595, "step": 131400 }, { "epoch": 0.12, "grad_norm": 21.5, "learning_rate": 4.4588784710349665e-05, "loss": 1.223, "step": 131500 }, { "epoch": 0.12, "grad_norm": 14.8125, "learning_rate": 4.458428563715875e-05, "loss": 1.0834, "step": 131600 }, { "epoch": 0.12, "grad_norm": 46.75, "learning_rate": 4.457978656396782e-05, "loss": 1.2556, "step": 131700 }, { "epoch": 0.12, "grad_norm": 26.75, "learning_rate": 4.4575287490776905e-05, "loss": 1.2542, "step": 131800 }, { "epoch": 0.12, "grad_norm": 7.6875, "learning_rate": 4.457078841758598e-05, "loss": 1.1523, "step": 131900 }, { "epoch": 0.12, "grad_norm": 28.25, "learning_rate": 4.4566289344395055e-05, "loss": 1.2207, "step": 132000 }, { "epoch": 0.12, "grad_norm": 0.0245361328125, "learning_rate": 4.456179027120414e-05, "loss": 1.2154, "step": 132100 }, { "epoch": 0.12, "grad_norm": 8.0625, "learning_rate": 4.455729119801321e-05, "loss": 1.2426, "step": 132200 }, { "epoch": 0.12, "grad_norm": 45.0, "learning_rate": 4.455279212482229e-05, "loss": 1.2117, "step": 132300 }, { "epoch": 0.12, "grad_norm": 8.5625, "learning_rate": 4.454829305163137e-05, "loss": 1.1347, "step": 132400 }, { "epoch": 0.12, "grad_norm": 22.875, "learning_rate": 4.4543793978440446e-05, "loss": 1.14, "step": 132500 }, { "epoch": 0.12, "grad_norm": 64.5, "learning_rate": 4.453929490524952e-05, "loss": 1.2401, "step": 132600 }, { "epoch": 0.12, "grad_norm": 42.25, "learning_rate": 4.4534795832058596e-05, "loss": 1.1757, "step": 132700 }, { "epoch": 0.12, "grad_norm": 0.0052490234375, "learning_rate": 4.453029675886767e-05, "loss": 1.2388, "step": 132800 }, { "epoch": 0.12, "grad_norm": 0.349609375, "learning_rate": 4.4525797685676754e-05, "loss": 1.2856, "step": 132900 }, { "epoch": 0.12, "grad_norm": 97.5, "learning_rate": 4.452129861248583e-05, "loss": 1.1789, "step": 133000 }, { "epoch": 0.12, "grad_norm": 119.5, "learning_rate": 4.4516799539294904e-05, "loss": 1.0874, "step": 133100 }, { "epoch": 0.12, "grad_norm": 0.302734375, "learning_rate": 4.4512300466103986e-05, "loss": 1.2439, "step": 133200 }, { "epoch": 0.12, "grad_norm": 79.0, "learning_rate": 4.450780139291306e-05, "loss": 1.2389, "step": 133300 }, { "epoch": 0.12, "grad_norm": 23.625, "learning_rate": 4.450330231972214e-05, "loss": 1.2274, "step": 133400 }, { "epoch": 0.12, "grad_norm": 21.5, "learning_rate": 4.449880324653122e-05, "loss": 1.188, "step": 133500 }, { "epoch": 0.12, "grad_norm": 17.375, "learning_rate": 4.4494304173340295e-05, "loss": 1.1631, "step": 133600 }, { "epoch": 0.12, "grad_norm": 73.0, "learning_rate": 4.448980510014937e-05, "loss": 1.231, "step": 133700 }, { "epoch": 0.12, "grad_norm": 13.0, "learning_rate": 4.448530602695845e-05, "loss": 1.2178, "step": 133800 }, { "epoch": 0.12, "grad_norm": 51.5, "learning_rate": 4.448080695376753e-05, "loss": 1.2033, "step": 133900 }, { "epoch": 0.12, "grad_norm": 0.01318359375, "learning_rate": 4.44763078805766e-05, "loss": 1.3142, "step": 134000 }, { "epoch": 0.12, "grad_norm": 189.0, "learning_rate": 4.447180880738568e-05, "loss": 1.0322, "step": 134100 }, { "epoch": 0.12, "grad_norm": 13.625, "learning_rate": 4.4467309734194753e-05, "loss": 1.0918, "step": 134200 }, { "epoch": 0.12, "grad_norm": 9.1875, "learning_rate": 4.4462810661003836e-05, "loss": 1.3872, "step": 134300 }, { "epoch": 0.12, "grad_norm": 21.5, "learning_rate": 4.445831158781291e-05, "loss": 1.0979, "step": 134400 }, { "epoch": 0.12, "grad_norm": 94.0, "learning_rate": 4.445381251462199e-05, "loss": 1.1704, "step": 134500 }, { "epoch": 0.12, "grad_norm": 29.75, "learning_rate": 4.444931344143107e-05, "loss": 1.3263, "step": 134600 }, { "epoch": 0.12, "grad_norm": 22.0, "learning_rate": 4.4444814368240144e-05, "loss": 1.1985, "step": 134700 }, { "epoch": 0.12, "grad_norm": 33.0, "learning_rate": 4.4440315295049226e-05, "loss": 1.1766, "step": 134800 }, { "epoch": 0.12, "grad_norm": 0.1650390625, "learning_rate": 4.44358162218583e-05, "loss": 1.2234, "step": 134900 }, { "epoch": 0.12, "grad_norm": 29.625, "learning_rate": 4.4431317148667376e-05, "loss": 1.0468, "step": 135000 }, { "epoch": 0.12, "grad_norm": 40.75, "learning_rate": 4.442681807547646e-05, "loss": 1.2107, "step": 135100 }, { "epoch": 0.12, "grad_norm": 57.25, "learning_rate": 4.4422319002285534e-05, "loss": 1.2141, "step": 135200 }, { "epoch": 0.12, "grad_norm": 35.25, "learning_rate": 4.441781992909461e-05, "loss": 1.0996, "step": 135300 }, { "epoch": 0.12, "grad_norm": 34.5, "learning_rate": 4.4413320855903685e-05, "loss": 1.0985, "step": 135400 }, { "epoch": 0.12, "grad_norm": 15.75, "learning_rate": 4.440882178271276e-05, "loss": 1.16, "step": 135500 }, { "epoch": 0.12, "grad_norm": 31.625, "learning_rate": 4.440432270952184e-05, "loss": 1.2288, "step": 135600 }, { "epoch": 0.12, "grad_norm": 0.006591796875, "learning_rate": 4.439982363633092e-05, "loss": 1.1724, "step": 135700 }, { "epoch": 0.12, "grad_norm": 27.125, "learning_rate": 4.439532456313999e-05, "loss": 1.2668, "step": 135800 }, { "epoch": 0.12, "grad_norm": 18.25, "learning_rate": 4.4390825489949075e-05, "loss": 1.2629, "step": 135900 }, { "epoch": 0.12, "grad_norm": 18.375, "learning_rate": 4.438632641675815e-05, "loss": 1.28, "step": 136000 }, { "epoch": 0.12, "grad_norm": 47.25, "learning_rate": 4.4381827343567226e-05, "loss": 1.03, "step": 136100 }, { "epoch": 0.12, "grad_norm": 21.5, "learning_rate": 4.437732827037631e-05, "loss": 1.1627, "step": 136200 }, { "epoch": 0.12, "grad_norm": 19.375, "learning_rate": 4.437282919718538e-05, "loss": 1.3236, "step": 136300 }, { "epoch": 0.12, "grad_norm": 17.875, "learning_rate": 4.436833012399446e-05, "loss": 1.1586, "step": 136400 }, { "epoch": 0.12, "grad_norm": 27.875, "learning_rate": 4.436383105080354e-05, "loss": 1.1852, "step": 136500 }, { "epoch": 0.12, "grad_norm": 15.5, "learning_rate": 4.435933197761261e-05, "loss": 1.077, "step": 136600 }, { "epoch": 0.12, "grad_norm": 72.0, "learning_rate": 4.435483290442169e-05, "loss": 1.3186, "step": 136700 }, { "epoch": 0.12, "grad_norm": 55.75, "learning_rate": 4.4350333831230766e-05, "loss": 1.1928, "step": 136800 }, { "epoch": 0.12, "grad_norm": 79.0, "learning_rate": 4.434583475803984e-05, "loss": 1.2576, "step": 136900 }, { "epoch": 0.12, "grad_norm": 20.875, "learning_rate": 4.4341335684848924e-05, "loss": 1.1306, "step": 137000 }, { "epoch": 0.12, "grad_norm": 0.00131988525390625, "learning_rate": 4.4336836611658e-05, "loss": 1.2011, "step": 137100 }, { "epoch": 0.12, "grad_norm": 91.5, "learning_rate": 4.433233753846708e-05, "loss": 1.4379, "step": 137200 }, { "epoch": 0.12, "grad_norm": 49.75, "learning_rate": 4.432783846527616e-05, "loss": 1.2621, "step": 137300 }, { "epoch": 0.12, "grad_norm": 49.0, "learning_rate": 4.432333939208523e-05, "loss": 1.2022, "step": 137400 }, { "epoch": 0.12, "grad_norm": 51.25, "learning_rate": 4.4318840318894314e-05, "loss": 1.1237, "step": 137500 }, { "epoch": 0.12, "grad_norm": 19.125, "learning_rate": 4.431434124570339e-05, "loss": 1.1618, "step": 137600 }, { "epoch": 0.12, "grad_norm": 27.5, "learning_rate": 4.4309842172512465e-05, "loss": 1.1117, "step": 137700 }, { "epoch": 0.12, "grad_norm": 66.0, "learning_rate": 4.430534309932155e-05, "loss": 1.1636, "step": 137800 }, { "epoch": 0.12, "grad_norm": 25.125, "learning_rate": 4.4300844026130616e-05, "loss": 1.1296, "step": 137900 }, { "epoch": 0.12, "grad_norm": 41.75, "learning_rate": 4.42963449529397e-05, "loss": 1.1772, "step": 138000 }, { "epoch": 0.12, "grad_norm": 22.25, "learning_rate": 4.429184587974877e-05, "loss": 1.1432, "step": 138100 }, { "epoch": 0.12, "grad_norm": 45.25, "learning_rate": 4.428734680655785e-05, "loss": 1.1964, "step": 138200 }, { "epoch": 0.12, "grad_norm": 24.75, "learning_rate": 4.428284773336693e-05, "loss": 1.0238, "step": 138300 }, { "epoch": 0.12, "grad_norm": 0.435546875, "learning_rate": 4.4278348660176006e-05, "loss": 1.3285, "step": 138400 }, { "epoch": 0.12, "grad_norm": 8.375, "learning_rate": 4.427384958698508e-05, "loss": 1.2392, "step": 138500 }, { "epoch": 0.12, "grad_norm": 62.75, "learning_rate": 4.426935051379416e-05, "loss": 1.1838, "step": 138600 }, { "epoch": 0.12, "grad_norm": 44.5, "learning_rate": 4.426485144060324e-05, "loss": 1.5125, "step": 138700 }, { "epoch": 0.12, "grad_norm": 81.5, "learning_rate": 4.4260352367412314e-05, "loss": 1.1888, "step": 138800 }, { "epoch": 0.12, "grad_norm": 11.875, "learning_rate": 4.4255853294221396e-05, "loss": 1.1931, "step": 138900 }, { "epoch": 0.12, "grad_norm": 28.125, "learning_rate": 4.425135422103047e-05, "loss": 1.1716, "step": 139000 }, { "epoch": 0.12, "grad_norm": 19.75, "learning_rate": 4.424685514783955e-05, "loss": 1.1646, "step": 139100 }, { "epoch": 0.12, "grad_norm": 28.625, "learning_rate": 4.424235607464862e-05, "loss": 1.0503, "step": 139200 }, { "epoch": 0.12, "grad_norm": 34.5, "learning_rate": 4.42378570014577e-05, "loss": 1.083, "step": 139300 }, { "epoch": 0.12, "grad_norm": 92.5, "learning_rate": 4.423335792826678e-05, "loss": 1.2003, "step": 139400 }, { "epoch": 0.12, "grad_norm": 9.3125, "learning_rate": 4.4228858855075855e-05, "loss": 1.1024, "step": 139500 }, { "epoch": 0.12, "grad_norm": 21.375, "learning_rate": 4.422435978188493e-05, "loss": 1.2006, "step": 139600 }, { "epoch": 0.12, "grad_norm": 0.01055908203125, "learning_rate": 4.421986070869401e-05, "loss": 1.0275, "step": 139700 }, { "epoch": 0.12, "grad_norm": 106.0, "learning_rate": 4.421536163550309e-05, "loss": 1.1647, "step": 139800 }, { "epoch": 0.12, "grad_norm": 0.00885009765625, "learning_rate": 4.421086256231216e-05, "loss": 1.3076, "step": 139900 }, { "epoch": 0.12, "grad_norm": 0.07763671875, "learning_rate": 4.4206363489121245e-05, "loss": 1.1696, "step": 140000 }, { "epoch": 0.12, "grad_norm": 41.5, "learning_rate": 4.420186441593032e-05, "loss": 1.3154, "step": 140100 }, { "epoch": 0.12, "grad_norm": 24.125, "learning_rate": 4.41973653427394e-05, "loss": 1.1747, "step": 140200 }, { "epoch": 0.12, "grad_norm": 38.75, "learning_rate": 4.419286626954848e-05, "loss": 1.2746, "step": 140300 }, { "epoch": 0.13, "grad_norm": 33.75, "learning_rate": 4.418836719635755e-05, "loss": 1.1907, "step": 140400 }, { "epoch": 0.13, "grad_norm": 41.5, "learning_rate": 4.418386812316663e-05, "loss": 1.3017, "step": 140500 }, { "epoch": 0.13, "grad_norm": 10.6875, "learning_rate": 4.4179369049975704e-05, "loss": 1.1746, "step": 140600 }, { "epoch": 0.13, "grad_norm": 19.5, "learning_rate": 4.4174869976784786e-05, "loss": 1.384, "step": 140700 }, { "epoch": 0.13, "grad_norm": 30.5, "learning_rate": 4.417037090359386e-05, "loss": 0.912, "step": 140800 }, { "epoch": 0.13, "grad_norm": 52.0, "learning_rate": 4.416587183040294e-05, "loss": 1.1244, "step": 140900 }, { "epoch": 0.13, "grad_norm": 38.5, "learning_rate": 4.416137275721202e-05, "loss": 1.089, "step": 141000 }, { "epoch": 0.13, "grad_norm": 21.75, "learning_rate": 4.4156873684021094e-05, "loss": 1.1577, "step": 141100 }, { "epoch": 0.13, "grad_norm": 11.8125, "learning_rate": 4.415237461083017e-05, "loss": 1.1959, "step": 141200 }, { "epoch": 0.13, "grad_norm": 14.125, "learning_rate": 4.414787553763925e-05, "loss": 1.1803, "step": 141300 }, { "epoch": 0.13, "grad_norm": 15.1875, "learning_rate": 4.414337646444833e-05, "loss": 1.2907, "step": 141400 }, { "epoch": 0.13, "grad_norm": 12.6875, "learning_rate": 4.41388773912574e-05, "loss": 1.0964, "step": 141500 }, { "epoch": 0.13, "grad_norm": 146.0, "learning_rate": 4.4134378318066484e-05, "loss": 1.1939, "step": 141600 }, { "epoch": 0.13, "grad_norm": 8.0625, "learning_rate": 4.412987924487556e-05, "loss": 1.3428, "step": 141700 }, { "epoch": 0.13, "grad_norm": 32.75, "learning_rate": 4.4125380171684635e-05, "loss": 1.1379, "step": 141800 }, { "epoch": 0.13, "grad_norm": 28.875, "learning_rate": 4.412088109849371e-05, "loss": 1.1182, "step": 141900 }, { "epoch": 0.13, "grad_norm": 3.515625, "learning_rate": 4.4116382025302786e-05, "loss": 1.0874, "step": 142000 }, { "epoch": 0.13, "grad_norm": 17.5, "learning_rate": 4.411188295211187e-05, "loss": 1.2566, "step": 142100 }, { "epoch": 0.13, "grad_norm": 56.25, "learning_rate": 4.410738387892094e-05, "loss": 1.3614, "step": 142200 }, { "epoch": 0.13, "grad_norm": 22.0, "learning_rate": 4.410288480573002e-05, "loss": 1.1822, "step": 142300 }, { "epoch": 0.13, "grad_norm": 26.625, "learning_rate": 4.40983857325391e-05, "loss": 1.1969, "step": 142400 }, { "epoch": 0.13, "grad_norm": 39.25, "learning_rate": 4.4093886659348176e-05, "loss": 1.2387, "step": 142500 }, { "epoch": 0.13, "grad_norm": 18.375, "learning_rate": 4.408938758615725e-05, "loss": 1.1866, "step": 142600 }, { "epoch": 0.13, "grad_norm": 85.5, "learning_rate": 4.4084888512966333e-05, "loss": 1.3847, "step": 142700 }, { "epoch": 0.13, "grad_norm": 109.5, "learning_rate": 4.408038943977541e-05, "loss": 1.1563, "step": 142800 }, { "epoch": 0.13, "grad_norm": 36.5, "learning_rate": 4.407589036658449e-05, "loss": 1.0689, "step": 142900 }, { "epoch": 0.13, "grad_norm": 53.25, "learning_rate": 4.4071391293393566e-05, "loss": 1.1359, "step": 143000 }, { "epoch": 0.13, "grad_norm": 26.375, "learning_rate": 4.4066892220202635e-05, "loss": 1.1186, "step": 143100 }, { "epoch": 0.13, "grad_norm": 20.0, "learning_rate": 4.406239314701172e-05, "loss": 1.1483, "step": 143200 }, { "epoch": 0.13, "grad_norm": 42.75, "learning_rate": 4.405789407382079e-05, "loss": 1.416, "step": 143300 }, { "epoch": 0.13, "grad_norm": 390.0, "learning_rate": 4.4053395000629874e-05, "loss": 1.1409, "step": 143400 }, { "epoch": 0.13, "grad_norm": 0.416015625, "learning_rate": 4.404889592743895e-05, "loss": 1.2053, "step": 143500 }, { "epoch": 0.13, "grad_norm": 36.0, "learning_rate": 4.4044396854248025e-05, "loss": 0.9599, "step": 143600 }, { "epoch": 0.13, "grad_norm": 0.09619140625, "learning_rate": 4.403989778105711e-05, "loss": 1.2231, "step": 143700 }, { "epoch": 0.13, "grad_norm": 51.5, "learning_rate": 4.403539870786618e-05, "loss": 1.41, "step": 143800 }, { "epoch": 0.13, "grad_norm": 40.5, "learning_rate": 4.403089963467526e-05, "loss": 1.1409, "step": 143900 }, { "epoch": 0.13, "grad_norm": 6.125, "learning_rate": 4.402640056148434e-05, "loss": 1.1738, "step": 144000 }, { "epoch": 0.13, "grad_norm": 18.75, "learning_rate": 4.4021901488293415e-05, "loss": 1.2016, "step": 144100 }, { "epoch": 0.13, "grad_norm": 24.125, "learning_rate": 4.401740241510249e-05, "loss": 1.4153, "step": 144200 }, { "epoch": 0.13, "grad_norm": 35.75, "learning_rate": 4.401290334191157e-05, "loss": 1.0668, "step": 144300 }, { "epoch": 0.13, "grad_norm": 31.5, "learning_rate": 4.400840426872064e-05, "loss": 1.3122, "step": 144400 }, { "epoch": 0.13, "grad_norm": 39.5, "learning_rate": 4.4003905195529723e-05, "loss": 1.1096, "step": 144500 }, { "epoch": 0.13, "grad_norm": 260.0, "learning_rate": 4.39994061223388e-05, "loss": 1.1968, "step": 144600 }, { "epoch": 0.13, "grad_norm": 11.875, "learning_rate": 4.3994907049147874e-05, "loss": 1.1765, "step": 144700 }, { "epoch": 0.13, "grad_norm": 28.375, "learning_rate": 4.3990407975956956e-05, "loss": 1.2439, "step": 144800 }, { "epoch": 0.13, "grad_norm": 0.072265625, "learning_rate": 4.398590890276603e-05, "loss": 1.1206, "step": 144900 }, { "epoch": 0.13, "grad_norm": 23.25, "learning_rate": 4.398140982957511e-05, "loss": 1.1029, "step": 145000 }, { "epoch": 0.13, "grad_norm": 34.0, "learning_rate": 4.397691075638419e-05, "loss": 1.1021, "step": 145100 }, { "epoch": 0.13, "grad_norm": 15.6875, "learning_rate": 4.3972411683193264e-05, "loss": 1.2597, "step": 145200 }, { "epoch": 0.13, "grad_norm": 16.25, "learning_rate": 4.396791261000234e-05, "loss": 1.2942, "step": 145300 }, { "epoch": 0.13, "grad_norm": 16.5, "learning_rate": 4.396341353681142e-05, "loss": 1.1596, "step": 145400 }, { "epoch": 0.13, "grad_norm": 112.5, "learning_rate": 4.39589144636205e-05, "loss": 1.2234, "step": 145500 }, { "epoch": 0.13, "grad_norm": 16.625, "learning_rate": 4.395441539042958e-05, "loss": 1.3201, "step": 145600 }, { "epoch": 0.13, "grad_norm": 27.0, "learning_rate": 4.394991631723865e-05, "loss": 1.3096, "step": 145700 }, { "epoch": 0.13, "grad_norm": 43.75, "learning_rate": 4.394541724404772e-05, "loss": 1.4264, "step": 145800 }, { "epoch": 0.13, "grad_norm": 18.75, "learning_rate": 4.3940918170856805e-05, "loss": 1.1205, "step": 145900 }, { "epoch": 0.13, "grad_norm": 15.5, "learning_rate": 4.393641909766588e-05, "loss": 1.0715, "step": 146000 }, { "epoch": 0.13, "grad_norm": 25.375, "learning_rate": 4.393192002447496e-05, "loss": 1.014, "step": 146100 }, { "epoch": 0.13, "grad_norm": 22.25, "learning_rate": 4.392742095128404e-05, "loss": 1.1428, "step": 146200 }, { "epoch": 0.13, "grad_norm": 10.625, "learning_rate": 4.3922921878093113e-05, "loss": 1.0716, "step": 146300 }, { "epoch": 0.13, "grad_norm": 0.0087890625, "learning_rate": 4.3918422804902196e-05, "loss": 1.0702, "step": 146400 }, { "epoch": 0.13, "grad_norm": 66.0, "learning_rate": 4.391392373171127e-05, "loss": 1.1625, "step": 146500 }, { "epoch": 0.13, "grad_norm": 9.3125, "learning_rate": 4.3909424658520346e-05, "loss": 1.2266, "step": 146600 }, { "epoch": 0.13, "grad_norm": 2.3125, "learning_rate": 4.390492558532943e-05, "loss": 1.0975, "step": 146700 }, { "epoch": 0.13, "grad_norm": 39.5, "learning_rate": 4.3900426512138504e-05, "loss": 1.2589, "step": 146800 }, { "epoch": 0.13, "grad_norm": 0.66796875, "learning_rate": 4.389592743894758e-05, "loss": 1.3939, "step": 146900 }, { "epoch": 0.13, "grad_norm": 38.0, "learning_rate": 4.3891428365756654e-05, "loss": 1.4059, "step": 147000 }, { "epoch": 0.13, "grad_norm": 61.5, "learning_rate": 4.388692929256573e-05, "loss": 1.2118, "step": 147100 }, { "epoch": 0.13, "grad_norm": 0.004180908203125, "learning_rate": 4.388243021937481e-05, "loss": 1.2294, "step": 147200 }, { "epoch": 0.13, "grad_norm": 169.0, "learning_rate": 4.387793114618389e-05, "loss": 1.2969, "step": 147300 }, { "epoch": 0.13, "grad_norm": 26.625, "learning_rate": 4.387343207299296e-05, "loss": 0.9417, "step": 147400 }, { "epoch": 0.13, "grad_norm": 79.5, "learning_rate": 4.3868932999802045e-05, "loss": 1.1837, "step": 147500 }, { "epoch": 0.13, "grad_norm": 1.2421875, "learning_rate": 4.386443392661112e-05, "loss": 1.2401, "step": 147600 }, { "epoch": 0.13, "grad_norm": 14.3125, "learning_rate": 4.3859934853420195e-05, "loss": 1.2824, "step": 147700 }, { "epoch": 0.13, "grad_norm": 14.875, "learning_rate": 4.385543578022928e-05, "loss": 1.214, "step": 147800 }, { "epoch": 0.13, "grad_norm": 28.0, "learning_rate": 4.385093670703835e-05, "loss": 1.1858, "step": 147900 }, { "epoch": 0.13, "grad_norm": 153.0, "learning_rate": 4.384643763384743e-05, "loss": 1.2291, "step": 148000 }, { "epoch": 0.13, "grad_norm": 51.25, "learning_rate": 4.384193856065651e-05, "loss": 1.1609, "step": 148100 }, { "epoch": 0.13, "grad_norm": 70.0, "learning_rate": 4.3837439487465586e-05, "loss": 1.2215, "step": 148200 }, { "epoch": 0.13, "grad_norm": 74.5, "learning_rate": 4.383294041427466e-05, "loss": 1.4444, "step": 148300 }, { "epoch": 0.13, "grad_norm": 32.25, "learning_rate": 4.3828441341083736e-05, "loss": 1.2274, "step": 148400 }, { "epoch": 0.13, "grad_norm": 318.0, "learning_rate": 4.382394226789281e-05, "loss": 1.2435, "step": 148500 }, { "epoch": 0.13, "grad_norm": 86.5, "learning_rate": 4.3819443194701894e-05, "loss": 1.1043, "step": 148600 }, { "epoch": 0.13, "grad_norm": 30.125, "learning_rate": 4.381494412151097e-05, "loss": 1.0821, "step": 148700 }, { "epoch": 0.13, "grad_norm": 38.75, "learning_rate": 4.381044504832005e-05, "loss": 1.259, "step": 148800 }, { "epoch": 0.13, "grad_norm": 47.5, "learning_rate": 4.3805945975129127e-05, "loss": 1.1161, "step": 148900 }, { "epoch": 0.13, "grad_norm": 15.6875, "learning_rate": 4.38014469019382e-05, "loss": 1.2601, "step": 149000 }, { "epoch": 0.13, "grad_norm": 110.0, "learning_rate": 4.3796947828747284e-05, "loss": 1.1872, "step": 149100 }, { "epoch": 0.13, "grad_norm": 22.75, "learning_rate": 4.379244875555636e-05, "loss": 1.026, "step": 149200 }, { "epoch": 0.13, "grad_norm": 168.0, "learning_rate": 4.3787949682365435e-05, "loss": 1.1621, "step": 149300 }, { "epoch": 0.13, "grad_norm": 24.5, "learning_rate": 4.378345060917452e-05, "loss": 1.195, "step": 149400 }, { "epoch": 0.13, "grad_norm": 75.0, "learning_rate": 4.377895153598359e-05, "loss": 1.1963, "step": 149500 }, { "epoch": 0.13, "grad_norm": 22.375, "learning_rate": 4.377445246279267e-05, "loss": 1.2369, "step": 149600 }, { "epoch": 0.13, "grad_norm": 12.9375, "learning_rate": 4.376995338960174e-05, "loss": 1.0527, "step": 149700 }, { "epoch": 0.13, "grad_norm": 23.625, "learning_rate": 4.376545431641082e-05, "loss": 1.1629, "step": 149800 }, { "epoch": 0.13, "grad_norm": 25.0, "learning_rate": 4.37609552432199e-05, "loss": 1.2391, "step": 149900 }, { "epoch": 0.13, "grad_norm": 22.625, "learning_rate": 4.3756456170028976e-05, "loss": 1.2134, "step": 150000 }, { "epoch": 0.13, "grad_norm": 33.75, "learning_rate": 4.375195709683805e-05, "loss": 1.2521, "step": 150100 }, { "epoch": 0.13, "grad_norm": 0.1337890625, "learning_rate": 4.374745802364713e-05, "loss": 1.2987, "step": 150200 }, { "epoch": 0.13, "grad_norm": 13.0, "learning_rate": 4.374295895045621e-05, "loss": 1.2736, "step": 150300 }, { "epoch": 0.13, "grad_norm": 82.0, "learning_rate": 4.3738459877265284e-05, "loss": 1.1508, "step": 150400 }, { "epoch": 0.13, "grad_norm": 38.5, "learning_rate": 4.3733960804074366e-05, "loss": 1.2961, "step": 150500 }, { "epoch": 0.13, "grad_norm": 16.875, "learning_rate": 4.372946173088344e-05, "loss": 1.259, "step": 150600 }, { "epoch": 0.13, "grad_norm": 61.5, "learning_rate": 4.3724962657692517e-05, "loss": 1.2377, "step": 150700 }, { "epoch": 0.13, "grad_norm": 20.25, "learning_rate": 4.37204635845016e-05, "loss": 1.2646, "step": 150800 }, { "epoch": 0.13, "grad_norm": 17.125, "learning_rate": 4.371596451131067e-05, "loss": 1.2117, "step": 150900 }, { "epoch": 0.13, "grad_norm": 45.0, "learning_rate": 4.371146543811975e-05, "loss": 1.2267, "step": 151000 }, { "epoch": 0.13, "grad_norm": 124.5, "learning_rate": 4.3706966364928825e-05, "loss": 1.1694, "step": 151100 }, { "epoch": 0.13, "grad_norm": 60.5, "learning_rate": 4.37024672917379e-05, "loss": 1.2284, "step": 151200 }, { "epoch": 0.13, "grad_norm": 30.375, "learning_rate": 4.369796821854698e-05, "loss": 1.2425, "step": 151300 }, { "epoch": 0.13, "grad_norm": 1.515625, "learning_rate": 4.369346914535606e-05, "loss": 1.351, "step": 151400 }, { "epoch": 0.13, "grad_norm": 0.003662109375, "learning_rate": 4.368897007216514e-05, "loss": 1.2053, "step": 151500 }, { "epoch": 0.14, "grad_norm": 101.0, "learning_rate": 4.3684470998974215e-05, "loss": 1.3002, "step": 151600 }, { "epoch": 0.14, "grad_norm": 28.5, "learning_rate": 4.367997192578329e-05, "loss": 1.0124, "step": 151700 }, { "epoch": 0.14, "grad_norm": 14.1875, "learning_rate": 4.367547285259237e-05, "loss": 1.1829, "step": 151800 }, { "epoch": 0.14, "grad_norm": 76.0, "learning_rate": 4.367097377940145e-05, "loss": 1.3624, "step": 151900 }, { "epoch": 0.14, "grad_norm": 17.625, "learning_rate": 4.366647470621052e-05, "loss": 1.105, "step": 152000 }, { "epoch": 0.14, "grad_norm": 18.5, "learning_rate": 4.3661975633019605e-05, "loss": 1.2441, "step": 152100 }, { "epoch": 0.14, "grad_norm": 26.5, "learning_rate": 4.3657476559828674e-05, "loss": 1.1693, "step": 152200 }, { "epoch": 0.14, "grad_norm": 50.0, "learning_rate": 4.3652977486637756e-05, "loss": 1.2365, "step": 152300 }, { "epoch": 0.14, "grad_norm": 43.75, "learning_rate": 4.364847841344683e-05, "loss": 1.3167, "step": 152400 }, { "epoch": 0.14, "grad_norm": 20.5, "learning_rate": 4.3643979340255907e-05, "loss": 1.1795, "step": 152500 }, { "epoch": 0.14, "grad_norm": 185.0, "learning_rate": 4.363948026706499e-05, "loss": 1.1482, "step": 152600 }, { "epoch": 0.14, "grad_norm": 25.375, "learning_rate": 4.3634981193874064e-05, "loss": 1.2657, "step": 152700 }, { "epoch": 0.14, "grad_norm": 25.125, "learning_rate": 4.363048212068314e-05, "loss": 1.2282, "step": 152800 }, { "epoch": 0.14, "grad_norm": 0.03125, "learning_rate": 4.362598304749222e-05, "loss": 1.2663, "step": 152900 }, { "epoch": 0.14, "grad_norm": 132.0, "learning_rate": 4.36214839743013e-05, "loss": 1.263, "step": 153000 }, { "epoch": 0.14, "grad_norm": 0.74609375, "learning_rate": 4.361698490111037e-05, "loss": 1.1281, "step": 153100 }, { "epoch": 0.14, "grad_norm": 10.0, "learning_rate": 4.3612485827919454e-05, "loss": 1.1419, "step": 153200 }, { "epoch": 0.14, "grad_norm": 36.5, "learning_rate": 4.360798675472853e-05, "loss": 1.1389, "step": 153300 }, { "epoch": 0.14, "grad_norm": 34.5, "learning_rate": 4.3603487681537605e-05, "loss": 1.2128, "step": 153400 }, { "epoch": 0.14, "grad_norm": 13.5, "learning_rate": 4.359898860834668e-05, "loss": 1.155, "step": 153500 }, { "epoch": 0.14, "grad_norm": 86.5, "learning_rate": 4.3594489535155756e-05, "loss": 1.1803, "step": 153600 }, { "epoch": 0.14, "grad_norm": 126.0, "learning_rate": 4.358999046196484e-05, "loss": 1.1925, "step": 153700 }, { "epoch": 0.14, "grad_norm": 45.25, "learning_rate": 4.358549138877391e-05, "loss": 1.046, "step": 153800 }, { "epoch": 0.14, "grad_norm": 29.0, "learning_rate": 4.358099231558299e-05, "loss": 1.2843, "step": 153900 }, { "epoch": 0.14, "grad_norm": 0.189453125, "learning_rate": 4.357649324239207e-05, "loss": 1.228, "step": 154000 }, { "epoch": 0.14, "grad_norm": 63.0, "learning_rate": 4.3571994169201146e-05, "loss": 1.2228, "step": 154100 }, { "epoch": 0.14, "grad_norm": 168.0, "learning_rate": 4.356749509601023e-05, "loss": 1.3743, "step": 154200 }, { "epoch": 0.14, "grad_norm": 29.125, "learning_rate": 4.35629960228193e-05, "loss": 1.2738, "step": 154300 }, { "epoch": 0.14, "grad_norm": 38.75, "learning_rate": 4.355849694962838e-05, "loss": 1.0825, "step": 154400 }, { "epoch": 0.14, "grad_norm": 20.625, "learning_rate": 4.355399787643746e-05, "loss": 1.3722, "step": 154500 }, { "epoch": 0.14, "grad_norm": 9.375, "learning_rate": 4.3549498803246536e-05, "loss": 1.0836, "step": 154600 }, { "epoch": 0.14, "grad_norm": 82.5, "learning_rate": 4.354499973005561e-05, "loss": 1.3229, "step": 154700 }, { "epoch": 0.14, "grad_norm": 19.125, "learning_rate": 4.354050065686469e-05, "loss": 1.2155, "step": 154800 }, { "epoch": 0.14, "grad_norm": 39.25, "learning_rate": 4.353600158367376e-05, "loss": 1.2239, "step": 154900 }, { "epoch": 0.14, "grad_norm": 98.0, "learning_rate": 4.3531502510482844e-05, "loss": 1.0426, "step": 155000 }, { "epoch": 0.14, "grad_norm": 74.5, "learning_rate": 4.352700343729192e-05, "loss": 1.244, "step": 155100 }, { "epoch": 0.14, "grad_norm": 35.0, "learning_rate": 4.3522504364100995e-05, "loss": 1.2144, "step": 155200 }, { "epoch": 0.14, "grad_norm": 85.5, "learning_rate": 4.351800529091008e-05, "loss": 1.3023, "step": 155300 }, { "epoch": 0.14, "grad_norm": 27.0, "learning_rate": 4.351350621771915e-05, "loss": 1.1603, "step": 155400 }, { "epoch": 0.14, "grad_norm": 9.875, "learning_rate": 4.350900714452823e-05, "loss": 1.1795, "step": 155500 }, { "epoch": 0.14, "grad_norm": 28.0, "learning_rate": 4.350450807133731e-05, "loss": 1.1286, "step": 155600 }, { "epoch": 0.14, "grad_norm": 119.5, "learning_rate": 4.3500008998146385e-05, "loss": 1.2931, "step": 155700 }, { "epoch": 0.14, "grad_norm": 0.259765625, "learning_rate": 4.349550992495546e-05, "loss": 1.1743, "step": 155800 }, { "epoch": 0.14, "grad_norm": 38.75, "learning_rate": 4.349101085176454e-05, "loss": 1.2158, "step": 155900 }, { "epoch": 0.14, "grad_norm": 55.5, "learning_rate": 4.348651177857362e-05, "loss": 1.0754, "step": 156000 }, { "epoch": 0.14, "grad_norm": 18.875, "learning_rate": 4.348201270538269e-05, "loss": 1.2114, "step": 156100 }, { "epoch": 0.14, "grad_norm": 19.375, "learning_rate": 4.347751363219177e-05, "loss": 1.3028, "step": 156200 }, { "epoch": 0.14, "grad_norm": 16.0, "learning_rate": 4.3473014559000844e-05, "loss": 1.2131, "step": 156300 }, { "epoch": 0.14, "grad_norm": 12.8125, "learning_rate": 4.3468515485809926e-05, "loss": 1.231, "step": 156400 }, { "epoch": 0.14, "grad_norm": 24.5, "learning_rate": 4.3464016412619e-05, "loss": 1.3114, "step": 156500 }, { "epoch": 0.14, "grad_norm": 12.0, "learning_rate": 4.345951733942808e-05, "loss": 0.979, "step": 156600 }, { "epoch": 0.14, "grad_norm": 43.75, "learning_rate": 4.345501826623716e-05, "loss": 1.102, "step": 156700 }, { "epoch": 0.14, "grad_norm": 0.003753662109375, "learning_rate": 4.3450519193046234e-05, "loss": 1.1508, "step": 156800 }, { "epoch": 0.14, "grad_norm": 88.5, "learning_rate": 4.3446020119855316e-05, "loss": 1.1869, "step": 156900 }, { "epoch": 0.14, "grad_norm": 43.5, "learning_rate": 4.344152104666439e-05, "loss": 1.0248, "step": 157000 }, { "epoch": 0.14, "grad_norm": 132.0, "learning_rate": 4.343702197347347e-05, "loss": 1.107, "step": 157100 }, { "epoch": 0.14, "grad_norm": 7.03125, "learning_rate": 4.343252290028255e-05, "loss": 1.1439, "step": 157200 }, { "epoch": 0.14, "grad_norm": 131.0, "learning_rate": 4.3428023827091624e-05, "loss": 1.2053, "step": 157300 }, { "epoch": 0.14, "grad_norm": 65.5, "learning_rate": 4.342352475390069e-05, "loss": 1.1828, "step": 157400 }, { "epoch": 0.14, "grad_norm": 0.006072998046875, "learning_rate": 4.3419025680709775e-05, "loss": 1.0896, "step": 157500 }, { "epoch": 0.14, "grad_norm": 16.625, "learning_rate": 4.341452660751885e-05, "loss": 1.1416, "step": 157600 }, { "epoch": 0.14, "grad_norm": 57.75, "learning_rate": 4.341002753432793e-05, "loss": 1.3766, "step": 157700 }, { "epoch": 0.14, "grad_norm": 52.25, "learning_rate": 4.340552846113701e-05, "loss": 1.1819, "step": 157800 }, { "epoch": 0.14, "grad_norm": 24.875, "learning_rate": 4.340102938794608e-05, "loss": 1.2043, "step": 157900 }, { "epoch": 0.14, "grad_norm": 0.1689453125, "learning_rate": 4.3396530314755165e-05, "loss": 1.1778, "step": 158000 }, { "epoch": 0.14, "grad_norm": 9.3125, "learning_rate": 4.339203124156424e-05, "loss": 1.079, "step": 158100 }, { "epoch": 0.14, "grad_norm": 21.125, "learning_rate": 4.3387532168373316e-05, "loss": 1.3042, "step": 158200 }, { "epoch": 0.14, "grad_norm": 0.0859375, "learning_rate": 4.33830330951824e-05, "loss": 1.1161, "step": 158300 }, { "epoch": 0.14, "grad_norm": 14.25, "learning_rate": 4.3378534021991474e-05, "loss": 0.9665, "step": 158400 }, { "epoch": 0.14, "grad_norm": 0.052978515625, "learning_rate": 4.337403494880055e-05, "loss": 1.1824, "step": 158500 }, { "epoch": 0.14, "grad_norm": 100.0, "learning_rate": 4.3369535875609624e-05, "loss": 1.2272, "step": 158600 }, { "epoch": 0.14, "grad_norm": 0.0128173828125, "learning_rate": 4.33650368024187e-05, "loss": 1.219, "step": 158700 }, { "epoch": 0.14, "grad_norm": 3.890625, "learning_rate": 4.336053772922778e-05, "loss": 1.0795, "step": 158800 }, { "epoch": 0.14, "grad_norm": 0.0113525390625, "learning_rate": 4.335603865603686e-05, "loss": 1.1915, "step": 158900 }, { "epoch": 0.14, "grad_norm": 0.00909423828125, "learning_rate": 4.335153958284593e-05, "loss": 1.1958, "step": 159000 }, { "epoch": 0.14, "grad_norm": 9.25, "learning_rate": 4.3347040509655014e-05, "loss": 1.2367, "step": 159100 }, { "epoch": 0.14, "grad_norm": 2008.0, "learning_rate": 4.334254143646409e-05, "loss": 1.1813, "step": 159200 }, { "epoch": 0.14, "grad_norm": 17.5, "learning_rate": 4.3338042363273165e-05, "loss": 1.2036, "step": 159300 }, { "epoch": 0.14, "grad_norm": 56.5, "learning_rate": 4.333354329008225e-05, "loss": 1.1227, "step": 159400 }, { "epoch": 0.14, "grad_norm": 111.5, "learning_rate": 4.332904421689132e-05, "loss": 1.3145, "step": 159500 }, { "epoch": 0.14, "grad_norm": 12.4375, "learning_rate": 4.33245451437004e-05, "loss": 1.233, "step": 159600 }, { "epoch": 0.14, "grad_norm": 81.0, "learning_rate": 4.332004607050948e-05, "loss": 1.2202, "step": 159700 }, { "epoch": 0.14, "grad_norm": 262.0, "learning_rate": 4.3315546997318555e-05, "loss": 1.216, "step": 159800 }, { "epoch": 0.14, "grad_norm": 15.25, "learning_rate": 4.331104792412763e-05, "loss": 1.1336, "step": 159900 }, { "epoch": 0.14, "grad_norm": 0.054443359375, "learning_rate": 4.3306548850936706e-05, "loss": 1.1322, "step": 160000 }, { "epoch": 0.14, "grad_norm": 18.25, "learning_rate": 4.330204977774578e-05, "loss": 0.9996, "step": 160100 }, { "epoch": 0.14, "grad_norm": 47.0, "learning_rate": 4.3297550704554864e-05, "loss": 1.2751, "step": 160200 }, { "epoch": 0.14, "grad_norm": 62.5, "learning_rate": 4.329305163136394e-05, "loss": 1.132, "step": 160300 }, { "epoch": 0.14, "grad_norm": 19.25, "learning_rate": 4.328855255817302e-05, "loss": 1.231, "step": 160400 }, { "epoch": 0.14, "grad_norm": 0.3203125, "learning_rate": 4.3284053484982096e-05, "loss": 1.1239, "step": 160500 }, { "epoch": 0.14, "grad_norm": 15.5, "learning_rate": 4.327955441179117e-05, "loss": 1.1883, "step": 160600 }, { "epoch": 0.14, "grad_norm": 442.0, "learning_rate": 4.3275055338600254e-05, "loss": 0.9981, "step": 160700 }, { "epoch": 0.14, "grad_norm": 32.75, "learning_rate": 4.327055626540933e-05, "loss": 1.3095, "step": 160800 }, { "epoch": 0.14, "grad_norm": 9.125, "learning_rate": 4.3266057192218404e-05, "loss": 1.1381, "step": 160900 }, { "epoch": 0.14, "grad_norm": 25.25, "learning_rate": 4.3261558119027487e-05, "loss": 1.0775, "step": 161000 }, { "epoch": 0.14, "grad_norm": 138.0, "learning_rate": 4.325705904583656e-05, "loss": 1.2819, "step": 161100 }, { "epoch": 0.14, "grad_norm": 40.0, "learning_rate": 4.325255997264564e-05, "loss": 1.2975, "step": 161200 }, { "epoch": 0.14, "grad_norm": 12.6875, "learning_rate": 4.324806089945471e-05, "loss": 1.2189, "step": 161300 }, { "epoch": 0.14, "grad_norm": 36.75, "learning_rate": 4.324356182626379e-05, "loss": 1.273, "step": 161400 }, { "epoch": 0.14, "grad_norm": 77.0, "learning_rate": 4.323906275307287e-05, "loss": 1.2648, "step": 161500 }, { "epoch": 0.14, "grad_norm": 148.0, "learning_rate": 4.3234563679881945e-05, "loss": 1.193, "step": 161600 }, { "epoch": 0.14, "grad_norm": 46.75, "learning_rate": 4.323006460669102e-05, "loss": 1.2597, "step": 161700 }, { "epoch": 0.14, "grad_norm": 4.4375, "learning_rate": 4.32255655335001e-05, "loss": 1.1285, "step": 161800 }, { "epoch": 0.14, "grad_norm": 46.75, "learning_rate": 4.322106646030918e-05, "loss": 1.1198, "step": 161900 }, { "epoch": 0.14, "grad_norm": 56.0, "learning_rate": 4.3216567387118254e-05, "loss": 1.188, "step": 162000 }, { "epoch": 0.14, "grad_norm": 14.25, "learning_rate": 4.3212068313927336e-05, "loss": 1.1083, "step": 162100 }, { "epoch": 0.14, "grad_norm": 17.875, "learning_rate": 4.320756924073641e-05, "loss": 1.2941, "step": 162200 }, { "epoch": 0.14, "grad_norm": 31.25, "learning_rate": 4.3203070167545486e-05, "loss": 1.2685, "step": 162300 }, { "epoch": 0.14, "grad_norm": 5.15625, "learning_rate": 4.319857109435457e-05, "loss": 1.1637, "step": 162400 }, { "epoch": 0.14, "grad_norm": 48.75, "learning_rate": 4.319407202116364e-05, "loss": 1.2843, "step": 162500 }, { "epoch": 0.14, "grad_norm": 15.125, "learning_rate": 4.318957294797272e-05, "loss": 1.1094, "step": 162600 }, { "epoch": 0.14, "grad_norm": 34.0, "learning_rate": 4.3185073874781794e-05, "loss": 1.2415, "step": 162700 }, { "epoch": 0.15, "grad_norm": 12.125, "learning_rate": 4.318057480159087e-05, "loss": 1.14, "step": 162800 }, { "epoch": 0.15, "grad_norm": 44.25, "learning_rate": 4.317607572839995e-05, "loss": 1.175, "step": 162900 }, { "epoch": 0.15, "grad_norm": 62.75, "learning_rate": 4.317157665520903e-05, "loss": 1.249, "step": 163000 }, { "epoch": 0.15, "grad_norm": 25.125, "learning_rate": 4.316707758201811e-05, "loss": 1.2165, "step": 163100 }, { "epoch": 0.15, "grad_norm": 24.125, "learning_rate": 4.3162578508827185e-05, "loss": 1.1537, "step": 163200 }, { "epoch": 0.15, "grad_norm": 44.0, "learning_rate": 4.315807943563626e-05, "loss": 0.9978, "step": 163300 }, { "epoch": 0.15, "grad_norm": 1.15625, "learning_rate": 4.315358036244534e-05, "loss": 1.1201, "step": 163400 }, { "epoch": 0.15, "grad_norm": 57.25, "learning_rate": 4.314908128925442e-05, "loss": 1.0793, "step": 163500 }, { "epoch": 0.15, "grad_norm": 14.25, "learning_rate": 4.314458221606349e-05, "loss": 1.2045, "step": 163600 }, { "epoch": 0.15, "grad_norm": 31.375, "learning_rate": 4.3140083142872575e-05, "loss": 1.3555, "step": 163700 }, { "epoch": 0.15, "grad_norm": 5.34375, "learning_rate": 4.3135584069681644e-05, "loss": 1.0663, "step": 163800 }, { "epoch": 0.15, "grad_norm": 16.625, "learning_rate": 4.3131084996490726e-05, "loss": 1.2379, "step": 163900 }, { "epoch": 0.15, "grad_norm": 46.25, "learning_rate": 4.31265859232998e-05, "loss": 1.1486, "step": 164000 }, { "epoch": 0.15, "grad_norm": 39.0, "learning_rate": 4.3122086850108876e-05, "loss": 1.2255, "step": 164100 }, { "epoch": 0.15, "grad_norm": 70.0, "learning_rate": 4.311758777691796e-05, "loss": 1.445, "step": 164200 }, { "epoch": 0.15, "grad_norm": 28.25, "learning_rate": 4.3113088703727034e-05, "loss": 1.0938, "step": 164300 }, { "epoch": 0.15, "grad_norm": 8.9375, "learning_rate": 4.310858963053611e-05, "loss": 1.1599, "step": 164400 }, { "epoch": 0.15, "grad_norm": 0.7578125, "learning_rate": 4.310409055734519e-05, "loss": 1.1894, "step": 164500 }, { "epoch": 0.15, "grad_norm": 40.5, "learning_rate": 4.3099591484154267e-05, "loss": 1.1745, "step": 164600 }, { "epoch": 0.15, "grad_norm": 41.5, "learning_rate": 4.309509241096334e-05, "loss": 1.2277, "step": 164700 }, { "epoch": 0.15, "grad_norm": 5.0625, "learning_rate": 4.3090593337772424e-05, "loss": 1.2373, "step": 164800 }, { "epoch": 0.15, "grad_norm": 42.25, "learning_rate": 4.30860942645815e-05, "loss": 1.278, "step": 164900 }, { "epoch": 0.15, "grad_norm": 14.6875, "learning_rate": 4.3081595191390575e-05, "loss": 1.2097, "step": 165000 }, { "epoch": 0.15, "grad_norm": 8.375, "learning_rate": 4.307709611819965e-05, "loss": 1.1292, "step": 165100 }, { "epoch": 0.15, "grad_norm": 42.5, "learning_rate": 4.3072597045008725e-05, "loss": 1.341, "step": 165200 }, { "epoch": 0.15, "grad_norm": 121.5, "learning_rate": 4.306809797181781e-05, "loss": 1.081, "step": 165300 }, { "epoch": 0.15, "grad_norm": 12.125, "learning_rate": 4.306359889862688e-05, "loss": 1.1821, "step": 165400 }, { "epoch": 0.15, "grad_norm": 19.375, "learning_rate": 4.305909982543596e-05, "loss": 1.2051, "step": 165500 }, { "epoch": 0.15, "grad_norm": 33.5, "learning_rate": 4.305460075224504e-05, "loss": 1.2173, "step": 165600 }, { "epoch": 0.15, "grad_norm": 11.9375, "learning_rate": 4.3050101679054116e-05, "loss": 1.3438, "step": 165700 }, { "epoch": 0.15, "grad_norm": 41.0, "learning_rate": 4.30456026058632e-05, "loss": 1.1703, "step": 165800 }, { "epoch": 0.15, "grad_norm": 4.78125, "learning_rate": 4.304110353267227e-05, "loss": 1.2001, "step": 165900 }, { "epoch": 0.15, "grad_norm": 740.0, "learning_rate": 4.303660445948135e-05, "loss": 1.2125, "step": 166000 }, { "epoch": 0.15, "grad_norm": 29.25, "learning_rate": 4.303210538629043e-05, "loss": 1.1543, "step": 166100 }, { "epoch": 0.15, "grad_norm": 0.02734375, "learning_rate": 4.3027606313099506e-05, "loss": 1.3533, "step": 166200 }, { "epoch": 0.15, "grad_norm": 26.75, "learning_rate": 4.302310723990858e-05, "loss": 1.2246, "step": 166300 }, { "epoch": 0.15, "grad_norm": 30.0, "learning_rate": 4.3018608166717657e-05, "loss": 1.1274, "step": 166400 }, { "epoch": 0.15, "grad_norm": 22.75, "learning_rate": 4.301410909352673e-05, "loss": 1.1819, "step": 166500 }, { "epoch": 0.15, "grad_norm": 44.0, "learning_rate": 4.3009610020335814e-05, "loss": 1.2977, "step": 166600 }, { "epoch": 0.15, "grad_norm": 58.25, "learning_rate": 4.300511094714489e-05, "loss": 1.3277, "step": 166700 }, { "epoch": 0.15, "grad_norm": 20.25, "learning_rate": 4.3000611873953965e-05, "loss": 1.2512, "step": 166800 }, { "epoch": 0.15, "grad_norm": 29.375, "learning_rate": 4.299611280076305e-05, "loss": 1.2322, "step": 166900 }, { "epoch": 0.15, "grad_norm": 36.25, "learning_rate": 4.299161372757212e-05, "loss": 1.2066, "step": 167000 }, { "epoch": 0.15, "grad_norm": 35.25, "learning_rate": 4.29871146543812e-05, "loss": 1.2478, "step": 167100 }, { "epoch": 0.15, "grad_norm": 146.0, "learning_rate": 4.298261558119028e-05, "loss": 1.3022, "step": 167200 }, { "epoch": 0.15, "grad_norm": 64.0, "learning_rate": 4.2978116507999355e-05, "loss": 1.4135, "step": 167300 }, { "epoch": 0.15, "grad_norm": 17.0, "learning_rate": 4.297361743480843e-05, "loss": 1.3609, "step": 167400 }, { "epoch": 0.15, "grad_norm": 12.25, "learning_rate": 4.296911836161751e-05, "loss": 1.1639, "step": 167500 }, { "epoch": 0.15, "grad_norm": 44.25, "learning_rate": 4.296461928842659e-05, "loss": 1.328, "step": 167600 }, { "epoch": 0.15, "grad_norm": 19.625, "learning_rate": 4.296012021523566e-05, "loss": 1.2159, "step": 167700 }, { "epoch": 0.15, "grad_norm": 29.75, "learning_rate": 4.295562114204474e-05, "loss": 1.3014, "step": 167800 }, { "epoch": 0.15, "grad_norm": 10.0625, "learning_rate": 4.2951122068853814e-05, "loss": 1.1743, "step": 167900 }, { "epoch": 0.15, "grad_norm": 310.0, "learning_rate": 4.2946622995662896e-05, "loss": 1.2183, "step": 168000 }, { "epoch": 0.15, "grad_norm": 31.875, "learning_rate": 4.294212392247197e-05, "loss": 1.2231, "step": 168100 }, { "epoch": 0.15, "grad_norm": 356.0, "learning_rate": 4.2937624849281047e-05, "loss": 1.3259, "step": 168200 }, { "epoch": 0.15, "grad_norm": 10.25, "learning_rate": 4.293312577609013e-05, "loss": 1.1687, "step": 168300 }, { "epoch": 0.15, "grad_norm": 10.5, "learning_rate": 4.2928626702899204e-05, "loss": 1.1799, "step": 168400 }, { "epoch": 0.15, "grad_norm": 11.0, "learning_rate": 4.2924127629708286e-05, "loss": 1.1637, "step": 168500 }, { "epoch": 0.15, "grad_norm": 223.0, "learning_rate": 4.291962855651736e-05, "loss": 1.1419, "step": 168600 }, { "epoch": 0.15, "grad_norm": 44.75, "learning_rate": 4.291512948332644e-05, "loss": 1.334, "step": 168700 }, { "epoch": 0.15, "grad_norm": 13.1875, "learning_rate": 4.291063041013552e-05, "loss": 1.3015, "step": 168800 }, { "epoch": 0.15, "grad_norm": 61.0, "learning_rate": 4.2906131336944594e-05, "loss": 1.0367, "step": 168900 }, { "epoch": 0.15, "grad_norm": 16.375, "learning_rate": 4.290163226375366e-05, "loss": 1.1058, "step": 169000 }, { "epoch": 0.15, "grad_norm": 14.125, "learning_rate": 4.2897133190562745e-05, "loss": 1.1981, "step": 169100 }, { "epoch": 0.15, "grad_norm": 0.130859375, "learning_rate": 4.289263411737182e-05, "loss": 1.2601, "step": 169200 }, { "epoch": 0.15, "grad_norm": 53.5, "learning_rate": 4.28881350441809e-05, "loss": 1.297, "step": 169300 }, { "epoch": 0.15, "grad_norm": 42.0, "learning_rate": 4.288363597098998e-05, "loss": 1.0663, "step": 169400 }, { "epoch": 0.15, "grad_norm": 29.25, "learning_rate": 4.287913689779905e-05, "loss": 1.0977, "step": 169500 }, { "epoch": 0.15, "grad_norm": 21.25, "learning_rate": 4.2874637824608135e-05, "loss": 1.1092, "step": 169600 }, { "epoch": 0.15, "grad_norm": 39.75, "learning_rate": 4.287013875141721e-05, "loss": 1.2423, "step": 169700 }, { "epoch": 0.15, "grad_norm": 29.125, "learning_rate": 4.2865639678226286e-05, "loss": 1.274, "step": 169800 }, { "epoch": 0.15, "grad_norm": 0.1064453125, "learning_rate": 4.286114060503537e-05, "loss": 1.0745, "step": 169900 }, { "epoch": 0.15, "grad_norm": 46.75, "learning_rate": 4.285664153184444e-05, "loss": 1.1619, "step": 170000 }, { "epoch": 0.15, "grad_norm": 26.75, "learning_rate": 4.285214245865352e-05, "loss": 1.1738, "step": 170100 }, { "epoch": 0.15, "grad_norm": 59.5, "learning_rate": 4.28476433854626e-05, "loss": 1.26, "step": 170200 }, { "epoch": 0.15, "grad_norm": 0.388671875, "learning_rate": 4.284314431227167e-05, "loss": 1.3688, "step": 170300 }, { "epoch": 0.15, "grad_norm": 210.0, "learning_rate": 4.283864523908075e-05, "loss": 1.1806, "step": 170400 }, { "epoch": 0.15, "grad_norm": 28.25, "learning_rate": 4.283414616588983e-05, "loss": 1.0955, "step": 170500 }, { "epoch": 0.15, "grad_norm": 17.0, "learning_rate": 4.28296470926989e-05, "loss": 1.3141, "step": 170600 }, { "epoch": 0.15, "grad_norm": 27.625, "learning_rate": 4.2825148019507984e-05, "loss": 1.2581, "step": 170700 }, { "epoch": 0.15, "grad_norm": 27.75, "learning_rate": 4.282064894631706e-05, "loss": 1.1377, "step": 170800 }, { "epoch": 0.15, "grad_norm": 18.25, "learning_rate": 4.2816149873126135e-05, "loss": 1.1178, "step": 170900 }, { "epoch": 0.15, "grad_norm": 21.625, "learning_rate": 4.281165079993522e-05, "loss": 1.2528, "step": 171000 }, { "epoch": 0.15, "grad_norm": 15.1875, "learning_rate": 4.280715172674429e-05, "loss": 1.1468, "step": 171100 }, { "epoch": 0.15, "grad_norm": 23.125, "learning_rate": 4.2802652653553375e-05, "loss": 1.0675, "step": 171200 }, { "epoch": 0.15, "grad_norm": 20.5, "learning_rate": 4.279815358036245e-05, "loss": 1.0935, "step": 171300 }, { "epoch": 0.15, "grad_norm": 195.0, "learning_rate": 4.2793654507171525e-05, "loss": 1.1335, "step": 171400 }, { "epoch": 0.15, "grad_norm": 15.625, "learning_rate": 4.278915543398061e-05, "loss": 1.2274, "step": 171500 }, { "epoch": 0.15, "grad_norm": 31.375, "learning_rate": 4.2784656360789676e-05, "loss": 1.1344, "step": 171600 }, { "epoch": 0.15, "grad_norm": 0.59375, "learning_rate": 4.278015728759875e-05, "loss": 1.1439, "step": 171700 }, { "epoch": 0.15, "grad_norm": 101.5, "learning_rate": 4.277565821440783e-05, "loss": 1.0772, "step": 171800 }, { "epoch": 0.15, "grad_norm": 24.5, "learning_rate": 4.277115914121691e-05, "loss": 1.0917, "step": 171900 }, { "epoch": 0.15, "grad_norm": 154.0, "learning_rate": 4.276666006802599e-05, "loss": 1.1744, "step": 172000 }, { "epoch": 0.15, "grad_norm": 15.875, "learning_rate": 4.2762160994835066e-05, "loss": 1.358, "step": 172100 }, { "epoch": 0.15, "grad_norm": 66.5, "learning_rate": 4.275766192164414e-05, "loss": 1.2677, "step": 172200 }, { "epoch": 0.15, "grad_norm": 65.0, "learning_rate": 4.2753162848453224e-05, "loss": 0.9656, "step": 172300 }, { "epoch": 0.15, "grad_norm": 20.375, "learning_rate": 4.27486637752623e-05, "loss": 1.0813, "step": 172400 }, { "epoch": 0.15, "grad_norm": 2.578125, "learning_rate": 4.2744164702071374e-05, "loss": 1.3343, "step": 172500 }, { "epoch": 0.15, "grad_norm": 89.5, "learning_rate": 4.2739665628880456e-05, "loss": 1.2819, "step": 172600 }, { "epoch": 0.15, "grad_norm": 54.0, "learning_rate": 4.273516655568953e-05, "loss": 1.0236, "step": 172700 }, { "epoch": 0.15, "grad_norm": 22.5, "learning_rate": 4.273066748249861e-05, "loss": 1.0889, "step": 172800 }, { "epoch": 0.15, "grad_norm": 25.0, "learning_rate": 4.272616840930768e-05, "loss": 1.1928, "step": 172900 }, { "epoch": 0.15, "grad_norm": 18.625, "learning_rate": 4.272166933611676e-05, "loss": 1.1959, "step": 173000 }, { "epoch": 0.15, "grad_norm": 17.25, "learning_rate": 4.271717026292584e-05, "loss": 1.3427, "step": 173100 }, { "epoch": 0.15, "grad_norm": 1.078125, "learning_rate": 4.2712671189734915e-05, "loss": 1.2775, "step": 173200 }, { "epoch": 0.15, "grad_norm": 27.25, "learning_rate": 4.270817211654399e-05, "loss": 1.3247, "step": 173300 }, { "epoch": 0.15, "grad_norm": 15.0, "learning_rate": 4.270367304335307e-05, "loss": 1.2258, "step": 173400 }, { "epoch": 0.15, "grad_norm": 67.5, "learning_rate": 4.269917397016215e-05, "loss": 1.191, "step": 173500 }, { "epoch": 0.15, "grad_norm": 16.375, "learning_rate": 4.269467489697122e-05, "loss": 1.086, "step": 173600 }, { "epoch": 0.15, "grad_norm": 0.62109375, "learning_rate": 4.2690175823780305e-05, "loss": 1.048, "step": 173700 }, { "epoch": 0.15, "grad_norm": 25.375, "learning_rate": 4.268567675058938e-05, "loss": 1.016, "step": 173800 }, { "epoch": 0.15, "grad_norm": 52.0, "learning_rate": 4.268117767739846e-05, "loss": 1.252, "step": 173900 }, { "epoch": 0.16, "grad_norm": 21.5, "learning_rate": 4.267667860420754e-05, "loss": 1.1171, "step": 174000 }, { "epoch": 0.16, "grad_norm": 36.25, "learning_rate": 4.2672179531016614e-05, "loss": 1.2418, "step": 174100 }, { "epoch": 0.16, "grad_norm": 54.5, "learning_rate": 4.266768045782569e-05, "loss": 1.0526, "step": 174200 }, { "epoch": 0.16, "grad_norm": 38.75, "learning_rate": 4.2663181384634764e-05, "loss": 1.1106, "step": 174300 }, { "epoch": 0.16, "grad_norm": 1.2890625, "learning_rate": 4.265868231144384e-05, "loss": 1.1271, "step": 174400 }, { "epoch": 0.16, "grad_norm": 21.0, "learning_rate": 4.265418323825292e-05, "loss": 1.0622, "step": 174500 }, { "epoch": 0.16, "grad_norm": 23.625, "learning_rate": 4.2649684165062e-05, "loss": 1.1997, "step": 174600 }, { "epoch": 0.16, "grad_norm": 0.255859375, "learning_rate": 4.264518509187108e-05, "loss": 1.2099, "step": 174700 }, { "epoch": 0.16, "grad_norm": 27.375, "learning_rate": 4.2640686018680155e-05, "loss": 1.1474, "step": 174800 }, { "epoch": 0.16, "grad_norm": 18.125, "learning_rate": 4.263618694548923e-05, "loss": 1.1925, "step": 174900 }, { "epoch": 0.16, "grad_norm": 5.40625, "learning_rate": 4.263168787229831e-05, "loss": 1.3047, "step": 175000 }, { "epoch": 0.16, "grad_norm": 34.75, "learning_rate": 4.262718879910739e-05, "loss": 1.1826, "step": 175100 }, { "epoch": 0.16, "grad_norm": 16.5, "learning_rate": 4.262268972591646e-05, "loss": 1.0752, "step": 175200 }, { "epoch": 0.16, "grad_norm": 12.0, "learning_rate": 4.2618190652725545e-05, "loss": 1.3711, "step": 175300 }, { "epoch": 0.16, "grad_norm": 17.125, "learning_rate": 4.261369157953462e-05, "loss": 1.1903, "step": 175400 }, { "epoch": 0.16, "grad_norm": 28.125, "learning_rate": 4.2609192506343695e-05, "loss": 1.2254, "step": 175500 }, { "epoch": 0.16, "grad_norm": 53.0, "learning_rate": 4.260469343315277e-05, "loss": 1.2376, "step": 175600 }, { "epoch": 0.16, "grad_norm": 46.25, "learning_rate": 4.2600194359961846e-05, "loss": 1.3068, "step": 175700 }, { "epoch": 0.16, "grad_norm": 49.5, "learning_rate": 4.259569528677093e-05, "loss": 1.2719, "step": 175800 }, { "epoch": 0.16, "grad_norm": 21.25, "learning_rate": 4.2591196213580004e-05, "loss": 1.2351, "step": 175900 }, { "epoch": 0.16, "grad_norm": 41.5, "learning_rate": 4.258669714038908e-05, "loss": 1.2047, "step": 176000 }, { "epoch": 0.16, "grad_norm": 15.875, "learning_rate": 4.258219806719816e-05, "loss": 1.2174, "step": 176100 }, { "epoch": 0.16, "grad_norm": 20.875, "learning_rate": 4.2577698994007236e-05, "loss": 1.148, "step": 176200 }, { "epoch": 0.16, "grad_norm": 22.0, "learning_rate": 4.257319992081631e-05, "loss": 1.3212, "step": 176300 }, { "epoch": 0.16, "grad_norm": 15.0, "learning_rate": 4.2568700847625394e-05, "loss": 1.209, "step": 176400 }, { "epoch": 0.16, "grad_norm": 42.75, "learning_rate": 4.256420177443447e-05, "loss": 1.2113, "step": 176500 }, { "epoch": 0.16, "grad_norm": 1.578125, "learning_rate": 4.2559702701243544e-05, "loss": 0.9567, "step": 176600 }, { "epoch": 0.16, "grad_norm": 420.0, "learning_rate": 4.255520362805263e-05, "loss": 1.2237, "step": 176700 }, { "epoch": 0.16, "grad_norm": 736.0, "learning_rate": 4.2550704554861695e-05, "loss": 1.0574, "step": 176800 }, { "epoch": 0.16, "grad_norm": 0.3671875, "learning_rate": 4.254620548167078e-05, "loss": 1.3057, "step": 176900 }, { "epoch": 0.16, "grad_norm": 10.1875, "learning_rate": 4.254170640847985e-05, "loss": 1.2314, "step": 177000 }, { "epoch": 0.16, "grad_norm": 33.5, "learning_rate": 4.253720733528893e-05, "loss": 1.2106, "step": 177100 }, { "epoch": 0.16, "grad_norm": 186.0, "learning_rate": 4.253270826209801e-05, "loss": 1.1305, "step": 177200 }, { "epoch": 0.16, "grad_norm": 255.0, "learning_rate": 4.2528209188907085e-05, "loss": 1.0669, "step": 177300 }, { "epoch": 0.16, "grad_norm": 39.5, "learning_rate": 4.252371011571617e-05, "loss": 1.2703, "step": 177400 }, { "epoch": 0.16, "grad_norm": 6.15625, "learning_rate": 4.251921104252524e-05, "loss": 1.2375, "step": 177500 }, { "epoch": 0.16, "grad_norm": 4000.0, "learning_rate": 4.251471196933432e-05, "loss": 1.1928, "step": 177600 }, { "epoch": 0.16, "grad_norm": 58.75, "learning_rate": 4.25102128961434e-05, "loss": 1.2152, "step": 177700 }, { "epoch": 0.16, "grad_norm": 0.29296875, "learning_rate": 4.2505713822952476e-05, "loss": 1.1017, "step": 177800 }, { "epoch": 0.16, "grad_norm": 28.625, "learning_rate": 4.250121474976155e-05, "loss": 1.1363, "step": 177900 }, { "epoch": 0.16, "grad_norm": 9.8125, "learning_rate": 4.249671567657063e-05, "loss": 1.1717, "step": 178000 }, { "epoch": 0.16, "grad_norm": 26.25, "learning_rate": 4.24922166033797e-05, "loss": 1.0504, "step": 178100 }, { "epoch": 0.16, "grad_norm": 173.0, "learning_rate": 4.2487717530188784e-05, "loss": 1.051, "step": 178200 }, { "epoch": 0.16, "grad_norm": 128.0, "learning_rate": 4.248321845699786e-05, "loss": 1.2008, "step": 178300 }, { "epoch": 0.16, "grad_norm": 29.25, "learning_rate": 4.2478719383806934e-05, "loss": 1.0372, "step": 178400 }, { "epoch": 0.16, "grad_norm": 38.25, "learning_rate": 4.2474220310616017e-05, "loss": 1.229, "step": 178500 }, { "epoch": 0.16, "grad_norm": 44.5, "learning_rate": 4.246972123742509e-05, "loss": 1.1811, "step": 178600 }, { "epoch": 0.16, "grad_norm": 33.25, "learning_rate": 4.246522216423417e-05, "loss": 1.3114, "step": 178700 }, { "epoch": 0.16, "grad_norm": 54.75, "learning_rate": 4.246072309104325e-05, "loss": 1.3516, "step": 178800 }, { "epoch": 0.16, "grad_norm": 44.75, "learning_rate": 4.2456224017852325e-05, "loss": 1.2318, "step": 178900 }, { "epoch": 0.16, "grad_norm": 25.875, "learning_rate": 4.24517249446614e-05, "loss": 1.2588, "step": 179000 }, { "epoch": 0.16, "grad_norm": 4.375, "learning_rate": 4.244722587147048e-05, "loss": 1.0803, "step": 179100 }, { "epoch": 0.16, "grad_norm": 81.5, "learning_rate": 4.244272679827956e-05, "loss": 1.1941, "step": 179200 }, { "epoch": 0.16, "grad_norm": 23.0, "learning_rate": 4.243822772508863e-05, "loss": 1.3492, "step": 179300 }, { "epoch": 0.16, "grad_norm": 51.75, "learning_rate": 4.243372865189771e-05, "loss": 1.4025, "step": 179400 }, { "epoch": 0.16, "grad_norm": 0.11767578125, "learning_rate": 4.2429229578706784e-05, "loss": 1.1006, "step": 179500 }, { "epoch": 0.16, "grad_norm": 19.125, "learning_rate": 4.2424730505515866e-05, "loss": 1.2816, "step": 179600 }, { "epoch": 0.16, "grad_norm": 10.6875, "learning_rate": 4.242023143232494e-05, "loss": 1.3625, "step": 179700 }, { "epoch": 0.16, "grad_norm": 48.0, "learning_rate": 4.2415732359134016e-05, "loss": 1.1887, "step": 179800 }, { "epoch": 0.16, "grad_norm": 41.0, "learning_rate": 4.24112332859431e-05, "loss": 1.2855, "step": 179900 }, { "epoch": 0.16, "grad_norm": 45.75, "learning_rate": 4.2406734212752174e-05, "loss": 1.3335, "step": 180000 }, { "epoch": 0.16, "grad_norm": 19.375, "learning_rate": 4.2402235139561256e-05, "loss": 1.2829, "step": 180100 }, { "epoch": 0.16, "grad_norm": 0.0015411376953125, "learning_rate": 4.239773606637033e-05, "loss": 1.0416, "step": 180200 }, { "epoch": 0.16, "grad_norm": 17.125, "learning_rate": 4.2393236993179407e-05, "loss": 1.1039, "step": 180300 }, { "epoch": 0.16, "grad_norm": 19.625, "learning_rate": 4.238873791998849e-05, "loss": 1.0655, "step": 180400 }, { "epoch": 0.16, "grad_norm": 596.0, "learning_rate": 4.2384238846797564e-05, "loss": 1.1383, "step": 180500 }, { "epoch": 0.16, "grad_norm": 37.0, "learning_rate": 4.237973977360664e-05, "loss": 1.135, "step": 180600 }, { "epoch": 0.16, "grad_norm": 43.25, "learning_rate": 4.2375240700415715e-05, "loss": 1.0729, "step": 180700 }, { "epoch": 0.16, "grad_norm": 0.173828125, "learning_rate": 4.237074162722479e-05, "loss": 1.1227, "step": 180800 }, { "epoch": 0.16, "grad_norm": 6.40625, "learning_rate": 4.236624255403387e-05, "loss": 1.1162, "step": 180900 }, { "epoch": 0.16, "grad_norm": 18.0, "learning_rate": 4.236174348084295e-05, "loss": 1.1697, "step": 181000 }, { "epoch": 0.16, "grad_norm": 27.75, "learning_rate": 4.235724440765202e-05, "loss": 1.2325, "step": 181100 }, { "epoch": 0.16, "grad_norm": 29.875, "learning_rate": 4.2352745334461105e-05, "loss": 1.1419, "step": 181200 }, { "epoch": 0.16, "grad_norm": 17.25, "learning_rate": 4.234824626127018e-05, "loss": 1.3225, "step": 181300 }, { "epoch": 0.16, "grad_norm": 24.5, "learning_rate": 4.2343747188079256e-05, "loss": 1.2887, "step": 181400 }, { "epoch": 0.16, "grad_norm": 40.0, "learning_rate": 4.233924811488834e-05, "loss": 1.2148, "step": 181500 }, { "epoch": 0.16, "grad_norm": 0.0035247802734375, "learning_rate": 4.233474904169741e-05, "loss": 1.2054, "step": 181600 }, { "epoch": 0.16, "grad_norm": 0.00213623046875, "learning_rate": 4.233024996850649e-05, "loss": 1.0868, "step": 181700 }, { "epoch": 0.16, "grad_norm": 0.09326171875, "learning_rate": 4.232575089531557e-05, "loss": 1.1705, "step": 181800 }, { "epoch": 0.16, "grad_norm": 10.4375, "learning_rate": 4.2321251822124646e-05, "loss": 1.0112, "step": 181900 }, { "epoch": 0.16, "grad_norm": 56.25, "learning_rate": 4.231675274893372e-05, "loss": 1.1826, "step": 182000 }, { "epoch": 0.16, "grad_norm": 39.75, "learning_rate": 4.2312253675742797e-05, "loss": 1.1999, "step": 182100 }, { "epoch": 0.16, "grad_norm": 0.049072265625, "learning_rate": 4.230775460255187e-05, "loss": 1.3048, "step": 182200 }, { "epoch": 0.16, "grad_norm": 92.5, "learning_rate": 4.2303255529360954e-05, "loss": 1.2344, "step": 182300 }, { "epoch": 0.16, "grad_norm": 13.5, "learning_rate": 4.229875645617003e-05, "loss": 1.2589, "step": 182400 }, { "epoch": 0.16, "grad_norm": 2.546875, "learning_rate": 4.2294257382979105e-05, "loss": 1.1501, "step": 182500 }, { "epoch": 0.16, "grad_norm": 22.375, "learning_rate": 4.228975830978819e-05, "loss": 1.2442, "step": 182600 }, { "epoch": 0.16, "grad_norm": 22.25, "learning_rate": 4.228525923659726e-05, "loss": 1.3321, "step": 182700 }, { "epoch": 0.16, "grad_norm": 0.0159912109375, "learning_rate": 4.2280760163406344e-05, "loss": 1.1938, "step": 182800 }, { "epoch": 0.16, "grad_norm": 64.0, "learning_rate": 4.227626109021542e-05, "loss": 1.1458, "step": 182900 }, { "epoch": 0.16, "grad_norm": 200.0, "learning_rate": 4.2271762017024495e-05, "loss": 1.1515, "step": 183000 }, { "epoch": 0.16, "grad_norm": 42.5, "learning_rate": 4.226726294383358e-05, "loss": 1.3802, "step": 183100 }, { "epoch": 0.16, "grad_norm": 21.75, "learning_rate": 4.226276387064265e-05, "loss": 0.9465, "step": 183200 }, { "epoch": 0.16, "grad_norm": 69.5, "learning_rate": 4.225826479745172e-05, "loss": 1.0952, "step": 183300 }, { "epoch": 0.16, "grad_norm": 32.25, "learning_rate": 4.22537657242608e-05, "loss": 0.9934, "step": 183400 }, { "epoch": 0.16, "grad_norm": 48.0, "learning_rate": 4.224926665106988e-05, "loss": 1.1524, "step": 183500 }, { "epoch": 0.16, "grad_norm": 9.125, "learning_rate": 4.224476757787896e-05, "loss": 1.2027, "step": 183600 }, { "epoch": 0.16, "grad_norm": 25.875, "learning_rate": 4.2240268504688036e-05, "loss": 1.4629, "step": 183700 }, { "epoch": 0.16, "grad_norm": 5.78125, "learning_rate": 4.223576943149711e-05, "loss": 0.9975, "step": 183800 }, { "epoch": 0.16, "grad_norm": 20.125, "learning_rate": 4.223127035830619e-05, "loss": 1.2801, "step": 183900 }, { "epoch": 0.16, "grad_norm": 19.125, "learning_rate": 4.222677128511527e-05, "loss": 1.093, "step": 184000 }, { "epoch": 0.16, "grad_norm": 16.75, "learning_rate": 4.2222272211924344e-05, "loss": 1.081, "step": 184100 }, { "epoch": 0.16, "grad_norm": 179.0, "learning_rate": 4.2217773138733426e-05, "loss": 1.2437, "step": 184200 }, { "epoch": 0.16, "grad_norm": 10.5, "learning_rate": 4.22132740655425e-05, "loss": 1.1344, "step": 184300 }, { "epoch": 0.16, "grad_norm": 22.875, "learning_rate": 4.220877499235158e-05, "loss": 1.2239, "step": 184400 }, { "epoch": 0.16, "grad_norm": 114.5, "learning_rate": 4.220427591916066e-05, "loss": 1.2939, "step": 184500 }, { "epoch": 0.16, "grad_norm": 38.0, "learning_rate": 4.219977684596973e-05, "loss": 1.2764, "step": 184600 }, { "epoch": 0.16, "grad_norm": 209.0, "learning_rate": 4.219527777277881e-05, "loss": 1.2841, "step": 184700 }, { "epoch": 0.16, "grad_norm": 21.75, "learning_rate": 4.2190778699587885e-05, "loss": 1.3063, "step": 184800 }, { "epoch": 0.16, "grad_norm": 27.0, "learning_rate": 4.218627962639696e-05, "loss": 1.3656, "step": 184900 }, { "epoch": 0.16, "grad_norm": 105.0, "learning_rate": 4.218178055320604e-05, "loss": 1.1371, "step": 185000 }, { "epoch": 0.16, "grad_norm": 0.2265625, "learning_rate": 4.217728148001512e-05, "loss": 1.2233, "step": 185100 }, { "epoch": 0.16, "grad_norm": 71.0, "learning_rate": 4.217278240682419e-05, "loss": 1.1547, "step": 185200 }, { "epoch": 0.17, "grad_norm": 50.25, "learning_rate": 4.2168283333633275e-05, "loss": 1.1923, "step": 185300 }, { "epoch": 0.17, "grad_norm": 18.875, "learning_rate": 4.216378426044235e-05, "loss": 1.2171, "step": 185400 }, { "epoch": 0.17, "grad_norm": 33.25, "learning_rate": 4.215928518725143e-05, "loss": 1.305, "step": 185500 }, { "epoch": 0.17, "grad_norm": 10.5, "learning_rate": 4.215478611406051e-05, "loss": 1.2172, "step": 185600 }, { "epoch": 0.17, "grad_norm": 18.625, "learning_rate": 4.215028704086958e-05, "loss": 1.3026, "step": 185700 }, { "epoch": 0.17, "grad_norm": 22.875, "learning_rate": 4.2145787967678665e-05, "loss": 1.0829, "step": 185800 }, { "epoch": 0.17, "grad_norm": 0.0751953125, "learning_rate": 4.2141288894487734e-05, "loss": 1.1234, "step": 185900 }, { "epoch": 0.17, "grad_norm": 27.875, "learning_rate": 4.213678982129681e-05, "loss": 1.4107, "step": 186000 }, { "epoch": 0.17, "grad_norm": 199.0, "learning_rate": 4.213229074810589e-05, "loss": 1.2315, "step": 186100 }, { "epoch": 0.17, "grad_norm": 41.25, "learning_rate": 4.212779167491497e-05, "loss": 1.2325, "step": 186200 }, { "epoch": 0.17, "grad_norm": 266.0, "learning_rate": 4.212329260172405e-05, "loss": 1.2041, "step": 186300 }, { "epoch": 0.17, "grad_norm": 1.5546875, "learning_rate": 4.2118793528533124e-05, "loss": 1.2449, "step": 186400 }, { "epoch": 0.17, "grad_norm": 1.4453125, "learning_rate": 4.21142944553422e-05, "loss": 1.0151, "step": 186500 }, { "epoch": 0.17, "grad_norm": 0.36328125, "learning_rate": 4.210979538215128e-05, "loss": 1.1362, "step": 186600 }, { "epoch": 0.17, "grad_norm": 1.59375, "learning_rate": 4.210529630896036e-05, "loss": 1.2349, "step": 186700 }, { "epoch": 0.17, "grad_norm": 30.125, "learning_rate": 4.210079723576943e-05, "loss": 1.2757, "step": 186800 }, { "epoch": 0.17, "grad_norm": 65.5, "learning_rate": 4.2096298162578515e-05, "loss": 1.0808, "step": 186900 }, { "epoch": 0.17, "grad_norm": 240.0, "learning_rate": 4.209179908938759e-05, "loss": 1.132, "step": 187000 }, { "epoch": 0.17, "grad_norm": 20.375, "learning_rate": 4.2087300016196665e-05, "loss": 1.1575, "step": 187100 }, { "epoch": 0.17, "grad_norm": 0.08740234375, "learning_rate": 4.208280094300574e-05, "loss": 1.138, "step": 187200 }, { "epoch": 0.17, "grad_norm": 37.25, "learning_rate": 4.2078301869814816e-05, "loss": 1.1676, "step": 187300 }, { "epoch": 0.17, "grad_norm": 19.625, "learning_rate": 4.20738027966239e-05, "loss": 1.0593, "step": 187400 }, { "epoch": 0.17, "grad_norm": 0.2138671875, "learning_rate": 4.206930372343297e-05, "loss": 1.3202, "step": 187500 }, { "epoch": 0.17, "grad_norm": 16.875, "learning_rate": 4.206480465024205e-05, "loss": 1.1956, "step": 187600 }, { "epoch": 0.17, "grad_norm": 18.5, "learning_rate": 4.206030557705113e-05, "loss": 1.3435, "step": 187700 }, { "epoch": 0.17, "grad_norm": 49.5, "learning_rate": 4.2055806503860206e-05, "loss": 1.0993, "step": 187800 }, { "epoch": 0.17, "grad_norm": 39.25, "learning_rate": 4.205130743066928e-05, "loss": 1.0359, "step": 187900 }, { "epoch": 0.17, "grad_norm": 14.375, "learning_rate": 4.2046808357478364e-05, "loss": 1.2202, "step": 188000 }, { "epoch": 0.17, "grad_norm": 32.75, "learning_rate": 4.204230928428744e-05, "loss": 1.2804, "step": 188100 }, { "epoch": 0.17, "grad_norm": 46.25, "learning_rate": 4.203781021109652e-05, "loss": 1.2196, "step": 188200 }, { "epoch": 0.17, "grad_norm": 128.0, "learning_rate": 4.2033311137905596e-05, "loss": 1.1602, "step": 188300 }, { "epoch": 0.17, "grad_norm": 68.5, "learning_rate": 4.202881206471467e-05, "loss": 1.1784, "step": 188400 }, { "epoch": 0.17, "grad_norm": 114.0, "learning_rate": 4.202431299152375e-05, "loss": 1.3161, "step": 188500 }, { "epoch": 0.17, "grad_norm": 18.625, "learning_rate": 4.201981391833282e-05, "loss": 1.1077, "step": 188600 }, { "epoch": 0.17, "grad_norm": 140.0, "learning_rate": 4.20153148451419e-05, "loss": 1.2905, "step": 188700 }, { "epoch": 0.17, "grad_norm": 22.0, "learning_rate": 4.201081577195098e-05, "loss": 1.2771, "step": 188800 }, { "epoch": 0.17, "grad_norm": 10.0, "learning_rate": 4.2006316698760055e-05, "loss": 0.9896, "step": 188900 }, { "epoch": 0.17, "grad_norm": 28.125, "learning_rate": 4.200181762556914e-05, "loss": 1.144, "step": 189000 }, { "epoch": 0.17, "grad_norm": 86.0, "learning_rate": 4.199731855237821e-05, "loss": 1.0594, "step": 189100 }, { "epoch": 0.17, "grad_norm": 1560.0, "learning_rate": 4.199281947918729e-05, "loss": 1.2533, "step": 189200 }, { "epoch": 0.17, "grad_norm": 30.0, "learning_rate": 4.198832040599637e-05, "loss": 1.2242, "step": 189300 }, { "epoch": 0.17, "grad_norm": 21.25, "learning_rate": 4.1983821332805445e-05, "loss": 1.2011, "step": 189400 }, { "epoch": 0.17, "grad_norm": 28.5, "learning_rate": 4.197932225961452e-05, "loss": 1.2743, "step": 189500 }, { "epoch": 0.17, "grad_norm": 12.25, "learning_rate": 4.19748231864236e-05, "loss": 1.1596, "step": 189600 }, { "epoch": 0.17, "grad_norm": 45.75, "learning_rate": 4.197032411323268e-05, "loss": 1.1093, "step": 189700 }, { "epoch": 0.17, "grad_norm": 274.0, "learning_rate": 4.1965825040041754e-05, "loss": 1.2053, "step": 189800 }, { "epoch": 0.17, "grad_norm": 12.6875, "learning_rate": 4.196132596685083e-05, "loss": 1.1662, "step": 189900 }, { "epoch": 0.17, "grad_norm": 6.875, "learning_rate": 4.1956826893659904e-05, "loss": 1.1291, "step": 190000 }, { "epoch": 0.17, "grad_norm": 796.0, "learning_rate": 4.1952327820468986e-05, "loss": 1.1676, "step": 190100 }, { "epoch": 0.17, "grad_norm": 72.0, "learning_rate": 4.194782874727806e-05, "loss": 1.132, "step": 190200 }, { "epoch": 0.17, "grad_norm": 42.25, "learning_rate": 4.194332967408714e-05, "loss": 1.1253, "step": 190300 }, { "epoch": 0.17, "grad_norm": 68.0, "learning_rate": 4.193883060089622e-05, "loss": 1.2876, "step": 190400 }, { "epoch": 0.17, "grad_norm": 0.11083984375, "learning_rate": 4.1934331527705295e-05, "loss": 1.1226, "step": 190500 }, { "epoch": 0.17, "grad_norm": 37.0, "learning_rate": 4.192983245451437e-05, "loss": 1.0657, "step": 190600 }, { "epoch": 0.17, "grad_norm": 109.5, "learning_rate": 4.192533338132345e-05, "loss": 1.1886, "step": 190700 }, { "epoch": 0.17, "grad_norm": 24.5, "learning_rate": 4.192083430813253e-05, "loss": 1.2065, "step": 190800 }, { "epoch": 0.17, "grad_norm": 230.0, "learning_rate": 4.191633523494161e-05, "loss": 1.0951, "step": 190900 }, { "epoch": 0.17, "grad_norm": 40.25, "learning_rate": 4.1911836161750685e-05, "loss": 1.1631, "step": 191000 }, { "epoch": 0.17, "grad_norm": 16.875, "learning_rate": 4.190733708855975e-05, "loss": 1.1751, "step": 191100 }, { "epoch": 0.17, "grad_norm": 27.5, "learning_rate": 4.1902838015368835e-05, "loss": 1.2777, "step": 191200 }, { "epoch": 0.17, "grad_norm": 68.5, "learning_rate": 4.189833894217791e-05, "loss": 1.185, "step": 191300 }, { "epoch": 0.17, "grad_norm": 9.0, "learning_rate": 4.1893839868986986e-05, "loss": 1.1625, "step": 191400 }, { "epoch": 0.17, "grad_norm": 52.5, "learning_rate": 4.188934079579607e-05, "loss": 1.2173, "step": 191500 }, { "epoch": 0.17, "grad_norm": 46.0, "learning_rate": 4.1884841722605144e-05, "loss": 1.2283, "step": 191600 }, { "epoch": 0.17, "grad_norm": 16.625, "learning_rate": 4.1880342649414226e-05, "loss": 1.2837, "step": 191700 }, { "epoch": 0.17, "grad_norm": 9.8125, "learning_rate": 4.18758435762233e-05, "loss": 1.025, "step": 191800 }, { "epoch": 0.17, "grad_norm": 29.0, "learning_rate": 4.1871344503032376e-05, "loss": 1.0625, "step": 191900 }, { "epoch": 0.17, "grad_norm": 0.041015625, "learning_rate": 4.186684542984146e-05, "loss": 1.2094, "step": 192000 }, { "epoch": 0.17, "grad_norm": 75.0, "learning_rate": 4.1862346356650534e-05, "loss": 1.1954, "step": 192100 }, { "epoch": 0.17, "grad_norm": 22.625, "learning_rate": 4.185784728345961e-05, "loss": 1.1494, "step": 192200 }, { "epoch": 0.17, "grad_norm": 42.5, "learning_rate": 4.185334821026869e-05, "loss": 1.1898, "step": 192300 }, { "epoch": 0.17, "grad_norm": 58.25, "learning_rate": 4.184884913707776e-05, "loss": 1.1983, "step": 192400 }, { "epoch": 0.17, "grad_norm": 14.3125, "learning_rate": 4.184435006388684e-05, "loss": 1.0581, "step": 192500 }, { "epoch": 0.17, "grad_norm": 0.022705078125, "learning_rate": 4.183985099069592e-05, "loss": 1.1541, "step": 192600 }, { "epoch": 0.17, "grad_norm": 16.0, "learning_rate": 4.183535191750499e-05, "loss": 1.2466, "step": 192700 }, { "epoch": 0.17, "grad_norm": 160.0, "learning_rate": 4.1830852844314075e-05, "loss": 1.2106, "step": 192800 }, { "epoch": 0.17, "grad_norm": 34.5, "learning_rate": 4.182635377112315e-05, "loss": 1.1169, "step": 192900 }, { "epoch": 0.17, "grad_norm": 19.75, "learning_rate": 4.1821854697932225e-05, "loss": 1.3027, "step": 193000 }, { "epoch": 0.17, "grad_norm": 93.5, "learning_rate": 4.181735562474131e-05, "loss": 1.2137, "step": 193100 }, { "epoch": 0.17, "grad_norm": 37.5, "learning_rate": 4.181285655155038e-05, "loss": 1.0112, "step": 193200 }, { "epoch": 0.17, "grad_norm": 0.0206298828125, "learning_rate": 4.180835747835946e-05, "loss": 1.048, "step": 193300 }, { "epoch": 0.17, "grad_norm": 19.625, "learning_rate": 4.180385840516854e-05, "loss": 1.2389, "step": 193400 }, { "epoch": 0.17, "grad_norm": 18.625, "learning_rate": 4.1799359331977616e-05, "loss": 1.106, "step": 193500 }, { "epoch": 0.17, "grad_norm": 187.0, "learning_rate": 4.179486025878669e-05, "loss": 1.3622, "step": 193600 }, { "epoch": 0.17, "grad_norm": 38.5, "learning_rate": 4.1790361185595766e-05, "loss": 1.1716, "step": 193700 }, { "epoch": 0.17, "grad_norm": 28.875, "learning_rate": 4.178586211240484e-05, "loss": 1.1967, "step": 193800 }, { "epoch": 0.17, "grad_norm": 85.0, "learning_rate": 4.1781363039213924e-05, "loss": 1.1057, "step": 193900 }, { "epoch": 0.17, "grad_norm": 9.8125, "learning_rate": 4.1776863966023e-05, "loss": 1.1438, "step": 194000 }, { "epoch": 0.17, "grad_norm": 51.25, "learning_rate": 4.1772364892832075e-05, "loss": 1.1666, "step": 194100 }, { "epoch": 0.17, "grad_norm": 83.5, "learning_rate": 4.176786581964116e-05, "loss": 1.0818, "step": 194200 }, { "epoch": 0.17, "grad_norm": 58.75, "learning_rate": 4.176336674645023e-05, "loss": 1.0782, "step": 194300 }, { "epoch": 0.17, "grad_norm": 68.5, "learning_rate": 4.1758867673259314e-05, "loss": 1.1737, "step": 194400 }, { "epoch": 0.17, "grad_norm": 0.1396484375, "learning_rate": 4.175436860006839e-05, "loss": 1.1863, "step": 194500 }, { "epoch": 0.17, "grad_norm": 46.75, "learning_rate": 4.1749869526877465e-05, "loss": 1.2777, "step": 194600 }, { "epoch": 0.17, "grad_norm": 10.0625, "learning_rate": 4.174537045368655e-05, "loss": 1.2273, "step": 194700 }, { "epoch": 0.17, "grad_norm": 19.875, "learning_rate": 4.174087138049562e-05, "loss": 1.1958, "step": 194800 }, { "epoch": 0.17, "grad_norm": 8.25, "learning_rate": 4.17363723073047e-05, "loss": 1.3402, "step": 194900 }, { "epoch": 0.17, "grad_norm": 0.208984375, "learning_rate": 4.173187323411377e-05, "loss": 1.0688, "step": 195000 }, { "epoch": 0.17, "grad_norm": 25.0, "learning_rate": 4.172737416092285e-05, "loss": 1.2346, "step": 195100 }, { "epoch": 0.17, "grad_norm": 1.03125, "learning_rate": 4.172287508773193e-05, "loss": 1.1805, "step": 195200 }, { "epoch": 0.17, "grad_norm": 24.125, "learning_rate": 4.1718376014541006e-05, "loss": 1.068, "step": 195300 }, { "epoch": 0.17, "grad_norm": 12.75, "learning_rate": 4.171387694135008e-05, "loss": 1.1489, "step": 195400 }, { "epoch": 0.17, "grad_norm": 26.5, "learning_rate": 4.170937786815916e-05, "loss": 1.1357, "step": 195500 }, { "epoch": 0.17, "grad_norm": 123.0, "learning_rate": 4.170487879496824e-05, "loss": 1.1081, "step": 195600 }, { "epoch": 0.17, "grad_norm": 44.0, "learning_rate": 4.1700379721777314e-05, "loss": 1.1402, "step": 195700 }, { "epoch": 0.17, "grad_norm": 60.25, "learning_rate": 4.1695880648586396e-05, "loss": 1.0414, "step": 195800 }, { "epoch": 0.17, "grad_norm": 71.5, "learning_rate": 4.169138157539547e-05, "loss": 1.2874, "step": 195900 }, { "epoch": 0.17, "grad_norm": 840.0, "learning_rate": 4.168688250220455e-05, "loss": 1.2207, "step": 196000 }, { "epoch": 0.17, "grad_norm": 65.5, "learning_rate": 4.168238342901363e-05, "loss": 1.25, "step": 196100 }, { "epoch": 0.17, "grad_norm": 17.625, "learning_rate": 4.1677884355822704e-05, "loss": 1.2282, "step": 196200 }, { "epoch": 0.17, "grad_norm": 0.142578125, "learning_rate": 4.167338528263178e-05, "loss": 1.2121, "step": 196300 }, { "epoch": 0.17, "grad_norm": 0.1611328125, "learning_rate": 4.1668886209440855e-05, "loss": 1.2917, "step": 196400 }, { "epoch": 0.18, "grad_norm": 32.5, "learning_rate": 4.166438713624993e-05, "loss": 1.0817, "step": 196500 }, { "epoch": 0.18, "grad_norm": 40.25, "learning_rate": 4.165988806305901e-05, "loss": 1.2343, "step": 196600 }, { "epoch": 0.18, "grad_norm": 43.0, "learning_rate": 4.165538898986809e-05, "loss": 1.1656, "step": 196700 }, { "epoch": 0.18, "grad_norm": 10.9375, "learning_rate": 4.165088991667716e-05, "loss": 1.2031, "step": 196800 }, { "epoch": 0.18, "grad_norm": 42.25, "learning_rate": 4.1646390843486245e-05, "loss": 1.3176, "step": 196900 }, { "epoch": 0.18, "grad_norm": 35.0, "learning_rate": 4.164189177029532e-05, "loss": 1.1468, "step": 197000 }, { "epoch": 0.18, "grad_norm": 2.625, "learning_rate": 4.16373926971044e-05, "loss": 1.0494, "step": 197100 }, { "epoch": 0.18, "grad_norm": 8.1875, "learning_rate": 4.163289362391348e-05, "loss": 1.0928, "step": 197200 }, { "epoch": 0.18, "grad_norm": 486.0, "learning_rate": 4.162839455072255e-05, "loss": 1.2197, "step": 197300 }, { "epoch": 0.18, "grad_norm": 60.75, "learning_rate": 4.1623895477531635e-05, "loss": 1.2145, "step": 197400 }, { "epoch": 0.18, "grad_norm": 220.0, "learning_rate": 4.161939640434071e-05, "loss": 1.2995, "step": 197500 }, { "epoch": 0.18, "grad_norm": 40.5, "learning_rate": 4.161489733114978e-05, "loss": 1.0946, "step": 197600 }, { "epoch": 0.18, "grad_norm": 7.0625, "learning_rate": 4.161039825795886e-05, "loss": 1.1699, "step": 197700 }, { "epoch": 0.18, "grad_norm": 33.25, "learning_rate": 4.160589918476794e-05, "loss": 1.3221, "step": 197800 }, { "epoch": 0.18, "grad_norm": 94.5, "learning_rate": 4.160140011157702e-05, "loss": 1.287, "step": 197900 }, { "epoch": 0.18, "grad_norm": 32.0, "learning_rate": 4.1596901038386094e-05, "loss": 1.1507, "step": 198000 }, { "epoch": 0.18, "grad_norm": 22.375, "learning_rate": 4.159240196519517e-05, "loss": 1.2471, "step": 198100 }, { "epoch": 0.18, "grad_norm": 0.08349609375, "learning_rate": 4.158790289200425e-05, "loss": 1.2462, "step": 198200 }, { "epoch": 0.18, "grad_norm": 41.5, "learning_rate": 4.158340381881333e-05, "loss": 1.1003, "step": 198300 }, { "epoch": 0.18, "grad_norm": 36.25, "learning_rate": 4.15789047456224e-05, "loss": 1.1615, "step": 198400 }, { "epoch": 0.18, "grad_norm": 12.0625, "learning_rate": 4.1574405672431484e-05, "loss": 1.3816, "step": 198500 }, { "epoch": 0.18, "grad_norm": 0.06591796875, "learning_rate": 4.156990659924056e-05, "loss": 1.1632, "step": 198600 }, { "epoch": 0.18, "grad_norm": 25.125, "learning_rate": 4.1565407526049635e-05, "loss": 1.1957, "step": 198700 }, { "epoch": 0.18, "grad_norm": 24.625, "learning_rate": 4.156090845285872e-05, "loss": 1.3067, "step": 198800 }, { "epoch": 0.18, "grad_norm": 129.0, "learning_rate": 4.1556409379667786e-05, "loss": 1.1935, "step": 198900 }, { "epoch": 0.18, "grad_norm": 24.25, "learning_rate": 4.155191030647687e-05, "loss": 1.0834, "step": 199000 }, { "epoch": 0.18, "grad_norm": 9.8125, "learning_rate": 4.154741123328594e-05, "loss": 1.2487, "step": 199100 }, { "epoch": 0.18, "grad_norm": 47.0, "learning_rate": 4.154291216009502e-05, "loss": 1.3689, "step": 199200 }, { "epoch": 0.18, "grad_norm": 0.515625, "learning_rate": 4.15384130869041e-05, "loss": 1.0971, "step": 199300 }, { "epoch": 0.18, "grad_norm": 80.5, "learning_rate": 4.1533914013713176e-05, "loss": 1.1138, "step": 199400 }, { "epoch": 0.18, "grad_norm": 8.0625, "learning_rate": 4.152941494052225e-05, "loss": 1.0809, "step": 199500 }, { "epoch": 0.18, "grad_norm": 43.25, "learning_rate": 4.1524915867331333e-05, "loss": 1.1443, "step": 199600 }, { "epoch": 0.18, "grad_norm": 49.25, "learning_rate": 4.152041679414041e-05, "loss": 1.1211, "step": 199700 }, { "epoch": 0.18, "grad_norm": 20.625, "learning_rate": 4.151591772094949e-05, "loss": 1.1441, "step": 199800 }, { "epoch": 0.18, "grad_norm": 106.0, "learning_rate": 4.1511418647758566e-05, "loss": 1.2964, "step": 199900 }, { "epoch": 0.18, "grad_norm": 114.5, "learning_rate": 4.150691957456764e-05, "loss": 1.074, "step": 200000 }, { "epoch": 0.18, "grad_norm": 38.75, "learning_rate": 4.1502420501376724e-05, "loss": 1.1804, "step": 200100 }, { "epoch": 0.18, "grad_norm": 44.0, "learning_rate": 4.149792142818579e-05, "loss": 1.2302, "step": 200200 }, { "epoch": 0.18, "grad_norm": 19.0, "learning_rate": 4.149342235499487e-05, "loss": 1.2207, "step": 200300 }, { "epoch": 0.18, "grad_norm": 1.03125, "learning_rate": 4.148892328180395e-05, "loss": 1.2023, "step": 200400 }, { "epoch": 0.18, "grad_norm": 12.75, "learning_rate": 4.1484424208613025e-05, "loss": 1.4099, "step": 200500 }, { "epoch": 0.18, "grad_norm": 32.75, "learning_rate": 4.147992513542211e-05, "loss": 1.2869, "step": 200600 }, { "epoch": 0.18, "grad_norm": 73.5, "learning_rate": 4.147542606223118e-05, "loss": 1.3064, "step": 200700 }, { "epoch": 0.18, "grad_norm": 32.25, "learning_rate": 4.147092698904026e-05, "loss": 1.1199, "step": 200800 }, { "epoch": 0.18, "grad_norm": 58.75, "learning_rate": 4.146642791584934e-05, "loss": 1.1937, "step": 200900 }, { "epoch": 0.18, "grad_norm": 18.5, "learning_rate": 4.1461928842658415e-05, "loss": 1.1972, "step": 201000 }, { "epoch": 0.18, "grad_norm": 0.016845703125, "learning_rate": 4.145742976946749e-05, "loss": 1.1635, "step": 201100 }, { "epoch": 0.18, "grad_norm": 18.375, "learning_rate": 4.145293069627657e-05, "loss": 1.2409, "step": 201200 }, { "epoch": 0.18, "grad_norm": 48.25, "learning_rate": 4.144843162308565e-05, "loss": 1.1991, "step": 201300 }, { "epoch": 0.18, "grad_norm": 45.75, "learning_rate": 4.1443932549894723e-05, "loss": 1.2935, "step": 201400 }, { "epoch": 0.18, "grad_norm": 22.875, "learning_rate": 4.14394334767038e-05, "loss": 1.2154, "step": 201500 }, { "epoch": 0.18, "grad_norm": 29.75, "learning_rate": 4.1434934403512874e-05, "loss": 1.1718, "step": 201600 }, { "epoch": 0.18, "grad_norm": 19.75, "learning_rate": 4.1430435330321956e-05, "loss": 1.1829, "step": 201700 }, { "epoch": 0.18, "grad_norm": 37.5, "learning_rate": 4.142593625713103e-05, "loss": 1.2179, "step": 201800 }, { "epoch": 0.18, "grad_norm": 2.453125, "learning_rate": 4.142143718394011e-05, "loss": 1.1942, "step": 201900 }, { "epoch": 0.18, "grad_norm": 50.75, "learning_rate": 4.141693811074919e-05, "loss": 1.034, "step": 202000 }, { "epoch": 0.18, "grad_norm": 31.25, "learning_rate": 4.1412439037558264e-05, "loss": 1.1917, "step": 202100 }, { "epoch": 0.18, "grad_norm": 40.75, "learning_rate": 4.140793996436734e-05, "loss": 1.2672, "step": 202200 }, { "epoch": 0.18, "grad_norm": 94.5, "learning_rate": 4.140344089117642e-05, "loss": 1.1245, "step": 202300 }, { "epoch": 0.18, "grad_norm": 42.5, "learning_rate": 4.13989418179855e-05, "loss": 1.2138, "step": 202400 }, { "epoch": 0.18, "grad_norm": 52.75, "learning_rate": 4.139444274479458e-05, "loss": 1.0524, "step": 202500 }, { "epoch": 0.18, "grad_norm": 40.5, "learning_rate": 4.1389943671603655e-05, "loss": 1.2177, "step": 202600 }, { "epoch": 0.18, "grad_norm": 18.25, "learning_rate": 4.138544459841273e-05, "loss": 1.086, "step": 202700 }, { "epoch": 0.18, "grad_norm": 23.625, "learning_rate": 4.1380945525221805e-05, "loss": 1.0988, "step": 202800 }, { "epoch": 0.18, "grad_norm": 0.09375, "learning_rate": 4.137644645203088e-05, "loss": 1.1845, "step": 202900 }, { "epoch": 0.18, "grad_norm": 0.00836181640625, "learning_rate": 4.1371947378839956e-05, "loss": 1.2906, "step": 203000 }, { "epoch": 0.18, "grad_norm": 0.1318359375, "learning_rate": 4.136744830564904e-05, "loss": 1.1791, "step": 203100 }, { "epoch": 0.18, "grad_norm": 28.75, "learning_rate": 4.1362949232458113e-05, "loss": 1.1983, "step": 203200 }, { "epoch": 0.18, "grad_norm": 21.125, "learning_rate": 4.1358450159267196e-05, "loss": 1.2232, "step": 203300 }, { "epoch": 0.18, "grad_norm": 23.5, "learning_rate": 4.135395108607627e-05, "loss": 1.1762, "step": 203400 }, { "epoch": 0.18, "grad_norm": 86.5, "learning_rate": 4.1349452012885346e-05, "loss": 1.2481, "step": 203500 }, { "epoch": 0.18, "grad_norm": 22.0, "learning_rate": 4.134495293969443e-05, "loss": 1.1948, "step": 203600 }, { "epoch": 0.18, "grad_norm": 0.232421875, "learning_rate": 4.1340453866503504e-05, "loss": 1.2403, "step": 203700 }, { "epoch": 0.18, "grad_norm": 27.25, "learning_rate": 4.133595479331258e-05, "loss": 1.2108, "step": 203800 }, { "epoch": 0.18, "grad_norm": 218.0, "learning_rate": 4.133145572012166e-05, "loss": 1.2229, "step": 203900 }, { "epoch": 0.18, "grad_norm": 0.07177734375, "learning_rate": 4.1326956646930736e-05, "loss": 1.0764, "step": 204000 }, { "epoch": 0.18, "grad_norm": 21.875, "learning_rate": 4.132245757373981e-05, "loss": 1.2225, "step": 204100 }, { "epoch": 0.18, "grad_norm": 16.0, "learning_rate": 4.131795850054889e-05, "loss": 1.098, "step": 204200 }, { "epoch": 0.18, "grad_norm": 17.0, "learning_rate": 4.131345942735796e-05, "loss": 1.3416, "step": 204300 }, { "epoch": 0.18, "grad_norm": 24.75, "learning_rate": 4.1308960354167045e-05, "loss": 1.153, "step": 204400 }, { "epoch": 0.18, "grad_norm": 29.375, "learning_rate": 4.130446128097612e-05, "loss": 1.2474, "step": 204500 }, { "epoch": 0.18, "grad_norm": 27.25, "learning_rate": 4.1299962207785195e-05, "loss": 1.078, "step": 204600 }, { "epoch": 0.18, "grad_norm": 10.0, "learning_rate": 4.129546313459428e-05, "loss": 1.3034, "step": 204700 }, { "epoch": 0.18, "grad_norm": 80.0, "learning_rate": 4.129096406140335e-05, "loss": 1.1916, "step": 204800 }, { "epoch": 0.18, "grad_norm": 39.5, "learning_rate": 4.128646498821243e-05, "loss": 1.2762, "step": 204900 }, { "epoch": 0.18, "grad_norm": 9.5625, "learning_rate": 4.128196591502151e-05, "loss": 1.1806, "step": 205000 }, { "epoch": 0.18, "grad_norm": 66.5, "learning_rate": 4.1277466841830586e-05, "loss": 1.1549, "step": 205100 }, { "epoch": 0.18, "grad_norm": 55.0, "learning_rate": 4.127296776863967e-05, "loss": 1.0268, "step": 205200 }, { "epoch": 0.18, "grad_norm": 69.5, "learning_rate": 4.126846869544874e-05, "loss": 1.4482, "step": 205300 }, { "epoch": 0.18, "grad_norm": 56.25, "learning_rate": 4.126396962225781e-05, "loss": 1.1934, "step": 205400 }, { "epoch": 0.18, "grad_norm": 38.25, "learning_rate": 4.1259470549066894e-05, "loss": 1.2673, "step": 205500 }, { "epoch": 0.18, "grad_norm": 72.0, "learning_rate": 4.125497147587597e-05, "loss": 1.2086, "step": 205600 }, { "epoch": 0.18, "grad_norm": 234.0, "learning_rate": 4.1250472402685044e-05, "loss": 1.3153, "step": 205700 }, { "epoch": 0.18, "grad_norm": 45.25, "learning_rate": 4.1245973329494126e-05, "loss": 1.1355, "step": 205800 }, { "epoch": 0.18, "grad_norm": 17.5, "learning_rate": 4.12414742563032e-05, "loss": 1.0873, "step": 205900 }, { "epoch": 0.18, "grad_norm": 42.5, "learning_rate": 4.1236975183112284e-05, "loss": 1.1588, "step": 206000 }, { "epoch": 0.18, "grad_norm": 21.625, "learning_rate": 4.123247610992136e-05, "loss": 1.0837, "step": 206100 }, { "epoch": 0.18, "grad_norm": 20.5, "learning_rate": 4.1227977036730435e-05, "loss": 1.102, "step": 206200 }, { "epoch": 0.18, "grad_norm": 37.25, "learning_rate": 4.122347796353952e-05, "loss": 1.1567, "step": 206300 }, { "epoch": 0.18, "grad_norm": 140.0, "learning_rate": 4.121897889034859e-05, "loss": 1.1777, "step": 206400 }, { "epoch": 0.18, "grad_norm": 21.875, "learning_rate": 4.121447981715767e-05, "loss": 1.1418, "step": 206500 }, { "epoch": 0.18, "grad_norm": 16.125, "learning_rate": 4.120998074396675e-05, "loss": 1.165, "step": 206600 }, { "epoch": 0.18, "grad_norm": 70.0, "learning_rate": 4.120548167077582e-05, "loss": 1.1417, "step": 206700 }, { "epoch": 0.18, "grad_norm": 340.0, "learning_rate": 4.12009825975849e-05, "loss": 1.2375, "step": 206800 }, { "epoch": 0.18, "grad_norm": 37.25, "learning_rate": 4.1196483524393976e-05, "loss": 1.3143, "step": 206900 }, { "epoch": 0.18, "grad_norm": 9.125, "learning_rate": 4.119198445120305e-05, "loss": 1.1753, "step": 207000 }, { "epoch": 0.18, "grad_norm": 0.006439208984375, "learning_rate": 4.118748537801213e-05, "loss": 1.1464, "step": 207100 }, { "epoch": 0.18, "grad_norm": 98.0, "learning_rate": 4.118298630482121e-05, "loss": 1.2333, "step": 207200 }, { "epoch": 0.18, "grad_norm": 29.875, "learning_rate": 4.1178487231630284e-05, "loss": 1.0748, "step": 207300 }, { "epoch": 0.18, "grad_norm": 19.875, "learning_rate": 4.1173988158439366e-05, "loss": 1.1062, "step": 207400 }, { "epoch": 0.18, "grad_norm": 0.2578125, "learning_rate": 4.116948908524844e-05, "loss": 1.2516, "step": 207500 }, { "epoch": 0.18, "grad_norm": 48.5, "learning_rate": 4.1164990012057516e-05, "loss": 1.1372, "step": 207600 }, { "epoch": 0.19, "grad_norm": 196.0, "learning_rate": 4.11604909388666e-05, "loss": 1.2264, "step": 207700 }, { "epoch": 0.19, "grad_norm": 14.75, "learning_rate": 4.1155991865675674e-05, "loss": 1.1682, "step": 207800 }, { "epoch": 0.19, "grad_norm": 27.125, "learning_rate": 4.1151492792484756e-05, "loss": 1.2962, "step": 207900 }, { "epoch": 0.19, "grad_norm": 95.5, "learning_rate": 4.1146993719293825e-05, "loss": 1.3444, "step": 208000 }, { "epoch": 0.19, "grad_norm": 6.28125, "learning_rate": 4.11424946461029e-05, "loss": 1.2622, "step": 208100 }, { "epoch": 0.19, "grad_norm": 0.0250244140625, "learning_rate": 4.113799557291198e-05, "loss": 1.2264, "step": 208200 }, { "epoch": 0.19, "grad_norm": 7.03125, "learning_rate": 4.113349649972106e-05, "loss": 1.0497, "step": 208300 }, { "epoch": 0.19, "grad_norm": 145.0, "learning_rate": 4.112899742653013e-05, "loss": 1.1898, "step": 208400 }, { "epoch": 0.19, "grad_norm": 2.359375, "learning_rate": 4.1124498353339215e-05, "loss": 1.5019, "step": 208500 }, { "epoch": 0.19, "grad_norm": 29.875, "learning_rate": 4.111999928014829e-05, "loss": 1.0873, "step": 208600 }, { "epoch": 0.19, "grad_norm": 37.25, "learning_rate": 4.111550020695737e-05, "loss": 1.3849, "step": 208700 }, { "epoch": 0.19, "grad_norm": 37.0, "learning_rate": 4.111100113376645e-05, "loss": 1.1838, "step": 208800 }, { "epoch": 0.19, "grad_norm": 97.0, "learning_rate": 4.110650206057552e-05, "loss": 1.1726, "step": 208900 }, { "epoch": 0.19, "grad_norm": 15.3125, "learning_rate": 4.1102002987384605e-05, "loss": 1.1309, "step": 209000 }, { "epoch": 0.19, "grad_norm": 42.0, "learning_rate": 4.109750391419368e-05, "loss": 1.2736, "step": 209100 }, { "epoch": 0.19, "grad_norm": 77.5, "learning_rate": 4.1093004841002756e-05, "loss": 1.2169, "step": 209200 }, { "epoch": 0.19, "grad_norm": 8.25, "learning_rate": 4.108850576781183e-05, "loss": 1.2611, "step": 209300 }, { "epoch": 0.19, "grad_norm": 0.00958251953125, "learning_rate": 4.1084006694620906e-05, "loss": 1.1536, "step": 209400 }, { "epoch": 0.19, "grad_norm": 23.0, "learning_rate": 4.107950762142999e-05, "loss": 1.1543, "step": 209500 }, { "epoch": 0.19, "grad_norm": 21.375, "learning_rate": 4.1075008548239064e-05, "loss": 1.1865, "step": 209600 }, { "epoch": 0.19, "grad_norm": 8.3125, "learning_rate": 4.107050947504814e-05, "loss": 1.162, "step": 209700 }, { "epoch": 0.19, "grad_norm": 7.46875, "learning_rate": 4.106601040185722e-05, "loss": 1.2374, "step": 209800 }, { "epoch": 0.19, "grad_norm": 436.0, "learning_rate": 4.10615113286663e-05, "loss": 1.0108, "step": 209900 }, { "epoch": 0.19, "grad_norm": 34.25, "learning_rate": 4.105701225547537e-05, "loss": 1.1299, "step": 210000 }, { "epoch": 0.19, "grad_norm": 0.25390625, "learning_rate": 4.1052513182284454e-05, "loss": 1.0734, "step": 210100 }, { "epoch": 0.19, "grad_norm": 29.75, "learning_rate": 4.104801410909353e-05, "loss": 1.1698, "step": 210200 }, { "epoch": 0.19, "grad_norm": 44.0, "learning_rate": 4.1043515035902605e-05, "loss": 1.2316, "step": 210300 }, { "epoch": 0.19, "grad_norm": 60.0, "learning_rate": 4.103901596271169e-05, "loss": 1.1693, "step": 210400 }, { "epoch": 0.19, "grad_norm": 124.5, "learning_rate": 4.103451688952076e-05, "loss": 1.1764, "step": 210500 }, { "epoch": 0.19, "grad_norm": 0.056396484375, "learning_rate": 4.103001781632984e-05, "loss": 1.254, "step": 210600 }, { "epoch": 0.19, "grad_norm": 47.5, "learning_rate": 4.102551874313891e-05, "loss": 1.0526, "step": 210700 }, { "epoch": 0.19, "grad_norm": 16.125, "learning_rate": 4.102101966994799e-05, "loss": 1.3679, "step": 210800 }, { "epoch": 0.19, "grad_norm": 24.25, "learning_rate": 4.101652059675707e-05, "loss": 1.2325, "step": 210900 }, { "epoch": 0.19, "grad_norm": 24.25, "learning_rate": 4.1012021523566146e-05, "loss": 1.199, "step": 211000 }, { "epoch": 0.19, "grad_norm": 0.036865234375, "learning_rate": 4.100752245037522e-05, "loss": 1.0996, "step": 211100 }, { "epoch": 0.19, "grad_norm": 0.05322265625, "learning_rate": 4.10030233771843e-05, "loss": 1.1855, "step": 211200 }, { "epoch": 0.19, "grad_norm": 19.375, "learning_rate": 4.099852430399338e-05, "loss": 1.3936, "step": 211300 }, { "epoch": 0.19, "grad_norm": 19.0, "learning_rate": 4.099402523080246e-05, "loss": 1.2044, "step": 211400 }, { "epoch": 0.19, "grad_norm": 56.5, "learning_rate": 4.0989526157611536e-05, "loss": 1.1683, "step": 211500 }, { "epoch": 0.19, "grad_norm": 0.36328125, "learning_rate": 4.098502708442061e-05, "loss": 1.2577, "step": 211600 }, { "epoch": 0.19, "grad_norm": 9.9375, "learning_rate": 4.0980528011229693e-05, "loss": 1.2948, "step": 211700 }, { "epoch": 0.19, "grad_norm": 28.125, "learning_rate": 4.097602893803877e-05, "loss": 1.2553, "step": 211800 }, { "epoch": 0.19, "grad_norm": 56.0, "learning_rate": 4.097152986484784e-05, "loss": 1.2418, "step": 211900 }, { "epoch": 0.19, "grad_norm": 11.5625, "learning_rate": 4.096703079165692e-05, "loss": 1.0349, "step": 212000 }, { "epoch": 0.19, "grad_norm": 18.375, "learning_rate": 4.0962531718465995e-05, "loss": 1.2584, "step": 212100 }, { "epoch": 0.19, "grad_norm": 22.0, "learning_rate": 4.095803264527508e-05, "loss": 1.154, "step": 212200 }, { "epoch": 0.19, "grad_norm": 40.5, "learning_rate": 4.095353357208415e-05, "loss": 1.231, "step": 212300 }, { "epoch": 0.19, "grad_norm": 11.1875, "learning_rate": 4.094903449889323e-05, "loss": 1.2227, "step": 212400 }, { "epoch": 0.19, "grad_norm": 53.75, "learning_rate": 4.094453542570231e-05, "loss": 1.1749, "step": 212500 }, { "epoch": 0.19, "grad_norm": 24.125, "learning_rate": 4.0940036352511385e-05, "loss": 1.0617, "step": 212600 }, { "epoch": 0.19, "grad_norm": 26.375, "learning_rate": 4.093553727932046e-05, "loss": 1.3517, "step": 212700 }, { "epoch": 0.19, "grad_norm": 18.125, "learning_rate": 4.093103820612954e-05, "loss": 1.2856, "step": 212800 }, { "epoch": 0.19, "grad_norm": 19.75, "learning_rate": 4.092653913293862e-05, "loss": 1.0325, "step": 212900 }, { "epoch": 0.19, "grad_norm": 196.0, "learning_rate": 4.092204005974769e-05, "loss": 1.2372, "step": 213000 }, { "epoch": 0.19, "grad_norm": 9.9375, "learning_rate": 4.0917540986556775e-05, "loss": 1.1683, "step": 213100 }, { "epoch": 0.19, "grad_norm": 14.875, "learning_rate": 4.0913041913365844e-05, "loss": 1.3145, "step": 213200 }, { "epoch": 0.19, "grad_norm": 10.0625, "learning_rate": 4.0908542840174926e-05, "loss": 1.156, "step": 213300 }, { "epoch": 0.19, "grad_norm": 5.8125, "learning_rate": 4.0904043766984e-05, "loss": 1.198, "step": 213400 }, { "epoch": 0.19, "grad_norm": 56.0, "learning_rate": 4.089954469379308e-05, "loss": 1.2882, "step": 213500 }, { "epoch": 0.19, "grad_norm": 13.4375, "learning_rate": 4.089504562060216e-05, "loss": 1.2905, "step": 213600 }, { "epoch": 0.19, "grad_norm": 16.875, "learning_rate": 4.0890546547411234e-05, "loss": 1.2709, "step": 213700 }, { "epoch": 0.19, "grad_norm": 0.2060546875, "learning_rate": 4.088604747422031e-05, "loss": 1.1773, "step": 213800 }, { "epoch": 0.19, "grad_norm": 0.134765625, "learning_rate": 4.088154840102939e-05, "loss": 1.0958, "step": 213900 }, { "epoch": 0.19, "grad_norm": 0.333984375, "learning_rate": 4.087704932783847e-05, "loss": 1.0608, "step": 214000 }, { "epoch": 0.19, "grad_norm": 132.0, "learning_rate": 4.087255025464755e-05, "loss": 1.0888, "step": 214100 }, { "epoch": 0.19, "grad_norm": 114.5, "learning_rate": 4.0868051181456624e-05, "loss": 0.9883, "step": 214200 }, { "epoch": 0.19, "grad_norm": 60.25, "learning_rate": 4.08635521082657e-05, "loss": 1.2059, "step": 214300 }, { "epoch": 0.19, "grad_norm": 14.25, "learning_rate": 4.085905303507478e-05, "loss": 1.1498, "step": 214400 }, { "epoch": 0.19, "grad_norm": 26.375, "learning_rate": 4.085455396188385e-05, "loss": 1.0384, "step": 214500 }, { "epoch": 0.19, "grad_norm": 22.25, "learning_rate": 4.0850054888692926e-05, "loss": 1.2129, "step": 214600 }, { "epoch": 0.19, "grad_norm": 26.875, "learning_rate": 4.084555581550201e-05, "loss": 1.2073, "step": 214700 }, { "epoch": 0.19, "grad_norm": 25.25, "learning_rate": 4.084105674231108e-05, "loss": 1.1643, "step": 214800 }, { "epoch": 0.19, "grad_norm": 29.5, "learning_rate": 4.0836557669120165e-05, "loss": 1.213, "step": 214900 }, { "epoch": 0.19, "grad_norm": 76.5, "learning_rate": 4.083205859592924e-05, "loss": 1.298, "step": 215000 }, { "epoch": 0.19, "grad_norm": 39.75, "learning_rate": 4.0827559522738316e-05, "loss": 1.1712, "step": 215100 }, { "epoch": 0.19, "grad_norm": 11.0625, "learning_rate": 4.08230604495474e-05, "loss": 1.2301, "step": 215200 }, { "epoch": 0.19, "grad_norm": 12.4375, "learning_rate": 4.0818561376356473e-05, "loss": 1.2684, "step": 215300 }, { "epoch": 0.19, "grad_norm": 0.447265625, "learning_rate": 4.081406230316555e-05, "loss": 1.1148, "step": 215400 }, { "epoch": 0.19, "grad_norm": 245.0, "learning_rate": 4.080956322997463e-05, "loss": 1.2759, "step": 215500 }, { "epoch": 0.19, "grad_norm": 145.0, "learning_rate": 4.0805064156783706e-05, "loss": 1.0697, "step": 215600 }, { "epoch": 0.19, "grad_norm": 44.75, "learning_rate": 4.080056508359278e-05, "loss": 1.1672, "step": 215700 }, { "epoch": 0.19, "grad_norm": 31.125, "learning_rate": 4.079606601040186e-05, "loss": 1.2312, "step": 215800 }, { "epoch": 0.19, "grad_norm": 27.125, "learning_rate": 4.079156693721093e-05, "loss": 1.1621, "step": 215900 }, { "epoch": 0.19, "grad_norm": 14.4375, "learning_rate": 4.0787067864020014e-05, "loss": 1.1509, "step": 216000 }, { "epoch": 0.19, "grad_norm": 116.5, "learning_rate": 4.078256879082909e-05, "loss": 1.3298, "step": 216100 }, { "epoch": 0.19, "grad_norm": 23.625, "learning_rate": 4.0778069717638165e-05, "loss": 1.0676, "step": 216200 }, { "epoch": 0.19, "grad_norm": 8.9375, "learning_rate": 4.077357064444725e-05, "loss": 1.3096, "step": 216300 }, { "epoch": 0.19, "grad_norm": 23.375, "learning_rate": 4.076907157125632e-05, "loss": 1.1852, "step": 216400 }, { "epoch": 0.19, "grad_norm": 27.125, "learning_rate": 4.07645724980654e-05, "loss": 1.0775, "step": 216500 }, { "epoch": 0.19, "grad_norm": 13.4375, "learning_rate": 4.076007342487448e-05, "loss": 1.3339, "step": 216600 }, { "epoch": 0.19, "grad_norm": 254.0, "learning_rate": 4.0755574351683555e-05, "loss": 1.1699, "step": 216700 }, { "epoch": 0.19, "grad_norm": 442.0, "learning_rate": 4.075107527849264e-05, "loss": 1.1621, "step": 216800 }, { "epoch": 0.19, "grad_norm": 19.125, "learning_rate": 4.074657620530171e-05, "loss": 1.1292, "step": 216900 }, { "epoch": 0.19, "grad_norm": 13.125, "learning_rate": 4.074207713211079e-05, "loss": 1.2155, "step": 217000 }, { "epoch": 0.19, "grad_norm": 32.5, "learning_rate": 4.0737578058919863e-05, "loss": 1.2114, "step": 217100 }, { "epoch": 0.19, "grad_norm": 37.75, "learning_rate": 4.073307898572894e-05, "loss": 1.2032, "step": 217200 }, { "epoch": 0.19, "grad_norm": 18.25, "learning_rate": 4.0728579912538014e-05, "loss": 1.2392, "step": 217300 }, { "epoch": 0.19, "grad_norm": 116.5, "learning_rate": 4.0724080839347096e-05, "loss": 1.1625, "step": 217400 }, { "epoch": 0.19, "grad_norm": 10.25, "learning_rate": 4.071958176615617e-05, "loss": 1.2586, "step": 217500 }, { "epoch": 0.19, "grad_norm": 0.265625, "learning_rate": 4.0715082692965254e-05, "loss": 1.0202, "step": 217600 }, { "epoch": 0.19, "grad_norm": 79.5, "learning_rate": 4.071058361977433e-05, "loss": 1.1353, "step": 217700 }, { "epoch": 0.19, "grad_norm": 89.5, "learning_rate": 4.0706084546583404e-05, "loss": 1.3959, "step": 217800 }, { "epoch": 0.19, "grad_norm": 22.0, "learning_rate": 4.0701585473392486e-05, "loss": 1.2321, "step": 217900 }, { "epoch": 0.19, "grad_norm": 43.75, "learning_rate": 4.069708640020156e-05, "loss": 1.1816, "step": 218000 }, { "epoch": 0.19, "grad_norm": 47.0, "learning_rate": 4.069258732701064e-05, "loss": 1.2895, "step": 218100 }, { "epoch": 0.19, "grad_norm": 87.5, "learning_rate": 4.068808825381972e-05, "loss": 0.9913, "step": 218200 }, { "epoch": 0.19, "grad_norm": 202.0, "learning_rate": 4.0683589180628795e-05, "loss": 1.2628, "step": 218300 }, { "epoch": 0.19, "grad_norm": 21.875, "learning_rate": 4.067909010743787e-05, "loss": 1.1928, "step": 218400 }, { "epoch": 0.19, "grad_norm": 0.0947265625, "learning_rate": 4.0674591034246945e-05, "loss": 1.2083, "step": 218500 }, { "epoch": 0.19, "grad_norm": 2.828125, "learning_rate": 4.067009196105602e-05, "loss": 1.3128, "step": 218600 }, { "epoch": 0.19, "grad_norm": 100.5, "learning_rate": 4.06655928878651e-05, "loss": 1.1397, "step": 218700 }, { "epoch": 0.19, "grad_norm": 0.11865234375, "learning_rate": 4.066109381467418e-05, "loss": 1.2369, "step": 218800 }, { "epoch": 0.19, "grad_norm": 8.625, "learning_rate": 4.0656594741483253e-05, "loss": 1.0596, "step": 218900 }, { "epoch": 0.2, "grad_norm": 18.875, "learning_rate": 4.0652095668292336e-05, "loss": 1.1393, "step": 219000 }, { "epoch": 0.2, "grad_norm": 33.25, "learning_rate": 4.064759659510141e-05, "loss": 1.1817, "step": 219100 }, { "epoch": 0.2, "grad_norm": 79.5, "learning_rate": 4.0643097521910486e-05, "loss": 1.2787, "step": 219200 }, { "epoch": 0.2, "grad_norm": 29.5, "learning_rate": 4.063859844871957e-05, "loss": 1.265, "step": 219300 }, { "epoch": 0.2, "grad_norm": 0.024658203125, "learning_rate": 4.0634099375528644e-05, "loss": 1.2237, "step": 219400 }, { "epoch": 0.2, "grad_norm": 59.5, "learning_rate": 4.0629600302337726e-05, "loss": 1.316, "step": 219500 }, { "epoch": 0.2, "grad_norm": 69.0, "learning_rate": 4.06251012291468e-05, "loss": 1.0299, "step": 219600 }, { "epoch": 0.2, "grad_norm": 20.375, "learning_rate": 4.062060215595587e-05, "loss": 1.1371, "step": 219700 }, { "epoch": 0.2, "grad_norm": 0.326171875, "learning_rate": 4.061610308276495e-05, "loss": 1.2016, "step": 219800 }, { "epoch": 0.2, "grad_norm": 111.5, "learning_rate": 4.061160400957403e-05, "loss": 1.2443, "step": 219900 }, { "epoch": 0.2, "grad_norm": 25.75, "learning_rate": 4.06071049363831e-05, "loss": 1.077, "step": 220000 }, { "epoch": 0.2, "grad_norm": 28.125, "learning_rate": 4.0602605863192185e-05, "loss": 1.2198, "step": 220100 }, { "epoch": 0.2, "grad_norm": 23.125, "learning_rate": 4.059810679000126e-05, "loss": 1.164, "step": 220200 }, { "epoch": 0.2, "grad_norm": 72.5, "learning_rate": 4.059360771681034e-05, "loss": 1.0561, "step": 220300 }, { "epoch": 0.2, "grad_norm": 26.25, "learning_rate": 4.058910864361942e-05, "loss": 1.1069, "step": 220400 }, { "epoch": 0.2, "grad_norm": 49.5, "learning_rate": 4.058460957042849e-05, "loss": 1.2199, "step": 220500 }, { "epoch": 0.2, "grad_norm": 25.25, "learning_rate": 4.0580110497237575e-05, "loss": 1.1114, "step": 220600 }, { "epoch": 0.2, "grad_norm": 47.0, "learning_rate": 4.057561142404665e-05, "loss": 1.3102, "step": 220700 }, { "epoch": 0.2, "grad_norm": 37.25, "learning_rate": 4.0571112350855726e-05, "loss": 1.293, "step": 220800 }, { "epoch": 0.2, "grad_norm": 88.5, "learning_rate": 4.056661327766481e-05, "loss": 1.1002, "step": 220900 }, { "epoch": 0.2, "grad_norm": 53.5, "learning_rate": 4.0562114204473876e-05, "loss": 1.2794, "step": 221000 }, { "epoch": 0.2, "grad_norm": 33.0, "learning_rate": 4.055761513128296e-05, "loss": 1.3038, "step": 221100 }, { "epoch": 0.2, "grad_norm": 63.25, "learning_rate": 4.0553116058092034e-05, "loss": 1.2728, "step": 221200 }, { "epoch": 0.2, "grad_norm": 0.08447265625, "learning_rate": 4.054861698490111e-05, "loss": 1.2417, "step": 221300 }, { "epoch": 0.2, "grad_norm": 13.3125, "learning_rate": 4.054411791171019e-05, "loss": 1.3289, "step": 221400 }, { "epoch": 0.2, "grad_norm": 8.8125, "learning_rate": 4.0539618838519266e-05, "loss": 1.0395, "step": 221500 }, { "epoch": 0.2, "grad_norm": 76.5, "learning_rate": 4.053511976532834e-05, "loss": 1.0644, "step": 221600 }, { "epoch": 0.2, "grad_norm": 14.3125, "learning_rate": 4.0530620692137424e-05, "loss": 1.1834, "step": 221700 }, { "epoch": 0.2, "grad_norm": 52.0, "learning_rate": 4.05261216189465e-05, "loss": 1.102, "step": 221800 }, { "epoch": 0.2, "grad_norm": 13.1875, "learning_rate": 4.0521622545755575e-05, "loss": 1.2857, "step": 221900 }, { "epoch": 0.2, "grad_norm": 116.0, "learning_rate": 4.051712347256466e-05, "loss": 1.1668, "step": 222000 }, { "epoch": 0.2, "grad_norm": 5.25, "learning_rate": 4.051262439937373e-05, "loss": 1.2486, "step": 222100 }, { "epoch": 0.2, "grad_norm": 28.5, "learning_rate": 4.0508125326182814e-05, "loss": 1.0911, "step": 222200 }, { "epoch": 0.2, "grad_norm": 23.0, "learning_rate": 4.050362625299188e-05, "loss": 1.171, "step": 222300 }, { "epoch": 0.2, "grad_norm": 50.5, "learning_rate": 4.049912717980096e-05, "loss": 1.2212, "step": 222400 }, { "epoch": 0.2, "grad_norm": 23.0, "learning_rate": 4.049462810661004e-05, "loss": 1.3627, "step": 222500 }, { "epoch": 0.2, "grad_norm": 19.75, "learning_rate": 4.0490129033419116e-05, "loss": 1.272, "step": 222600 }, { "epoch": 0.2, "grad_norm": 22.5, "learning_rate": 4.048562996022819e-05, "loss": 1.1911, "step": 222700 }, { "epoch": 0.2, "grad_norm": 66.5, "learning_rate": 4.048113088703727e-05, "loss": 1.2106, "step": 222800 }, { "epoch": 0.2, "grad_norm": 33.75, "learning_rate": 4.047663181384635e-05, "loss": 1.2051, "step": 222900 }, { "epoch": 0.2, "grad_norm": 55.75, "learning_rate": 4.047213274065543e-05, "loss": 1.0809, "step": 223000 }, { "epoch": 0.2, "grad_norm": 61.75, "learning_rate": 4.0467633667464506e-05, "loss": 1.2241, "step": 223100 }, { "epoch": 0.2, "grad_norm": 15.625, "learning_rate": 4.046313459427358e-05, "loss": 1.1545, "step": 223200 }, { "epoch": 0.2, "grad_norm": 18.75, "learning_rate": 4.045863552108266e-05, "loss": 1.0844, "step": 223300 }, { "epoch": 0.2, "grad_norm": 14.0, "learning_rate": 4.045413644789174e-05, "loss": 1.2074, "step": 223400 }, { "epoch": 0.2, "grad_norm": 8.75, "learning_rate": 4.0449637374700814e-05, "loss": 1.2002, "step": 223500 }, { "epoch": 0.2, "grad_norm": 9.5, "learning_rate": 4.044513830150989e-05, "loss": 1.276, "step": 223600 }, { "epoch": 0.2, "grad_norm": 83.0, "learning_rate": 4.0440639228318965e-05, "loss": 1.2091, "step": 223700 }, { "epoch": 0.2, "grad_norm": 16.0, "learning_rate": 4.043614015512805e-05, "loss": 1.2641, "step": 223800 }, { "epoch": 0.2, "grad_norm": 11.5625, "learning_rate": 4.043164108193712e-05, "loss": 1.1295, "step": 223900 }, { "epoch": 0.2, "grad_norm": 362.0, "learning_rate": 4.04271420087462e-05, "loss": 1.1258, "step": 224000 }, { "epoch": 0.2, "grad_norm": 44.5, "learning_rate": 4.042264293555528e-05, "loss": 1.1873, "step": 224100 }, { "epoch": 0.2, "grad_norm": 32.0, "learning_rate": 4.0418143862364355e-05, "loss": 1.2692, "step": 224200 }, { "epoch": 0.2, "grad_norm": 26.5, "learning_rate": 4.041364478917343e-05, "loss": 1.1327, "step": 224300 }, { "epoch": 0.2, "grad_norm": 5.90625, "learning_rate": 4.040914571598251e-05, "loss": 1.3219, "step": 224400 }, { "epoch": 0.2, "grad_norm": 1.4375, "learning_rate": 4.040464664279159e-05, "loss": 1.2652, "step": 224500 }, { "epoch": 0.2, "grad_norm": 45.0, "learning_rate": 4.040014756960066e-05, "loss": 1.2871, "step": 224600 }, { "epoch": 0.2, "grad_norm": 8.125, "learning_rate": 4.0395648496409745e-05, "loss": 1.1212, "step": 224700 }, { "epoch": 0.2, "grad_norm": 20.5, "learning_rate": 4.039114942321882e-05, "loss": 1.0369, "step": 224800 }, { "epoch": 0.2, "grad_norm": 33.5, "learning_rate": 4.0386650350027896e-05, "loss": 1.2802, "step": 224900 }, { "epoch": 0.2, "grad_norm": 0.0026397705078125, "learning_rate": 4.038215127683697e-05, "loss": 1.1381, "step": 225000 }, { "epoch": 0.2, "grad_norm": 7.15625, "learning_rate": 4.0377652203646046e-05, "loss": 1.3793, "step": 225100 }, { "epoch": 0.2, "grad_norm": 19.125, "learning_rate": 4.037315313045513e-05, "loss": 1.2149, "step": 225200 }, { "epoch": 0.2, "grad_norm": 21.75, "learning_rate": 4.0368654057264204e-05, "loss": 1.2249, "step": 225300 }, { "epoch": 0.2, "grad_norm": 38.75, "learning_rate": 4.036415498407328e-05, "loss": 1.2624, "step": 225400 }, { "epoch": 0.2, "grad_norm": 146.0, "learning_rate": 4.035965591088236e-05, "loss": 1.3057, "step": 225500 }, { "epoch": 0.2, "grad_norm": 0.030029296875, "learning_rate": 4.035515683769144e-05, "loss": 1.1855, "step": 225600 }, { "epoch": 0.2, "grad_norm": 18.125, "learning_rate": 4.035065776450052e-05, "loss": 1.0747, "step": 225700 }, { "epoch": 0.2, "grad_norm": 24.75, "learning_rate": 4.0346158691309594e-05, "loss": 1.2749, "step": 225800 }, { "epoch": 0.2, "grad_norm": 3.921875, "learning_rate": 4.034165961811867e-05, "loss": 1.2182, "step": 225900 }, { "epoch": 0.2, "grad_norm": 30.25, "learning_rate": 4.033716054492775e-05, "loss": 1.2729, "step": 226000 }, { "epoch": 0.2, "grad_norm": 33.75, "learning_rate": 4.033266147173683e-05, "loss": 1.1403, "step": 226100 }, { "epoch": 0.2, "grad_norm": 298.0, "learning_rate": 4.03281623985459e-05, "loss": 1.3445, "step": 226200 }, { "epoch": 0.2, "grad_norm": 350.0, "learning_rate": 4.032366332535498e-05, "loss": 1.2373, "step": 226300 }, { "epoch": 0.2, "grad_norm": 18.0, "learning_rate": 4.031916425216405e-05, "loss": 1.2, "step": 226400 }, { "epoch": 0.2, "grad_norm": 5.25, "learning_rate": 4.0314665178973135e-05, "loss": 1.3136, "step": 226500 }, { "epoch": 0.2, "grad_norm": 0.1640625, "learning_rate": 4.031016610578221e-05, "loss": 1.2127, "step": 226600 }, { "epoch": 0.2, "grad_norm": 20.375, "learning_rate": 4.0305667032591286e-05, "loss": 1.1985, "step": 226700 }, { "epoch": 0.2, "grad_norm": 1.8359375, "learning_rate": 4.030116795940037e-05, "loss": 1.2223, "step": 226800 }, { "epoch": 0.2, "grad_norm": 2.96875, "learning_rate": 4.029666888620944e-05, "loss": 1.2426, "step": 226900 }, { "epoch": 0.2, "grad_norm": 0.00701904296875, "learning_rate": 4.029216981301852e-05, "loss": 1.1825, "step": 227000 }, { "epoch": 0.2, "grad_norm": 0.55078125, "learning_rate": 4.02876707398276e-05, "loss": 1.3144, "step": 227100 }, { "epoch": 0.2, "grad_norm": 21.875, "learning_rate": 4.0283171666636676e-05, "loss": 1.3405, "step": 227200 }, { "epoch": 0.2, "grad_norm": 0.058349609375, "learning_rate": 4.027867259344575e-05, "loss": 1.1609, "step": 227300 }, { "epoch": 0.2, "grad_norm": 43.25, "learning_rate": 4.0274173520254833e-05, "loss": 1.0487, "step": 227400 }, { "epoch": 0.2, "grad_norm": 101.0, "learning_rate": 4.02696744470639e-05, "loss": 1.2605, "step": 227500 }, { "epoch": 0.2, "grad_norm": 11.125, "learning_rate": 4.0265175373872984e-05, "loss": 1.0055, "step": 227600 }, { "epoch": 0.2, "grad_norm": 52.25, "learning_rate": 4.026067630068206e-05, "loss": 1.1338, "step": 227700 }, { "epoch": 0.2, "grad_norm": 26.75, "learning_rate": 4.0256177227491135e-05, "loss": 1.148, "step": 227800 }, { "epoch": 0.2, "grad_norm": 15.625, "learning_rate": 4.025167815430022e-05, "loss": 1.2267, "step": 227900 }, { "epoch": 0.2, "grad_norm": 37.25, "learning_rate": 4.024717908110929e-05, "loss": 1.0464, "step": 228000 }, { "epoch": 0.2, "grad_norm": 32.75, "learning_rate": 4.024268000791837e-05, "loss": 1.1657, "step": 228100 }, { "epoch": 0.2, "grad_norm": 39.25, "learning_rate": 4.023818093472745e-05, "loss": 1.1317, "step": 228200 }, { "epoch": 0.2, "grad_norm": 17.75, "learning_rate": 4.0233681861536525e-05, "loss": 1.3315, "step": 228300 }, { "epoch": 0.2, "grad_norm": 72.0, "learning_rate": 4.022918278834561e-05, "loss": 1.146, "step": 228400 }, { "epoch": 0.2, "grad_norm": 13.375, "learning_rate": 4.022468371515468e-05, "loss": 1.1449, "step": 228500 }, { "epoch": 0.2, "grad_norm": 35.75, "learning_rate": 4.022018464196376e-05, "loss": 1.18, "step": 228600 }, { "epoch": 0.2, "grad_norm": 23.625, "learning_rate": 4.021568556877284e-05, "loss": 1.12, "step": 228700 }, { "epoch": 0.2, "grad_norm": 55.25, "learning_rate": 4.021118649558191e-05, "loss": 1.278, "step": 228800 }, { "epoch": 0.2, "grad_norm": 7.96875, "learning_rate": 4.0206687422390984e-05, "loss": 1.1042, "step": 228900 }, { "epoch": 0.2, "grad_norm": 0.33984375, "learning_rate": 4.0202188349200066e-05, "loss": 1.2166, "step": 229000 }, { "epoch": 0.2, "grad_norm": 9.0625, "learning_rate": 4.019768927600914e-05, "loss": 1.1626, "step": 229100 }, { "epoch": 0.2, "grad_norm": 39.25, "learning_rate": 4.0193190202818223e-05, "loss": 1.109, "step": 229200 }, { "epoch": 0.2, "grad_norm": 194.0, "learning_rate": 4.01886911296273e-05, "loss": 1.0713, "step": 229300 }, { "epoch": 0.2, "grad_norm": 51.0, "learning_rate": 4.0184192056436374e-05, "loss": 1.2785, "step": 229400 }, { "epoch": 0.2, "grad_norm": 19.5, "learning_rate": 4.0179692983245456e-05, "loss": 1.3195, "step": 229500 }, { "epoch": 0.2, "grad_norm": 75.5, "learning_rate": 4.017519391005453e-05, "loss": 1.0769, "step": 229600 }, { "epoch": 0.2, "grad_norm": 25.125, "learning_rate": 4.017069483686361e-05, "loss": 1.1878, "step": 229700 }, { "epoch": 0.2, "grad_norm": 25.875, "learning_rate": 4.016619576367269e-05, "loss": 1.1975, "step": 229800 }, { "epoch": 0.2, "grad_norm": 36.25, "learning_rate": 4.0161696690481764e-05, "loss": 1.2175, "step": 229900 }, { "epoch": 0.2, "grad_norm": 109.5, "learning_rate": 4.015719761729084e-05, "loss": 1.1586, "step": 230000 }, { "epoch": 0.2, "grad_norm": 18.125, "learning_rate": 4.0152698544099915e-05, "loss": 1.3735, "step": 230100 }, { "epoch": 0.21, "grad_norm": 0.26171875, "learning_rate": 4.014819947090899e-05, "loss": 1.223, "step": 230200 }, { "epoch": 0.21, "grad_norm": 34.0, "learning_rate": 4.014370039771807e-05, "loss": 1.0653, "step": 230300 }, { "epoch": 0.21, "grad_norm": 6.09375, "learning_rate": 4.013920132452715e-05, "loss": 1.2073, "step": 230400 }, { "epoch": 0.21, "grad_norm": 11.5, "learning_rate": 4.013470225133622e-05, "loss": 1.1292, "step": 230500 }, { "epoch": 0.21, "grad_norm": 50.75, "learning_rate": 4.0130203178145305e-05, "loss": 1.2277, "step": 230600 }, { "epoch": 0.21, "grad_norm": 456.0, "learning_rate": 4.012570410495438e-05, "loss": 1.1183, "step": 230700 }, { "epoch": 0.21, "grad_norm": 26.75, "learning_rate": 4.0121205031763456e-05, "loss": 1.1154, "step": 230800 }, { "epoch": 0.21, "grad_norm": 1.734375, "learning_rate": 4.011670595857254e-05, "loss": 1.1814, "step": 230900 }, { "epoch": 0.21, "grad_norm": 32.5, "learning_rate": 4.0112206885381613e-05, "loss": 1.198, "step": 231000 }, { "epoch": 0.21, "grad_norm": 52.0, "learning_rate": 4.0107707812190696e-05, "loss": 1.1281, "step": 231100 }, { "epoch": 0.21, "grad_norm": 23.375, "learning_rate": 4.010320873899977e-05, "loss": 1.1255, "step": 231200 }, { "epoch": 0.21, "grad_norm": 16.875, "learning_rate": 4.0098709665808846e-05, "loss": 1.2562, "step": 231300 }, { "epoch": 0.21, "grad_norm": 56.75, "learning_rate": 4.009421059261792e-05, "loss": 1.2504, "step": 231400 }, { "epoch": 0.21, "grad_norm": 68.5, "learning_rate": 4.0089711519427e-05, "loss": 1.1979, "step": 231500 }, { "epoch": 0.21, "grad_norm": 19.625, "learning_rate": 4.008521244623607e-05, "loss": 1.1168, "step": 231600 }, { "epoch": 0.21, "grad_norm": 25.25, "learning_rate": 4.0080713373045154e-05, "loss": 1.2779, "step": 231700 }, { "epoch": 0.21, "grad_norm": 9.1875, "learning_rate": 4.007621429985423e-05, "loss": 1.2426, "step": 231800 }, { "epoch": 0.21, "grad_norm": 25.25, "learning_rate": 4.007171522666331e-05, "loss": 1.2031, "step": 231900 }, { "epoch": 0.21, "grad_norm": 37.0, "learning_rate": 4.006721615347239e-05, "loss": 1.178, "step": 232000 }, { "epoch": 0.21, "grad_norm": 258.0, "learning_rate": 4.006271708028146e-05, "loss": 1.0253, "step": 232100 }, { "epoch": 0.21, "grad_norm": 34.75, "learning_rate": 4.0058218007090545e-05, "loss": 1.1623, "step": 232200 }, { "epoch": 0.21, "grad_norm": 26.75, "learning_rate": 4.005371893389962e-05, "loss": 1.2595, "step": 232300 }, { "epoch": 0.21, "grad_norm": 532.0, "learning_rate": 4.0049219860708695e-05, "loss": 1.2893, "step": 232400 }, { "epoch": 0.21, "grad_norm": 27.375, "learning_rate": 4.004472078751778e-05, "loss": 1.0809, "step": 232500 }, { "epoch": 0.21, "grad_norm": 168.0, "learning_rate": 4.004022171432685e-05, "loss": 1.068, "step": 232600 }, { "epoch": 0.21, "grad_norm": 68.5, "learning_rate": 4.003572264113593e-05, "loss": 1.1476, "step": 232700 }, { "epoch": 0.21, "grad_norm": 60.25, "learning_rate": 4.0031223567945003e-05, "loss": 1.1423, "step": 232800 }, { "epoch": 0.21, "grad_norm": 30.0, "learning_rate": 4.002672449475408e-05, "loss": 1.3094, "step": 232900 }, { "epoch": 0.21, "grad_norm": 0.03662109375, "learning_rate": 4.002222542156316e-05, "loss": 1.2968, "step": 233000 }, { "epoch": 0.21, "grad_norm": 23.25, "learning_rate": 4.0017726348372236e-05, "loss": 1.1127, "step": 233100 }, { "epoch": 0.21, "grad_norm": 396.0, "learning_rate": 4.001322727518131e-05, "loss": 1.2254, "step": 233200 }, { "epoch": 0.21, "grad_norm": 15.125, "learning_rate": 4.0008728201990394e-05, "loss": 1.2719, "step": 233300 }, { "epoch": 0.21, "grad_norm": 107.5, "learning_rate": 4.000422912879947e-05, "loss": 1.0719, "step": 233400 }, { "epoch": 0.21, "grad_norm": 12.25, "learning_rate": 3.9999730055608544e-05, "loss": 1.1556, "step": 233500 }, { "epoch": 0.21, "grad_norm": 152.0, "learning_rate": 3.9995230982417627e-05, "loss": 1.0694, "step": 233600 }, { "epoch": 0.21, "grad_norm": 6.28125, "learning_rate": 3.99907319092267e-05, "loss": 1.2125, "step": 233700 }, { "epoch": 0.21, "grad_norm": 11.75, "learning_rate": 3.9986232836035784e-05, "loss": 1.129, "step": 233800 }, { "epoch": 0.21, "grad_norm": 48.75, "learning_rate": 3.998173376284486e-05, "loss": 1.1163, "step": 233900 }, { "epoch": 0.21, "grad_norm": 15.75, "learning_rate": 3.997723468965393e-05, "loss": 1.1119, "step": 234000 }, { "epoch": 0.21, "grad_norm": 58.5, "learning_rate": 3.997273561646301e-05, "loss": 1.129, "step": 234100 }, { "epoch": 0.21, "grad_norm": 0.00106048583984375, "learning_rate": 3.9968236543272085e-05, "loss": 1.17, "step": 234200 }, { "epoch": 0.21, "grad_norm": 34.0, "learning_rate": 3.996373747008116e-05, "loss": 1.1715, "step": 234300 }, { "epoch": 0.21, "grad_norm": 5.28125, "learning_rate": 3.995923839689024e-05, "loss": 1.1322, "step": 234400 }, { "epoch": 0.21, "grad_norm": 91.0, "learning_rate": 3.995473932369932e-05, "loss": 1.2509, "step": 234500 }, { "epoch": 0.21, "grad_norm": 0.0052490234375, "learning_rate": 3.99502402505084e-05, "loss": 1.3465, "step": 234600 }, { "epoch": 0.21, "grad_norm": 29.25, "learning_rate": 3.9945741177317476e-05, "loss": 1.309, "step": 234700 }, { "epoch": 0.21, "grad_norm": 14.875, "learning_rate": 3.994124210412655e-05, "loss": 1.3001, "step": 234800 }, { "epoch": 0.21, "grad_norm": 0.01043701171875, "learning_rate": 3.993674303093563e-05, "loss": 1.1559, "step": 234900 }, { "epoch": 0.21, "grad_norm": 12.875, "learning_rate": 3.993224395774471e-05, "loss": 1.2696, "step": 235000 }, { "epoch": 0.21, "grad_norm": 17.125, "learning_rate": 3.9927744884553784e-05, "loss": 1.0966, "step": 235100 }, { "epoch": 0.21, "grad_norm": 22.75, "learning_rate": 3.9923245811362866e-05, "loss": 1.1111, "step": 235200 }, { "epoch": 0.21, "grad_norm": 17.0, "learning_rate": 3.9918746738171934e-05, "loss": 1.2537, "step": 235300 }, { "epoch": 0.21, "grad_norm": 51.75, "learning_rate": 3.9914247664981017e-05, "loss": 1.2658, "step": 235400 }, { "epoch": 0.21, "grad_norm": 34.25, "learning_rate": 3.990974859179009e-05, "loss": 1.0868, "step": 235500 }, { "epoch": 0.21, "grad_norm": 0.00494384765625, "learning_rate": 3.990524951859917e-05, "loss": 0.9567, "step": 235600 }, { "epoch": 0.21, "grad_norm": 96.5, "learning_rate": 3.990075044540825e-05, "loss": 1.2206, "step": 235700 }, { "epoch": 0.21, "grad_norm": 13.0, "learning_rate": 3.9896251372217325e-05, "loss": 1.0097, "step": 235800 }, { "epoch": 0.21, "grad_norm": 115.0, "learning_rate": 3.98917522990264e-05, "loss": 1.1683, "step": 235900 }, { "epoch": 0.21, "grad_norm": 39.5, "learning_rate": 3.988725322583548e-05, "loss": 1.0478, "step": 236000 }, { "epoch": 0.21, "grad_norm": 13.3125, "learning_rate": 3.988275415264456e-05, "loss": 1.4626, "step": 236100 }, { "epoch": 0.21, "grad_norm": 25.75, "learning_rate": 3.987825507945363e-05, "loss": 1.1941, "step": 236200 }, { "epoch": 0.21, "grad_norm": 45.25, "learning_rate": 3.9873756006262715e-05, "loss": 1.2885, "step": 236300 }, { "epoch": 0.21, "grad_norm": 14.1875, "learning_rate": 3.986925693307179e-05, "loss": 1.2543, "step": 236400 }, { "epoch": 0.21, "grad_norm": 0.1416015625, "learning_rate": 3.986475785988087e-05, "loss": 1.1691, "step": 236500 }, { "epoch": 0.21, "grad_norm": 42.0, "learning_rate": 3.986025878668994e-05, "loss": 1.1036, "step": 236600 }, { "epoch": 0.21, "grad_norm": 36.75, "learning_rate": 3.9855759713499016e-05, "loss": 1.1534, "step": 236700 }, { "epoch": 0.21, "grad_norm": 17.125, "learning_rate": 3.98512606403081e-05, "loss": 1.0397, "step": 236800 }, { "epoch": 0.21, "grad_norm": 31.5, "learning_rate": 3.9846761567117174e-05, "loss": 1.1877, "step": 236900 }, { "epoch": 0.21, "grad_norm": 31.0, "learning_rate": 3.984226249392625e-05, "loss": 1.0923, "step": 237000 }, { "epoch": 0.21, "grad_norm": 2.3125, "learning_rate": 3.983776342073533e-05, "loss": 1.1543, "step": 237100 }, { "epoch": 0.21, "grad_norm": 175.0, "learning_rate": 3.9833264347544407e-05, "loss": 1.1596, "step": 237200 }, { "epoch": 0.21, "grad_norm": 0.298828125, "learning_rate": 3.982876527435349e-05, "loss": 1.2475, "step": 237300 }, { "epoch": 0.21, "grad_norm": 12.125, "learning_rate": 3.9824266201162564e-05, "loss": 1.2365, "step": 237400 }, { "epoch": 0.21, "grad_norm": 118.0, "learning_rate": 3.981976712797164e-05, "loss": 1.0924, "step": 237500 }, { "epoch": 0.21, "grad_norm": 29.375, "learning_rate": 3.981526805478072e-05, "loss": 1.1695, "step": 237600 }, { "epoch": 0.21, "grad_norm": 37.5, "learning_rate": 3.98107689815898e-05, "loss": 1.2291, "step": 237700 }, { "epoch": 0.21, "grad_norm": 40.0, "learning_rate": 3.980626990839887e-05, "loss": 1.1649, "step": 237800 }, { "epoch": 0.21, "grad_norm": 47.75, "learning_rate": 3.980177083520795e-05, "loss": 1.0497, "step": 237900 }, { "epoch": 0.21, "grad_norm": 76.5, "learning_rate": 3.979727176201702e-05, "loss": 1.3253, "step": 238000 }, { "epoch": 0.21, "grad_norm": 16.625, "learning_rate": 3.9792772688826105e-05, "loss": 1.1352, "step": 238100 }, { "epoch": 0.21, "grad_norm": 14.8125, "learning_rate": 3.978827361563518e-05, "loss": 1.0653, "step": 238200 }, { "epoch": 0.21, "grad_norm": 24.625, "learning_rate": 3.9783774542444256e-05, "loss": 0.9743, "step": 238300 }, { "epoch": 0.21, "grad_norm": 25.875, "learning_rate": 3.977927546925334e-05, "loss": 1.1918, "step": 238400 }, { "epoch": 0.21, "grad_norm": 31.75, "learning_rate": 3.977477639606241e-05, "loss": 1.227, "step": 238500 }, { "epoch": 0.21, "grad_norm": 61.75, "learning_rate": 3.977027732287149e-05, "loss": 1.1184, "step": 238600 }, { "epoch": 0.21, "grad_norm": 48.25, "learning_rate": 3.976577824968057e-05, "loss": 1.1853, "step": 238700 }, { "epoch": 0.21, "grad_norm": 41.5, "learning_rate": 3.9761279176489646e-05, "loss": 1.1154, "step": 238800 }, { "epoch": 0.21, "grad_norm": 0.010009765625, "learning_rate": 3.975678010329872e-05, "loss": 1.3571, "step": 238900 }, { "epoch": 0.21, "grad_norm": 18.0, "learning_rate": 3.97522810301078e-05, "loss": 1.3166, "step": 239000 }, { "epoch": 0.21, "grad_norm": 9.3125, "learning_rate": 3.974778195691688e-05, "loss": 1.2893, "step": 239100 }, { "epoch": 0.21, "grad_norm": 20.75, "learning_rate": 3.9743282883725954e-05, "loss": 1.1967, "step": 239200 }, { "epoch": 0.21, "grad_norm": 51.5, "learning_rate": 3.973878381053503e-05, "loss": 1.2685, "step": 239300 }, { "epoch": 0.21, "grad_norm": 42.75, "learning_rate": 3.9734284737344105e-05, "loss": 1.1552, "step": 239400 }, { "epoch": 0.21, "grad_norm": 49.75, "learning_rate": 3.972978566415319e-05, "loss": 1.0932, "step": 239500 }, { "epoch": 0.21, "grad_norm": 17.125, "learning_rate": 3.972528659096226e-05, "loss": 1.1386, "step": 239600 }, { "epoch": 0.21, "grad_norm": 3.75, "learning_rate": 3.972078751777134e-05, "loss": 1.0738, "step": 239700 }, { "epoch": 0.21, "grad_norm": 34.5, "learning_rate": 3.971628844458042e-05, "loss": 1.0091, "step": 239800 }, { "epoch": 0.21, "grad_norm": 78.0, "learning_rate": 3.9711789371389495e-05, "loss": 1.0894, "step": 239900 }, { "epoch": 0.21, "grad_norm": 0.07568359375, "learning_rate": 3.970729029819858e-05, "loss": 1.2147, "step": 240000 }, { "epoch": 0.21, "grad_norm": 50.75, "learning_rate": 3.970279122500765e-05, "loss": 1.1033, "step": 240100 }, { "epoch": 0.21, "grad_norm": 83.0, "learning_rate": 3.969829215181673e-05, "loss": 1.1346, "step": 240200 }, { "epoch": 0.21, "grad_norm": 22.375, "learning_rate": 3.969379307862581e-05, "loss": 1.29, "step": 240300 }, { "epoch": 0.21, "grad_norm": 30.875, "learning_rate": 3.9689294005434885e-05, "loss": 0.9884, "step": 240400 }, { "epoch": 0.21, "grad_norm": 60.0, "learning_rate": 3.968479493224396e-05, "loss": 1.2222, "step": 240500 }, { "epoch": 0.21, "grad_norm": 42.5, "learning_rate": 3.9680295859053036e-05, "loss": 1.15, "step": 240600 }, { "epoch": 0.21, "grad_norm": 31.75, "learning_rate": 3.967579678586211e-05, "loss": 1.0979, "step": 240700 }, { "epoch": 0.21, "grad_norm": 27.875, "learning_rate": 3.967129771267119e-05, "loss": 1.2747, "step": 240800 }, { "epoch": 0.21, "grad_norm": 0.130859375, "learning_rate": 3.966679863948027e-05, "loss": 1.2433, "step": 240900 }, { "epoch": 0.21, "grad_norm": 18.25, "learning_rate": 3.9662299566289344e-05, "loss": 1.1585, "step": 241000 }, { "epoch": 0.21, "grad_norm": 22.875, "learning_rate": 3.9657800493098426e-05, "loss": 1.2039, "step": 241100 }, { "epoch": 0.21, "grad_norm": 86.5, "learning_rate": 3.96533014199075e-05, "loss": 1.206, "step": 241200 }, { "epoch": 0.21, "grad_norm": 0.126953125, "learning_rate": 3.964880234671658e-05, "loss": 1.2161, "step": 241300 }, { "epoch": 0.22, "grad_norm": 12.4375, "learning_rate": 3.964430327352566e-05, "loss": 1.2156, "step": 241400 }, { "epoch": 0.22, "grad_norm": 114.0, "learning_rate": 3.9639804200334734e-05, "loss": 1.1289, "step": 241500 }, { "epoch": 0.22, "grad_norm": 27.25, "learning_rate": 3.963530512714381e-05, "loss": 1.0126, "step": 241600 }, { "epoch": 0.22, "grad_norm": 23.375, "learning_rate": 3.963080605395289e-05, "loss": 1.1066, "step": 241700 }, { "epoch": 0.22, "grad_norm": 51.0, "learning_rate": 3.962630698076196e-05, "loss": 1.0179, "step": 241800 }, { "epoch": 0.22, "grad_norm": 25.75, "learning_rate": 3.962180790757104e-05, "loss": 1.1068, "step": 241900 }, { "epoch": 0.22, "grad_norm": 63.5, "learning_rate": 3.961730883438012e-05, "loss": 1.0146, "step": 242000 }, { "epoch": 0.22, "grad_norm": 14.6875, "learning_rate": 3.961280976118919e-05, "loss": 1.2853, "step": 242100 }, { "epoch": 0.22, "grad_norm": 69.0, "learning_rate": 3.9608310687998275e-05, "loss": 1.1743, "step": 242200 }, { "epoch": 0.22, "grad_norm": 2.21875, "learning_rate": 3.960381161480735e-05, "loss": 1.149, "step": 242300 }, { "epoch": 0.22, "grad_norm": 13.1875, "learning_rate": 3.9599312541616426e-05, "loss": 1.1604, "step": 242400 }, { "epoch": 0.22, "grad_norm": 8.875, "learning_rate": 3.959481346842551e-05, "loss": 1.3036, "step": 242500 }, { "epoch": 0.22, "grad_norm": 39.75, "learning_rate": 3.959031439523458e-05, "loss": 1.1397, "step": 242600 }, { "epoch": 0.22, "grad_norm": 72.5, "learning_rate": 3.9585815322043665e-05, "loss": 1.3535, "step": 242700 }, { "epoch": 0.22, "grad_norm": 26.625, "learning_rate": 3.958131624885274e-05, "loss": 1.2853, "step": 242800 }, { "epoch": 0.22, "grad_norm": 24.5, "learning_rate": 3.9576817175661816e-05, "loss": 1.2195, "step": 242900 }, { "epoch": 0.22, "grad_norm": 2.5625, "learning_rate": 3.95723181024709e-05, "loss": 1.1829, "step": 243000 }, { "epoch": 0.22, "grad_norm": 1.140625, "learning_rate": 3.956781902927997e-05, "loss": 1.0797, "step": 243100 }, { "epoch": 0.22, "grad_norm": 67.0, "learning_rate": 3.956331995608905e-05, "loss": 1.3246, "step": 243200 }, { "epoch": 0.22, "grad_norm": 2.734375, "learning_rate": 3.9558820882898124e-05, "loss": 1.1519, "step": 243300 }, { "epoch": 0.22, "grad_norm": 24.25, "learning_rate": 3.95543218097072e-05, "loss": 1.166, "step": 243400 }, { "epoch": 0.22, "grad_norm": 9.5625, "learning_rate": 3.954982273651628e-05, "loss": 1.1949, "step": 243500 }, { "epoch": 0.22, "grad_norm": 35.5, "learning_rate": 3.954532366332536e-05, "loss": 1.0722, "step": 243600 }, { "epoch": 0.22, "grad_norm": 15.5625, "learning_rate": 3.954082459013443e-05, "loss": 1.0706, "step": 243700 }, { "epoch": 0.22, "grad_norm": 32.5, "learning_rate": 3.9536325516943514e-05, "loss": 1.1307, "step": 243800 }, { "epoch": 0.22, "grad_norm": 23.875, "learning_rate": 3.953182644375259e-05, "loss": 1.331, "step": 243900 }, { "epoch": 0.22, "grad_norm": 59.5, "learning_rate": 3.9527327370561665e-05, "loss": 1.1699, "step": 244000 }, { "epoch": 0.22, "grad_norm": 19.0, "learning_rate": 3.952282829737075e-05, "loss": 1.2879, "step": 244100 }, { "epoch": 0.22, "grad_norm": 92.5, "learning_rate": 3.951832922417982e-05, "loss": 1.3037, "step": 244200 }, { "epoch": 0.22, "grad_norm": 19.125, "learning_rate": 3.95138301509889e-05, "loss": 1.0337, "step": 244300 }, { "epoch": 0.22, "grad_norm": 62.0, "learning_rate": 3.950933107779797e-05, "loss": 1.1418, "step": 244400 }, { "epoch": 0.22, "grad_norm": 11.1875, "learning_rate": 3.950483200460705e-05, "loss": 1.3614, "step": 244500 }, { "epoch": 0.22, "grad_norm": 11.0625, "learning_rate": 3.950033293141613e-05, "loss": 1.1876, "step": 244600 }, { "epoch": 0.22, "grad_norm": 30.0, "learning_rate": 3.9495833858225206e-05, "loss": 1.2167, "step": 244700 }, { "epoch": 0.22, "grad_norm": 45.25, "learning_rate": 3.949133478503428e-05, "loss": 1.1304, "step": 244800 }, { "epoch": 0.22, "grad_norm": 68.5, "learning_rate": 3.9486835711843364e-05, "loss": 1.3406, "step": 244900 }, { "epoch": 0.22, "grad_norm": 40.25, "learning_rate": 3.948233663865244e-05, "loss": 1.1495, "step": 245000 }, { "epoch": 0.22, "grad_norm": 11.25, "learning_rate": 3.9477837565461514e-05, "loss": 1.2369, "step": 245100 }, { "epoch": 0.22, "grad_norm": 42.25, "learning_rate": 3.9473338492270596e-05, "loss": 1.1874, "step": 245200 }, { "epoch": 0.22, "grad_norm": 19.0, "learning_rate": 3.946883941907967e-05, "loss": 1.2262, "step": 245300 }, { "epoch": 0.22, "grad_norm": 17.75, "learning_rate": 3.9464340345888754e-05, "loss": 1.022, "step": 245400 }, { "epoch": 0.22, "grad_norm": 8.625, "learning_rate": 3.945984127269783e-05, "loss": 1.2238, "step": 245500 }, { "epoch": 0.22, "grad_norm": 27.625, "learning_rate": 3.9455342199506904e-05, "loss": 1.1759, "step": 245600 }, { "epoch": 0.22, "grad_norm": 1.7109375, "learning_rate": 3.945084312631598e-05, "loss": 1.078, "step": 245700 }, { "epoch": 0.22, "grad_norm": 0.0036468505859375, "learning_rate": 3.9446344053125055e-05, "loss": 1.1926, "step": 245800 }, { "epoch": 0.22, "grad_norm": 0.1875, "learning_rate": 3.944184497993413e-05, "loss": 1.03, "step": 245900 }, { "epoch": 0.22, "grad_norm": 15.6875, "learning_rate": 3.943734590674321e-05, "loss": 1.3733, "step": 246000 }, { "epoch": 0.22, "grad_norm": 27.875, "learning_rate": 3.943284683355229e-05, "loss": 1.2373, "step": 246100 }, { "epoch": 0.22, "grad_norm": 236.0, "learning_rate": 3.942834776036137e-05, "loss": 1.2598, "step": 246200 }, { "epoch": 0.22, "grad_norm": 220.0, "learning_rate": 3.9423848687170445e-05, "loss": 1.1383, "step": 246300 }, { "epoch": 0.22, "grad_norm": 1.0703125, "learning_rate": 3.941934961397952e-05, "loss": 1.2764, "step": 246400 }, { "epoch": 0.22, "grad_norm": 223.0, "learning_rate": 3.94148505407886e-05, "loss": 0.9768, "step": 246500 }, { "epoch": 0.22, "grad_norm": 556.0, "learning_rate": 3.941035146759768e-05, "loss": 1.2681, "step": 246600 }, { "epoch": 0.22, "grad_norm": 23.125, "learning_rate": 3.9405852394406754e-05, "loss": 1.048, "step": 246700 }, { "epoch": 0.22, "grad_norm": 36.25, "learning_rate": 3.9401353321215836e-05, "loss": 1.1942, "step": 246800 }, { "epoch": 0.22, "grad_norm": 22.375, "learning_rate": 3.939685424802491e-05, "loss": 1.2036, "step": 246900 }, { "epoch": 0.22, "grad_norm": 77.5, "learning_rate": 3.9392355174833986e-05, "loss": 1.149, "step": 247000 }, { "epoch": 0.22, "grad_norm": 17.625, "learning_rate": 3.938785610164306e-05, "loss": 1.308, "step": 247100 }, { "epoch": 0.22, "grad_norm": 49.0, "learning_rate": 3.938335702845214e-05, "loss": 1.3527, "step": 247200 }, { "epoch": 0.22, "grad_norm": 27.125, "learning_rate": 3.937885795526122e-05, "loss": 1.1117, "step": 247300 }, { "epoch": 0.22, "grad_norm": 20.875, "learning_rate": 3.9374358882070294e-05, "loss": 1.2889, "step": 247400 }, { "epoch": 0.22, "grad_norm": 25.5, "learning_rate": 3.936985980887937e-05, "loss": 1.3867, "step": 247500 }, { "epoch": 0.22, "grad_norm": 42.5, "learning_rate": 3.936536073568845e-05, "loss": 1.3019, "step": 247600 }, { "epoch": 0.22, "grad_norm": 81.0, "learning_rate": 3.936086166249753e-05, "loss": 1.1621, "step": 247700 }, { "epoch": 0.22, "grad_norm": 147.0, "learning_rate": 3.93563625893066e-05, "loss": 1.1166, "step": 247800 }, { "epoch": 0.22, "grad_norm": 13.0, "learning_rate": 3.9351863516115685e-05, "loss": 1.2203, "step": 247900 }, { "epoch": 0.22, "grad_norm": 114.5, "learning_rate": 3.934736444292476e-05, "loss": 1.1678, "step": 248000 }, { "epoch": 0.22, "grad_norm": 4.84375, "learning_rate": 3.934286536973384e-05, "loss": 1.039, "step": 248100 }, { "epoch": 0.22, "grad_norm": 35.75, "learning_rate": 3.933836629654292e-05, "loss": 1.1206, "step": 248200 }, { "epoch": 0.22, "grad_norm": 23.125, "learning_rate": 3.9333867223351986e-05, "loss": 1.2726, "step": 248300 }, { "epoch": 0.22, "grad_norm": 54.5, "learning_rate": 3.932936815016107e-05, "loss": 1.0794, "step": 248400 }, { "epoch": 0.22, "grad_norm": 228.0, "learning_rate": 3.9324869076970144e-05, "loss": 1.0072, "step": 248500 }, { "epoch": 0.22, "grad_norm": 32.5, "learning_rate": 3.932037000377922e-05, "loss": 1.3515, "step": 248600 }, { "epoch": 0.22, "grad_norm": 29.125, "learning_rate": 3.93158709305883e-05, "loss": 1.2737, "step": 248700 }, { "epoch": 0.22, "grad_norm": 87.0, "learning_rate": 3.9311371857397376e-05, "loss": 1.3866, "step": 248800 }, { "epoch": 0.22, "grad_norm": 12.5625, "learning_rate": 3.930687278420646e-05, "loss": 1.0515, "step": 248900 }, { "epoch": 0.22, "grad_norm": 31.5, "learning_rate": 3.9302373711015534e-05, "loss": 1.3832, "step": 249000 }, { "epoch": 0.22, "grad_norm": 0.004791259765625, "learning_rate": 3.929787463782461e-05, "loss": 1.2286, "step": 249100 }, { "epoch": 0.22, "grad_norm": 332.0, "learning_rate": 3.929337556463369e-05, "loss": 1.3023, "step": 249200 }, { "epoch": 0.22, "grad_norm": 43.5, "learning_rate": 3.9288876491442767e-05, "loss": 1.3827, "step": 249300 }, { "epoch": 0.22, "grad_norm": 12.125, "learning_rate": 3.928437741825184e-05, "loss": 1.2476, "step": 249400 }, { "epoch": 0.22, "grad_norm": 1.328125, "learning_rate": 3.9279878345060924e-05, "loss": 1.1633, "step": 249500 }, { "epoch": 0.22, "grad_norm": 34.25, "learning_rate": 3.927537927186999e-05, "loss": 1.1164, "step": 249600 }, { "epoch": 0.22, "grad_norm": 17.375, "learning_rate": 3.9270880198679075e-05, "loss": 1.1873, "step": 249700 }, { "epoch": 0.22, "grad_norm": 32.25, "learning_rate": 3.926638112548815e-05, "loss": 1.2064, "step": 249800 }, { "epoch": 0.22, "grad_norm": 310.0, "learning_rate": 3.9261882052297225e-05, "loss": 1.1259, "step": 249900 }, { "epoch": 0.22, "grad_norm": 85.0, "learning_rate": 3.925738297910631e-05, "loss": 1.2184, "step": 250000 }, { "epoch": 0.22, "grad_norm": 0.11669921875, "learning_rate": 3.925288390591538e-05, "loss": 0.9433, "step": 250100 }, { "epoch": 0.22, "grad_norm": 171.0, "learning_rate": 3.924838483272446e-05, "loss": 1.086, "step": 250200 }, { "epoch": 0.22, "grad_norm": 15.375, "learning_rate": 3.924388575953354e-05, "loss": 1.1756, "step": 250300 }, { "epoch": 0.22, "grad_norm": 93.0, "learning_rate": 3.9239386686342616e-05, "loss": 1.2926, "step": 250400 }, { "epoch": 0.22, "grad_norm": 25.0, "learning_rate": 3.923488761315169e-05, "loss": 1.0511, "step": 250500 }, { "epoch": 0.22, "grad_norm": 93.0, "learning_rate": 3.923038853996077e-05, "loss": 1.1765, "step": 250600 }, { "epoch": 0.22, "grad_norm": 0.095703125, "learning_rate": 3.922588946676985e-05, "loss": 1.2579, "step": 250700 }, { "epoch": 0.22, "grad_norm": 0.01025390625, "learning_rate": 3.922139039357893e-05, "loss": 1.2552, "step": 250800 }, { "epoch": 0.22, "grad_norm": 11.25, "learning_rate": 3.9216891320388e-05, "loss": 1.3754, "step": 250900 }, { "epoch": 0.22, "grad_norm": 29.625, "learning_rate": 3.9212392247197074e-05, "loss": 1.3205, "step": 251000 }, { "epoch": 0.22, "grad_norm": 59.75, "learning_rate": 3.9207893174006157e-05, "loss": 1.2434, "step": 251100 }, { "epoch": 0.22, "grad_norm": 51.25, "learning_rate": 3.920339410081523e-05, "loss": 1.1646, "step": 251200 }, { "epoch": 0.22, "grad_norm": 13.875, "learning_rate": 3.919889502762431e-05, "loss": 1.1913, "step": 251300 }, { "epoch": 0.22, "grad_norm": 47.25, "learning_rate": 3.919439595443339e-05, "loss": 1.2242, "step": 251400 }, { "epoch": 0.22, "grad_norm": 30.75, "learning_rate": 3.9189896881242465e-05, "loss": 1.0874, "step": 251500 }, { "epoch": 0.22, "grad_norm": 16.25, "learning_rate": 3.918539780805155e-05, "loss": 1.1604, "step": 251600 }, { "epoch": 0.22, "grad_norm": 46.75, "learning_rate": 3.918089873486062e-05, "loss": 1.1404, "step": 251700 }, { "epoch": 0.22, "grad_norm": 136.0, "learning_rate": 3.91763996616697e-05, "loss": 1.2929, "step": 251800 }, { "epoch": 0.22, "grad_norm": 52.0, "learning_rate": 3.917190058847878e-05, "loss": 1.2881, "step": 251900 }, { "epoch": 0.22, "grad_norm": 32.0, "learning_rate": 3.9167401515287855e-05, "loss": 1.04, "step": 252000 }, { "epoch": 0.22, "grad_norm": 0.7734375, "learning_rate": 3.916290244209693e-05, "loss": 1.2319, "step": 252100 }, { "epoch": 0.22, "grad_norm": 50.25, "learning_rate": 3.9158403368906006e-05, "loss": 1.1052, "step": 252200 }, { "epoch": 0.22, "grad_norm": 27.125, "learning_rate": 3.915390429571508e-05, "loss": 1.2576, "step": 252300 }, { "epoch": 0.22, "grad_norm": 30.75, "learning_rate": 3.914940522252416e-05, "loss": 1.1814, "step": 252400 }, { "epoch": 0.22, "grad_norm": 24.5, "learning_rate": 3.914490614933324e-05, "loss": 1.1493, "step": 252500 }, { "epoch": 0.23, "grad_norm": 33.0, "learning_rate": 3.9140407076142314e-05, "loss": 1.2086, "step": 252600 }, { "epoch": 0.23, "grad_norm": 8.6875, "learning_rate": 3.9135908002951396e-05, "loss": 1.1718, "step": 252700 }, { "epoch": 0.23, "grad_norm": 18.875, "learning_rate": 3.913140892976047e-05, "loss": 1.071, "step": 252800 }, { "epoch": 0.23, "grad_norm": 30.875, "learning_rate": 3.9126909856569547e-05, "loss": 1.4632, "step": 252900 }, { "epoch": 0.23, "grad_norm": 92.0, "learning_rate": 3.912241078337863e-05, "loss": 1.1439, "step": 253000 }, { "epoch": 0.23, "grad_norm": 23.125, "learning_rate": 3.9117911710187704e-05, "loss": 1.1257, "step": 253100 }, { "epoch": 0.23, "grad_norm": 35.75, "learning_rate": 3.911341263699678e-05, "loss": 1.1817, "step": 253200 }, { "epoch": 0.23, "grad_norm": 29.875, "learning_rate": 3.910891356380586e-05, "loss": 1.0107, "step": 253300 }, { "epoch": 0.23, "grad_norm": 7.96875, "learning_rate": 3.910441449061494e-05, "loss": 1.1322, "step": 253400 }, { "epoch": 0.23, "grad_norm": 14.375, "learning_rate": 3.909991541742401e-05, "loss": 1.0913, "step": 253500 }, { "epoch": 0.23, "grad_norm": 9.9375, "learning_rate": 3.909541634423309e-05, "loss": 1.1918, "step": 253600 }, { "epoch": 0.23, "grad_norm": 11.375, "learning_rate": 3.909091727104216e-05, "loss": 1.1895, "step": 253700 }, { "epoch": 0.23, "grad_norm": 24.25, "learning_rate": 3.9086418197851245e-05, "loss": 1.2283, "step": 253800 }, { "epoch": 0.23, "grad_norm": 0.322265625, "learning_rate": 3.908191912466032e-05, "loss": 1.2178, "step": 253900 }, { "epoch": 0.23, "grad_norm": 39.5, "learning_rate": 3.9077420051469396e-05, "loss": 1.1791, "step": 254000 }, { "epoch": 0.23, "grad_norm": 14.125, "learning_rate": 3.907292097827848e-05, "loss": 1.3667, "step": 254100 }, { "epoch": 0.23, "grad_norm": 0.0390625, "learning_rate": 3.906842190508755e-05, "loss": 1.0755, "step": 254200 }, { "epoch": 0.23, "grad_norm": 0.01116943359375, "learning_rate": 3.9063922831896635e-05, "loss": 1.0778, "step": 254300 }, { "epoch": 0.23, "grad_norm": 72.0, "learning_rate": 3.905942375870571e-05, "loss": 1.1521, "step": 254400 }, { "epoch": 0.23, "grad_norm": 18.125, "learning_rate": 3.9054924685514786e-05, "loss": 1.2442, "step": 254500 }, { "epoch": 0.23, "grad_norm": 150.0, "learning_rate": 3.905042561232387e-05, "loss": 1.2532, "step": 254600 }, { "epoch": 0.23, "grad_norm": 19.625, "learning_rate": 3.904592653913294e-05, "loss": 1.0806, "step": 254700 }, { "epoch": 0.23, "grad_norm": 53.0, "learning_rate": 3.904142746594202e-05, "loss": 1.2819, "step": 254800 }, { "epoch": 0.23, "grad_norm": 64.5, "learning_rate": 3.9036928392751094e-05, "loss": 1.3103, "step": 254900 }, { "epoch": 0.23, "grad_norm": 149.0, "learning_rate": 3.903242931956017e-05, "loss": 1.3287, "step": 255000 }, { "epoch": 0.23, "grad_norm": 20.625, "learning_rate": 3.902793024636925e-05, "loss": 1.2764, "step": 255100 }, { "epoch": 0.23, "grad_norm": 15.8125, "learning_rate": 3.902343117317833e-05, "loss": 1.0663, "step": 255200 }, { "epoch": 0.23, "grad_norm": 17.375, "learning_rate": 3.90189320999874e-05, "loss": 1.2626, "step": 255300 }, { "epoch": 0.23, "grad_norm": 17.625, "learning_rate": 3.9014433026796484e-05, "loss": 1.1583, "step": 255400 }, { "epoch": 0.23, "grad_norm": 41.75, "learning_rate": 3.900993395360556e-05, "loss": 1.1388, "step": 255500 }, { "epoch": 0.23, "grad_norm": 34.75, "learning_rate": 3.9005434880414635e-05, "loss": 1.0883, "step": 255600 }, { "epoch": 0.23, "grad_norm": 44.25, "learning_rate": 3.900093580722372e-05, "loss": 1.0158, "step": 255700 }, { "epoch": 0.23, "grad_norm": 23.5, "learning_rate": 3.899643673403279e-05, "loss": 1.2122, "step": 255800 }, { "epoch": 0.23, "grad_norm": 28.0, "learning_rate": 3.899193766084187e-05, "loss": 1.1908, "step": 255900 }, { "epoch": 0.23, "grad_norm": 96.5, "learning_rate": 3.898743858765095e-05, "loss": 1.3359, "step": 256000 }, { "epoch": 0.23, "grad_norm": 19.25, "learning_rate": 3.898293951446002e-05, "loss": 1.0705, "step": 256100 }, { "epoch": 0.23, "grad_norm": 50.25, "learning_rate": 3.89784404412691e-05, "loss": 1.1882, "step": 256200 }, { "epoch": 0.23, "grad_norm": 13.75, "learning_rate": 3.8973941368078176e-05, "loss": 1.109, "step": 256300 }, { "epoch": 0.23, "grad_norm": 49.5, "learning_rate": 3.896944229488725e-05, "loss": 1.0857, "step": 256400 }, { "epoch": 0.23, "grad_norm": 14.6875, "learning_rate": 3.896494322169633e-05, "loss": 1.0024, "step": 256500 }, { "epoch": 0.23, "grad_norm": 0.3359375, "learning_rate": 3.896044414850541e-05, "loss": 1.1698, "step": 256600 }, { "epoch": 0.23, "grad_norm": 0.470703125, "learning_rate": 3.8955945075314484e-05, "loss": 1.3496, "step": 256700 }, { "epoch": 0.23, "grad_norm": 45.0, "learning_rate": 3.8951446002123566e-05, "loss": 1.1962, "step": 256800 }, { "epoch": 0.23, "grad_norm": 34.25, "learning_rate": 3.894694692893264e-05, "loss": 1.203, "step": 256900 }, { "epoch": 0.23, "grad_norm": 135.0, "learning_rate": 3.8942447855741724e-05, "loss": 1.2846, "step": 257000 }, { "epoch": 0.23, "grad_norm": 19.0, "learning_rate": 3.89379487825508e-05, "loss": 1.1652, "step": 257100 }, { "epoch": 0.23, "grad_norm": 47.25, "learning_rate": 3.8933449709359874e-05, "loss": 1.1692, "step": 257200 }, { "epoch": 0.23, "grad_norm": 33.0, "learning_rate": 3.8928950636168956e-05, "loss": 1.3056, "step": 257300 }, { "epoch": 0.23, "grad_norm": 338.0, "learning_rate": 3.8924451562978025e-05, "loss": 1.2563, "step": 257400 }, { "epoch": 0.23, "grad_norm": 48.75, "learning_rate": 3.891995248978711e-05, "loss": 1.2937, "step": 257500 }, { "epoch": 0.23, "grad_norm": 29.625, "learning_rate": 3.891545341659618e-05, "loss": 1.1976, "step": 257600 }, { "epoch": 0.23, "grad_norm": 19.375, "learning_rate": 3.891095434340526e-05, "loss": 1.2014, "step": 257700 }, { "epoch": 0.23, "grad_norm": 8.6875, "learning_rate": 3.890645527021434e-05, "loss": 1.1279, "step": 257800 }, { "epoch": 0.23, "grad_norm": 0.1982421875, "learning_rate": 3.8901956197023415e-05, "loss": 1.2667, "step": 257900 }, { "epoch": 0.23, "grad_norm": 23.125, "learning_rate": 3.889745712383249e-05, "loss": 1.3891, "step": 258000 } ], "logging_steps": 100, "max_steps": 1122566, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 3000, "total_flos": 4.1202133673908224e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }