diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18081 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.22983058457141942, + "eval_steps": 500, + "global_step": 258000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 227.0, + "learning_rate": 4.4539461963299484e-07, + "loss": 3.0081, + "step": 100 + }, + { + "epoch": 0.0, + "grad_norm": 28.0, + "learning_rate": 8.907892392659897e-07, + "loss": 3.5571, + "step": 200 + }, + { + "epoch": 0.0, + "grad_norm": 482.0, + "learning_rate": 1.3361838588989846e-06, + "loss": 2.7953, + "step": 300 + }, + { + "epoch": 0.0, + "grad_norm": 82.0, + "learning_rate": 1.7815784785319793e-06, + "loss": 2.6242, + "step": 400 + }, + { + "epoch": 0.0, + "grad_norm": 137.0, + "learning_rate": 2.226973098164974e-06, + "loss": 1.8696, + "step": 500 + }, + { + "epoch": 0.0, + "grad_norm": 60.25, + "learning_rate": 2.672367717797969e-06, + "loss": 1.5141, + "step": 600 + }, + { + "epoch": 0.0, + "grad_norm": 494.0, + "learning_rate": 3.1177623374309637e-06, + "loss": 1.5096, + "step": 700 + }, + { + "epoch": 0.0, + "grad_norm": 27.625, + "learning_rate": 3.5631569570639587e-06, + "loss": 1.5056, + "step": 800 + }, + { + "epoch": 0.0, + "grad_norm": 12.9375, + "learning_rate": 4.008551576696954e-06, + "loss": 1.3766, + "step": 900 + }, + { + "epoch": 0.0, + "grad_norm": 56.75, + "learning_rate": 4.453946196329948e-06, + "loss": 1.2224, + "step": 1000 + }, + { + "epoch": 0.0, + "grad_norm": 17.125, + "learning_rate": 4.899340815962943e-06, + "loss": 1.4877, + "step": 1100 + }, + { + "epoch": 0.0, + "grad_norm": 24.125, + "learning_rate": 5.344735435595938e-06, + "loss": 1.4661, + "step": 1200 + }, + { + "epoch": 0.0, + "grad_norm": 55.75, + "learning_rate": 5.790130055228933e-06, + "loss": 1.4322, + "step": 1300 + }, + { + "epoch": 0.0, + "grad_norm": 54.25, + "learning_rate": 6.235524674861927e-06, + "loss": 1.6576, + "step": 1400 + }, + { + "epoch": 0.0, + "grad_norm": 106.0, + "learning_rate": 6.680919294494922e-06, + "loss": 1.2569, + "step": 1500 + }, + { + "epoch": 0.0, + "grad_norm": 49.0, + "learning_rate": 7.126313914127917e-06, + "loss": 1.3449, + "step": 1600 + }, + { + "epoch": 0.0, + "grad_norm": 41.75, + "learning_rate": 7.571708533760913e-06, + "loss": 1.3496, + "step": 1700 + }, + { + "epoch": 0.0, + "grad_norm": 126.5, + "learning_rate": 8.017103153393907e-06, + "loss": 1.4364, + "step": 1800 + }, + { + "epoch": 0.0, + "grad_norm": 23.25, + "learning_rate": 8.462497773026902e-06, + "loss": 1.4653, + "step": 1900 + }, + { + "epoch": 0.0, + "grad_norm": 81.0, + "learning_rate": 8.907892392659896e-06, + "loss": 1.3839, + "step": 2000 + }, + { + "epoch": 0.0, + "grad_norm": 39.25, + "learning_rate": 9.353287012292893e-06, + "loss": 1.3782, + "step": 2100 + }, + { + "epoch": 0.0, + "grad_norm": 30.5, + "learning_rate": 9.798681631925886e-06, + "loss": 1.3509, + "step": 2200 + }, + { + "epoch": 0.0, + "grad_norm": 77.5, + "learning_rate": 1.0244076251558882e-05, + "loss": 1.3643, + "step": 2300 + }, + { + "epoch": 0.0, + "grad_norm": 46.75, + "learning_rate": 1.0689470871191876e-05, + "loss": 1.3291, + "step": 2400 + }, + { + "epoch": 0.0, + "grad_norm": 32.5, + "learning_rate": 1.1134865490824871e-05, + "loss": 1.2246, + "step": 2500 + }, + { + "epoch": 0.0, + "grad_norm": 40.25, + "learning_rate": 1.1580260110457866e-05, + "loss": 1.5088, + "step": 2600 + }, + { + "epoch": 0.0, + "grad_norm": 24.125, + "learning_rate": 1.2025654730090862e-05, + "loss": 1.3889, + "step": 2700 + }, + { + "epoch": 0.0, + "grad_norm": 88.0, + "learning_rate": 1.2471049349723855e-05, + "loss": 1.5065, + "step": 2800 + }, + { + "epoch": 0.0, + "grad_norm": 30.875, + "learning_rate": 1.2916443969356851e-05, + "loss": 1.5287, + "step": 2900 + }, + { + "epoch": 0.0, + "grad_norm": 23.0, + "learning_rate": 1.3361838588989844e-05, + "loss": 1.4401, + "step": 3000 + }, + { + "epoch": 0.0, + "grad_norm": 61.5, + "learning_rate": 1.3807233208622842e-05, + "loss": 1.306, + "step": 3100 + }, + { + "epoch": 0.0, + "grad_norm": 248.0, + "learning_rate": 1.4252627828255835e-05, + "loss": 1.5073, + "step": 3200 + }, + { + "epoch": 0.0, + "grad_norm": 60.25, + "learning_rate": 1.469802244788883e-05, + "loss": 1.2386, + "step": 3300 + }, + { + "epoch": 0.0, + "grad_norm": 58.0, + "learning_rate": 1.5143417067521826e-05, + "loss": 1.5033, + "step": 3400 + }, + { + "epoch": 0.0, + "grad_norm": 143.0, + "learning_rate": 1.558881168715482e-05, + "loss": 1.3654, + "step": 3500 + }, + { + "epoch": 0.0, + "grad_norm": 37.25, + "learning_rate": 1.6034206306787815e-05, + "loss": 1.483, + "step": 3600 + }, + { + "epoch": 0.0, + "grad_norm": 134.0, + "learning_rate": 1.647960092642081e-05, + "loss": 1.1818, + "step": 3700 + }, + { + "epoch": 0.0, + "grad_norm": 80.5, + "learning_rate": 1.6924995546053804e-05, + "loss": 1.335, + "step": 3800 + }, + { + "epoch": 0.0, + "grad_norm": 120.0, + "learning_rate": 1.7370390165686802e-05, + "loss": 1.3798, + "step": 3900 + }, + { + "epoch": 0.0, + "grad_norm": 20.25, + "learning_rate": 1.7815784785319793e-05, + "loss": 1.3843, + "step": 4000 + }, + { + "epoch": 0.0, + "grad_norm": 41.25, + "learning_rate": 1.8261179404952788e-05, + "loss": 1.2019, + "step": 4100 + }, + { + "epoch": 0.0, + "grad_norm": 115.5, + "learning_rate": 1.8706574024585786e-05, + "loss": 1.3815, + "step": 4200 + }, + { + "epoch": 0.0, + "grad_norm": 43.75, + "learning_rate": 1.915196864421878e-05, + "loss": 1.3856, + "step": 4300 + }, + { + "epoch": 0.0, + "grad_norm": 14.375, + "learning_rate": 1.959736326385177e-05, + "loss": 1.3999, + "step": 4400 + }, + { + "epoch": 0.0, + "grad_norm": 59.5, + "learning_rate": 2.004275788348477e-05, + "loss": 1.5308, + "step": 4500 + }, + { + "epoch": 0.0, + "grad_norm": 64.5, + "learning_rate": 2.0488152503117764e-05, + "loss": 1.4645, + "step": 4600 + }, + { + "epoch": 0.0, + "grad_norm": 57.0, + "learning_rate": 2.093354712275076e-05, + "loss": 1.3967, + "step": 4700 + }, + { + "epoch": 0.0, + "grad_norm": 253.0, + "learning_rate": 2.1378941742383753e-05, + "loss": 1.3877, + "step": 4800 + }, + { + "epoch": 0.0, + "grad_norm": 128.0, + "learning_rate": 2.1824336362016748e-05, + "loss": 1.2397, + "step": 4900 + }, + { + "epoch": 0.0, + "grad_norm": 36.25, + "learning_rate": 2.2269730981649742e-05, + "loss": 1.4018, + "step": 5000 + }, + { + "epoch": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 2.271512560128274e-05, + "loss": 1.4637, + "step": 5100 + }, + { + "epoch": 0.0, + "grad_norm": 28.375, + "learning_rate": 2.316052022091573e-05, + "loss": 1.4674, + "step": 5200 + }, + { + "epoch": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 2.3605914840548726e-05, + "loss": 1.1861, + "step": 5300 + }, + { + "epoch": 0.0, + "grad_norm": 52.25, + "learning_rate": 2.4051309460181724e-05, + "loss": 1.3685, + "step": 5400 + }, + { + "epoch": 0.0, + "grad_norm": 83.0, + "learning_rate": 2.449670407981472e-05, + "loss": 1.5418, + "step": 5500 + }, + { + "epoch": 0.0, + "grad_norm": 22.625, + "learning_rate": 2.494209869944771e-05, + "loss": 1.3366, + "step": 5600 + }, + { + "epoch": 0.01, + "grad_norm": 41.25, + "learning_rate": 2.5387493319080707e-05, + "loss": 1.4138, + "step": 5700 + }, + { + "epoch": 0.01, + "grad_norm": 38.25, + "learning_rate": 2.5832887938713702e-05, + "loss": 1.5245, + "step": 5800 + }, + { + "epoch": 0.01, + "grad_norm": 32.75, + "learning_rate": 2.6278282558346697e-05, + "loss": 1.6365, + "step": 5900 + }, + { + "epoch": 0.01, + "grad_norm": 27.0, + "learning_rate": 2.6723677177979688e-05, + "loss": 1.4085, + "step": 6000 + }, + { + "epoch": 0.01, + "grad_norm": 51.25, + "learning_rate": 2.716907179761269e-05, + "loss": 1.4528, + "step": 6100 + }, + { + "epoch": 0.01, + "grad_norm": 56.75, + "learning_rate": 2.7614466417245684e-05, + "loss": 1.4637, + "step": 6200 + }, + { + "epoch": 0.01, + "grad_norm": 23.5, + "learning_rate": 2.8059861036878675e-05, + "loss": 1.3511, + "step": 6300 + }, + { + "epoch": 0.01, + "grad_norm": 21.75, + "learning_rate": 2.850525565651167e-05, + "loss": 1.2612, + "step": 6400 + }, + { + "epoch": 0.01, + "grad_norm": 37.75, + "learning_rate": 2.8950650276144664e-05, + "loss": 1.272, + "step": 6500 + }, + { + "epoch": 0.01, + "grad_norm": 34.5, + "learning_rate": 2.939604489577766e-05, + "loss": 1.2663, + "step": 6600 + }, + { + "epoch": 0.01, + "grad_norm": 81.0, + "learning_rate": 2.9841439515410657e-05, + "loss": 1.2861, + "step": 6700 + }, + { + "epoch": 0.01, + "grad_norm": 32.25, + "learning_rate": 3.028683413504365e-05, + "loss": 1.3541, + "step": 6800 + }, + { + "epoch": 0.01, + "grad_norm": 72.0, + "learning_rate": 3.073222875467664e-05, + "loss": 1.3616, + "step": 6900 + }, + { + "epoch": 0.01, + "grad_norm": 25.5, + "learning_rate": 3.117762337430964e-05, + "loss": 1.4044, + "step": 7000 + }, + { + "epoch": 0.01, + "grad_norm": 89.5, + "learning_rate": 3.162301799394263e-05, + "loss": 1.4566, + "step": 7100 + }, + { + "epoch": 0.01, + "grad_norm": 94.5, + "learning_rate": 3.206841261357563e-05, + "loss": 1.5359, + "step": 7200 + }, + { + "epoch": 0.01, + "grad_norm": 14.125, + "learning_rate": 3.251380723320863e-05, + "loss": 1.282, + "step": 7300 + }, + { + "epoch": 0.01, + "grad_norm": 110.0, + "learning_rate": 3.295920185284162e-05, + "loss": 1.4597, + "step": 7400 + }, + { + "epoch": 0.01, + "grad_norm": 67.0, + "learning_rate": 3.3404596472474617e-05, + "loss": 1.3946, + "step": 7500 + }, + { + "epoch": 0.01, + "grad_norm": 173.0, + "learning_rate": 3.384999109210761e-05, + "loss": 1.4038, + "step": 7600 + }, + { + "epoch": 0.01, + "grad_norm": 8.875, + "learning_rate": 3.42953857117406e-05, + "loss": 1.2368, + "step": 7700 + }, + { + "epoch": 0.01, + "grad_norm": 48.5, + "learning_rate": 3.4740780331373604e-05, + "loss": 1.4606, + "step": 7800 + }, + { + "epoch": 0.01, + "grad_norm": 45.5, + "learning_rate": 3.5186174951006595e-05, + "loss": 1.3372, + "step": 7900 + }, + { + "epoch": 0.01, + "grad_norm": 50.5, + "learning_rate": 3.5631569570639586e-05, + "loss": 1.3796, + "step": 8000 + }, + { + "epoch": 0.01, + "grad_norm": 19.5, + "learning_rate": 3.6076964190272584e-05, + "loss": 1.2696, + "step": 8100 + }, + { + "epoch": 0.01, + "grad_norm": 73.0, + "learning_rate": 3.6522358809905575e-05, + "loss": 1.2903, + "step": 8200 + }, + { + "epoch": 0.01, + "grad_norm": 164.0, + "learning_rate": 3.696775342953857e-05, + "loss": 1.3901, + "step": 8300 + }, + { + "epoch": 0.01, + "grad_norm": 77.5, + "learning_rate": 3.741314804917157e-05, + "loss": 1.293, + "step": 8400 + }, + { + "epoch": 0.01, + "grad_norm": 14.9375, + "learning_rate": 3.785854266880456e-05, + "loss": 1.4207, + "step": 8500 + }, + { + "epoch": 0.01, + "grad_norm": 102.5, + "learning_rate": 3.830393728843756e-05, + "loss": 1.4051, + "step": 8600 + }, + { + "epoch": 0.01, + "grad_norm": 51.75, + "learning_rate": 3.874933190807055e-05, + "loss": 1.2428, + "step": 8700 + }, + { + "epoch": 0.01, + "grad_norm": 37.5, + "learning_rate": 3.919472652770354e-05, + "loss": 1.3791, + "step": 8800 + }, + { + "epoch": 0.01, + "grad_norm": 46.25, + "learning_rate": 3.964012114733654e-05, + "loss": 1.4908, + "step": 8900 + }, + { + "epoch": 0.01, + "grad_norm": 82.0, + "learning_rate": 4.008551576696954e-05, + "loss": 1.3, + "step": 9000 + }, + { + "epoch": 0.01, + "grad_norm": 33.5, + "learning_rate": 4.0530910386602536e-05, + "loss": 1.4543, + "step": 9100 + }, + { + "epoch": 0.01, + "grad_norm": 73.0, + "learning_rate": 4.097630500623553e-05, + "loss": 1.467, + "step": 9200 + }, + { + "epoch": 0.01, + "grad_norm": 28.125, + "learning_rate": 4.142169962586852e-05, + "loss": 1.4087, + "step": 9300 + }, + { + "epoch": 0.01, + "grad_norm": 1656.0, + "learning_rate": 4.186709424550152e-05, + "loss": 1.2681, + "step": 9400 + }, + { + "epoch": 0.01, + "grad_norm": 50.25, + "learning_rate": 4.231248886513451e-05, + "loss": 1.2393, + "step": 9500 + }, + { + "epoch": 0.01, + "grad_norm": 71.5, + "learning_rate": 4.2757883484767506e-05, + "loss": 1.3523, + "step": 9600 + }, + { + "epoch": 0.01, + "grad_norm": 89.5, + "learning_rate": 4.3203278104400504e-05, + "loss": 1.5014, + "step": 9700 + }, + { + "epoch": 0.01, + "grad_norm": 20.625, + "learning_rate": 4.3648672724033495e-05, + "loss": 1.5627, + "step": 9800 + }, + { + "epoch": 0.01, + "grad_norm": 105.5, + "learning_rate": 4.409406734366649e-05, + "loss": 1.3037, + "step": 9900 + }, + { + "epoch": 0.01, + "grad_norm": 42.75, + "learning_rate": 4.4539461963299484e-05, + "loss": 0.9767, + "step": 10000 + }, + { + "epoch": 0.01, + "grad_norm": 88.0, + "learning_rate": 4.498485658293248e-05, + "loss": 1.3604, + "step": 10100 + }, + { + "epoch": 0.01, + "grad_norm": 47.5, + "learning_rate": 4.543025120256548e-05, + "loss": 1.3567, + "step": 10200 + }, + { + "epoch": 0.01, + "grad_norm": 108.0, + "learning_rate": 4.587564582219847e-05, + "loss": 1.3405, + "step": 10300 + }, + { + "epoch": 0.01, + "grad_norm": 16.875, + "learning_rate": 4.632104044183146e-05, + "loss": 1.2921, + "step": 10400 + }, + { + "epoch": 0.01, + "grad_norm": 102.0, + "learning_rate": 4.676643506146446e-05, + "loss": 1.5022, + "step": 10500 + }, + { + "epoch": 0.01, + "grad_norm": 36.0, + "learning_rate": 4.721182968109745e-05, + "loss": 1.2794, + "step": 10600 + }, + { + "epoch": 0.01, + "grad_norm": 107.0, + "learning_rate": 4.765722430073045e-05, + "loss": 1.1861, + "step": 10700 + }, + { + "epoch": 0.01, + "grad_norm": 50.0, + "learning_rate": 4.810261892036345e-05, + "loss": 1.388, + "step": 10800 + }, + { + "epoch": 0.01, + "grad_norm": 194.0, + "learning_rate": 4.854801353999644e-05, + "loss": 1.6325, + "step": 10900 + }, + { + "epoch": 0.01, + "grad_norm": 10.75, + "learning_rate": 4.899340815962944e-05, + "loss": 1.3704, + "step": 11000 + }, + { + "epoch": 0.01, + "grad_norm": 20.0, + "learning_rate": 4.943880277926243e-05, + "loss": 1.4644, + "step": 11100 + }, + { + "epoch": 0.01, + "grad_norm": 94.5, + "learning_rate": 4.988419739889542e-05, + "loss": 1.4386, + "step": 11200 + }, + { + "epoch": 0.01, + "grad_norm": 2.953125, + "learning_rate": 4.999667068583872e-05, + "loss": 1.3037, + "step": 11300 + }, + { + "epoch": 0.01, + "grad_norm": 67.5, + "learning_rate": 4.99921716126478e-05, + "loss": 1.2846, + "step": 11400 + }, + { + "epoch": 0.01, + "grad_norm": 122.0, + "learning_rate": 4.998767253945688e-05, + "loss": 1.3673, + "step": 11500 + }, + { + "epoch": 0.01, + "grad_norm": 752.0, + "learning_rate": 4.9983173466265954e-05, + "loss": 1.5056, + "step": 11600 + }, + { + "epoch": 0.01, + "grad_norm": 520.0, + "learning_rate": 4.997867439307503e-05, + "loss": 1.3553, + "step": 11700 + }, + { + "epoch": 0.01, + "grad_norm": 104.0, + "learning_rate": 4.997417531988411e-05, + "loss": 1.2198, + "step": 11800 + }, + { + "epoch": 0.01, + "grad_norm": 45.0, + "learning_rate": 4.996967624669318e-05, + "loss": 1.1364, + "step": 11900 + }, + { + "epoch": 0.01, + "grad_norm": 51.75, + "learning_rate": 4.9965177173502256e-05, + "loss": 1.221, + "step": 12000 + }, + { + "epoch": 0.01, + "grad_norm": 46.5, + "learning_rate": 4.996067810031134e-05, + "loss": 1.3872, + "step": 12100 + }, + { + "epoch": 0.01, + "grad_norm": 258.0, + "learning_rate": 4.995617902712041e-05, + "loss": 1.3035, + "step": 12200 + }, + { + "epoch": 0.01, + "grad_norm": 35.5, + "learning_rate": 4.9951679953929495e-05, + "loss": 1.4527, + "step": 12300 + }, + { + "epoch": 0.01, + "grad_norm": 21.625, + "learning_rate": 4.994718088073857e-05, + "loss": 1.2409, + "step": 12400 + }, + { + "epoch": 0.01, + "grad_norm": 20.625, + "learning_rate": 4.9942681807547646e-05, + "loss": 1.1766, + "step": 12500 + }, + { + "epoch": 0.01, + "grad_norm": 75.5, + "learning_rate": 4.993818273435673e-05, + "loss": 1.4129, + "step": 12600 + }, + { + "epoch": 0.01, + "grad_norm": 93.0, + "learning_rate": 4.9933683661165803e-05, + "loss": 1.2777, + "step": 12700 + }, + { + "epoch": 0.01, + "grad_norm": 109.0, + "learning_rate": 4.992918458797488e-05, + "loss": 1.2517, + "step": 12800 + }, + { + "epoch": 0.01, + "grad_norm": 0.0133056640625, + "learning_rate": 4.992468551478396e-05, + "loss": 1.3276, + "step": 12900 + }, + { + "epoch": 0.01, + "grad_norm": 25.5, + "learning_rate": 4.9920186441593036e-05, + "loss": 1.5745, + "step": 13000 + }, + { + "epoch": 0.01, + "grad_norm": 31.25, + "learning_rate": 4.991568736840211e-05, + "loss": 1.3824, + "step": 13100 + }, + { + "epoch": 0.01, + "grad_norm": 73.5, + "learning_rate": 4.991118829521119e-05, + "loss": 1.3494, + "step": 13200 + }, + { + "epoch": 0.01, + "grad_norm": 2480.0, + "learning_rate": 4.990668922202026e-05, + "loss": 1.3827, + "step": 13300 + }, + { + "epoch": 0.01, + "grad_norm": 57.75, + "learning_rate": 4.9902190148829344e-05, + "loss": 1.3879, + "step": 13400 + }, + { + "epoch": 0.01, + "grad_norm": 26.75, + "learning_rate": 4.989769107563842e-05, + "loss": 1.3366, + "step": 13500 + }, + { + "epoch": 0.01, + "grad_norm": 40.0, + "learning_rate": 4.9893192002447495e-05, + "loss": 1.4362, + "step": 13600 + }, + { + "epoch": 0.01, + "grad_norm": 38.5, + "learning_rate": 4.988869292925658e-05, + "loss": 1.4034, + "step": 13700 + }, + { + "epoch": 0.01, + "grad_norm": 54.5, + "learning_rate": 4.988419385606565e-05, + "loss": 1.2769, + "step": 13800 + }, + { + "epoch": 0.01, + "grad_norm": 22.875, + "learning_rate": 4.987969478287473e-05, + "loss": 1.2381, + "step": 13900 + }, + { + "epoch": 0.01, + "grad_norm": 23.875, + "learning_rate": 4.987519570968381e-05, + "loss": 1.4296, + "step": 14000 + }, + { + "epoch": 0.01, + "grad_norm": 37.75, + "learning_rate": 4.9870696636492885e-05, + "loss": 1.3941, + "step": 14100 + }, + { + "epoch": 0.01, + "grad_norm": 112.0, + "learning_rate": 4.986619756330197e-05, + "loss": 1.1393, + "step": 14200 + }, + { + "epoch": 0.01, + "grad_norm": 600.0, + "learning_rate": 4.986169849011104e-05, + "loss": 1.4978, + "step": 14300 + }, + { + "epoch": 0.01, + "grad_norm": 42.25, + "learning_rate": 4.985719941692012e-05, + "loss": 1.558, + "step": 14400 + }, + { + "epoch": 0.01, + "grad_norm": 28.125, + "learning_rate": 4.9852700343729193e-05, + "loss": 1.5553, + "step": 14500 + }, + { + "epoch": 0.01, + "grad_norm": 20.5, + "learning_rate": 4.984820127053827e-05, + "loss": 1.1859, + "step": 14600 + }, + { + "epoch": 0.01, + "grad_norm": 33.75, + "learning_rate": 4.9843702197347344e-05, + "loss": 1.288, + "step": 14700 + }, + { + "epoch": 0.01, + "grad_norm": 55.5, + "learning_rate": 4.9839203124156426e-05, + "loss": 1.3456, + "step": 14800 + }, + { + "epoch": 0.01, + "grad_norm": 58.75, + "learning_rate": 4.98347040509655e-05, + "loss": 1.4251, + "step": 14900 + }, + { + "epoch": 0.01, + "grad_norm": 49.0, + "learning_rate": 4.9830204977774584e-05, + "loss": 1.5009, + "step": 15000 + }, + { + "epoch": 0.01, + "grad_norm": 62.5, + "learning_rate": 4.982570590458366e-05, + "loss": 1.44, + "step": 15100 + }, + { + "epoch": 0.01, + "grad_norm": 155.0, + "learning_rate": 4.9821206831392734e-05, + "loss": 1.4204, + "step": 15200 + }, + { + "epoch": 0.01, + "grad_norm": 37.25, + "learning_rate": 4.9816707758201816e-05, + "loss": 1.4253, + "step": 15300 + }, + { + "epoch": 0.01, + "grad_norm": 98.0, + "learning_rate": 4.981220868501089e-05, + "loss": 1.5352, + "step": 15400 + }, + { + "epoch": 0.01, + "grad_norm": 92.0, + "learning_rate": 4.980770961181997e-05, + "loss": 1.46, + "step": 15500 + }, + { + "epoch": 0.01, + "grad_norm": 51.25, + "learning_rate": 4.980321053862905e-05, + "loss": 1.2383, + "step": 15600 + }, + { + "epoch": 0.01, + "grad_norm": 76.5, + "learning_rate": 4.9798711465438125e-05, + "loss": 1.4453, + "step": 15700 + }, + { + "epoch": 0.01, + "grad_norm": 115.0, + "learning_rate": 4.97942123922472e-05, + "loss": 1.2353, + "step": 15800 + }, + { + "epoch": 0.01, + "grad_norm": 3.3125, + "learning_rate": 4.9789713319056275e-05, + "loss": 1.4104, + "step": 15900 + }, + { + "epoch": 0.01, + "grad_norm": 50.5, + "learning_rate": 4.978521424586535e-05, + "loss": 1.3855, + "step": 16000 + }, + { + "epoch": 0.01, + "grad_norm": 28.5, + "learning_rate": 4.978071517267443e-05, + "loss": 1.2479, + "step": 16100 + }, + { + "epoch": 0.01, + "grad_norm": 73.5, + "learning_rate": 4.977621609948351e-05, + "loss": 1.5248, + "step": 16200 + }, + { + "epoch": 0.01, + "grad_norm": 62.25, + "learning_rate": 4.977171702629258e-05, + "loss": 1.2381, + "step": 16300 + }, + { + "epoch": 0.01, + "grad_norm": 17.75, + "learning_rate": 4.9767217953101666e-05, + "loss": 1.1687, + "step": 16400 + }, + { + "epoch": 0.01, + "grad_norm": 35.75, + "learning_rate": 4.976271887991074e-05, + "loss": 1.3555, + "step": 16500 + }, + { + "epoch": 0.01, + "grad_norm": 274.0, + "learning_rate": 4.9758219806719816e-05, + "loss": 1.3997, + "step": 16600 + }, + { + "epoch": 0.01, + "grad_norm": 70.5, + "learning_rate": 4.97537207335289e-05, + "loss": 1.4042, + "step": 16700 + }, + { + "epoch": 0.01, + "grad_norm": 21.0, + "learning_rate": 4.9749221660337974e-05, + "loss": 1.462, + "step": 16800 + }, + { + "epoch": 0.02, + "grad_norm": 2.515625, + "learning_rate": 4.974472258714705e-05, + "loss": 1.3717, + "step": 16900 + }, + { + "epoch": 0.02, + "grad_norm": 84.0, + "learning_rate": 4.974022351395613e-05, + "loss": 1.391, + "step": 17000 + }, + { + "epoch": 0.02, + "grad_norm": 49.0, + "learning_rate": 4.97357244407652e-05, + "loss": 1.4007, + "step": 17100 + }, + { + "epoch": 0.02, + "grad_norm": 29.25, + "learning_rate": 4.973122536757428e-05, + "loss": 1.217, + "step": 17200 + }, + { + "epoch": 0.02, + "grad_norm": 46.5, + "learning_rate": 4.972672629438336e-05, + "loss": 1.2027, + "step": 17300 + }, + { + "epoch": 0.02, + "grad_norm": 7.9375, + "learning_rate": 4.972222722119243e-05, + "loss": 1.4868, + "step": 17400 + }, + { + "epoch": 0.02, + "grad_norm": 35.25, + "learning_rate": 4.9717728148001515e-05, + "loss": 1.3056, + "step": 17500 + }, + { + "epoch": 0.02, + "grad_norm": 138.0, + "learning_rate": 4.971322907481059e-05, + "loss": 1.3636, + "step": 17600 + }, + { + "epoch": 0.02, + "grad_norm": 77.0, + "learning_rate": 4.970873000161967e-05, + "loss": 1.5884, + "step": 17700 + }, + { + "epoch": 0.02, + "grad_norm": 36.25, + "learning_rate": 4.970423092842875e-05, + "loss": 1.3438, + "step": 17800 + }, + { + "epoch": 0.02, + "grad_norm": 108.5, + "learning_rate": 4.969973185523782e-05, + "loss": 1.2762, + "step": 17900 + }, + { + "epoch": 0.02, + "grad_norm": 38.25, + "learning_rate": 4.9695232782046905e-05, + "loss": 1.3057, + "step": 18000 + }, + { + "epoch": 0.02, + "grad_norm": 204.0, + "learning_rate": 4.969073370885598e-05, + "loss": 1.376, + "step": 18100 + }, + { + "epoch": 0.02, + "grad_norm": 92.5, + "learning_rate": 4.9686234635665055e-05, + "loss": 1.2902, + "step": 18200 + }, + { + "epoch": 0.02, + "grad_norm": 29.5, + "learning_rate": 4.968173556247414e-05, + "loss": 1.3195, + "step": 18300 + }, + { + "epoch": 0.02, + "grad_norm": 38.5, + "learning_rate": 4.9677236489283206e-05, + "loss": 1.3055, + "step": 18400 + }, + { + "epoch": 0.02, + "grad_norm": 25.5, + "learning_rate": 4.967273741609229e-05, + "loss": 1.4142, + "step": 18500 + }, + { + "epoch": 0.02, + "grad_norm": 59.5, + "learning_rate": 4.9668238342901364e-05, + "loss": 1.4916, + "step": 18600 + }, + { + "epoch": 0.02, + "grad_norm": 72.0, + "learning_rate": 4.966373926971044e-05, + "loss": 1.4982, + "step": 18700 + }, + { + "epoch": 0.02, + "grad_norm": 98.0, + "learning_rate": 4.965924019651952e-05, + "loss": 1.1261, + "step": 18800 + }, + { + "epoch": 0.02, + "grad_norm": 33.75, + "learning_rate": 4.9654741123328596e-05, + "loss": 1.2093, + "step": 18900 + }, + { + "epoch": 0.02, + "grad_norm": 63.25, + "learning_rate": 4.965024205013767e-05, + "loss": 1.1638, + "step": 19000 + }, + { + "epoch": 0.02, + "grad_norm": 29.25, + "learning_rate": 4.9645742976946754e-05, + "loss": 1.2345, + "step": 19100 + }, + { + "epoch": 0.02, + "grad_norm": 131.0, + "learning_rate": 4.964124390375583e-05, + "loss": 1.3946, + "step": 19200 + }, + { + "epoch": 0.02, + "grad_norm": 42.25, + "learning_rate": 4.9636744830564905e-05, + "loss": 1.2116, + "step": 19300 + }, + { + "epoch": 0.02, + "grad_norm": 20.375, + "learning_rate": 4.963224575737399e-05, + "loss": 1.1863, + "step": 19400 + }, + { + "epoch": 0.02, + "grad_norm": 32.5, + "learning_rate": 4.962774668418306e-05, + "loss": 1.4393, + "step": 19500 + }, + { + "epoch": 0.02, + "grad_norm": 548.0, + "learning_rate": 4.962324761099214e-05, + "loss": 1.168, + "step": 19600 + }, + { + "epoch": 0.02, + "grad_norm": 27.25, + "learning_rate": 4.961874853780121e-05, + "loss": 1.4497, + "step": 19700 + }, + { + "epoch": 0.02, + "grad_norm": 5.125, + "learning_rate": 4.961424946461029e-05, + "loss": 1.2966, + "step": 19800 + }, + { + "epoch": 0.02, + "grad_norm": 77.0, + "learning_rate": 4.960975039141937e-05, + "loss": 1.2388, + "step": 19900 + }, + { + "epoch": 0.02, + "grad_norm": 46.25, + "learning_rate": 4.9605251318228445e-05, + "loss": 1.4248, + "step": 20000 + }, + { + "epoch": 0.02, + "grad_norm": 56.5, + "learning_rate": 4.960075224503752e-05, + "loss": 1.2601, + "step": 20100 + }, + { + "epoch": 0.02, + "grad_norm": 0.56640625, + "learning_rate": 4.95962531718466e-05, + "loss": 1.3359, + "step": 20200 + }, + { + "epoch": 0.02, + "grad_norm": 17.25, + "learning_rate": 4.959175409865568e-05, + "loss": 1.4966, + "step": 20300 + }, + { + "epoch": 0.02, + "grad_norm": 50.0, + "learning_rate": 4.958725502546476e-05, + "loss": 1.1853, + "step": 20400 + }, + { + "epoch": 0.02, + "grad_norm": 35.5, + "learning_rate": 4.9582755952273836e-05, + "loss": 1.44, + "step": 20500 + }, + { + "epoch": 0.02, + "grad_norm": 91.0, + "learning_rate": 4.957825687908291e-05, + "loss": 1.3369, + "step": 20600 + }, + { + "epoch": 0.02, + "grad_norm": 41.25, + "learning_rate": 4.957375780589199e-05, + "loss": 1.1995, + "step": 20700 + }, + { + "epoch": 0.02, + "grad_norm": 38.0, + "learning_rate": 4.956925873270107e-05, + "loss": 1.187, + "step": 20800 + }, + { + "epoch": 0.02, + "grad_norm": 19.875, + "learning_rate": 4.9564759659510144e-05, + "loss": 1.2396, + "step": 20900 + }, + { + "epoch": 0.02, + "grad_norm": 88.0, + "learning_rate": 4.956026058631922e-05, + "loss": 1.3147, + "step": 21000 + }, + { + "epoch": 0.02, + "grad_norm": 2.171875, + "learning_rate": 4.9555761513128295e-05, + "loss": 1.418, + "step": 21100 + }, + { + "epoch": 0.02, + "grad_norm": 25.875, + "learning_rate": 4.955126243993738e-05, + "loss": 1.3389, + "step": 21200 + }, + { + "epoch": 0.02, + "grad_norm": 79.0, + "learning_rate": 4.954676336674645e-05, + "loss": 1.5175, + "step": 21300 + }, + { + "epoch": 0.02, + "grad_norm": 0.265625, + "learning_rate": 4.954226429355553e-05, + "loss": 1.3892, + "step": 21400 + }, + { + "epoch": 0.02, + "grad_norm": 199.0, + "learning_rate": 4.953776522036461e-05, + "loss": 1.2962, + "step": 21500 + }, + { + "epoch": 0.02, + "grad_norm": 19.25, + "learning_rate": 4.9533266147173685e-05, + "loss": 1.2883, + "step": 21600 + }, + { + "epoch": 0.02, + "grad_norm": 25.125, + "learning_rate": 4.952876707398276e-05, + "loss": 1.3622, + "step": 21700 + }, + { + "epoch": 0.02, + "grad_norm": 25.5, + "learning_rate": 4.952426800079184e-05, + "loss": 1.2029, + "step": 21800 + }, + { + "epoch": 0.02, + "grad_norm": 100.5, + "learning_rate": 4.951976892760092e-05, + "loss": 1.3844, + "step": 21900 + }, + { + "epoch": 0.02, + "grad_norm": 78.0, + "learning_rate": 4.951526985440999e-05, + "loss": 1.4038, + "step": 22000 + }, + { + "epoch": 0.02, + "grad_norm": 26.0, + "learning_rate": 4.9510770781219075e-05, + "loss": 1.102, + "step": 22100 + }, + { + "epoch": 0.02, + "grad_norm": 17.625, + "learning_rate": 4.950627170802815e-05, + "loss": 1.1101, + "step": 22200 + }, + { + "epoch": 0.02, + "grad_norm": 23.0, + "learning_rate": 4.9501772634837226e-05, + "loss": 1.3243, + "step": 22300 + }, + { + "epoch": 0.02, + "grad_norm": 89.0, + "learning_rate": 4.94972735616463e-05, + "loss": 1.2503, + "step": 22400 + }, + { + "epoch": 0.02, + "grad_norm": 35.25, + "learning_rate": 4.9492774488455376e-05, + "loss": 1.3802, + "step": 22500 + }, + { + "epoch": 0.02, + "grad_norm": 78.5, + "learning_rate": 4.948827541526446e-05, + "loss": 1.497, + "step": 22600 + }, + { + "epoch": 0.02, + "grad_norm": 73.5, + "learning_rate": 4.9483776342073534e-05, + "loss": 1.3293, + "step": 22700 + }, + { + "epoch": 0.02, + "grad_norm": 27.25, + "learning_rate": 4.947927726888261e-05, + "loss": 1.3108, + "step": 22800 + }, + { + "epoch": 0.02, + "grad_norm": 24.625, + "learning_rate": 4.947477819569169e-05, + "loss": 1.2249, + "step": 22900 + }, + { + "epoch": 0.02, + "grad_norm": 63.75, + "learning_rate": 4.947027912250077e-05, + "loss": 1.1269, + "step": 23000 + }, + { + "epoch": 0.02, + "grad_norm": 55.0, + "learning_rate": 4.946578004930985e-05, + "loss": 1.4874, + "step": 23100 + }, + { + "epoch": 0.02, + "grad_norm": 30.0, + "learning_rate": 4.9461280976118924e-05, + "loss": 1.2414, + "step": 23200 + }, + { + "epoch": 0.02, + "grad_norm": 98.0, + "learning_rate": 4.9456781902928e-05, + "loss": 1.3859, + "step": 23300 + }, + { + "epoch": 0.02, + "grad_norm": 3.78125, + "learning_rate": 4.945228282973708e-05, + "loss": 1.3174, + "step": 23400 + }, + { + "epoch": 0.02, + "grad_norm": 22.125, + "learning_rate": 4.944778375654616e-05, + "loss": 1.3358, + "step": 23500 + }, + { + "epoch": 0.02, + "grad_norm": 36.25, + "learning_rate": 4.9443284683355225e-05, + "loss": 1.4264, + "step": 23600 + }, + { + "epoch": 0.02, + "grad_norm": 60.75, + "learning_rate": 4.943878561016431e-05, + "loss": 1.2525, + "step": 23700 + }, + { + "epoch": 0.02, + "grad_norm": 19.75, + "learning_rate": 4.943428653697338e-05, + "loss": 1.4347, + "step": 23800 + }, + { + "epoch": 0.02, + "grad_norm": 66.5, + "learning_rate": 4.9429787463782465e-05, + "loss": 1.4323, + "step": 23900 + }, + { + "epoch": 0.02, + "grad_norm": 44.75, + "learning_rate": 4.942528839059154e-05, + "loss": 1.3538, + "step": 24000 + }, + { + "epoch": 0.02, + "grad_norm": 104.5, + "learning_rate": 4.9420789317400616e-05, + "loss": 1.3535, + "step": 24100 + }, + { + "epoch": 0.02, + "grad_norm": 41.75, + "learning_rate": 4.94162902442097e-05, + "loss": 1.3764, + "step": 24200 + }, + { + "epoch": 0.02, + "grad_norm": 30.0, + "learning_rate": 4.941179117101877e-05, + "loss": 1.3381, + "step": 24300 + }, + { + "epoch": 0.02, + "grad_norm": 32.5, + "learning_rate": 4.940729209782785e-05, + "loss": 1.3998, + "step": 24400 + }, + { + "epoch": 0.02, + "grad_norm": 17.875, + "learning_rate": 4.940279302463693e-05, + "loss": 1.2407, + "step": 24500 + }, + { + "epoch": 0.02, + "grad_norm": 92.0, + "learning_rate": 4.9398293951446006e-05, + "loss": 1.1868, + "step": 24600 + }, + { + "epoch": 0.02, + "grad_norm": 77.0, + "learning_rate": 4.939379487825508e-05, + "loss": 1.3361, + "step": 24700 + }, + { + "epoch": 0.02, + "grad_norm": 12.5, + "learning_rate": 4.9389295805064163e-05, + "loss": 1.1944, + "step": 24800 + }, + { + "epoch": 0.02, + "grad_norm": 342.0, + "learning_rate": 4.938479673187323e-05, + "loss": 1.4354, + "step": 24900 + }, + { + "epoch": 0.02, + "grad_norm": 282.0, + "learning_rate": 4.9380297658682314e-05, + "loss": 1.2188, + "step": 25000 + }, + { + "epoch": 0.02, + "grad_norm": 1.8828125, + "learning_rate": 4.937579858549139e-05, + "loss": 1.2472, + "step": 25100 + }, + { + "epoch": 0.02, + "grad_norm": 36.0, + "learning_rate": 4.9371299512300465e-05, + "loss": 1.0704, + "step": 25200 + }, + { + "epoch": 0.02, + "grad_norm": 175.0, + "learning_rate": 4.936680043910955e-05, + "loss": 1.2283, + "step": 25300 + }, + { + "epoch": 0.02, + "grad_norm": 42.25, + "learning_rate": 4.936230136591862e-05, + "loss": 1.4128, + "step": 25400 + }, + { + "epoch": 0.02, + "grad_norm": 34.5, + "learning_rate": 4.93578022927277e-05, + "loss": 1.4462, + "step": 25500 + }, + { + "epoch": 0.02, + "grad_norm": 54.0, + "learning_rate": 4.935330321953678e-05, + "loss": 1.2386, + "step": 25600 + }, + { + "epoch": 0.02, + "grad_norm": 87.5, + "learning_rate": 4.9348804146345855e-05, + "loss": 1.2673, + "step": 25700 + }, + { + "epoch": 0.02, + "grad_norm": 324.0, + "learning_rate": 4.934430507315494e-05, + "loss": 1.2364, + "step": 25800 + }, + { + "epoch": 0.02, + "grad_norm": 23.5, + "learning_rate": 4.933980599996401e-05, + "loss": 1.3105, + "step": 25900 + }, + { + "epoch": 0.02, + "grad_norm": 89.5, + "learning_rate": 4.933530692677309e-05, + "loss": 1.2263, + "step": 26000 + }, + { + "epoch": 0.02, + "grad_norm": 12.75, + "learning_rate": 4.933080785358217e-05, + "loss": 1.2275, + "step": 26100 + }, + { + "epoch": 0.02, + "grad_norm": 100.0, + "learning_rate": 4.932630878039124e-05, + "loss": 1.1758, + "step": 26200 + }, + { + "epoch": 0.02, + "grad_norm": 36.25, + "learning_rate": 4.9321809707200314e-05, + "loss": 1.3481, + "step": 26300 + }, + { + "epoch": 0.02, + "grad_norm": 34.75, + "learning_rate": 4.9317310634009396e-05, + "loss": 1.2371, + "step": 26400 + }, + { + "epoch": 0.02, + "grad_norm": 107.0, + "learning_rate": 4.931281156081847e-05, + "loss": 1.4179, + "step": 26500 + }, + { + "epoch": 0.02, + "grad_norm": 30.75, + "learning_rate": 4.9308312487627553e-05, + "loss": 1.1881, + "step": 26600 + }, + { + "epoch": 0.02, + "grad_norm": 70.0, + "learning_rate": 4.930381341443663e-05, + "loss": 1.3085, + "step": 26700 + }, + { + "epoch": 0.02, + "grad_norm": 12.5625, + "learning_rate": 4.9299314341245704e-05, + "loss": 1.2098, + "step": 26800 + }, + { + "epoch": 0.02, + "grad_norm": 121.0, + "learning_rate": 4.9294815268054786e-05, + "loss": 1.2776, + "step": 26900 + }, + { + "epoch": 0.02, + "grad_norm": 20.375, + "learning_rate": 4.929031619486386e-05, + "loss": 1.2937, + "step": 27000 + }, + { + "epoch": 0.02, + "grad_norm": 10.0, + "learning_rate": 4.928581712167294e-05, + "loss": 1.3407, + "step": 27100 + }, + { + "epoch": 0.02, + "grad_norm": 21.25, + "learning_rate": 4.928131804848202e-05, + "loss": 1.3125, + "step": 27200 + }, + { + "epoch": 0.02, + "grad_norm": 94.0, + "learning_rate": 4.9276818975291094e-05, + "loss": 1.1974, + "step": 27300 + }, + { + "epoch": 0.02, + "grad_norm": 170.0, + "learning_rate": 4.927231990210017e-05, + "loss": 1.2744, + "step": 27400 + }, + { + "epoch": 0.02, + "grad_norm": 324.0, + "learning_rate": 4.9267820828909245e-05, + "loss": 1.1951, + "step": 27500 + }, + { + "epoch": 0.02, + "grad_norm": 34.75, + "learning_rate": 4.926332175571832e-05, + "loss": 1.2403, + "step": 27600 + }, + { + "epoch": 0.02, + "grad_norm": 62.25, + "learning_rate": 4.92588226825274e-05, + "loss": 1.2114, + "step": 27700 + }, + { + "epoch": 0.02, + "grad_norm": 86.5, + "learning_rate": 4.925432360933648e-05, + "loss": 1.3519, + "step": 27800 + }, + { + "epoch": 0.02, + "grad_norm": 32.0, + "learning_rate": 4.924982453614555e-05, + "loss": 1.2799, + "step": 27900 + }, + { + "epoch": 0.02, + "grad_norm": 234.0, + "learning_rate": 4.9245325462954635e-05, + "loss": 1.1573, + "step": 28000 + }, + { + "epoch": 0.03, + "grad_norm": 193.0, + "learning_rate": 4.924082638976371e-05, + "loss": 1.3123, + "step": 28100 + }, + { + "epoch": 0.03, + "grad_norm": 77.5, + "learning_rate": 4.9236327316572786e-05, + "loss": 1.4343, + "step": 28200 + }, + { + "epoch": 0.03, + "grad_norm": 0.00811767578125, + "learning_rate": 4.923182824338187e-05, + "loss": 1.2112, + "step": 28300 + }, + { + "epoch": 0.03, + "grad_norm": 43.0, + "learning_rate": 4.9227329170190943e-05, + "loss": 1.404, + "step": 28400 + }, + { + "epoch": 0.03, + "grad_norm": 37.0, + "learning_rate": 4.9222830097000026e-05, + "loss": 1.3227, + "step": 28500 + }, + { + "epoch": 0.03, + "grad_norm": 756.0, + "learning_rate": 4.92183310238091e-05, + "loss": 1.4063, + "step": 28600 + }, + { + "epoch": 0.03, + "grad_norm": 27.0, + "learning_rate": 4.9213831950618176e-05, + "loss": 1.328, + "step": 28700 + }, + { + "epoch": 0.03, + "grad_norm": 32.0, + "learning_rate": 4.920933287742725e-05, + "loss": 1.3589, + "step": 28800 + }, + { + "epoch": 0.03, + "grad_norm": 60.25, + "learning_rate": 4.920483380423633e-05, + "loss": 1.2317, + "step": 28900 + }, + { + "epoch": 0.03, + "grad_norm": 6.78125, + "learning_rate": 4.92003347310454e-05, + "loss": 1.2723, + "step": 29000 + }, + { + "epoch": 0.03, + "grad_norm": 332.0, + "learning_rate": 4.9195835657854484e-05, + "loss": 1.1732, + "step": 29100 + }, + { + "epoch": 0.03, + "grad_norm": 19.125, + "learning_rate": 4.919133658466356e-05, + "loss": 1.3209, + "step": 29200 + }, + { + "epoch": 0.03, + "grad_norm": 18.0, + "learning_rate": 4.918683751147264e-05, + "loss": 1.0832, + "step": 29300 + }, + { + "epoch": 0.03, + "grad_norm": 101.5, + "learning_rate": 4.918233843828172e-05, + "loss": 1.2793, + "step": 29400 + }, + { + "epoch": 0.03, + "grad_norm": 0.00098419189453125, + "learning_rate": 4.917783936509079e-05, + "loss": 1.2659, + "step": 29500 + }, + { + "epoch": 0.03, + "grad_norm": 19.125, + "learning_rate": 4.9173340291899875e-05, + "loss": 1.429, + "step": 29600 + }, + { + "epoch": 0.03, + "grad_norm": 18.0, + "learning_rate": 4.916884121870895e-05, + "loss": 1.3748, + "step": 29700 + }, + { + "epoch": 0.03, + "grad_norm": 11.8125, + "learning_rate": 4.9164342145518025e-05, + "loss": 1.404, + "step": 29800 + }, + { + "epoch": 0.03, + "grad_norm": 39.25, + "learning_rate": 4.915984307232711e-05, + "loss": 1.396, + "step": 29900 + }, + { + "epoch": 0.03, + "grad_norm": 46.25, + "learning_rate": 4.915534399913618e-05, + "loss": 1.3234, + "step": 30000 + }, + { + "epoch": 0.03, + "grad_norm": 33.5, + "learning_rate": 4.915084492594526e-05, + "loss": 1.2151, + "step": 30100 + }, + { + "epoch": 0.03, + "grad_norm": 96.0, + "learning_rate": 4.9146345852754333e-05, + "loss": 1.2557, + "step": 30200 + }, + { + "epoch": 0.03, + "grad_norm": 31.5, + "learning_rate": 4.914184677956341e-05, + "loss": 1.3144, + "step": 30300 + }, + { + "epoch": 0.03, + "grad_norm": 41.5, + "learning_rate": 4.913734770637249e-05, + "loss": 1.3245, + "step": 30400 + }, + { + "epoch": 0.03, + "grad_norm": 40.25, + "learning_rate": 4.9132848633181566e-05, + "loss": 1.2695, + "step": 30500 + }, + { + "epoch": 0.03, + "grad_norm": 2.15625, + "learning_rate": 4.912834955999064e-05, + "loss": 1.2701, + "step": 30600 + }, + { + "epoch": 0.03, + "grad_norm": 0.060546875, + "learning_rate": 4.9123850486799724e-05, + "loss": 1.2759, + "step": 30700 + }, + { + "epoch": 0.03, + "grad_norm": 34.5, + "learning_rate": 4.91193514136088e-05, + "loss": 1.1844, + "step": 30800 + }, + { + "epoch": 0.03, + "grad_norm": 146.0, + "learning_rate": 4.9114852340417874e-05, + "loss": 1.3678, + "step": 30900 + }, + { + "epoch": 0.03, + "grad_norm": 25.875, + "learning_rate": 4.9110353267226956e-05, + "loss": 1.2937, + "step": 31000 + }, + { + "epoch": 0.03, + "grad_norm": 26.875, + "learning_rate": 4.910585419403603e-05, + "loss": 1.2822, + "step": 31100 + }, + { + "epoch": 0.03, + "grad_norm": 75.5, + "learning_rate": 4.9101355120845114e-05, + "loss": 1.299, + "step": 31200 + }, + { + "epoch": 0.03, + "grad_norm": 13.5625, + "learning_rate": 4.909685604765419e-05, + "loss": 1.2342, + "step": 31300 + }, + { + "epoch": 0.03, + "grad_norm": 3.3125, + "learning_rate": 4.909235697446326e-05, + "loss": 1.2827, + "step": 31400 + }, + { + "epoch": 0.03, + "grad_norm": 116.0, + "learning_rate": 4.908785790127234e-05, + "loss": 1.13, + "step": 31500 + }, + { + "epoch": 0.03, + "grad_norm": 32.25, + "learning_rate": 4.9083358828081415e-05, + "loss": 1.1681, + "step": 31600 + }, + { + "epoch": 0.03, + "grad_norm": 19.875, + "learning_rate": 4.907885975489049e-05, + "loss": 1.1674, + "step": 31700 + }, + { + "epoch": 0.03, + "grad_norm": 55.75, + "learning_rate": 4.907436068169957e-05, + "loss": 1.3497, + "step": 31800 + }, + { + "epoch": 0.03, + "grad_norm": 34.5, + "learning_rate": 4.906986160850865e-05, + "loss": 1.2089, + "step": 31900 + }, + { + "epoch": 0.03, + "grad_norm": 57.75, + "learning_rate": 4.906536253531773e-05, + "loss": 1.2747, + "step": 32000 + }, + { + "epoch": 0.03, + "grad_norm": 76.5, + "learning_rate": 4.9060863462126806e-05, + "loss": 1.3811, + "step": 32100 + }, + { + "epoch": 0.03, + "grad_norm": 0.2060546875, + "learning_rate": 4.905636438893588e-05, + "loss": 1.2696, + "step": 32200 + }, + { + "epoch": 0.03, + "grad_norm": 116.0, + "learning_rate": 4.905186531574496e-05, + "loss": 1.2768, + "step": 32300 + }, + { + "epoch": 0.03, + "grad_norm": 11.75, + "learning_rate": 4.904736624255404e-05, + "loss": 1.2377, + "step": 32400 + }, + { + "epoch": 0.03, + "grad_norm": 135.0, + "learning_rate": 4.9042867169363114e-05, + "loss": 1.1929, + "step": 32500 + }, + { + "epoch": 0.03, + "grad_norm": 22.75, + "learning_rate": 4.9038368096172196e-05, + "loss": 1.2497, + "step": 32600 + }, + { + "epoch": 0.03, + "grad_norm": 20.375, + "learning_rate": 4.9033869022981264e-05, + "loss": 1.3461, + "step": 32700 + }, + { + "epoch": 0.03, + "grad_norm": 20.375, + "learning_rate": 4.9029369949790346e-05, + "loss": 1.2957, + "step": 32800 + }, + { + "epoch": 0.03, + "grad_norm": 83.5, + "learning_rate": 4.902487087659942e-05, + "loss": 1.4008, + "step": 32900 + }, + { + "epoch": 0.03, + "grad_norm": 42.25, + "learning_rate": 4.90203718034085e-05, + "loss": 1.2242, + "step": 33000 + }, + { + "epoch": 0.03, + "grad_norm": 165.0, + "learning_rate": 4.901587273021758e-05, + "loss": 1.3613, + "step": 33100 + }, + { + "epoch": 0.03, + "grad_norm": 0.1533203125, + "learning_rate": 4.9011373657026655e-05, + "loss": 1.1811, + "step": 33200 + }, + { + "epoch": 0.03, + "grad_norm": 49.25, + "learning_rate": 4.900687458383573e-05, + "loss": 1.3841, + "step": 33300 + }, + { + "epoch": 0.03, + "grad_norm": 0.64453125, + "learning_rate": 4.900237551064481e-05, + "loss": 1.3316, + "step": 33400 + }, + { + "epoch": 0.03, + "grad_norm": 20.0, + "learning_rate": 4.899787643745389e-05, + "loss": 1.2325, + "step": 33500 + }, + { + "epoch": 0.03, + "grad_norm": 34.75, + "learning_rate": 4.899337736426296e-05, + "loss": 1.1637, + "step": 33600 + }, + { + "epoch": 0.03, + "grad_norm": 77.5, + "learning_rate": 4.8988878291072045e-05, + "loss": 1.332, + "step": 33700 + }, + { + "epoch": 0.03, + "grad_norm": 15.875, + "learning_rate": 4.898437921788112e-05, + "loss": 1.1109, + "step": 33800 + }, + { + "epoch": 0.03, + "grad_norm": 25.125, + "learning_rate": 4.8979880144690196e-05, + "loss": 1.1916, + "step": 33900 + }, + { + "epoch": 0.03, + "grad_norm": 88.0, + "learning_rate": 4.897538107149927e-05, + "loss": 1.2063, + "step": 34000 + }, + { + "epoch": 0.03, + "grad_norm": 89.0, + "learning_rate": 4.8970881998308346e-05, + "loss": 1.2242, + "step": 34100 + }, + { + "epoch": 0.03, + "grad_norm": 27.125, + "learning_rate": 4.896638292511743e-05, + "loss": 1.1195, + "step": 34200 + }, + { + "epoch": 0.03, + "grad_norm": 38.0, + "learning_rate": 4.8961883851926504e-05, + "loss": 1.1744, + "step": 34300 + }, + { + "epoch": 0.03, + "grad_norm": 66.0, + "learning_rate": 4.895738477873558e-05, + "loss": 1.4476, + "step": 34400 + }, + { + "epoch": 0.03, + "grad_norm": 17.0, + "learning_rate": 4.895288570554466e-05, + "loss": 1.2077, + "step": 34500 + }, + { + "epoch": 0.03, + "grad_norm": 153.0, + "learning_rate": 4.8948386632353736e-05, + "loss": 1.0947, + "step": 34600 + }, + { + "epoch": 0.03, + "grad_norm": 21.625, + "learning_rate": 4.894388755916282e-05, + "loss": 1.2036, + "step": 34700 + }, + { + "epoch": 0.03, + "grad_norm": 1.3671875, + "learning_rate": 4.8939388485971894e-05, + "loss": 1.2348, + "step": 34800 + }, + { + "epoch": 0.03, + "grad_norm": 24.625, + "learning_rate": 4.893488941278097e-05, + "loss": 1.2718, + "step": 34900 + }, + { + "epoch": 0.03, + "grad_norm": 73.5, + "learning_rate": 4.893039033959005e-05, + "loss": 1.2822, + "step": 35000 + }, + { + "epoch": 0.03, + "grad_norm": 2.015625, + "learning_rate": 4.892589126639913e-05, + "loss": 1.3972, + "step": 35100 + }, + { + "epoch": 0.03, + "grad_norm": 65.5, + "learning_rate": 4.89213921932082e-05, + "loss": 1.3597, + "step": 35200 + }, + { + "epoch": 0.03, + "grad_norm": 44.75, + "learning_rate": 4.891689312001728e-05, + "loss": 1.3293, + "step": 35300 + }, + { + "epoch": 0.03, + "grad_norm": 29.75, + "learning_rate": 4.891239404682635e-05, + "loss": 1.3688, + "step": 35400 + }, + { + "epoch": 0.03, + "grad_norm": 49.5, + "learning_rate": 4.8907894973635435e-05, + "loss": 1.4431, + "step": 35500 + }, + { + "epoch": 0.03, + "grad_norm": 28.875, + "learning_rate": 4.890339590044451e-05, + "loss": 1.3478, + "step": 35600 + }, + { + "epoch": 0.03, + "grad_norm": 23.625, + "learning_rate": 4.8898896827253586e-05, + "loss": 1.3951, + "step": 35700 + }, + { + "epoch": 0.03, + "grad_norm": 88.5, + "learning_rate": 4.889439775406267e-05, + "loss": 1.3132, + "step": 35800 + }, + { + "epoch": 0.03, + "grad_norm": 5.8125, + "learning_rate": 4.888989868087174e-05, + "loss": 1.2843, + "step": 35900 + }, + { + "epoch": 0.03, + "grad_norm": 71.0, + "learning_rate": 4.888539960768082e-05, + "loss": 1.3047, + "step": 36000 + }, + { + "epoch": 0.03, + "grad_norm": 41.5, + "learning_rate": 4.88809005344899e-05, + "loss": 1.3439, + "step": 36100 + }, + { + "epoch": 0.03, + "grad_norm": 24.5, + "learning_rate": 4.8876401461298976e-05, + "loss": 1.2515, + "step": 36200 + }, + { + "epoch": 0.03, + "grad_norm": 30.75, + "learning_rate": 4.887190238810805e-05, + "loss": 1.3564, + "step": 36300 + }, + { + "epoch": 0.03, + "grad_norm": 20.625, + "learning_rate": 4.886740331491713e-05, + "loss": 1.1833, + "step": 36400 + }, + { + "epoch": 0.03, + "grad_norm": 22.625, + "learning_rate": 4.886290424172621e-05, + "loss": 1.3388, + "step": 36500 + }, + { + "epoch": 0.03, + "grad_norm": 41.0, + "learning_rate": 4.8858405168535284e-05, + "loss": 1.2338, + "step": 36600 + }, + { + "epoch": 0.03, + "grad_norm": 111.5, + "learning_rate": 4.885390609534436e-05, + "loss": 1.3098, + "step": 36700 + }, + { + "epoch": 0.03, + "grad_norm": 406.0, + "learning_rate": 4.8849407022153435e-05, + "loss": 1.2323, + "step": 36800 + }, + { + "epoch": 0.03, + "grad_norm": 32.25, + "learning_rate": 4.884490794896252e-05, + "loss": 1.2944, + "step": 36900 + }, + { + "epoch": 0.03, + "grad_norm": 47.5, + "learning_rate": 4.884040887577159e-05, + "loss": 1.2743, + "step": 37000 + }, + { + "epoch": 0.03, + "grad_norm": 39.25, + "learning_rate": 4.883590980258067e-05, + "loss": 1.2856, + "step": 37100 + }, + { + "epoch": 0.03, + "grad_norm": 88.0, + "learning_rate": 4.883141072938975e-05, + "loss": 1.179, + "step": 37200 + }, + { + "epoch": 0.03, + "grad_norm": 32.25, + "learning_rate": 4.8826911656198825e-05, + "loss": 1.2453, + "step": 37300 + }, + { + "epoch": 0.03, + "grad_norm": 76.5, + "learning_rate": 4.882241258300791e-05, + "loss": 1.3155, + "step": 37400 + }, + { + "epoch": 0.03, + "grad_norm": 28.375, + "learning_rate": 4.881791350981698e-05, + "loss": 1.2103, + "step": 37500 + }, + { + "epoch": 0.03, + "grad_norm": 71.0, + "learning_rate": 4.881341443662606e-05, + "loss": 1.2651, + "step": 37600 + }, + { + "epoch": 0.03, + "grad_norm": 17.25, + "learning_rate": 4.880891536343514e-05, + "loss": 1.5132, + "step": 37700 + }, + { + "epoch": 0.03, + "grad_norm": 86.5, + "learning_rate": 4.8804416290244215e-05, + "loss": 1.3271, + "step": 37800 + }, + { + "epoch": 0.03, + "grad_norm": 123.5, + "learning_rate": 4.8799917217053284e-05, + "loss": 1.3711, + "step": 37900 + }, + { + "epoch": 0.03, + "grad_norm": 148.0, + "learning_rate": 4.8795418143862366e-05, + "loss": 1.2331, + "step": 38000 + }, + { + "epoch": 0.03, + "grad_norm": 24.25, + "learning_rate": 4.879091907067144e-05, + "loss": 1.1995, + "step": 38100 + }, + { + "epoch": 0.03, + "grad_norm": 0.21484375, + "learning_rate": 4.878641999748052e-05, + "loss": 1.3692, + "step": 38200 + }, + { + "epoch": 0.03, + "grad_norm": 70.0, + "learning_rate": 4.87819209242896e-05, + "loss": 1.2408, + "step": 38300 + }, + { + "epoch": 0.03, + "grad_norm": 17.0, + "learning_rate": 4.8777421851098674e-05, + "loss": 1.2109, + "step": 38400 + }, + { + "epoch": 0.03, + "grad_norm": 49.0, + "learning_rate": 4.8772922777907756e-05, + "loss": 1.2887, + "step": 38500 + }, + { + "epoch": 0.03, + "grad_norm": 11.25, + "learning_rate": 4.876842370471683e-05, + "loss": 1.3198, + "step": 38600 + }, + { + "epoch": 0.03, + "grad_norm": 19.75, + "learning_rate": 4.876392463152591e-05, + "loss": 1.359, + "step": 38700 + }, + { + "epoch": 0.03, + "grad_norm": 120.0, + "learning_rate": 4.875942555833499e-05, + "loss": 1.3114, + "step": 38800 + }, + { + "epoch": 0.03, + "grad_norm": 25.875, + "learning_rate": 4.8754926485144064e-05, + "loss": 1.2283, + "step": 38900 + }, + { + "epoch": 0.03, + "grad_norm": 0.047607421875, + "learning_rate": 4.875042741195314e-05, + "loss": 1.1942, + "step": 39000 + }, + { + "epoch": 0.03, + "grad_norm": 16.125, + "learning_rate": 4.874592833876222e-05, + "loss": 1.2641, + "step": 39100 + }, + { + "epoch": 0.03, + "grad_norm": 56.5, + "learning_rate": 4.874142926557129e-05, + "loss": 1.0726, + "step": 39200 + }, + { + "epoch": 0.04, + "grad_norm": 169.0, + "learning_rate": 4.873693019238037e-05, + "loss": 1.1969, + "step": 39300 + }, + { + "epoch": 0.04, + "grad_norm": 54.25, + "learning_rate": 4.873243111918945e-05, + "loss": 1.1107, + "step": 39400 + }, + { + "epoch": 0.04, + "grad_norm": 62.0, + "learning_rate": 4.872793204599852e-05, + "loss": 1.4246, + "step": 39500 + }, + { + "epoch": 0.04, + "grad_norm": 18.625, + "learning_rate": 4.8723432972807605e-05, + "loss": 1.2903, + "step": 39600 + }, + { + "epoch": 0.04, + "grad_norm": 0.028564453125, + "learning_rate": 4.871893389961668e-05, + "loss": 1.3936, + "step": 39700 + }, + { + "epoch": 0.04, + "grad_norm": 170.0, + "learning_rate": 4.8714434826425756e-05, + "loss": 1.1697, + "step": 39800 + }, + { + "epoch": 0.04, + "grad_norm": 25.0, + "learning_rate": 4.870993575323484e-05, + "loss": 1.1897, + "step": 39900 + }, + { + "epoch": 0.04, + "grad_norm": 28.625, + "learning_rate": 4.870543668004391e-05, + "loss": 1.3424, + "step": 40000 + }, + { + "epoch": 0.04, + "grad_norm": 52.75, + "learning_rate": 4.8700937606852995e-05, + "loss": 1.2875, + "step": 40100 + }, + { + "epoch": 0.04, + "grad_norm": 0.004669189453125, + "learning_rate": 4.869643853366207e-05, + "loss": 1.2613, + "step": 40200 + }, + { + "epoch": 0.04, + "grad_norm": 73.0, + "learning_rate": 4.8691939460471146e-05, + "loss": 1.376, + "step": 40300 + }, + { + "epoch": 0.04, + "grad_norm": 8.5625, + "learning_rate": 4.868744038728022e-05, + "loss": 1.2863, + "step": 40400 + }, + { + "epoch": 0.04, + "grad_norm": 25.25, + "learning_rate": 4.86829413140893e-05, + "loss": 1.4402, + "step": 40500 + }, + { + "epoch": 0.04, + "grad_norm": 46.25, + "learning_rate": 4.867844224089837e-05, + "loss": 1.3281, + "step": 40600 + }, + { + "epoch": 0.04, + "grad_norm": 140.0, + "learning_rate": 4.8673943167707454e-05, + "loss": 1.3408, + "step": 40700 + }, + { + "epoch": 0.04, + "grad_norm": 36.0, + "learning_rate": 4.866944409451653e-05, + "loss": 1.2578, + "step": 40800 + }, + { + "epoch": 0.04, + "grad_norm": 49.5, + "learning_rate": 4.866494502132561e-05, + "loss": 1.3747, + "step": 40900 + }, + { + "epoch": 0.04, + "grad_norm": 48.25, + "learning_rate": 4.866044594813469e-05, + "loss": 1.2134, + "step": 41000 + }, + { + "epoch": 0.04, + "grad_norm": 52.5, + "learning_rate": 4.865594687494376e-05, + "loss": 1.2627, + "step": 41100 + }, + { + "epoch": 0.04, + "grad_norm": 61.5, + "learning_rate": 4.8651447801752844e-05, + "loss": 1.2865, + "step": 41200 + }, + { + "epoch": 0.04, + "grad_norm": 84.5, + "learning_rate": 4.864694872856192e-05, + "loss": 1.2833, + "step": 41300 + }, + { + "epoch": 0.04, + "grad_norm": 22.75, + "learning_rate": 4.8642449655370995e-05, + "loss": 1.3422, + "step": 41400 + }, + { + "epoch": 0.04, + "grad_norm": 43.0, + "learning_rate": 4.863795058218008e-05, + "loss": 1.287, + "step": 41500 + }, + { + "epoch": 0.04, + "grad_norm": 18.5, + "learning_rate": 4.863345150898915e-05, + "loss": 1.2372, + "step": 41600 + }, + { + "epoch": 0.04, + "grad_norm": 78.0, + "learning_rate": 4.862895243579823e-05, + "loss": 1.3556, + "step": 41700 + }, + { + "epoch": 0.04, + "grad_norm": 128.0, + "learning_rate": 4.86244533626073e-05, + "loss": 1.4245, + "step": 41800 + }, + { + "epoch": 0.04, + "grad_norm": 162.0, + "learning_rate": 4.861995428941638e-05, + "loss": 1.2928, + "step": 41900 + }, + { + "epoch": 0.04, + "grad_norm": 107.5, + "learning_rate": 4.861545521622546e-05, + "loss": 1.2586, + "step": 42000 + }, + { + "epoch": 0.04, + "grad_norm": 5.0, + "learning_rate": 4.8610956143034536e-05, + "loss": 1.2892, + "step": 42100 + }, + { + "epoch": 0.04, + "grad_norm": 73.0, + "learning_rate": 4.860645706984361e-05, + "loss": 1.1325, + "step": 42200 + }, + { + "epoch": 0.04, + "grad_norm": 5.5, + "learning_rate": 4.8601957996652693e-05, + "loss": 1.2525, + "step": 42300 + }, + { + "epoch": 0.04, + "grad_norm": 58.25, + "learning_rate": 4.859745892346177e-05, + "loss": 1.0027, + "step": 42400 + }, + { + "epoch": 0.04, + "grad_norm": 44.0, + "learning_rate": 4.8592959850270844e-05, + "loss": 1.3387, + "step": 42500 + }, + { + "epoch": 0.04, + "grad_norm": 0.005401611328125, + "learning_rate": 4.8588460777079926e-05, + "loss": 1.4061, + "step": 42600 + }, + { + "epoch": 0.04, + "grad_norm": 47.0, + "learning_rate": 4.8583961703889e-05, + "loss": 1.2295, + "step": 42700 + }, + { + "epoch": 0.04, + "grad_norm": 23.375, + "learning_rate": 4.8579462630698084e-05, + "loss": 1.3215, + "step": 42800 + }, + { + "epoch": 0.04, + "grad_norm": 78.0, + "learning_rate": 4.857496355750716e-05, + "loss": 1.115, + "step": 42900 + }, + { + "epoch": 0.04, + "grad_norm": 25.75, + "learning_rate": 4.857046448431623e-05, + "loss": 1.3653, + "step": 43000 + }, + { + "epoch": 0.04, + "grad_norm": 39.5, + "learning_rate": 4.856596541112531e-05, + "loss": 1.1244, + "step": 43100 + }, + { + "epoch": 0.04, + "grad_norm": 29.0, + "learning_rate": 4.8561466337934385e-05, + "loss": 1.2352, + "step": 43200 + }, + { + "epoch": 0.04, + "grad_norm": 119.0, + "learning_rate": 4.855696726474346e-05, + "loss": 1.3514, + "step": 43300 + }, + { + "epoch": 0.04, + "grad_norm": 8.0, + "learning_rate": 4.855246819155254e-05, + "loss": 1.2969, + "step": 43400 + }, + { + "epoch": 0.04, + "grad_norm": 49.75, + "learning_rate": 4.854796911836162e-05, + "loss": 1.3222, + "step": 43500 + }, + { + "epoch": 0.04, + "grad_norm": 36.75, + "learning_rate": 4.85434700451707e-05, + "loss": 1.3191, + "step": 43600 + }, + { + "epoch": 0.04, + "grad_norm": 374.0, + "learning_rate": 4.8538970971979775e-05, + "loss": 1.3238, + "step": 43700 + }, + { + "epoch": 0.04, + "grad_norm": 56.0, + "learning_rate": 4.853447189878885e-05, + "loss": 1.4575, + "step": 43800 + }, + { + "epoch": 0.04, + "grad_norm": 35.75, + "learning_rate": 4.852997282559793e-05, + "loss": 1.3157, + "step": 43900 + }, + { + "epoch": 0.04, + "grad_norm": 23.125, + "learning_rate": 4.852547375240701e-05, + "loss": 1.1149, + "step": 44000 + }, + { + "epoch": 0.04, + "grad_norm": 292.0, + "learning_rate": 4.8520974679216083e-05, + "loss": 1.1264, + "step": 44100 + }, + { + "epoch": 0.04, + "grad_norm": 29.375, + "learning_rate": 4.8516475606025166e-05, + "loss": 1.1437, + "step": 44200 + }, + { + "epoch": 0.04, + "grad_norm": 0.058349609375, + "learning_rate": 4.8511976532834234e-05, + "loss": 1.1619, + "step": 44300 + }, + { + "epoch": 0.04, + "grad_norm": 29.0, + "learning_rate": 4.8507477459643316e-05, + "loss": 1.2967, + "step": 44400 + }, + { + "epoch": 0.04, + "grad_norm": 13.8125, + "learning_rate": 4.850297838645239e-05, + "loss": 1.2666, + "step": 44500 + }, + { + "epoch": 0.04, + "grad_norm": 28.5, + "learning_rate": 4.849847931326147e-05, + "loss": 1.3183, + "step": 44600 + }, + { + "epoch": 0.04, + "grad_norm": 12.75, + "learning_rate": 4.849398024007055e-05, + "loss": 1.0624, + "step": 44700 + }, + { + "epoch": 0.04, + "grad_norm": 36.0, + "learning_rate": 4.8489481166879624e-05, + "loss": 1.2649, + "step": 44800 + }, + { + "epoch": 0.04, + "grad_norm": 59.0, + "learning_rate": 4.84849820936887e-05, + "loss": 1.2167, + "step": 44900 + }, + { + "epoch": 0.04, + "grad_norm": 24.5, + "learning_rate": 4.848048302049778e-05, + "loss": 1.1337, + "step": 45000 + }, + { + "epoch": 0.04, + "grad_norm": 70.5, + "learning_rate": 4.847598394730686e-05, + "loss": 1.4822, + "step": 45100 + }, + { + "epoch": 0.04, + "grad_norm": 0.004852294921875, + "learning_rate": 4.847148487411593e-05, + "loss": 1.1802, + "step": 45200 + }, + { + "epoch": 0.04, + "grad_norm": 42.5, + "learning_rate": 4.8466985800925015e-05, + "loss": 1.2666, + "step": 45300 + }, + { + "epoch": 0.04, + "grad_norm": 11.0, + "learning_rate": 4.846248672773409e-05, + "loss": 1.1953, + "step": 45400 + }, + { + "epoch": 0.04, + "grad_norm": 26.625, + "learning_rate": 4.845798765454317e-05, + "loss": 1.2385, + "step": 45500 + }, + { + "epoch": 0.04, + "grad_norm": 51.25, + "learning_rate": 4.845348858135224e-05, + "loss": 1.2195, + "step": 45600 + }, + { + "epoch": 0.04, + "grad_norm": 33.75, + "learning_rate": 4.8448989508161316e-05, + "loss": 1.1949, + "step": 45700 + }, + { + "epoch": 0.04, + "grad_norm": 70.5, + "learning_rate": 4.84444904349704e-05, + "loss": 1.2288, + "step": 45800 + }, + { + "epoch": 0.04, + "grad_norm": 2.703125, + "learning_rate": 4.8439991361779473e-05, + "loss": 1.2516, + "step": 45900 + }, + { + "epoch": 0.04, + "grad_norm": 154.0, + "learning_rate": 4.843549228858855e-05, + "loss": 1.3733, + "step": 46000 + }, + { + "epoch": 0.04, + "grad_norm": 17.25, + "learning_rate": 4.843099321539763e-05, + "loss": 1.2666, + "step": 46100 + }, + { + "epoch": 0.04, + "grad_norm": 21.375, + "learning_rate": 4.8426494142206706e-05, + "loss": 1.2936, + "step": 46200 + }, + { + "epoch": 0.04, + "grad_norm": 25.875, + "learning_rate": 4.842199506901579e-05, + "loss": 1.4903, + "step": 46300 + }, + { + "epoch": 0.04, + "grad_norm": 111.5, + "learning_rate": 4.8417495995824864e-05, + "loss": 1.2341, + "step": 46400 + }, + { + "epoch": 0.04, + "grad_norm": 30.0, + "learning_rate": 4.841299692263394e-05, + "loss": 1.2469, + "step": 46500 + }, + { + "epoch": 0.04, + "grad_norm": 10.125, + "learning_rate": 4.840849784944302e-05, + "loss": 1.2281, + "step": 46600 + }, + { + "epoch": 0.04, + "grad_norm": 0.02490234375, + "learning_rate": 4.8403998776252097e-05, + "loss": 1.3978, + "step": 46700 + }, + { + "epoch": 0.04, + "grad_norm": 104.0, + "learning_rate": 4.839949970306117e-05, + "loss": 1.3517, + "step": 46800 + }, + { + "epoch": 0.04, + "grad_norm": 24.0, + "learning_rate": 4.839500062987025e-05, + "loss": 1.3805, + "step": 46900 + }, + { + "epoch": 0.04, + "grad_norm": 14.5625, + "learning_rate": 4.839050155667932e-05, + "loss": 1.1528, + "step": 47000 + }, + { + "epoch": 0.04, + "grad_norm": 56.25, + "learning_rate": 4.8386002483488405e-05, + "loss": 1.2795, + "step": 47100 + }, + { + "epoch": 0.04, + "grad_norm": 0.46484375, + "learning_rate": 4.838150341029748e-05, + "loss": 1.3025, + "step": 47200 + }, + { + "epoch": 0.04, + "grad_norm": 60.5, + "learning_rate": 4.8377004337106555e-05, + "loss": 1.3028, + "step": 47300 + }, + { + "epoch": 0.04, + "grad_norm": 24.5, + "learning_rate": 4.837250526391564e-05, + "loss": 1.1532, + "step": 47400 + }, + { + "epoch": 0.04, + "grad_norm": 165.0, + "learning_rate": 4.836800619072471e-05, + "loss": 1.3127, + "step": 47500 + }, + { + "epoch": 0.04, + "grad_norm": 92.5, + "learning_rate": 4.836350711753379e-05, + "loss": 1.278, + "step": 47600 + }, + { + "epoch": 0.04, + "grad_norm": 0.00176239013671875, + "learning_rate": 4.835900804434287e-05, + "loss": 1.0553, + "step": 47700 + }, + { + "epoch": 0.04, + "grad_norm": 27.0, + "learning_rate": 4.8354508971151946e-05, + "loss": 1.25, + "step": 47800 + }, + { + "epoch": 0.04, + "grad_norm": 0.035888671875, + "learning_rate": 4.835000989796102e-05, + "loss": 1.1231, + "step": 47900 + }, + { + "epoch": 0.04, + "grad_norm": 39.5, + "learning_rate": 4.83455108247701e-05, + "loss": 1.311, + "step": 48000 + }, + { + "epoch": 0.04, + "grad_norm": 40.25, + "learning_rate": 4.834101175157918e-05, + "loss": 1.2455, + "step": 48100 + }, + { + "epoch": 0.04, + "grad_norm": 31.125, + "learning_rate": 4.8336512678388254e-05, + "loss": 1.3419, + "step": 48200 + }, + { + "epoch": 0.04, + "grad_norm": 59.25, + "learning_rate": 4.833201360519733e-05, + "loss": 1.2595, + "step": 48300 + }, + { + "epoch": 0.04, + "grad_norm": 52.5, + "learning_rate": 4.8327514532006404e-05, + "loss": 1.1156, + "step": 48400 + }, + { + "epoch": 0.04, + "grad_norm": 88.0, + "learning_rate": 4.8323015458815487e-05, + "loss": 1.065, + "step": 48500 + }, + { + "epoch": 0.04, + "grad_norm": 24.75, + "learning_rate": 4.831851638562456e-05, + "loss": 1.3643, + "step": 48600 + }, + { + "epoch": 0.04, + "grad_norm": 75.0, + "learning_rate": 4.831401731243364e-05, + "loss": 1.2967, + "step": 48700 + }, + { + "epoch": 0.04, + "grad_norm": 0.002349853515625, + "learning_rate": 4.830951823924272e-05, + "loss": 1.1988, + "step": 48800 + }, + { + "epoch": 0.04, + "grad_norm": 0.0308837890625, + "learning_rate": 4.8305019166051795e-05, + "loss": 1.3315, + "step": 48900 + }, + { + "epoch": 0.04, + "grad_norm": 16.5, + "learning_rate": 4.830052009286088e-05, + "loss": 1.2752, + "step": 49000 + }, + { + "epoch": 0.04, + "grad_norm": 24.875, + "learning_rate": 4.829602101966995e-05, + "loss": 1.2389, + "step": 49100 + }, + { + "epoch": 0.04, + "grad_norm": 1.59375, + "learning_rate": 4.829152194647903e-05, + "loss": 1.3067, + "step": 49200 + }, + { + "epoch": 0.04, + "grad_norm": 23.5, + "learning_rate": 4.828702287328811e-05, + "loss": 1.0593, + "step": 49300 + }, + { + "epoch": 0.04, + "grad_norm": 0.75390625, + "learning_rate": 4.8282523800097185e-05, + "loss": 1.2773, + "step": 49400 + }, + { + "epoch": 0.04, + "grad_norm": 14.9375, + "learning_rate": 4.827802472690626e-05, + "loss": 1.2028, + "step": 49500 + }, + { + "epoch": 0.04, + "grad_norm": 79.0, + "learning_rate": 4.8273525653715336e-05, + "loss": 1.306, + "step": 49600 + }, + { + "epoch": 0.04, + "grad_norm": 0.0810546875, + "learning_rate": 4.826902658052441e-05, + "loss": 1.3618, + "step": 49700 + }, + { + "epoch": 0.04, + "grad_norm": 16.125, + "learning_rate": 4.826452750733349e-05, + "loss": 1.1252, + "step": 49800 + }, + { + "epoch": 0.04, + "grad_norm": 14.875, + "learning_rate": 4.826002843414257e-05, + "loss": 1.2516, + "step": 49900 + }, + { + "epoch": 0.04, + "grad_norm": 0.07666015625, + "learning_rate": 4.8255529360951644e-05, + "loss": 1.0725, + "step": 50000 + }, + { + "epoch": 0.04, + "grad_norm": 9.4375, + "learning_rate": 4.8251030287760726e-05, + "loss": 1.3507, + "step": 50100 + }, + { + "epoch": 0.04, + "grad_norm": 28.5, + "learning_rate": 4.82465312145698e-05, + "loss": 1.1303, + "step": 50200 + }, + { + "epoch": 0.04, + "grad_norm": 86.0, + "learning_rate": 4.8242032141378877e-05, + "loss": 1.2811, + "step": 50300 + }, + { + "epoch": 0.04, + "grad_norm": 77.5, + "learning_rate": 4.823753306818796e-05, + "loss": 1.2733, + "step": 50400 + }, + { + "epoch": 0.04, + "grad_norm": 29.5, + "learning_rate": 4.8233033994997034e-05, + "loss": 1.3248, + "step": 50500 + }, + { + "epoch": 0.05, + "grad_norm": 73.0, + "learning_rate": 4.822853492180611e-05, + "loss": 1.1872, + "step": 50600 + }, + { + "epoch": 0.05, + "grad_norm": 0.17578125, + "learning_rate": 4.822403584861519e-05, + "loss": 1.3451, + "step": 50700 + }, + { + "epoch": 0.05, + "grad_norm": 1.96875, + "learning_rate": 4.821953677542426e-05, + "loss": 1.2303, + "step": 50800 + }, + { + "epoch": 0.05, + "grad_norm": 0.04931640625, + "learning_rate": 4.821503770223334e-05, + "loss": 1.2957, + "step": 50900 + }, + { + "epoch": 0.05, + "grad_norm": 15.5625, + "learning_rate": 4.821053862904242e-05, + "loss": 1.2168, + "step": 51000 + }, + { + "epoch": 0.05, + "grad_norm": 15.25, + "learning_rate": 4.820603955585149e-05, + "loss": 1.2688, + "step": 51100 + }, + { + "epoch": 0.05, + "grad_norm": 6.65625, + "learning_rate": 4.8201540482660575e-05, + "loss": 1.2357, + "step": 51200 + }, + { + "epoch": 0.05, + "grad_norm": 25.125, + "learning_rate": 4.819704140946965e-05, + "loss": 1.3002, + "step": 51300 + }, + { + "epoch": 0.05, + "grad_norm": 0.01214599609375, + "learning_rate": 4.8192542336278726e-05, + "loss": 1.3091, + "step": 51400 + }, + { + "epoch": 0.05, + "grad_norm": 15.875, + "learning_rate": 4.818804326308781e-05, + "loss": 1.1105, + "step": 51500 + }, + { + "epoch": 0.05, + "grad_norm": 44.75, + "learning_rate": 4.818354418989688e-05, + "loss": 1.1362, + "step": 51600 + }, + { + "epoch": 0.05, + "grad_norm": 290.0, + "learning_rate": 4.8179045116705965e-05, + "loss": 1.2163, + "step": 51700 + }, + { + "epoch": 0.05, + "grad_norm": 54.75, + "learning_rate": 4.817454604351504e-05, + "loss": 1.1851, + "step": 51800 + }, + { + "epoch": 0.05, + "grad_norm": 19.875, + "learning_rate": 4.8170046970324116e-05, + "loss": 1.1474, + "step": 51900 + }, + { + "epoch": 0.05, + "grad_norm": 27.75, + "learning_rate": 4.81655478971332e-05, + "loss": 1.2484, + "step": 52000 + }, + { + "epoch": 0.05, + "grad_norm": 24.375, + "learning_rate": 4.8161048823942266e-05, + "loss": 1.3117, + "step": 52100 + }, + { + "epoch": 0.05, + "grad_norm": 41.75, + "learning_rate": 4.815654975075135e-05, + "loss": 1.227, + "step": 52200 + }, + { + "epoch": 0.05, + "grad_norm": 99.5, + "learning_rate": 4.8152050677560424e-05, + "loss": 1.214, + "step": 52300 + }, + { + "epoch": 0.05, + "grad_norm": 156.0, + "learning_rate": 4.81475516043695e-05, + "loss": 1.2802, + "step": 52400 + }, + { + "epoch": 0.05, + "grad_norm": 282.0, + "learning_rate": 4.814305253117858e-05, + "loss": 1.2427, + "step": 52500 + }, + { + "epoch": 0.05, + "grad_norm": 1.4609375, + "learning_rate": 4.813855345798766e-05, + "loss": 1.2021, + "step": 52600 + }, + { + "epoch": 0.05, + "grad_norm": 35.0, + "learning_rate": 4.813405438479673e-05, + "loss": 1.1962, + "step": 52700 + }, + { + "epoch": 0.05, + "grad_norm": 20.875, + "learning_rate": 4.8129555311605814e-05, + "loss": 1.2807, + "step": 52800 + }, + { + "epoch": 0.05, + "grad_norm": 178.0, + "learning_rate": 4.812505623841489e-05, + "loss": 1.1017, + "step": 52900 + }, + { + "epoch": 0.05, + "grad_norm": 1.5859375, + "learning_rate": 4.8120557165223965e-05, + "loss": 1.1054, + "step": 53000 + }, + { + "epoch": 0.05, + "grad_norm": 15.875, + "learning_rate": 4.811605809203305e-05, + "loss": 1.2859, + "step": 53100 + }, + { + "epoch": 0.05, + "grad_norm": 13.25, + "learning_rate": 4.811155901884212e-05, + "loss": 1.3019, + "step": 53200 + }, + { + "epoch": 0.05, + "grad_norm": 46.75, + "learning_rate": 4.81070599456512e-05, + "loss": 1.1397, + "step": 53300 + }, + { + "epoch": 0.05, + "grad_norm": 229.0, + "learning_rate": 4.810256087246027e-05, + "loss": 1.1054, + "step": 53400 + }, + { + "epoch": 0.05, + "grad_norm": 78.0, + "learning_rate": 4.809806179926935e-05, + "loss": 1.236, + "step": 53500 + }, + { + "epoch": 0.05, + "grad_norm": 0.07421875, + "learning_rate": 4.809356272607843e-05, + "loss": 1.3154, + "step": 53600 + }, + { + "epoch": 0.05, + "grad_norm": 71.5, + "learning_rate": 4.8089063652887506e-05, + "loss": 1.4473, + "step": 53700 + }, + { + "epoch": 0.05, + "grad_norm": 26.0, + "learning_rate": 4.808456457969658e-05, + "loss": 1.256, + "step": 53800 + }, + { + "epoch": 0.05, + "grad_norm": 75.0, + "learning_rate": 4.808006550650566e-05, + "loss": 1.241, + "step": 53900 + }, + { + "epoch": 0.05, + "grad_norm": 27.25, + "learning_rate": 4.807556643331474e-05, + "loss": 1.2908, + "step": 54000 + }, + { + "epoch": 0.05, + "grad_norm": 53.25, + "learning_rate": 4.8071067360123814e-05, + "loss": 1.3112, + "step": 54100 + }, + { + "epoch": 0.05, + "grad_norm": 140.0, + "learning_rate": 4.8066568286932896e-05, + "loss": 1.3015, + "step": 54200 + }, + { + "epoch": 0.05, + "grad_norm": 44.25, + "learning_rate": 4.806206921374197e-05, + "loss": 1.2982, + "step": 54300 + }, + { + "epoch": 0.05, + "grad_norm": 15.9375, + "learning_rate": 4.8057570140551054e-05, + "loss": 1.2639, + "step": 54400 + }, + { + "epoch": 0.05, + "grad_norm": 0.09716796875, + "learning_rate": 4.805307106736013e-05, + "loss": 1.3295, + "step": 54500 + }, + { + "epoch": 0.05, + "grad_norm": 34.25, + "learning_rate": 4.8048571994169204e-05, + "loss": 1.2623, + "step": 54600 + }, + { + "epoch": 0.05, + "grad_norm": 16.125, + "learning_rate": 4.804407292097828e-05, + "loss": 1.4758, + "step": 54700 + }, + { + "epoch": 0.05, + "grad_norm": 18.875, + "learning_rate": 4.8039573847787355e-05, + "loss": 1.2223, + "step": 54800 + }, + { + "epoch": 0.05, + "grad_norm": 4.0, + "learning_rate": 4.803507477459643e-05, + "loss": 1.1929, + "step": 54900 + }, + { + "epoch": 0.05, + "grad_norm": 394.0, + "learning_rate": 4.803057570140551e-05, + "loss": 1.0864, + "step": 55000 + }, + { + "epoch": 0.05, + "grad_norm": 82.0, + "learning_rate": 4.802607662821459e-05, + "loss": 1.2047, + "step": 55100 + }, + { + "epoch": 0.05, + "grad_norm": 72.0, + "learning_rate": 4.802157755502367e-05, + "loss": 1.2877, + "step": 55200 + }, + { + "epoch": 0.05, + "grad_norm": 0.1787109375, + "learning_rate": 4.8017078481832745e-05, + "loss": 1.2841, + "step": 55300 + }, + { + "epoch": 0.05, + "grad_norm": 62.0, + "learning_rate": 4.801257940864182e-05, + "loss": 1.1483, + "step": 55400 + }, + { + "epoch": 0.05, + "grad_norm": 64.5, + "learning_rate": 4.80080803354509e-05, + "loss": 1.4654, + "step": 55500 + }, + { + "epoch": 0.05, + "grad_norm": 80.5, + "learning_rate": 4.800358126225998e-05, + "loss": 1.2821, + "step": 55600 + }, + { + "epoch": 0.05, + "grad_norm": 32.25, + "learning_rate": 4.799908218906905e-05, + "loss": 1.4356, + "step": 55700 + }, + { + "epoch": 0.05, + "grad_norm": 105.0, + "learning_rate": 4.7994583115878135e-05, + "loss": 1.2366, + "step": 55800 + }, + { + "epoch": 0.05, + "grad_norm": 26.125, + "learning_rate": 4.799008404268721e-05, + "loss": 1.181, + "step": 55900 + }, + { + "epoch": 0.05, + "grad_norm": 33.75, + "learning_rate": 4.7985584969496286e-05, + "loss": 1.0998, + "step": 56000 + }, + { + "epoch": 0.05, + "grad_norm": 78.5, + "learning_rate": 4.798108589630536e-05, + "loss": 1.2721, + "step": 56100 + }, + { + "epoch": 0.05, + "grad_norm": 22.5, + "learning_rate": 4.797658682311444e-05, + "loss": 1.3371, + "step": 56200 + }, + { + "epoch": 0.05, + "grad_norm": 36.0, + "learning_rate": 4.797208774992352e-05, + "loss": 1.2851, + "step": 56300 + }, + { + "epoch": 0.05, + "grad_norm": 16.875, + "learning_rate": 4.7967588676732594e-05, + "loss": 1.2774, + "step": 56400 + }, + { + "epoch": 0.05, + "grad_norm": 29.5, + "learning_rate": 4.796308960354167e-05, + "loss": 1.2053, + "step": 56500 + }, + { + "epoch": 0.05, + "grad_norm": 21.625, + "learning_rate": 4.795859053035075e-05, + "loss": 1.0901, + "step": 56600 + }, + { + "epoch": 0.05, + "grad_norm": 0.007293701171875, + "learning_rate": 4.795409145715983e-05, + "loss": 1.0744, + "step": 56700 + }, + { + "epoch": 0.05, + "grad_norm": 65.0, + "learning_rate": 4.79495923839689e-05, + "loss": 1.2934, + "step": 56800 + }, + { + "epoch": 0.05, + "grad_norm": 58.75, + "learning_rate": 4.7945093310777984e-05, + "loss": 1.2791, + "step": 56900 + }, + { + "epoch": 0.05, + "grad_norm": 3.046875, + "learning_rate": 4.794059423758706e-05, + "loss": 1.2051, + "step": 57000 + }, + { + "epoch": 0.05, + "grad_norm": 15.125, + "learning_rate": 4.793609516439614e-05, + "loss": 1.2482, + "step": 57100 + }, + { + "epoch": 0.05, + "grad_norm": 32.75, + "learning_rate": 4.793159609120522e-05, + "loss": 1.2103, + "step": 57200 + }, + { + "epoch": 0.05, + "grad_norm": 33.0, + "learning_rate": 4.7927097018014286e-05, + "loss": 1.1886, + "step": 57300 + }, + { + "epoch": 0.05, + "grad_norm": 26.125, + "learning_rate": 4.792259794482337e-05, + "loss": 1.2094, + "step": 57400 + }, + { + "epoch": 0.05, + "grad_norm": 23.75, + "learning_rate": 4.791809887163244e-05, + "loss": 1.2617, + "step": 57500 + }, + { + "epoch": 0.05, + "grad_norm": 36.75, + "learning_rate": 4.791359979844152e-05, + "loss": 1.3948, + "step": 57600 + }, + { + "epoch": 0.05, + "grad_norm": 217.0, + "learning_rate": 4.79091007252506e-05, + "loss": 1.3061, + "step": 57700 + }, + { + "epoch": 0.05, + "grad_norm": 43.0, + "learning_rate": 4.7904601652059676e-05, + "loss": 1.2947, + "step": 57800 + }, + { + "epoch": 0.05, + "grad_norm": 22.25, + "learning_rate": 4.790010257886876e-05, + "loss": 1.162, + "step": 57900 + }, + { + "epoch": 0.05, + "grad_norm": 32.75, + "learning_rate": 4.7895603505677834e-05, + "loss": 1.212, + "step": 58000 + }, + { + "epoch": 0.05, + "grad_norm": 24.5, + "learning_rate": 4.789110443248691e-05, + "loss": 1.2869, + "step": 58100 + }, + { + "epoch": 0.05, + "grad_norm": 30.0, + "learning_rate": 4.788660535929599e-05, + "loss": 1.378, + "step": 58200 + }, + { + "epoch": 0.05, + "grad_norm": 2544.0, + "learning_rate": 4.7882106286105066e-05, + "loss": 1.4909, + "step": 58300 + }, + { + "epoch": 0.05, + "grad_norm": 37.5, + "learning_rate": 4.787760721291414e-05, + "loss": 1.0799, + "step": 58400 + }, + { + "epoch": 0.05, + "grad_norm": 16.375, + "learning_rate": 4.7873108139723224e-05, + "loss": 1.1447, + "step": 58500 + }, + { + "epoch": 0.05, + "grad_norm": 49.25, + "learning_rate": 4.786860906653229e-05, + "loss": 1.3131, + "step": 58600 + }, + { + "epoch": 0.05, + "grad_norm": 28.0, + "learning_rate": 4.7864109993341374e-05, + "loss": 1.3194, + "step": 58700 + }, + { + "epoch": 0.05, + "grad_norm": 17.5, + "learning_rate": 4.785961092015045e-05, + "loss": 1.2322, + "step": 58800 + }, + { + "epoch": 0.05, + "grad_norm": 22.625, + "learning_rate": 4.7855111846959525e-05, + "loss": 1.3211, + "step": 58900 + }, + { + "epoch": 0.05, + "grad_norm": 1.2109375, + "learning_rate": 4.785061277376861e-05, + "loss": 1.0975, + "step": 59000 + }, + { + "epoch": 0.05, + "grad_norm": 24.375, + "learning_rate": 4.784611370057768e-05, + "loss": 1.1601, + "step": 59100 + }, + { + "epoch": 0.05, + "grad_norm": 191.0, + "learning_rate": 4.784161462738676e-05, + "loss": 1.2428, + "step": 59200 + }, + { + "epoch": 0.05, + "grad_norm": 0.0031280517578125, + "learning_rate": 4.783711555419584e-05, + "loss": 1.0643, + "step": 59300 + }, + { + "epoch": 0.05, + "grad_norm": 62.75, + "learning_rate": 4.7832616481004915e-05, + "loss": 1.2144, + "step": 59400 + }, + { + "epoch": 0.05, + "grad_norm": 39.25, + "learning_rate": 4.782811740781399e-05, + "loss": 1.3446, + "step": 59500 + }, + { + "epoch": 0.05, + "grad_norm": 26.875, + "learning_rate": 4.782361833462307e-05, + "loss": 1.2985, + "step": 59600 + }, + { + "epoch": 0.05, + "grad_norm": 56.75, + "learning_rate": 4.781911926143215e-05, + "loss": 1.1766, + "step": 59700 + }, + { + "epoch": 0.05, + "grad_norm": 144.0, + "learning_rate": 4.781462018824123e-05, + "loss": 1.2657, + "step": 59800 + }, + { + "epoch": 0.05, + "grad_norm": 53.0, + "learning_rate": 4.78101211150503e-05, + "loss": 1.1534, + "step": 59900 + }, + { + "epoch": 0.05, + "grad_norm": 0.32421875, + "learning_rate": 4.7805622041859374e-05, + "loss": 1.206, + "step": 60000 + }, + { + "epoch": 0.05, + "grad_norm": 0.080078125, + "learning_rate": 4.7801122968668456e-05, + "loss": 1.2317, + "step": 60100 + }, + { + "epoch": 0.05, + "grad_norm": 39.75, + "learning_rate": 4.779662389547753e-05, + "loss": 1.2616, + "step": 60200 + }, + { + "epoch": 0.05, + "grad_norm": 234.0, + "learning_rate": 4.779212482228661e-05, + "loss": 1.1645, + "step": 60300 + }, + { + "epoch": 0.05, + "grad_norm": 120.0, + "learning_rate": 4.778762574909569e-05, + "loss": 1.3616, + "step": 60400 + }, + { + "epoch": 0.05, + "grad_norm": 0.0004711151123046875, + "learning_rate": 4.7783126675904764e-05, + "loss": 1.3646, + "step": 60500 + }, + { + "epoch": 0.05, + "grad_norm": 155.0, + "learning_rate": 4.7778627602713847e-05, + "loss": 1.2605, + "step": 60600 + }, + { + "epoch": 0.05, + "grad_norm": 25.625, + "learning_rate": 4.777412852952292e-05, + "loss": 1.2399, + "step": 60700 + }, + { + "epoch": 0.05, + "grad_norm": 19.875, + "learning_rate": 4.7769629456332e-05, + "loss": 1.3913, + "step": 60800 + }, + { + "epoch": 0.05, + "grad_norm": 13.5625, + "learning_rate": 4.776513038314108e-05, + "loss": 1.0235, + "step": 60900 + }, + { + "epoch": 0.05, + "grad_norm": 17.5, + "learning_rate": 4.7760631309950155e-05, + "loss": 1.3921, + "step": 61000 + }, + { + "epoch": 0.05, + "grad_norm": 20.875, + "learning_rate": 4.775613223675923e-05, + "loss": 1.3424, + "step": 61100 + }, + { + "epoch": 0.05, + "grad_norm": 30.5, + "learning_rate": 4.7751633163568305e-05, + "loss": 1.1381, + "step": 61200 + }, + { + "epoch": 0.05, + "grad_norm": 6.53125, + "learning_rate": 4.774713409037738e-05, + "loss": 1.256, + "step": 61300 + }, + { + "epoch": 0.05, + "grad_norm": 23.875, + "learning_rate": 4.774263501718646e-05, + "loss": 1.2608, + "step": 61400 + }, + { + "epoch": 0.05, + "grad_norm": 6.125, + "learning_rate": 4.773813594399554e-05, + "loss": 1.299, + "step": 61500 + }, + { + "epoch": 0.05, + "grad_norm": 85.0, + "learning_rate": 4.7733636870804613e-05, + "loss": 1.4367, + "step": 61600 + }, + { + "epoch": 0.05, + "grad_norm": 48.5, + "learning_rate": 4.7729137797613696e-05, + "loss": 1.2704, + "step": 61700 + }, + { + "epoch": 0.06, + "grad_norm": 0.0003337860107421875, + "learning_rate": 4.772463872442277e-05, + "loss": 1.1553, + "step": 61800 + }, + { + "epoch": 0.06, + "grad_norm": 0.1396484375, + "learning_rate": 4.7720139651231846e-05, + "loss": 1.1049, + "step": 61900 + }, + { + "epoch": 0.06, + "grad_norm": 41.25, + "learning_rate": 4.771564057804093e-05, + "loss": 1.2024, + "step": 62000 + }, + { + "epoch": 0.06, + "grad_norm": 56.75, + "learning_rate": 4.7711141504850004e-05, + "loss": 1.0926, + "step": 62100 + }, + { + "epoch": 0.06, + "grad_norm": 23.25, + "learning_rate": 4.770664243165908e-05, + "loss": 1.2144, + "step": 62200 + }, + { + "epoch": 0.06, + "grad_norm": 0.361328125, + "learning_rate": 4.770214335846816e-05, + "loss": 1.2954, + "step": 62300 + }, + { + "epoch": 0.06, + "grad_norm": 97.5, + "learning_rate": 4.7697644285277237e-05, + "loss": 1.3381, + "step": 62400 + }, + { + "epoch": 0.06, + "grad_norm": 75.0, + "learning_rate": 4.769314521208631e-05, + "loss": 1.1606, + "step": 62500 + }, + { + "epoch": 0.06, + "grad_norm": 33.0, + "learning_rate": 4.768864613889539e-05, + "loss": 1.3188, + "step": 62600 + }, + { + "epoch": 0.06, + "grad_norm": 23.5, + "learning_rate": 4.768414706570446e-05, + "loss": 1.2255, + "step": 62700 + }, + { + "epoch": 0.06, + "grad_norm": 11.25, + "learning_rate": 4.7679647992513545e-05, + "loss": 1.2122, + "step": 62800 + }, + { + "epoch": 0.06, + "grad_norm": 52.75, + "learning_rate": 4.767514891932262e-05, + "loss": 1.2049, + "step": 62900 + }, + { + "epoch": 0.06, + "grad_norm": 0.1484375, + "learning_rate": 4.7670649846131695e-05, + "loss": 1.1614, + "step": 63000 + }, + { + "epoch": 0.06, + "grad_norm": 1104.0, + "learning_rate": 4.766615077294078e-05, + "loss": 1.1922, + "step": 63100 + }, + { + "epoch": 0.06, + "grad_norm": 24.875, + "learning_rate": 4.766165169974985e-05, + "loss": 1.2415, + "step": 63200 + }, + { + "epoch": 0.06, + "grad_norm": 112.0, + "learning_rate": 4.7657152626558935e-05, + "loss": 1.2242, + "step": 63300 + }, + { + "epoch": 0.06, + "grad_norm": 175.0, + "learning_rate": 4.765265355336801e-05, + "loss": 1.0412, + "step": 63400 + }, + { + "epoch": 0.06, + "grad_norm": 54.75, + "learning_rate": 4.7648154480177086e-05, + "loss": 1.3835, + "step": 63500 + }, + { + "epoch": 0.06, + "grad_norm": 0.005096435546875, + "learning_rate": 4.764365540698617e-05, + "loss": 1.1711, + "step": 63600 + }, + { + "epoch": 0.06, + "grad_norm": 56.25, + "learning_rate": 4.763915633379524e-05, + "loss": 1.2561, + "step": 63700 + }, + { + "epoch": 0.06, + "grad_norm": 21.625, + "learning_rate": 4.763465726060432e-05, + "loss": 1.1166, + "step": 63800 + }, + { + "epoch": 0.06, + "grad_norm": 19.375, + "learning_rate": 4.7630158187413394e-05, + "loss": 1.3502, + "step": 63900 + }, + { + "epoch": 0.06, + "grad_norm": 28.25, + "learning_rate": 4.762565911422247e-05, + "loss": 1.1815, + "step": 64000 + }, + { + "epoch": 0.06, + "grad_norm": 35.75, + "learning_rate": 4.762116004103155e-05, + "loss": 1.2885, + "step": 64100 + }, + { + "epoch": 0.06, + "grad_norm": 162.0, + "learning_rate": 4.7616660967840627e-05, + "loss": 1.2147, + "step": 64200 + }, + { + "epoch": 0.06, + "grad_norm": 17.75, + "learning_rate": 4.76121618946497e-05, + "loss": 1.3315, + "step": 64300 + }, + { + "epoch": 0.06, + "grad_norm": 15.875, + "learning_rate": 4.7607662821458784e-05, + "loss": 1.3114, + "step": 64400 + }, + { + "epoch": 0.06, + "grad_norm": 51.5, + "learning_rate": 4.760316374826786e-05, + "loss": 1.3619, + "step": 64500 + }, + { + "epoch": 0.06, + "grad_norm": 21.75, + "learning_rate": 4.7598664675076935e-05, + "loss": 1.2124, + "step": 64600 + }, + { + "epoch": 0.06, + "grad_norm": 35.0, + "learning_rate": 4.759416560188602e-05, + "loss": 1.2272, + "step": 64700 + }, + { + "epoch": 0.06, + "grad_norm": 92.0, + "learning_rate": 4.758966652869509e-05, + "loss": 1.2565, + "step": 64800 + }, + { + "epoch": 0.06, + "grad_norm": 27.5, + "learning_rate": 4.758516745550417e-05, + "loss": 1.0057, + "step": 64900 + }, + { + "epoch": 0.06, + "grad_norm": 284.0, + "learning_rate": 4.758066838231325e-05, + "loss": 1.3151, + "step": 65000 + }, + { + "epoch": 0.06, + "grad_norm": 60.0, + "learning_rate": 4.757616930912232e-05, + "loss": 1.3785, + "step": 65100 + }, + { + "epoch": 0.06, + "grad_norm": 0.1201171875, + "learning_rate": 4.75716702359314e-05, + "loss": 1.1252, + "step": 65200 + }, + { + "epoch": 0.06, + "grad_norm": 28.125, + "learning_rate": 4.7567171162740476e-05, + "loss": 1.2642, + "step": 65300 + }, + { + "epoch": 0.06, + "grad_norm": 21.75, + "learning_rate": 4.756267208954955e-05, + "loss": 1.1487, + "step": 65400 + }, + { + "epoch": 0.06, + "grad_norm": 19.875, + "learning_rate": 4.755817301635863e-05, + "loss": 1.1986, + "step": 65500 + }, + { + "epoch": 0.06, + "grad_norm": 41.75, + "learning_rate": 4.755367394316771e-05, + "loss": 1.3195, + "step": 65600 + }, + { + "epoch": 0.06, + "grad_norm": 35.75, + "learning_rate": 4.7549174869976784e-05, + "loss": 1.1207, + "step": 65700 + }, + { + "epoch": 0.06, + "grad_norm": 36.75, + "learning_rate": 4.7544675796785866e-05, + "loss": 1.2285, + "step": 65800 + }, + { + "epoch": 0.06, + "grad_norm": 115.5, + "learning_rate": 4.754017672359494e-05, + "loss": 1.3438, + "step": 65900 + }, + { + "epoch": 0.06, + "grad_norm": 31.625, + "learning_rate": 4.753567765040402e-05, + "loss": 1.1679, + "step": 66000 + }, + { + "epoch": 0.06, + "grad_norm": 17.5, + "learning_rate": 4.75311785772131e-05, + "loss": 1.3672, + "step": 66100 + }, + { + "epoch": 0.06, + "grad_norm": 115.0, + "learning_rate": 4.7526679504022174e-05, + "loss": 1.084, + "step": 66200 + }, + { + "epoch": 0.06, + "grad_norm": 51.25, + "learning_rate": 4.7522180430831256e-05, + "loss": 1.3214, + "step": 66300 + }, + { + "epoch": 0.06, + "grad_norm": 48.75, + "learning_rate": 4.7517681357640325e-05, + "loss": 1.2078, + "step": 66400 + }, + { + "epoch": 0.06, + "grad_norm": 50.5, + "learning_rate": 4.751318228444941e-05, + "loss": 1.2239, + "step": 66500 + }, + { + "epoch": 0.06, + "grad_norm": 32.75, + "learning_rate": 4.750868321125848e-05, + "loss": 1.2264, + "step": 66600 + }, + { + "epoch": 0.06, + "grad_norm": 119.5, + "learning_rate": 4.750418413806756e-05, + "loss": 1.1354, + "step": 66700 + }, + { + "epoch": 0.06, + "grad_norm": 17.0, + "learning_rate": 4.749968506487664e-05, + "loss": 1.2544, + "step": 66800 + }, + { + "epoch": 0.06, + "grad_norm": 22.375, + "learning_rate": 4.7495185991685715e-05, + "loss": 1.3681, + "step": 66900 + }, + { + "epoch": 0.06, + "grad_norm": 17.5, + "learning_rate": 4.749068691849479e-05, + "loss": 1.1801, + "step": 67000 + }, + { + "epoch": 0.06, + "grad_norm": 21.375, + "learning_rate": 4.748618784530387e-05, + "loss": 1.2174, + "step": 67100 + }, + { + "epoch": 0.06, + "grad_norm": 7.5, + "learning_rate": 4.748168877211295e-05, + "loss": 1.3273, + "step": 67200 + }, + { + "epoch": 0.06, + "grad_norm": 110.0, + "learning_rate": 4.747718969892202e-05, + "loss": 1.3135, + "step": 67300 + }, + { + "epoch": 0.06, + "grad_norm": 15.9375, + "learning_rate": 4.7472690625731105e-05, + "loss": 1.2495, + "step": 67400 + }, + { + "epoch": 0.06, + "grad_norm": 23.375, + "learning_rate": 4.746819155254018e-05, + "loss": 1.1619, + "step": 67500 + }, + { + "epoch": 0.06, + "grad_norm": 18.0, + "learning_rate": 4.7463692479349256e-05, + "loss": 1.3114, + "step": 67600 + }, + { + "epoch": 0.06, + "grad_norm": 54.25, + "learning_rate": 4.745919340615833e-05, + "loss": 1.2257, + "step": 67700 + }, + { + "epoch": 0.06, + "grad_norm": 30.125, + "learning_rate": 4.7454694332967407e-05, + "loss": 1.224, + "step": 67800 + }, + { + "epoch": 0.06, + "grad_norm": 44.0, + "learning_rate": 4.745019525977649e-05, + "loss": 1.1534, + "step": 67900 + }, + { + "epoch": 0.06, + "grad_norm": 16.25, + "learning_rate": 4.7445696186585564e-05, + "loss": 1.2763, + "step": 68000 + }, + { + "epoch": 0.06, + "grad_norm": 49.75, + "learning_rate": 4.744119711339464e-05, + "loss": 1.2954, + "step": 68100 + }, + { + "epoch": 0.06, + "grad_norm": 41.25, + "learning_rate": 4.743669804020372e-05, + "loss": 1.3419, + "step": 68200 + }, + { + "epoch": 0.06, + "grad_norm": 35.5, + "learning_rate": 4.74321989670128e-05, + "loss": 1.1872, + "step": 68300 + }, + { + "epoch": 0.06, + "grad_norm": 41.75, + "learning_rate": 4.742769989382187e-05, + "loss": 1.214, + "step": 68400 + }, + { + "epoch": 0.06, + "grad_norm": 36.75, + "learning_rate": 4.7423200820630954e-05, + "loss": 1.2048, + "step": 68500 + }, + { + "epoch": 0.06, + "grad_norm": 45.0, + "learning_rate": 4.741870174744003e-05, + "loss": 0.9973, + "step": 68600 + }, + { + "epoch": 0.06, + "grad_norm": 23.75, + "learning_rate": 4.741420267424911e-05, + "loss": 1.2558, + "step": 68700 + }, + { + "epoch": 0.06, + "grad_norm": 59.75, + "learning_rate": 4.740970360105819e-05, + "loss": 1.2732, + "step": 68800 + }, + { + "epoch": 0.06, + "grad_norm": 41.25, + "learning_rate": 4.740520452786726e-05, + "loss": 1.2606, + "step": 68900 + }, + { + "epoch": 0.06, + "grad_norm": 79.0, + "learning_rate": 4.740070545467634e-05, + "loss": 1.2146, + "step": 69000 + }, + { + "epoch": 0.06, + "grad_norm": 14.0625, + "learning_rate": 4.739620638148541e-05, + "loss": 1.2459, + "step": 69100 + }, + { + "epoch": 0.06, + "grad_norm": 12.1875, + "learning_rate": 4.7391707308294495e-05, + "loss": 1.2515, + "step": 69200 + }, + { + "epoch": 0.06, + "grad_norm": 25.25, + "learning_rate": 4.738720823510357e-05, + "loss": 1.3226, + "step": 69300 + }, + { + "epoch": 0.06, + "grad_norm": 10.5625, + "learning_rate": 4.7382709161912646e-05, + "loss": 1.3906, + "step": 69400 + }, + { + "epoch": 0.06, + "grad_norm": 75.0, + "learning_rate": 4.737821008872173e-05, + "loss": 1.1994, + "step": 69500 + }, + { + "epoch": 0.06, + "grad_norm": 16.25, + "learning_rate": 4.73737110155308e-05, + "loss": 1.3124, + "step": 69600 + }, + { + "epoch": 0.06, + "grad_norm": 50.25, + "learning_rate": 4.736921194233988e-05, + "loss": 1.2104, + "step": 69700 + }, + { + "epoch": 0.06, + "grad_norm": 38.25, + "learning_rate": 4.736471286914896e-05, + "loss": 1.3131, + "step": 69800 + }, + { + "epoch": 0.06, + "grad_norm": 63.0, + "learning_rate": 4.7360213795958036e-05, + "loss": 1.3858, + "step": 69900 + }, + { + "epoch": 0.06, + "grad_norm": 21.5, + "learning_rate": 4.735571472276711e-05, + "loss": 1.3075, + "step": 70000 + }, + { + "epoch": 0.06, + "grad_norm": 32.25, + "learning_rate": 4.7351215649576194e-05, + "loss": 1.2681, + "step": 70100 + }, + { + "epoch": 0.06, + "grad_norm": 160.0, + "learning_rate": 4.734671657638527e-05, + "loss": 1.123, + "step": 70200 + }, + { + "epoch": 0.06, + "grad_norm": 20.125, + "learning_rate": 4.7342217503194344e-05, + "loss": 1.29, + "step": 70300 + }, + { + "epoch": 0.06, + "grad_norm": 46.75, + "learning_rate": 4.733771843000342e-05, + "loss": 1.2494, + "step": 70400 + }, + { + "epoch": 0.06, + "grad_norm": 65.0, + "learning_rate": 4.7333219356812495e-05, + "loss": 1.106, + "step": 70500 + }, + { + "epoch": 0.06, + "grad_norm": 61.0, + "learning_rate": 4.732872028362158e-05, + "loss": 1.3759, + "step": 70600 + }, + { + "epoch": 0.06, + "grad_norm": 56.0, + "learning_rate": 4.732422121043065e-05, + "loss": 1.2734, + "step": 70700 + }, + { + "epoch": 0.06, + "grad_norm": 80.5, + "learning_rate": 4.731972213723973e-05, + "loss": 1.1665, + "step": 70800 + }, + { + "epoch": 0.06, + "grad_norm": 36.25, + "learning_rate": 4.731522306404881e-05, + "loss": 1.2487, + "step": 70900 + }, + { + "epoch": 0.06, + "grad_norm": 99.0, + "learning_rate": 4.7310723990857885e-05, + "loss": 1.2052, + "step": 71000 + }, + { + "epoch": 0.06, + "grad_norm": 9.875, + "learning_rate": 4.730622491766696e-05, + "loss": 1.1455, + "step": 71100 + }, + { + "epoch": 0.06, + "grad_norm": 16.25, + "learning_rate": 4.730172584447604e-05, + "loss": 1.0591, + "step": 71200 + }, + { + "epoch": 0.06, + "grad_norm": 13.9375, + "learning_rate": 4.729722677128512e-05, + "loss": 1.0799, + "step": 71300 + }, + { + "epoch": 0.06, + "grad_norm": 46.25, + "learning_rate": 4.72927276980942e-05, + "loss": 1.3123, + "step": 71400 + }, + { + "epoch": 0.06, + "grad_norm": 63.5, + "learning_rate": 4.7288228624903275e-05, + "loss": 1.2431, + "step": 71500 + }, + { + "epoch": 0.06, + "grad_norm": 14.125, + "learning_rate": 4.7283729551712344e-05, + "loss": 1.2255, + "step": 71600 + }, + { + "epoch": 0.06, + "grad_norm": 53.0, + "learning_rate": 4.7279230478521426e-05, + "loss": 1.1026, + "step": 71700 + }, + { + "epoch": 0.06, + "grad_norm": 47.5, + "learning_rate": 4.72747314053305e-05, + "loss": 1.2817, + "step": 71800 + }, + { + "epoch": 0.06, + "grad_norm": 49.25, + "learning_rate": 4.727023233213958e-05, + "loss": 1.26, + "step": 71900 + }, + { + "epoch": 0.06, + "grad_norm": 17.375, + "learning_rate": 4.726573325894866e-05, + "loss": 1.1483, + "step": 72000 + }, + { + "epoch": 0.06, + "grad_norm": 123.0, + "learning_rate": 4.7261234185757734e-05, + "loss": 1.3744, + "step": 72100 + }, + { + "epoch": 0.06, + "grad_norm": 33.0, + "learning_rate": 4.7256735112566816e-05, + "loss": 1.2989, + "step": 72200 + }, + { + "epoch": 0.06, + "grad_norm": 29.0, + "learning_rate": 4.725223603937589e-05, + "loss": 1.0816, + "step": 72300 + }, + { + "epoch": 0.06, + "grad_norm": 26.5, + "learning_rate": 4.724773696618497e-05, + "loss": 1.2363, + "step": 72400 + }, + { + "epoch": 0.06, + "grad_norm": 12.25, + "learning_rate": 4.724323789299405e-05, + "loss": 1.2743, + "step": 72500 + }, + { + "epoch": 0.06, + "grad_norm": 16.0, + "learning_rate": 4.7238738819803124e-05, + "loss": 1.2664, + "step": 72600 + }, + { + "epoch": 0.06, + "grad_norm": 28.25, + "learning_rate": 4.72342397466122e-05, + "loss": 1.4499, + "step": 72700 + }, + { + "epoch": 0.06, + "grad_norm": 91.5, + "learning_rate": 4.722974067342128e-05, + "loss": 1.1211, + "step": 72800 + }, + { + "epoch": 0.06, + "grad_norm": 31.875, + "learning_rate": 4.722524160023035e-05, + "loss": 1.1971, + "step": 72900 + }, + { + "epoch": 0.07, + "grad_norm": 0.012939453125, + "learning_rate": 4.722074252703943e-05, + "loss": 1.1434, + "step": 73000 + }, + { + "epoch": 0.07, + "grad_norm": 0.0859375, + "learning_rate": 4.721624345384851e-05, + "loss": 1.1664, + "step": 73100 + }, + { + "epoch": 0.07, + "grad_norm": 39.0, + "learning_rate": 4.721174438065758e-05, + "loss": 1.1764, + "step": 73200 + }, + { + "epoch": 0.07, + "grad_norm": 17.125, + "learning_rate": 4.7207245307466665e-05, + "loss": 1.2504, + "step": 73300 + }, + { + "epoch": 0.07, + "grad_norm": 89.0, + "learning_rate": 4.720274623427574e-05, + "loss": 1.2143, + "step": 73400 + }, + { + "epoch": 0.07, + "grad_norm": 20.375, + "learning_rate": 4.7198247161084816e-05, + "loss": 1.1952, + "step": 73500 + }, + { + "epoch": 0.07, + "grad_norm": 103.0, + "learning_rate": 4.71937480878939e-05, + "loss": 1.2222, + "step": 73600 + }, + { + "epoch": 0.07, + "grad_norm": 179.0, + "learning_rate": 4.7189249014702974e-05, + "loss": 1.2778, + "step": 73700 + }, + { + "epoch": 0.07, + "grad_norm": 0.5546875, + "learning_rate": 4.718474994151205e-05, + "loss": 1.3061, + "step": 73800 + }, + { + "epoch": 0.07, + "grad_norm": 149.0, + "learning_rate": 4.718025086832113e-05, + "loss": 1.2367, + "step": 73900 + }, + { + "epoch": 0.07, + "grad_norm": 31.375, + "learning_rate": 4.7175751795130206e-05, + "loss": 1.226, + "step": 74000 + }, + { + "epoch": 0.07, + "grad_norm": 25.5, + "learning_rate": 4.717125272193929e-05, + "loss": 1.1026, + "step": 74100 + }, + { + "epoch": 0.07, + "grad_norm": 47.25, + "learning_rate": 4.716675364874836e-05, + "loss": 1.2909, + "step": 74200 + }, + { + "epoch": 0.07, + "grad_norm": 8.1875, + "learning_rate": 4.716225457555743e-05, + "loss": 1.142, + "step": 74300 + }, + { + "epoch": 0.07, + "grad_norm": 27.125, + "learning_rate": 4.7157755502366514e-05, + "loss": 1.134, + "step": 74400 + }, + { + "epoch": 0.07, + "grad_norm": 39.25, + "learning_rate": 4.715325642917559e-05, + "loss": 1.2291, + "step": 74500 + }, + { + "epoch": 0.07, + "grad_norm": 26.25, + "learning_rate": 4.7148757355984665e-05, + "loss": 1.3683, + "step": 74600 + }, + { + "epoch": 0.07, + "grad_norm": 18.625, + "learning_rate": 4.714425828279375e-05, + "loss": 1.2623, + "step": 74700 + }, + { + "epoch": 0.07, + "grad_norm": 57.0, + "learning_rate": 4.713975920960282e-05, + "loss": 1.1912, + "step": 74800 + }, + { + "epoch": 0.07, + "grad_norm": 35.75, + "learning_rate": 4.7135260136411905e-05, + "loss": 1.0889, + "step": 74900 + }, + { + "epoch": 0.07, + "grad_norm": 52.25, + "learning_rate": 4.713076106322098e-05, + "loss": 1.1153, + "step": 75000 + }, + { + "epoch": 0.07, + "grad_norm": 0.000835418701171875, + "learning_rate": 4.7126261990030055e-05, + "loss": 1.361, + "step": 75100 + }, + { + "epoch": 0.07, + "grad_norm": 83.5, + "learning_rate": 4.712176291683914e-05, + "loss": 1.2175, + "step": 75200 + }, + { + "epoch": 0.07, + "grad_norm": 34.25, + "learning_rate": 4.711726384364821e-05, + "loss": 1.1457, + "step": 75300 + }, + { + "epoch": 0.07, + "grad_norm": 47.75, + "learning_rate": 4.711276477045729e-05, + "loss": 1.2189, + "step": 75400 + }, + { + "epoch": 0.07, + "grad_norm": 15.1875, + "learning_rate": 4.7108265697266364e-05, + "loss": 1.1413, + "step": 75500 + }, + { + "epoch": 0.07, + "grad_norm": 0.006591796875, + "learning_rate": 4.710376662407544e-05, + "loss": 1.1837, + "step": 75600 + }, + { + "epoch": 0.07, + "grad_norm": 31.375, + "learning_rate": 4.709926755088452e-05, + "loss": 1.2907, + "step": 75700 + }, + { + "epoch": 0.07, + "grad_norm": 41.75, + "learning_rate": 4.7094768477693596e-05, + "loss": 1.1726, + "step": 75800 + }, + { + "epoch": 0.07, + "grad_norm": 125.5, + "learning_rate": 4.709026940450267e-05, + "loss": 1.3122, + "step": 75900 + }, + { + "epoch": 0.07, + "grad_norm": 135.0, + "learning_rate": 4.7085770331311754e-05, + "loss": 1.2983, + "step": 76000 + }, + { + "epoch": 0.07, + "grad_norm": 222.0, + "learning_rate": 4.708127125812083e-05, + "loss": 1.2188, + "step": 76100 + }, + { + "epoch": 0.07, + "grad_norm": 44.0, + "learning_rate": 4.7076772184929904e-05, + "loss": 1.0877, + "step": 76200 + }, + { + "epoch": 0.07, + "grad_norm": 34.75, + "learning_rate": 4.7072273111738987e-05, + "loss": 1.2161, + "step": 76300 + }, + { + "epoch": 0.07, + "grad_norm": 22.0, + "learning_rate": 4.706777403854806e-05, + "loss": 1.3827, + "step": 76400 + }, + { + "epoch": 0.07, + "grad_norm": 91.0, + "learning_rate": 4.706327496535714e-05, + "loss": 1.2608, + "step": 76500 + }, + { + "epoch": 0.07, + "grad_norm": 20.0, + "learning_rate": 4.705877589216622e-05, + "loss": 1.362, + "step": 76600 + }, + { + "epoch": 0.07, + "grad_norm": 32.75, + "learning_rate": 4.7054276818975295e-05, + "loss": 1.2003, + "step": 76700 + }, + { + "epoch": 0.07, + "grad_norm": 3.15625, + "learning_rate": 4.704977774578437e-05, + "loss": 1.3207, + "step": 76800 + }, + { + "epoch": 0.07, + "grad_norm": 13.5625, + "learning_rate": 4.7045278672593445e-05, + "loss": 1.5494, + "step": 76900 + }, + { + "epoch": 0.07, + "grad_norm": 20.25, + "learning_rate": 4.704077959940252e-05, + "loss": 1.1945, + "step": 77000 + }, + { + "epoch": 0.07, + "grad_norm": 24.5, + "learning_rate": 4.70362805262116e-05, + "loss": 1.205, + "step": 77100 + }, + { + "epoch": 0.07, + "grad_norm": 52.25, + "learning_rate": 4.703178145302068e-05, + "loss": 1.1373, + "step": 77200 + }, + { + "epoch": 0.07, + "grad_norm": 22.375, + "learning_rate": 4.7027282379829754e-05, + "loss": 1.2527, + "step": 77300 + }, + { + "epoch": 0.07, + "grad_norm": 24.625, + "learning_rate": 4.7022783306638836e-05, + "loss": 1.1378, + "step": 77400 + }, + { + "epoch": 0.07, + "grad_norm": 56.0, + "learning_rate": 4.701828423344791e-05, + "loss": 1.1482, + "step": 77500 + }, + { + "epoch": 0.07, + "grad_norm": 50.5, + "learning_rate": 4.701378516025699e-05, + "loss": 1.2645, + "step": 77600 + }, + { + "epoch": 0.07, + "grad_norm": 144.0, + "learning_rate": 4.700928608706607e-05, + "loss": 1.4047, + "step": 77700 + }, + { + "epoch": 0.07, + "grad_norm": 0.1494140625, + "learning_rate": 4.7004787013875144e-05, + "loss": 1.1314, + "step": 77800 + }, + { + "epoch": 0.07, + "grad_norm": 149.0, + "learning_rate": 4.7000287940684226e-05, + "loss": 1.1793, + "step": 77900 + }, + { + "epoch": 0.07, + "grad_norm": 0.5546875, + "learning_rate": 4.69957888674933e-05, + "loss": 1.2604, + "step": 78000 + }, + { + "epoch": 0.07, + "grad_norm": 126.0, + "learning_rate": 4.6991289794302377e-05, + "loss": 0.9881, + "step": 78100 + }, + { + "epoch": 0.07, + "grad_norm": 25.5, + "learning_rate": 4.698679072111145e-05, + "loss": 1.2622, + "step": 78200 + }, + { + "epoch": 0.07, + "grad_norm": 35.25, + "learning_rate": 4.698229164792053e-05, + "loss": 1.2399, + "step": 78300 + }, + { + "epoch": 0.07, + "grad_norm": 13.25, + "learning_rate": 4.697779257472961e-05, + "loss": 1.1963, + "step": 78400 + }, + { + "epoch": 0.07, + "grad_norm": 22.75, + "learning_rate": 4.6973293501538685e-05, + "loss": 1.3438, + "step": 78500 + }, + { + "epoch": 0.07, + "grad_norm": 44.0, + "learning_rate": 4.696879442834776e-05, + "loss": 1.1871, + "step": 78600 + }, + { + "epoch": 0.07, + "grad_norm": 0.45703125, + "learning_rate": 4.696429535515684e-05, + "loss": 1.2272, + "step": 78700 + }, + { + "epoch": 0.07, + "grad_norm": 13.9375, + "learning_rate": 4.695979628196592e-05, + "loss": 1.4138, + "step": 78800 + }, + { + "epoch": 0.07, + "grad_norm": 31.75, + "learning_rate": 4.695529720877499e-05, + "loss": 1.2585, + "step": 78900 + }, + { + "epoch": 0.07, + "grad_norm": 62.5, + "learning_rate": 4.6950798135584075e-05, + "loss": 1.1439, + "step": 79000 + }, + { + "epoch": 0.07, + "grad_norm": 22.125, + "learning_rate": 4.694629906239315e-05, + "loss": 1.4061, + "step": 79100 + }, + { + "epoch": 0.07, + "grad_norm": 23.75, + "learning_rate": 4.6941799989202226e-05, + "loss": 1.2456, + "step": 79200 + }, + { + "epoch": 0.07, + "grad_norm": 12.625, + "learning_rate": 4.693730091601131e-05, + "loss": 1.1871, + "step": 79300 + }, + { + "epoch": 0.07, + "grad_norm": 195.0, + "learning_rate": 4.6932801842820376e-05, + "loss": 1.3179, + "step": 79400 + }, + { + "epoch": 0.07, + "grad_norm": 36.75, + "learning_rate": 4.692830276962946e-05, + "loss": 1.1434, + "step": 79500 + }, + { + "epoch": 0.07, + "grad_norm": 2.625, + "learning_rate": 4.6923803696438534e-05, + "loss": 1.1249, + "step": 79600 + }, + { + "epoch": 0.07, + "grad_norm": 87.5, + "learning_rate": 4.691930462324761e-05, + "loss": 1.2842, + "step": 79700 + }, + { + "epoch": 0.07, + "grad_norm": 33.0, + "learning_rate": 4.691480555005669e-05, + "loss": 1.2856, + "step": 79800 + }, + { + "epoch": 0.07, + "grad_norm": 33.75, + "learning_rate": 4.6910306476865767e-05, + "loss": 1.3099, + "step": 79900 + }, + { + "epoch": 0.07, + "grad_norm": 22.125, + "learning_rate": 4.690580740367484e-05, + "loss": 1.2222, + "step": 80000 + }, + { + "epoch": 0.07, + "grad_norm": 73.0, + "learning_rate": 4.6901308330483924e-05, + "loss": 1.2007, + "step": 80100 + }, + { + "epoch": 0.07, + "grad_norm": 56.25, + "learning_rate": 4.6896809257293e-05, + "loss": 1.1251, + "step": 80200 + }, + { + "epoch": 0.07, + "grad_norm": 61.0, + "learning_rate": 4.689231018410208e-05, + "loss": 1.2048, + "step": 80300 + }, + { + "epoch": 0.07, + "grad_norm": 50.5, + "learning_rate": 4.688781111091116e-05, + "loss": 1.2897, + "step": 80400 + }, + { + "epoch": 0.07, + "grad_norm": 29.5, + "learning_rate": 4.688331203772023e-05, + "loss": 1.2261, + "step": 80500 + }, + { + "epoch": 0.07, + "grad_norm": 26.75, + "learning_rate": 4.6878812964529314e-05, + "loss": 1.2943, + "step": 80600 + }, + { + "epoch": 0.07, + "grad_norm": 0.85546875, + "learning_rate": 4.687431389133838e-05, + "loss": 1.2239, + "step": 80700 + }, + { + "epoch": 0.07, + "grad_norm": 416.0, + "learning_rate": 4.6869814818147465e-05, + "loss": 1.2365, + "step": 80800 + }, + { + "epoch": 0.07, + "grad_norm": 13.8125, + "learning_rate": 4.686531574495654e-05, + "loss": 1.3162, + "step": 80900 + }, + { + "epoch": 0.07, + "grad_norm": 72.5, + "learning_rate": 4.6860816671765616e-05, + "loss": 1.1817, + "step": 81000 + }, + { + "epoch": 0.07, + "grad_norm": 84.5, + "learning_rate": 4.68563175985747e-05, + "loss": 1.269, + "step": 81100 + }, + { + "epoch": 0.07, + "grad_norm": 36.5, + "learning_rate": 4.685181852538377e-05, + "loss": 1.2479, + "step": 81200 + }, + { + "epoch": 0.07, + "grad_norm": 66.5, + "learning_rate": 4.684731945219285e-05, + "loss": 1.1251, + "step": 81300 + }, + { + "epoch": 0.07, + "grad_norm": 0.06884765625, + "learning_rate": 4.684282037900193e-05, + "loss": 1.0727, + "step": 81400 + }, + { + "epoch": 0.07, + "grad_norm": 81.5, + "learning_rate": 4.6838321305811006e-05, + "loss": 1.3266, + "step": 81500 + }, + { + "epoch": 0.07, + "grad_norm": 57.5, + "learning_rate": 4.683382223262008e-05, + "loss": 1.1613, + "step": 81600 + }, + { + "epoch": 0.07, + "grad_norm": 324.0, + "learning_rate": 4.682932315942916e-05, + "loss": 1.1394, + "step": 81700 + }, + { + "epoch": 0.07, + "grad_norm": 38.5, + "learning_rate": 4.682482408623824e-05, + "loss": 1.1932, + "step": 81800 + }, + { + "epoch": 0.07, + "grad_norm": 42.75, + "learning_rate": 4.6820325013047314e-05, + "loss": 1.2507, + "step": 81900 + }, + { + "epoch": 0.07, + "grad_norm": 11.0625, + "learning_rate": 4.681582593985639e-05, + "loss": 1.1607, + "step": 82000 + }, + { + "epoch": 0.07, + "grad_norm": 5.875, + "learning_rate": 4.6811326866665465e-05, + "loss": 1.2099, + "step": 82100 + }, + { + "epoch": 0.07, + "grad_norm": 20.0, + "learning_rate": 4.680682779347455e-05, + "loss": 1.3643, + "step": 82200 + }, + { + "epoch": 0.07, + "grad_norm": 20.5, + "learning_rate": 4.680232872028362e-05, + "loss": 1.3056, + "step": 82300 + }, + { + "epoch": 0.07, + "grad_norm": 27.625, + "learning_rate": 4.67978296470927e-05, + "loss": 1.3067, + "step": 82400 + }, + { + "epoch": 0.07, + "grad_norm": 11.4375, + "learning_rate": 4.679333057390178e-05, + "loss": 1.3445, + "step": 82500 + }, + { + "epoch": 0.07, + "grad_norm": 45.0, + "learning_rate": 4.6788831500710855e-05, + "loss": 1.3665, + "step": 82600 + }, + { + "epoch": 0.07, + "grad_norm": 16.75, + "learning_rate": 4.678433242751993e-05, + "loss": 1.2115, + "step": 82700 + }, + { + "epoch": 0.07, + "grad_norm": 16.875, + "learning_rate": 4.677983335432901e-05, + "loss": 1.1534, + "step": 82800 + }, + { + "epoch": 0.07, + "grad_norm": 46.5, + "learning_rate": 4.677533428113809e-05, + "loss": 1.3426, + "step": 82900 + }, + { + "epoch": 0.07, + "grad_norm": 31.625, + "learning_rate": 4.677083520794717e-05, + "loss": 1.2648, + "step": 83000 + }, + { + "epoch": 0.07, + "grad_norm": 17.5, + "learning_rate": 4.6766336134756245e-05, + "loss": 1.201, + "step": 83100 + }, + { + "epoch": 0.07, + "grad_norm": 28.5, + "learning_rate": 4.676183706156532e-05, + "loss": 1.0442, + "step": 83200 + }, + { + "epoch": 0.07, + "grad_norm": 11.3125, + "learning_rate": 4.6757337988374396e-05, + "loss": 1.1157, + "step": 83300 + }, + { + "epoch": 0.07, + "grad_norm": 10.875, + "learning_rate": 4.675283891518347e-05, + "loss": 1.3605, + "step": 83400 + }, + { + "epoch": 0.07, + "grad_norm": 34.75, + "learning_rate": 4.674833984199255e-05, + "loss": 1.4955, + "step": 83500 + }, + { + "epoch": 0.07, + "grad_norm": 18.5, + "learning_rate": 4.674384076880163e-05, + "loss": 1.2556, + "step": 83600 + }, + { + "epoch": 0.07, + "grad_norm": 59.25, + "learning_rate": 4.6739341695610704e-05, + "loss": 1.399, + "step": 83700 + }, + { + "epoch": 0.07, + "grad_norm": 3232.0, + "learning_rate": 4.6734842622419786e-05, + "loss": 1.2609, + "step": 83800 + }, + { + "epoch": 0.07, + "grad_norm": 80.5, + "learning_rate": 4.673034354922886e-05, + "loss": 1.156, + "step": 83900 + }, + { + "epoch": 0.07, + "grad_norm": 38.0, + "learning_rate": 4.672584447603794e-05, + "loss": 1.2504, + "step": 84000 + }, + { + "epoch": 0.07, + "grad_norm": 416.0, + "learning_rate": 4.672134540284702e-05, + "loss": 1.3409, + "step": 84100 + }, + { + "epoch": 0.08, + "grad_norm": 37.0, + "learning_rate": 4.6716846329656094e-05, + "loss": 1.2077, + "step": 84200 + }, + { + "epoch": 0.08, + "grad_norm": 34.75, + "learning_rate": 4.671234725646517e-05, + "loss": 1.1843, + "step": 84300 + }, + { + "epoch": 0.08, + "grad_norm": 78.0, + "learning_rate": 4.670784818327425e-05, + "loss": 1.2665, + "step": 84400 + }, + { + "epoch": 0.08, + "grad_norm": 16.875, + "learning_rate": 4.670334911008333e-05, + "loss": 1.1045, + "step": 84500 + }, + { + "epoch": 0.08, + "grad_norm": 42.75, + "learning_rate": 4.66988500368924e-05, + "loss": 1.2778, + "step": 84600 + }, + { + "epoch": 0.08, + "grad_norm": 32.5, + "learning_rate": 4.669435096370148e-05, + "loss": 1.0987, + "step": 84700 + }, + { + "epoch": 0.08, + "grad_norm": 50.5, + "learning_rate": 4.668985189051055e-05, + "loss": 1.2783, + "step": 84800 + }, + { + "epoch": 0.08, + "grad_norm": 884.0, + "learning_rate": 4.6685352817319635e-05, + "loss": 1.1679, + "step": 84900 + }, + { + "epoch": 0.08, + "grad_norm": 24.625, + "learning_rate": 4.668085374412871e-05, + "loss": 1.0179, + "step": 85000 + }, + { + "epoch": 0.08, + "grad_norm": 49.75, + "learning_rate": 4.6676354670937786e-05, + "loss": 1.0883, + "step": 85100 + }, + { + "epoch": 0.08, + "grad_norm": 30.75, + "learning_rate": 4.667185559774687e-05, + "loss": 1.1631, + "step": 85200 + }, + { + "epoch": 0.08, + "grad_norm": 34.75, + "learning_rate": 4.666735652455594e-05, + "loss": 0.9213, + "step": 85300 + }, + { + "epoch": 0.08, + "grad_norm": 61.75, + "learning_rate": 4.666285745136502e-05, + "loss": 1.2807, + "step": 85400 + }, + { + "epoch": 0.08, + "grad_norm": 30.625, + "learning_rate": 4.66583583781741e-05, + "loss": 1.1432, + "step": 85500 + }, + { + "epoch": 0.08, + "grad_norm": 24.25, + "learning_rate": 4.6653859304983176e-05, + "loss": 1.3741, + "step": 85600 + }, + { + "epoch": 0.08, + "grad_norm": 25.125, + "learning_rate": 4.664936023179226e-05, + "loss": 1.1954, + "step": 85700 + }, + { + "epoch": 0.08, + "grad_norm": 30.75, + "learning_rate": 4.6644861158601334e-05, + "loss": 1.2219, + "step": 85800 + }, + { + "epoch": 0.08, + "grad_norm": 0.00439453125, + "learning_rate": 4.66403620854104e-05, + "loss": 1.233, + "step": 85900 + }, + { + "epoch": 0.08, + "grad_norm": 60.0, + "learning_rate": 4.6635863012219484e-05, + "loss": 1.2093, + "step": 86000 + }, + { + "epoch": 0.08, + "grad_norm": 10.4375, + "learning_rate": 4.663136393902856e-05, + "loss": 1.145, + "step": 86100 + }, + { + "epoch": 0.08, + "grad_norm": 21.625, + "learning_rate": 4.662686486583764e-05, + "loss": 1.2302, + "step": 86200 + }, + { + "epoch": 0.08, + "grad_norm": 54.75, + "learning_rate": 4.662236579264672e-05, + "loss": 1.2073, + "step": 86300 + }, + { + "epoch": 0.08, + "grad_norm": 23.875, + "learning_rate": 4.661786671945579e-05, + "loss": 1.0794, + "step": 86400 + }, + { + "epoch": 0.08, + "grad_norm": 4.34375, + "learning_rate": 4.6613367646264875e-05, + "loss": 1.1232, + "step": 86500 + }, + { + "epoch": 0.08, + "grad_norm": 10.625, + "learning_rate": 4.660886857307395e-05, + "loss": 1.3566, + "step": 86600 + }, + { + "epoch": 0.08, + "grad_norm": 86.5, + "learning_rate": 4.6604369499883025e-05, + "loss": 1.1867, + "step": 86700 + }, + { + "epoch": 0.08, + "grad_norm": 20.375, + "learning_rate": 4.659987042669211e-05, + "loss": 1.092, + "step": 86800 + }, + { + "epoch": 0.08, + "grad_norm": 18.5, + "learning_rate": 4.659537135350118e-05, + "loss": 1.175, + "step": 86900 + }, + { + "epoch": 0.08, + "grad_norm": 17.5, + "learning_rate": 4.659087228031026e-05, + "loss": 1.2469, + "step": 87000 + }, + { + "epoch": 0.08, + "grad_norm": 17.25, + "learning_rate": 4.658637320711934e-05, + "loss": 1.1808, + "step": 87100 + }, + { + "epoch": 0.08, + "grad_norm": 14.9375, + "learning_rate": 4.658187413392841e-05, + "loss": 1.1264, + "step": 87200 + }, + { + "epoch": 0.08, + "grad_norm": 39.0, + "learning_rate": 4.657737506073749e-05, + "loss": 1.156, + "step": 87300 + }, + { + "epoch": 0.08, + "grad_norm": 15.75, + "learning_rate": 4.6572875987546566e-05, + "loss": 1.063, + "step": 87400 + }, + { + "epoch": 0.08, + "grad_norm": 14.3125, + "learning_rate": 4.656837691435564e-05, + "loss": 1.1372, + "step": 87500 + }, + { + "epoch": 0.08, + "grad_norm": 38.0, + "learning_rate": 4.6563877841164724e-05, + "loss": 1.1772, + "step": 87600 + }, + { + "epoch": 0.08, + "grad_norm": 1.046875, + "learning_rate": 4.65593787679738e-05, + "loss": 1.1926, + "step": 87700 + }, + { + "epoch": 0.08, + "grad_norm": 114.5, + "learning_rate": 4.6554879694782874e-05, + "loss": 1.2012, + "step": 87800 + }, + { + "epoch": 0.08, + "grad_norm": 772.0, + "learning_rate": 4.6550380621591956e-05, + "loss": 1.1147, + "step": 87900 + }, + { + "epoch": 0.08, + "grad_norm": 536.0, + "learning_rate": 4.654588154840103e-05, + "loss": 1.1648, + "step": 88000 + }, + { + "epoch": 0.08, + "grad_norm": 200.0, + "learning_rate": 4.654138247521011e-05, + "loss": 1.1388, + "step": 88100 + }, + { + "epoch": 0.08, + "grad_norm": 57.75, + "learning_rate": 4.653688340201919e-05, + "loss": 1.2505, + "step": 88200 + }, + { + "epoch": 0.08, + "grad_norm": 43.0, + "learning_rate": 4.6532384328828265e-05, + "loss": 1.2581, + "step": 88300 + }, + { + "epoch": 0.08, + "grad_norm": 2.71875, + "learning_rate": 4.652788525563735e-05, + "loss": 1.215, + "step": 88400 + }, + { + "epoch": 0.08, + "grad_norm": 0.1474609375, + "learning_rate": 4.6523386182446415e-05, + "loss": 1.2729, + "step": 88500 + }, + { + "epoch": 0.08, + "grad_norm": 0.259765625, + "learning_rate": 4.651888710925549e-05, + "loss": 1.1295, + "step": 88600 + }, + { + "epoch": 0.08, + "grad_norm": 15.8125, + "learning_rate": 4.651438803606457e-05, + "loss": 1.2362, + "step": 88700 + }, + { + "epoch": 0.08, + "grad_norm": 16.125, + "learning_rate": 4.650988896287365e-05, + "loss": 1.0345, + "step": 88800 + }, + { + "epoch": 0.08, + "grad_norm": 0.022216796875, + "learning_rate": 4.650538988968272e-05, + "loss": 1.2219, + "step": 88900 + }, + { + "epoch": 0.08, + "grad_norm": 36.25, + "learning_rate": 4.6500890816491805e-05, + "loss": 1.2903, + "step": 89000 + }, + { + "epoch": 0.08, + "grad_norm": 30.25, + "learning_rate": 4.649639174330088e-05, + "loss": 1.2863, + "step": 89100 + }, + { + "epoch": 0.08, + "grad_norm": 50.5, + "learning_rate": 4.649189267010996e-05, + "loss": 1.3294, + "step": 89200 + }, + { + "epoch": 0.08, + "grad_norm": 32.25, + "learning_rate": 4.648739359691904e-05, + "loss": 1.3154, + "step": 89300 + }, + { + "epoch": 0.08, + "grad_norm": 9.5625, + "learning_rate": 4.6482894523728114e-05, + "loss": 1.3416, + "step": 89400 + }, + { + "epoch": 0.08, + "grad_norm": 34.0, + "learning_rate": 4.6478395450537196e-05, + "loss": 1.1502, + "step": 89500 + }, + { + "epoch": 0.08, + "grad_norm": 67.0, + "learning_rate": 4.647389637734627e-05, + "loss": 1.0926, + "step": 89600 + }, + { + "epoch": 0.08, + "grad_norm": 40.25, + "learning_rate": 4.6469397304155346e-05, + "loss": 1.3378, + "step": 89700 + }, + { + "epoch": 0.08, + "grad_norm": 32.25, + "learning_rate": 4.646489823096442e-05, + "loss": 1.2702, + "step": 89800 + }, + { + "epoch": 0.08, + "grad_norm": 45.25, + "learning_rate": 4.64603991577735e-05, + "loss": 1.4877, + "step": 89900 + }, + { + "epoch": 0.08, + "grad_norm": 38.75, + "learning_rate": 4.645590008458258e-05, + "loss": 1.2217, + "step": 90000 + }, + { + "epoch": 0.08, + "grad_norm": 0.0498046875, + "learning_rate": 4.6451401011391655e-05, + "loss": 1.1891, + "step": 90100 + }, + { + "epoch": 0.08, + "grad_norm": 161.0, + "learning_rate": 4.644690193820073e-05, + "loss": 1.3031, + "step": 90200 + }, + { + "epoch": 0.08, + "grad_norm": 16.25, + "learning_rate": 4.644240286500981e-05, + "loss": 1.3148, + "step": 90300 + }, + { + "epoch": 0.08, + "grad_norm": 32.5, + "learning_rate": 4.643790379181889e-05, + "loss": 1.3879, + "step": 90400 + }, + { + "epoch": 0.08, + "grad_norm": 201.0, + "learning_rate": 4.643340471862796e-05, + "loss": 1.1986, + "step": 90500 + }, + { + "epoch": 0.08, + "grad_norm": 144.0, + "learning_rate": 4.6428905645437045e-05, + "loss": 1.1706, + "step": 90600 + }, + { + "epoch": 0.08, + "grad_norm": 22.875, + "learning_rate": 4.642440657224612e-05, + "loss": 1.2711, + "step": 90700 + }, + { + "epoch": 0.08, + "grad_norm": 19.25, + "learning_rate": 4.6419907499055195e-05, + "loss": 1.3133, + "step": 90800 + }, + { + "epoch": 0.08, + "grad_norm": 91.0, + "learning_rate": 4.641540842586428e-05, + "loss": 1.2535, + "step": 90900 + }, + { + "epoch": 0.08, + "grad_norm": 290.0, + "learning_rate": 4.641090935267335e-05, + "loss": 1.3282, + "step": 91000 + }, + { + "epoch": 0.08, + "grad_norm": 1.109375, + "learning_rate": 4.640641027948243e-05, + "loss": 1.1775, + "step": 91100 + }, + { + "epoch": 0.08, + "grad_norm": 254.0, + "learning_rate": 4.6401911206291504e-05, + "loss": 1.1602, + "step": 91200 + }, + { + "epoch": 0.08, + "grad_norm": 21.625, + "learning_rate": 4.639741213310058e-05, + "loss": 1.1385, + "step": 91300 + }, + { + "epoch": 0.08, + "grad_norm": 15.4375, + "learning_rate": 4.639291305990966e-05, + "loss": 1.0891, + "step": 91400 + }, + { + "epoch": 0.08, + "grad_norm": 29.25, + "learning_rate": 4.6388413986718736e-05, + "loss": 1.1267, + "step": 91500 + }, + { + "epoch": 0.08, + "grad_norm": 121.5, + "learning_rate": 4.638391491352781e-05, + "loss": 1.1393, + "step": 91600 + }, + { + "epoch": 0.08, + "grad_norm": 26.375, + "learning_rate": 4.6379415840336894e-05, + "loss": 1.3078, + "step": 91700 + }, + { + "epoch": 0.08, + "grad_norm": 2.546875, + "learning_rate": 4.637491676714597e-05, + "loss": 1.1165, + "step": 91800 + }, + { + "epoch": 0.08, + "grad_norm": 744.0, + "learning_rate": 4.637041769395505e-05, + "loss": 1.2956, + "step": 91900 + }, + { + "epoch": 0.08, + "grad_norm": 1.9296875, + "learning_rate": 4.636591862076413e-05, + "loss": 1.2069, + "step": 92000 + }, + { + "epoch": 0.08, + "grad_norm": 36.25, + "learning_rate": 4.63614195475732e-05, + "loss": 1.2714, + "step": 92100 + }, + { + "epoch": 0.08, + "grad_norm": 76.5, + "learning_rate": 4.6356920474382284e-05, + "loss": 1.1672, + "step": 92200 + }, + { + "epoch": 0.08, + "grad_norm": 47.25, + "learning_rate": 4.635242140119136e-05, + "loss": 1.1454, + "step": 92300 + }, + { + "epoch": 0.08, + "grad_norm": 19.875, + "learning_rate": 4.6347922328000435e-05, + "loss": 1.296, + "step": 92400 + }, + { + "epoch": 0.08, + "grad_norm": 30.125, + "learning_rate": 4.634342325480951e-05, + "loss": 1.274, + "step": 92500 + }, + { + "epoch": 0.08, + "grad_norm": 10.625, + "learning_rate": 4.6338924181618585e-05, + "loss": 1.16, + "step": 92600 + }, + { + "epoch": 0.08, + "grad_norm": 17.25, + "learning_rate": 4.633442510842767e-05, + "loss": 1.3192, + "step": 92700 + }, + { + "epoch": 0.08, + "grad_norm": 217.0, + "learning_rate": 4.632992603523674e-05, + "loss": 1.1032, + "step": 92800 + }, + { + "epoch": 0.08, + "grad_norm": 204.0, + "learning_rate": 4.632542696204582e-05, + "loss": 1.0933, + "step": 92900 + }, + { + "epoch": 0.08, + "grad_norm": 68.0, + "learning_rate": 4.63209278888549e-05, + "loss": 1.2312, + "step": 93000 + }, + { + "epoch": 0.08, + "grad_norm": 33.25, + "learning_rate": 4.6316428815663976e-05, + "loss": 1.317, + "step": 93100 + }, + { + "epoch": 0.08, + "grad_norm": 9.125, + "learning_rate": 4.631192974247305e-05, + "loss": 1.1449, + "step": 93200 + }, + { + "epoch": 0.08, + "grad_norm": 150.0, + "learning_rate": 4.630743066928213e-05, + "loss": 1.0247, + "step": 93300 + }, + { + "epoch": 0.08, + "grad_norm": 53.0, + "learning_rate": 4.630293159609121e-05, + "loss": 1.2245, + "step": 93400 + }, + { + "epoch": 0.08, + "grad_norm": 56.0, + "learning_rate": 4.6298432522900284e-05, + "loss": 1.392, + "step": 93500 + }, + { + "epoch": 0.08, + "grad_norm": 14.375, + "learning_rate": 4.6293933449709366e-05, + "loss": 0.9784, + "step": 93600 + }, + { + "epoch": 0.08, + "grad_norm": 0.203125, + "learning_rate": 4.6289434376518435e-05, + "loss": 1.1816, + "step": 93700 + }, + { + "epoch": 0.08, + "grad_norm": 22.75, + "learning_rate": 4.628493530332752e-05, + "loss": 1.1423, + "step": 93800 + }, + { + "epoch": 0.08, + "grad_norm": 0.140625, + "learning_rate": 4.628043623013659e-05, + "loss": 1.264, + "step": 93900 + }, + { + "epoch": 0.08, + "grad_norm": 15.375, + "learning_rate": 4.627593715694567e-05, + "loss": 1.1952, + "step": 94000 + }, + { + "epoch": 0.08, + "grad_norm": 12.6875, + "learning_rate": 4.627143808375475e-05, + "loss": 1.1834, + "step": 94100 + }, + { + "epoch": 0.08, + "grad_norm": 44.75, + "learning_rate": 4.6266939010563825e-05, + "loss": 1.1482, + "step": 94200 + }, + { + "epoch": 0.08, + "grad_norm": 103.0, + "learning_rate": 4.62624399373729e-05, + "loss": 1.2201, + "step": 94300 + }, + { + "epoch": 0.08, + "grad_norm": 68.0, + "learning_rate": 4.625794086418198e-05, + "loss": 1.0979, + "step": 94400 + }, + { + "epoch": 0.08, + "grad_norm": 0.09765625, + "learning_rate": 4.625344179099106e-05, + "loss": 1.2729, + "step": 94500 + }, + { + "epoch": 0.08, + "grad_norm": 0.26171875, + "learning_rate": 4.624894271780014e-05, + "loss": 1.2557, + "step": 94600 + }, + { + "epoch": 0.08, + "grad_norm": 86.0, + "learning_rate": 4.6244443644609215e-05, + "loss": 1.2296, + "step": 94700 + }, + { + "epoch": 0.08, + "grad_norm": 33.75, + "learning_rate": 4.623994457141829e-05, + "loss": 1.203, + "step": 94800 + }, + { + "epoch": 0.08, + "grad_norm": 9.1875, + "learning_rate": 4.623544549822737e-05, + "loss": 1.1907, + "step": 94900 + }, + { + "epoch": 0.08, + "grad_norm": 0.003448486328125, + "learning_rate": 4.623094642503644e-05, + "loss": 1.1294, + "step": 95000 + }, + { + "epoch": 0.08, + "grad_norm": 32.75, + "learning_rate": 4.622644735184552e-05, + "loss": 1.4294, + "step": 95100 + }, + { + "epoch": 0.08, + "grad_norm": 81.0, + "learning_rate": 4.62219482786546e-05, + "loss": 1.131, + "step": 95200 + }, + { + "epoch": 0.08, + "grad_norm": 83.0, + "learning_rate": 4.6217449205463674e-05, + "loss": 1.2515, + "step": 95300 + }, + { + "epoch": 0.08, + "grad_norm": 167.0, + "learning_rate": 4.6212950132272756e-05, + "loss": 1.2297, + "step": 95400 + }, + { + "epoch": 0.09, + "grad_norm": 22.125, + "learning_rate": 4.620845105908183e-05, + "loss": 1.2981, + "step": 95500 + }, + { + "epoch": 0.09, + "grad_norm": 59.5, + "learning_rate": 4.620395198589091e-05, + "loss": 1.2032, + "step": 95600 + }, + { + "epoch": 0.09, + "grad_norm": 40.5, + "learning_rate": 4.619945291269999e-05, + "loss": 1.283, + "step": 95700 + }, + { + "epoch": 0.09, + "grad_norm": 29.625, + "learning_rate": 4.6194953839509064e-05, + "loss": 1.3381, + "step": 95800 + }, + { + "epoch": 0.09, + "grad_norm": 66.0, + "learning_rate": 4.619045476631814e-05, + "loss": 1.3305, + "step": 95900 + }, + { + "epoch": 0.09, + "grad_norm": 48.0, + "learning_rate": 4.618595569312722e-05, + "loss": 1.15, + "step": 96000 + }, + { + "epoch": 0.09, + "grad_norm": 19.875, + "learning_rate": 4.61814566199363e-05, + "loss": 1.3633, + "step": 96100 + }, + { + "epoch": 0.09, + "grad_norm": 17.25, + "learning_rate": 4.617695754674537e-05, + "loss": 1.1605, + "step": 96200 + }, + { + "epoch": 0.09, + "grad_norm": 27.5, + "learning_rate": 4.617245847355445e-05, + "loss": 1.2327, + "step": 96300 + }, + { + "epoch": 0.09, + "grad_norm": 0.60546875, + "learning_rate": 4.616795940036352e-05, + "loss": 1.1414, + "step": 96400 + }, + { + "epoch": 0.09, + "grad_norm": 16.5, + "learning_rate": 4.6163460327172605e-05, + "loss": 1.3517, + "step": 96500 + }, + { + "epoch": 0.09, + "grad_norm": 17.125, + "learning_rate": 4.615896125398168e-05, + "loss": 1.2891, + "step": 96600 + }, + { + "epoch": 0.09, + "grad_norm": 77.5, + "learning_rate": 4.6154462180790756e-05, + "loss": 1.2363, + "step": 96700 + }, + { + "epoch": 0.09, + "grad_norm": 94.0, + "learning_rate": 4.614996310759984e-05, + "loss": 1.2241, + "step": 96800 + }, + { + "epoch": 0.09, + "grad_norm": 0.1787109375, + "learning_rate": 4.614546403440891e-05, + "loss": 1.1567, + "step": 96900 + }, + { + "epoch": 0.09, + "grad_norm": 2.875, + "learning_rate": 4.614096496121799e-05, + "loss": 1.2089, + "step": 97000 + }, + { + "epoch": 0.09, + "grad_norm": 30.375, + "learning_rate": 4.613646588802707e-05, + "loss": 1.2015, + "step": 97100 + }, + { + "epoch": 0.09, + "grad_norm": 15.6875, + "learning_rate": 4.6131966814836146e-05, + "loss": 1.1075, + "step": 97200 + }, + { + "epoch": 0.09, + "grad_norm": 82.0, + "learning_rate": 4.612746774164523e-05, + "loss": 1.2898, + "step": 97300 + }, + { + "epoch": 0.09, + "grad_norm": 175.0, + "learning_rate": 4.61229686684543e-05, + "loss": 1.3346, + "step": 97400 + }, + { + "epoch": 0.09, + "grad_norm": 20.25, + "learning_rate": 4.611846959526338e-05, + "loss": 1.1985, + "step": 97500 + }, + { + "epoch": 0.09, + "grad_norm": 0.796875, + "learning_rate": 4.6113970522072454e-05, + "loss": 1.1855, + "step": 97600 + }, + { + "epoch": 0.09, + "grad_norm": 153.0, + "learning_rate": 4.610947144888153e-05, + "loss": 1.2736, + "step": 97700 + }, + { + "epoch": 0.09, + "grad_norm": 26.5, + "learning_rate": 4.610497237569061e-05, + "loss": 1.0498, + "step": 97800 + }, + { + "epoch": 0.09, + "grad_norm": 43.5, + "learning_rate": 4.610047330249969e-05, + "loss": 1.1019, + "step": 97900 + }, + { + "epoch": 0.09, + "grad_norm": 60.75, + "learning_rate": 4.609597422930876e-05, + "loss": 1.2442, + "step": 98000 + }, + { + "epoch": 0.09, + "grad_norm": 23.875, + "learning_rate": 4.6091475156117844e-05, + "loss": 1.3374, + "step": 98100 + }, + { + "epoch": 0.09, + "grad_norm": 16.5, + "learning_rate": 4.608697608292692e-05, + "loss": 1.2757, + "step": 98200 + }, + { + "epoch": 0.09, + "grad_norm": 67.5, + "learning_rate": 4.6082477009735995e-05, + "loss": 1.1568, + "step": 98300 + }, + { + "epoch": 0.09, + "grad_norm": 35.75, + "learning_rate": 4.607797793654508e-05, + "loss": 1.2625, + "step": 98400 + }, + { + "epoch": 0.09, + "grad_norm": 20.625, + "learning_rate": 4.607347886335415e-05, + "loss": 1.3875, + "step": 98500 + }, + { + "epoch": 0.09, + "grad_norm": 141.0, + "learning_rate": 4.606897979016323e-05, + "loss": 1.0833, + "step": 98600 + }, + { + "epoch": 0.09, + "grad_norm": 29.125, + "learning_rate": 4.606448071697231e-05, + "loss": 1.1732, + "step": 98700 + }, + { + "epoch": 0.09, + "grad_norm": 46.5, + "learning_rate": 4.6059981643781385e-05, + "loss": 1.1168, + "step": 98800 + }, + { + "epoch": 0.09, + "grad_norm": 36.0, + "learning_rate": 4.605548257059046e-05, + "loss": 1.3608, + "step": 98900 + }, + { + "epoch": 0.09, + "grad_norm": 18.875, + "learning_rate": 4.6050983497399536e-05, + "loss": 1.3203, + "step": 99000 + }, + { + "epoch": 0.09, + "grad_norm": 0.024169921875, + "learning_rate": 4.604648442420861e-05, + "loss": 1.2144, + "step": 99100 + }, + { + "epoch": 0.09, + "grad_norm": 12.125, + "learning_rate": 4.604198535101769e-05, + "loss": 1.2136, + "step": 99200 + }, + { + "epoch": 0.09, + "grad_norm": 33.75, + "learning_rate": 4.603748627782677e-05, + "loss": 1.2989, + "step": 99300 + }, + { + "epoch": 0.09, + "grad_norm": 19.625, + "learning_rate": 4.6032987204635844e-05, + "loss": 1.0424, + "step": 99400 + }, + { + "epoch": 0.09, + "grad_norm": 117.0, + "learning_rate": 4.6028488131444926e-05, + "loss": 1.3084, + "step": 99500 + }, + { + "epoch": 0.09, + "grad_norm": 26.0, + "learning_rate": 4.6023989058254e-05, + "loss": 1.0954, + "step": 99600 + }, + { + "epoch": 0.09, + "grad_norm": 41.25, + "learning_rate": 4.601948998506308e-05, + "loss": 1.1938, + "step": 99700 + }, + { + "epoch": 0.09, + "grad_norm": 0.671875, + "learning_rate": 4.601499091187216e-05, + "loss": 1.1928, + "step": 99800 + }, + { + "epoch": 0.09, + "grad_norm": 21.875, + "learning_rate": 4.6010491838681234e-05, + "loss": 1.3598, + "step": 99900 + }, + { + "epoch": 0.09, + "grad_norm": 34.25, + "learning_rate": 4.6005992765490316e-05, + "loss": 1.2554, + "step": 100000 + }, + { + "epoch": 0.09, + "grad_norm": 374.0, + "learning_rate": 4.600149369229939e-05, + "loss": 1.0904, + "step": 100100 + }, + { + "epoch": 0.09, + "grad_norm": 15.25, + "learning_rate": 4.599699461910846e-05, + "loss": 1.2633, + "step": 100200 + }, + { + "epoch": 0.09, + "grad_norm": 34.25, + "learning_rate": 4.599249554591754e-05, + "loss": 1.2126, + "step": 100300 + }, + { + "epoch": 0.09, + "grad_norm": 68.0, + "learning_rate": 4.598799647272662e-05, + "loss": 1.177, + "step": 100400 + }, + { + "epoch": 0.09, + "grad_norm": 15.9375, + "learning_rate": 4.59834973995357e-05, + "loss": 1.3549, + "step": 100500 + }, + { + "epoch": 0.09, + "grad_norm": 199.0, + "learning_rate": 4.5978998326344775e-05, + "loss": 1.2707, + "step": 100600 + }, + { + "epoch": 0.09, + "grad_norm": 59.5, + "learning_rate": 4.597449925315385e-05, + "loss": 1.3505, + "step": 100700 + }, + { + "epoch": 0.09, + "grad_norm": 38.75, + "learning_rate": 4.597000017996293e-05, + "loss": 1.2006, + "step": 100800 + }, + { + "epoch": 0.09, + "grad_norm": 12.5, + "learning_rate": 4.596550110677201e-05, + "loss": 1.2509, + "step": 100900 + }, + { + "epoch": 0.09, + "grad_norm": 25.625, + "learning_rate": 4.596100203358108e-05, + "loss": 1.2555, + "step": 101000 + }, + { + "epoch": 0.09, + "grad_norm": 23.125, + "learning_rate": 4.5956502960390165e-05, + "loss": 1.2871, + "step": 101100 + }, + { + "epoch": 0.09, + "grad_norm": 18.625, + "learning_rate": 4.595200388719924e-05, + "loss": 1.0645, + "step": 101200 + }, + { + "epoch": 0.09, + "grad_norm": 10.8125, + "learning_rate": 4.5947504814008316e-05, + "loss": 1.1701, + "step": 101300 + }, + { + "epoch": 0.09, + "grad_norm": 46.5, + "learning_rate": 4.59430057408174e-05, + "loss": 1.3589, + "step": 101400 + }, + { + "epoch": 0.09, + "grad_norm": 23.25, + "learning_rate": 4.593850666762647e-05, + "loss": 1.3453, + "step": 101500 + }, + { + "epoch": 0.09, + "grad_norm": 7.96875, + "learning_rate": 4.593400759443555e-05, + "loss": 1.0873, + "step": 101600 + }, + { + "epoch": 0.09, + "grad_norm": 12.125, + "learning_rate": 4.5929508521244624e-05, + "loss": 1.0792, + "step": 101700 + }, + { + "epoch": 0.09, + "grad_norm": 64.5, + "learning_rate": 4.59250094480537e-05, + "loss": 1.2598, + "step": 101800 + }, + { + "epoch": 0.09, + "grad_norm": 31.75, + "learning_rate": 4.592051037486278e-05, + "loss": 1.0844, + "step": 101900 + }, + { + "epoch": 0.09, + "grad_norm": 129.0, + "learning_rate": 4.591601130167186e-05, + "loss": 1.2592, + "step": 102000 + }, + { + "epoch": 0.09, + "grad_norm": 55.75, + "learning_rate": 4.591151222848093e-05, + "loss": 1.295, + "step": 102100 + }, + { + "epoch": 0.09, + "grad_norm": 16.0, + "learning_rate": 4.5907013155290015e-05, + "loss": 1.1847, + "step": 102200 + }, + { + "epoch": 0.09, + "grad_norm": 23.125, + "learning_rate": 4.590251408209909e-05, + "loss": 1.1945, + "step": 102300 + }, + { + "epoch": 0.09, + "grad_norm": 0.18359375, + "learning_rate": 4.5898015008908165e-05, + "loss": 1.3027, + "step": 102400 + }, + { + "epoch": 0.09, + "grad_norm": 55.5, + "learning_rate": 4.589351593571725e-05, + "loss": 1.2763, + "step": 102500 + }, + { + "epoch": 0.09, + "grad_norm": 206.0, + "learning_rate": 4.588901686252632e-05, + "loss": 1.2135, + "step": 102600 + }, + { + "epoch": 0.09, + "grad_norm": 125.5, + "learning_rate": 4.5884517789335405e-05, + "loss": 1.1985, + "step": 102700 + }, + { + "epoch": 0.09, + "grad_norm": 15.8125, + "learning_rate": 4.588001871614447e-05, + "loss": 1.1805, + "step": 102800 + }, + { + "epoch": 0.09, + "grad_norm": 25.25, + "learning_rate": 4.587551964295355e-05, + "loss": 1.2184, + "step": 102900 + }, + { + "epoch": 0.09, + "grad_norm": 30.75, + "learning_rate": 4.587102056976263e-05, + "loss": 1.1989, + "step": 103000 + }, + { + "epoch": 0.09, + "grad_norm": 0.05029296875, + "learning_rate": 4.5866521496571706e-05, + "loss": 1.0852, + "step": 103100 + }, + { + "epoch": 0.09, + "grad_norm": 23.125, + "learning_rate": 4.586202242338079e-05, + "loss": 1.075, + "step": 103200 + }, + { + "epoch": 0.09, + "grad_norm": 16.75, + "learning_rate": 4.5857523350189864e-05, + "loss": 1.3041, + "step": 103300 + }, + { + "epoch": 0.09, + "grad_norm": 119.5, + "learning_rate": 4.585302427699894e-05, + "loss": 1.3158, + "step": 103400 + }, + { + "epoch": 0.09, + "grad_norm": 21.625, + "learning_rate": 4.584852520380802e-05, + "loss": 1.13, + "step": 103500 + }, + { + "epoch": 0.09, + "grad_norm": 82.5, + "learning_rate": 4.5844026130617096e-05, + "loss": 1.3399, + "step": 103600 + }, + { + "epoch": 0.09, + "grad_norm": 0.0269775390625, + "learning_rate": 4.583952705742617e-05, + "loss": 1.1488, + "step": 103700 + }, + { + "epoch": 0.09, + "grad_norm": 51.5, + "learning_rate": 4.5835027984235254e-05, + "loss": 1.1056, + "step": 103800 + }, + { + "epoch": 0.09, + "grad_norm": 0.322265625, + "learning_rate": 4.583052891104433e-05, + "loss": 1.195, + "step": 103900 + }, + { + "epoch": 0.09, + "grad_norm": 0.001495361328125, + "learning_rate": 4.5826029837853405e-05, + "loss": 1.3246, + "step": 104000 + }, + { + "epoch": 0.09, + "grad_norm": 114.5, + "learning_rate": 4.582153076466248e-05, + "loss": 1.1865, + "step": 104100 + }, + { + "epoch": 0.09, + "grad_norm": 0.06005859375, + "learning_rate": 4.5817031691471555e-05, + "loss": 1.2381, + "step": 104200 + }, + { + "epoch": 0.09, + "grad_norm": 102.0, + "learning_rate": 4.581253261828064e-05, + "loss": 1.3197, + "step": 104300 + }, + { + "epoch": 0.09, + "grad_norm": 23.625, + "learning_rate": 4.580803354508971e-05, + "loss": 1.1024, + "step": 104400 + }, + { + "epoch": 0.09, + "grad_norm": 31.25, + "learning_rate": 4.580353447189879e-05, + "loss": 1.1392, + "step": 104500 + }, + { + "epoch": 0.09, + "grad_norm": 30.625, + "learning_rate": 4.579903539870787e-05, + "loss": 1.2728, + "step": 104600 + }, + { + "epoch": 0.09, + "grad_norm": 45.0, + "learning_rate": 4.5794536325516945e-05, + "loss": 1.2558, + "step": 104700 + }, + { + "epoch": 0.09, + "grad_norm": 77.5, + "learning_rate": 4.579003725232602e-05, + "loss": 1.0428, + "step": 104800 + }, + { + "epoch": 0.09, + "grad_norm": 113.5, + "learning_rate": 4.57855381791351e-05, + "loss": 1.1951, + "step": 104900 + }, + { + "epoch": 0.09, + "grad_norm": 25.625, + "learning_rate": 4.578103910594418e-05, + "loss": 1.2916, + "step": 105000 + }, + { + "epoch": 0.09, + "grad_norm": 0.302734375, + "learning_rate": 4.5776540032753254e-05, + "loss": 1.1258, + "step": 105100 + }, + { + "epoch": 0.09, + "grad_norm": 15.125, + "learning_rate": 4.5772040959562336e-05, + "loss": 1.2375, + "step": 105200 + }, + { + "epoch": 0.09, + "grad_norm": 4.5, + "learning_rate": 4.576754188637141e-05, + "loss": 1.2654, + "step": 105300 + }, + { + "epoch": 0.09, + "grad_norm": 0.010986328125, + "learning_rate": 4.5763042813180486e-05, + "loss": 1.3326, + "step": 105400 + }, + { + "epoch": 0.09, + "grad_norm": 8.8125, + "learning_rate": 4.575854373998956e-05, + "loss": 1.2025, + "step": 105500 + }, + { + "epoch": 0.09, + "grad_norm": 35.0, + "learning_rate": 4.575404466679864e-05, + "loss": 1.2903, + "step": 105600 + }, + { + "epoch": 0.09, + "grad_norm": 39.0, + "learning_rate": 4.574954559360772e-05, + "loss": 1.3111, + "step": 105700 + }, + { + "epoch": 0.09, + "grad_norm": 0.005645751953125, + "learning_rate": 4.5745046520416795e-05, + "loss": 1.3519, + "step": 105800 + }, + { + "epoch": 0.09, + "grad_norm": 26.5, + "learning_rate": 4.574054744722587e-05, + "loss": 1.2876, + "step": 105900 + }, + { + "epoch": 0.09, + "grad_norm": 7.90625, + "learning_rate": 4.573604837403495e-05, + "loss": 1.234, + "step": 106000 + }, + { + "epoch": 0.09, + "grad_norm": 0.005096435546875, + "learning_rate": 4.573154930084403e-05, + "loss": 1.395, + "step": 106100 + }, + { + "epoch": 0.09, + "grad_norm": 56.5, + "learning_rate": 4.572705022765311e-05, + "loss": 1.3566, + "step": 106200 + }, + { + "epoch": 0.09, + "grad_norm": 32.5, + "learning_rate": 4.5722551154462185e-05, + "loss": 1.2325, + "step": 106300 + }, + { + "epoch": 0.09, + "grad_norm": 16.375, + "learning_rate": 4.571805208127126e-05, + "loss": 1.2097, + "step": 106400 + }, + { + "epoch": 0.09, + "grad_norm": 19.25, + "learning_rate": 4.571355300808034e-05, + "loss": 1.0891, + "step": 106500 + }, + { + "epoch": 0.09, + "grad_norm": 37.75, + "learning_rate": 4.570905393488942e-05, + "loss": 1.216, + "step": 106600 + }, + { + "epoch": 0.1, + "grad_norm": 50.5, + "learning_rate": 4.570455486169849e-05, + "loss": 1.4524, + "step": 106700 + }, + { + "epoch": 0.1, + "grad_norm": 0.2001953125, + "learning_rate": 4.570005578850757e-05, + "loss": 1.116, + "step": 106800 + }, + { + "epoch": 0.1, + "grad_norm": 14.25, + "learning_rate": 4.5695556715316644e-05, + "loss": 1.1824, + "step": 106900 + }, + { + "epoch": 0.1, + "grad_norm": 98.5, + "learning_rate": 4.5691057642125726e-05, + "loss": 1.2857, + "step": 107000 + }, + { + "epoch": 0.1, + "grad_norm": 14.5625, + "learning_rate": 4.56865585689348e-05, + "loss": 1.0591, + "step": 107100 + }, + { + "epoch": 0.1, + "grad_norm": 37.75, + "learning_rate": 4.5682059495743876e-05, + "loss": 1.3179, + "step": 107200 + }, + { + "epoch": 0.1, + "grad_norm": 19.75, + "learning_rate": 4.567756042255296e-05, + "loss": 1.3088, + "step": 107300 + }, + { + "epoch": 0.1, + "grad_norm": 11.5, + "learning_rate": 4.5673061349362034e-05, + "loss": 1.2068, + "step": 107400 + }, + { + "epoch": 0.1, + "grad_norm": 15.9375, + "learning_rate": 4.566856227617111e-05, + "loss": 1.0826, + "step": 107500 + }, + { + "epoch": 0.1, + "grad_norm": 60.75, + "learning_rate": 4.566406320298019e-05, + "loss": 1.2205, + "step": 107600 + }, + { + "epoch": 0.1, + "grad_norm": 14.5625, + "learning_rate": 4.565956412978927e-05, + "loss": 1.1924, + "step": 107700 + }, + { + "epoch": 0.1, + "grad_norm": 0.310546875, + "learning_rate": 4.565506505659834e-05, + "loss": 1.138, + "step": 107800 + }, + { + "epoch": 0.1, + "grad_norm": 22.375, + "learning_rate": 4.5650565983407424e-05, + "loss": 1.3148, + "step": 107900 + }, + { + "epoch": 0.1, + "grad_norm": 28.375, + "learning_rate": 4.564606691021649e-05, + "loss": 1.1837, + "step": 108000 + }, + { + "epoch": 0.1, + "grad_norm": 41.75, + "learning_rate": 4.5641567837025575e-05, + "loss": 1.2787, + "step": 108100 + }, + { + "epoch": 0.1, + "grad_norm": 53.5, + "learning_rate": 4.563706876383465e-05, + "loss": 1.2307, + "step": 108200 + }, + { + "epoch": 0.1, + "grad_norm": 28.375, + "learning_rate": 4.5632569690643725e-05, + "loss": 1.1937, + "step": 108300 + }, + { + "epoch": 0.1, + "grad_norm": 19.375, + "learning_rate": 4.562807061745281e-05, + "loss": 1.2141, + "step": 108400 + }, + { + "epoch": 0.1, + "grad_norm": 45.25, + "learning_rate": 4.562357154426188e-05, + "loss": 1.3127, + "step": 108500 + }, + { + "epoch": 0.1, + "grad_norm": 0.10205078125, + "learning_rate": 4.561907247107096e-05, + "loss": 1.1631, + "step": 108600 + }, + { + "epoch": 0.1, + "grad_norm": 446.0, + "learning_rate": 4.561457339788004e-05, + "loss": 1.1566, + "step": 108700 + }, + { + "epoch": 0.1, + "grad_norm": 208.0, + "learning_rate": 4.5610074324689116e-05, + "loss": 1.1789, + "step": 108800 + }, + { + "epoch": 0.1, + "grad_norm": 32.25, + "learning_rate": 4.56055752514982e-05, + "loss": 1.2044, + "step": 108900 + }, + { + "epoch": 0.1, + "grad_norm": 0.007415771484375, + "learning_rate": 4.560107617830727e-05, + "loss": 1.221, + "step": 109000 + }, + { + "epoch": 0.1, + "grad_norm": 18.375, + "learning_rate": 4.559657710511635e-05, + "loss": 1.2534, + "step": 109100 + }, + { + "epoch": 0.1, + "grad_norm": 0.52734375, + "learning_rate": 4.559207803192543e-05, + "loss": 1.3312, + "step": 109200 + }, + { + "epoch": 0.1, + "grad_norm": 15.5, + "learning_rate": 4.55875789587345e-05, + "loss": 1.1578, + "step": 109300 + }, + { + "epoch": 0.1, + "grad_norm": 33.75, + "learning_rate": 4.558307988554358e-05, + "loss": 1.2368, + "step": 109400 + }, + { + "epoch": 0.1, + "grad_norm": 0.78515625, + "learning_rate": 4.557858081235266e-05, + "loss": 1.1862, + "step": 109500 + }, + { + "epoch": 0.1, + "grad_norm": 13.9375, + "learning_rate": 4.557408173916173e-05, + "loss": 1.2698, + "step": 109600 + }, + { + "epoch": 0.1, + "grad_norm": 63.25, + "learning_rate": 4.5569582665970814e-05, + "loss": 1.1454, + "step": 109700 + }, + { + "epoch": 0.1, + "grad_norm": 32.0, + "learning_rate": 4.556508359277989e-05, + "loss": 1.1937, + "step": 109800 + }, + { + "epoch": 0.1, + "grad_norm": 16.375, + "learning_rate": 4.5560584519588965e-05, + "loss": 1.2177, + "step": 109900 + }, + { + "epoch": 0.1, + "grad_norm": 19.375, + "learning_rate": 4.555608544639805e-05, + "loss": 1.2745, + "step": 110000 + }, + { + "epoch": 0.1, + "grad_norm": 54.75, + "learning_rate": 4.555158637320712e-05, + "loss": 1.2536, + "step": 110100 + }, + { + "epoch": 0.1, + "grad_norm": 38.25, + "learning_rate": 4.55470873000162e-05, + "loss": 1.3106, + "step": 110200 + }, + { + "epoch": 0.1, + "grad_norm": 151.0, + "learning_rate": 4.554258822682528e-05, + "loss": 1.2801, + "step": 110300 + }, + { + "epoch": 0.1, + "grad_norm": 15.875, + "learning_rate": 4.5538089153634355e-05, + "loss": 1.2058, + "step": 110400 + }, + { + "epoch": 0.1, + "grad_norm": 24.375, + "learning_rate": 4.553359008044343e-05, + "loss": 1.226, + "step": 110500 + }, + { + "epoch": 0.1, + "grad_norm": 41.0, + "learning_rate": 4.5529091007252506e-05, + "loss": 1.1285, + "step": 110600 + }, + { + "epoch": 0.1, + "grad_norm": 53.25, + "learning_rate": 4.552459193406158e-05, + "loss": 1.2774, + "step": 110700 + }, + { + "epoch": 0.1, + "grad_norm": 47.25, + "learning_rate": 4.552009286087066e-05, + "loss": 1.1275, + "step": 110800 + }, + { + "epoch": 0.1, + "grad_norm": 41.0, + "learning_rate": 4.551559378767974e-05, + "loss": 1.2186, + "step": 110900 + }, + { + "epoch": 0.1, + "grad_norm": 141.0, + "learning_rate": 4.5511094714488814e-05, + "loss": 1.1936, + "step": 111000 + }, + { + "epoch": 0.1, + "grad_norm": 17.25, + "learning_rate": 4.5506595641297896e-05, + "loss": 1.387, + "step": 111100 + }, + { + "epoch": 0.1, + "grad_norm": 80.0, + "learning_rate": 4.550209656810697e-05, + "loss": 1.2142, + "step": 111200 + }, + { + "epoch": 0.1, + "grad_norm": 68.5, + "learning_rate": 4.549759749491605e-05, + "loss": 1.2587, + "step": 111300 + }, + { + "epoch": 0.1, + "grad_norm": 9.9375, + "learning_rate": 4.549309842172513e-05, + "loss": 1.3325, + "step": 111400 + }, + { + "epoch": 0.1, + "grad_norm": 125.0, + "learning_rate": 4.5488599348534204e-05, + "loss": 1.2874, + "step": 111500 + }, + { + "epoch": 0.1, + "grad_norm": 173.0, + "learning_rate": 4.5484100275343286e-05, + "loss": 1.1543, + "step": 111600 + }, + { + "epoch": 0.1, + "grad_norm": 23.375, + "learning_rate": 4.547960120215236e-05, + "loss": 1.1861, + "step": 111700 + }, + { + "epoch": 0.1, + "grad_norm": 34.25, + "learning_rate": 4.547510212896144e-05, + "loss": 1.1127, + "step": 111800 + }, + { + "epoch": 0.1, + "grad_norm": 17.0, + "learning_rate": 4.547060305577051e-05, + "loss": 1.2207, + "step": 111900 + }, + { + "epoch": 0.1, + "grad_norm": 0.5625, + "learning_rate": 4.546610398257959e-05, + "loss": 1.353, + "step": 112000 + }, + { + "epoch": 0.1, + "grad_norm": 164.0, + "learning_rate": 4.546160490938867e-05, + "loss": 1.2219, + "step": 112100 + }, + { + "epoch": 0.1, + "grad_norm": 61.75, + "learning_rate": 4.5457105836197745e-05, + "loss": 1.1616, + "step": 112200 + }, + { + "epoch": 0.1, + "grad_norm": 37.5, + "learning_rate": 4.545260676300682e-05, + "loss": 1.1932, + "step": 112300 + }, + { + "epoch": 0.1, + "grad_norm": 38.75, + "learning_rate": 4.54481076898159e-05, + "loss": 1.2167, + "step": 112400 + }, + { + "epoch": 0.1, + "grad_norm": 536.0, + "learning_rate": 4.544360861662498e-05, + "loss": 1.2178, + "step": 112500 + }, + { + "epoch": 0.1, + "grad_norm": 93.0, + "learning_rate": 4.543910954343405e-05, + "loss": 1.1768, + "step": 112600 + }, + { + "epoch": 0.1, + "grad_norm": 22.375, + "learning_rate": 4.5434610470243135e-05, + "loss": 1.4186, + "step": 112700 + }, + { + "epoch": 0.1, + "grad_norm": 28.75, + "learning_rate": 4.543011139705221e-05, + "loss": 1.1987, + "step": 112800 + }, + { + "epoch": 0.1, + "grad_norm": 20.875, + "learning_rate": 4.5425612323861286e-05, + "loss": 1.216, + "step": 112900 + }, + { + "epoch": 0.1, + "grad_norm": 58.5, + "learning_rate": 4.542111325067037e-05, + "loss": 1.2428, + "step": 113000 + }, + { + "epoch": 0.1, + "grad_norm": 95.5, + "learning_rate": 4.5416614177479443e-05, + "loss": 1.1357, + "step": 113100 + }, + { + "epoch": 0.1, + "grad_norm": 27.25, + "learning_rate": 4.541211510428852e-05, + "loss": 1.2233, + "step": 113200 + }, + { + "epoch": 0.1, + "grad_norm": 29.5, + "learning_rate": 4.5407616031097594e-05, + "loss": 1.1413, + "step": 113300 + }, + { + "epoch": 0.1, + "grad_norm": 56.75, + "learning_rate": 4.540311695790667e-05, + "loss": 1.0296, + "step": 113400 + }, + { + "epoch": 0.1, + "grad_norm": 14.8125, + "learning_rate": 4.539861788471575e-05, + "loss": 1.136, + "step": 113500 + }, + { + "epoch": 0.1, + "grad_norm": 0.07421875, + "learning_rate": 4.539411881152483e-05, + "loss": 1.2143, + "step": 113600 + }, + { + "epoch": 0.1, + "grad_norm": 72.5, + "learning_rate": 4.53896197383339e-05, + "loss": 1.326, + "step": 113700 + }, + { + "epoch": 0.1, + "grad_norm": 19.0, + "learning_rate": 4.5385120665142984e-05, + "loss": 1.2427, + "step": 113800 + }, + { + "epoch": 0.1, + "grad_norm": 5.6875, + "learning_rate": 4.538062159195206e-05, + "loss": 1.2672, + "step": 113900 + }, + { + "epoch": 0.1, + "grad_norm": 20.25, + "learning_rate": 4.5376122518761135e-05, + "loss": 1.2823, + "step": 114000 + }, + { + "epoch": 0.1, + "grad_norm": 117.5, + "learning_rate": 4.537162344557022e-05, + "loss": 1.2756, + "step": 114100 + }, + { + "epoch": 0.1, + "grad_norm": 1.140625, + "learning_rate": 4.536712437237929e-05, + "loss": 1.2124, + "step": 114200 + }, + { + "epoch": 0.1, + "grad_norm": 19.375, + "learning_rate": 4.5362625299188375e-05, + "loss": 1.1537, + "step": 114300 + }, + { + "epoch": 0.1, + "grad_norm": 12.0625, + "learning_rate": 4.535812622599745e-05, + "loss": 1.0495, + "step": 114400 + }, + { + "epoch": 0.1, + "grad_norm": 18.75, + "learning_rate": 4.535362715280652e-05, + "loss": 1.2276, + "step": 114500 + }, + { + "epoch": 0.1, + "grad_norm": 66.5, + "learning_rate": 4.53491280796156e-05, + "loss": 1.048, + "step": 114600 + }, + { + "epoch": 0.1, + "grad_norm": 55.25, + "learning_rate": 4.5344629006424676e-05, + "loss": 1.3131, + "step": 114700 + }, + { + "epoch": 0.1, + "grad_norm": 21.375, + "learning_rate": 4.534012993323376e-05, + "loss": 1.0428, + "step": 114800 + }, + { + "epoch": 0.1, + "grad_norm": 37.5, + "learning_rate": 4.5335630860042833e-05, + "loss": 1.4023, + "step": 114900 + }, + { + "epoch": 0.1, + "grad_norm": 185.0, + "learning_rate": 4.533113178685191e-05, + "loss": 1.2281, + "step": 115000 + }, + { + "epoch": 0.1, + "grad_norm": 68.5, + "learning_rate": 4.532663271366099e-05, + "loss": 1.1706, + "step": 115100 + }, + { + "epoch": 0.1, + "grad_norm": 29.0, + "learning_rate": 4.5322133640470066e-05, + "loss": 1.1157, + "step": 115200 + }, + { + "epoch": 0.1, + "grad_norm": 24.375, + "learning_rate": 4.531763456727914e-05, + "loss": 1.3108, + "step": 115300 + }, + { + "epoch": 0.1, + "grad_norm": 0.80859375, + "learning_rate": 4.5313135494088224e-05, + "loss": 1.1107, + "step": 115400 + }, + { + "epoch": 0.1, + "grad_norm": 12.5625, + "learning_rate": 4.53086364208973e-05, + "loss": 1.09, + "step": 115500 + }, + { + "epoch": 0.1, + "grad_norm": 0.40625, + "learning_rate": 4.5304137347706374e-05, + "loss": 1.1862, + "step": 115600 + }, + { + "epoch": 0.1, + "grad_norm": 16.375, + "learning_rate": 4.5299638274515456e-05, + "loss": 1.2026, + "step": 115700 + }, + { + "epoch": 0.1, + "grad_norm": 548.0, + "learning_rate": 4.5295139201324525e-05, + "loss": 1.209, + "step": 115800 + }, + { + "epoch": 0.1, + "grad_norm": 0.1259765625, + "learning_rate": 4.529064012813361e-05, + "loss": 1.2564, + "step": 115900 + }, + { + "epoch": 0.1, + "grad_norm": 43.0, + "learning_rate": 4.528614105494268e-05, + "loss": 1.1271, + "step": 116000 + }, + { + "epoch": 0.1, + "grad_norm": 19.875, + "learning_rate": 4.528164198175176e-05, + "loss": 1.2558, + "step": 116100 + }, + { + "epoch": 0.1, + "grad_norm": 30.5, + "learning_rate": 4.527714290856084e-05, + "loss": 1.1918, + "step": 116200 + }, + { + "epoch": 0.1, + "grad_norm": 0.0177001953125, + "learning_rate": 4.5272643835369915e-05, + "loss": 1.1657, + "step": 116300 + }, + { + "epoch": 0.1, + "grad_norm": 82.0, + "learning_rate": 4.526814476217899e-05, + "loss": 1.2569, + "step": 116400 + }, + { + "epoch": 0.1, + "grad_norm": 13.8125, + "learning_rate": 4.526364568898807e-05, + "loss": 1.2252, + "step": 116500 + }, + { + "epoch": 0.1, + "grad_norm": 0.004638671875, + "learning_rate": 4.525914661579715e-05, + "loss": 1.1186, + "step": 116600 + }, + { + "epoch": 0.1, + "grad_norm": 42.75, + "learning_rate": 4.5254647542606223e-05, + "loss": 1.114, + "step": 116700 + }, + { + "epoch": 0.1, + "grad_norm": 14.1875, + "learning_rate": 4.5250148469415306e-05, + "loss": 1.1557, + "step": 116800 + }, + { + "epoch": 0.1, + "grad_norm": 144.0, + "learning_rate": 4.524564939622438e-05, + "loss": 1.1184, + "step": 116900 + }, + { + "epoch": 0.1, + "grad_norm": 77.5, + "learning_rate": 4.524115032303346e-05, + "loss": 1.1887, + "step": 117000 + }, + { + "epoch": 0.1, + "grad_norm": 10.375, + "learning_rate": 4.523665124984253e-05, + "loss": 1.3325, + "step": 117100 + }, + { + "epoch": 0.1, + "grad_norm": 9.75, + "learning_rate": 4.523215217665161e-05, + "loss": 1.2805, + "step": 117200 + }, + { + "epoch": 0.1, + "grad_norm": 7.5, + "learning_rate": 4.522765310346069e-05, + "loss": 1.2729, + "step": 117300 + }, + { + "epoch": 0.1, + "grad_norm": 15.25, + "learning_rate": 4.5223154030269764e-05, + "loss": 1.1481, + "step": 117400 + }, + { + "epoch": 0.1, + "grad_norm": 13.4375, + "learning_rate": 4.5218654957078846e-05, + "loss": 1.3263, + "step": 117500 + }, + { + "epoch": 0.1, + "grad_norm": 53.25, + "learning_rate": 4.521415588388792e-05, + "loss": 1.1524, + "step": 117600 + }, + { + "epoch": 0.1, + "grad_norm": 17.375, + "learning_rate": 4.5209656810697e-05, + "loss": 1.2387, + "step": 117700 + }, + { + "epoch": 0.1, + "grad_norm": 117.5, + "learning_rate": 4.520515773750608e-05, + "loss": 1.259, + "step": 117800 + }, + { + "epoch": 0.11, + "grad_norm": 12.6875, + "learning_rate": 4.5200658664315155e-05, + "loss": 1.2438, + "step": 117900 + }, + { + "epoch": 0.11, + "grad_norm": 37.25, + "learning_rate": 4.519615959112423e-05, + "loss": 1.2608, + "step": 118000 + }, + { + "epoch": 0.11, + "grad_norm": 19.875, + "learning_rate": 4.519166051793331e-05, + "loss": 1.2646, + "step": 118100 + }, + { + "epoch": 0.11, + "grad_norm": 29.875, + "learning_rate": 4.518716144474239e-05, + "loss": 1.1934, + "step": 118200 + }, + { + "epoch": 0.11, + "grad_norm": 11.125, + "learning_rate": 4.518266237155146e-05, + "loss": 1.2037, + "step": 118300 + }, + { + "epoch": 0.11, + "grad_norm": 61.25, + "learning_rate": 4.517816329836054e-05, + "loss": 1.0696, + "step": 118400 + }, + { + "epoch": 0.11, + "grad_norm": 31.75, + "learning_rate": 4.5173664225169613e-05, + "loss": 1.2027, + "step": 118500 + }, + { + "epoch": 0.11, + "grad_norm": 149.0, + "learning_rate": 4.5169165151978696e-05, + "loss": 1.3522, + "step": 118600 + }, + { + "epoch": 0.11, + "grad_norm": 12.4375, + "learning_rate": 4.516466607878777e-05, + "loss": 1.0956, + "step": 118700 + }, + { + "epoch": 0.11, + "grad_norm": 45.75, + "learning_rate": 4.5160167005596846e-05, + "loss": 1.3911, + "step": 118800 + }, + { + "epoch": 0.11, + "grad_norm": 68.0, + "learning_rate": 4.515566793240593e-05, + "loss": 1.1735, + "step": 118900 + }, + { + "epoch": 0.11, + "grad_norm": 43.25, + "learning_rate": 4.5151168859215004e-05, + "loss": 1.2757, + "step": 119000 + }, + { + "epoch": 0.11, + "grad_norm": 19.375, + "learning_rate": 4.514666978602408e-05, + "loss": 1.1572, + "step": 119100 + }, + { + "epoch": 0.11, + "grad_norm": 108.0, + "learning_rate": 4.514217071283316e-05, + "loss": 1.2605, + "step": 119200 + }, + { + "epoch": 0.11, + "grad_norm": 61.25, + "learning_rate": 4.5137671639642236e-05, + "loss": 1.1866, + "step": 119300 + }, + { + "epoch": 0.11, + "grad_norm": 37.75, + "learning_rate": 4.513317256645131e-05, + "loss": 1.3153, + "step": 119400 + }, + { + "epoch": 0.11, + "grad_norm": 0.8203125, + "learning_rate": 4.5128673493260394e-05, + "loss": 1.2142, + "step": 119500 + }, + { + "epoch": 0.11, + "grad_norm": 14.875, + "learning_rate": 4.512417442006947e-05, + "loss": 1.1503, + "step": 119600 + }, + { + "epoch": 0.11, + "grad_norm": 34.0, + "learning_rate": 4.5119675346878545e-05, + "loss": 1.1278, + "step": 119700 + }, + { + "epoch": 0.11, + "grad_norm": 195.0, + "learning_rate": 4.511517627368762e-05, + "loss": 1.1959, + "step": 119800 + }, + { + "epoch": 0.11, + "grad_norm": 247.0, + "learning_rate": 4.5110677200496695e-05, + "loss": 1.2427, + "step": 119900 + }, + { + "epoch": 0.11, + "grad_norm": 47.0, + "learning_rate": 4.510617812730578e-05, + "loss": 1.3063, + "step": 120000 + }, + { + "epoch": 0.11, + "grad_norm": 28.875, + "learning_rate": 4.510167905411485e-05, + "loss": 1.2894, + "step": 120100 + }, + { + "epoch": 0.11, + "grad_norm": 36.75, + "learning_rate": 4.5097179980923935e-05, + "loss": 1.3332, + "step": 120200 + }, + { + "epoch": 0.11, + "grad_norm": 11.375, + "learning_rate": 4.509268090773301e-05, + "loss": 1.1714, + "step": 120300 + }, + { + "epoch": 0.11, + "grad_norm": 20.375, + "learning_rate": 4.5088181834542086e-05, + "loss": 1.3889, + "step": 120400 + }, + { + "epoch": 0.11, + "grad_norm": 23.875, + "learning_rate": 4.508368276135117e-05, + "loss": 1.3379, + "step": 120500 + }, + { + "epoch": 0.11, + "grad_norm": 92.5, + "learning_rate": 4.507918368816024e-05, + "loss": 1.1349, + "step": 120600 + }, + { + "epoch": 0.11, + "grad_norm": 15.125, + "learning_rate": 4.507468461496932e-05, + "loss": 1.0149, + "step": 120700 + }, + { + "epoch": 0.11, + "grad_norm": 33.75, + "learning_rate": 4.50701855417784e-05, + "loss": 1.2801, + "step": 120800 + }, + { + "epoch": 0.11, + "grad_norm": 110.5, + "learning_rate": 4.5065686468587476e-05, + "loss": 1.0912, + "step": 120900 + }, + { + "epoch": 0.11, + "grad_norm": 20.25, + "learning_rate": 4.506118739539655e-05, + "loss": 1.2497, + "step": 121000 + }, + { + "epoch": 0.11, + "grad_norm": 23.5, + "learning_rate": 4.5056688322205626e-05, + "loss": 1.2293, + "step": 121100 + }, + { + "epoch": 0.11, + "grad_norm": 39.0, + "learning_rate": 4.50521892490147e-05, + "loss": 1.2091, + "step": 121200 + }, + { + "epoch": 0.11, + "grad_norm": 0.033447265625, + "learning_rate": 4.5047690175823784e-05, + "loss": 1.1963, + "step": 121300 + }, + { + "epoch": 0.11, + "grad_norm": 58.75, + "learning_rate": 4.504319110263286e-05, + "loss": 1.1702, + "step": 121400 + }, + { + "epoch": 0.11, + "grad_norm": 127.0, + "learning_rate": 4.5038692029441935e-05, + "loss": 1.2308, + "step": 121500 + }, + { + "epoch": 0.11, + "grad_norm": 9.5625, + "learning_rate": 4.503419295625102e-05, + "loss": 1.3036, + "step": 121600 + }, + { + "epoch": 0.11, + "grad_norm": 52.0, + "learning_rate": 4.502969388306009e-05, + "loss": 1.0735, + "step": 121700 + }, + { + "epoch": 0.11, + "grad_norm": 60.0, + "learning_rate": 4.502519480986917e-05, + "loss": 1.2377, + "step": 121800 + }, + { + "epoch": 0.11, + "grad_norm": 39.0, + "learning_rate": 4.502069573667825e-05, + "loss": 1.1758, + "step": 121900 + }, + { + "epoch": 0.11, + "grad_norm": 31.125, + "learning_rate": 4.5016196663487325e-05, + "loss": 1.3948, + "step": 122000 + }, + { + "epoch": 0.11, + "grad_norm": 27.625, + "learning_rate": 4.50116975902964e-05, + "loss": 1.1844, + "step": 122100 + }, + { + "epoch": 0.11, + "grad_norm": 67.0, + "learning_rate": 4.500719851710548e-05, + "loss": 1.1769, + "step": 122200 + }, + { + "epoch": 0.11, + "grad_norm": 31.5, + "learning_rate": 4.500269944391455e-05, + "loss": 1.2587, + "step": 122300 + }, + { + "epoch": 0.11, + "grad_norm": 15.4375, + "learning_rate": 4.499820037072363e-05, + "loss": 1.2565, + "step": 122400 + }, + { + "epoch": 0.11, + "grad_norm": 64.5, + "learning_rate": 4.499370129753271e-05, + "loss": 1.152, + "step": 122500 + }, + { + "epoch": 0.11, + "grad_norm": 22.125, + "learning_rate": 4.4989202224341784e-05, + "loss": 1.1894, + "step": 122600 + }, + { + "epoch": 0.11, + "grad_norm": 0.08544921875, + "learning_rate": 4.4984703151150866e-05, + "loss": 1.1433, + "step": 122700 + }, + { + "epoch": 0.11, + "grad_norm": 138.0, + "learning_rate": 4.498020407795994e-05, + "loss": 1.1514, + "step": 122800 + }, + { + "epoch": 0.11, + "grad_norm": 39.0, + "learning_rate": 4.4975705004769016e-05, + "loss": 1.1807, + "step": 122900 + }, + { + "epoch": 0.11, + "grad_norm": 20.125, + "learning_rate": 4.49712059315781e-05, + "loss": 1.2844, + "step": 123000 + }, + { + "epoch": 0.11, + "grad_norm": 9.9375, + "learning_rate": 4.4966706858387174e-05, + "loss": 1.1473, + "step": 123100 + }, + { + "epoch": 0.11, + "grad_norm": 45.75, + "learning_rate": 4.4962207785196256e-05, + "loss": 1.175, + "step": 123200 + }, + { + "epoch": 0.11, + "grad_norm": 16.875, + "learning_rate": 4.495770871200533e-05, + "loss": 1.3394, + "step": 123300 + }, + { + "epoch": 0.11, + "grad_norm": 68.0, + "learning_rate": 4.495320963881441e-05, + "loss": 1.1536, + "step": 123400 + }, + { + "epoch": 0.11, + "grad_norm": 15.4375, + "learning_rate": 4.494871056562349e-05, + "loss": 1.2805, + "step": 123500 + }, + { + "epoch": 0.11, + "grad_norm": 0.2099609375, + "learning_rate": 4.494421149243256e-05, + "loss": 1.2155, + "step": 123600 + }, + { + "epoch": 0.11, + "grad_norm": 21.125, + "learning_rate": 4.493971241924164e-05, + "loss": 1.2393, + "step": 123700 + }, + { + "epoch": 0.11, + "grad_norm": 33.0, + "learning_rate": 4.4935213346050715e-05, + "loss": 1.2623, + "step": 123800 + }, + { + "epoch": 0.11, + "grad_norm": 40.0, + "learning_rate": 4.493071427285979e-05, + "loss": 1.2894, + "step": 123900 + }, + { + "epoch": 0.11, + "grad_norm": 14.0, + "learning_rate": 4.492621519966887e-05, + "loss": 1.273, + "step": 124000 + }, + { + "epoch": 0.11, + "grad_norm": 13.1875, + "learning_rate": 4.492171612647795e-05, + "loss": 1.0763, + "step": 124100 + }, + { + "epoch": 0.11, + "grad_norm": 18.125, + "learning_rate": 4.491721705328702e-05, + "loss": 1.2579, + "step": 124200 + }, + { + "epoch": 0.11, + "grad_norm": 53.25, + "learning_rate": 4.4912717980096105e-05, + "loss": 1.4472, + "step": 124300 + }, + { + "epoch": 0.11, + "grad_norm": 35.5, + "learning_rate": 4.490821890690518e-05, + "loss": 1.1988, + "step": 124400 + }, + { + "epoch": 0.11, + "grad_norm": 35.75, + "learning_rate": 4.4903719833714256e-05, + "loss": 1.5536, + "step": 124500 + }, + { + "epoch": 0.11, + "grad_norm": 19.875, + "learning_rate": 4.489922076052334e-05, + "loss": 1.1916, + "step": 124600 + }, + { + "epoch": 0.11, + "grad_norm": 28.25, + "learning_rate": 4.489472168733241e-05, + "loss": 1.2056, + "step": 124700 + }, + { + "epoch": 0.11, + "grad_norm": 22.25, + "learning_rate": 4.489022261414149e-05, + "loss": 1.1438, + "step": 124800 + }, + { + "epoch": 0.11, + "grad_norm": 0.1484375, + "learning_rate": 4.4885723540950564e-05, + "loss": 1.2385, + "step": 124900 + }, + { + "epoch": 0.11, + "grad_norm": 44.5, + "learning_rate": 4.488122446775964e-05, + "loss": 1.2607, + "step": 125000 + }, + { + "epoch": 0.11, + "grad_norm": 26.625, + "learning_rate": 4.487672539456872e-05, + "loss": 1.2741, + "step": 125100 + }, + { + "epoch": 0.11, + "grad_norm": 18.625, + "learning_rate": 4.48722263213778e-05, + "loss": 1.3419, + "step": 125200 + }, + { + "epoch": 0.11, + "grad_norm": 13.5625, + "learning_rate": 4.486772724818687e-05, + "loss": 1.1937, + "step": 125300 + }, + { + "epoch": 0.11, + "grad_norm": 48.5, + "learning_rate": 4.4863228174995954e-05, + "loss": 1.252, + "step": 125400 + }, + { + "epoch": 0.11, + "grad_norm": 0.0089111328125, + "learning_rate": 4.485872910180503e-05, + "loss": 1.1599, + "step": 125500 + }, + { + "epoch": 0.11, + "grad_norm": 23.625, + "learning_rate": 4.4854230028614105e-05, + "loss": 1.1953, + "step": 125600 + }, + { + "epoch": 0.11, + "grad_norm": 41.5, + "learning_rate": 4.484973095542319e-05, + "loss": 1.0053, + "step": 125700 + }, + { + "epoch": 0.11, + "grad_norm": 166.0, + "learning_rate": 4.484523188223226e-05, + "loss": 1.3735, + "step": 125800 + }, + { + "epoch": 0.11, + "grad_norm": 161.0, + "learning_rate": 4.4840732809041344e-05, + "loss": 1.1931, + "step": 125900 + }, + { + "epoch": 0.11, + "grad_norm": 48.0, + "learning_rate": 4.483623373585042e-05, + "loss": 1.3276, + "step": 126000 + }, + { + "epoch": 0.11, + "grad_norm": 0.0015716552734375, + "learning_rate": 4.4831734662659495e-05, + "loss": 1.3037, + "step": 126100 + }, + { + "epoch": 0.11, + "grad_norm": 0.376953125, + "learning_rate": 4.482723558946857e-05, + "loss": 1.2961, + "step": 126200 + }, + { + "epoch": 0.11, + "grad_norm": 0.0458984375, + "learning_rate": 4.4822736516277646e-05, + "loss": 1.0874, + "step": 126300 + }, + { + "epoch": 0.11, + "grad_norm": 23.5, + "learning_rate": 4.481823744308673e-05, + "loss": 1.1462, + "step": 126400 + }, + { + "epoch": 0.11, + "grad_norm": 42.0, + "learning_rate": 4.48137383698958e-05, + "loss": 1.2447, + "step": 126500 + }, + { + "epoch": 0.11, + "grad_norm": 185.0, + "learning_rate": 4.480923929670488e-05, + "loss": 1.2716, + "step": 126600 + }, + { + "epoch": 0.11, + "grad_norm": 320.0, + "learning_rate": 4.480474022351396e-05, + "loss": 1.1433, + "step": 126700 + }, + { + "epoch": 0.11, + "grad_norm": 75.0, + "learning_rate": 4.4800241150323036e-05, + "loss": 1.1822, + "step": 126800 + }, + { + "epoch": 0.11, + "grad_norm": 10.6875, + "learning_rate": 4.479574207713211e-05, + "loss": 1.209, + "step": 126900 + }, + { + "epoch": 0.11, + "grad_norm": 16.125, + "learning_rate": 4.4791243003941193e-05, + "loss": 1.2751, + "step": 127000 + }, + { + "epoch": 0.11, + "grad_norm": 0.09033203125, + "learning_rate": 4.478674393075027e-05, + "loss": 1.3464, + "step": 127100 + }, + { + "epoch": 0.11, + "grad_norm": 21.125, + "learning_rate": 4.4782244857559344e-05, + "loss": 1.1065, + "step": 127200 + }, + { + "epoch": 0.11, + "grad_norm": 64.0, + "learning_rate": 4.4777745784368426e-05, + "loss": 1.1304, + "step": 127300 + }, + { + "epoch": 0.11, + "grad_norm": 43.25, + "learning_rate": 4.47732467111775e-05, + "loss": 1.4005, + "step": 127400 + }, + { + "epoch": 0.11, + "grad_norm": 28.125, + "learning_rate": 4.476874763798658e-05, + "loss": 1.2676, + "step": 127500 + }, + { + "epoch": 0.11, + "grad_norm": 26.25, + "learning_rate": 4.476424856479565e-05, + "loss": 1.2897, + "step": 127600 + }, + { + "epoch": 0.11, + "grad_norm": 24.5, + "learning_rate": 4.475974949160473e-05, + "loss": 1.162, + "step": 127700 + }, + { + "epoch": 0.11, + "grad_norm": 18.625, + "learning_rate": 4.475525041841381e-05, + "loss": 1.1439, + "step": 127800 + }, + { + "epoch": 0.11, + "grad_norm": 0.0322265625, + "learning_rate": 4.4750751345222885e-05, + "loss": 1.0786, + "step": 127900 + }, + { + "epoch": 0.11, + "grad_norm": 26.25, + "learning_rate": 4.474625227203196e-05, + "loss": 1.1313, + "step": 128000 + }, + { + "epoch": 0.11, + "grad_norm": 13.25, + "learning_rate": 4.474175319884104e-05, + "loss": 1.0268, + "step": 128100 + }, + { + "epoch": 0.11, + "grad_norm": 8.625, + "learning_rate": 4.473725412565012e-05, + "loss": 1.3999, + "step": 128200 + }, + { + "epoch": 0.11, + "grad_norm": 28.125, + "learning_rate": 4.473275505245919e-05, + "loss": 0.9953, + "step": 128300 + }, + { + "epoch": 0.11, + "grad_norm": 22.625, + "learning_rate": 4.4728255979268275e-05, + "loss": 1.1821, + "step": 128400 + }, + { + "epoch": 0.11, + "grad_norm": 133.0, + "learning_rate": 4.472375690607735e-05, + "loss": 1.1824, + "step": 128500 + }, + { + "epoch": 0.11, + "grad_norm": 0.06884765625, + "learning_rate": 4.471925783288643e-05, + "loss": 1.1332, + "step": 128600 + }, + { + "epoch": 0.11, + "grad_norm": 77.5, + "learning_rate": 4.471475875969551e-05, + "loss": 1.3097, + "step": 128700 + }, + { + "epoch": 0.11, + "grad_norm": 200.0, + "learning_rate": 4.471025968650458e-05, + "loss": 1.1262, + "step": 128800 + }, + { + "epoch": 0.11, + "grad_norm": 39.5, + "learning_rate": 4.470576061331366e-05, + "loss": 1.4088, + "step": 128900 + }, + { + "epoch": 0.11, + "grad_norm": 31.625, + "learning_rate": 4.4701261540122734e-05, + "loss": 1.352, + "step": 129000 + }, + { + "epoch": 0.12, + "grad_norm": 28.125, + "learning_rate": 4.4696762466931816e-05, + "loss": 1.2171, + "step": 129100 + }, + { + "epoch": 0.12, + "grad_norm": 57.25, + "learning_rate": 4.469226339374089e-05, + "loss": 1.2184, + "step": 129200 + }, + { + "epoch": 0.12, + "grad_norm": 18.0, + "learning_rate": 4.468776432054997e-05, + "loss": 1.2715, + "step": 129300 + }, + { + "epoch": 0.12, + "grad_norm": 93.5, + "learning_rate": 4.468326524735905e-05, + "loss": 1.2901, + "step": 129400 + }, + { + "epoch": 0.12, + "grad_norm": 48.75, + "learning_rate": 4.4678766174168124e-05, + "loss": 1.1037, + "step": 129500 + }, + { + "epoch": 0.12, + "grad_norm": 49.25, + "learning_rate": 4.46742671009772e-05, + "loss": 1.1158, + "step": 129600 + }, + { + "epoch": 0.12, + "grad_norm": 18.625, + "learning_rate": 4.466976802778628e-05, + "loss": 1.1503, + "step": 129700 + }, + { + "epoch": 0.12, + "grad_norm": 39.5, + "learning_rate": 4.466526895459536e-05, + "loss": 1.2502, + "step": 129800 + }, + { + "epoch": 0.12, + "grad_norm": 29.375, + "learning_rate": 4.466076988140443e-05, + "loss": 1.2901, + "step": 129900 + }, + { + "epoch": 0.12, + "grad_norm": 27.5, + "learning_rate": 4.4656270808213515e-05, + "loss": 1.4238, + "step": 130000 + }, + { + "epoch": 0.12, + "grad_norm": 55.75, + "learning_rate": 4.465177173502258e-05, + "loss": 1.1832, + "step": 130100 + }, + { + "epoch": 0.12, + "grad_norm": 0.314453125, + "learning_rate": 4.4647272661831665e-05, + "loss": 1.25, + "step": 130200 + }, + { + "epoch": 0.12, + "grad_norm": 9.4375, + "learning_rate": 4.464277358864074e-05, + "loss": 1.1483, + "step": 130300 + }, + { + "epoch": 0.12, + "grad_norm": 102.5, + "learning_rate": 4.4638274515449816e-05, + "loss": 1.1954, + "step": 130400 + }, + { + "epoch": 0.12, + "grad_norm": 46.25, + "learning_rate": 4.46337754422589e-05, + "loss": 1.2708, + "step": 130500 + }, + { + "epoch": 0.12, + "grad_norm": 40.25, + "learning_rate": 4.4629276369067973e-05, + "loss": 1.3658, + "step": 130600 + }, + { + "epoch": 0.12, + "grad_norm": 0.5625, + "learning_rate": 4.462477729587705e-05, + "loss": 1.1088, + "step": 130700 + }, + { + "epoch": 0.12, + "grad_norm": 30.375, + "learning_rate": 4.462027822268613e-05, + "loss": 1.2477, + "step": 130800 + }, + { + "epoch": 0.12, + "grad_norm": 44.25, + "learning_rate": 4.4615779149495206e-05, + "loss": 1.2206, + "step": 130900 + }, + { + "epoch": 0.12, + "grad_norm": 1.125, + "learning_rate": 4.461128007630428e-05, + "loss": 1.1262, + "step": 131000 + }, + { + "epoch": 0.12, + "grad_norm": 49.75, + "learning_rate": 4.4606781003113364e-05, + "loss": 1.1537, + "step": 131100 + }, + { + "epoch": 0.12, + "grad_norm": 0.5546875, + "learning_rate": 4.460228192992244e-05, + "loss": 1.1545, + "step": 131200 + }, + { + "epoch": 0.12, + "grad_norm": 42.25, + "learning_rate": 4.459778285673152e-05, + "loss": 1.2642, + "step": 131300 + }, + { + "epoch": 0.12, + "grad_norm": 14.0, + "learning_rate": 4.459328378354059e-05, + "loss": 1.0595, + "step": 131400 + }, + { + "epoch": 0.12, + "grad_norm": 21.5, + "learning_rate": 4.4588784710349665e-05, + "loss": 1.223, + "step": 131500 + }, + { + "epoch": 0.12, + "grad_norm": 14.8125, + "learning_rate": 4.458428563715875e-05, + "loss": 1.0834, + "step": 131600 + }, + { + "epoch": 0.12, + "grad_norm": 46.75, + "learning_rate": 4.457978656396782e-05, + "loss": 1.2556, + "step": 131700 + }, + { + "epoch": 0.12, + "grad_norm": 26.75, + "learning_rate": 4.4575287490776905e-05, + "loss": 1.2542, + "step": 131800 + }, + { + "epoch": 0.12, + "grad_norm": 7.6875, + "learning_rate": 4.457078841758598e-05, + "loss": 1.1523, + "step": 131900 + }, + { + "epoch": 0.12, + "grad_norm": 28.25, + "learning_rate": 4.4566289344395055e-05, + "loss": 1.2207, + "step": 132000 + }, + { + "epoch": 0.12, + "grad_norm": 0.0245361328125, + "learning_rate": 4.456179027120414e-05, + "loss": 1.2154, + "step": 132100 + }, + { + "epoch": 0.12, + "grad_norm": 8.0625, + "learning_rate": 4.455729119801321e-05, + "loss": 1.2426, + "step": 132200 + }, + { + "epoch": 0.12, + "grad_norm": 45.0, + "learning_rate": 4.455279212482229e-05, + "loss": 1.2117, + "step": 132300 + }, + { + "epoch": 0.12, + "grad_norm": 8.5625, + "learning_rate": 4.454829305163137e-05, + "loss": 1.1347, + "step": 132400 + }, + { + "epoch": 0.12, + "grad_norm": 22.875, + "learning_rate": 4.4543793978440446e-05, + "loss": 1.14, + "step": 132500 + }, + { + "epoch": 0.12, + "grad_norm": 64.5, + "learning_rate": 4.453929490524952e-05, + "loss": 1.2401, + "step": 132600 + }, + { + "epoch": 0.12, + "grad_norm": 42.25, + "learning_rate": 4.4534795832058596e-05, + "loss": 1.1757, + "step": 132700 + }, + { + "epoch": 0.12, + "grad_norm": 0.0052490234375, + "learning_rate": 4.453029675886767e-05, + "loss": 1.2388, + "step": 132800 + }, + { + "epoch": 0.12, + "grad_norm": 0.349609375, + "learning_rate": 4.4525797685676754e-05, + "loss": 1.2856, + "step": 132900 + }, + { + "epoch": 0.12, + "grad_norm": 97.5, + "learning_rate": 4.452129861248583e-05, + "loss": 1.1789, + "step": 133000 + }, + { + "epoch": 0.12, + "grad_norm": 119.5, + "learning_rate": 4.4516799539294904e-05, + "loss": 1.0874, + "step": 133100 + }, + { + "epoch": 0.12, + "grad_norm": 0.302734375, + "learning_rate": 4.4512300466103986e-05, + "loss": 1.2439, + "step": 133200 + }, + { + "epoch": 0.12, + "grad_norm": 79.0, + "learning_rate": 4.450780139291306e-05, + "loss": 1.2389, + "step": 133300 + }, + { + "epoch": 0.12, + "grad_norm": 23.625, + "learning_rate": 4.450330231972214e-05, + "loss": 1.2274, + "step": 133400 + }, + { + "epoch": 0.12, + "grad_norm": 21.5, + "learning_rate": 4.449880324653122e-05, + "loss": 1.188, + "step": 133500 + }, + { + "epoch": 0.12, + "grad_norm": 17.375, + "learning_rate": 4.4494304173340295e-05, + "loss": 1.1631, + "step": 133600 + }, + { + "epoch": 0.12, + "grad_norm": 73.0, + "learning_rate": 4.448980510014937e-05, + "loss": 1.231, + "step": 133700 + }, + { + "epoch": 0.12, + "grad_norm": 13.0, + "learning_rate": 4.448530602695845e-05, + "loss": 1.2178, + "step": 133800 + }, + { + "epoch": 0.12, + "grad_norm": 51.5, + "learning_rate": 4.448080695376753e-05, + "loss": 1.2033, + "step": 133900 + }, + { + "epoch": 0.12, + "grad_norm": 0.01318359375, + "learning_rate": 4.44763078805766e-05, + "loss": 1.3142, + "step": 134000 + }, + { + "epoch": 0.12, + "grad_norm": 189.0, + "learning_rate": 4.447180880738568e-05, + "loss": 1.0322, + "step": 134100 + }, + { + "epoch": 0.12, + "grad_norm": 13.625, + "learning_rate": 4.4467309734194753e-05, + "loss": 1.0918, + "step": 134200 + }, + { + "epoch": 0.12, + "grad_norm": 9.1875, + "learning_rate": 4.4462810661003836e-05, + "loss": 1.3872, + "step": 134300 + }, + { + "epoch": 0.12, + "grad_norm": 21.5, + "learning_rate": 4.445831158781291e-05, + "loss": 1.0979, + "step": 134400 + }, + { + "epoch": 0.12, + "grad_norm": 94.0, + "learning_rate": 4.445381251462199e-05, + "loss": 1.1704, + "step": 134500 + }, + { + "epoch": 0.12, + "grad_norm": 29.75, + "learning_rate": 4.444931344143107e-05, + "loss": 1.3263, + "step": 134600 + }, + { + "epoch": 0.12, + "grad_norm": 22.0, + "learning_rate": 4.4444814368240144e-05, + "loss": 1.1985, + "step": 134700 + }, + { + "epoch": 0.12, + "grad_norm": 33.0, + "learning_rate": 4.4440315295049226e-05, + "loss": 1.1766, + "step": 134800 + }, + { + "epoch": 0.12, + "grad_norm": 0.1650390625, + "learning_rate": 4.44358162218583e-05, + "loss": 1.2234, + "step": 134900 + }, + { + "epoch": 0.12, + "grad_norm": 29.625, + "learning_rate": 4.4431317148667376e-05, + "loss": 1.0468, + "step": 135000 + }, + { + "epoch": 0.12, + "grad_norm": 40.75, + "learning_rate": 4.442681807547646e-05, + "loss": 1.2107, + "step": 135100 + }, + { + "epoch": 0.12, + "grad_norm": 57.25, + "learning_rate": 4.4422319002285534e-05, + "loss": 1.2141, + "step": 135200 + }, + { + "epoch": 0.12, + "grad_norm": 35.25, + "learning_rate": 4.441781992909461e-05, + "loss": 1.0996, + "step": 135300 + }, + { + "epoch": 0.12, + "grad_norm": 34.5, + "learning_rate": 4.4413320855903685e-05, + "loss": 1.0985, + "step": 135400 + }, + { + "epoch": 0.12, + "grad_norm": 15.75, + "learning_rate": 4.440882178271276e-05, + "loss": 1.16, + "step": 135500 + }, + { + "epoch": 0.12, + "grad_norm": 31.625, + "learning_rate": 4.440432270952184e-05, + "loss": 1.2288, + "step": 135600 + }, + { + "epoch": 0.12, + "grad_norm": 0.006591796875, + "learning_rate": 4.439982363633092e-05, + "loss": 1.1724, + "step": 135700 + }, + { + "epoch": 0.12, + "grad_norm": 27.125, + "learning_rate": 4.439532456313999e-05, + "loss": 1.2668, + "step": 135800 + }, + { + "epoch": 0.12, + "grad_norm": 18.25, + "learning_rate": 4.4390825489949075e-05, + "loss": 1.2629, + "step": 135900 + }, + { + "epoch": 0.12, + "grad_norm": 18.375, + "learning_rate": 4.438632641675815e-05, + "loss": 1.28, + "step": 136000 + }, + { + "epoch": 0.12, + "grad_norm": 47.25, + "learning_rate": 4.4381827343567226e-05, + "loss": 1.03, + "step": 136100 + }, + { + "epoch": 0.12, + "grad_norm": 21.5, + "learning_rate": 4.437732827037631e-05, + "loss": 1.1627, + "step": 136200 + }, + { + "epoch": 0.12, + "grad_norm": 19.375, + "learning_rate": 4.437282919718538e-05, + "loss": 1.3236, + "step": 136300 + }, + { + "epoch": 0.12, + "grad_norm": 17.875, + "learning_rate": 4.436833012399446e-05, + "loss": 1.1586, + "step": 136400 + }, + { + "epoch": 0.12, + "grad_norm": 27.875, + "learning_rate": 4.436383105080354e-05, + "loss": 1.1852, + "step": 136500 + }, + { + "epoch": 0.12, + "grad_norm": 15.5, + "learning_rate": 4.435933197761261e-05, + "loss": 1.077, + "step": 136600 + }, + { + "epoch": 0.12, + "grad_norm": 72.0, + "learning_rate": 4.435483290442169e-05, + "loss": 1.3186, + "step": 136700 + }, + { + "epoch": 0.12, + "grad_norm": 55.75, + "learning_rate": 4.4350333831230766e-05, + "loss": 1.1928, + "step": 136800 + }, + { + "epoch": 0.12, + "grad_norm": 79.0, + "learning_rate": 4.434583475803984e-05, + "loss": 1.2576, + "step": 136900 + }, + { + "epoch": 0.12, + "grad_norm": 20.875, + "learning_rate": 4.4341335684848924e-05, + "loss": 1.1306, + "step": 137000 + }, + { + "epoch": 0.12, + "grad_norm": 0.00131988525390625, + "learning_rate": 4.4336836611658e-05, + "loss": 1.2011, + "step": 137100 + }, + { + "epoch": 0.12, + "grad_norm": 91.5, + "learning_rate": 4.433233753846708e-05, + "loss": 1.4379, + "step": 137200 + }, + { + "epoch": 0.12, + "grad_norm": 49.75, + "learning_rate": 4.432783846527616e-05, + "loss": 1.2621, + "step": 137300 + }, + { + "epoch": 0.12, + "grad_norm": 49.0, + "learning_rate": 4.432333939208523e-05, + "loss": 1.2022, + "step": 137400 + }, + { + "epoch": 0.12, + "grad_norm": 51.25, + "learning_rate": 4.4318840318894314e-05, + "loss": 1.1237, + "step": 137500 + }, + { + "epoch": 0.12, + "grad_norm": 19.125, + "learning_rate": 4.431434124570339e-05, + "loss": 1.1618, + "step": 137600 + }, + { + "epoch": 0.12, + "grad_norm": 27.5, + "learning_rate": 4.4309842172512465e-05, + "loss": 1.1117, + "step": 137700 + }, + { + "epoch": 0.12, + "grad_norm": 66.0, + "learning_rate": 4.430534309932155e-05, + "loss": 1.1636, + "step": 137800 + }, + { + "epoch": 0.12, + "grad_norm": 25.125, + "learning_rate": 4.4300844026130616e-05, + "loss": 1.1296, + "step": 137900 + }, + { + "epoch": 0.12, + "grad_norm": 41.75, + "learning_rate": 4.42963449529397e-05, + "loss": 1.1772, + "step": 138000 + }, + { + "epoch": 0.12, + "grad_norm": 22.25, + "learning_rate": 4.429184587974877e-05, + "loss": 1.1432, + "step": 138100 + }, + { + "epoch": 0.12, + "grad_norm": 45.25, + "learning_rate": 4.428734680655785e-05, + "loss": 1.1964, + "step": 138200 + }, + { + "epoch": 0.12, + "grad_norm": 24.75, + "learning_rate": 4.428284773336693e-05, + "loss": 1.0238, + "step": 138300 + }, + { + "epoch": 0.12, + "grad_norm": 0.435546875, + "learning_rate": 4.4278348660176006e-05, + "loss": 1.3285, + "step": 138400 + }, + { + "epoch": 0.12, + "grad_norm": 8.375, + "learning_rate": 4.427384958698508e-05, + "loss": 1.2392, + "step": 138500 + }, + { + "epoch": 0.12, + "grad_norm": 62.75, + "learning_rate": 4.426935051379416e-05, + "loss": 1.1838, + "step": 138600 + }, + { + "epoch": 0.12, + "grad_norm": 44.5, + "learning_rate": 4.426485144060324e-05, + "loss": 1.5125, + "step": 138700 + }, + { + "epoch": 0.12, + "grad_norm": 81.5, + "learning_rate": 4.4260352367412314e-05, + "loss": 1.1888, + "step": 138800 + }, + { + "epoch": 0.12, + "grad_norm": 11.875, + "learning_rate": 4.4255853294221396e-05, + "loss": 1.1931, + "step": 138900 + }, + { + "epoch": 0.12, + "grad_norm": 28.125, + "learning_rate": 4.425135422103047e-05, + "loss": 1.1716, + "step": 139000 + }, + { + "epoch": 0.12, + "grad_norm": 19.75, + "learning_rate": 4.424685514783955e-05, + "loss": 1.1646, + "step": 139100 + }, + { + "epoch": 0.12, + "grad_norm": 28.625, + "learning_rate": 4.424235607464862e-05, + "loss": 1.0503, + "step": 139200 + }, + { + "epoch": 0.12, + "grad_norm": 34.5, + "learning_rate": 4.42378570014577e-05, + "loss": 1.083, + "step": 139300 + }, + { + "epoch": 0.12, + "grad_norm": 92.5, + "learning_rate": 4.423335792826678e-05, + "loss": 1.2003, + "step": 139400 + }, + { + "epoch": 0.12, + "grad_norm": 9.3125, + "learning_rate": 4.4228858855075855e-05, + "loss": 1.1024, + "step": 139500 + }, + { + "epoch": 0.12, + "grad_norm": 21.375, + "learning_rate": 4.422435978188493e-05, + "loss": 1.2006, + "step": 139600 + }, + { + "epoch": 0.12, + "grad_norm": 0.01055908203125, + "learning_rate": 4.421986070869401e-05, + "loss": 1.0275, + "step": 139700 + }, + { + "epoch": 0.12, + "grad_norm": 106.0, + "learning_rate": 4.421536163550309e-05, + "loss": 1.1647, + "step": 139800 + }, + { + "epoch": 0.12, + "grad_norm": 0.00885009765625, + "learning_rate": 4.421086256231216e-05, + "loss": 1.3076, + "step": 139900 + }, + { + "epoch": 0.12, + "grad_norm": 0.07763671875, + "learning_rate": 4.4206363489121245e-05, + "loss": 1.1696, + "step": 140000 + }, + { + "epoch": 0.12, + "grad_norm": 41.5, + "learning_rate": 4.420186441593032e-05, + "loss": 1.3154, + "step": 140100 + }, + { + "epoch": 0.12, + "grad_norm": 24.125, + "learning_rate": 4.41973653427394e-05, + "loss": 1.1747, + "step": 140200 + }, + { + "epoch": 0.12, + "grad_norm": 38.75, + "learning_rate": 4.419286626954848e-05, + "loss": 1.2746, + "step": 140300 + }, + { + "epoch": 0.13, + "grad_norm": 33.75, + "learning_rate": 4.418836719635755e-05, + "loss": 1.1907, + "step": 140400 + }, + { + "epoch": 0.13, + "grad_norm": 41.5, + "learning_rate": 4.418386812316663e-05, + "loss": 1.3017, + "step": 140500 + }, + { + "epoch": 0.13, + "grad_norm": 10.6875, + "learning_rate": 4.4179369049975704e-05, + "loss": 1.1746, + "step": 140600 + }, + { + "epoch": 0.13, + "grad_norm": 19.5, + "learning_rate": 4.4174869976784786e-05, + "loss": 1.384, + "step": 140700 + }, + { + "epoch": 0.13, + "grad_norm": 30.5, + "learning_rate": 4.417037090359386e-05, + "loss": 0.912, + "step": 140800 + }, + { + "epoch": 0.13, + "grad_norm": 52.0, + "learning_rate": 4.416587183040294e-05, + "loss": 1.1244, + "step": 140900 + }, + { + "epoch": 0.13, + "grad_norm": 38.5, + "learning_rate": 4.416137275721202e-05, + "loss": 1.089, + "step": 141000 + }, + { + "epoch": 0.13, + "grad_norm": 21.75, + "learning_rate": 4.4156873684021094e-05, + "loss": 1.1577, + "step": 141100 + }, + { + "epoch": 0.13, + "grad_norm": 11.8125, + "learning_rate": 4.415237461083017e-05, + "loss": 1.1959, + "step": 141200 + }, + { + "epoch": 0.13, + "grad_norm": 14.125, + "learning_rate": 4.414787553763925e-05, + "loss": 1.1803, + "step": 141300 + }, + { + "epoch": 0.13, + "grad_norm": 15.1875, + "learning_rate": 4.414337646444833e-05, + "loss": 1.2907, + "step": 141400 + }, + { + "epoch": 0.13, + "grad_norm": 12.6875, + "learning_rate": 4.41388773912574e-05, + "loss": 1.0964, + "step": 141500 + }, + { + "epoch": 0.13, + "grad_norm": 146.0, + "learning_rate": 4.4134378318066484e-05, + "loss": 1.1939, + "step": 141600 + }, + { + "epoch": 0.13, + "grad_norm": 8.0625, + "learning_rate": 4.412987924487556e-05, + "loss": 1.3428, + "step": 141700 + }, + { + "epoch": 0.13, + "grad_norm": 32.75, + "learning_rate": 4.4125380171684635e-05, + "loss": 1.1379, + "step": 141800 + }, + { + "epoch": 0.13, + "grad_norm": 28.875, + "learning_rate": 4.412088109849371e-05, + "loss": 1.1182, + "step": 141900 + }, + { + "epoch": 0.13, + "grad_norm": 3.515625, + "learning_rate": 4.4116382025302786e-05, + "loss": 1.0874, + "step": 142000 + }, + { + "epoch": 0.13, + "grad_norm": 17.5, + "learning_rate": 4.411188295211187e-05, + "loss": 1.2566, + "step": 142100 + }, + { + "epoch": 0.13, + "grad_norm": 56.25, + "learning_rate": 4.410738387892094e-05, + "loss": 1.3614, + "step": 142200 + }, + { + "epoch": 0.13, + "grad_norm": 22.0, + "learning_rate": 4.410288480573002e-05, + "loss": 1.1822, + "step": 142300 + }, + { + "epoch": 0.13, + "grad_norm": 26.625, + "learning_rate": 4.40983857325391e-05, + "loss": 1.1969, + "step": 142400 + }, + { + "epoch": 0.13, + "grad_norm": 39.25, + "learning_rate": 4.4093886659348176e-05, + "loss": 1.2387, + "step": 142500 + }, + { + "epoch": 0.13, + "grad_norm": 18.375, + "learning_rate": 4.408938758615725e-05, + "loss": 1.1866, + "step": 142600 + }, + { + "epoch": 0.13, + "grad_norm": 85.5, + "learning_rate": 4.4084888512966333e-05, + "loss": 1.3847, + "step": 142700 + }, + { + "epoch": 0.13, + "grad_norm": 109.5, + "learning_rate": 4.408038943977541e-05, + "loss": 1.1563, + "step": 142800 + }, + { + "epoch": 0.13, + "grad_norm": 36.5, + "learning_rate": 4.407589036658449e-05, + "loss": 1.0689, + "step": 142900 + }, + { + "epoch": 0.13, + "grad_norm": 53.25, + "learning_rate": 4.4071391293393566e-05, + "loss": 1.1359, + "step": 143000 + }, + { + "epoch": 0.13, + "grad_norm": 26.375, + "learning_rate": 4.4066892220202635e-05, + "loss": 1.1186, + "step": 143100 + }, + { + "epoch": 0.13, + "grad_norm": 20.0, + "learning_rate": 4.406239314701172e-05, + "loss": 1.1483, + "step": 143200 + }, + { + "epoch": 0.13, + "grad_norm": 42.75, + "learning_rate": 4.405789407382079e-05, + "loss": 1.416, + "step": 143300 + }, + { + "epoch": 0.13, + "grad_norm": 390.0, + "learning_rate": 4.4053395000629874e-05, + "loss": 1.1409, + "step": 143400 + }, + { + "epoch": 0.13, + "grad_norm": 0.416015625, + "learning_rate": 4.404889592743895e-05, + "loss": 1.2053, + "step": 143500 + }, + { + "epoch": 0.13, + "grad_norm": 36.0, + "learning_rate": 4.4044396854248025e-05, + "loss": 0.9599, + "step": 143600 + }, + { + "epoch": 0.13, + "grad_norm": 0.09619140625, + "learning_rate": 4.403989778105711e-05, + "loss": 1.2231, + "step": 143700 + }, + { + "epoch": 0.13, + "grad_norm": 51.5, + "learning_rate": 4.403539870786618e-05, + "loss": 1.41, + "step": 143800 + }, + { + "epoch": 0.13, + "grad_norm": 40.5, + "learning_rate": 4.403089963467526e-05, + "loss": 1.1409, + "step": 143900 + }, + { + "epoch": 0.13, + "grad_norm": 6.125, + "learning_rate": 4.402640056148434e-05, + "loss": 1.1738, + "step": 144000 + }, + { + "epoch": 0.13, + "grad_norm": 18.75, + "learning_rate": 4.4021901488293415e-05, + "loss": 1.2016, + "step": 144100 + }, + { + "epoch": 0.13, + "grad_norm": 24.125, + "learning_rate": 4.401740241510249e-05, + "loss": 1.4153, + "step": 144200 + }, + { + "epoch": 0.13, + "grad_norm": 35.75, + "learning_rate": 4.401290334191157e-05, + "loss": 1.0668, + "step": 144300 + }, + { + "epoch": 0.13, + "grad_norm": 31.5, + "learning_rate": 4.400840426872064e-05, + "loss": 1.3122, + "step": 144400 + }, + { + "epoch": 0.13, + "grad_norm": 39.5, + "learning_rate": 4.4003905195529723e-05, + "loss": 1.1096, + "step": 144500 + }, + { + "epoch": 0.13, + "grad_norm": 260.0, + "learning_rate": 4.39994061223388e-05, + "loss": 1.1968, + "step": 144600 + }, + { + "epoch": 0.13, + "grad_norm": 11.875, + "learning_rate": 4.3994907049147874e-05, + "loss": 1.1765, + "step": 144700 + }, + { + "epoch": 0.13, + "grad_norm": 28.375, + "learning_rate": 4.3990407975956956e-05, + "loss": 1.2439, + "step": 144800 + }, + { + "epoch": 0.13, + "grad_norm": 0.072265625, + "learning_rate": 4.398590890276603e-05, + "loss": 1.1206, + "step": 144900 + }, + { + "epoch": 0.13, + "grad_norm": 23.25, + "learning_rate": 4.398140982957511e-05, + "loss": 1.1029, + "step": 145000 + }, + { + "epoch": 0.13, + "grad_norm": 34.0, + "learning_rate": 4.397691075638419e-05, + "loss": 1.1021, + "step": 145100 + }, + { + "epoch": 0.13, + "grad_norm": 15.6875, + "learning_rate": 4.3972411683193264e-05, + "loss": 1.2597, + "step": 145200 + }, + { + "epoch": 0.13, + "grad_norm": 16.25, + "learning_rate": 4.396791261000234e-05, + "loss": 1.2942, + "step": 145300 + }, + { + "epoch": 0.13, + "grad_norm": 16.5, + "learning_rate": 4.396341353681142e-05, + "loss": 1.1596, + "step": 145400 + }, + { + "epoch": 0.13, + "grad_norm": 112.5, + "learning_rate": 4.39589144636205e-05, + "loss": 1.2234, + "step": 145500 + }, + { + "epoch": 0.13, + "grad_norm": 16.625, + "learning_rate": 4.395441539042958e-05, + "loss": 1.3201, + "step": 145600 + }, + { + "epoch": 0.13, + "grad_norm": 27.0, + "learning_rate": 4.394991631723865e-05, + "loss": 1.3096, + "step": 145700 + }, + { + "epoch": 0.13, + "grad_norm": 43.75, + "learning_rate": 4.394541724404772e-05, + "loss": 1.4264, + "step": 145800 + }, + { + "epoch": 0.13, + "grad_norm": 18.75, + "learning_rate": 4.3940918170856805e-05, + "loss": 1.1205, + "step": 145900 + }, + { + "epoch": 0.13, + "grad_norm": 15.5, + "learning_rate": 4.393641909766588e-05, + "loss": 1.0715, + "step": 146000 + }, + { + "epoch": 0.13, + "grad_norm": 25.375, + "learning_rate": 4.393192002447496e-05, + "loss": 1.014, + "step": 146100 + }, + { + "epoch": 0.13, + "grad_norm": 22.25, + "learning_rate": 4.392742095128404e-05, + "loss": 1.1428, + "step": 146200 + }, + { + "epoch": 0.13, + "grad_norm": 10.625, + "learning_rate": 4.3922921878093113e-05, + "loss": 1.0716, + "step": 146300 + }, + { + "epoch": 0.13, + "grad_norm": 0.0087890625, + "learning_rate": 4.3918422804902196e-05, + "loss": 1.0702, + "step": 146400 + }, + { + "epoch": 0.13, + "grad_norm": 66.0, + "learning_rate": 4.391392373171127e-05, + "loss": 1.1625, + "step": 146500 + }, + { + "epoch": 0.13, + "grad_norm": 9.3125, + "learning_rate": 4.3909424658520346e-05, + "loss": 1.2266, + "step": 146600 + }, + { + "epoch": 0.13, + "grad_norm": 2.3125, + "learning_rate": 4.390492558532943e-05, + "loss": 1.0975, + "step": 146700 + }, + { + "epoch": 0.13, + "grad_norm": 39.5, + "learning_rate": 4.3900426512138504e-05, + "loss": 1.2589, + "step": 146800 + }, + { + "epoch": 0.13, + "grad_norm": 0.66796875, + "learning_rate": 4.389592743894758e-05, + "loss": 1.3939, + "step": 146900 + }, + { + "epoch": 0.13, + "grad_norm": 38.0, + "learning_rate": 4.3891428365756654e-05, + "loss": 1.4059, + "step": 147000 + }, + { + "epoch": 0.13, + "grad_norm": 61.5, + "learning_rate": 4.388692929256573e-05, + "loss": 1.2118, + "step": 147100 + }, + { + "epoch": 0.13, + "grad_norm": 0.004180908203125, + "learning_rate": 4.388243021937481e-05, + "loss": 1.2294, + "step": 147200 + }, + { + "epoch": 0.13, + "grad_norm": 169.0, + "learning_rate": 4.387793114618389e-05, + "loss": 1.2969, + "step": 147300 + }, + { + "epoch": 0.13, + "grad_norm": 26.625, + "learning_rate": 4.387343207299296e-05, + "loss": 0.9417, + "step": 147400 + }, + { + "epoch": 0.13, + "grad_norm": 79.5, + "learning_rate": 4.3868932999802045e-05, + "loss": 1.1837, + "step": 147500 + }, + { + "epoch": 0.13, + "grad_norm": 1.2421875, + "learning_rate": 4.386443392661112e-05, + "loss": 1.2401, + "step": 147600 + }, + { + "epoch": 0.13, + "grad_norm": 14.3125, + "learning_rate": 4.3859934853420195e-05, + "loss": 1.2824, + "step": 147700 + }, + { + "epoch": 0.13, + "grad_norm": 14.875, + "learning_rate": 4.385543578022928e-05, + "loss": 1.214, + "step": 147800 + }, + { + "epoch": 0.13, + "grad_norm": 28.0, + "learning_rate": 4.385093670703835e-05, + "loss": 1.1858, + "step": 147900 + }, + { + "epoch": 0.13, + "grad_norm": 153.0, + "learning_rate": 4.384643763384743e-05, + "loss": 1.2291, + "step": 148000 + }, + { + "epoch": 0.13, + "grad_norm": 51.25, + "learning_rate": 4.384193856065651e-05, + "loss": 1.1609, + "step": 148100 + }, + { + "epoch": 0.13, + "grad_norm": 70.0, + "learning_rate": 4.3837439487465586e-05, + "loss": 1.2215, + "step": 148200 + }, + { + "epoch": 0.13, + "grad_norm": 74.5, + "learning_rate": 4.383294041427466e-05, + "loss": 1.4444, + "step": 148300 + }, + { + "epoch": 0.13, + "grad_norm": 32.25, + "learning_rate": 4.3828441341083736e-05, + "loss": 1.2274, + "step": 148400 + }, + { + "epoch": 0.13, + "grad_norm": 318.0, + "learning_rate": 4.382394226789281e-05, + "loss": 1.2435, + "step": 148500 + }, + { + "epoch": 0.13, + "grad_norm": 86.5, + "learning_rate": 4.3819443194701894e-05, + "loss": 1.1043, + "step": 148600 + }, + { + "epoch": 0.13, + "grad_norm": 30.125, + "learning_rate": 4.381494412151097e-05, + "loss": 1.0821, + "step": 148700 + }, + { + "epoch": 0.13, + "grad_norm": 38.75, + "learning_rate": 4.381044504832005e-05, + "loss": 1.259, + "step": 148800 + }, + { + "epoch": 0.13, + "grad_norm": 47.5, + "learning_rate": 4.3805945975129127e-05, + "loss": 1.1161, + "step": 148900 + }, + { + "epoch": 0.13, + "grad_norm": 15.6875, + "learning_rate": 4.38014469019382e-05, + "loss": 1.2601, + "step": 149000 + }, + { + "epoch": 0.13, + "grad_norm": 110.0, + "learning_rate": 4.3796947828747284e-05, + "loss": 1.1872, + "step": 149100 + }, + { + "epoch": 0.13, + "grad_norm": 22.75, + "learning_rate": 4.379244875555636e-05, + "loss": 1.026, + "step": 149200 + }, + { + "epoch": 0.13, + "grad_norm": 168.0, + "learning_rate": 4.3787949682365435e-05, + "loss": 1.1621, + "step": 149300 + }, + { + "epoch": 0.13, + "grad_norm": 24.5, + "learning_rate": 4.378345060917452e-05, + "loss": 1.195, + "step": 149400 + }, + { + "epoch": 0.13, + "grad_norm": 75.0, + "learning_rate": 4.377895153598359e-05, + "loss": 1.1963, + "step": 149500 + }, + { + "epoch": 0.13, + "grad_norm": 22.375, + "learning_rate": 4.377445246279267e-05, + "loss": 1.2369, + "step": 149600 + }, + { + "epoch": 0.13, + "grad_norm": 12.9375, + "learning_rate": 4.376995338960174e-05, + "loss": 1.0527, + "step": 149700 + }, + { + "epoch": 0.13, + "grad_norm": 23.625, + "learning_rate": 4.376545431641082e-05, + "loss": 1.1629, + "step": 149800 + }, + { + "epoch": 0.13, + "grad_norm": 25.0, + "learning_rate": 4.37609552432199e-05, + "loss": 1.2391, + "step": 149900 + }, + { + "epoch": 0.13, + "grad_norm": 22.625, + "learning_rate": 4.3756456170028976e-05, + "loss": 1.2134, + "step": 150000 + }, + { + "epoch": 0.13, + "grad_norm": 33.75, + "learning_rate": 4.375195709683805e-05, + "loss": 1.2521, + "step": 150100 + }, + { + "epoch": 0.13, + "grad_norm": 0.1337890625, + "learning_rate": 4.374745802364713e-05, + "loss": 1.2987, + "step": 150200 + }, + { + "epoch": 0.13, + "grad_norm": 13.0, + "learning_rate": 4.374295895045621e-05, + "loss": 1.2736, + "step": 150300 + }, + { + "epoch": 0.13, + "grad_norm": 82.0, + "learning_rate": 4.3738459877265284e-05, + "loss": 1.1508, + "step": 150400 + }, + { + "epoch": 0.13, + "grad_norm": 38.5, + "learning_rate": 4.3733960804074366e-05, + "loss": 1.2961, + "step": 150500 + }, + { + "epoch": 0.13, + "grad_norm": 16.875, + "learning_rate": 4.372946173088344e-05, + "loss": 1.259, + "step": 150600 + }, + { + "epoch": 0.13, + "grad_norm": 61.5, + "learning_rate": 4.3724962657692517e-05, + "loss": 1.2377, + "step": 150700 + }, + { + "epoch": 0.13, + "grad_norm": 20.25, + "learning_rate": 4.37204635845016e-05, + "loss": 1.2646, + "step": 150800 + }, + { + "epoch": 0.13, + "grad_norm": 17.125, + "learning_rate": 4.371596451131067e-05, + "loss": 1.2117, + "step": 150900 + }, + { + "epoch": 0.13, + "grad_norm": 45.0, + "learning_rate": 4.371146543811975e-05, + "loss": 1.2267, + "step": 151000 + }, + { + "epoch": 0.13, + "grad_norm": 124.5, + "learning_rate": 4.3706966364928825e-05, + "loss": 1.1694, + "step": 151100 + }, + { + "epoch": 0.13, + "grad_norm": 60.5, + "learning_rate": 4.37024672917379e-05, + "loss": 1.2284, + "step": 151200 + }, + { + "epoch": 0.13, + "grad_norm": 30.375, + "learning_rate": 4.369796821854698e-05, + "loss": 1.2425, + "step": 151300 + }, + { + "epoch": 0.13, + "grad_norm": 1.515625, + "learning_rate": 4.369346914535606e-05, + "loss": 1.351, + "step": 151400 + }, + { + "epoch": 0.13, + "grad_norm": 0.003662109375, + "learning_rate": 4.368897007216514e-05, + "loss": 1.2053, + "step": 151500 + }, + { + "epoch": 0.14, + "grad_norm": 101.0, + "learning_rate": 4.3684470998974215e-05, + "loss": 1.3002, + "step": 151600 + }, + { + "epoch": 0.14, + "grad_norm": 28.5, + "learning_rate": 4.367997192578329e-05, + "loss": 1.0124, + "step": 151700 + }, + { + "epoch": 0.14, + "grad_norm": 14.1875, + "learning_rate": 4.367547285259237e-05, + "loss": 1.1829, + "step": 151800 + }, + { + "epoch": 0.14, + "grad_norm": 76.0, + "learning_rate": 4.367097377940145e-05, + "loss": 1.3624, + "step": 151900 + }, + { + "epoch": 0.14, + "grad_norm": 17.625, + "learning_rate": 4.366647470621052e-05, + "loss": 1.105, + "step": 152000 + }, + { + "epoch": 0.14, + "grad_norm": 18.5, + "learning_rate": 4.3661975633019605e-05, + "loss": 1.2441, + "step": 152100 + }, + { + "epoch": 0.14, + "grad_norm": 26.5, + "learning_rate": 4.3657476559828674e-05, + "loss": 1.1693, + "step": 152200 + }, + { + "epoch": 0.14, + "grad_norm": 50.0, + "learning_rate": 4.3652977486637756e-05, + "loss": 1.2365, + "step": 152300 + }, + { + "epoch": 0.14, + "grad_norm": 43.75, + "learning_rate": 4.364847841344683e-05, + "loss": 1.3167, + "step": 152400 + }, + { + "epoch": 0.14, + "grad_norm": 20.5, + "learning_rate": 4.3643979340255907e-05, + "loss": 1.1795, + "step": 152500 + }, + { + "epoch": 0.14, + "grad_norm": 185.0, + "learning_rate": 4.363948026706499e-05, + "loss": 1.1482, + "step": 152600 + }, + { + "epoch": 0.14, + "grad_norm": 25.375, + "learning_rate": 4.3634981193874064e-05, + "loss": 1.2657, + "step": 152700 + }, + { + "epoch": 0.14, + "grad_norm": 25.125, + "learning_rate": 4.363048212068314e-05, + "loss": 1.2282, + "step": 152800 + }, + { + "epoch": 0.14, + "grad_norm": 0.03125, + "learning_rate": 4.362598304749222e-05, + "loss": 1.2663, + "step": 152900 + }, + { + "epoch": 0.14, + "grad_norm": 132.0, + "learning_rate": 4.36214839743013e-05, + "loss": 1.263, + "step": 153000 + }, + { + "epoch": 0.14, + "grad_norm": 0.74609375, + "learning_rate": 4.361698490111037e-05, + "loss": 1.1281, + "step": 153100 + }, + { + "epoch": 0.14, + "grad_norm": 10.0, + "learning_rate": 4.3612485827919454e-05, + "loss": 1.1419, + "step": 153200 + }, + { + "epoch": 0.14, + "grad_norm": 36.5, + "learning_rate": 4.360798675472853e-05, + "loss": 1.1389, + "step": 153300 + }, + { + "epoch": 0.14, + "grad_norm": 34.5, + "learning_rate": 4.3603487681537605e-05, + "loss": 1.2128, + "step": 153400 + }, + { + "epoch": 0.14, + "grad_norm": 13.5, + "learning_rate": 4.359898860834668e-05, + "loss": 1.155, + "step": 153500 + }, + { + "epoch": 0.14, + "grad_norm": 86.5, + "learning_rate": 4.3594489535155756e-05, + "loss": 1.1803, + "step": 153600 + }, + { + "epoch": 0.14, + "grad_norm": 126.0, + "learning_rate": 4.358999046196484e-05, + "loss": 1.1925, + "step": 153700 + }, + { + "epoch": 0.14, + "grad_norm": 45.25, + "learning_rate": 4.358549138877391e-05, + "loss": 1.046, + "step": 153800 + }, + { + "epoch": 0.14, + "grad_norm": 29.0, + "learning_rate": 4.358099231558299e-05, + "loss": 1.2843, + "step": 153900 + }, + { + "epoch": 0.14, + "grad_norm": 0.189453125, + "learning_rate": 4.357649324239207e-05, + "loss": 1.228, + "step": 154000 + }, + { + "epoch": 0.14, + "grad_norm": 63.0, + "learning_rate": 4.3571994169201146e-05, + "loss": 1.2228, + "step": 154100 + }, + { + "epoch": 0.14, + "grad_norm": 168.0, + "learning_rate": 4.356749509601023e-05, + "loss": 1.3743, + "step": 154200 + }, + { + "epoch": 0.14, + "grad_norm": 29.125, + "learning_rate": 4.35629960228193e-05, + "loss": 1.2738, + "step": 154300 + }, + { + "epoch": 0.14, + "grad_norm": 38.75, + "learning_rate": 4.355849694962838e-05, + "loss": 1.0825, + "step": 154400 + }, + { + "epoch": 0.14, + "grad_norm": 20.625, + "learning_rate": 4.355399787643746e-05, + "loss": 1.3722, + "step": 154500 + }, + { + "epoch": 0.14, + "grad_norm": 9.375, + "learning_rate": 4.3549498803246536e-05, + "loss": 1.0836, + "step": 154600 + }, + { + "epoch": 0.14, + "grad_norm": 82.5, + "learning_rate": 4.354499973005561e-05, + "loss": 1.3229, + "step": 154700 + }, + { + "epoch": 0.14, + "grad_norm": 19.125, + "learning_rate": 4.354050065686469e-05, + "loss": 1.2155, + "step": 154800 + }, + { + "epoch": 0.14, + "grad_norm": 39.25, + "learning_rate": 4.353600158367376e-05, + "loss": 1.2239, + "step": 154900 + }, + { + "epoch": 0.14, + "grad_norm": 98.0, + "learning_rate": 4.3531502510482844e-05, + "loss": 1.0426, + "step": 155000 + }, + { + "epoch": 0.14, + "grad_norm": 74.5, + "learning_rate": 4.352700343729192e-05, + "loss": 1.244, + "step": 155100 + }, + { + "epoch": 0.14, + "grad_norm": 35.0, + "learning_rate": 4.3522504364100995e-05, + "loss": 1.2144, + "step": 155200 + }, + { + "epoch": 0.14, + "grad_norm": 85.5, + "learning_rate": 4.351800529091008e-05, + "loss": 1.3023, + "step": 155300 + }, + { + "epoch": 0.14, + "grad_norm": 27.0, + "learning_rate": 4.351350621771915e-05, + "loss": 1.1603, + "step": 155400 + }, + { + "epoch": 0.14, + "grad_norm": 9.875, + "learning_rate": 4.350900714452823e-05, + "loss": 1.1795, + "step": 155500 + }, + { + "epoch": 0.14, + "grad_norm": 28.0, + "learning_rate": 4.350450807133731e-05, + "loss": 1.1286, + "step": 155600 + }, + { + "epoch": 0.14, + "grad_norm": 119.5, + "learning_rate": 4.3500008998146385e-05, + "loss": 1.2931, + "step": 155700 + }, + { + "epoch": 0.14, + "grad_norm": 0.259765625, + "learning_rate": 4.349550992495546e-05, + "loss": 1.1743, + "step": 155800 + }, + { + "epoch": 0.14, + "grad_norm": 38.75, + "learning_rate": 4.349101085176454e-05, + "loss": 1.2158, + "step": 155900 + }, + { + "epoch": 0.14, + "grad_norm": 55.5, + "learning_rate": 4.348651177857362e-05, + "loss": 1.0754, + "step": 156000 + }, + { + "epoch": 0.14, + "grad_norm": 18.875, + "learning_rate": 4.348201270538269e-05, + "loss": 1.2114, + "step": 156100 + }, + { + "epoch": 0.14, + "grad_norm": 19.375, + "learning_rate": 4.347751363219177e-05, + "loss": 1.3028, + "step": 156200 + }, + { + "epoch": 0.14, + "grad_norm": 16.0, + "learning_rate": 4.3473014559000844e-05, + "loss": 1.2131, + "step": 156300 + }, + { + "epoch": 0.14, + "grad_norm": 12.8125, + "learning_rate": 4.3468515485809926e-05, + "loss": 1.231, + "step": 156400 + }, + { + "epoch": 0.14, + "grad_norm": 24.5, + "learning_rate": 4.3464016412619e-05, + "loss": 1.3114, + "step": 156500 + }, + { + "epoch": 0.14, + "grad_norm": 12.0, + "learning_rate": 4.345951733942808e-05, + "loss": 0.979, + "step": 156600 + }, + { + "epoch": 0.14, + "grad_norm": 43.75, + "learning_rate": 4.345501826623716e-05, + "loss": 1.102, + "step": 156700 + }, + { + "epoch": 0.14, + "grad_norm": 0.003753662109375, + "learning_rate": 4.3450519193046234e-05, + "loss": 1.1508, + "step": 156800 + }, + { + "epoch": 0.14, + "grad_norm": 88.5, + "learning_rate": 4.3446020119855316e-05, + "loss": 1.1869, + "step": 156900 + }, + { + "epoch": 0.14, + "grad_norm": 43.5, + "learning_rate": 4.344152104666439e-05, + "loss": 1.0248, + "step": 157000 + }, + { + "epoch": 0.14, + "grad_norm": 132.0, + "learning_rate": 4.343702197347347e-05, + "loss": 1.107, + "step": 157100 + }, + { + "epoch": 0.14, + "grad_norm": 7.03125, + "learning_rate": 4.343252290028255e-05, + "loss": 1.1439, + "step": 157200 + }, + { + "epoch": 0.14, + "grad_norm": 131.0, + "learning_rate": 4.3428023827091624e-05, + "loss": 1.2053, + "step": 157300 + }, + { + "epoch": 0.14, + "grad_norm": 65.5, + "learning_rate": 4.342352475390069e-05, + "loss": 1.1828, + "step": 157400 + }, + { + "epoch": 0.14, + "grad_norm": 0.006072998046875, + "learning_rate": 4.3419025680709775e-05, + "loss": 1.0896, + "step": 157500 + }, + { + "epoch": 0.14, + "grad_norm": 16.625, + "learning_rate": 4.341452660751885e-05, + "loss": 1.1416, + "step": 157600 + }, + { + "epoch": 0.14, + "grad_norm": 57.75, + "learning_rate": 4.341002753432793e-05, + "loss": 1.3766, + "step": 157700 + }, + { + "epoch": 0.14, + "grad_norm": 52.25, + "learning_rate": 4.340552846113701e-05, + "loss": 1.1819, + "step": 157800 + }, + { + "epoch": 0.14, + "grad_norm": 24.875, + "learning_rate": 4.340102938794608e-05, + "loss": 1.2043, + "step": 157900 + }, + { + "epoch": 0.14, + "grad_norm": 0.1689453125, + "learning_rate": 4.3396530314755165e-05, + "loss": 1.1778, + "step": 158000 + }, + { + "epoch": 0.14, + "grad_norm": 9.3125, + "learning_rate": 4.339203124156424e-05, + "loss": 1.079, + "step": 158100 + }, + { + "epoch": 0.14, + "grad_norm": 21.125, + "learning_rate": 4.3387532168373316e-05, + "loss": 1.3042, + "step": 158200 + }, + { + "epoch": 0.14, + "grad_norm": 0.0859375, + "learning_rate": 4.33830330951824e-05, + "loss": 1.1161, + "step": 158300 + }, + { + "epoch": 0.14, + "grad_norm": 14.25, + "learning_rate": 4.3378534021991474e-05, + "loss": 0.9665, + "step": 158400 + }, + { + "epoch": 0.14, + "grad_norm": 0.052978515625, + "learning_rate": 4.337403494880055e-05, + "loss": 1.1824, + "step": 158500 + }, + { + "epoch": 0.14, + "grad_norm": 100.0, + "learning_rate": 4.3369535875609624e-05, + "loss": 1.2272, + "step": 158600 + }, + { + "epoch": 0.14, + "grad_norm": 0.0128173828125, + "learning_rate": 4.33650368024187e-05, + "loss": 1.219, + "step": 158700 + }, + { + "epoch": 0.14, + "grad_norm": 3.890625, + "learning_rate": 4.336053772922778e-05, + "loss": 1.0795, + "step": 158800 + }, + { + "epoch": 0.14, + "grad_norm": 0.0113525390625, + "learning_rate": 4.335603865603686e-05, + "loss": 1.1915, + "step": 158900 + }, + { + "epoch": 0.14, + "grad_norm": 0.00909423828125, + "learning_rate": 4.335153958284593e-05, + "loss": 1.1958, + "step": 159000 + }, + { + "epoch": 0.14, + "grad_norm": 9.25, + "learning_rate": 4.3347040509655014e-05, + "loss": 1.2367, + "step": 159100 + }, + { + "epoch": 0.14, + "grad_norm": 2008.0, + "learning_rate": 4.334254143646409e-05, + "loss": 1.1813, + "step": 159200 + }, + { + "epoch": 0.14, + "grad_norm": 17.5, + "learning_rate": 4.3338042363273165e-05, + "loss": 1.2036, + "step": 159300 + }, + { + "epoch": 0.14, + "grad_norm": 56.5, + "learning_rate": 4.333354329008225e-05, + "loss": 1.1227, + "step": 159400 + }, + { + "epoch": 0.14, + "grad_norm": 111.5, + "learning_rate": 4.332904421689132e-05, + "loss": 1.3145, + "step": 159500 + }, + { + "epoch": 0.14, + "grad_norm": 12.4375, + "learning_rate": 4.33245451437004e-05, + "loss": 1.233, + "step": 159600 + }, + { + "epoch": 0.14, + "grad_norm": 81.0, + "learning_rate": 4.332004607050948e-05, + "loss": 1.2202, + "step": 159700 + }, + { + "epoch": 0.14, + "grad_norm": 262.0, + "learning_rate": 4.3315546997318555e-05, + "loss": 1.216, + "step": 159800 + }, + { + "epoch": 0.14, + "grad_norm": 15.25, + "learning_rate": 4.331104792412763e-05, + "loss": 1.1336, + "step": 159900 + }, + { + "epoch": 0.14, + "grad_norm": 0.054443359375, + "learning_rate": 4.3306548850936706e-05, + "loss": 1.1322, + "step": 160000 + }, + { + "epoch": 0.14, + "grad_norm": 18.25, + "learning_rate": 4.330204977774578e-05, + "loss": 0.9996, + "step": 160100 + }, + { + "epoch": 0.14, + "grad_norm": 47.0, + "learning_rate": 4.3297550704554864e-05, + "loss": 1.2751, + "step": 160200 + }, + { + "epoch": 0.14, + "grad_norm": 62.5, + "learning_rate": 4.329305163136394e-05, + "loss": 1.132, + "step": 160300 + }, + { + "epoch": 0.14, + "grad_norm": 19.25, + "learning_rate": 4.328855255817302e-05, + "loss": 1.231, + "step": 160400 + }, + { + "epoch": 0.14, + "grad_norm": 0.3203125, + "learning_rate": 4.3284053484982096e-05, + "loss": 1.1239, + "step": 160500 + }, + { + "epoch": 0.14, + "grad_norm": 15.5, + "learning_rate": 4.327955441179117e-05, + "loss": 1.1883, + "step": 160600 + }, + { + "epoch": 0.14, + "grad_norm": 442.0, + "learning_rate": 4.3275055338600254e-05, + "loss": 0.9981, + "step": 160700 + }, + { + "epoch": 0.14, + "grad_norm": 32.75, + "learning_rate": 4.327055626540933e-05, + "loss": 1.3095, + "step": 160800 + }, + { + "epoch": 0.14, + "grad_norm": 9.125, + "learning_rate": 4.3266057192218404e-05, + "loss": 1.1381, + "step": 160900 + }, + { + "epoch": 0.14, + "grad_norm": 25.25, + "learning_rate": 4.3261558119027487e-05, + "loss": 1.0775, + "step": 161000 + }, + { + "epoch": 0.14, + "grad_norm": 138.0, + "learning_rate": 4.325705904583656e-05, + "loss": 1.2819, + "step": 161100 + }, + { + "epoch": 0.14, + "grad_norm": 40.0, + "learning_rate": 4.325255997264564e-05, + "loss": 1.2975, + "step": 161200 + }, + { + "epoch": 0.14, + "grad_norm": 12.6875, + "learning_rate": 4.324806089945471e-05, + "loss": 1.2189, + "step": 161300 + }, + { + "epoch": 0.14, + "grad_norm": 36.75, + "learning_rate": 4.324356182626379e-05, + "loss": 1.273, + "step": 161400 + }, + { + "epoch": 0.14, + "grad_norm": 77.0, + "learning_rate": 4.323906275307287e-05, + "loss": 1.2648, + "step": 161500 + }, + { + "epoch": 0.14, + "grad_norm": 148.0, + "learning_rate": 4.3234563679881945e-05, + "loss": 1.193, + "step": 161600 + }, + { + "epoch": 0.14, + "grad_norm": 46.75, + "learning_rate": 4.323006460669102e-05, + "loss": 1.2597, + "step": 161700 + }, + { + "epoch": 0.14, + "grad_norm": 4.4375, + "learning_rate": 4.32255655335001e-05, + "loss": 1.1285, + "step": 161800 + }, + { + "epoch": 0.14, + "grad_norm": 46.75, + "learning_rate": 4.322106646030918e-05, + "loss": 1.1198, + "step": 161900 + }, + { + "epoch": 0.14, + "grad_norm": 56.0, + "learning_rate": 4.3216567387118254e-05, + "loss": 1.188, + "step": 162000 + }, + { + "epoch": 0.14, + "grad_norm": 14.25, + "learning_rate": 4.3212068313927336e-05, + "loss": 1.1083, + "step": 162100 + }, + { + "epoch": 0.14, + "grad_norm": 17.875, + "learning_rate": 4.320756924073641e-05, + "loss": 1.2941, + "step": 162200 + }, + { + "epoch": 0.14, + "grad_norm": 31.25, + "learning_rate": 4.3203070167545486e-05, + "loss": 1.2685, + "step": 162300 + }, + { + "epoch": 0.14, + "grad_norm": 5.15625, + "learning_rate": 4.319857109435457e-05, + "loss": 1.1637, + "step": 162400 + }, + { + "epoch": 0.14, + "grad_norm": 48.75, + "learning_rate": 4.319407202116364e-05, + "loss": 1.2843, + "step": 162500 + }, + { + "epoch": 0.14, + "grad_norm": 15.125, + "learning_rate": 4.318957294797272e-05, + "loss": 1.1094, + "step": 162600 + }, + { + "epoch": 0.14, + "grad_norm": 34.0, + "learning_rate": 4.3185073874781794e-05, + "loss": 1.2415, + "step": 162700 + }, + { + "epoch": 0.15, + "grad_norm": 12.125, + "learning_rate": 4.318057480159087e-05, + "loss": 1.14, + "step": 162800 + }, + { + "epoch": 0.15, + "grad_norm": 44.25, + "learning_rate": 4.317607572839995e-05, + "loss": 1.175, + "step": 162900 + }, + { + "epoch": 0.15, + "grad_norm": 62.75, + "learning_rate": 4.317157665520903e-05, + "loss": 1.249, + "step": 163000 + }, + { + "epoch": 0.15, + "grad_norm": 25.125, + "learning_rate": 4.316707758201811e-05, + "loss": 1.2165, + "step": 163100 + }, + { + "epoch": 0.15, + "grad_norm": 24.125, + "learning_rate": 4.3162578508827185e-05, + "loss": 1.1537, + "step": 163200 + }, + { + "epoch": 0.15, + "grad_norm": 44.0, + "learning_rate": 4.315807943563626e-05, + "loss": 0.9978, + "step": 163300 + }, + { + "epoch": 0.15, + "grad_norm": 1.15625, + "learning_rate": 4.315358036244534e-05, + "loss": 1.1201, + "step": 163400 + }, + { + "epoch": 0.15, + "grad_norm": 57.25, + "learning_rate": 4.314908128925442e-05, + "loss": 1.0793, + "step": 163500 + }, + { + "epoch": 0.15, + "grad_norm": 14.25, + "learning_rate": 4.314458221606349e-05, + "loss": 1.2045, + "step": 163600 + }, + { + "epoch": 0.15, + "grad_norm": 31.375, + "learning_rate": 4.3140083142872575e-05, + "loss": 1.3555, + "step": 163700 + }, + { + "epoch": 0.15, + "grad_norm": 5.34375, + "learning_rate": 4.3135584069681644e-05, + "loss": 1.0663, + "step": 163800 + }, + { + "epoch": 0.15, + "grad_norm": 16.625, + "learning_rate": 4.3131084996490726e-05, + "loss": 1.2379, + "step": 163900 + }, + { + "epoch": 0.15, + "grad_norm": 46.25, + "learning_rate": 4.31265859232998e-05, + "loss": 1.1486, + "step": 164000 + }, + { + "epoch": 0.15, + "grad_norm": 39.0, + "learning_rate": 4.3122086850108876e-05, + "loss": 1.2255, + "step": 164100 + }, + { + "epoch": 0.15, + "grad_norm": 70.0, + "learning_rate": 4.311758777691796e-05, + "loss": 1.445, + "step": 164200 + }, + { + "epoch": 0.15, + "grad_norm": 28.25, + "learning_rate": 4.3113088703727034e-05, + "loss": 1.0938, + "step": 164300 + }, + { + "epoch": 0.15, + "grad_norm": 8.9375, + "learning_rate": 4.310858963053611e-05, + "loss": 1.1599, + "step": 164400 + }, + { + "epoch": 0.15, + "grad_norm": 0.7578125, + "learning_rate": 4.310409055734519e-05, + "loss": 1.1894, + "step": 164500 + }, + { + "epoch": 0.15, + "grad_norm": 40.5, + "learning_rate": 4.3099591484154267e-05, + "loss": 1.1745, + "step": 164600 + }, + { + "epoch": 0.15, + "grad_norm": 41.5, + "learning_rate": 4.309509241096334e-05, + "loss": 1.2277, + "step": 164700 + }, + { + "epoch": 0.15, + "grad_norm": 5.0625, + "learning_rate": 4.3090593337772424e-05, + "loss": 1.2373, + "step": 164800 + }, + { + "epoch": 0.15, + "grad_norm": 42.25, + "learning_rate": 4.30860942645815e-05, + "loss": 1.278, + "step": 164900 + }, + { + "epoch": 0.15, + "grad_norm": 14.6875, + "learning_rate": 4.3081595191390575e-05, + "loss": 1.2097, + "step": 165000 + }, + { + "epoch": 0.15, + "grad_norm": 8.375, + "learning_rate": 4.307709611819965e-05, + "loss": 1.1292, + "step": 165100 + }, + { + "epoch": 0.15, + "grad_norm": 42.5, + "learning_rate": 4.3072597045008725e-05, + "loss": 1.341, + "step": 165200 + }, + { + "epoch": 0.15, + "grad_norm": 121.5, + "learning_rate": 4.306809797181781e-05, + "loss": 1.081, + "step": 165300 + }, + { + "epoch": 0.15, + "grad_norm": 12.125, + "learning_rate": 4.306359889862688e-05, + "loss": 1.1821, + "step": 165400 + }, + { + "epoch": 0.15, + "grad_norm": 19.375, + "learning_rate": 4.305909982543596e-05, + "loss": 1.2051, + "step": 165500 + }, + { + "epoch": 0.15, + "grad_norm": 33.5, + "learning_rate": 4.305460075224504e-05, + "loss": 1.2173, + "step": 165600 + }, + { + "epoch": 0.15, + "grad_norm": 11.9375, + "learning_rate": 4.3050101679054116e-05, + "loss": 1.3438, + "step": 165700 + }, + { + "epoch": 0.15, + "grad_norm": 41.0, + "learning_rate": 4.30456026058632e-05, + "loss": 1.1703, + "step": 165800 + }, + { + "epoch": 0.15, + "grad_norm": 4.78125, + "learning_rate": 4.304110353267227e-05, + "loss": 1.2001, + "step": 165900 + }, + { + "epoch": 0.15, + "grad_norm": 740.0, + "learning_rate": 4.303660445948135e-05, + "loss": 1.2125, + "step": 166000 + }, + { + "epoch": 0.15, + "grad_norm": 29.25, + "learning_rate": 4.303210538629043e-05, + "loss": 1.1543, + "step": 166100 + }, + { + "epoch": 0.15, + "grad_norm": 0.02734375, + "learning_rate": 4.3027606313099506e-05, + "loss": 1.3533, + "step": 166200 + }, + { + "epoch": 0.15, + "grad_norm": 26.75, + "learning_rate": 4.302310723990858e-05, + "loss": 1.2246, + "step": 166300 + }, + { + "epoch": 0.15, + "grad_norm": 30.0, + "learning_rate": 4.3018608166717657e-05, + "loss": 1.1274, + "step": 166400 + }, + { + "epoch": 0.15, + "grad_norm": 22.75, + "learning_rate": 4.301410909352673e-05, + "loss": 1.1819, + "step": 166500 + }, + { + "epoch": 0.15, + "grad_norm": 44.0, + "learning_rate": 4.3009610020335814e-05, + "loss": 1.2977, + "step": 166600 + }, + { + "epoch": 0.15, + "grad_norm": 58.25, + "learning_rate": 4.300511094714489e-05, + "loss": 1.3277, + "step": 166700 + }, + { + "epoch": 0.15, + "grad_norm": 20.25, + "learning_rate": 4.3000611873953965e-05, + "loss": 1.2512, + "step": 166800 + }, + { + "epoch": 0.15, + "grad_norm": 29.375, + "learning_rate": 4.299611280076305e-05, + "loss": 1.2322, + "step": 166900 + }, + { + "epoch": 0.15, + "grad_norm": 36.25, + "learning_rate": 4.299161372757212e-05, + "loss": 1.2066, + "step": 167000 + }, + { + "epoch": 0.15, + "grad_norm": 35.25, + "learning_rate": 4.29871146543812e-05, + "loss": 1.2478, + "step": 167100 + }, + { + "epoch": 0.15, + "grad_norm": 146.0, + "learning_rate": 4.298261558119028e-05, + "loss": 1.3022, + "step": 167200 + }, + { + "epoch": 0.15, + "grad_norm": 64.0, + "learning_rate": 4.2978116507999355e-05, + "loss": 1.4135, + "step": 167300 + }, + { + "epoch": 0.15, + "grad_norm": 17.0, + "learning_rate": 4.297361743480843e-05, + "loss": 1.3609, + "step": 167400 + }, + { + "epoch": 0.15, + "grad_norm": 12.25, + "learning_rate": 4.296911836161751e-05, + "loss": 1.1639, + "step": 167500 + }, + { + "epoch": 0.15, + "grad_norm": 44.25, + "learning_rate": 4.296461928842659e-05, + "loss": 1.328, + "step": 167600 + }, + { + "epoch": 0.15, + "grad_norm": 19.625, + "learning_rate": 4.296012021523566e-05, + "loss": 1.2159, + "step": 167700 + }, + { + "epoch": 0.15, + "grad_norm": 29.75, + "learning_rate": 4.295562114204474e-05, + "loss": 1.3014, + "step": 167800 + }, + { + "epoch": 0.15, + "grad_norm": 10.0625, + "learning_rate": 4.2951122068853814e-05, + "loss": 1.1743, + "step": 167900 + }, + { + "epoch": 0.15, + "grad_norm": 310.0, + "learning_rate": 4.2946622995662896e-05, + "loss": 1.2183, + "step": 168000 + }, + { + "epoch": 0.15, + "grad_norm": 31.875, + "learning_rate": 4.294212392247197e-05, + "loss": 1.2231, + "step": 168100 + }, + { + "epoch": 0.15, + "grad_norm": 356.0, + "learning_rate": 4.2937624849281047e-05, + "loss": 1.3259, + "step": 168200 + }, + { + "epoch": 0.15, + "grad_norm": 10.25, + "learning_rate": 4.293312577609013e-05, + "loss": 1.1687, + "step": 168300 + }, + { + "epoch": 0.15, + "grad_norm": 10.5, + "learning_rate": 4.2928626702899204e-05, + "loss": 1.1799, + "step": 168400 + }, + { + "epoch": 0.15, + "grad_norm": 11.0, + "learning_rate": 4.2924127629708286e-05, + "loss": 1.1637, + "step": 168500 + }, + { + "epoch": 0.15, + "grad_norm": 223.0, + "learning_rate": 4.291962855651736e-05, + "loss": 1.1419, + "step": 168600 + }, + { + "epoch": 0.15, + "grad_norm": 44.75, + "learning_rate": 4.291512948332644e-05, + "loss": 1.334, + "step": 168700 + }, + { + "epoch": 0.15, + "grad_norm": 13.1875, + "learning_rate": 4.291063041013552e-05, + "loss": 1.3015, + "step": 168800 + }, + { + "epoch": 0.15, + "grad_norm": 61.0, + "learning_rate": 4.2906131336944594e-05, + "loss": 1.0367, + "step": 168900 + }, + { + "epoch": 0.15, + "grad_norm": 16.375, + "learning_rate": 4.290163226375366e-05, + "loss": 1.1058, + "step": 169000 + }, + { + "epoch": 0.15, + "grad_norm": 14.125, + "learning_rate": 4.2897133190562745e-05, + "loss": 1.1981, + "step": 169100 + }, + { + "epoch": 0.15, + "grad_norm": 0.130859375, + "learning_rate": 4.289263411737182e-05, + "loss": 1.2601, + "step": 169200 + }, + { + "epoch": 0.15, + "grad_norm": 53.5, + "learning_rate": 4.28881350441809e-05, + "loss": 1.297, + "step": 169300 + }, + { + "epoch": 0.15, + "grad_norm": 42.0, + "learning_rate": 4.288363597098998e-05, + "loss": 1.0663, + "step": 169400 + }, + { + "epoch": 0.15, + "grad_norm": 29.25, + "learning_rate": 4.287913689779905e-05, + "loss": 1.0977, + "step": 169500 + }, + { + "epoch": 0.15, + "grad_norm": 21.25, + "learning_rate": 4.2874637824608135e-05, + "loss": 1.1092, + "step": 169600 + }, + { + "epoch": 0.15, + "grad_norm": 39.75, + "learning_rate": 4.287013875141721e-05, + "loss": 1.2423, + "step": 169700 + }, + { + "epoch": 0.15, + "grad_norm": 29.125, + "learning_rate": 4.2865639678226286e-05, + "loss": 1.274, + "step": 169800 + }, + { + "epoch": 0.15, + "grad_norm": 0.1064453125, + "learning_rate": 4.286114060503537e-05, + "loss": 1.0745, + "step": 169900 + }, + { + "epoch": 0.15, + "grad_norm": 46.75, + "learning_rate": 4.285664153184444e-05, + "loss": 1.1619, + "step": 170000 + }, + { + "epoch": 0.15, + "grad_norm": 26.75, + "learning_rate": 4.285214245865352e-05, + "loss": 1.1738, + "step": 170100 + }, + { + "epoch": 0.15, + "grad_norm": 59.5, + "learning_rate": 4.28476433854626e-05, + "loss": 1.26, + "step": 170200 + }, + { + "epoch": 0.15, + "grad_norm": 0.388671875, + "learning_rate": 4.284314431227167e-05, + "loss": 1.3688, + "step": 170300 + }, + { + "epoch": 0.15, + "grad_norm": 210.0, + "learning_rate": 4.283864523908075e-05, + "loss": 1.1806, + "step": 170400 + }, + { + "epoch": 0.15, + "grad_norm": 28.25, + "learning_rate": 4.283414616588983e-05, + "loss": 1.0955, + "step": 170500 + }, + { + "epoch": 0.15, + "grad_norm": 17.0, + "learning_rate": 4.28296470926989e-05, + "loss": 1.3141, + "step": 170600 + }, + { + "epoch": 0.15, + "grad_norm": 27.625, + "learning_rate": 4.2825148019507984e-05, + "loss": 1.2581, + "step": 170700 + }, + { + "epoch": 0.15, + "grad_norm": 27.75, + "learning_rate": 4.282064894631706e-05, + "loss": 1.1377, + "step": 170800 + }, + { + "epoch": 0.15, + "grad_norm": 18.25, + "learning_rate": 4.2816149873126135e-05, + "loss": 1.1178, + "step": 170900 + }, + { + "epoch": 0.15, + "grad_norm": 21.625, + "learning_rate": 4.281165079993522e-05, + "loss": 1.2528, + "step": 171000 + }, + { + "epoch": 0.15, + "grad_norm": 15.1875, + "learning_rate": 4.280715172674429e-05, + "loss": 1.1468, + "step": 171100 + }, + { + "epoch": 0.15, + "grad_norm": 23.125, + "learning_rate": 4.2802652653553375e-05, + "loss": 1.0675, + "step": 171200 + }, + { + "epoch": 0.15, + "grad_norm": 20.5, + "learning_rate": 4.279815358036245e-05, + "loss": 1.0935, + "step": 171300 + }, + { + "epoch": 0.15, + "grad_norm": 195.0, + "learning_rate": 4.2793654507171525e-05, + "loss": 1.1335, + "step": 171400 + }, + { + "epoch": 0.15, + "grad_norm": 15.625, + "learning_rate": 4.278915543398061e-05, + "loss": 1.2274, + "step": 171500 + }, + { + "epoch": 0.15, + "grad_norm": 31.375, + "learning_rate": 4.2784656360789676e-05, + "loss": 1.1344, + "step": 171600 + }, + { + "epoch": 0.15, + "grad_norm": 0.59375, + "learning_rate": 4.278015728759875e-05, + "loss": 1.1439, + "step": 171700 + }, + { + "epoch": 0.15, + "grad_norm": 101.5, + "learning_rate": 4.277565821440783e-05, + "loss": 1.0772, + "step": 171800 + }, + { + "epoch": 0.15, + "grad_norm": 24.5, + "learning_rate": 4.277115914121691e-05, + "loss": 1.0917, + "step": 171900 + }, + { + "epoch": 0.15, + "grad_norm": 154.0, + "learning_rate": 4.276666006802599e-05, + "loss": 1.1744, + "step": 172000 + }, + { + "epoch": 0.15, + "grad_norm": 15.875, + "learning_rate": 4.2762160994835066e-05, + "loss": 1.358, + "step": 172100 + }, + { + "epoch": 0.15, + "grad_norm": 66.5, + "learning_rate": 4.275766192164414e-05, + "loss": 1.2677, + "step": 172200 + }, + { + "epoch": 0.15, + "grad_norm": 65.0, + "learning_rate": 4.2753162848453224e-05, + "loss": 0.9656, + "step": 172300 + }, + { + "epoch": 0.15, + "grad_norm": 20.375, + "learning_rate": 4.27486637752623e-05, + "loss": 1.0813, + "step": 172400 + }, + { + "epoch": 0.15, + "grad_norm": 2.578125, + "learning_rate": 4.2744164702071374e-05, + "loss": 1.3343, + "step": 172500 + }, + { + "epoch": 0.15, + "grad_norm": 89.5, + "learning_rate": 4.2739665628880456e-05, + "loss": 1.2819, + "step": 172600 + }, + { + "epoch": 0.15, + "grad_norm": 54.0, + "learning_rate": 4.273516655568953e-05, + "loss": 1.0236, + "step": 172700 + }, + { + "epoch": 0.15, + "grad_norm": 22.5, + "learning_rate": 4.273066748249861e-05, + "loss": 1.0889, + "step": 172800 + }, + { + "epoch": 0.15, + "grad_norm": 25.0, + "learning_rate": 4.272616840930768e-05, + "loss": 1.1928, + "step": 172900 + }, + { + "epoch": 0.15, + "grad_norm": 18.625, + "learning_rate": 4.272166933611676e-05, + "loss": 1.1959, + "step": 173000 + }, + { + "epoch": 0.15, + "grad_norm": 17.25, + "learning_rate": 4.271717026292584e-05, + "loss": 1.3427, + "step": 173100 + }, + { + "epoch": 0.15, + "grad_norm": 1.078125, + "learning_rate": 4.2712671189734915e-05, + "loss": 1.2775, + "step": 173200 + }, + { + "epoch": 0.15, + "grad_norm": 27.25, + "learning_rate": 4.270817211654399e-05, + "loss": 1.3247, + "step": 173300 + }, + { + "epoch": 0.15, + "grad_norm": 15.0, + "learning_rate": 4.270367304335307e-05, + "loss": 1.2258, + "step": 173400 + }, + { + "epoch": 0.15, + "grad_norm": 67.5, + "learning_rate": 4.269917397016215e-05, + "loss": 1.191, + "step": 173500 + }, + { + "epoch": 0.15, + "grad_norm": 16.375, + "learning_rate": 4.269467489697122e-05, + "loss": 1.086, + "step": 173600 + }, + { + "epoch": 0.15, + "grad_norm": 0.62109375, + "learning_rate": 4.2690175823780305e-05, + "loss": 1.048, + "step": 173700 + }, + { + "epoch": 0.15, + "grad_norm": 25.375, + "learning_rate": 4.268567675058938e-05, + "loss": 1.016, + "step": 173800 + }, + { + "epoch": 0.15, + "grad_norm": 52.0, + "learning_rate": 4.268117767739846e-05, + "loss": 1.252, + "step": 173900 + }, + { + "epoch": 0.16, + "grad_norm": 21.5, + "learning_rate": 4.267667860420754e-05, + "loss": 1.1171, + "step": 174000 + }, + { + "epoch": 0.16, + "grad_norm": 36.25, + "learning_rate": 4.2672179531016614e-05, + "loss": 1.2418, + "step": 174100 + }, + { + "epoch": 0.16, + "grad_norm": 54.5, + "learning_rate": 4.266768045782569e-05, + "loss": 1.0526, + "step": 174200 + }, + { + "epoch": 0.16, + "grad_norm": 38.75, + "learning_rate": 4.2663181384634764e-05, + "loss": 1.1106, + "step": 174300 + }, + { + "epoch": 0.16, + "grad_norm": 1.2890625, + "learning_rate": 4.265868231144384e-05, + "loss": 1.1271, + "step": 174400 + }, + { + "epoch": 0.16, + "grad_norm": 21.0, + "learning_rate": 4.265418323825292e-05, + "loss": 1.0622, + "step": 174500 + }, + { + "epoch": 0.16, + "grad_norm": 23.625, + "learning_rate": 4.2649684165062e-05, + "loss": 1.1997, + "step": 174600 + }, + { + "epoch": 0.16, + "grad_norm": 0.255859375, + "learning_rate": 4.264518509187108e-05, + "loss": 1.2099, + "step": 174700 + }, + { + "epoch": 0.16, + "grad_norm": 27.375, + "learning_rate": 4.2640686018680155e-05, + "loss": 1.1474, + "step": 174800 + }, + { + "epoch": 0.16, + "grad_norm": 18.125, + "learning_rate": 4.263618694548923e-05, + "loss": 1.1925, + "step": 174900 + }, + { + "epoch": 0.16, + "grad_norm": 5.40625, + "learning_rate": 4.263168787229831e-05, + "loss": 1.3047, + "step": 175000 + }, + { + "epoch": 0.16, + "grad_norm": 34.75, + "learning_rate": 4.262718879910739e-05, + "loss": 1.1826, + "step": 175100 + }, + { + "epoch": 0.16, + "grad_norm": 16.5, + "learning_rate": 4.262268972591646e-05, + "loss": 1.0752, + "step": 175200 + }, + { + "epoch": 0.16, + "grad_norm": 12.0, + "learning_rate": 4.2618190652725545e-05, + "loss": 1.3711, + "step": 175300 + }, + { + "epoch": 0.16, + "grad_norm": 17.125, + "learning_rate": 4.261369157953462e-05, + "loss": 1.1903, + "step": 175400 + }, + { + "epoch": 0.16, + "grad_norm": 28.125, + "learning_rate": 4.2609192506343695e-05, + "loss": 1.2254, + "step": 175500 + }, + { + "epoch": 0.16, + "grad_norm": 53.0, + "learning_rate": 4.260469343315277e-05, + "loss": 1.2376, + "step": 175600 + }, + { + "epoch": 0.16, + "grad_norm": 46.25, + "learning_rate": 4.2600194359961846e-05, + "loss": 1.3068, + "step": 175700 + }, + { + "epoch": 0.16, + "grad_norm": 49.5, + "learning_rate": 4.259569528677093e-05, + "loss": 1.2719, + "step": 175800 + }, + { + "epoch": 0.16, + "grad_norm": 21.25, + "learning_rate": 4.2591196213580004e-05, + "loss": 1.2351, + "step": 175900 + }, + { + "epoch": 0.16, + "grad_norm": 41.5, + "learning_rate": 4.258669714038908e-05, + "loss": 1.2047, + "step": 176000 + }, + { + "epoch": 0.16, + "grad_norm": 15.875, + "learning_rate": 4.258219806719816e-05, + "loss": 1.2174, + "step": 176100 + }, + { + "epoch": 0.16, + "grad_norm": 20.875, + "learning_rate": 4.2577698994007236e-05, + "loss": 1.148, + "step": 176200 + }, + { + "epoch": 0.16, + "grad_norm": 22.0, + "learning_rate": 4.257319992081631e-05, + "loss": 1.3212, + "step": 176300 + }, + { + "epoch": 0.16, + "grad_norm": 15.0, + "learning_rate": 4.2568700847625394e-05, + "loss": 1.209, + "step": 176400 + }, + { + "epoch": 0.16, + "grad_norm": 42.75, + "learning_rate": 4.256420177443447e-05, + "loss": 1.2113, + "step": 176500 + }, + { + "epoch": 0.16, + "grad_norm": 1.578125, + "learning_rate": 4.2559702701243544e-05, + "loss": 0.9567, + "step": 176600 + }, + { + "epoch": 0.16, + "grad_norm": 420.0, + "learning_rate": 4.255520362805263e-05, + "loss": 1.2237, + "step": 176700 + }, + { + "epoch": 0.16, + "grad_norm": 736.0, + "learning_rate": 4.2550704554861695e-05, + "loss": 1.0574, + "step": 176800 + }, + { + "epoch": 0.16, + "grad_norm": 0.3671875, + "learning_rate": 4.254620548167078e-05, + "loss": 1.3057, + "step": 176900 + }, + { + "epoch": 0.16, + "grad_norm": 10.1875, + "learning_rate": 4.254170640847985e-05, + "loss": 1.2314, + "step": 177000 + }, + { + "epoch": 0.16, + "grad_norm": 33.5, + "learning_rate": 4.253720733528893e-05, + "loss": 1.2106, + "step": 177100 + }, + { + "epoch": 0.16, + "grad_norm": 186.0, + "learning_rate": 4.253270826209801e-05, + "loss": 1.1305, + "step": 177200 + }, + { + "epoch": 0.16, + "grad_norm": 255.0, + "learning_rate": 4.2528209188907085e-05, + "loss": 1.0669, + "step": 177300 + }, + { + "epoch": 0.16, + "grad_norm": 39.5, + "learning_rate": 4.252371011571617e-05, + "loss": 1.2703, + "step": 177400 + }, + { + "epoch": 0.16, + "grad_norm": 6.15625, + "learning_rate": 4.251921104252524e-05, + "loss": 1.2375, + "step": 177500 + }, + { + "epoch": 0.16, + "grad_norm": 4000.0, + "learning_rate": 4.251471196933432e-05, + "loss": 1.1928, + "step": 177600 + }, + { + "epoch": 0.16, + "grad_norm": 58.75, + "learning_rate": 4.25102128961434e-05, + "loss": 1.2152, + "step": 177700 + }, + { + "epoch": 0.16, + "grad_norm": 0.29296875, + "learning_rate": 4.2505713822952476e-05, + "loss": 1.1017, + "step": 177800 + }, + { + "epoch": 0.16, + "grad_norm": 28.625, + "learning_rate": 4.250121474976155e-05, + "loss": 1.1363, + "step": 177900 + }, + { + "epoch": 0.16, + "grad_norm": 9.8125, + "learning_rate": 4.249671567657063e-05, + "loss": 1.1717, + "step": 178000 + }, + { + "epoch": 0.16, + "grad_norm": 26.25, + "learning_rate": 4.24922166033797e-05, + "loss": 1.0504, + "step": 178100 + }, + { + "epoch": 0.16, + "grad_norm": 173.0, + "learning_rate": 4.2487717530188784e-05, + "loss": 1.051, + "step": 178200 + }, + { + "epoch": 0.16, + "grad_norm": 128.0, + "learning_rate": 4.248321845699786e-05, + "loss": 1.2008, + "step": 178300 + }, + { + "epoch": 0.16, + "grad_norm": 29.25, + "learning_rate": 4.2478719383806934e-05, + "loss": 1.0372, + "step": 178400 + }, + { + "epoch": 0.16, + "grad_norm": 38.25, + "learning_rate": 4.2474220310616017e-05, + "loss": 1.229, + "step": 178500 + }, + { + "epoch": 0.16, + "grad_norm": 44.5, + "learning_rate": 4.246972123742509e-05, + "loss": 1.1811, + "step": 178600 + }, + { + "epoch": 0.16, + "grad_norm": 33.25, + "learning_rate": 4.246522216423417e-05, + "loss": 1.3114, + "step": 178700 + }, + { + "epoch": 0.16, + "grad_norm": 54.75, + "learning_rate": 4.246072309104325e-05, + "loss": 1.3516, + "step": 178800 + }, + { + "epoch": 0.16, + "grad_norm": 44.75, + "learning_rate": 4.2456224017852325e-05, + "loss": 1.2318, + "step": 178900 + }, + { + "epoch": 0.16, + "grad_norm": 25.875, + "learning_rate": 4.24517249446614e-05, + "loss": 1.2588, + "step": 179000 + }, + { + "epoch": 0.16, + "grad_norm": 4.375, + "learning_rate": 4.244722587147048e-05, + "loss": 1.0803, + "step": 179100 + }, + { + "epoch": 0.16, + "grad_norm": 81.5, + "learning_rate": 4.244272679827956e-05, + "loss": 1.1941, + "step": 179200 + }, + { + "epoch": 0.16, + "grad_norm": 23.0, + "learning_rate": 4.243822772508863e-05, + "loss": 1.3492, + "step": 179300 + }, + { + "epoch": 0.16, + "grad_norm": 51.75, + "learning_rate": 4.243372865189771e-05, + "loss": 1.4025, + "step": 179400 + }, + { + "epoch": 0.16, + "grad_norm": 0.11767578125, + "learning_rate": 4.2429229578706784e-05, + "loss": 1.1006, + "step": 179500 + }, + { + "epoch": 0.16, + "grad_norm": 19.125, + "learning_rate": 4.2424730505515866e-05, + "loss": 1.2816, + "step": 179600 + }, + { + "epoch": 0.16, + "grad_norm": 10.6875, + "learning_rate": 4.242023143232494e-05, + "loss": 1.3625, + "step": 179700 + }, + { + "epoch": 0.16, + "grad_norm": 48.0, + "learning_rate": 4.2415732359134016e-05, + "loss": 1.1887, + "step": 179800 + }, + { + "epoch": 0.16, + "grad_norm": 41.0, + "learning_rate": 4.24112332859431e-05, + "loss": 1.2855, + "step": 179900 + }, + { + "epoch": 0.16, + "grad_norm": 45.75, + "learning_rate": 4.2406734212752174e-05, + "loss": 1.3335, + "step": 180000 + }, + { + "epoch": 0.16, + "grad_norm": 19.375, + "learning_rate": 4.2402235139561256e-05, + "loss": 1.2829, + "step": 180100 + }, + { + "epoch": 0.16, + "grad_norm": 0.0015411376953125, + "learning_rate": 4.239773606637033e-05, + "loss": 1.0416, + "step": 180200 + }, + { + "epoch": 0.16, + "grad_norm": 17.125, + "learning_rate": 4.2393236993179407e-05, + "loss": 1.1039, + "step": 180300 + }, + { + "epoch": 0.16, + "grad_norm": 19.625, + "learning_rate": 4.238873791998849e-05, + "loss": 1.0655, + "step": 180400 + }, + { + "epoch": 0.16, + "grad_norm": 596.0, + "learning_rate": 4.2384238846797564e-05, + "loss": 1.1383, + "step": 180500 + }, + { + "epoch": 0.16, + "grad_norm": 37.0, + "learning_rate": 4.237973977360664e-05, + "loss": 1.135, + "step": 180600 + }, + { + "epoch": 0.16, + "grad_norm": 43.25, + "learning_rate": 4.2375240700415715e-05, + "loss": 1.0729, + "step": 180700 + }, + { + "epoch": 0.16, + "grad_norm": 0.173828125, + "learning_rate": 4.237074162722479e-05, + "loss": 1.1227, + "step": 180800 + }, + { + "epoch": 0.16, + "grad_norm": 6.40625, + "learning_rate": 4.236624255403387e-05, + "loss": 1.1162, + "step": 180900 + }, + { + "epoch": 0.16, + "grad_norm": 18.0, + "learning_rate": 4.236174348084295e-05, + "loss": 1.1697, + "step": 181000 + }, + { + "epoch": 0.16, + "grad_norm": 27.75, + "learning_rate": 4.235724440765202e-05, + "loss": 1.2325, + "step": 181100 + }, + { + "epoch": 0.16, + "grad_norm": 29.875, + "learning_rate": 4.2352745334461105e-05, + "loss": 1.1419, + "step": 181200 + }, + { + "epoch": 0.16, + "grad_norm": 17.25, + "learning_rate": 4.234824626127018e-05, + "loss": 1.3225, + "step": 181300 + }, + { + "epoch": 0.16, + "grad_norm": 24.5, + "learning_rate": 4.2343747188079256e-05, + "loss": 1.2887, + "step": 181400 + }, + { + "epoch": 0.16, + "grad_norm": 40.0, + "learning_rate": 4.233924811488834e-05, + "loss": 1.2148, + "step": 181500 + }, + { + "epoch": 0.16, + "grad_norm": 0.0035247802734375, + "learning_rate": 4.233474904169741e-05, + "loss": 1.2054, + "step": 181600 + }, + { + "epoch": 0.16, + "grad_norm": 0.00213623046875, + "learning_rate": 4.233024996850649e-05, + "loss": 1.0868, + "step": 181700 + }, + { + "epoch": 0.16, + "grad_norm": 0.09326171875, + "learning_rate": 4.232575089531557e-05, + "loss": 1.1705, + "step": 181800 + }, + { + "epoch": 0.16, + "grad_norm": 10.4375, + "learning_rate": 4.2321251822124646e-05, + "loss": 1.0112, + "step": 181900 + }, + { + "epoch": 0.16, + "grad_norm": 56.25, + "learning_rate": 4.231675274893372e-05, + "loss": 1.1826, + "step": 182000 + }, + { + "epoch": 0.16, + "grad_norm": 39.75, + "learning_rate": 4.2312253675742797e-05, + "loss": 1.1999, + "step": 182100 + }, + { + "epoch": 0.16, + "grad_norm": 0.049072265625, + "learning_rate": 4.230775460255187e-05, + "loss": 1.3048, + "step": 182200 + }, + { + "epoch": 0.16, + "grad_norm": 92.5, + "learning_rate": 4.2303255529360954e-05, + "loss": 1.2344, + "step": 182300 + }, + { + "epoch": 0.16, + "grad_norm": 13.5, + "learning_rate": 4.229875645617003e-05, + "loss": 1.2589, + "step": 182400 + }, + { + "epoch": 0.16, + "grad_norm": 2.546875, + "learning_rate": 4.2294257382979105e-05, + "loss": 1.1501, + "step": 182500 + }, + { + "epoch": 0.16, + "grad_norm": 22.375, + "learning_rate": 4.228975830978819e-05, + "loss": 1.2442, + "step": 182600 + }, + { + "epoch": 0.16, + "grad_norm": 22.25, + "learning_rate": 4.228525923659726e-05, + "loss": 1.3321, + "step": 182700 + }, + { + "epoch": 0.16, + "grad_norm": 0.0159912109375, + "learning_rate": 4.2280760163406344e-05, + "loss": 1.1938, + "step": 182800 + }, + { + "epoch": 0.16, + "grad_norm": 64.0, + "learning_rate": 4.227626109021542e-05, + "loss": 1.1458, + "step": 182900 + }, + { + "epoch": 0.16, + "grad_norm": 200.0, + "learning_rate": 4.2271762017024495e-05, + "loss": 1.1515, + "step": 183000 + }, + { + "epoch": 0.16, + "grad_norm": 42.5, + "learning_rate": 4.226726294383358e-05, + "loss": 1.3802, + "step": 183100 + }, + { + "epoch": 0.16, + "grad_norm": 21.75, + "learning_rate": 4.226276387064265e-05, + "loss": 0.9465, + "step": 183200 + }, + { + "epoch": 0.16, + "grad_norm": 69.5, + "learning_rate": 4.225826479745172e-05, + "loss": 1.0952, + "step": 183300 + }, + { + "epoch": 0.16, + "grad_norm": 32.25, + "learning_rate": 4.22537657242608e-05, + "loss": 0.9934, + "step": 183400 + }, + { + "epoch": 0.16, + "grad_norm": 48.0, + "learning_rate": 4.224926665106988e-05, + "loss": 1.1524, + "step": 183500 + }, + { + "epoch": 0.16, + "grad_norm": 9.125, + "learning_rate": 4.224476757787896e-05, + "loss": 1.2027, + "step": 183600 + }, + { + "epoch": 0.16, + "grad_norm": 25.875, + "learning_rate": 4.2240268504688036e-05, + "loss": 1.4629, + "step": 183700 + }, + { + "epoch": 0.16, + "grad_norm": 5.78125, + "learning_rate": 4.223576943149711e-05, + "loss": 0.9975, + "step": 183800 + }, + { + "epoch": 0.16, + "grad_norm": 20.125, + "learning_rate": 4.223127035830619e-05, + "loss": 1.2801, + "step": 183900 + }, + { + "epoch": 0.16, + "grad_norm": 19.125, + "learning_rate": 4.222677128511527e-05, + "loss": 1.093, + "step": 184000 + }, + { + "epoch": 0.16, + "grad_norm": 16.75, + "learning_rate": 4.2222272211924344e-05, + "loss": 1.081, + "step": 184100 + }, + { + "epoch": 0.16, + "grad_norm": 179.0, + "learning_rate": 4.2217773138733426e-05, + "loss": 1.2437, + "step": 184200 + }, + { + "epoch": 0.16, + "grad_norm": 10.5, + "learning_rate": 4.22132740655425e-05, + "loss": 1.1344, + "step": 184300 + }, + { + "epoch": 0.16, + "grad_norm": 22.875, + "learning_rate": 4.220877499235158e-05, + "loss": 1.2239, + "step": 184400 + }, + { + "epoch": 0.16, + "grad_norm": 114.5, + "learning_rate": 4.220427591916066e-05, + "loss": 1.2939, + "step": 184500 + }, + { + "epoch": 0.16, + "grad_norm": 38.0, + "learning_rate": 4.219977684596973e-05, + "loss": 1.2764, + "step": 184600 + }, + { + "epoch": 0.16, + "grad_norm": 209.0, + "learning_rate": 4.219527777277881e-05, + "loss": 1.2841, + "step": 184700 + }, + { + "epoch": 0.16, + "grad_norm": 21.75, + "learning_rate": 4.2190778699587885e-05, + "loss": 1.3063, + "step": 184800 + }, + { + "epoch": 0.16, + "grad_norm": 27.0, + "learning_rate": 4.218627962639696e-05, + "loss": 1.3656, + "step": 184900 + }, + { + "epoch": 0.16, + "grad_norm": 105.0, + "learning_rate": 4.218178055320604e-05, + "loss": 1.1371, + "step": 185000 + }, + { + "epoch": 0.16, + "grad_norm": 0.2265625, + "learning_rate": 4.217728148001512e-05, + "loss": 1.2233, + "step": 185100 + }, + { + "epoch": 0.16, + "grad_norm": 71.0, + "learning_rate": 4.217278240682419e-05, + "loss": 1.1547, + "step": 185200 + }, + { + "epoch": 0.17, + "grad_norm": 50.25, + "learning_rate": 4.2168283333633275e-05, + "loss": 1.1923, + "step": 185300 + }, + { + "epoch": 0.17, + "grad_norm": 18.875, + "learning_rate": 4.216378426044235e-05, + "loss": 1.2171, + "step": 185400 + }, + { + "epoch": 0.17, + "grad_norm": 33.25, + "learning_rate": 4.215928518725143e-05, + "loss": 1.305, + "step": 185500 + }, + { + "epoch": 0.17, + "grad_norm": 10.5, + "learning_rate": 4.215478611406051e-05, + "loss": 1.2172, + "step": 185600 + }, + { + "epoch": 0.17, + "grad_norm": 18.625, + "learning_rate": 4.215028704086958e-05, + "loss": 1.3026, + "step": 185700 + }, + { + "epoch": 0.17, + "grad_norm": 22.875, + "learning_rate": 4.2145787967678665e-05, + "loss": 1.0829, + "step": 185800 + }, + { + "epoch": 0.17, + "grad_norm": 0.0751953125, + "learning_rate": 4.2141288894487734e-05, + "loss": 1.1234, + "step": 185900 + }, + { + "epoch": 0.17, + "grad_norm": 27.875, + "learning_rate": 4.213678982129681e-05, + "loss": 1.4107, + "step": 186000 + }, + { + "epoch": 0.17, + "grad_norm": 199.0, + "learning_rate": 4.213229074810589e-05, + "loss": 1.2315, + "step": 186100 + }, + { + "epoch": 0.17, + "grad_norm": 41.25, + "learning_rate": 4.212779167491497e-05, + "loss": 1.2325, + "step": 186200 + }, + { + "epoch": 0.17, + "grad_norm": 266.0, + "learning_rate": 4.212329260172405e-05, + "loss": 1.2041, + "step": 186300 + }, + { + "epoch": 0.17, + "grad_norm": 1.5546875, + "learning_rate": 4.2118793528533124e-05, + "loss": 1.2449, + "step": 186400 + }, + { + "epoch": 0.17, + "grad_norm": 1.4453125, + "learning_rate": 4.21142944553422e-05, + "loss": 1.0151, + "step": 186500 + }, + { + "epoch": 0.17, + "grad_norm": 0.36328125, + "learning_rate": 4.210979538215128e-05, + "loss": 1.1362, + "step": 186600 + }, + { + "epoch": 0.17, + "grad_norm": 1.59375, + "learning_rate": 4.210529630896036e-05, + "loss": 1.2349, + "step": 186700 + }, + { + "epoch": 0.17, + "grad_norm": 30.125, + "learning_rate": 4.210079723576943e-05, + "loss": 1.2757, + "step": 186800 + }, + { + "epoch": 0.17, + "grad_norm": 65.5, + "learning_rate": 4.2096298162578515e-05, + "loss": 1.0808, + "step": 186900 + }, + { + "epoch": 0.17, + "grad_norm": 240.0, + "learning_rate": 4.209179908938759e-05, + "loss": 1.132, + "step": 187000 + }, + { + "epoch": 0.17, + "grad_norm": 20.375, + "learning_rate": 4.2087300016196665e-05, + "loss": 1.1575, + "step": 187100 + }, + { + "epoch": 0.17, + "grad_norm": 0.08740234375, + "learning_rate": 4.208280094300574e-05, + "loss": 1.138, + "step": 187200 + }, + { + "epoch": 0.17, + "grad_norm": 37.25, + "learning_rate": 4.2078301869814816e-05, + "loss": 1.1676, + "step": 187300 + }, + { + "epoch": 0.17, + "grad_norm": 19.625, + "learning_rate": 4.20738027966239e-05, + "loss": 1.0593, + "step": 187400 + }, + { + "epoch": 0.17, + "grad_norm": 0.2138671875, + "learning_rate": 4.206930372343297e-05, + "loss": 1.3202, + "step": 187500 + }, + { + "epoch": 0.17, + "grad_norm": 16.875, + "learning_rate": 4.206480465024205e-05, + "loss": 1.1956, + "step": 187600 + }, + { + "epoch": 0.17, + "grad_norm": 18.5, + "learning_rate": 4.206030557705113e-05, + "loss": 1.3435, + "step": 187700 + }, + { + "epoch": 0.17, + "grad_norm": 49.5, + "learning_rate": 4.2055806503860206e-05, + "loss": 1.0993, + "step": 187800 + }, + { + "epoch": 0.17, + "grad_norm": 39.25, + "learning_rate": 4.205130743066928e-05, + "loss": 1.0359, + "step": 187900 + }, + { + "epoch": 0.17, + "grad_norm": 14.375, + "learning_rate": 4.2046808357478364e-05, + "loss": 1.2202, + "step": 188000 + }, + { + "epoch": 0.17, + "grad_norm": 32.75, + "learning_rate": 4.204230928428744e-05, + "loss": 1.2804, + "step": 188100 + }, + { + "epoch": 0.17, + "grad_norm": 46.25, + "learning_rate": 4.203781021109652e-05, + "loss": 1.2196, + "step": 188200 + }, + { + "epoch": 0.17, + "grad_norm": 128.0, + "learning_rate": 4.2033311137905596e-05, + "loss": 1.1602, + "step": 188300 + }, + { + "epoch": 0.17, + "grad_norm": 68.5, + "learning_rate": 4.202881206471467e-05, + "loss": 1.1784, + "step": 188400 + }, + { + "epoch": 0.17, + "grad_norm": 114.0, + "learning_rate": 4.202431299152375e-05, + "loss": 1.3161, + "step": 188500 + }, + { + "epoch": 0.17, + "grad_norm": 18.625, + "learning_rate": 4.201981391833282e-05, + "loss": 1.1077, + "step": 188600 + }, + { + "epoch": 0.17, + "grad_norm": 140.0, + "learning_rate": 4.20153148451419e-05, + "loss": 1.2905, + "step": 188700 + }, + { + "epoch": 0.17, + "grad_norm": 22.0, + "learning_rate": 4.201081577195098e-05, + "loss": 1.2771, + "step": 188800 + }, + { + "epoch": 0.17, + "grad_norm": 10.0, + "learning_rate": 4.2006316698760055e-05, + "loss": 0.9896, + "step": 188900 + }, + { + "epoch": 0.17, + "grad_norm": 28.125, + "learning_rate": 4.200181762556914e-05, + "loss": 1.144, + "step": 189000 + }, + { + "epoch": 0.17, + "grad_norm": 86.0, + "learning_rate": 4.199731855237821e-05, + "loss": 1.0594, + "step": 189100 + }, + { + "epoch": 0.17, + "grad_norm": 1560.0, + "learning_rate": 4.199281947918729e-05, + "loss": 1.2533, + "step": 189200 + }, + { + "epoch": 0.17, + "grad_norm": 30.0, + "learning_rate": 4.198832040599637e-05, + "loss": 1.2242, + "step": 189300 + }, + { + "epoch": 0.17, + "grad_norm": 21.25, + "learning_rate": 4.1983821332805445e-05, + "loss": 1.2011, + "step": 189400 + }, + { + "epoch": 0.17, + "grad_norm": 28.5, + "learning_rate": 4.197932225961452e-05, + "loss": 1.2743, + "step": 189500 + }, + { + "epoch": 0.17, + "grad_norm": 12.25, + "learning_rate": 4.19748231864236e-05, + "loss": 1.1596, + "step": 189600 + }, + { + "epoch": 0.17, + "grad_norm": 45.75, + "learning_rate": 4.197032411323268e-05, + "loss": 1.1093, + "step": 189700 + }, + { + "epoch": 0.17, + "grad_norm": 274.0, + "learning_rate": 4.1965825040041754e-05, + "loss": 1.2053, + "step": 189800 + }, + { + "epoch": 0.17, + "grad_norm": 12.6875, + "learning_rate": 4.196132596685083e-05, + "loss": 1.1662, + "step": 189900 + }, + { + "epoch": 0.17, + "grad_norm": 6.875, + "learning_rate": 4.1956826893659904e-05, + "loss": 1.1291, + "step": 190000 + }, + { + "epoch": 0.17, + "grad_norm": 796.0, + "learning_rate": 4.1952327820468986e-05, + "loss": 1.1676, + "step": 190100 + }, + { + "epoch": 0.17, + "grad_norm": 72.0, + "learning_rate": 4.194782874727806e-05, + "loss": 1.132, + "step": 190200 + }, + { + "epoch": 0.17, + "grad_norm": 42.25, + "learning_rate": 4.194332967408714e-05, + "loss": 1.1253, + "step": 190300 + }, + { + "epoch": 0.17, + "grad_norm": 68.0, + "learning_rate": 4.193883060089622e-05, + "loss": 1.2876, + "step": 190400 + }, + { + "epoch": 0.17, + "grad_norm": 0.11083984375, + "learning_rate": 4.1934331527705295e-05, + "loss": 1.1226, + "step": 190500 + }, + { + "epoch": 0.17, + "grad_norm": 37.0, + "learning_rate": 4.192983245451437e-05, + "loss": 1.0657, + "step": 190600 + }, + { + "epoch": 0.17, + "grad_norm": 109.5, + "learning_rate": 4.192533338132345e-05, + "loss": 1.1886, + "step": 190700 + }, + { + "epoch": 0.17, + "grad_norm": 24.5, + "learning_rate": 4.192083430813253e-05, + "loss": 1.2065, + "step": 190800 + }, + { + "epoch": 0.17, + "grad_norm": 230.0, + "learning_rate": 4.191633523494161e-05, + "loss": 1.0951, + "step": 190900 + }, + { + "epoch": 0.17, + "grad_norm": 40.25, + "learning_rate": 4.1911836161750685e-05, + "loss": 1.1631, + "step": 191000 + }, + { + "epoch": 0.17, + "grad_norm": 16.875, + "learning_rate": 4.190733708855975e-05, + "loss": 1.1751, + "step": 191100 + }, + { + "epoch": 0.17, + "grad_norm": 27.5, + "learning_rate": 4.1902838015368835e-05, + "loss": 1.2777, + "step": 191200 + }, + { + "epoch": 0.17, + "grad_norm": 68.5, + "learning_rate": 4.189833894217791e-05, + "loss": 1.185, + "step": 191300 + }, + { + "epoch": 0.17, + "grad_norm": 9.0, + "learning_rate": 4.1893839868986986e-05, + "loss": 1.1625, + "step": 191400 + }, + { + "epoch": 0.17, + "grad_norm": 52.5, + "learning_rate": 4.188934079579607e-05, + "loss": 1.2173, + "step": 191500 + }, + { + "epoch": 0.17, + "grad_norm": 46.0, + "learning_rate": 4.1884841722605144e-05, + "loss": 1.2283, + "step": 191600 + }, + { + "epoch": 0.17, + "grad_norm": 16.625, + "learning_rate": 4.1880342649414226e-05, + "loss": 1.2837, + "step": 191700 + }, + { + "epoch": 0.17, + "grad_norm": 9.8125, + "learning_rate": 4.18758435762233e-05, + "loss": 1.025, + "step": 191800 + }, + { + "epoch": 0.17, + "grad_norm": 29.0, + "learning_rate": 4.1871344503032376e-05, + "loss": 1.0625, + "step": 191900 + }, + { + "epoch": 0.17, + "grad_norm": 0.041015625, + "learning_rate": 4.186684542984146e-05, + "loss": 1.2094, + "step": 192000 + }, + { + "epoch": 0.17, + "grad_norm": 75.0, + "learning_rate": 4.1862346356650534e-05, + "loss": 1.1954, + "step": 192100 + }, + { + "epoch": 0.17, + "grad_norm": 22.625, + "learning_rate": 4.185784728345961e-05, + "loss": 1.1494, + "step": 192200 + }, + { + "epoch": 0.17, + "grad_norm": 42.5, + "learning_rate": 4.185334821026869e-05, + "loss": 1.1898, + "step": 192300 + }, + { + "epoch": 0.17, + "grad_norm": 58.25, + "learning_rate": 4.184884913707776e-05, + "loss": 1.1983, + "step": 192400 + }, + { + "epoch": 0.17, + "grad_norm": 14.3125, + "learning_rate": 4.184435006388684e-05, + "loss": 1.0581, + "step": 192500 + }, + { + "epoch": 0.17, + "grad_norm": 0.022705078125, + "learning_rate": 4.183985099069592e-05, + "loss": 1.1541, + "step": 192600 + }, + { + "epoch": 0.17, + "grad_norm": 16.0, + "learning_rate": 4.183535191750499e-05, + "loss": 1.2466, + "step": 192700 + }, + { + "epoch": 0.17, + "grad_norm": 160.0, + "learning_rate": 4.1830852844314075e-05, + "loss": 1.2106, + "step": 192800 + }, + { + "epoch": 0.17, + "grad_norm": 34.5, + "learning_rate": 4.182635377112315e-05, + "loss": 1.1169, + "step": 192900 + }, + { + "epoch": 0.17, + "grad_norm": 19.75, + "learning_rate": 4.1821854697932225e-05, + "loss": 1.3027, + "step": 193000 + }, + { + "epoch": 0.17, + "grad_norm": 93.5, + "learning_rate": 4.181735562474131e-05, + "loss": 1.2137, + "step": 193100 + }, + { + "epoch": 0.17, + "grad_norm": 37.5, + "learning_rate": 4.181285655155038e-05, + "loss": 1.0112, + "step": 193200 + }, + { + "epoch": 0.17, + "grad_norm": 0.0206298828125, + "learning_rate": 4.180835747835946e-05, + "loss": 1.048, + "step": 193300 + }, + { + "epoch": 0.17, + "grad_norm": 19.625, + "learning_rate": 4.180385840516854e-05, + "loss": 1.2389, + "step": 193400 + }, + { + "epoch": 0.17, + "grad_norm": 18.625, + "learning_rate": 4.1799359331977616e-05, + "loss": 1.106, + "step": 193500 + }, + { + "epoch": 0.17, + "grad_norm": 187.0, + "learning_rate": 4.179486025878669e-05, + "loss": 1.3622, + "step": 193600 + }, + { + "epoch": 0.17, + "grad_norm": 38.5, + "learning_rate": 4.1790361185595766e-05, + "loss": 1.1716, + "step": 193700 + }, + { + "epoch": 0.17, + "grad_norm": 28.875, + "learning_rate": 4.178586211240484e-05, + "loss": 1.1967, + "step": 193800 + }, + { + "epoch": 0.17, + "grad_norm": 85.0, + "learning_rate": 4.1781363039213924e-05, + "loss": 1.1057, + "step": 193900 + }, + { + "epoch": 0.17, + "grad_norm": 9.8125, + "learning_rate": 4.1776863966023e-05, + "loss": 1.1438, + "step": 194000 + }, + { + "epoch": 0.17, + "grad_norm": 51.25, + "learning_rate": 4.1772364892832075e-05, + "loss": 1.1666, + "step": 194100 + }, + { + "epoch": 0.17, + "grad_norm": 83.5, + "learning_rate": 4.176786581964116e-05, + "loss": 1.0818, + "step": 194200 + }, + { + "epoch": 0.17, + "grad_norm": 58.75, + "learning_rate": 4.176336674645023e-05, + "loss": 1.0782, + "step": 194300 + }, + { + "epoch": 0.17, + "grad_norm": 68.5, + "learning_rate": 4.1758867673259314e-05, + "loss": 1.1737, + "step": 194400 + }, + { + "epoch": 0.17, + "grad_norm": 0.1396484375, + "learning_rate": 4.175436860006839e-05, + "loss": 1.1863, + "step": 194500 + }, + { + "epoch": 0.17, + "grad_norm": 46.75, + "learning_rate": 4.1749869526877465e-05, + "loss": 1.2777, + "step": 194600 + }, + { + "epoch": 0.17, + "grad_norm": 10.0625, + "learning_rate": 4.174537045368655e-05, + "loss": 1.2273, + "step": 194700 + }, + { + "epoch": 0.17, + "grad_norm": 19.875, + "learning_rate": 4.174087138049562e-05, + "loss": 1.1958, + "step": 194800 + }, + { + "epoch": 0.17, + "grad_norm": 8.25, + "learning_rate": 4.17363723073047e-05, + "loss": 1.3402, + "step": 194900 + }, + { + "epoch": 0.17, + "grad_norm": 0.208984375, + "learning_rate": 4.173187323411377e-05, + "loss": 1.0688, + "step": 195000 + }, + { + "epoch": 0.17, + "grad_norm": 25.0, + "learning_rate": 4.172737416092285e-05, + "loss": 1.2346, + "step": 195100 + }, + { + "epoch": 0.17, + "grad_norm": 1.03125, + "learning_rate": 4.172287508773193e-05, + "loss": 1.1805, + "step": 195200 + }, + { + "epoch": 0.17, + "grad_norm": 24.125, + "learning_rate": 4.1718376014541006e-05, + "loss": 1.068, + "step": 195300 + }, + { + "epoch": 0.17, + "grad_norm": 12.75, + "learning_rate": 4.171387694135008e-05, + "loss": 1.1489, + "step": 195400 + }, + { + "epoch": 0.17, + "grad_norm": 26.5, + "learning_rate": 4.170937786815916e-05, + "loss": 1.1357, + "step": 195500 + }, + { + "epoch": 0.17, + "grad_norm": 123.0, + "learning_rate": 4.170487879496824e-05, + "loss": 1.1081, + "step": 195600 + }, + { + "epoch": 0.17, + "grad_norm": 44.0, + "learning_rate": 4.1700379721777314e-05, + "loss": 1.1402, + "step": 195700 + }, + { + "epoch": 0.17, + "grad_norm": 60.25, + "learning_rate": 4.1695880648586396e-05, + "loss": 1.0414, + "step": 195800 + }, + { + "epoch": 0.17, + "grad_norm": 71.5, + "learning_rate": 4.169138157539547e-05, + "loss": 1.2874, + "step": 195900 + }, + { + "epoch": 0.17, + "grad_norm": 840.0, + "learning_rate": 4.168688250220455e-05, + "loss": 1.2207, + "step": 196000 + }, + { + "epoch": 0.17, + "grad_norm": 65.5, + "learning_rate": 4.168238342901363e-05, + "loss": 1.25, + "step": 196100 + }, + { + "epoch": 0.17, + "grad_norm": 17.625, + "learning_rate": 4.1677884355822704e-05, + "loss": 1.2282, + "step": 196200 + }, + { + "epoch": 0.17, + "grad_norm": 0.142578125, + "learning_rate": 4.167338528263178e-05, + "loss": 1.2121, + "step": 196300 + }, + { + "epoch": 0.17, + "grad_norm": 0.1611328125, + "learning_rate": 4.1668886209440855e-05, + "loss": 1.2917, + "step": 196400 + }, + { + "epoch": 0.18, + "grad_norm": 32.5, + "learning_rate": 4.166438713624993e-05, + "loss": 1.0817, + "step": 196500 + }, + { + "epoch": 0.18, + "grad_norm": 40.25, + "learning_rate": 4.165988806305901e-05, + "loss": 1.2343, + "step": 196600 + }, + { + "epoch": 0.18, + "grad_norm": 43.0, + "learning_rate": 4.165538898986809e-05, + "loss": 1.1656, + "step": 196700 + }, + { + "epoch": 0.18, + "grad_norm": 10.9375, + "learning_rate": 4.165088991667716e-05, + "loss": 1.2031, + "step": 196800 + }, + { + "epoch": 0.18, + "grad_norm": 42.25, + "learning_rate": 4.1646390843486245e-05, + "loss": 1.3176, + "step": 196900 + }, + { + "epoch": 0.18, + "grad_norm": 35.0, + "learning_rate": 4.164189177029532e-05, + "loss": 1.1468, + "step": 197000 + }, + { + "epoch": 0.18, + "grad_norm": 2.625, + "learning_rate": 4.16373926971044e-05, + "loss": 1.0494, + "step": 197100 + }, + { + "epoch": 0.18, + "grad_norm": 8.1875, + "learning_rate": 4.163289362391348e-05, + "loss": 1.0928, + "step": 197200 + }, + { + "epoch": 0.18, + "grad_norm": 486.0, + "learning_rate": 4.162839455072255e-05, + "loss": 1.2197, + "step": 197300 + }, + { + "epoch": 0.18, + "grad_norm": 60.75, + "learning_rate": 4.1623895477531635e-05, + "loss": 1.2145, + "step": 197400 + }, + { + "epoch": 0.18, + "grad_norm": 220.0, + "learning_rate": 4.161939640434071e-05, + "loss": 1.2995, + "step": 197500 + }, + { + "epoch": 0.18, + "grad_norm": 40.5, + "learning_rate": 4.161489733114978e-05, + "loss": 1.0946, + "step": 197600 + }, + { + "epoch": 0.18, + "grad_norm": 7.0625, + "learning_rate": 4.161039825795886e-05, + "loss": 1.1699, + "step": 197700 + }, + { + "epoch": 0.18, + "grad_norm": 33.25, + "learning_rate": 4.160589918476794e-05, + "loss": 1.3221, + "step": 197800 + }, + { + "epoch": 0.18, + "grad_norm": 94.5, + "learning_rate": 4.160140011157702e-05, + "loss": 1.287, + "step": 197900 + }, + { + "epoch": 0.18, + "grad_norm": 32.0, + "learning_rate": 4.1596901038386094e-05, + "loss": 1.1507, + "step": 198000 + }, + { + "epoch": 0.18, + "grad_norm": 22.375, + "learning_rate": 4.159240196519517e-05, + "loss": 1.2471, + "step": 198100 + }, + { + "epoch": 0.18, + "grad_norm": 0.08349609375, + "learning_rate": 4.158790289200425e-05, + "loss": 1.2462, + "step": 198200 + }, + { + "epoch": 0.18, + "grad_norm": 41.5, + "learning_rate": 4.158340381881333e-05, + "loss": 1.1003, + "step": 198300 + }, + { + "epoch": 0.18, + "grad_norm": 36.25, + "learning_rate": 4.15789047456224e-05, + "loss": 1.1615, + "step": 198400 + }, + { + "epoch": 0.18, + "grad_norm": 12.0625, + "learning_rate": 4.1574405672431484e-05, + "loss": 1.3816, + "step": 198500 + }, + { + "epoch": 0.18, + "grad_norm": 0.06591796875, + "learning_rate": 4.156990659924056e-05, + "loss": 1.1632, + "step": 198600 + }, + { + "epoch": 0.18, + "grad_norm": 25.125, + "learning_rate": 4.1565407526049635e-05, + "loss": 1.1957, + "step": 198700 + }, + { + "epoch": 0.18, + "grad_norm": 24.625, + "learning_rate": 4.156090845285872e-05, + "loss": 1.3067, + "step": 198800 + }, + { + "epoch": 0.18, + "grad_norm": 129.0, + "learning_rate": 4.1556409379667786e-05, + "loss": 1.1935, + "step": 198900 + }, + { + "epoch": 0.18, + "grad_norm": 24.25, + "learning_rate": 4.155191030647687e-05, + "loss": 1.0834, + "step": 199000 + }, + { + "epoch": 0.18, + "grad_norm": 9.8125, + "learning_rate": 4.154741123328594e-05, + "loss": 1.2487, + "step": 199100 + }, + { + "epoch": 0.18, + "grad_norm": 47.0, + "learning_rate": 4.154291216009502e-05, + "loss": 1.3689, + "step": 199200 + }, + { + "epoch": 0.18, + "grad_norm": 0.515625, + "learning_rate": 4.15384130869041e-05, + "loss": 1.0971, + "step": 199300 + }, + { + "epoch": 0.18, + "grad_norm": 80.5, + "learning_rate": 4.1533914013713176e-05, + "loss": 1.1138, + "step": 199400 + }, + { + "epoch": 0.18, + "grad_norm": 8.0625, + "learning_rate": 4.152941494052225e-05, + "loss": 1.0809, + "step": 199500 + }, + { + "epoch": 0.18, + "grad_norm": 43.25, + "learning_rate": 4.1524915867331333e-05, + "loss": 1.1443, + "step": 199600 + }, + { + "epoch": 0.18, + "grad_norm": 49.25, + "learning_rate": 4.152041679414041e-05, + "loss": 1.1211, + "step": 199700 + }, + { + "epoch": 0.18, + "grad_norm": 20.625, + "learning_rate": 4.151591772094949e-05, + "loss": 1.1441, + "step": 199800 + }, + { + "epoch": 0.18, + "grad_norm": 106.0, + "learning_rate": 4.1511418647758566e-05, + "loss": 1.2964, + "step": 199900 + }, + { + "epoch": 0.18, + "grad_norm": 114.5, + "learning_rate": 4.150691957456764e-05, + "loss": 1.074, + "step": 200000 + }, + { + "epoch": 0.18, + "grad_norm": 38.75, + "learning_rate": 4.1502420501376724e-05, + "loss": 1.1804, + "step": 200100 + }, + { + "epoch": 0.18, + "grad_norm": 44.0, + "learning_rate": 4.149792142818579e-05, + "loss": 1.2302, + "step": 200200 + }, + { + "epoch": 0.18, + "grad_norm": 19.0, + "learning_rate": 4.149342235499487e-05, + "loss": 1.2207, + "step": 200300 + }, + { + "epoch": 0.18, + "grad_norm": 1.03125, + "learning_rate": 4.148892328180395e-05, + "loss": 1.2023, + "step": 200400 + }, + { + "epoch": 0.18, + "grad_norm": 12.75, + "learning_rate": 4.1484424208613025e-05, + "loss": 1.4099, + "step": 200500 + }, + { + "epoch": 0.18, + "grad_norm": 32.75, + "learning_rate": 4.147992513542211e-05, + "loss": 1.2869, + "step": 200600 + }, + { + "epoch": 0.18, + "grad_norm": 73.5, + "learning_rate": 4.147542606223118e-05, + "loss": 1.3064, + "step": 200700 + }, + { + "epoch": 0.18, + "grad_norm": 32.25, + "learning_rate": 4.147092698904026e-05, + "loss": 1.1199, + "step": 200800 + }, + { + "epoch": 0.18, + "grad_norm": 58.75, + "learning_rate": 4.146642791584934e-05, + "loss": 1.1937, + "step": 200900 + }, + { + "epoch": 0.18, + "grad_norm": 18.5, + "learning_rate": 4.1461928842658415e-05, + "loss": 1.1972, + "step": 201000 + }, + { + "epoch": 0.18, + "grad_norm": 0.016845703125, + "learning_rate": 4.145742976946749e-05, + "loss": 1.1635, + "step": 201100 + }, + { + "epoch": 0.18, + "grad_norm": 18.375, + "learning_rate": 4.145293069627657e-05, + "loss": 1.2409, + "step": 201200 + }, + { + "epoch": 0.18, + "grad_norm": 48.25, + "learning_rate": 4.144843162308565e-05, + "loss": 1.1991, + "step": 201300 + }, + { + "epoch": 0.18, + "grad_norm": 45.75, + "learning_rate": 4.1443932549894723e-05, + "loss": 1.2935, + "step": 201400 + }, + { + "epoch": 0.18, + "grad_norm": 22.875, + "learning_rate": 4.14394334767038e-05, + "loss": 1.2154, + "step": 201500 + }, + { + "epoch": 0.18, + "grad_norm": 29.75, + "learning_rate": 4.1434934403512874e-05, + "loss": 1.1718, + "step": 201600 + }, + { + "epoch": 0.18, + "grad_norm": 19.75, + "learning_rate": 4.1430435330321956e-05, + "loss": 1.1829, + "step": 201700 + }, + { + "epoch": 0.18, + "grad_norm": 37.5, + "learning_rate": 4.142593625713103e-05, + "loss": 1.2179, + "step": 201800 + }, + { + "epoch": 0.18, + "grad_norm": 2.453125, + "learning_rate": 4.142143718394011e-05, + "loss": 1.1942, + "step": 201900 + }, + { + "epoch": 0.18, + "grad_norm": 50.75, + "learning_rate": 4.141693811074919e-05, + "loss": 1.034, + "step": 202000 + }, + { + "epoch": 0.18, + "grad_norm": 31.25, + "learning_rate": 4.1412439037558264e-05, + "loss": 1.1917, + "step": 202100 + }, + { + "epoch": 0.18, + "grad_norm": 40.75, + "learning_rate": 4.140793996436734e-05, + "loss": 1.2672, + "step": 202200 + }, + { + "epoch": 0.18, + "grad_norm": 94.5, + "learning_rate": 4.140344089117642e-05, + "loss": 1.1245, + "step": 202300 + }, + { + "epoch": 0.18, + "grad_norm": 42.5, + "learning_rate": 4.13989418179855e-05, + "loss": 1.2138, + "step": 202400 + }, + { + "epoch": 0.18, + "grad_norm": 52.75, + "learning_rate": 4.139444274479458e-05, + "loss": 1.0524, + "step": 202500 + }, + { + "epoch": 0.18, + "grad_norm": 40.5, + "learning_rate": 4.1389943671603655e-05, + "loss": 1.2177, + "step": 202600 + }, + { + "epoch": 0.18, + "grad_norm": 18.25, + "learning_rate": 4.138544459841273e-05, + "loss": 1.086, + "step": 202700 + }, + { + "epoch": 0.18, + "grad_norm": 23.625, + "learning_rate": 4.1380945525221805e-05, + "loss": 1.0988, + "step": 202800 + }, + { + "epoch": 0.18, + "grad_norm": 0.09375, + "learning_rate": 4.137644645203088e-05, + "loss": 1.1845, + "step": 202900 + }, + { + "epoch": 0.18, + "grad_norm": 0.00836181640625, + "learning_rate": 4.1371947378839956e-05, + "loss": 1.2906, + "step": 203000 + }, + { + "epoch": 0.18, + "grad_norm": 0.1318359375, + "learning_rate": 4.136744830564904e-05, + "loss": 1.1791, + "step": 203100 + }, + { + "epoch": 0.18, + "grad_norm": 28.75, + "learning_rate": 4.1362949232458113e-05, + "loss": 1.1983, + "step": 203200 + }, + { + "epoch": 0.18, + "grad_norm": 21.125, + "learning_rate": 4.1358450159267196e-05, + "loss": 1.2232, + "step": 203300 + }, + { + "epoch": 0.18, + "grad_norm": 23.5, + "learning_rate": 4.135395108607627e-05, + "loss": 1.1762, + "step": 203400 + }, + { + "epoch": 0.18, + "grad_norm": 86.5, + "learning_rate": 4.1349452012885346e-05, + "loss": 1.2481, + "step": 203500 + }, + { + "epoch": 0.18, + "grad_norm": 22.0, + "learning_rate": 4.134495293969443e-05, + "loss": 1.1948, + "step": 203600 + }, + { + "epoch": 0.18, + "grad_norm": 0.232421875, + "learning_rate": 4.1340453866503504e-05, + "loss": 1.2403, + "step": 203700 + }, + { + "epoch": 0.18, + "grad_norm": 27.25, + "learning_rate": 4.133595479331258e-05, + "loss": 1.2108, + "step": 203800 + }, + { + "epoch": 0.18, + "grad_norm": 218.0, + "learning_rate": 4.133145572012166e-05, + "loss": 1.2229, + "step": 203900 + }, + { + "epoch": 0.18, + "grad_norm": 0.07177734375, + "learning_rate": 4.1326956646930736e-05, + "loss": 1.0764, + "step": 204000 + }, + { + "epoch": 0.18, + "grad_norm": 21.875, + "learning_rate": 4.132245757373981e-05, + "loss": 1.2225, + "step": 204100 + }, + { + "epoch": 0.18, + "grad_norm": 16.0, + "learning_rate": 4.131795850054889e-05, + "loss": 1.098, + "step": 204200 + }, + { + "epoch": 0.18, + "grad_norm": 17.0, + "learning_rate": 4.131345942735796e-05, + "loss": 1.3416, + "step": 204300 + }, + { + "epoch": 0.18, + "grad_norm": 24.75, + "learning_rate": 4.1308960354167045e-05, + "loss": 1.153, + "step": 204400 + }, + { + "epoch": 0.18, + "grad_norm": 29.375, + "learning_rate": 4.130446128097612e-05, + "loss": 1.2474, + "step": 204500 + }, + { + "epoch": 0.18, + "grad_norm": 27.25, + "learning_rate": 4.1299962207785195e-05, + "loss": 1.078, + "step": 204600 + }, + { + "epoch": 0.18, + "grad_norm": 10.0, + "learning_rate": 4.129546313459428e-05, + "loss": 1.3034, + "step": 204700 + }, + { + "epoch": 0.18, + "grad_norm": 80.0, + "learning_rate": 4.129096406140335e-05, + "loss": 1.1916, + "step": 204800 + }, + { + "epoch": 0.18, + "grad_norm": 39.5, + "learning_rate": 4.128646498821243e-05, + "loss": 1.2762, + "step": 204900 + }, + { + "epoch": 0.18, + "grad_norm": 9.5625, + "learning_rate": 4.128196591502151e-05, + "loss": 1.1806, + "step": 205000 + }, + { + "epoch": 0.18, + "grad_norm": 66.5, + "learning_rate": 4.1277466841830586e-05, + "loss": 1.1549, + "step": 205100 + }, + { + "epoch": 0.18, + "grad_norm": 55.0, + "learning_rate": 4.127296776863967e-05, + "loss": 1.0268, + "step": 205200 + }, + { + "epoch": 0.18, + "grad_norm": 69.5, + "learning_rate": 4.126846869544874e-05, + "loss": 1.4482, + "step": 205300 + }, + { + "epoch": 0.18, + "grad_norm": 56.25, + "learning_rate": 4.126396962225781e-05, + "loss": 1.1934, + "step": 205400 + }, + { + "epoch": 0.18, + "grad_norm": 38.25, + "learning_rate": 4.1259470549066894e-05, + "loss": 1.2673, + "step": 205500 + }, + { + "epoch": 0.18, + "grad_norm": 72.0, + "learning_rate": 4.125497147587597e-05, + "loss": 1.2086, + "step": 205600 + }, + { + "epoch": 0.18, + "grad_norm": 234.0, + "learning_rate": 4.1250472402685044e-05, + "loss": 1.3153, + "step": 205700 + }, + { + "epoch": 0.18, + "grad_norm": 45.25, + "learning_rate": 4.1245973329494126e-05, + "loss": 1.1355, + "step": 205800 + }, + { + "epoch": 0.18, + "grad_norm": 17.5, + "learning_rate": 4.12414742563032e-05, + "loss": 1.0873, + "step": 205900 + }, + { + "epoch": 0.18, + "grad_norm": 42.5, + "learning_rate": 4.1236975183112284e-05, + "loss": 1.1588, + "step": 206000 + }, + { + "epoch": 0.18, + "grad_norm": 21.625, + "learning_rate": 4.123247610992136e-05, + "loss": 1.0837, + "step": 206100 + }, + { + "epoch": 0.18, + "grad_norm": 20.5, + "learning_rate": 4.1227977036730435e-05, + "loss": 1.102, + "step": 206200 + }, + { + "epoch": 0.18, + "grad_norm": 37.25, + "learning_rate": 4.122347796353952e-05, + "loss": 1.1567, + "step": 206300 + }, + { + "epoch": 0.18, + "grad_norm": 140.0, + "learning_rate": 4.121897889034859e-05, + "loss": 1.1777, + "step": 206400 + }, + { + "epoch": 0.18, + "grad_norm": 21.875, + "learning_rate": 4.121447981715767e-05, + "loss": 1.1418, + "step": 206500 + }, + { + "epoch": 0.18, + "grad_norm": 16.125, + "learning_rate": 4.120998074396675e-05, + "loss": 1.165, + "step": 206600 + }, + { + "epoch": 0.18, + "grad_norm": 70.0, + "learning_rate": 4.120548167077582e-05, + "loss": 1.1417, + "step": 206700 + }, + { + "epoch": 0.18, + "grad_norm": 340.0, + "learning_rate": 4.12009825975849e-05, + "loss": 1.2375, + "step": 206800 + }, + { + "epoch": 0.18, + "grad_norm": 37.25, + "learning_rate": 4.1196483524393976e-05, + "loss": 1.3143, + "step": 206900 + }, + { + "epoch": 0.18, + "grad_norm": 9.125, + "learning_rate": 4.119198445120305e-05, + "loss": 1.1753, + "step": 207000 + }, + { + "epoch": 0.18, + "grad_norm": 0.006439208984375, + "learning_rate": 4.118748537801213e-05, + "loss": 1.1464, + "step": 207100 + }, + { + "epoch": 0.18, + "grad_norm": 98.0, + "learning_rate": 4.118298630482121e-05, + "loss": 1.2333, + "step": 207200 + }, + { + "epoch": 0.18, + "grad_norm": 29.875, + "learning_rate": 4.1178487231630284e-05, + "loss": 1.0748, + "step": 207300 + }, + { + "epoch": 0.18, + "grad_norm": 19.875, + "learning_rate": 4.1173988158439366e-05, + "loss": 1.1062, + "step": 207400 + }, + { + "epoch": 0.18, + "grad_norm": 0.2578125, + "learning_rate": 4.116948908524844e-05, + "loss": 1.2516, + "step": 207500 + }, + { + "epoch": 0.18, + "grad_norm": 48.5, + "learning_rate": 4.1164990012057516e-05, + "loss": 1.1372, + "step": 207600 + }, + { + "epoch": 0.19, + "grad_norm": 196.0, + "learning_rate": 4.11604909388666e-05, + "loss": 1.2264, + "step": 207700 + }, + { + "epoch": 0.19, + "grad_norm": 14.75, + "learning_rate": 4.1155991865675674e-05, + "loss": 1.1682, + "step": 207800 + }, + { + "epoch": 0.19, + "grad_norm": 27.125, + "learning_rate": 4.1151492792484756e-05, + "loss": 1.2962, + "step": 207900 + }, + { + "epoch": 0.19, + "grad_norm": 95.5, + "learning_rate": 4.1146993719293825e-05, + "loss": 1.3444, + "step": 208000 + }, + { + "epoch": 0.19, + "grad_norm": 6.28125, + "learning_rate": 4.11424946461029e-05, + "loss": 1.2622, + "step": 208100 + }, + { + "epoch": 0.19, + "grad_norm": 0.0250244140625, + "learning_rate": 4.113799557291198e-05, + "loss": 1.2264, + "step": 208200 + }, + { + "epoch": 0.19, + "grad_norm": 7.03125, + "learning_rate": 4.113349649972106e-05, + "loss": 1.0497, + "step": 208300 + }, + { + "epoch": 0.19, + "grad_norm": 145.0, + "learning_rate": 4.112899742653013e-05, + "loss": 1.1898, + "step": 208400 + }, + { + "epoch": 0.19, + "grad_norm": 2.359375, + "learning_rate": 4.1124498353339215e-05, + "loss": 1.5019, + "step": 208500 + }, + { + "epoch": 0.19, + "grad_norm": 29.875, + "learning_rate": 4.111999928014829e-05, + "loss": 1.0873, + "step": 208600 + }, + { + "epoch": 0.19, + "grad_norm": 37.25, + "learning_rate": 4.111550020695737e-05, + "loss": 1.3849, + "step": 208700 + }, + { + "epoch": 0.19, + "grad_norm": 37.0, + "learning_rate": 4.111100113376645e-05, + "loss": 1.1838, + "step": 208800 + }, + { + "epoch": 0.19, + "grad_norm": 97.0, + "learning_rate": 4.110650206057552e-05, + "loss": 1.1726, + "step": 208900 + }, + { + "epoch": 0.19, + "grad_norm": 15.3125, + "learning_rate": 4.1102002987384605e-05, + "loss": 1.1309, + "step": 209000 + }, + { + "epoch": 0.19, + "grad_norm": 42.0, + "learning_rate": 4.109750391419368e-05, + "loss": 1.2736, + "step": 209100 + }, + { + "epoch": 0.19, + "grad_norm": 77.5, + "learning_rate": 4.1093004841002756e-05, + "loss": 1.2169, + "step": 209200 + }, + { + "epoch": 0.19, + "grad_norm": 8.25, + "learning_rate": 4.108850576781183e-05, + "loss": 1.2611, + "step": 209300 + }, + { + "epoch": 0.19, + "grad_norm": 0.00958251953125, + "learning_rate": 4.1084006694620906e-05, + "loss": 1.1536, + "step": 209400 + }, + { + "epoch": 0.19, + "grad_norm": 23.0, + "learning_rate": 4.107950762142999e-05, + "loss": 1.1543, + "step": 209500 + }, + { + "epoch": 0.19, + "grad_norm": 21.375, + "learning_rate": 4.1075008548239064e-05, + "loss": 1.1865, + "step": 209600 + }, + { + "epoch": 0.19, + "grad_norm": 8.3125, + "learning_rate": 4.107050947504814e-05, + "loss": 1.162, + "step": 209700 + }, + { + "epoch": 0.19, + "grad_norm": 7.46875, + "learning_rate": 4.106601040185722e-05, + "loss": 1.2374, + "step": 209800 + }, + { + "epoch": 0.19, + "grad_norm": 436.0, + "learning_rate": 4.10615113286663e-05, + "loss": 1.0108, + "step": 209900 + }, + { + "epoch": 0.19, + "grad_norm": 34.25, + "learning_rate": 4.105701225547537e-05, + "loss": 1.1299, + "step": 210000 + }, + { + "epoch": 0.19, + "grad_norm": 0.25390625, + "learning_rate": 4.1052513182284454e-05, + "loss": 1.0734, + "step": 210100 + }, + { + "epoch": 0.19, + "grad_norm": 29.75, + "learning_rate": 4.104801410909353e-05, + "loss": 1.1698, + "step": 210200 + }, + { + "epoch": 0.19, + "grad_norm": 44.0, + "learning_rate": 4.1043515035902605e-05, + "loss": 1.2316, + "step": 210300 + }, + { + "epoch": 0.19, + "grad_norm": 60.0, + "learning_rate": 4.103901596271169e-05, + "loss": 1.1693, + "step": 210400 + }, + { + "epoch": 0.19, + "grad_norm": 124.5, + "learning_rate": 4.103451688952076e-05, + "loss": 1.1764, + "step": 210500 + }, + { + "epoch": 0.19, + "grad_norm": 0.056396484375, + "learning_rate": 4.103001781632984e-05, + "loss": 1.254, + "step": 210600 + }, + { + "epoch": 0.19, + "grad_norm": 47.5, + "learning_rate": 4.102551874313891e-05, + "loss": 1.0526, + "step": 210700 + }, + { + "epoch": 0.19, + "grad_norm": 16.125, + "learning_rate": 4.102101966994799e-05, + "loss": 1.3679, + "step": 210800 + }, + { + "epoch": 0.19, + "grad_norm": 24.25, + "learning_rate": 4.101652059675707e-05, + "loss": 1.2325, + "step": 210900 + }, + { + "epoch": 0.19, + "grad_norm": 24.25, + "learning_rate": 4.1012021523566146e-05, + "loss": 1.199, + "step": 211000 + }, + { + "epoch": 0.19, + "grad_norm": 0.036865234375, + "learning_rate": 4.100752245037522e-05, + "loss": 1.0996, + "step": 211100 + }, + { + "epoch": 0.19, + "grad_norm": 0.05322265625, + "learning_rate": 4.10030233771843e-05, + "loss": 1.1855, + "step": 211200 + }, + { + "epoch": 0.19, + "grad_norm": 19.375, + "learning_rate": 4.099852430399338e-05, + "loss": 1.3936, + "step": 211300 + }, + { + "epoch": 0.19, + "grad_norm": 19.0, + "learning_rate": 4.099402523080246e-05, + "loss": 1.2044, + "step": 211400 + }, + { + "epoch": 0.19, + "grad_norm": 56.5, + "learning_rate": 4.0989526157611536e-05, + "loss": 1.1683, + "step": 211500 + }, + { + "epoch": 0.19, + "grad_norm": 0.36328125, + "learning_rate": 4.098502708442061e-05, + "loss": 1.2577, + "step": 211600 + }, + { + "epoch": 0.19, + "grad_norm": 9.9375, + "learning_rate": 4.0980528011229693e-05, + "loss": 1.2948, + "step": 211700 + }, + { + "epoch": 0.19, + "grad_norm": 28.125, + "learning_rate": 4.097602893803877e-05, + "loss": 1.2553, + "step": 211800 + }, + { + "epoch": 0.19, + "grad_norm": 56.0, + "learning_rate": 4.097152986484784e-05, + "loss": 1.2418, + "step": 211900 + }, + { + "epoch": 0.19, + "grad_norm": 11.5625, + "learning_rate": 4.096703079165692e-05, + "loss": 1.0349, + "step": 212000 + }, + { + "epoch": 0.19, + "grad_norm": 18.375, + "learning_rate": 4.0962531718465995e-05, + "loss": 1.2584, + "step": 212100 + }, + { + "epoch": 0.19, + "grad_norm": 22.0, + "learning_rate": 4.095803264527508e-05, + "loss": 1.154, + "step": 212200 + }, + { + "epoch": 0.19, + "grad_norm": 40.5, + "learning_rate": 4.095353357208415e-05, + "loss": 1.231, + "step": 212300 + }, + { + "epoch": 0.19, + "grad_norm": 11.1875, + "learning_rate": 4.094903449889323e-05, + "loss": 1.2227, + "step": 212400 + }, + { + "epoch": 0.19, + "grad_norm": 53.75, + "learning_rate": 4.094453542570231e-05, + "loss": 1.1749, + "step": 212500 + }, + { + "epoch": 0.19, + "grad_norm": 24.125, + "learning_rate": 4.0940036352511385e-05, + "loss": 1.0617, + "step": 212600 + }, + { + "epoch": 0.19, + "grad_norm": 26.375, + "learning_rate": 4.093553727932046e-05, + "loss": 1.3517, + "step": 212700 + }, + { + "epoch": 0.19, + "grad_norm": 18.125, + "learning_rate": 4.093103820612954e-05, + "loss": 1.2856, + "step": 212800 + }, + { + "epoch": 0.19, + "grad_norm": 19.75, + "learning_rate": 4.092653913293862e-05, + "loss": 1.0325, + "step": 212900 + }, + { + "epoch": 0.19, + "grad_norm": 196.0, + "learning_rate": 4.092204005974769e-05, + "loss": 1.2372, + "step": 213000 + }, + { + "epoch": 0.19, + "grad_norm": 9.9375, + "learning_rate": 4.0917540986556775e-05, + "loss": 1.1683, + "step": 213100 + }, + { + "epoch": 0.19, + "grad_norm": 14.875, + "learning_rate": 4.0913041913365844e-05, + "loss": 1.3145, + "step": 213200 + }, + { + "epoch": 0.19, + "grad_norm": 10.0625, + "learning_rate": 4.0908542840174926e-05, + "loss": 1.156, + "step": 213300 + }, + { + "epoch": 0.19, + "grad_norm": 5.8125, + "learning_rate": 4.0904043766984e-05, + "loss": 1.198, + "step": 213400 + }, + { + "epoch": 0.19, + "grad_norm": 56.0, + "learning_rate": 4.089954469379308e-05, + "loss": 1.2882, + "step": 213500 + }, + { + "epoch": 0.19, + "grad_norm": 13.4375, + "learning_rate": 4.089504562060216e-05, + "loss": 1.2905, + "step": 213600 + }, + { + "epoch": 0.19, + "grad_norm": 16.875, + "learning_rate": 4.0890546547411234e-05, + "loss": 1.2709, + "step": 213700 + }, + { + "epoch": 0.19, + "grad_norm": 0.2060546875, + "learning_rate": 4.088604747422031e-05, + "loss": 1.1773, + "step": 213800 + }, + { + "epoch": 0.19, + "grad_norm": 0.134765625, + "learning_rate": 4.088154840102939e-05, + "loss": 1.0958, + "step": 213900 + }, + { + "epoch": 0.19, + "grad_norm": 0.333984375, + "learning_rate": 4.087704932783847e-05, + "loss": 1.0608, + "step": 214000 + }, + { + "epoch": 0.19, + "grad_norm": 132.0, + "learning_rate": 4.087255025464755e-05, + "loss": 1.0888, + "step": 214100 + }, + { + "epoch": 0.19, + "grad_norm": 114.5, + "learning_rate": 4.0868051181456624e-05, + "loss": 0.9883, + "step": 214200 + }, + { + "epoch": 0.19, + "grad_norm": 60.25, + "learning_rate": 4.08635521082657e-05, + "loss": 1.2059, + "step": 214300 + }, + { + "epoch": 0.19, + "grad_norm": 14.25, + "learning_rate": 4.085905303507478e-05, + "loss": 1.1498, + "step": 214400 + }, + { + "epoch": 0.19, + "grad_norm": 26.375, + "learning_rate": 4.085455396188385e-05, + "loss": 1.0384, + "step": 214500 + }, + { + "epoch": 0.19, + "grad_norm": 22.25, + "learning_rate": 4.0850054888692926e-05, + "loss": 1.2129, + "step": 214600 + }, + { + "epoch": 0.19, + "grad_norm": 26.875, + "learning_rate": 4.084555581550201e-05, + "loss": 1.2073, + "step": 214700 + }, + { + "epoch": 0.19, + "grad_norm": 25.25, + "learning_rate": 4.084105674231108e-05, + "loss": 1.1643, + "step": 214800 + }, + { + "epoch": 0.19, + "grad_norm": 29.5, + "learning_rate": 4.0836557669120165e-05, + "loss": 1.213, + "step": 214900 + }, + { + "epoch": 0.19, + "grad_norm": 76.5, + "learning_rate": 4.083205859592924e-05, + "loss": 1.298, + "step": 215000 + }, + { + "epoch": 0.19, + "grad_norm": 39.75, + "learning_rate": 4.0827559522738316e-05, + "loss": 1.1712, + "step": 215100 + }, + { + "epoch": 0.19, + "grad_norm": 11.0625, + "learning_rate": 4.08230604495474e-05, + "loss": 1.2301, + "step": 215200 + }, + { + "epoch": 0.19, + "grad_norm": 12.4375, + "learning_rate": 4.0818561376356473e-05, + "loss": 1.2684, + "step": 215300 + }, + { + "epoch": 0.19, + "grad_norm": 0.447265625, + "learning_rate": 4.081406230316555e-05, + "loss": 1.1148, + "step": 215400 + }, + { + "epoch": 0.19, + "grad_norm": 245.0, + "learning_rate": 4.080956322997463e-05, + "loss": 1.2759, + "step": 215500 + }, + { + "epoch": 0.19, + "grad_norm": 145.0, + "learning_rate": 4.0805064156783706e-05, + "loss": 1.0697, + "step": 215600 + }, + { + "epoch": 0.19, + "grad_norm": 44.75, + "learning_rate": 4.080056508359278e-05, + "loss": 1.1672, + "step": 215700 + }, + { + "epoch": 0.19, + "grad_norm": 31.125, + "learning_rate": 4.079606601040186e-05, + "loss": 1.2312, + "step": 215800 + }, + { + "epoch": 0.19, + "grad_norm": 27.125, + "learning_rate": 4.079156693721093e-05, + "loss": 1.1621, + "step": 215900 + }, + { + "epoch": 0.19, + "grad_norm": 14.4375, + "learning_rate": 4.0787067864020014e-05, + "loss": 1.1509, + "step": 216000 + }, + { + "epoch": 0.19, + "grad_norm": 116.5, + "learning_rate": 4.078256879082909e-05, + "loss": 1.3298, + "step": 216100 + }, + { + "epoch": 0.19, + "grad_norm": 23.625, + "learning_rate": 4.0778069717638165e-05, + "loss": 1.0676, + "step": 216200 + }, + { + "epoch": 0.19, + "grad_norm": 8.9375, + "learning_rate": 4.077357064444725e-05, + "loss": 1.3096, + "step": 216300 + }, + { + "epoch": 0.19, + "grad_norm": 23.375, + "learning_rate": 4.076907157125632e-05, + "loss": 1.1852, + "step": 216400 + }, + { + "epoch": 0.19, + "grad_norm": 27.125, + "learning_rate": 4.07645724980654e-05, + "loss": 1.0775, + "step": 216500 + }, + { + "epoch": 0.19, + "grad_norm": 13.4375, + "learning_rate": 4.076007342487448e-05, + "loss": 1.3339, + "step": 216600 + }, + { + "epoch": 0.19, + "grad_norm": 254.0, + "learning_rate": 4.0755574351683555e-05, + "loss": 1.1699, + "step": 216700 + }, + { + "epoch": 0.19, + "grad_norm": 442.0, + "learning_rate": 4.075107527849264e-05, + "loss": 1.1621, + "step": 216800 + }, + { + "epoch": 0.19, + "grad_norm": 19.125, + "learning_rate": 4.074657620530171e-05, + "loss": 1.1292, + "step": 216900 + }, + { + "epoch": 0.19, + "grad_norm": 13.125, + "learning_rate": 4.074207713211079e-05, + "loss": 1.2155, + "step": 217000 + }, + { + "epoch": 0.19, + "grad_norm": 32.5, + "learning_rate": 4.0737578058919863e-05, + "loss": 1.2114, + "step": 217100 + }, + { + "epoch": 0.19, + "grad_norm": 37.75, + "learning_rate": 4.073307898572894e-05, + "loss": 1.2032, + "step": 217200 + }, + { + "epoch": 0.19, + "grad_norm": 18.25, + "learning_rate": 4.0728579912538014e-05, + "loss": 1.2392, + "step": 217300 + }, + { + "epoch": 0.19, + "grad_norm": 116.5, + "learning_rate": 4.0724080839347096e-05, + "loss": 1.1625, + "step": 217400 + }, + { + "epoch": 0.19, + "grad_norm": 10.25, + "learning_rate": 4.071958176615617e-05, + "loss": 1.2586, + "step": 217500 + }, + { + "epoch": 0.19, + "grad_norm": 0.265625, + "learning_rate": 4.0715082692965254e-05, + "loss": 1.0202, + "step": 217600 + }, + { + "epoch": 0.19, + "grad_norm": 79.5, + "learning_rate": 4.071058361977433e-05, + "loss": 1.1353, + "step": 217700 + }, + { + "epoch": 0.19, + "grad_norm": 89.5, + "learning_rate": 4.0706084546583404e-05, + "loss": 1.3959, + "step": 217800 + }, + { + "epoch": 0.19, + "grad_norm": 22.0, + "learning_rate": 4.0701585473392486e-05, + "loss": 1.2321, + "step": 217900 + }, + { + "epoch": 0.19, + "grad_norm": 43.75, + "learning_rate": 4.069708640020156e-05, + "loss": 1.1816, + "step": 218000 + }, + { + "epoch": 0.19, + "grad_norm": 47.0, + "learning_rate": 4.069258732701064e-05, + "loss": 1.2895, + "step": 218100 + }, + { + "epoch": 0.19, + "grad_norm": 87.5, + "learning_rate": 4.068808825381972e-05, + "loss": 0.9913, + "step": 218200 + }, + { + "epoch": 0.19, + "grad_norm": 202.0, + "learning_rate": 4.0683589180628795e-05, + "loss": 1.2628, + "step": 218300 + }, + { + "epoch": 0.19, + "grad_norm": 21.875, + "learning_rate": 4.067909010743787e-05, + "loss": 1.1928, + "step": 218400 + }, + { + "epoch": 0.19, + "grad_norm": 0.0947265625, + "learning_rate": 4.0674591034246945e-05, + "loss": 1.2083, + "step": 218500 + }, + { + "epoch": 0.19, + "grad_norm": 2.828125, + "learning_rate": 4.067009196105602e-05, + "loss": 1.3128, + "step": 218600 + }, + { + "epoch": 0.19, + "grad_norm": 100.5, + "learning_rate": 4.06655928878651e-05, + "loss": 1.1397, + "step": 218700 + }, + { + "epoch": 0.19, + "grad_norm": 0.11865234375, + "learning_rate": 4.066109381467418e-05, + "loss": 1.2369, + "step": 218800 + }, + { + "epoch": 0.19, + "grad_norm": 8.625, + "learning_rate": 4.0656594741483253e-05, + "loss": 1.0596, + "step": 218900 + }, + { + "epoch": 0.2, + "grad_norm": 18.875, + "learning_rate": 4.0652095668292336e-05, + "loss": 1.1393, + "step": 219000 + }, + { + "epoch": 0.2, + "grad_norm": 33.25, + "learning_rate": 4.064759659510141e-05, + "loss": 1.1817, + "step": 219100 + }, + { + "epoch": 0.2, + "grad_norm": 79.5, + "learning_rate": 4.0643097521910486e-05, + "loss": 1.2787, + "step": 219200 + }, + { + "epoch": 0.2, + "grad_norm": 29.5, + "learning_rate": 4.063859844871957e-05, + "loss": 1.265, + "step": 219300 + }, + { + "epoch": 0.2, + "grad_norm": 0.024658203125, + "learning_rate": 4.0634099375528644e-05, + "loss": 1.2237, + "step": 219400 + }, + { + "epoch": 0.2, + "grad_norm": 59.5, + "learning_rate": 4.0629600302337726e-05, + "loss": 1.316, + "step": 219500 + }, + { + "epoch": 0.2, + "grad_norm": 69.0, + "learning_rate": 4.06251012291468e-05, + "loss": 1.0299, + "step": 219600 + }, + { + "epoch": 0.2, + "grad_norm": 20.375, + "learning_rate": 4.062060215595587e-05, + "loss": 1.1371, + "step": 219700 + }, + { + "epoch": 0.2, + "grad_norm": 0.326171875, + "learning_rate": 4.061610308276495e-05, + "loss": 1.2016, + "step": 219800 + }, + { + "epoch": 0.2, + "grad_norm": 111.5, + "learning_rate": 4.061160400957403e-05, + "loss": 1.2443, + "step": 219900 + }, + { + "epoch": 0.2, + "grad_norm": 25.75, + "learning_rate": 4.06071049363831e-05, + "loss": 1.077, + "step": 220000 + }, + { + "epoch": 0.2, + "grad_norm": 28.125, + "learning_rate": 4.0602605863192185e-05, + "loss": 1.2198, + "step": 220100 + }, + { + "epoch": 0.2, + "grad_norm": 23.125, + "learning_rate": 4.059810679000126e-05, + "loss": 1.164, + "step": 220200 + }, + { + "epoch": 0.2, + "grad_norm": 72.5, + "learning_rate": 4.059360771681034e-05, + "loss": 1.0561, + "step": 220300 + }, + { + "epoch": 0.2, + "grad_norm": 26.25, + "learning_rate": 4.058910864361942e-05, + "loss": 1.1069, + "step": 220400 + }, + { + "epoch": 0.2, + "grad_norm": 49.5, + "learning_rate": 4.058460957042849e-05, + "loss": 1.2199, + "step": 220500 + }, + { + "epoch": 0.2, + "grad_norm": 25.25, + "learning_rate": 4.0580110497237575e-05, + "loss": 1.1114, + "step": 220600 + }, + { + "epoch": 0.2, + "grad_norm": 47.0, + "learning_rate": 4.057561142404665e-05, + "loss": 1.3102, + "step": 220700 + }, + { + "epoch": 0.2, + "grad_norm": 37.25, + "learning_rate": 4.0571112350855726e-05, + "loss": 1.293, + "step": 220800 + }, + { + "epoch": 0.2, + "grad_norm": 88.5, + "learning_rate": 4.056661327766481e-05, + "loss": 1.1002, + "step": 220900 + }, + { + "epoch": 0.2, + "grad_norm": 53.5, + "learning_rate": 4.0562114204473876e-05, + "loss": 1.2794, + "step": 221000 + }, + { + "epoch": 0.2, + "grad_norm": 33.0, + "learning_rate": 4.055761513128296e-05, + "loss": 1.3038, + "step": 221100 + }, + { + "epoch": 0.2, + "grad_norm": 63.25, + "learning_rate": 4.0553116058092034e-05, + "loss": 1.2728, + "step": 221200 + }, + { + "epoch": 0.2, + "grad_norm": 0.08447265625, + "learning_rate": 4.054861698490111e-05, + "loss": 1.2417, + "step": 221300 + }, + { + "epoch": 0.2, + "grad_norm": 13.3125, + "learning_rate": 4.054411791171019e-05, + "loss": 1.3289, + "step": 221400 + }, + { + "epoch": 0.2, + "grad_norm": 8.8125, + "learning_rate": 4.0539618838519266e-05, + "loss": 1.0395, + "step": 221500 + }, + { + "epoch": 0.2, + "grad_norm": 76.5, + "learning_rate": 4.053511976532834e-05, + "loss": 1.0644, + "step": 221600 + }, + { + "epoch": 0.2, + "grad_norm": 14.3125, + "learning_rate": 4.0530620692137424e-05, + "loss": 1.1834, + "step": 221700 + }, + { + "epoch": 0.2, + "grad_norm": 52.0, + "learning_rate": 4.05261216189465e-05, + "loss": 1.102, + "step": 221800 + }, + { + "epoch": 0.2, + "grad_norm": 13.1875, + "learning_rate": 4.0521622545755575e-05, + "loss": 1.2857, + "step": 221900 + }, + { + "epoch": 0.2, + "grad_norm": 116.0, + "learning_rate": 4.051712347256466e-05, + "loss": 1.1668, + "step": 222000 + }, + { + "epoch": 0.2, + "grad_norm": 5.25, + "learning_rate": 4.051262439937373e-05, + "loss": 1.2486, + "step": 222100 + }, + { + "epoch": 0.2, + "grad_norm": 28.5, + "learning_rate": 4.0508125326182814e-05, + "loss": 1.0911, + "step": 222200 + }, + { + "epoch": 0.2, + "grad_norm": 23.0, + "learning_rate": 4.050362625299188e-05, + "loss": 1.171, + "step": 222300 + }, + { + "epoch": 0.2, + "grad_norm": 50.5, + "learning_rate": 4.049912717980096e-05, + "loss": 1.2212, + "step": 222400 + }, + { + "epoch": 0.2, + "grad_norm": 23.0, + "learning_rate": 4.049462810661004e-05, + "loss": 1.3627, + "step": 222500 + }, + { + "epoch": 0.2, + "grad_norm": 19.75, + "learning_rate": 4.0490129033419116e-05, + "loss": 1.272, + "step": 222600 + }, + { + "epoch": 0.2, + "grad_norm": 22.5, + "learning_rate": 4.048562996022819e-05, + "loss": 1.1911, + "step": 222700 + }, + { + "epoch": 0.2, + "grad_norm": 66.5, + "learning_rate": 4.048113088703727e-05, + "loss": 1.2106, + "step": 222800 + }, + { + "epoch": 0.2, + "grad_norm": 33.75, + "learning_rate": 4.047663181384635e-05, + "loss": 1.2051, + "step": 222900 + }, + { + "epoch": 0.2, + "grad_norm": 55.75, + "learning_rate": 4.047213274065543e-05, + "loss": 1.0809, + "step": 223000 + }, + { + "epoch": 0.2, + "grad_norm": 61.75, + "learning_rate": 4.0467633667464506e-05, + "loss": 1.2241, + "step": 223100 + }, + { + "epoch": 0.2, + "grad_norm": 15.625, + "learning_rate": 4.046313459427358e-05, + "loss": 1.1545, + "step": 223200 + }, + { + "epoch": 0.2, + "grad_norm": 18.75, + "learning_rate": 4.045863552108266e-05, + "loss": 1.0844, + "step": 223300 + }, + { + "epoch": 0.2, + "grad_norm": 14.0, + "learning_rate": 4.045413644789174e-05, + "loss": 1.2074, + "step": 223400 + }, + { + "epoch": 0.2, + "grad_norm": 8.75, + "learning_rate": 4.0449637374700814e-05, + "loss": 1.2002, + "step": 223500 + }, + { + "epoch": 0.2, + "grad_norm": 9.5, + "learning_rate": 4.044513830150989e-05, + "loss": 1.276, + "step": 223600 + }, + { + "epoch": 0.2, + "grad_norm": 83.0, + "learning_rate": 4.0440639228318965e-05, + "loss": 1.2091, + "step": 223700 + }, + { + "epoch": 0.2, + "grad_norm": 16.0, + "learning_rate": 4.043614015512805e-05, + "loss": 1.2641, + "step": 223800 + }, + { + "epoch": 0.2, + "grad_norm": 11.5625, + "learning_rate": 4.043164108193712e-05, + "loss": 1.1295, + "step": 223900 + }, + { + "epoch": 0.2, + "grad_norm": 362.0, + "learning_rate": 4.04271420087462e-05, + "loss": 1.1258, + "step": 224000 + }, + { + "epoch": 0.2, + "grad_norm": 44.5, + "learning_rate": 4.042264293555528e-05, + "loss": 1.1873, + "step": 224100 + }, + { + "epoch": 0.2, + "grad_norm": 32.0, + "learning_rate": 4.0418143862364355e-05, + "loss": 1.2692, + "step": 224200 + }, + { + "epoch": 0.2, + "grad_norm": 26.5, + "learning_rate": 4.041364478917343e-05, + "loss": 1.1327, + "step": 224300 + }, + { + "epoch": 0.2, + "grad_norm": 5.90625, + "learning_rate": 4.040914571598251e-05, + "loss": 1.3219, + "step": 224400 + }, + { + "epoch": 0.2, + "grad_norm": 1.4375, + "learning_rate": 4.040464664279159e-05, + "loss": 1.2652, + "step": 224500 + }, + { + "epoch": 0.2, + "grad_norm": 45.0, + "learning_rate": 4.040014756960066e-05, + "loss": 1.2871, + "step": 224600 + }, + { + "epoch": 0.2, + "grad_norm": 8.125, + "learning_rate": 4.0395648496409745e-05, + "loss": 1.1212, + "step": 224700 + }, + { + "epoch": 0.2, + "grad_norm": 20.5, + "learning_rate": 4.039114942321882e-05, + "loss": 1.0369, + "step": 224800 + }, + { + "epoch": 0.2, + "grad_norm": 33.5, + "learning_rate": 4.0386650350027896e-05, + "loss": 1.2802, + "step": 224900 + }, + { + "epoch": 0.2, + "grad_norm": 0.0026397705078125, + "learning_rate": 4.038215127683697e-05, + "loss": 1.1381, + "step": 225000 + }, + { + "epoch": 0.2, + "grad_norm": 7.15625, + "learning_rate": 4.0377652203646046e-05, + "loss": 1.3793, + "step": 225100 + }, + { + "epoch": 0.2, + "grad_norm": 19.125, + "learning_rate": 4.037315313045513e-05, + "loss": 1.2149, + "step": 225200 + }, + { + "epoch": 0.2, + "grad_norm": 21.75, + "learning_rate": 4.0368654057264204e-05, + "loss": 1.2249, + "step": 225300 + }, + { + "epoch": 0.2, + "grad_norm": 38.75, + "learning_rate": 4.036415498407328e-05, + "loss": 1.2624, + "step": 225400 + }, + { + "epoch": 0.2, + "grad_norm": 146.0, + "learning_rate": 4.035965591088236e-05, + "loss": 1.3057, + "step": 225500 + }, + { + "epoch": 0.2, + "grad_norm": 0.030029296875, + "learning_rate": 4.035515683769144e-05, + "loss": 1.1855, + "step": 225600 + }, + { + "epoch": 0.2, + "grad_norm": 18.125, + "learning_rate": 4.035065776450052e-05, + "loss": 1.0747, + "step": 225700 + }, + { + "epoch": 0.2, + "grad_norm": 24.75, + "learning_rate": 4.0346158691309594e-05, + "loss": 1.2749, + "step": 225800 + }, + { + "epoch": 0.2, + "grad_norm": 3.921875, + "learning_rate": 4.034165961811867e-05, + "loss": 1.2182, + "step": 225900 + }, + { + "epoch": 0.2, + "grad_norm": 30.25, + "learning_rate": 4.033716054492775e-05, + "loss": 1.2729, + "step": 226000 + }, + { + "epoch": 0.2, + "grad_norm": 33.75, + "learning_rate": 4.033266147173683e-05, + "loss": 1.1403, + "step": 226100 + }, + { + "epoch": 0.2, + "grad_norm": 298.0, + "learning_rate": 4.03281623985459e-05, + "loss": 1.3445, + "step": 226200 + }, + { + "epoch": 0.2, + "grad_norm": 350.0, + "learning_rate": 4.032366332535498e-05, + "loss": 1.2373, + "step": 226300 + }, + { + "epoch": 0.2, + "grad_norm": 18.0, + "learning_rate": 4.031916425216405e-05, + "loss": 1.2, + "step": 226400 + }, + { + "epoch": 0.2, + "grad_norm": 5.25, + "learning_rate": 4.0314665178973135e-05, + "loss": 1.3136, + "step": 226500 + }, + { + "epoch": 0.2, + "grad_norm": 0.1640625, + "learning_rate": 4.031016610578221e-05, + "loss": 1.2127, + "step": 226600 + }, + { + "epoch": 0.2, + "grad_norm": 20.375, + "learning_rate": 4.0305667032591286e-05, + "loss": 1.1985, + "step": 226700 + }, + { + "epoch": 0.2, + "grad_norm": 1.8359375, + "learning_rate": 4.030116795940037e-05, + "loss": 1.2223, + "step": 226800 + }, + { + "epoch": 0.2, + "grad_norm": 2.96875, + "learning_rate": 4.029666888620944e-05, + "loss": 1.2426, + "step": 226900 + }, + { + "epoch": 0.2, + "grad_norm": 0.00701904296875, + "learning_rate": 4.029216981301852e-05, + "loss": 1.1825, + "step": 227000 + }, + { + "epoch": 0.2, + "grad_norm": 0.55078125, + "learning_rate": 4.02876707398276e-05, + "loss": 1.3144, + "step": 227100 + }, + { + "epoch": 0.2, + "grad_norm": 21.875, + "learning_rate": 4.0283171666636676e-05, + "loss": 1.3405, + "step": 227200 + }, + { + "epoch": 0.2, + "grad_norm": 0.058349609375, + "learning_rate": 4.027867259344575e-05, + "loss": 1.1609, + "step": 227300 + }, + { + "epoch": 0.2, + "grad_norm": 43.25, + "learning_rate": 4.0274173520254833e-05, + "loss": 1.0487, + "step": 227400 + }, + { + "epoch": 0.2, + "grad_norm": 101.0, + "learning_rate": 4.02696744470639e-05, + "loss": 1.2605, + "step": 227500 + }, + { + "epoch": 0.2, + "grad_norm": 11.125, + "learning_rate": 4.0265175373872984e-05, + "loss": 1.0055, + "step": 227600 + }, + { + "epoch": 0.2, + "grad_norm": 52.25, + "learning_rate": 4.026067630068206e-05, + "loss": 1.1338, + "step": 227700 + }, + { + "epoch": 0.2, + "grad_norm": 26.75, + "learning_rate": 4.0256177227491135e-05, + "loss": 1.148, + "step": 227800 + }, + { + "epoch": 0.2, + "grad_norm": 15.625, + "learning_rate": 4.025167815430022e-05, + "loss": 1.2267, + "step": 227900 + }, + { + "epoch": 0.2, + "grad_norm": 37.25, + "learning_rate": 4.024717908110929e-05, + "loss": 1.0464, + "step": 228000 + }, + { + "epoch": 0.2, + "grad_norm": 32.75, + "learning_rate": 4.024268000791837e-05, + "loss": 1.1657, + "step": 228100 + }, + { + "epoch": 0.2, + "grad_norm": 39.25, + "learning_rate": 4.023818093472745e-05, + "loss": 1.1317, + "step": 228200 + }, + { + "epoch": 0.2, + "grad_norm": 17.75, + "learning_rate": 4.0233681861536525e-05, + "loss": 1.3315, + "step": 228300 + }, + { + "epoch": 0.2, + "grad_norm": 72.0, + "learning_rate": 4.022918278834561e-05, + "loss": 1.146, + "step": 228400 + }, + { + "epoch": 0.2, + "grad_norm": 13.375, + "learning_rate": 4.022468371515468e-05, + "loss": 1.1449, + "step": 228500 + }, + { + "epoch": 0.2, + "grad_norm": 35.75, + "learning_rate": 4.022018464196376e-05, + "loss": 1.18, + "step": 228600 + }, + { + "epoch": 0.2, + "grad_norm": 23.625, + "learning_rate": 4.021568556877284e-05, + "loss": 1.12, + "step": 228700 + }, + { + "epoch": 0.2, + "grad_norm": 55.25, + "learning_rate": 4.021118649558191e-05, + "loss": 1.278, + "step": 228800 + }, + { + "epoch": 0.2, + "grad_norm": 7.96875, + "learning_rate": 4.0206687422390984e-05, + "loss": 1.1042, + "step": 228900 + }, + { + "epoch": 0.2, + "grad_norm": 0.33984375, + "learning_rate": 4.0202188349200066e-05, + "loss": 1.2166, + "step": 229000 + }, + { + "epoch": 0.2, + "grad_norm": 9.0625, + "learning_rate": 4.019768927600914e-05, + "loss": 1.1626, + "step": 229100 + }, + { + "epoch": 0.2, + "grad_norm": 39.25, + "learning_rate": 4.0193190202818223e-05, + "loss": 1.109, + "step": 229200 + }, + { + "epoch": 0.2, + "grad_norm": 194.0, + "learning_rate": 4.01886911296273e-05, + "loss": 1.0713, + "step": 229300 + }, + { + "epoch": 0.2, + "grad_norm": 51.0, + "learning_rate": 4.0184192056436374e-05, + "loss": 1.2785, + "step": 229400 + }, + { + "epoch": 0.2, + "grad_norm": 19.5, + "learning_rate": 4.0179692983245456e-05, + "loss": 1.3195, + "step": 229500 + }, + { + "epoch": 0.2, + "grad_norm": 75.5, + "learning_rate": 4.017519391005453e-05, + "loss": 1.0769, + "step": 229600 + }, + { + "epoch": 0.2, + "grad_norm": 25.125, + "learning_rate": 4.017069483686361e-05, + "loss": 1.1878, + "step": 229700 + }, + { + "epoch": 0.2, + "grad_norm": 25.875, + "learning_rate": 4.016619576367269e-05, + "loss": 1.1975, + "step": 229800 + }, + { + "epoch": 0.2, + "grad_norm": 36.25, + "learning_rate": 4.0161696690481764e-05, + "loss": 1.2175, + "step": 229900 + }, + { + "epoch": 0.2, + "grad_norm": 109.5, + "learning_rate": 4.015719761729084e-05, + "loss": 1.1586, + "step": 230000 + }, + { + "epoch": 0.2, + "grad_norm": 18.125, + "learning_rate": 4.0152698544099915e-05, + "loss": 1.3735, + "step": 230100 + }, + { + "epoch": 0.21, + "grad_norm": 0.26171875, + "learning_rate": 4.014819947090899e-05, + "loss": 1.223, + "step": 230200 + }, + { + "epoch": 0.21, + "grad_norm": 34.0, + "learning_rate": 4.014370039771807e-05, + "loss": 1.0653, + "step": 230300 + }, + { + "epoch": 0.21, + "grad_norm": 6.09375, + "learning_rate": 4.013920132452715e-05, + "loss": 1.2073, + "step": 230400 + }, + { + "epoch": 0.21, + "grad_norm": 11.5, + "learning_rate": 4.013470225133622e-05, + "loss": 1.1292, + "step": 230500 + }, + { + "epoch": 0.21, + "grad_norm": 50.75, + "learning_rate": 4.0130203178145305e-05, + "loss": 1.2277, + "step": 230600 + }, + { + "epoch": 0.21, + "grad_norm": 456.0, + "learning_rate": 4.012570410495438e-05, + "loss": 1.1183, + "step": 230700 + }, + { + "epoch": 0.21, + "grad_norm": 26.75, + "learning_rate": 4.0121205031763456e-05, + "loss": 1.1154, + "step": 230800 + }, + { + "epoch": 0.21, + "grad_norm": 1.734375, + "learning_rate": 4.011670595857254e-05, + "loss": 1.1814, + "step": 230900 + }, + { + "epoch": 0.21, + "grad_norm": 32.5, + "learning_rate": 4.0112206885381613e-05, + "loss": 1.198, + "step": 231000 + }, + { + "epoch": 0.21, + "grad_norm": 52.0, + "learning_rate": 4.0107707812190696e-05, + "loss": 1.1281, + "step": 231100 + }, + { + "epoch": 0.21, + "grad_norm": 23.375, + "learning_rate": 4.010320873899977e-05, + "loss": 1.1255, + "step": 231200 + }, + { + "epoch": 0.21, + "grad_norm": 16.875, + "learning_rate": 4.0098709665808846e-05, + "loss": 1.2562, + "step": 231300 + }, + { + "epoch": 0.21, + "grad_norm": 56.75, + "learning_rate": 4.009421059261792e-05, + "loss": 1.2504, + "step": 231400 + }, + { + "epoch": 0.21, + "grad_norm": 68.5, + "learning_rate": 4.0089711519427e-05, + "loss": 1.1979, + "step": 231500 + }, + { + "epoch": 0.21, + "grad_norm": 19.625, + "learning_rate": 4.008521244623607e-05, + "loss": 1.1168, + "step": 231600 + }, + { + "epoch": 0.21, + "grad_norm": 25.25, + "learning_rate": 4.0080713373045154e-05, + "loss": 1.2779, + "step": 231700 + }, + { + "epoch": 0.21, + "grad_norm": 9.1875, + "learning_rate": 4.007621429985423e-05, + "loss": 1.2426, + "step": 231800 + }, + { + "epoch": 0.21, + "grad_norm": 25.25, + "learning_rate": 4.007171522666331e-05, + "loss": 1.2031, + "step": 231900 + }, + { + "epoch": 0.21, + "grad_norm": 37.0, + "learning_rate": 4.006721615347239e-05, + "loss": 1.178, + "step": 232000 + }, + { + "epoch": 0.21, + "grad_norm": 258.0, + "learning_rate": 4.006271708028146e-05, + "loss": 1.0253, + "step": 232100 + }, + { + "epoch": 0.21, + "grad_norm": 34.75, + "learning_rate": 4.0058218007090545e-05, + "loss": 1.1623, + "step": 232200 + }, + { + "epoch": 0.21, + "grad_norm": 26.75, + "learning_rate": 4.005371893389962e-05, + "loss": 1.2595, + "step": 232300 + }, + { + "epoch": 0.21, + "grad_norm": 532.0, + "learning_rate": 4.0049219860708695e-05, + "loss": 1.2893, + "step": 232400 + }, + { + "epoch": 0.21, + "grad_norm": 27.375, + "learning_rate": 4.004472078751778e-05, + "loss": 1.0809, + "step": 232500 + }, + { + "epoch": 0.21, + "grad_norm": 168.0, + "learning_rate": 4.004022171432685e-05, + "loss": 1.068, + "step": 232600 + }, + { + "epoch": 0.21, + "grad_norm": 68.5, + "learning_rate": 4.003572264113593e-05, + "loss": 1.1476, + "step": 232700 + }, + { + "epoch": 0.21, + "grad_norm": 60.25, + "learning_rate": 4.0031223567945003e-05, + "loss": 1.1423, + "step": 232800 + }, + { + "epoch": 0.21, + "grad_norm": 30.0, + "learning_rate": 4.002672449475408e-05, + "loss": 1.3094, + "step": 232900 + }, + { + "epoch": 0.21, + "grad_norm": 0.03662109375, + "learning_rate": 4.002222542156316e-05, + "loss": 1.2968, + "step": 233000 + }, + { + "epoch": 0.21, + "grad_norm": 23.25, + "learning_rate": 4.0017726348372236e-05, + "loss": 1.1127, + "step": 233100 + }, + { + "epoch": 0.21, + "grad_norm": 396.0, + "learning_rate": 4.001322727518131e-05, + "loss": 1.2254, + "step": 233200 + }, + { + "epoch": 0.21, + "grad_norm": 15.125, + "learning_rate": 4.0008728201990394e-05, + "loss": 1.2719, + "step": 233300 + }, + { + "epoch": 0.21, + "grad_norm": 107.5, + "learning_rate": 4.000422912879947e-05, + "loss": 1.0719, + "step": 233400 + }, + { + "epoch": 0.21, + "grad_norm": 12.25, + "learning_rate": 3.9999730055608544e-05, + "loss": 1.1556, + "step": 233500 + }, + { + "epoch": 0.21, + "grad_norm": 152.0, + "learning_rate": 3.9995230982417627e-05, + "loss": 1.0694, + "step": 233600 + }, + { + "epoch": 0.21, + "grad_norm": 6.28125, + "learning_rate": 3.99907319092267e-05, + "loss": 1.2125, + "step": 233700 + }, + { + "epoch": 0.21, + "grad_norm": 11.75, + "learning_rate": 3.9986232836035784e-05, + "loss": 1.129, + "step": 233800 + }, + { + "epoch": 0.21, + "grad_norm": 48.75, + "learning_rate": 3.998173376284486e-05, + "loss": 1.1163, + "step": 233900 + }, + { + "epoch": 0.21, + "grad_norm": 15.75, + "learning_rate": 3.997723468965393e-05, + "loss": 1.1119, + "step": 234000 + }, + { + "epoch": 0.21, + "grad_norm": 58.5, + "learning_rate": 3.997273561646301e-05, + "loss": 1.129, + "step": 234100 + }, + { + "epoch": 0.21, + "grad_norm": 0.00106048583984375, + "learning_rate": 3.9968236543272085e-05, + "loss": 1.17, + "step": 234200 + }, + { + "epoch": 0.21, + "grad_norm": 34.0, + "learning_rate": 3.996373747008116e-05, + "loss": 1.1715, + "step": 234300 + }, + { + "epoch": 0.21, + "grad_norm": 5.28125, + "learning_rate": 3.995923839689024e-05, + "loss": 1.1322, + "step": 234400 + }, + { + "epoch": 0.21, + "grad_norm": 91.0, + "learning_rate": 3.995473932369932e-05, + "loss": 1.2509, + "step": 234500 + }, + { + "epoch": 0.21, + "grad_norm": 0.0052490234375, + "learning_rate": 3.99502402505084e-05, + "loss": 1.3465, + "step": 234600 + }, + { + "epoch": 0.21, + "grad_norm": 29.25, + "learning_rate": 3.9945741177317476e-05, + "loss": 1.309, + "step": 234700 + }, + { + "epoch": 0.21, + "grad_norm": 14.875, + "learning_rate": 3.994124210412655e-05, + "loss": 1.3001, + "step": 234800 + }, + { + "epoch": 0.21, + "grad_norm": 0.01043701171875, + "learning_rate": 3.993674303093563e-05, + "loss": 1.1559, + "step": 234900 + }, + { + "epoch": 0.21, + "grad_norm": 12.875, + "learning_rate": 3.993224395774471e-05, + "loss": 1.2696, + "step": 235000 + }, + { + "epoch": 0.21, + "grad_norm": 17.125, + "learning_rate": 3.9927744884553784e-05, + "loss": 1.0966, + "step": 235100 + }, + { + "epoch": 0.21, + "grad_norm": 22.75, + "learning_rate": 3.9923245811362866e-05, + "loss": 1.1111, + "step": 235200 + }, + { + "epoch": 0.21, + "grad_norm": 17.0, + "learning_rate": 3.9918746738171934e-05, + "loss": 1.2537, + "step": 235300 + }, + { + "epoch": 0.21, + "grad_norm": 51.75, + "learning_rate": 3.9914247664981017e-05, + "loss": 1.2658, + "step": 235400 + }, + { + "epoch": 0.21, + "grad_norm": 34.25, + "learning_rate": 3.990974859179009e-05, + "loss": 1.0868, + "step": 235500 + }, + { + "epoch": 0.21, + "grad_norm": 0.00494384765625, + "learning_rate": 3.990524951859917e-05, + "loss": 0.9567, + "step": 235600 + }, + { + "epoch": 0.21, + "grad_norm": 96.5, + "learning_rate": 3.990075044540825e-05, + "loss": 1.2206, + "step": 235700 + }, + { + "epoch": 0.21, + "grad_norm": 13.0, + "learning_rate": 3.9896251372217325e-05, + "loss": 1.0097, + "step": 235800 + }, + { + "epoch": 0.21, + "grad_norm": 115.0, + "learning_rate": 3.98917522990264e-05, + "loss": 1.1683, + "step": 235900 + }, + { + "epoch": 0.21, + "grad_norm": 39.5, + "learning_rate": 3.988725322583548e-05, + "loss": 1.0478, + "step": 236000 + }, + { + "epoch": 0.21, + "grad_norm": 13.3125, + "learning_rate": 3.988275415264456e-05, + "loss": 1.4626, + "step": 236100 + }, + { + "epoch": 0.21, + "grad_norm": 25.75, + "learning_rate": 3.987825507945363e-05, + "loss": 1.1941, + "step": 236200 + }, + { + "epoch": 0.21, + "grad_norm": 45.25, + "learning_rate": 3.9873756006262715e-05, + "loss": 1.2885, + "step": 236300 + }, + { + "epoch": 0.21, + "grad_norm": 14.1875, + "learning_rate": 3.986925693307179e-05, + "loss": 1.2543, + "step": 236400 + }, + { + "epoch": 0.21, + "grad_norm": 0.1416015625, + "learning_rate": 3.986475785988087e-05, + "loss": 1.1691, + "step": 236500 + }, + { + "epoch": 0.21, + "grad_norm": 42.0, + "learning_rate": 3.986025878668994e-05, + "loss": 1.1036, + "step": 236600 + }, + { + "epoch": 0.21, + "grad_norm": 36.75, + "learning_rate": 3.9855759713499016e-05, + "loss": 1.1534, + "step": 236700 + }, + { + "epoch": 0.21, + "grad_norm": 17.125, + "learning_rate": 3.98512606403081e-05, + "loss": 1.0397, + "step": 236800 + }, + { + "epoch": 0.21, + "grad_norm": 31.5, + "learning_rate": 3.9846761567117174e-05, + "loss": 1.1877, + "step": 236900 + }, + { + "epoch": 0.21, + "grad_norm": 31.0, + "learning_rate": 3.984226249392625e-05, + "loss": 1.0923, + "step": 237000 + }, + { + "epoch": 0.21, + "grad_norm": 2.3125, + "learning_rate": 3.983776342073533e-05, + "loss": 1.1543, + "step": 237100 + }, + { + "epoch": 0.21, + "grad_norm": 175.0, + "learning_rate": 3.9833264347544407e-05, + "loss": 1.1596, + "step": 237200 + }, + { + "epoch": 0.21, + "grad_norm": 0.298828125, + "learning_rate": 3.982876527435349e-05, + "loss": 1.2475, + "step": 237300 + }, + { + "epoch": 0.21, + "grad_norm": 12.125, + "learning_rate": 3.9824266201162564e-05, + "loss": 1.2365, + "step": 237400 + }, + { + "epoch": 0.21, + "grad_norm": 118.0, + "learning_rate": 3.981976712797164e-05, + "loss": 1.0924, + "step": 237500 + }, + { + "epoch": 0.21, + "grad_norm": 29.375, + "learning_rate": 3.981526805478072e-05, + "loss": 1.1695, + "step": 237600 + }, + { + "epoch": 0.21, + "grad_norm": 37.5, + "learning_rate": 3.98107689815898e-05, + "loss": 1.2291, + "step": 237700 + }, + { + "epoch": 0.21, + "grad_norm": 40.0, + "learning_rate": 3.980626990839887e-05, + "loss": 1.1649, + "step": 237800 + }, + { + "epoch": 0.21, + "grad_norm": 47.75, + "learning_rate": 3.980177083520795e-05, + "loss": 1.0497, + "step": 237900 + }, + { + "epoch": 0.21, + "grad_norm": 76.5, + "learning_rate": 3.979727176201702e-05, + "loss": 1.3253, + "step": 238000 + }, + { + "epoch": 0.21, + "grad_norm": 16.625, + "learning_rate": 3.9792772688826105e-05, + "loss": 1.1352, + "step": 238100 + }, + { + "epoch": 0.21, + "grad_norm": 14.8125, + "learning_rate": 3.978827361563518e-05, + "loss": 1.0653, + "step": 238200 + }, + { + "epoch": 0.21, + "grad_norm": 24.625, + "learning_rate": 3.9783774542444256e-05, + "loss": 0.9743, + "step": 238300 + }, + { + "epoch": 0.21, + "grad_norm": 25.875, + "learning_rate": 3.977927546925334e-05, + "loss": 1.1918, + "step": 238400 + }, + { + "epoch": 0.21, + "grad_norm": 31.75, + "learning_rate": 3.977477639606241e-05, + "loss": 1.227, + "step": 238500 + }, + { + "epoch": 0.21, + "grad_norm": 61.75, + "learning_rate": 3.977027732287149e-05, + "loss": 1.1184, + "step": 238600 + }, + { + "epoch": 0.21, + "grad_norm": 48.25, + "learning_rate": 3.976577824968057e-05, + "loss": 1.1853, + "step": 238700 + }, + { + "epoch": 0.21, + "grad_norm": 41.5, + "learning_rate": 3.9761279176489646e-05, + "loss": 1.1154, + "step": 238800 + }, + { + "epoch": 0.21, + "grad_norm": 0.010009765625, + "learning_rate": 3.975678010329872e-05, + "loss": 1.3571, + "step": 238900 + }, + { + "epoch": 0.21, + "grad_norm": 18.0, + "learning_rate": 3.97522810301078e-05, + "loss": 1.3166, + "step": 239000 + }, + { + "epoch": 0.21, + "grad_norm": 9.3125, + "learning_rate": 3.974778195691688e-05, + "loss": 1.2893, + "step": 239100 + }, + { + "epoch": 0.21, + "grad_norm": 20.75, + "learning_rate": 3.9743282883725954e-05, + "loss": 1.1967, + "step": 239200 + }, + { + "epoch": 0.21, + "grad_norm": 51.5, + "learning_rate": 3.973878381053503e-05, + "loss": 1.2685, + "step": 239300 + }, + { + "epoch": 0.21, + "grad_norm": 42.75, + "learning_rate": 3.9734284737344105e-05, + "loss": 1.1552, + "step": 239400 + }, + { + "epoch": 0.21, + "grad_norm": 49.75, + "learning_rate": 3.972978566415319e-05, + "loss": 1.0932, + "step": 239500 + }, + { + "epoch": 0.21, + "grad_norm": 17.125, + "learning_rate": 3.972528659096226e-05, + "loss": 1.1386, + "step": 239600 + }, + { + "epoch": 0.21, + "grad_norm": 3.75, + "learning_rate": 3.972078751777134e-05, + "loss": 1.0738, + "step": 239700 + }, + { + "epoch": 0.21, + "grad_norm": 34.5, + "learning_rate": 3.971628844458042e-05, + "loss": 1.0091, + "step": 239800 + }, + { + "epoch": 0.21, + "grad_norm": 78.0, + "learning_rate": 3.9711789371389495e-05, + "loss": 1.0894, + "step": 239900 + }, + { + "epoch": 0.21, + "grad_norm": 0.07568359375, + "learning_rate": 3.970729029819858e-05, + "loss": 1.2147, + "step": 240000 + }, + { + "epoch": 0.21, + "grad_norm": 50.75, + "learning_rate": 3.970279122500765e-05, + "loss": 1.1033, + "step": 240100 + }, + { + "epoch": 0.21, + "grad_norm": 83.0, + "learning_rate": 3.969829215181673e-05, + "loss": 1.1346, + "step": 240200 + }, + { + "epoch": 0.21, + "grad_norm": 22.375, + "learning_rate": 3.969379307862581e-05, + "loss": 1.29, + "step": 240300 + }, + { + "epoch": 0.21, + "grad_norm": 30.875, + "learning_rate": 3.9689294005434885e-05, + "loss": 0.9884, + "step": 240400 + }, + { + "epoch": 0.21, + "grad_norm": 60.0, + "learning_rate": 3.968479493224396e-05, + "loss": 1.2222, + "step": 240500 + }, + { + "epoch": 0.21, + "grad_norm": 42.5, + "learning_rate": 3.9680295859053036e-05, + "loss": 1.15, + "step": 240600 + }, + { + "epoch": 0.21, + "grad_norm": 31.75, + "learning_rate": 3.967579678586211e-05, + "loss": 1.0979, + "step": 240700 + }, + { + "epoch": 0.21, + "grad_norm": 27.875, + "learning_rate": 3.967129771267119e-05, + "loss": 1.2747, + "step": 240800 + }, + { + "epoch": 0.21, + "grad_norm": 0.130859375, + "learning_rate": 3.966679863948027e-05, + "loss": 1.2433, + "step": 240900 + }, + { + "epoch": 0.21, + "grad_norm": 18.25, + "learning_rate": 3.9662299566289344e-05, + "loss": 1.1585, + "step": 241000 + }, + { + "epoch": 0.21, + "grad_norm": 22.875, + "learning_rate": 3.9657800493098426e-05, + "loss": 1.2039, + "step": 241100 + }, + { + "epoch": 0.21, + "grad_norm": 86.5, + "learning_rate": 3.96533014199075e-05, + "loss": 1.206, + "step": 241200 + }, + { + "epoch": 0.21, + "grad_norm": 0.126953125, + "learning_rate": 3.964880234671658e-05, + "loss": 1.2161, + "step": 241300 + }, + { + "epoch": 0.22, + "grad_norm": 12.4375, + "learning_rate": 3.964430327352566e-05, + "loss": 1.2156, + "step": 241400 + }, + { + "epoch": 0.22, + "grad_norm": 114.0, + "learning_rate": 3.9639804200334734e-05, + "loss": 1.1289, + "step": 241500 + }, + { + "epoch": 0.22, + "grad_norm": 27.25, + "learning_rate": 3.963530512714381e-05, + "loss": 1.0126, + "step": 241600 + }, + { + "epoch": 0.22, + "grad_norm": 23.375, + "learning_rate": 3.963080605395289e-05, + "loss": 1.1066, + "step": 241700 + }, + { + "epoch": 0.22, + "grad_norm": 51.0, + "learning_rate": 3.962630698076196e-05, + "loss": 1.0179, + "step": 241800 + }, + { + "epoch": 0.22, + "grad_norm": 25.75, + "learning_rate": 3.962180790757104e-05, + "loss": 1.1068, + "step": 241900 + }, + { + "epoch": 0.22, + "grad_norm": 63.5, + "learning_rate": 3.961730883438012e-05, + "loss": 1.0146, + "step": 242000 + }, + { + "epoch": 0.22, + "grad_norm": 14.6875, + "learning_rate": 3.961280976118919e-05, + "loss": 1.2853, + "step": 242100 + }, + { + "epoch": 0.22, + "grad_norm": 69.0, + "learning_rate": 3.9608310687998275e-05, + "loss": 1.1743, + "step": 242200 + }, + { + "epoch": 0.22, + "grad_norm": 2.21875, + "learning_rate": 3.960381161480735e-05, + "loss": 1.149, + "step": 242300 + }, + { + "epoch": 0.22, + "grad_norm": 13.1875, + "learning_rate": 3.9599312541616426e-05, + "loss": 1.1604, + "step": 242400 + }, + { + "epoch": 0.22, + "grad_norm": 8.875, + "learning_rate": 3.959481346842551e-05, + "loss": 1.3036, + "step": 242500 + }, + { + "epoch": 0.22, + "grad_norm": 39.75, + "learning_rate": 3.959031439523458e-05, + "loss": 1.1397, + "step": 242600 + }, + { + "epoch": 0.22, + "grad_norm": 72.5, + "learning_rate": 3.9585815322043665e-05, + "loss": 1.3535, + "step": 242700 + }, + { + "epoch": 0.22, + "grad_norm": 26.625, + "learning_rate": 3.958131624885274e-05, + "loss": 1.2853, + "step": 242800 + }, + { + "epoch": 0.22, + "grad_norm": 24.5, + "learning_rate": 3.9576817175661816e-05, + "loss": 1.2195, + "step": 242900 + }, + { + "epoch": 0.22, + "grad_norm": 2.5625, + "learning_rate": 3.95723181024709e-05, + "loss": 1.1829, + "step": 243000 + }, + { + "epoch": 0.22, + "grad_norm": 1.140625, + "learning_rate": 3.956781902927997e-05, + "loss": 1.0797, + "step": 243100 + }, + { + "epoch": 0.22, + "grad_norm": 67.0, + "learning_rate": 3.956331995608905e-05, + "loss": 1.3246, + "step": 243200 + }, + { + "epoch": 0.22, + "grad_norm": 2.734375, + "learning_rate": 3.9558820882898124e-05, + "loss": 1.1519, + "step": 243300 + }, + { + "epoch": 0.22, + "grad_norm": 24.25, + "learning_rate": 3.95543218097072e-05, + "loss": 1.166, + "step": 243400 + }, + { + "epoch": 0.22, + "grad_norm": 9.5625, + "learning_rate": 3.954982273651628e-05, + "loss": 1.1949, + "step": 243500 + }, + { + "epoch": 0.22, + "grad_norm": 35.5, + "learning_rate": 3.954532366332536e-05, + "loss": 1.0722, + "step": 243600 + }, + { + "epoch": 0.22, + "grad_norm": 15.5625, + "learning_rate": 3.954082459013443e-05, + "loss": 1.0706, + "step": 243700 + }, + { + "epoch": 0.22, + "grad_norm": 32.5, + "learning_rate": 3.9536325516943514e-05, + "loss": 1.1307, + "step": 243800 + }, + { + "epoch": 0.22, + "grad_norm": 23.875, + "learning_rate": 3.953182644375259e-05, + "loss": 1.331, + "step": 243900 + }, + { + "epoch": 0.22, + "grad_norm": 59.5, + "learning_rate": 3.9527327370561665e-05, + "loss": 1.1699, + "step": 244000 + }, + { + "epoch": 0.22, + "grad_norm": 19.0, + "learning_rate": 3.952282829737075e-05, + "loss": 1.2879, + "step": 244100 + }, + { + "epoch": 0.22, + "grad_norm": 92.5, + "learning_rate": 3.951832922417982e-05, + "loss": 1.3037, + "step": 244200 + }, + { + "epoch": 0.22, + "grad_norm": 19.125, + "learning_rate": 3.95138301509889e-05, + "loss": 1.0337, + "step": 244300 + }, + { + "epoch": 0.22, + "grad_norm": 62.0, + "learning_rate": 3.950933107779797e-05, + "loss": 1.1418, + "step": 244400 + }, + { + "epoch": 0.22, + "grad_norm": 11.1875, + "learning_rate": 3.950483200460705e-05, + "loss": 1.3614, + "step": 244500 + }, + { + "epoch": 0.22, + "grad_norm": 11.0625, + "learning_rate": 3.950033293141613e-05, + "loss": 1.1876, + "step": 244600 + }, + { + "epoch": 0.22, + "grad_norm": 30.0, + "learning_rate": 3.9495833858225206e-05, + "loss": 1.2167, + "step": 244700 + }, + { + "epoch": 0.22, + "grad_norm": 45.25, + "learning_rate": 3.949133478503428e-05, + "loss": 1.1304, + "step": 244800 + }, + { + "epoch": 0.22, + "grad_norm": 68.5, + "learning_rate": 3.9486835711843364e-05, + "loss": 1.3406, + "step": 244900 + }, + { + "epoch": 0.22, + "grad_norm": 40.25, + "learning_rate": 3.948233663865244e-05, + "loss": 1.1495, + "step": 245000 + }, + { + "epoch": 0.22, + "grad_norm": 11.25, + "learning_rate": 3.9477837565461514e-05, + "loss": 1.2369, + "step": 245100 + }, + { + "epoch": 0.22, + "grad_norm": 42.25, + "learning_rate": 3.9473338492270596e-05, + "loss": 1.1874, + "step": 245200 + }, + { + "epoch": 0.22, + "grad_norm": 19.0, + "learning_rate": 3.946883941907967e-05, + "loss": 1.2262, + "step": 245300 + }, + { + "epoch": 0.22, + "grad_norm": 17.75, + "learning_rate": 3.9464340345888754e-05, + "loss": 1.022, + "step": 245400 + }, + { + "epoch": 0.22, + "grad_norm": 8.625, + "learning_rate": 3.945984127269783e-05, + "loss": 1.2238, + "step": 245500 + }, + { + "epoch": 0.22, + "grad_norm": 27.625, + "learning_rate": 3.9455342199506904e-05, + "loss": 1.1759, + "step": 245600 + }, + { + "epoch": 0.22, + "grad_norm": 1.7109375, + "learning_rate": 3.945084312631598e-05, + "loss": 1.078, + "step": 245700 + }, + { + "epoch": 0.22, + "grad_norm": 0.0036468505859375, + "learning_rate": 3.9446344053125055e-05, + "loss": 1.1926, + "step": 245800 + }, + { + "epoch": 0.22, + "grad_norm": 0.1875, + "learning_rate": 3.944184497993413e-05, + "loss": 1.03, + "step": 245900 + }, + { + "epoch": 0.22, + "grad_norm": 15.6875, + "learning_rate": 3.943734590674321e-05, + "loss": 1.3733, + "step": 246000 + }, + { + "epoch": 0.22, + "grad_norm": 27.875, + "learning_rate": 3.943284683355229e-05, + "loss": 1.2373, + "step": 246100 + }, + { + "epoch": 0.22, + "grad_norm": 236.0, + "learning_rate": 3.942834776036137e-05, + "loss": 1.2598, + "step": 246200 + }, + { + "epoch": 0.22, + "grad_norm": 220.0, + "learning_rate": 3.9423848687170445e-05, + "loss": 1.1383, + "step": 246300 + }, + { + "epoch": 0.22, + "grad_norm": 1.0703125, + "learning_rate": 3.941934961397952e-05, + "loss": 1.2764, + "step": 246400 + }, + { + "epoch": 0.22, + "grad_norm": 223.0, + "learning_rate": 3.94148505407886e-05, + "loss": 0.9768, + "step": 246500 + }, + { + "epoch": 0.22, + "grad_norm": 556.0, + "learning_rate": 3.941035146759768e-05, + "loss": 1.2681, + "step": 246600 + }, + { + "epoch": 0.22, + "grad_norm": 23.125, + "learning_rate": 3.9405852394406754e-05, + "loss": 1.048, + "step": 246700 + }, + { + "epoch": 0.22, + "grad_norm": 36.25, + "learning_rate": 3.9401353321215836e-05, + "loss": 1.1942, + "step": 246800 + }, + { + "epoch": 0.22, + "grad_norm": 22.375, + "learning_rate": 3.939685424802491e-05, + "loss": 1.2036, + "step": 246900 + }, + { + "epoch": 0.22, + "grad_norm": 77.5, + "learning_rate": 3.9392355174833986e-05, + "loss": 1.149, + "step": 247000 + }, + { + "epoch": 0.22, + "grad_norm": 17.625, + "learning_rate": 3.938785610164306e-05, + "loss": 1.308, + "step": 247100 + }, + { + "epoch": 0.22, + "grad_norm": 49.0, + "learning_rate": 3.938335702845214e-05, + "loss": 1.3527, + "step": 247200 + }, + { + "epoch": 0.22, + "grad_norm": 27.125, + "learning_rate": 3.937885795526122e-05, + "loss": 1.1117, + "step": 247300 + }, + { + "epoch": 0.22, + "grad_norm": 20.875, + "learning_rate": 3.9374358882070294e-05, + "loss": 1.2889, + "step": 247400 + }, + { + "epoch": 0.22, + "grad_norm": 25.5, + "learning_rate": 3.936985980887937e-05, + "loss": 1.3867, + "step": 247500 + }, + { + "epoch": 0.22, + "grad_norm": 42.5, + "learning_rate": 3.936536073568845e-05, + "loss": 1.3019, + "step": 247600 + }, + { + "epoch": 0.22, + "grad_norm": 81.0, + "learning_rate": 3.936086166249753e-05, + "loss": 1.1621, + "step": 247700 + }, + { + "epoch": 0.22, + "grad_norm": 147.0, + "learning_rate": 3.93563625893066e-05, + "loss": 1.1166, + "step": 247800 + }, + { + "epoch": 0.22, + "grad_norm": 13.0, + "learning_rate": 3.9351863516115685e-05, + "loss": 1.2203, + "step": 247900 + }, + { + "epoch": 0.22, + "grad_norm": 114.5, + "learning_rate": 3.934736444292476e-05, + "loss": 1.1678, + "step": 248000 + }, + { + "epoch": 0.22, + "grad_norm": 4.84375, + "learning_rate": 3.934286536973384e-05, + "loss": 1.039, + "step": 248100 + }, + { + "epoch": 0.22, + "grad_norm": 35.75, + "learning_rate": 3.933836629654292e-05, + "loss": 1.1206, + "step": 248200 + }, + { + "epoch": 0.22, + "grad_norm": 23.125, + "learning_rate": 3.9333867223351986e-05, + "loss": 1.2726, + "step": 248300 + }, + { + "epoch": 0.22, + "grad_norm": 54.5, + "learning_rate": 3.932936815016107e-05, + "loss": 1.0794, + "step": 248400 + }, + { + "epoch": 0.22, + "grad_norm": 228.0, + "learning_rate": 3.9324869076970144e-05, + "loss": 1.0072, + "step": 248500 + }, + { + "epoch": 0.22, + "grad_norm": 32.5, + "learning_rate": 3.932037000377922e-05, + "loss": 1.3515, + "step": 248600 + }, + { + "epoch": 0.22, + "grad_norm": 29.125, + "learning_rate": 3.93158709305883e-05, + "loss": 1.2737, + "step": 248700 + }, + { + "epoch": 0.22, + "grad_norm": 87.0, + "learning_rate": 3.9311371857397376e-05, + "loss": 1.3866, + "step": 248800 + }, + { + "epoch": 0.22, + "grad_norm": 12.5625, + "learning_rate": 3.930687278420646e-05, + "loss": 1.0515, + "step": 248900 + }, + { + "epoch": 0.22, + "grad_norm": 31.5, + "learning_rate": 3.9302373711015534e-05, + "loss": 1.3832, + "step": 249000 + }, + { + "epoch": 0.22, + "grad_norm": 0.004791259765625, + "learning_rate": 3.929787463782461e-05, + "loss": 1.2286, + "step": 249100 + }, + { + "epoch": 0.22, + "grad_norm": 332.0, + "learning_rate": 3.929337556463369e-05, + "loss": 1.3023, + "step": 249200 + }, + { + "epoch": 0.22, + "grad_norm": 43.5, + "learning_rate": 3.9288876491442767e-05, + "loss": 1.3827, + "step": 249300 + }, + { + "epoch": 0.22, + "grad_norm": 12.125, + "learning_rate": 3.928437741825184e-05, + "loss": 1.2476, + "step": 249400 + }, + { + "epoch": 0.22, + "grad_norm": 1.328125, + "learning_rate": 3.9279878345060924e-05, + "loss": 1.1633, + "step": 249500 + }, + { + "epoch": 0.22, + "grad_norm": 34.25, + "learning_rate": 3.927537927186999e-05, + "loss": 1.1164, + "step": 249600 + }, + { + "epoch": 0.22, + "grad_norm": 17.375, + "learning_rate": 3.9270880198679075e-05, + "loss": 1.1873, + "step": 249700 + }, + { + "epoch": 0.22, + "grad_norm": 32.25, + "learning_rate": 3.926638112548815e-05, + "loss": 1.2064, + "step": 249800 + }, + { + "epoch": 0.22, + "grad_norm": 310.0, + "learning_rate": 3.9261882052297225e-05, + "loss": 1.1259, + "step": 249900 + }, + { + "epoch": 0.22, + "grad_norm": 85.0, + "learning_rate": 3.925738297910631e-05, + "loss": 1.2184, + "step": 250000 + }, + { + "epoch": 0.22, + "grad_norm": 0.11669921875, + "learning_rate": 3.925288390591538e-05, + "loss": 0.9433, + "step": 250100 + }, + { + "epoch": 0.22, + "grad_norm": 171.0, + "learning_rate": 3.924838483272446e-05, + "loss": 1.086, + "step": 250200 + }, + { + "epoch": 0.22, + "grad_norm": 15.375, + "learning_rate": 3.924388575953354e-05, + "loss": 1.1756, + "step": 250300 + }, + { + "epoch": 0.22, + "grad_norm": 93.0, + "learning_rate": 3.9239386686342616e-05, + "loss": 1.2926, + "step": 250400 + }, + { + "epoch": 0.22, + "grad_norm": 25.0, + "learning_rate": 3.923488761315169e-05, + "loss": 1.0511, + "step": 250500 + }, + { + "epoch": 0.22, + "grad_norm": 93.0, + "learning_rate": 3.923038853996077e-05, + "loss": 1.1765, + "step": 250600 + }, + { + "epoch": 0.22, + "grad_norm": 0.095703125, + "learning_rate": 3.922588946676985e-05, + "loss": 1.2579, + "step": 250700 + }, + { + "epoch": 0.22, + "grad_norm": 0.01025390625, + "learning_rate": 3.922139039357893e-05, + "loss": 1.2552, + "step": 250800 + }, + { + "epoch": 0.22, + "grad_norm": 11.25, + "learning_rate": 3.9216891320388e-05, + "loss": 1.3754, + "step": 250900 + }, + { + "epoch": 0.22, + "grad_norm": 29.625, + "learning_rate": 3.9212392247197074e-05, + "loss": 1.3205, + "step": 251000 + }, + { + "epoch": 0.22, + "grad_norm": 59.75, + "learning_rate": 3.9207893174006157e-05, + "loss": 1.2434, + "step": 251100 + }, + { + "epoch": 0.22, + "grad_norm": 51.25, + "learning_rate": 3.920339410081523e-05, + "loss": 1.1646, + "step": 251200 + }, + { + "epoch": 0.22, + "grad_norm": 13.875, + "learning_rate": 3.919889502762431e-05, + "loss": 1.1913, + "step": 251300 + }, + { + "epoch": 0.22, + "grad_norm": 47.25, + "learning_rate": 3.919439595443339e-05, + "loss": 1.2242, + "step": 251400 + }, + { + "epoch": 0.22, + "grad_norm": 30.75, + "learning_rate": 3.9189896881242465e-05, + "loss": 1.0874, + "step": 251500 + }, + { + "epoch": 0.22, + "grad_norm": 16.25, + "learning_rate": 3.918539780805155e-05, + "loss": 1.1604, + "step": 251600 + }, + { + "epoch": 0.22, + "grad_norm": 46.75, + "learning_rate": 3.918089873486062e-05, + "loss": 1.1404, + "step": 251700 + }, + { + "epoch": 0.22, + "grad_norm": 136.0, + "learning_rate": 3.91763996616697e-05, + "loss": 1.2929, + "step": 251800 + }, + { + "epoch": 0.22, + "grad_norm": 52.0, + "learning_rate": 3.917190058847878e-05, + "loss": 1.2881, + "step": 251900 + }, + { + "epoch": 0.22, + "grad_norm": 32.0, + "learning_rate": 3.9167401515287855e-05, + "loss": 1.04, + "step": 252000 + }, + { + "epoch": 0.22, + "grad_norm": 0.7734375, + "learning_rate": 3.916290244209693e-05, + "loss": 1.2319, + "step": 252100 + }, + { + "epoch": 0.22, + "grad_norm": 50.25, + "learning_rate": 3.9158403368906006e-05, + "loss": 1.1052, + "step": 252200 + }, + { + "epoch": 0.22, + "grad_norm": 27.125, + "learning_rate": 3.915390429571508e-05, + "loss": 1.2576, + "step": 252300 + }, + { + "epoch": 0.22, + "grad_norm": 30.75, + "learning_rate": 3.914940522252416e-05, + "loss": 1.1814, + "step": 252400 + }, + { + "epoch": 0.22, + "grad_norm": 24.5, + "learning_rate": 3.914490614933324e-05, + "loss": 1.1493, + "step": 252500 + }, + { + "epoch": 0.23, + "grad_norm": 33.0, + "learning_rate": 3.9140407076142314e-05, + "loss": 1.2086, + "step": 252600 + }, + { + "epoch": 0.23, + "grad_norm": 8.6875, + "learning_rate": 3.9135908002951396e-05, + "loss": 1.1718, + "step": 252700 + }, + { + "epoch": 0.23, + "grad_norm": 18.875, + "learning_rate": 3.913140892976047e-05, + "loss": 1.071, + "step": 252800 + }, + { + "epoch": 0.23, + "grad_norm": 30.875, + "learning_rate": 3.9126909856569547e-05, + "loss": 1.4632, + "step": 252900 + }, + { + "epoch": 0.23, + "grad_norm": 92.0, + "learning_rate": 3.912241078337863e-05, + "loss": 1.1439, + "step": 253000 + }, + { + "epoch": 0.23, + "grad_norm": 23.125, + "learning_rate": 3.9117911710187704e-05, + "loss": 1.1257, + "step": 253100 + }, + { + "epoch": 0.23, + "grad_norm": 35.75, + "learning_rate": 3.911341263699678e-05, + "loss": 1.1817, + "step": 253200 + }, + { + "epoch": 0.23, + "grad_norm": 29.875, + "learning_rate": 3.910891356380586e-05, + "loss": 1.0107, + "step": 253300 + }, + { + "epoch": 0.23, + "grad_norm": 7.96875, + "learning_rate": 3.910441449061494e-05, + "loss": 1.1322, + "step": 253400 + }, + { + "epoch": 0.23, + "grad_norm": 14.375, + "learning_rate": 3.909991541742401e-05, + "loss": 1.0913, + "step": 253500 + }, + { + "epoch": 0.23, + "grad_norm": 9.9375, + "learning_rate": 3.909541634423309e-05, + "loss": 1.1918, + "step": 253600 + }, + { + "epoch": 0.23, + "grad_norm": 11.375, + "learning_rate": 3.909091727104216e-05, + "loss": 1.1895, + "step": 253700 + }, + { + "epoch": 0.23, + "grad_norm": 24.25, + "learning_rate": 3.9086418197851245e-05, + "loss": 1.2283, + "step": 253800 + }, + { + "epoch": 0.23, + "grad_norm": 0.322265625, + "learning_rate": 3.908191912466032e-05, + "loss": 1.2178, + "step": 253900 + }, + { + "epoch": 0.23, + "grad_norm": 39.5, + "learning_rate": 3.9077420051469396e-05, + "loss": 1.1791, + "step": 254000 + }, + { + "epoch": 0.23, + "grad_norm": 14.125, + "learning_rate": 3.907292097827848e-05, + "loss": 1.3667, + "step": 254100 + }, + { + "epoch": 0.23, + "grad_norm": 0.0390625, + "learning_rate": 3.906842190508755e-05, + "loss": 1.0755, + "step": 254200 + }, + { + "epoch": 0.23, + "grad_norm": 0.01116943359375, + "learning_rate": 3.9063922831896635e-05, + "loss": 1.0778, + "step": 254300 + }, + { + "epoch": 0.23, + "grad_norm": 72.0, + "learning_rate": 3.905942375870571e-05, + "loss": 1.1521, + "step": 254400 + }, + { + "epoch": 0.23, + "grad_norm": 18.125, + "learning_rate": 3.9054924685514786e-05, + "loss": 1.2442, + "step": 254500 + }, + { + "epoch": 0.23, + "grad_norm": 150.0, + "learning_rate": 3.905042561232387e-05, + "loss": 1.2532, + "step": 254600 + }, + { + "epoch": 0.23, + "grad_norm": 19.625, + "learning_rate": 3.904592653913294e-05, + "loss": 1.0806, + "step": 254700 + }, + { + "epoch": 0.23, + "grad_norm": 53.0, + "learning_rate": 3.904142746594202e-05, + "loss": 1.2819, + "step": 254800 + }, + { + "epoch": 0.23, + "grad_norm": 64.5, + "learning_rate": 3.9036928392751094e-05, + "loss": 1.3103, + "step": 254900 + }, + { + "epoch": 0.23, + "grad_norm": 149.0, + "learning_rate": 3.903242931956017e-05, + "loss": 1.3287, + "step": 255000 + }, + { + "epoch": 0.23, + "grad_norm": 20.625, + "learning_rate": 3.902793024636925e-05, + "loss": 1.2764, + "step": 255100 + }, + { + "epoch": 0.23, + "grad_norm": 15.8125, + "learning_rate": 3.902343117317833e-05, + "loss": 1.0663, + "step": 255200 + }, + { + "epoch": 0.23, + "grad_norm": 17.375, + "learning_rate": 3.90189320999874e-05, + "loss": 1.2626, + "step": 255300 + }, + { + "epoch": 0.23, + "grad_norm": 17.625, + "learning_rate": 3.9014433026796484e-05, + "loss": 1.1583, + "step": 255400 + }, + { + "epoch": 0.23, + "grad_norm": 41.75, + "learning_rate": 3.900993395360556e-05, + "loss": 1.1388, + "step": 255500 + }, + { + "epoch": 0.23, + "grad_norm": 34.75, + "learning_rate": 3.9005434880414635e-05, + "loss": 1.0883, + "step": 255600 + }, + { + "epoch": 0.23, + "grad_norm": 44.25, + "learning_rate": 3.900093580722372e-05, + "loss": 1.0158, + "step": 255700 + }, + { + "epoch": 0.23, + "grad_norm": 23.5, + "learning_rate": 3.899643673403279e-05, + "loss": 1.2122, + "step": 255800 + }, + { + "epoch": 0.23, + "grad_norm": 28.0, + "learning_rate": 3.899193766084187e-05, + "loss": 1.1908, + "step": 255900 + }, + { + "epoch": 0.23, + "grad_norm": 96.5, + "learning_rate": 3.898743858765095e-05, + "loss": 1.3359, + "step": 256000 + }, + { + "epoch": 0.23, + "grad_norm": 19.25, + "learning_rate": 3.898293951446002e-05, + "loss": 1.0705, + "step": 256100 + }, + { + "epoch": 0.23, + "grad_norm": 50.25, + "learning_rate": 3.89784404412691e-05, + "loss": 1.1882, + "step": 256200 + }, + { + "epoch": 0.23, + "grad_norm": 13.75, + "learning_rate": 3.8973941368078176e-05, + "loss": 1.109, + "step": 256300 + }, + { + "epoch": 0.23, + "grad_norm": 49.5, + "learning_rate": 3.896944229488725e-05, + "loss": 1.0857, + "step": 256400 + }, + { + "epoch": 0.23, + "grad_norm": 14.6875, + "learning_rate": 3.896494322169633e-05, + "loss": 1.0024, + "step": 256500 + }, + { + "epoch": 0.23, + "grad_norm": 0.3359375, + "learning_rate": 3.896044414850541e-05, + "loss": 1.1698, + "step": 256600 + }, + { + "epoch": 0.23, + "grad_norm": 0.470703125, + "learning_rate": 3.8955945075314484e-05, + "loss": 1.3496, + "step": 256700 + }, + { + "epoch": 0.23, + "grad_norm": 45.0, + "learning_rate": 3.8951446002123566e-05, + "loss": 1.1962, + "step": 256800 + }, + { + "epoch": 0.23, + "grad_norm": 34.25, + "learning_rate": 3.894694692893264e-05, + "loss": 1.203, + "step": 256900 + }, + { + "epoch": 0.23, + "grad_norm": 135.0, + "learning_rate": 3.8942447855741724e-05, + "loss": 1.2846, + "step": 257000 + }, + { + "epoch": 0.23, + "grad_norm": 19.0, + "learning_rate": 3.89379487825508e-05, + "loss": 1.1652, + "step": 257100 + }, + { + "epoch": 0.23, + "grad_norm": 47.25, + "learning_rate": 3.8933449709359874e-05, + "loss": 1.1692, + "step": 257200 + }, + { + "epoch": 0.23, + "grad_norm": 33.0, + "learning_rate": 3.8928950636168956e-05, + "loss": 1.3056, + "step": 257300 + }, + { + "epoch": 0.23, + "grad_norm": 338.0, + "learning_rate": 3.8924451562978025e-05, + "loss": 1.2563, + "step": 257400 + }, + { + "epoch": 0.23, + "grad_norm": 48.75, + "learning_rate": 3.891995248978711e-05, + "loss": 1.2937, + "step": 257500 + }, + { + "epoch": 0.23, + "grad_norm": 29.625, + "learning_rate": 3.891545341659618e-05, + "loss": 1.1976, + "step": 257600 + }, + { + "epoch": 0.23, + "grad_norm": 19.375, + "learning_rate": 3.891095434340526e-05, + "loss": 1.2014, + "step": 257700 + }, + { + "epoch": 0.23, + "grad_norm": 8.6875, + "learning_rate": 3.890645527021434e-05, + "loss": 1.1279, + "step": 257800 + }, + { + "epoch": 0.23, + "grad_norm": 0.1982421875, + "learning_rate": 3.8901956197023415e-05, + "loss": 1.2667, + "step": 257900 + }, + { + "epoch": 0.23, + "grad_norm": 23.125, + "learning_rate": 3.889745712383249e-05, + "loss": 1.3891, + "step": 258000 + } + ], + "logging_steps": 100, + "max_steps": 1122566, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3000, + "total_flos": 4.1202133673908224e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}