{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3474183255149363, "eval_steps": 500, "global_step": 390000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 173.0, "learning_rate": 4.4539461963299484e-07, "loss": 2.153, "step": 100 }, { "epoch": 0.0, "grad_norm": 40.5, "learning_rate": 8.907892392659897e-07, "loss": 2.398, "step": 200 }, { "epoch": 0.0, "grad_norm": 223.0, "learning_rate": 1.3361838588989846e-06, "loss": 1.8912, "step": 300 }, { "epoch": 0.0, "grad_norm": 95.5, "learning_rate": 1.7815784785319793e-06, "loss": 1.6403, "step": 400 }, { "epoch": 0.0, "grad_norm": 600.0, "learning_rate": 2.226973098164974e-06, "loss": 1.4652, "step": 500 }, { "epoch": 0.0, "grad_norm": 51.75, "learning_rate": 2.672367717797969e-06, "loss": 1.2489, "step": 600 }, { "epoch": 0.0, "grad_norm": 290.0, "learning_rate": 3.1177623374309637e-06, "loss": 1.2421, "step": 700 }, { "epoch": 0.0, "grad_norm": 27.75, "learning_rate": 3.5631569570639587e-06, "loss": 1.2188, "step": 800 }, { "epoch": 0.0, "grad_norm": 13.0, "learning_rate": 4.008551576696954e-06, "loss": 1.123, "step": 900 }, { "epoch": 0.0, "grad_norm": 39.75, "learning_rate": 4.453946196329948e-06, "loss": 1.0388, "step": 1000 }, { "epoch": 0.0, "grad_norm": 21.375, "learning_rate": 4.899340815962943e-06, "loss": 1.3034, "step": 1100 }, { "epoch": 0.0, "grad_norm": 36.75, "learning_rate": 5.344735435595938e-06, "loss": 1.2272, "step": 1200 }, { "epoch": 0.0, "grad_norm": 71.5, "learning_rate": 5.790130055228933e-06, "loss": 1.1825, "step": 1300 }, { "epoch": 0.0, "grad_norm": 54.5, "learning_rate": 6.235524674861927e-06, "loss": 1.3466, "step": 1400 }, { "epoch": 0.0, "grad_norm": 169.0, "learning_rate": 6.680919294494922e-06, "loss": 1.0543, "step": 1500 }, { "epoch": 0.0, "grad_norm": 47.75, "learning_rate": 7.126313914127917e-06, "loss": 1.1117, "step": 1600 }, { "epoch": 0.0, "grad_norm": 9.5, "learning_rate": 7.571708533760913e-06, "loss": 1.1458, "step": 1700 }, { "epoch": 0.0, "grad_norm": 119.0, "learning_rate": 8.017103153393907e-06, "loss": 1.1823, "step": 1800 }, { "epoch": 0.0, "grad_norm": 31.0, "learning_rate": 8.462497773026902e-06, "loss": 1.2171, "step": 1900 }, { "epoch": 0.0, "grad_norm": 57.25, "learning_rate": 8.907892392659896e-06, "loss": 1.16, "step": 2000 }, { "epoch": 0.0, "grad_norm": 77.0, "learning_rate": 9.353287012292893e-06, "loss": 1.1243, "step": 2100 }, { "epoch": 0.0, "grad_norm": 31.5, "learning_rate": 9.798681631925886e-06, "loss": 1.1218, "step": 2200 }, { "epoch": 0.0, "grad_norm": 98.0, "learning_rate": 1.0244076251558882e-05, "loss": 1.1527, "step": 2300 }, { "epoch": 0.0, "grad_norm": 32.25, "learning_rate": 1.0689470871191876e-05, "loss": 1.1042, "step": 2400 }, { "epoch": 0.0, "grad_norm": 29.0, "learning_rate": 1.1134865490824871e-05, "loss": 1.0131, "step": 2500 }, { "epoch": 0.0, "grad_norm": 50.0, "learning_rate": 1.1580260110457866e-05, "loss": 1.2544, "step": 2600 }, { "epoch": 0.0, "grad_norm": 20.625, "learning_rate": 1.2025654730090862e-05, "loss": 1.1475, "step": 2700 }, { "epoch": 0.0, "grad_norm": 23.375, "learning_rate": 1.2471049349723855e-05, "loss": 1.2345, "step": 2800 }, { "epoch": 0.0, "grad_norm": 39.5, "learning_rate": 1.2916443969356851e-05, "loss": 1.269, "step": 2900 }, { "epoch": 0.0, "grad_norm": 30.5, "learning_rate": 1.3361838588989844e-05, "loss": 1.1995, "step": 3000 }, { "epoch": 0.0, "grad_norm": 17.875, "learning_rate": 1.3807233208622842e-05, "loss": 1.1367, "step": 3100 }, { "epoch": 0.0, "grad_norm": 255.0, "learning_rate": 1.4252627828255835e-05, "loss": 1.1993, "step": 3200 }, { "epoch": 0.0, "grad_norm": 73.5, "learning_rate": 1.469802244788883e-05, "loss": 1.039, "step": 3300 }, { "epoch": 0.0, "grad_norm": 106.5, "learning_rate": 1.5143417067521826e-05, "loss": 1.2538, "step": 3400 }, { "epoch": 0.0, "grad_norm": 153.0, "learning_rate": 1.558881168715482e-05, "loss": 1.156, "step": 3500 }, { "epoch": 0.0, "grad_norm": 111.0, "learning_rate": 1.6034206306787815e-05, "loss": 1.2418, "step": 3600 }, { "epoch": 0.0, "grad_norm": 20.25, "learning_rate": 1.647960092642081e-05, "loss": 1.0018, "step": 3700 }, { "epoch": 0.0, "grad_norm": 68.0, "learning_rate": 1.6924995546053804e-05, "loss": 1.101, "step": 3800 }, { "epoch": 0.0, "grad_norm": 79.5, "learning_rate": 1.7370390165686802e-05, "loss": 1.1538, "step": 3900 }, { "epoch": 0.0, "grad_norm": 26.5, "learning_rate": 1.7815784785319793e-05, "loss": 1.176, "step": 4000 }, { "epoch": 0.0, "grad_norm": 96.5, "learning_rate": 1.8261179404952788e-05, "loss": 0.9909, "step": 4100 }, { "epoch": 0.0, "grad_norm": 45.75, "learning_rate": 1.8706574024585786e-05, "loss": 1.1521, "step": 4200 }, { "epoch": 0.0, "grad_norm": 42.0, "learning_rate": 1.915196864421878e-05, "loss": 1.1438, "step": 4300 }, { "epoch": 0.0, "grad_norm": 16.125, "learning_rate": 1.959736326385177e-05, "loss": 1.1887, "step": 4400 }, { "epoch": 0.0, "grad_norm": 38.75, "learning_rate": 2.004275788348477e-05, "loss": 1.2946, "step": 4500 }, { "epoch": 0.0, "grad_norm": 99.5, "learning_rate": 2.0488152503117764e-05, "loss": 1.2574, "step": 4600 }, { "epoch": 0.0, "grad_norm": 65.5, "learning_rate": 2.093354712275076e-05, "loss": 1.1517, "step": 4700 }, { "epoch": 0.0, "grad_norm": 79.5, "learning_rate": 2.1378941742383753e-05, "loss": 1.187, "step": 4800 }, { "epoch": 0.0, "grad_norm": 162.0, "learning_rate": 2.1824336362016748e-05, "loss": 0.9974, "step": 4900 }, { "epoch": 0.0, "grad_norm": 48.5, "learning_rate": 2.2269730981649742e-05, "loss": 1.2278, "step": 5000 }, { "epoch": 0.0, "grad_norm": 0.09423828125, "learning_rate": 2.271512560128274e-05, "loss": 1.2128, "step": 5100 }, { "epoch": 0.0, "grad_norm": 34.75, "learning_rate": 2.316052022091573e-05, "loss": 1.1712, "step": 5200 }, { "epoch": 0.0, "grad_norm": 0.01483154296875, "learning_rate": 2.3605914840548726e-05, "loss": 0.9784, "step": 5300 }, { "epoch": 0.0, "grad_norm": 89.0, "learning_rate": 2.4051309460181724e-05, "loss": 1.1382, "step": 5400 }, { "epoch": 0.0, "grad_norm": 59.5, "learning_rate": 2.449670407981472e-05, "loss": 1.291, "step": 5500 }, { "epoch": 0.0, "grad_norm": 29.5, "learning_rate": 2.494209869944771e-05, "loss": 1.1317, "step": 5600 }, { "epoch": 0.01, "grad_norm": 24.125, "learning_rate": 2.5387493319080707e-05, "loss": 1.1689, "step": 5700 }, { "epoch": 0.01, "grad_norm": 34.0, "learning_rate": 2.5832887938713702e-05, "loss": 1.2427, "step": 5800 }, { "epoch": 0.01, "grad_norm": 34.5, "learning_rate": 2.6278282558346697e-05, "loss": 1.353, "step": 5900 }, { "epoch": 0.01, "grad_norm": 32.25, "learning_rate": 2.6723677177979688e-05, "loss": 1.1593, "step": 6000 }, { "epoch": 0.01, "grad_norm": 33.75, "learning_rate": 2.716907179761269e-05, "loss": 1.1391, "step": 6100 }, { "epoch": 0.01, "grad_norm": 82.0, "learning_rate": 2.7614466417245684e-05, "loss": 1.1999, "step": 6200 }, { "epoch": 0.01, "grad_norm": 25.375, "learning_rate": 2.8059861036878675e-05, "loss": 1.0865, "step": 6300 }, { "epoch": 0.01, "grad_norm": 34.25, "learning_rate": 2.850525565651167e-05, "loss": 1.0761, "step": 6400 }, { "epoch": 0.01, "grad_norm": 44.0, "learning_rate": 2.8950650276144664e-05, "loss": 1.1187, "step": 6500 }, { "epoch": 0.01, "grad_norm": 38.25, "learning_rate": 2.939604489577766e-05, "loss": 1.0824, "step": 6600 }, { "epoch": 0.01, "grad_norm": 49.75, "learning_rate": 2.9841439515410657e-05, "loss": 1.0826, "step": 6700 }, { "epoch": 0.01, "grad_norm": 47.0, "learning_rate": 3.028683413504365e-05, "loss": 1.1387, "step": 6800 }, { "epoch": 0.01, "grad_norm": 61.0, "learning_rate": 3.073222875467664e-05, "loss": 1.1661, "step": 6900 }, { "epoch": 0.01, "grad_norm": 31.625, "learning_rate": 3.117762337430964e-05, "loss": 1.2022, "step": 7000 }, { "epoch": 0.01, "grad_norm": 38.75, "learning_rate": 3.162301799394263e-05, "loss": 1.2461, "step": 7100 }, { "epoch": 0.01, "grad_norm": 28.5, "learning_rate": 3.206841261357563e-05, "loss": 1.2583, "step": 7200 }, { "epoch": 0.01, "grad_norm": 12.0, "learning_rate": 3.251380723320863e-05, "loss": 1.1094, "step": 7300 }, { "epoch": 0.01, "grad_norm": 68.0, "learning_rate": 3.295920185284162e-05, "loss": 1.2244, "step": 7400 }, { "epoch": 0.01, "grad_norm": 42.0, "learning_rate": 3.3404596472474617e-05, "loss": 1.1486, "step": 7500 }, { "epoch": 0.01, "grad_norm": 344.0, "learning_rate": 3.384999109210761e-05, "loss": 1.2049, "step": 7600 }, { "epoch": 0.01, "grad_norm": 11.6875, "learning_rate": 3.42953857117406e-05, "loss": 1.0226, "step": 7700 }, { "epoch": 0.01, "grad_norm": 43.75, "learning_rate": 3.4740780331373604e-05, "loss": 1.2662, "step": 7800 }, { "epoch": 0.01, "grad_norm": 51.5, "learning_rate": 3.5186174951006595e-05, "loss": 1.1029, "step": 7900 }, { "epoch": 0.01, "grad_norm": 36.25, "learning_rate": 3.5631569570639586e-05, "loss": 1.1623, "step": 8000 }, { "epoch": 0.01, "grad_norm": 32.75, "learning_rate": 3.6076964190272584e-05, "loss": 1.0682, "step": 8100 }, { "epoch": 0.01, "grad_norm": 57.75, "learning_rate": 3.6522358809905575e-05, "loss": 1.1232, "step": 8200 }, { "epoch": 0.01, "grad_norm": 27.125, "learning_rate": 3.696775342953857e-05, "loss": 1.1474, "step": 8300 }, { "epoch": 0.01, "grad_norm": 93.0, "learning_rate": 3.741314804917157e-05, "loss": 1.0526, "step": 8400 }, { "epoch": 0.01, "grad_norm": 20.125, "learning_rate": 3.785854266880456e-05, "loss": 1.2, "step": 8500 }, { "epoch": 0.01, "grad_norm": 46.75, "learning_rate": 3.830393728843756e-05, "loss": 1.1716, "step": 8600 }, { "epoch": 0.01, "grad_norm": 54.75, "learning_rate": 3.874933190807055e-05, "loss": 1.042, "step": 8700 }, { "epoch": 0.01, "grad_norm": 51.0, "learning_rate": 3.919472652770354e-05, "loss": 1.1756, "step": 8800 }, { "epoch": 0.01, "grad_norm": 48.25, "learning_rate": 3.964012114733654e-05, "loss": 1.2597, "step": 8900 }, { "epoch": 0.01, "grad_norm": 14.75, "learning_rate": 4.008551576696954e-05, "loss": 1.0449, "step": 9000 }, { "epoch": 0.01, "grad_norm": 18.25, "learning_rate": 4.0530910386602536e-05, "loss": 1.2163, "step": 9100 }, { "epoch": 0.01, "grad_norm": 80.0, "learning_rate": 4.097630500623553e-05, "loss": 1.2302, "step": 9200 }, { "epoch": 0.01, "grad_norm": 32.0, "learning_rate": 4.142169962586852e-05, "loss": 1.1843, "step": 9300 }, { "epoch": 0.01, "grad_norm": 44.25, "learning_rate": 4.186709424550152e-05, "loss": 1.0384, "step": 9400 }, { "epoch": 0.01, "grad_norm": 66.5, "learning_rate": 4.231248886513451e-05, "loss": 1.0315, "step": 9500 }, { "epoch": 0.01, "grad_norm": 72.5, "learning_rate": 4.2757883484767506e-05, "loss": 1.1746, "step": 9600 }, { "epoch": 0.01, "grad_norm": 37.5, "learning_rate": 4.3203278104400504e-05, "loss": 1.2093, "step": 9700 }, { "epoch": 0.01, "grad_norm": 81.5, "learning_rate": 4.3648672724033495e-05, "loss": 1.3342, "step": 9800 }, { "epoch": 0.01, "grad_norm": 88.5, "learning_rate": 4.409406734366649e-05, "loss": 1.1162, "step": 9900 }, { "epoch": 0.01, "grad_norm": 356.0, "learning_rate": 4.4539461963299484e-05, "loss": 0.8431, "step": 10000 }, { "epoch": 0.01, "grad_norm": 75.5, "learning_rate": 4.498485658293248e-05, "loss": 1.15, "step": 10100 }, { "epoch": 0.01, "grad_norm": 51.0, "learning_rate": 4.543025120256548e-05, "loss": 1.1635, "step": 10200 }, { "epoch": 0.01, "grad_norm": 136.0, "learning_rate": 4.587564582219847e-05, "loss": 1.1175, "step": 10300 }, { "epoch": 0.01, "grad_norm": 180.0, "learning_rate": 4.632104044183146e-05, "loss": 1.0931, "step": 10400 }, { "epoch": 0.01, "grad_norm": 107.5, "learning_rate": 4.676643506146446e-05, "loss": 1.3118, "step": 10500 }, { "epoch": 0.01, "grad_norm": 34.0, "learning_rate": 4.721182968109745e-05, "loss": 1.0301, "step": 10600 }, { "epoch": 0.01, "grad_norm": 33.75, "learning_rate": 4.765722430073045e-05, "loss": 0.9912, "step": 10700 }, { "epoch": 0.01, "grad_norm": 48.5, "learning_rate": 4.810261892036345e-05, "loss": 1.1649, "step": 10800 }, { "epoch": 0.01, "grad_norm": 88.5, "learning_rate": 4.854801353999644e-05, "loss": 1.4056, "step": 10900 }, { "epoch": 0.01, "grad_norm": 143.0, "learning_rate": 4.899340815962944e-05, "loss": 1.1315, "step": 11000 }, { "epoch": 0.01, "grad_norm": 26.5, "learning_rate": 4.943880277926243e-05, "loss": 1.231, "step": 11100 }, { "epoch": 0.01, "grad_norm": 47.75, "learning_rate": 4.988419739889542e-05, "loss": 1.2519, "step": 11200 }, { "epoch": 0.01, "grad_norm": 4.5625, "learning_rate": 4.999667068583872e-05, "loss": 1.1036, "step": 11300 }, { "epoch": 0.01, "grad_norm": 72.5, "learning_rate": 4.99921716126478e-05, "loss": 1.0886, "step": 11400 }, { "epoch": 0.01, "grad_norm": 102.5, "learning_rate": 4.998767253945688e-05, "loss": 1.1742, "step": 11500 }, { "epoch": 0.01, "grad_norm": 127.0, "learning_rate": 4.9983173466265954e-05, "loss": 1.2533, "step": 11600 }, { "epoch": 0.01, "grad_norm": 221.0, "learning_rate": 4.997867439307503e-05, "loss": 1.1356, "step": 11700 }, { "epoch": 0.01, "grad_norm": 132.0, "learning_rate": 4.997417531988411e-05, "loss": 1.0534, "step": 11800 }, { "epoch": 0.01, "grad_norm": 30.625, "learning_rate": 4.996967624669318e-05, "loss": 0.936, "step": 11900 }, { "epoch": 0.01, "grad_norm": 56.5, "learning_rate": 4.9965177173502256e-05, "loss": 1.0134, "step": 12000 }, { "epoch": 0.01, "grad_norm": 37.75, "learning_rate": 4.996067810031134e-05, "loss": 1.1623, "step": 12100 }, { "epoch": 0.01, "grad_norm": 175.0, "learning_rate": 4.995617902712041e-05, "loss": 1.0928, "step": 12200 }, { "epoch": 0.01, "grad_norm": 44.5, "learning_rate": 4.9951679953929495e-05, "loss": 1.2636, "step": 12300 }, { "epoch": 0.01, "grad_norm": 20.625, "learning_rate": 4.994718088073857e-05, "loss": 1.07, "step": 12400 }, { "epoch": 0.01, "grad_norm": 28.75, "learning_rate": 4.9942681807547646e-05, "loss": 0.9899, "step": 12500 }, { "epoch": 0.01, "grad_norm": 132.0, "learning_rate": 4.993818273435673e-05, "loss": 1.1851, "step": 12600 }, { "epoch": 0.01, "grad_norm": 133.0, "learning_rate": 4.9933683661165803e-05, "loss": 1.0886, "step": 12700 }, { "epoch": 0.01, "grad_norm": 84.0, "learning_rate": 4.992918458797488e-05, "loss": 1.0992, "step": 12800 }, { "epoch": 0.01, "grad_norm": 0.48046875, "learning_rate": 4.992468551478396e-05, "loss": 1.1081, "step": 12900 }, { "epoch": 0.01, "grad_norm": 33.0, "learning_rate": 4.9920186441593036e-05, "loss": 1.3126, "step": 13000 }, { "epoch": 0.01, "grad_norm": 27.5, "learning_rate": 4.991568736840211e-05, "loss": 1.1685, "step": 13100 }, { "epoch": 0.01, "grad_norm": 53.25, "learning_rate": 4.991118829521119e-05, "loss": 1.118, "step": 13200 }, { "epoch": 0.01, "grad_norm": 410.0, "learning_rate": 4.990668922202026e-05, "loss": 1.0968, "step": 13300 }, { "epoch": 0.01, "grad_norm": 56.25, "learning_rate": 4.9902190148829344e-05, "loss": 1.1793, "step": 13400 }, { "epoch": 0.01, "grad_norm": 25.75, "learning_rate": 4.989769107563842e-05, "loss": 1.1671, "step": 13500 }, { "epoch": 0.01, "grad_norm": 26.25, "learning_rate": 4.9893192002447495e-05, "loss": 1.1825, "step": 13600 }, { "epoch": 0.01, "grad_norm": 105.5, "learning_rate": 4.988869292925658e-05, "loss": 1.1509, "step": 13700 }, { "epoch": 0.01, "grad_norm": 79.5, "learning_rate": 4.988419385606565e-05, "loss": 1.1337, "step": 13800 }, { "epoch": 0.01, "grad_norm": 36.75, "learning_rate": 4.987969478287473e-05, "loss": 1.0261, "step": 13900 }, { "epoch": 0.01, "grad_norm": 48.75, "learning_rate": 4.987519570968381e-05, "loss": 1.2461, "step": 14000 }, { "epoch": 0.01, "grad_norm": 52.75, "learning_rate": 4.9870696636492885e-05, "loss": 1.1742, "step": 14100 }, { "epoch": 0.01, "grad_norm": 236.0, "learning_rate": 4.986619756330197e-05, "loss": 0.9742, "step": 14200 }, { "epoch": 0.01, "grad_norm": 402.0, "learning_rate": 4.986169849011104e-05, "loss": 1.2615, "step": 14300 }, { "epoch": 0.01, "grad_norm": 120.0, "learning_rate": 4.985719941692012e-05, "loss": 1.3339, "step": 14400 }, { "epoch": 0.01, "grad_norm": 66.0, "learning_rate": 4.9852700343729193e-05, "loss": 1.2984, "step": 14500 }, { "epoch": 0.01, "grad_norm": 37.5, "learning_rate": 4.984820127053827e-05, "loss": 0.9931, "step": 14600 }, { "epoch": 0.01, "grad_norm": 26.625, "learning_rate": 4.9843702197347344e-05, "loss": 1.1177, "step": 14700 }, { "epoch": 0.01, "grad_norm": 143.0, "learning_rate": 4.9839203124156426e-05, "loss": 1.1625, "step": 14800 }, { "epoch": 0.01, "grad_norm": 58.75, "learning_rate": 4.98347040509655e-05, "loss": 1.2321, "step": 14900 }, { "epoch": 0.01, "grad_norm": 27.625, "learning_rate": 4.9830204977774584e-05, "loss": 1.2545, "step": 15000 }, { "epoch": 0.01, "grad_norm": 40.75, "learning_rate": 4.982570590458366e-05, "loss": 1.1757, "step": 15100 }, { "epoch": 0.01, "grad_norm": 44.5, "learning_rate": 4.9821206831392734e-05, "loss": 1.1684, "step": 15200 }, { "epoch": 0.01, "grad_norm": 31.0, "learning_rate": 4.9816707758201816e-05, "loss": 1.2152, "step": 15300 }, { "epoch": 0.01, "grad_norm": 56.5, "learning_rate": 4.981220868501089e-05, "loss": 1.2536, "step": 15400 }, { "epoch": 0.01, "grad_norm": 93.0, "learning_rate": 4.980770961181997e-05, "loss": 1.2398, "step": 15500 }, { "epoch": 0.01, "grad_norm": 63.5, "learning_rate": 4.980321053862905e-05, "loss": 1.0495, "step": 15600 }, { "epoch": 0.01, "grad_norm": 294.0, "learning_rate": 4.9798711465438125e-05, "loss": 1.2303, "step": 15700 }, { "epoch": 0.01, "grad_norm": 97.0, "learning_rate": 4.97942123922472e-05, "loss": 1.0398, "step": 15800 }, { "epoch": 0.01, "grad_norm": 30.125, "learning_rate": 4.9789713319056275e-05, "loss": 1.2357, "step": 15900 }, { "epoch": 0.01, "grad_norm": 64.0, "learning_rate": 4.978521424586535e-05, "loss": 1.1963, "step": 16000 }, { "epoch": 0.01, "grad_norm": 69.0, "learning_rate": 4.978071517267443e-05, "loss": 1.1217, "step": 16100 }, { "epoch": 0.01, "grad_norm": 73.0, "learning_rate": 4.977621609948351e-05, "loss": 1.3172, "step": 16200 }, { "epoch": 0.01, "grad_norm": 6.4375, "learning_rate": 4.977171702629258e-05, "loss": 1.0252, "step": 16300 }, { "epoch": 0.01, "grad_norm": 13.5625, "learning_rate": 4.9767217953101666e-05, "loss": 1.0254, "step": 16400 }, { "epoch": 0.01, "grad_norm": 28.375, "learning_rate": 4.976271887991074e-05, "loss": 1.116, "step": 16500 }, { "epoch": 0.01, "grad_norm": 664.0, "learning_rate": 4.9758219806719816e-05, "loss": 1.1861, "step": 16600 }, { "epoch": 0.01, "grad_norm": 157.0, "learning_rate": 4.97537207335289e-05, "loss": 1.1762, "step": 16700 }, { "epoch": 0.01, "grad_norm": 42.75, "learning_rate": 4.9749221660337974e-05, "loss": 1.2372, "step": 16800 }, { "epoch": 0.02, "grad_norm": 2.578125, "learning_rate": 4.974472258714705e-05, "loss": 1.1629, "step": 16900 }, { "epoch": 0.02, "grad_norm": 196.0, "learning_rate": 4.974022351395613e-05, "loss": 1.1746, "step": 17000 }, { "epoch": 0.02, "grad_norm": 43.5, "learning_rate": 4.97357244407652e-05, "loss": 1.1731, "step": 17100 }, { "epoch": 0.02, "grad_norm": 22.375, "learning_rate": 4.973122536757428e-05, "loss": 1.0003, "step": 17200 }, { "epoch": 0.02, "grad_norm": 44.75, "learning_rate": 4.972672629438336e-05, "loss": 1.0736, "step": 17300 }, { "epoch": 0.02, "grad_norm": 2.5625, "learning_rate": 4.972222722119243e-05, "loss": 1.2785, "step": 17400 }, { "epoch": 0.02, "grad_norm": 31.5, "learning_rate": 4.9717728148001515e-05, "loss": 1.0953, "step": 17500 }, { "epoch": 0.02, "grad_norm": 83.5, "learning_rate": 4.971322907481059e-05, "loss": 1.1549, "step": 17600 }, { "epoch": 0.02, "grad_norm": 54.0, "learning_rate": 4.970873000161967e-05, "loss": 1.3649, "step": 17700 }, { "epoch": 0.02, "grad_norm": 182.0, "learning_rate": 4.970423092842875e-05, "loss": 1.109, "step": 17800 }, { "epoch": 0.02, "grad_norm": 233.0, "learning_rate": 4.969973185523782e-05, "loss": 1.1443, "step": 17900 }, { "epoch": 0.02, "grad_norm": 65.5, "learning_rate": 4.9695232782046905e-05, "loss": 1.1159, "step": 18000 }, { "epoch": 0.02, "grad_norm": 494.0, "learning_rate": 4.969073370885598e-05, "loss": 1.1072, "step": 18100 }, { "epoch": 0.02, "grad_norm": 105.0, "learning_rate": 4.9686234635665055e-05, "loss": 1.0828, "step": 18200 }, { "epoch": 0.02, "grad_norm": 32.0, "learning_rate": 4.968173556247414e-05, "loss": 1.092, "step": 18300 }, { "epoch": 0.02, "grad_norm": 40.5, "learning_rate": 4.9677236489283206e-05, "loss": 1.093, "step": 18400 }, { "epoch": 0.02, "grad_norm": 20.0, "learning_rate": 4.967273741609229e-05, "loss": 1.283, "step": 18500 }, { "epoch": 0.02, "grad_norm": 69.0, "learning_rate": 4.9668238342901364e-05, "loss": 1.2988, "step": 18600 }, { "epoch": 0.02, "grad_norm": 102.5, "learning_rate": 4.966373926971044e-05, "loss": 1.2539, "step": 18700 }, { "epoch": 0.02, "grad_norm": 54.5, "learning_rate": 4.965924019651952e-05, "loss": 0.9519, "step": 18800 }, { "epoch": 0.02, "grad_norm": 52.0, "learning_rate": 4.9654741123328596e-05, "loss": 1.0343, "step": 18900 }, { "epoch": 0.02, "grad_norm": 91.0, "learning_rate": 4.965024205013767e-05, "loss": 1.0141, "step": 19000 }, { "epoch": 0.02, "grad_norm": 18.5, "learning_rate": 4.9645742976946754e-05, "loss": 1.0664, "step": 19100 }, { "epoch": 0.02, "grad_norm": 82.0, "learning_rate": 4.964124390375583e-05, "loss": 1.1607, "step": 19200 }, { "epoch": 0.02, "grad_norm": 133.0, "learning_rate": 4.9636744830564905e-05, "loss": 1.0688, "step": 19300 }, { "epoch": 0.02, "grad_norm": 23.5, "learning_rate": 4.963224575737399e-05, "loss": 1.006, "step": 19400 }, { "epoch": 0.02, "grad_norm": 29.125, "learning_rate": 4.962774668418306e-05, "loss": 1.2201, "step": 19500 }, { "epoch": 0.02, "grad_norm": 214.0, "learning_rate": 4.962324761099214e-05, "loss": 1.1192, "step": 19600 }, { "epoch": 0.02, "grad_norm": 34.75, "learning_rate": 4.961874853780121e-05, "loss": 1.1362, "step": 19700 }, { "epoch": 0.02, "grad_norm": 156.0, "learning_rate": 4.961424946461029e-05, "loss": 1.1204, "step": 19800 }, { "epoch": 0.02, "grad_norm": 157.0, "learning_rate": 4.960975039141937e-05, "loss": 1.0584, "step": 19900 }, { "epoch": 0.02, "grad_norm": 37.5, "learning_rate": 4.9605251318228445e-05, "loss": 1.2305, "step": 20000 }, { "epoch": 0.02, "grad_norm": 127.0, "learning_rate": 4.960075224503752e-05, "loss": 1.0515, "step": 20100 }, { "epoch": 0.02, "grad_norm": 126.5, "learning_rate": 4.95962531718466e-05, "loss": 1.1291, "step": 20200 }, { "epoch": 0.02, "grad_norm": 20.75, "learning_rate": 4.959175409865568e-05, "loss": 1.2646, "step": 20300 }, { "epoch": 0.02, "grad_norm": 37.25, "learning_rate": 4.958725502546476e-05, "loss": 1.0123, "step": 20400 }, { "epoch": 0.02, "grad_norm": 39.25, "learning_rate": 4.9582755952273836e-05, "loss": 1.2387, "step": 20500 }, { "epoch": 0.02, "grad_norm": 103.5, "learning_rate": 4.957825687908291e-05, "loss": 1.097, "step": 20600 }, { "epoch": 0.02, "grad_norm": 12.25, "learning_rate": 4.957375780589199e-05, "loss": 0.9842, "step": 20700 }, { "epoch": 0.02, "grad_norm": 41.75, "learning_rate": 4.956925873270107e-05, "loss": 1.018, "step": 20800 }, { "epoch": 0.02, "grad_norm": 20.5, "learning_rate": 4.9564759659510144e-05, "loss": 1.0691, "step": 20900 }, { "epoch": 0.02, "grad_norm": 72.0, "learning_rate": 4.956026058631922e-05, "loss": 1.1096, "step": 21000 }, { "epoch": 0.02, "grad_norm": 0.0006103515625, "learning_rate": 4.9555761513128295e-05, "loss": 1.2363, "step": 21100 }, { "epoch": 0.02, "grad_norm": 37.75, "learning_rate": 4.955126243993738e-05, "loss": 1.1209, "step": 21200 }, { "epoch": 0.02, "grad_norm": 43.25, "learning_rate": 4.954676336674645e-05, "loss": 1.3024, "step": 21300 }, { "epoch": 0.02, "grad_norm": 2.125, "learning_rate": 4.954226429355553e-05, "loss": 1.1737, "step": 21400 }, { "epoch": 0.02, "grad_norm": 203.0, "learning_rate": 4.953776522036461e-05, "loss": 1.0662, "step": 21500 }, { "epoch": 0.02, "grad_norm": 22.0, "learning_rate": 4.9533266147173685e-05, "loss": 1.0818, "step": 21600 }, { "epoch": 0.02, "grad_norm": 15.3125, "learning_rate": 4.952876707398276e-05, "loss": 1.1323, "step": 21700 }, { "epoch": 0.02, "grad_norm": 6.59375, "learning_rate": 4.952426800079184e-05, "loss": 0.9627, "step": 21800 }, { "epoch": 0.02, "grad_norm": 145.0, "learning_rate": 4.951976892760092e-05, "loss": 1.1693, "step": 21900 }, { "epoch": 0.02, "grad_norm": 65.5, "learning_rate": 4.951526985440999e-05, "loss": 1.2508, "step": 22000 }, { "epoch": 0.02, "grad_norm": 26.875, "learning_rate": 4.9510770781219075e-05, "loss": 0.9449, "step": 22100 }, { "epoch": 0.02, "grad_norm": 23.625, "learning_rate": 4.950627170802815e-05, "loss": 0.9817, "step": 22200 }, { "epoch": 0.02, "grad_norm": 83.0, "learning_rate": 4.9501772634837226e-05, "loss": 1.1505, "step": 22300 }, { "epoch": 0.02, "grad_norm": 33.0, "learning_rate": 4.94972735616463e-05, "loss": 1.0348, "step": 22400 }, { "epoch": 0.02, "grad_norm": 77.5, "learning_rate": 4.9492774488455376e-05, "loss": 1.1222, "step": 22500 }, { "epoch": 0.02, "grad_norm": 40.25, "learning_rate": 4.948827541526446e-05, "loss": 1.2024, "step": 22600 }, { "epoch": 0.02, "grad_norm": 103.5, "learning_rate": 4.9483776342073534e-05, "loss": 1.1287, "step": 22700 }, { "epoch": 0.02, "grad_norm": 45.25, "learning_rate": 4.947927726888261e-05, "loss": 1.1186, "step": 22800 }, { "epoch": 0.02, "grad_norm": 14.625, "learning_rate": 4.947477819569169e-05, "loss": 1.0359, "step": 22900 }, { "epoch": 0.02, "grad_norm": 36.25, "learning_rate": 4.947027912250077e-05, "loss": 0.9546, "step": 23000 }, { "epoch": 0.02, "grad_norm": 73.5, "learning_rate": 4.946578004930985e-05, "loss": 1.2416, "step": 23100 }, { "epoch": 0.02, "grad_norm": 33.0, "learning_rate": 4.9461280976118924e-05, "loss": 1.0936, "step": 23200 }, { "epoch": 0.02, "grad_norm": 74.5, "learning_rate": 4.9456781902928e-05, "loss": 1.1805, "step": 23300 }, { "epoch": 0.02, "grad_norm": 4.90625, "learning_rate": 4.945228282973708e-05, "loss": 1.1044, "step": 23400 }, { "epoch": 0.02, "grad_norm": 22.375, "learning_rate": 4.944778375654616e-05, "loss": 1.1009, "step": 23500 }, { "epoch": 0.02, "grad_norm": 18.125, "learning_rate": 4.9443284683355225e-05, "loss": 1.1798, "step": 23600 }, { "epoch": 0.02, "grad_norm": 72.0, "learning_rate": 4.943878561016431e-05, "loss": 1.0636, "step": 23700 }, { "epoch": 0.02, "grad_norm": 13.5625, "learning_rate": 4.943428653697338e-05, "loss": 1.2526, "step": 23800 }, { "epoch": 0.02, "grad_norm": 61.75, "learning_rate": 4.9429787463782465e-05, "loss": 1.2469, "step": 23900 }, { "epoch": 0.02, "grad_norm": 48.75, "learning_rate": 4.942528839059154e-05, "loss": 1.1861, "step": 24000 }, { "epoch": 0.02, "grad_norm": 216.0, "learning_rate": 4.9420789317400616e-05, "loss": 1.1375, "step": 24100 }, { "epoch": 0.02, "grad_norm": 20.625, "learning_rate": 4.94162902442097e-05, "loss": 1.1822, "step": 24200 }, { "epoch": 0.02, "grad_norm": 26.875, "learning_rate": 4.941179117101877e-05, "loss": 1.1196, "step": 24300 }, { "epoch": 0.02, "grad_norm": 39.0, "learning_rate": 4.940729209782785e-05, "loss": 1.1318, "step": 24400 }, { "epoch": 0.02, "grad_norm": 24.625, "learning_rate": 4.940279302463693e-05, "loss": 1.0661, "step": 24500 }, { "epoch": 0.02, "grad_norm": 52.75, "learning_rate": 4.9398293951446006e-05, "loss": 1.0103, "step": 24600 }, { "epoch": 0.02, "grad_norm": 124.5, "learning_rate": 4.939379487825508e-05, "loss": 1.1614, "step": 24700 }, { "epoch": 0.02, "grad_norm": 14.1875, "learning_rate": 4.9389295805064163e-05, "loss": 1.0209, "step": 24800 }, { "epoch": 0.02, "grad_norm": 47.0, "learning_rate": 4.938479673187323e-05, "loss": 1.1871, "step": 24900 }, { "epoch": 0.02, "grad_norm": 278.0, "learning_rate": 4.9380297658682314e-05, "loss": 1.0667, "step": 25000 }, { "epoch": 0.02, "grad_norm": 2.359375, "learning_rate": 4.937579858549139e-05, "loss": 1.0638, "step": 25100 }, { "epoch": 0.02, "grad_norm": 37.75, "learning_rate": 4.9371299512300465e-05, "loss": 0.8903, "step": 25200 }, { "epoch": 0.02, "grad_norm": 464.0, "learning_rate": 4.936680043910955e-05, "loss": 1.0502, "step": 25300 }, { "epoch": 0.02, "grad_norm": 38.5, "learning_rate": 4.936230136591862e-05, "loss": 1.1984, "step": 25400 }, { "epoch": 0.02, "grad_norm": 32.5, "learning_rate": 4.93578022927277e-05, "loss": 1.2607, "step": 25500 }, { "epoch": 0.02, "grad_norm": 48.5, "learning_rate": 4.935330321953678e-05, "loss": 1.058, "step": 25600 }, { "epoch": 0.02, "grad_norm": 59.5, "learning_rate": 4.9348804146345855e-05, "loss": 1.0378, "step": 25700 }, { "epoch": 0.02, "grad_norm": 512.0, "learning_rate": 4.934430507315494e-05, "loss": 1.0418, "step": 25800 }, { "epoch": 0.02, "grad_norm": 47.25, "learning_rate": 4.933980599996401e-05, "loss": 1.1618, "step": 25900 }, { "epoch": 0.02, "grad_norm": 44.75, "learning_rate": 4.933530692677309e-05, "loss": 1.0471, "step": 26000 }, { "epoch": 0.02, "grad_norm": 23.125, "learning_rate": 4.933080785358217e-05, "loss": 1.0217, "step": 26100 }, { "epoch": 0.02, "grad_norm": 112.0, "learning_rate": 4.932630878039124e-05, "loss": 1.0314, "step": 26200 }, { "epoch": 0.02, "grad_norm": 36.25, "learning_rate": 4.9321809707200314e-05, "loss": 1.1879, "step": 26300 }, { "epoch": 0.02, "grad_norm": 36.25, "learning_rate": 4.9317310634009396e-05, "loss": 1.0381, "step": 26400 }, { "epoch": 0.02, "grad_norm": 29.625, "learning_rate": 4.931281156081847e-05, "loss": 1.1444, "step": 26500 }, { "epoch": 0.02, "grad_norm": 37.25, "learning_rate": 4.9308312487627553e-05, "loss": 0.9723, "step": 26600 }, { "epoch": 0.02, "grad_norm": 65.0, "learning_rate": 4.930381341443663e-05, "loss": 1.13, "step": 26700 }, { "epoch": 0.02, "grad_norm": 23.875, "learning_rate": 4.9299314341245704e-05, "loss": 0.9686, "step": 26800 }, { "epoch": 0.02, "grad_norm": 68.0, "learning_rate": 4.9294815268054786e-05, "loss": 1.0283, "step": 26900 }, { "epoch": 0.02, "grad_norm": 42.25, "learning_rate": 4.929031619486386e-05, "loss": 1.1077, "step": 27000 }, { "epoch": 0.02, "grad_norm": 38.25, "learning_rate": 4.928581712167294e-05, "loss": 1.1153, "step": 27100 }, { "epoch": 0.02, "grad_norm": 28.5, "learning_rate": 4.928131804848202e-05, "loss": 1.1458, "step": 27200 }, { "epoch": 0.02, "grad_norm": 352.0, "learning_rate": 4.9276818975291094e-05, "loss": 1.0234, "step": 27300 }, { "epoch": 0.02, "grad_norm": 58.25, "learning_rate": 4.927231990210017e-05, "loss": 1.1267, "step": 27400 }, { "epoch": 0.02, "grad_norm": 326.0, "learning_rate": 4.9267820828909245e-05, "loss": 1.0168, "step": 27500 }, { "epoch": 0.02, "grad_norm": 42.25, "learning_rate": 4.926332175571832e-05, "loss": 1.0986, "step": 27600 }, { "epoch": 0.02, "grad_norm": 81.0, "learning_rate": 4.92588226825274e-05, "loss": 0.9895, "step": 27700 }, { "epoch": 0.02, "grad_norm": 160.0, "learning_rate": 4.925432360933648e-05, "loss": 1.1471, "step": 27800 }, { "epoch": 0.02, "grad_norm": 2.578125, "learning_rate": 4.924982453614555e-05, "loss": 1.0556, "step": 27900 }, { "epoch": 0.02, "grad_norm": 2.421875, "learning_rate": 4.9245325462954635e-05, "loss": 1.0177, "step": 28000 }, { "epoch": 0.03, "grad_norm": 82.5, "learning_rate": 4.924082638976371e-05, "loss": 1.1341, "step": 28100 }, { "epoch": 0.03, "grad_norm": 98.5, "learning_rate": 4.9236327316572786e-05, "loss": 1.2145, "step": 28200 }, { "epoch": 0.03, "grad_norm": 0.05224609375, "learning_rate": 4.923182824338187e-05, "loss": 1.0562, "step": 28300 }, { "epoch": 0.03, "grad_norm": 43.0, "learning_rate": 4.9227329170190943e-05, "loss": 1.2268, "step": 28400 }, { "epoch": 0.03, "grad_norm": 59.25, "learning_rate": 4.9222830097000026e-05, "loss": 1.1499, "step": 28500 }, { "epoch": 0.03, "grad_norm": 592.0, "learning_rate": 4.92183310238091e-05, "loss": 1.215, "step": 28600 }, { "epoch": 0.03, "grad_norm": 16.75, "learning_rate": 4.9213831950618176e-05, "loss": 1.0979, "step": 28700 }, { "epoch": 0.03, "grad_norm": 38.25, "learning_rate": 4.920933287742725e-05, "loss": 1.128, "step": 28800 }, { "epoch": 0.03, "grad_norm": 62.5, "learning_rate": 4.920483380423633e-05, "loss": 1.046, "step": 28900 }, { "epoch": 0.03, "grad_norm": 2.703125, "learning_rate": 4.92003347310454e-05, "loss": 1.063, "step": 29000 }, { "epoch": 0.03, "grad_norm": 7.875, "learning_rate": 4.9195835657854484e-05, "loss": 1.0012, "step": 29100 }, { "epoch": 0.03, "grad_norm": 19.25, "learning_rate": 4.919133658466356e-05, "loss": 1.0919, "step": 29200 }, { "epoch": 0.03, "grad_norm": 14.6875, "learning_rate": 4.918683751147264e-05, "loss": 0.9316, "step": 29300 }, { "epoch": 0.03, "grad_norm": 50.25, "learning_rate": 4.918233843828172e-05, "loss": 1.0796, "step": 29400 }, { "epoch": 0.03, "grad_norm": 0.01422119140625, "learning_rate": 4.917783936509079e-05, "loss": 1.0421, "step": 29500 }, { "epoch": 0.03, "grad_norm": 20.75, "learning_rate": 4.9173340291899875e-05, "loss": 1.1928, "step": 29600 }, { "epoch": 0.03, "grad_norm": 13.25, "learning_rate": 4.916884121870895e-05, "loss": 1.1847, "step": 29700 }, { "epoch": 0.03, "grad_norm": 12.9375, "learning_rate": 4.9164342145518025e-05, "loss": 1.1563, "step": 29800 }, { "epoch": 0.03, "grad_norm": 32.5, "learning_rate": 4.915984307232711e-05, "loss": 1.179, "step": 29900 }, { "epoch": 0.03, "grad_norm": 44.0, "learning_rate": 4.915534399913618e-05, "loss": 1.1422, "step": 30000 }, { "epoch": 0.03, "grad_norm": 46.75, "learning_rate": 4.915084492594526e-05, "loss": 1.0485, "step": 30100 }, { "epoch": 0.03, "grad_norm": 88.0, "learning_rate": 4.9146345852754333e-05, "loss": 1.0639, "step": 30200 }, { "epoch": 0.03, "grad_norm": 31.5, "learning_rate": 4.914184677956341e-05, "loss": 1.1275, "step": 30300 }, { "epoch": 0.03, "grad_norm": 65.5, "learning_rate": 4.913734770637249e-05, "loss": 1.1221, "step": 30400 }, { "epoch": 0.03, "grad_norm": 84.0, "learning_rate": 4.9132848633181566e-05, "loss": 1.1461, "step": 30500 }, { "epoch": 0.03, "grad_norm": 0.09326171875, "learning_rate": 4.912834955999064e-05, "loss": 1.0183, "step": 30600 }, { "epoch": 0.03, "grad_norm": 0.7578125, "learning_rate": 4.9123850486799724e-05, "loss": 1.1438, "step": 30700 }, { "epoch": 0.03, "grad_norm": 43.25, "learning_rate": 4.91193514136088e-05, "loss": 0.9956, "step": 30800 }, { "epoch": 0.03, "grad_norm": 120.0, "learning_rate": 4.9114852340417874e-05, "loss": 1.1497, "step": 30900 }, { "epoch": 0.03, "grad_norm": 29.625, "learning_rate": 4.9110353267226956e-05, "loss": 1.1515, "step": 31000 }, { "epoch": 0.03, "grad_norm": 13.5625, "learning_rate": 4.910585419403603e-05, "loss": 1.0692, "step": 31100 }, { "epoch": 0.03, "grad_norm": 113.0, "learning_rate": 4.9101355120845114e-05, "loss": 1.1573, "step": 31200 }, { "epoch": 0.03, "grad_norm": 14.125, "learning_rate": 4.909685604765419e-05, "loss": 1.0614, "step": 31300 }, { "epoch": 0.03, "grad_norm": 2.359375, "learning_rate": 4.909235697446326e-05, "loss": 1.1084, "step": 31400 }, { "epoch": 0.03, "grad_norm": 0.66796875, "learning_rate": 4.908785790127234e-05, "loss": 0.9684, "step": 31500 }, { "epoch": 0.03, "grad_norm": 36.0, "learning_rate": 4.9083358828081415e-05, "loss": 0.9898, "step": 31600 }, { "epoch": 0.03, "grad_norm": 29.625, "learning_rate": 4.907885975489049e-05, "loss": 1.0407, "step": 31700 }, { "epoch": 0.03, "grad_norm": 12.3125, "learning_rate": 4.907436068169957e-05, "loss": 1.1401, "step": 31800 }, { "epoch": 0.03, "grad_norm": 2.171875, "learning_rate": 4.906986160850865e-05, "loss": 0.9899, "step": 31900 }, { "epoch": 0.03, "grad_norm": 63.25, "learning_rate": 4.906536253531773e-05, "loss": 1.1188, "step": 32000 }, { "epoch": 0.03, "grad_norm": 116.5, "learning_rate": 4.9060863462126806e-05, "loss": 1.1581, "step": 32100 }, { "epoch": 0.03, "grad_norm": 5.1875, "learning_rate": 4.905636438893588e-05, "loss": 1.1116, "step": 32200 }, { "epoch": 0.03, "grad_norm": 71.0, "learning_rate": 4.905186531574496e-05, "loss": 1.0821, "step": 32300 }, { "epoch": 0.03, "grad_norm": 24.875, "learning_rate": 4.904736624255404e-05, "loss": 1.0241, "step": 32400 }, { "epoch": 0.03, "grad_norm": 71.0, "learning_rate": 4.9042867169363114e-05, "loss": 0.9884, "step": 32500 }, { "epoch": 0.03, "grad_norm": 14.25, "learning_rate": 4.9038368096172196e-05, "loss": 1.0877, "step": 32600 }, { "epoch": 0.03, "grad_norm": 14.0625, "learning_rate": 4.9033869022981264e-05, "loss": 1.1312, "step": 32700 }, { "epoch": 0.03, "grad_norm": 28.25, "learning_rate": 4.9029369949790346e-05, "loss": 1.1194, "step": 32800 }, { "epoch": 0.03, "grad_norm": 72.5, "learning_rate": 4.902487087659942e-05, "loss": 1.2167, "step": 32900 }, { "epoch": 0.03, "grad_norm": 60.5, "learning_rate": 4.90203718034085e-05, "loss": 1.0633, "step": 33000 }, { "epoch": 0.03, "grad_norm": 62.25, "learning_rate": 4.901587273021758e-05, "loss": 1.1538, "step": 33100 }, { "epoch": 0.03, "grad_norm": 0.1943359375, "learning_rate": 4.9011373657026655e-05, "loss": 0.9982, "step": 33200 }, { "epoch": 0.03, "grad_norm": 45.25, "learning_rate": 4.900687458383573e-05, "loss": 1.1433, "step": 33300 }, { "epoch": 0.03, "grad_norm": 548.0, "learning_rate": 4.900237551064481e-05, "loss": 1.1228, "step": 33400 }, { "epoch": 0.03, "grad_norm": 24.125, "learning_rate": 4.899787643745389e-05, "loss": 1.0949, "step": 33500 }, { "epoch": 0.03, "grad_norm": 29.625, "learning_rate": 4.899337736426296e-05, "loss": 0.9741, "step": 33600 }, { "epoch": 0.03, "grad_norm": 68.5, "learning_rate": 4.8988878291072045e-05, "loss": 1.1313, "step": 33700 }, { "epoch": 0.03, "grad_norm": 24.125, "learning_rate": 4.898437921788112e-05, "loss": 0.9327, "step": 33800 }, { "epoch": 0.03, "grad_norm": 30.625, "learning_rate": 4.8979880144690196e-05, "loss": 1.0312, "step": 33900 }, { "epoch": 0.03, "grad_norm": 28.0, "learning_rate": 4.897538107149927e-05, "loss": 1.0795, "step": 34000 }, { "epoch": 0.03, "grad_norm": 0.23828125, "learning_rate": 4.8970881998308346e-05, "loss": 1.028, "step": 34100 }, { "epoch": 0.03, "grad_norm": 336.0, "learning_rate": 4.896638292511743e-05, "loss": 0.9252, "step": 34200 }, { "epoch": 0.03, "grad_norm": 59.5, "learning_rate": 4.8961883851926504e-05, "loss": 1.0244, "step": 34300 }, { "epoch": 0.03, "grad_norm": 91.5, "learning_rate": 4.895738477873558e-05, "loss": 1.2062, "step": 34400 }, { "epoch": 0.03, "grad_norm": 24.25, "learning_rate": 4.895288570554466e-05, "loss": 1.016, "step": 34500 }, { "epoch": 0.03, "grad_norm": 326.0, "learning_rate": 4.8948386632353736e-05, "loss": 0.946, "step": 34600 }, { "epoch": 0.03, "grad_norm": 18.75, "learning_rate": 4.894388755916282e-05, "loss": 1.0384, "step": 34700 }, { "epoch": 0.03, "grad_norm": 0.361328125, "learning_rate": 4.8939388485971894e-05, "loss": 1.0267, "step": 34800 }, { "epoch": 0.03, "grad_norm": 26.75, "learning_rate": 4.893488941278097e-05, "loss": 1.0807, "step": 34900 }, { "epoch": 0.03, "grad_norm": 72.0, "learning_rate": 4.893039033959005e-05, "loss": 1.0735, "step": 35000 }, { "epoch": 0.03, "grad_norm": 5.875, "learning_rate": 4.892589126639913e-05, "loss": 1.2151, "step": 35100 }, { "epoch": 0.03, "grad_norm": 170.0, "learning_rate": 4.89213921932082e-05, "loss": 1.189, "step": 35200 }, { "epoch": 0.03, "grad_norm": 56.75, "learning_rate": 4.891689312001728e-05, "loss": 1.1221, "step": 35300 }, { "epoch": 0.03, "grad_norm": 63.75, "learning_rate": 4.891239404682635e-05, "loss": 1.1452, "step": 35400 }, { "epoch": 0.03, "grad_norm": 39.75, "learning_rate": 4.8907894973635435e-05, "loss": 1.2071, "step": 35500 }, { "epoch": 0.03, "grad_norm": 48.0, "learning_rate": 4.890339590044451e-05, "loss": 1.1314, "step": 35600 }, { "epoch": 0.03, "grad_norm": 17.5, "learning_rate": 4.8898896827253586e-05, "loss": 1.1872, "step": 35700 }, { "epoch": 0.03, "grad_norm": 222.0, "learning_rate": 4.889439775406267e-05, "loss": 1.1534, "step": 35800 }, { "epoch": 0.03, "grad_norm": 13.25, "learning_rate": 4.888989868087174e-05, "loss": 1.1041, "step": 35900 }, { "epoch": 0.03, "grad_norm": 79.0, "learning_rate": 4.888539960768082e-05, "loss": 1.0419, "step": 36000 }, { "epoch": 0.03, "grad_norm": 35.75, "learning_rate": 4.88809005344899e-05, "loss": 1.1561, "step": 36100 }, { "epoch": 0.03, "grad_norm": 19.0, "learning_rate": 4.8876401461298976e-05, "loss": 1.0917, "step": 36200 }, { "epoch": 0.03, "grad_norm": 45.5, "learning_rate": 4.887190238810805e-05, "loss": 1.1914, "step": 36300 }, { "epoch": 0.03, "grad_norm": 20.0, "learning_rate": 4.886740331491713e-05, "loss": 1.0026, "step": 36400 }, { "epoch": 0.03, "grad_norm": 20.125, "learning_rate": 4.886290424172621e-05, "loss": 1.1217, "step": 36500 }, { "epoch": 0.03, "grad_norm": 31.125, "learning_rate": 4.8858405168535284e-05, "loss": 1.0603, "step": 36600 }, { "epoch": 0.03, "grad_norm": 42.5, "learning_rate": 4.885390609534436e-05, "loss": 1.1187, "step": 36700 }, { "epoch": 0.03, "grad_norm": 368.0, "learning_rate": 4.8849407022153435e-05, "loss": 1.0626, "step": 36800 }, { "epoch": 0.03, "grad_norm": 30.25, "learning_rate": 4.884490794896252e-05, "loss": 1.1262, "step": 36900 }, { "epoch": 0.03, "grad_norm": 62.0, "learning_rate": 4.884040887577159e-05, "loss": 1.1231, "step": 37000 }, { "epoch": 0.03, "grad_norm": 52.25, "learning_rate": 4.883590980258067e-05, "loss": 1.0812, "step": 37100 }, { "epoch": 0.03, "grad_norm": 114.0, "learning_rate": 4.883141072938975e-05, "loss": 0.9969, "step": 37200 }, { "epoch": 0.03, "grad_norm": 45.0, "learning_rate": 4.8826911656198825e-05, "loss": 1.1014, "step": 37300 }, { "epoch": 0.03, "grad_norm": 102.0, "learning_rate": 4.882241258300791e-05, "loss": 1.1065, "step": 37400 }, { "epoch": 0.03, "grad_norm": 20.375, "learning_rate": 4.881791350981698e-05, "loss": 1.0295, "step": 37500 }, { "epoch": 0.03, "grad_norm": 0.24609375, "learning_rate": 4.881341443662606e-05, "loss": 1.0879, "step": 37600 }, { "epoch": 0.03, "grad_norm": 23.0, "learning_rate": 4.880891536343514e-05, "loss": 1.2481, "step": 37700 }, { "epoch": 0.03, "grad_norm": 127.0, "learning_rate": 4.8804416290244215e-05, "loss": 1.1123, "step": 37800 }, { "epoch": 0.03, "grad_norm": 46.25, "learning_rate": 4.8799917217053284e-05, "loss": 1.1555, "step": 37900 }, { "epoch": 0.03, "grad_norm": 234.0, "learning_rate": 4.8795418143862366e-05, "loss": 1.0754, "step": 38000 }, { "epoch": 0.03, "grad_norm": 31.875, "learning_rate": 4.879091907067144e-05, "loss": 1.029, "step": 38100 }, { "epoch": 0.03, "grad_norm": 2.34375, "learning_rate": 4.878641999748052e-05, "loss": 1.1966, "step": 38200 }, { "epoch": 0.03, "grad_norm": 96.0, "learning_rate": 4.87819209242896e-05, "loss": 1.0488, "step": 38300 }, { "epoch": 0.03, "grad_norm": 13.8125, "learning_rate": 4.8777421851098674e-05, "loss": 1.075, "step": 38400 }, { "epoch": 0.03, "grad_norm": 35.75, "learning_rate": 4.8772922777907756e-05, "loss": 1.0888, "step": 38500 }, { "epoch": 0.03, "grad_norm": 14.3125, "learning_rate": 4.876842370471683e-05, "loss": 1.0776, "step": 38600 }, { "epoch": 0.03, "grad_norm": 29.0, "learning_rate": 4.876392463152591e-05, "loss": 1.1729, "step": 38700 }, { "epoch": 0.03, "grad_norm": 48.25, "learning_rate": 4.875942555833499e-05, "loss": 1.0547, "step": 38800 }, { "epoch": 0.03, "grad_norm": 24.125, "learning_rate": 4.8754926485144064e-05, "loss": 1.033, "step": 38900 }, { "epoch": 0.03, "grad_norm": 22.625, "learning_rate": 4.875042741195314e-05, "loss": 1.0052, "step": 39000 }, { "epoch": 0.03, "grad_norm": 11.5625, "learning_rate": 4.874592833876222e-05, "loss": 1.0835, "step": 39100 }, { "epoch": 0.03, "grad_norm": 38.0, "learning_rate": 4.874142926557129e-05, "loss": 0.9219, "step": 39200 }, { "epoch": 0.04, "grad_norm": 121.5, "learning_rate": 4.873693019238037e-05, "loss": 1.0016, "step": 39300 }, { "epoch": 0.04, "grad_norm": 49.25, "learning_rate": 4.873243111918945e-05, "loss": 0.9643, "step": 39400 }, { "epoch": 0.04, "grad_norm": 40.0, "learning_rate": 4.872793204599852e-05, "loss": 1.1834, "step": 39500 }, { "epoch": 0.04, "grad_norm": 23.125, "learning_rate": 4.8723432972807605e-05, "loss": 1.1207, "step": 39600 }, { "epoch": 0.04, "grad_norm": 0.01385498046875, "learning_rate": 4.871893389961668e-05, "loss": 1.1989, "step": 39700 }, { "epoch": 0.04, "grad_norm": 101.0, "learning_rate": 4.8714434826425756e-05, "loss": 1.0312, "step": 39800 }, { "epoch": 0.04, "grad_norm": 25.625, "learning_rate": 4.870993575323484e-05, "loss": 1.0033, "step": 39900 }, { "epoch": 0.04, "grad_norm": 19.375, "learning_rate": 4.870543668004391e-05, "loss": 1.1085, "step": 40000 }, { "epoch": 0.04, "grad_norm": 71.0, "learning_rate": 4.8700937606852995e-05, "loss": 1.0862, "step": 40100 }, { "epoch": 0.04, "grad_norm": 0.00616455078125, "learning_rate": 4.869643853366207e-05, "loss": 1.0734, "step": 40200 }, { "epoch": 0.04, "grad_norm": 162.0, "learning_rate": 4.8691939460471146e-05, "loss": 1.1716, "step": 40300 }, { "epoch": 0.04, "grad_norm": 8.6875, "learning_rate": 4.868744038728022e-05, "loss": 1.0932, "step": 40400 }, { "epoch": 0.04, "grad_norm": 13.0, "learning_rate": 4.86829413140893e-05, "loss": 1.2164, "step": 40500 }, { "epoch": 0.04, "grad_norm": 46.5, "learning_rate": 4.867844224089837e-05, "loss": 1.1104, "step": 40600 }, { "epoch": 0.04, "grad_norm": 262.0, "learning_rate": 4.8673943167707454e-05, "loss": 1.0848, "step": 40700 }, { "epoch": 0.04, "grad_norm": 42.75, "learning_rate": 4.866944409451653e-05, "loss": 1.0706, "step": 40800 }, { "epoch": 0.04, "grad_norm": 78.5, "learning_rate": 4.866494502132561e-05, "loss": 1.2088, "step": 40900 }, { "epoch": 0.04, "grad_norm": 49.0, "learning_rate": 4.866044594813469e-05, "loss": 1.0286, "step": 41000 }, { "epoch": 0.04, "grad_norm": 77.0, "learning_rate": 4.865594687494376e-05, "loss": 1.0746, "step": 41100 }, { "epoch": 0.04, "grad_norm": 89.0, "learning_rate": 4.8651447801752844e-05, "loss": 1.0762, "step": 41200 }, { "epoch": 0.04, "grad_norm": 81.5, "learning_rate": 4.864694872856192e-05, "loss": 1.0354, "step": 41300 }, { "epoch": 0.04, "grad_norm": 29.0, "learning_rate": 4.8642449655370995e-05, "loss": 1.1843, "step": 41400 }, { "epoch": 0.04, "grad_norm": 33.0, "learning_rate": 4.863795058218008e-05, "loss": 1.1052, "step": 41500 }, { "epoch": 0.04, "grad_norm": 19.625, "learning_rate": 4.863345150898915e-05, "loss": 1.0887, "step": 41600 }, { "epoch": 0.04, "grad_norm": 98.5, "learning_rate": 4.862895243579823e-05, "loss": 1.1573, "step": 41700 }, { "epoch": 0.04, "grad_norm": 47.75, "learning_rate": 4.86244533626073e-05, "loss": 1.2246, "step": 41800 }, { "epoch": 0.04, "grad_norm": 132.0, "learning_rate": 4.861995428941638e-05, "loss": 1.1189, "step": 41900 }, { "epoch": 0.04, "grad_norm": 28.375, "learning_rate": 4.861545521622546e-05, "loss": 1.0678, "step": 42000 }, { "epoch": 0.04, "grad_norm": 7.1875, "learning_rate": 4.8610956143034536e-05, "loss": 1.1277, "step": 42100 }, { "epoch": 0.04, "grad_norm": 107.0, "learning_rate": 4.860645706984361e-05, "loss": 0.9764, "step": 42200 }, { "epoch": 0.04, "grad_norm": 7.5, "learning_rate": 4.8601957996652693e-05, "loss": 1.0772, "step": 42300 }, { "epoch": 0.04, "grad_norm": 61.0, "learning_rate": 4.859745892346177e-05, "loss": 0.8331, "step": 42400 }, { "epoch": 0.04, "grad_norm": 43.5, "learning_rate": 4.8592959850270844e-05, "loss": 1.1747, "step": 42500 }, { "epoch": 0.04, "grad_norm": 0.0189208984375, "learning_rate": 4.8588460777079926e-05, "loss": 1.2166, "step": 42600 }, { "epoch": 0.04, "grad_norm": 44.75, "learning_rate": 4.8583961703889e-05, "loss": 1.0518, "step": 42700 }, { "epoch": 0.04, "grad_norm": 147.0, "learning_rate": 4.8579462630698084e-05, "loss": 1.1158, "step": 42800 }, { "epoch": 0.04, "grad_norm": 99.0, "learning_rate": 4.857496355750716e-05, "loss": 0.9201, "step": 42900 }, { "epoch": 0.04, "grad_norm": 17.0, "learning_rate": 4.857046448431623e-05, "loss": 1.1372, "step": 43000 }, { "epoch": 0.04, "grad_norm": 69.0, "learning_rate": 4.856596541112531e-05, "loss": 0.9705, "step": 43100 }, { "epoch": 0.04, "grad_norm": 25.75, "learning_rate": 4.8561466337934385e-05, "loss": 0.9867, "step": 43200 }, { "epoch": 0.04, "grad_norm": 226.0, "learning_rate": 4.855696726474346e-05, "loss": 1.119, "step": 43300 }, { "epoch": 0.04, "grad_norm": 36.5, "learning_rate": 4.855246819155254e-05, "loss": 1.1015, "step": 43400 }, { "epoch": 0.04, "grad_norm": 32.75, "learning_rate": 4.854796911836162e-05, "loss": 1.1207, "step": 43500 }, { "epoch": 0.04, "grad_norm": 77.0, "learning_rate": 4.85434700451707e-05, "loss": 1.1423, "step": 43600 }, { "epoch": 0.04, "grad_norm": 0.59765625, "learning_rate": 4.8538970971979775e-05, "loss": 1.1564, "step": 43700 }, { "epoch": 0.04, "grad_norm": 48.0, "learning_rate": 4.853447189878885e-05, "loss": 1.237, "step": 43800 }, { "epoch": 0.04, "grad_norm": 28.125, "learning_rate": 4.852997282559793e-05, "loss": 1.1437, "step": 43900 }, { "epoch": 0.04, "grad_norm": 20.5, "learning_rate": 4.852547375240701e-05, "loss": 0.9318, "step": 44000 }, { "epoch": 0.04, "grad_norm": 1408.0, "learning_rate": 4.8520974679216083e-05, "loss": 0.9653, "step": 44100 }, { "epoch": 0.04, "grad_norm": 24.125, "learning_rate": 4.8516475606025166e-05, "loss": 0.9739, "step": 44200 }, { "epoch": 0.04, "grad_norm": 0.035888671875, "learning_rate": 4.8511976532834234e-05, "loss": 1.0071, "step": 44300 }, { "epoch": 0.04, "grad_norm": 33.5, "learning_rate": 4.8507477459643316e-05, "loss": 1.103, "step": 44400 }, { "epoch": 0.04, "grad_norm": 20.375, "learning_rate": 4.850297838645239e-05, "loss": 1.0733, "step": 44500 }, { "epoch": 0.04, "grad_norm": 23.0, "learning_rate": 4.849847931326147e-05, "loss": 1.1211, "step": 44600 }, { "epoch": 0.04, "grad_norm": 15.8125, "learning_rate": 4.849398024007055e-05, "loss": 0.9089, "step": 44700 }, { "epoch": 0.04, "grad_norm": 38.0, "learning_rate": 4.8489481166879624e-05, "loss": 1.069, "step": 44800 }, { "epoch": 0.04, "grad_norm": 0.474609375, "learning_rate": 4.84849820936887e-05, "loss": 1.0493, "step": 44900 }, { "epoch": 0.04, "grad_norm": 31.125, "learning_rate": 4.848048302049778e-05, "loss": 0.9652, "step": 45000 }, { "epoch": 0.04, "grad_norm": 41.25, "learning_rate": 4.847598394730686e-05, "loss": 1.2541, "step": 45100 }, { "epoch": 0.04, "grad_norm": 7.21875, "learning_rate": 4.847148487411593e-05, "loss": 1.0102, "step": 45200 }, { "epoch": 0.04, "grad_norm": 52.5, "learning_rate": 4.8466985800925015e-05, "loss": 1.0949, "step": 45300 }, { "epoch": 0.04, "grad_norm": 16.75, "learning_rate": 4.846248672773409e-05, "loss": 1.0092, "step": 45400 }, { "epoch": 0.04, "grad_norm": 36.5, "learning_rate": 4.845798765454317e-05, "loss": 1.0239, "step": 45500 }, { "epoch": 0.04, "grad_norm": 23.375, "learning_rate": 4.845348858135224e-05, "loss": 0.9919, "step": 45600 }, { "epoch": 0.04, "grad_norm": 48.0, "learning_rate": 4.8448989508161316e-05, "loss": 1.0217, "step": 45700 }, { "epoch": 0.04, "grad_norm": 35.5, "learning_rate": 4.84444904349704e-05, "loss": 1.038, "step": 45800 }, { "epoch": 0.04, "grad_norm": 7.34375, "learning_rate": 4.8439991361779473e-05, "loss": 1.0612, "step": 45900 }, { "epoch": 0.04, "grad_norm": 12.8125, "learning_rate": 4.843549228858855e-05, "loss": 1.1405, "step": 46000 }, { "epoch": 0.04, "grad_norm": 16.125, "learning_rate": 4.843099321539763e-05, "loss": 1.0643, "step": 46100 }, { "epoch": 0.04, "grad_norm": 22.125, "learning_rate": 4.8426494142206706e-05, "loss": 1.0632, "step": 46200 }, { "epoch": 0.04, "grad_norm": 19.875, "learning_rate": 4.842199506901579e-05, "loss": 1.2547, "step": 46300 }, { "epoch": 0.04, "grad_norm": 143.0, "learning_rate": 4.8417495995824864e-05, "loss": 1.0667, "step": 46400 }, { "epoch": 0.04, "grad_norm": 26.75, "learning_rate": 4.841299692263394e-05, "loss": 1.0599, "step": 46500 }, { "epoch": 0.04, "grad_norm": 12.0, "learning_rate": 4.840849784944302e-05, "loss": 1.0818, "step": 46600 }, { "epoch": 0.04, "grad_norm": 852.0, "learning_rate": 4.8403998776252097e-05, "loss": 1.1998, "step": 46700 }, { "epoch": 0.04, "grad_norm": 48.5, "learning_rate": 4.839949970306117e-05, "loss": 1.1431, "step": 46800 }, { "epoch": 0.04, "grad_norm": 33.0, "learning_rate": 4.839500062987025e-05, "loss": 1.1179, "step": 46900 }, { "epoch": 0.04, "grad_norm": 20.625, "learning_rate": 4.839050155667932e-05, "loss": 0.9942, "step": 47000 }, { "epoch": 0.04, "grad_norm": 51.5, "learning_rate": 4.8386002483488405e-05, "loss": 1.0463, "step": 47100 }, { "epoch": 0.04, "grad_norm": 0.045654296875, "learning_rate": 4.838150341029748e-05, "loss": 1.1198, "step": 47200 }, { "epoch": 0.04, "grad_norm": 50.5, "learning_rate": 4.8377004337106555e-05, "loss": 1.0793, "step": 47300 }, { "epoch": 0.04, "grad_norm": 25.125, "learning_rate": 4.837250526391564e-05, "loss": 0.9773, "step": 47400 }, { "epoch": 0.04, "grad_norm": 72.0, "learning_rate": 4.836800619072471e-05, "loss": 1.1212, "step": 47500 }, { "epoch": 0.04, "grad_norm": 119.0, "learning_rate": 4.836350711753379e-05, "loss": 1.077, "step": 47600 }, { "epoch": 0.04, "grad_norm": 0.0037384033203125, "learning_rate": 4.835900804434287e-05, "loss": 0.9038, "step": 47700 }, { "epoch": 0.04, "grad_norm": 154.0, "learning_rate": 4.8354508971151946e-05, "loss": 1.0858, "step": 47800 }, { "epoch": 0.04, "grad_norm": 1.8359375, "learning_rate": 4.835000989796102e-05, "loss": 0.9363, "step": 47900 }, { "epoch": 0.04, "grad_norm": 46.25, "learning_rate": 4.83455108247701e-05, "loss": 1.1046, "step": 48000 }, { "epoch": 0.04, "grad_norm": 28.125, "learning_rate": 4.834101175157918e-05, "loss": 1.0324, "step": 48100 }, { "epoch": 0.04, "grad_norm": 29.5, "learning_rate": 4.8336512678388254e-05, "loss": 1.139, "step": 48200 }, { "epoch": 0.04, "grad_norm": 81.5, "learning_rate": 4.833201360519733e-05, "loss": 1.0533, "step": 48300 }, { "epoch": 0.04, "grad_norm": 124.5, "learning_rate": 4.8327514532006404e-05, "loss": 0.9259, "step": 48400 }, { "epoch": 0.04, "grad_norm": 97.0, "learning_rate": 4.8323015458815487e-05, "loss": 0.9167, "step": 48500 }, { "epoch": 0.04, "grad_norm": 19.625, "learning_rate": 4.831851638562456e-05, "loss": 1.1529, "step": 48600 }, { "epoch": 0.04, "grad_norm": 72.0, "learning_rate": 4.831401731243364e-05, "loss": 1.1205, "step": 48700 }, { "epoch": 0.04, "grad_norm": 0.003814697265625, "learning_rate": 4.830951823924272e-05, "loss": 1.0103, "step": 48800 }, { "epoch": 0.04, "grad_norm": 1.15625, "learning_rate": 4.8305019166051795e-05, "loss": 1.1474, "step": 48900 }, { "epoch": 0.04, "grad_norm": 21.875, "learning_rate": 4.830052009286088e-05, "loss": 1.0274, "step": 49000 }, { "epoch": 0.04, "grad_norm": 28.125, "learning_rate": 4.829602101966995e-05, "loss": 1.0594, "step": 49100 }, { "epoch": 0.04, "grad_norm": 4.84375, "learning_rate": 4.829152194647903e-05, "loss": 1.076, "step": 49200 }, { "epoch": 0.04, "grad_norm": 20.625, "learning_rate": 4.828702287328811e-05, "loss": 0.9598, "step": 49300 }, { "epoch": 0.04, "grad_norm": 6.96875, "learning_rate": 4.8282523800097185e-05, "loss": 1.1013, "step": 49400 }, { "epoch": 0.04, "grad_norm": 15.625, "learning_rate": 4.827802472690626e-05, "loss": 1.0096, "step": 49500 }, { "epoch": 0.04, "grad_norm": 45.25, "learning_rate": 4.8273525653715336e-05, "loss": 1.1038, "step": 49600 }, { "epoch": 0.04, "grad_norm": 0.8203125, "learning_rate": 4.826902658052441e-05, "loss": 1.1955, "step": 49700 }, { "epoch": 0.04, "grad_norm": 29.75, "learning_rate": 4.826452750733349e-05, "loss": 0.9651, "step": 49800 }, { "epoch": 0.04, "grad_norm": 16.0, "learning_rate": 4.826002843414257e-05, "loss": 1.0677, "step": 49900 }, { "epoch": 0.04, "grad_norm": 0.0140380859375, "learning_rate": 4.8255529360951644e-05, "loss": 0.9213, "step": 50000 }, { "epoch": 0.04, "grad_norm": 27.75, "learning_rate": 4.8251030287760726e-05, "loss": 1.1253, "step": 50100 }, { "epoch": 0.04, "grad_norm": 1048.0, "learning_rate": 4.82465312145698e-05, "loss": 1.0198, "step": 50200 }, { "epoch": 0.04, "grad_norm": 80.5, "learning_rate": 4.8242032141378877e-05, "loss": 1.0778, "step": 50300 }, { "epoch": 0.04, "grad_norm": 120.0, "learning_rate": 4.823753306818796e-05, "loss": 1.0936, "step": 50400 }, { "epoch": 0.04, "grad_norm": 26.0, "learning_rate": 4.8233033994997034e-05, "loss": 1.145, "step": 50500 }, { "epoch": 0.05, "grad_norm": 82.0, "learning_rate": 4.822853492180611e-05, "loss": 1.0195, "step": 50600 }, { "epoch": 0.05, "grad_norm": 0.07568359375, "learning_rate": 4.822403584861519e-05, "loss": 1.1293, "step": 50700 }, { "epoch": 0.05, "grad_norm": 592.0, "learning_rate": 4.821953677542426e-05, "loss": 1.0629, "step": 50800 }, { "epoch": 0.05, "grad_norm": 0.06787109375, "learning_rate": 4.821503770223334e-05, "loss": 1.0955, "step": 50900 }, { "epoch": 0.05, "grad_norm": 31.875, "learning_rate": 4.821053862904242e-05, "loss": 1.0129, "step": 51000 }, { "epoch": 0.05, "grad_norm": 53.25, "learning_rate": 4.820603955585149e-05, "loss": 1.0709, "step": 51100 }, { "epoch": 0.05, "grad_norm": 11.9375, "learning_rate": 4.8201540482660575e-05, "loss": 1.0282, "step": 51200 }, { "epoch": 0.05, "grad_norm": 44.5, "learning_rate": 4.819704140946965e-05, "loss": 1.1032, "step": 51300 }, { "epoch": 0.05, "grad_norm": 0.004547119140625, "learning_rate": 4.8192542336278726e-05, "loss": 1.1192, "step": 51400 }, { "epoch": 0.05, "grad_norm": 16.5, "learning_rate": 4.818804326308781e-05, "loss": 0.9293, "step": 51500 }, { "epoch": 0.05, "grad_norm": 34.0, "learning_rate": 4.818354418989688e-05, "loss": 0.9689, "step": 51600 }, { "epoch": 0.05, "grad_norm": 280.0, "learning_rate": 4.8179045116705965e-05, "loss": 1.0233, "step": 51700 }, { "epoch": 0.05, "grad_norm": 72.0, "learning_rate": 4.817454604351504e-05, "loss": 1.0125, "step": 51800 }, { "epoch": 0.05, "grad_norm": 33.5, "learning_rate": 4.8170046970324116e-05, "loss": 1.0061, "step": 51900 }, { "epoch": 0.05, "grad_norm": 39.75, "learning_rate": 4.81655478971332e-05, "loss": 1.0524, "step": 52000 }, { "epoch": 0.05, "grad_norm": 28.0, "learning_rate": 4.8161048823942266e-05, "loss": 1.1626, "step": 52100 }, { "epoch": 0.05, "grad_norm": 47.75, "learning_rate": 4.815654975075135e-05, "loss": 1.0667, "step": 52200 }, { "epoch": 0.05, "grad_norm": 135.0, "learning_rate": 4.8152050677560424e-05, "loss": 1.0158, "step": 52300 }, { "epoch": 0.05, "grad_norm": 41.25, "learning_rate": 4.81475516043695e-05, "loss": 1.0858, "step": 52400 }, { "epoch": 0.05, "grad_norm": 524.0, "learning_rate": 4.814305253117858e-05, "loss": 1.0831, "step": 52500 }, { "epoch": 0.05, "grad_norm": 0.2470703125, "learning_rate": 4.813855345798766e-05, "loss": 1.0296, "step": 52600 }, { "epoch": 0.05, "grad_norm": 17.875, "learning_rate": 4.813405438479673e-05, "loss": 1.0289, "step": 52700 }, { "epoch": 0.05, "grad_norm": 24.875, "learning_rate": 4.8129555311605814e-05, "loss": 1.115, "step": 52800 }, { "epoch": 0.05, "grad_norm": 203.0, "learning_rate": 4.812505623841489e-05, "loss": 0.9661, "step": 52900 }, { "epoch": 0.05, "grad_norm": 5.40625, "learning_rate": 4.8120557165223965e-05, "loss": 0.9604, "step": 53000 }, { "epoch": 0.05, "grad_norm": 19.25, "learning_rate": 4.811605809203305e-05, "loss": 1.0973, "step": 53100 }, { "epoch": 0.05, "grad_norm": 12.0625, "learning_rate": 4.811155901884212e-05, "loss": 1.1134, "step": 53200 }, { "epoch": 0.05, "grad_norm": 21.875, "learning_rate": 4.81070599456512e-05, "loss": 0.9593, "step": 53300 }, { "epoch": 0.05, "grad_norm": 1616.0, "learning_rate": 4.810256087246027e-05, "loss": 0.925, "step": 53400 }, { "epoch": 0.05, "grad_norm": 122.5, "learning_rate": 4.809806179926935e-05, "loss": 1.021, "step": 53500 }, { "epoch": 0.05, "grad_norm": 0.061767578125, "learning_rate": 4.809356272607843e-05, "loss": 1.1105, "step": 53600 }, { "epoch": 0.05, "grad_norm": 788.0, "learning_rate": 4.8089063652887506e-05, "loss": 1.2342, "step": 53700 }, { "epoch": 0.05, "grad_norm": 25.375, "learning_rate": 4.808456457969658e-05, "loss": 1.0668, "step": 53800 }, { "epoch": 0.05, "grad_norm": 113.0, "learning_rate": 4.808006550650566e-05, "loss": 1.129, "step": 53900 }, { "epoch": 0.05, "grad_norm": 31.5, "learning_rate": 4.807556643331474e-05, "loss": 1.1183, "step": 54000 }, { "epoch": 0.05, "grad_norm": 79.5, "learning_rate": 4.8071067360123814e-05, "loss": 1.0775, "step": 54100 }, { "epoch": 0.05, "grad_norm": 296.0, "learning_rate": 4.8066568286932896e-05, "loss": 1.1357, "step": 54200 }, { "epoch": 0.05, "grad_norm": 34.0, "learning_rate": 4.806206921374197e-05, "loss": 1.1471, "step": 54300 }, { "epoch": 0.05, "grad_norm": 13.9375, "learning_rate": 4.8057570140551054e-05, "loss": 1.0694, "step": 54400 }, { "epoch": 0.05, "grad_norm": 0.0159912109375, "learning_rate": 4.805307106736013e-05, "loss": 1.1395, "step": 54500 }, { "epoch": 0.05, "grad_norm": 37.75, "learning_rate": 4.8048571994169204e-05, "loss": 1.1149, "step": 54600 }, { "epoch": 0.05, "grad_norm": 16.75, "learning_rate": 4.804407292097828e-05, "loss": 1.2619, "step": 54700 }, { "epoch": 0.05, "grad_norm": 30.625, "learning_rate": 4.8039573847787355e-05, "loss": 1.0838, "step": 54800 }, { "epoch": 0.05, "grad_norm": 51.25, "learning_rate": 4.803507477459643e-05, "loss": 1.0213, "step": 54900 }, { "epoch": 0.05, "grad_norm": 161.0, "learning_rate": 4.803057570140551e-05, "loss": 0.9585, "step": 55000 }, { "epoch": 0.05, "grad_norm": 103.0, "learning_rate": 4.802607662821459e-05, "loss": 1.0219, "step": 55100 }, { "epoch": 0.05, "grad_norm": 41.0, "learning_rate": 4.802157755502367e-05, "loss": 1.0754, "step": 55200 }, { "epoch": 0.05, "grad_norm": 0.267578125, "learning_rate": 4.8017078481832745e-05, "loss": 1.058, "step": 55300 }, { "epoch": 0.05, "grad_norm": 23.875, "learning_rate": 4.801257940864182e-05, "loss": 1.0358, "step": 55400 }, { "epoch": 0.05, "grad_norm": 38.5, "learning_rate": 4.80080803354509e-05, "loss": 1.255, "step": 55500 }, { "epoch": 0.05, "grad_norm": 89.5, "learning_rate": 4.800358126225998e-05, "loss": 1.1077, "step": 55600 }, { "epoch": 0.05, "grad_norm": 22.625, "learning_rate": 4.799908218906905e-05, "loss": 1.19, "step": 55700 }, { "epoch": 0.05, "grad_norm": 141.0, "learning_rate": 4.7994583115878135e-05, "loss": 1.046, "step": 55800 }, { "epoch": 0.05, "grad_norm": 18.375, "learning_rate": 4.799008404268721e-05, "loss": 1.0007, "step": 55900 }, { "epoch": 0.05, "grad_norm": 89.0, "learning_rate": 4.7985584969496286e-05, "loss": 0.957, "step": 56000 }, { "epoch": 0.05, "grad_norm": 94.0, "learning_rate": 4.798108589630536e-05, "loss": 1.0887, "step": 56100 }, { "epoch": 0.05, "grad_norm": 25.625, "learning_rate": 4.797658682311444e-05, "loss": 1.1572, "step": 56200 }, { "epoch": 0.05, "grad_norm": 52.25, "learning_rate": 4.797208774992352e-05, "loss": 1.096, "step": 56300 }, { "epoch": 0.05, "grad_norm": 17.5, "learning_rate": 4.7967588676732594e-05, "loss": 1.1291, "step": 56400 }, { "epoch": 0.05, "grad_norm": 46.5, "learning_rate": 4.796308960354167e-05, "loss": 1.0168, "step": 56500 }, { "epoch": 0.05, "grad_norm": 18.625, "learning_rate": 4.795859053035075e-05, "loss": 0.9168, "step": 56600 }, { "epoch": 0.05, "grad_norm": 0.01611328125, "learning_rate": 4.795409145715983e-05, "loss": 0.9272, "step": 56700 }, { "epoch": 0.05, "grad_norm": 60.5, "learning_rate": 4.79495923839689e-05, "loss": 1.0723, "step": 56800 }, { "epoch": 0.05, "grad_norm": 54.5, "learning_rate": 4.7945093310777984e-05, "loss": 1.1016, "step": 56900 }, { "epoch": 0.05, "grad_norm": 1.359375, "learning_rate": 4.794059423758706e-05, "loss": 1.0385, "step": 57000 }, { "epoch": 0.05, "grad_norm": 16.625, "learning_rate": 4.793609516439614e-05, "loss": 1.0596, "step": 57100 }, { "epoch": 0.05, "grad_norm": 36.0, "learning_rate": 4.793159609120522e-05, "loss": 1.0204, "step": 57200 }, { "epoch": 0.05, "grad_norm": 26.25, "learning_rate": 4.7927097018014286e-05, "loss": 1.0025, "step": 57300 }, { "epoch": 0.05, "grad_norm": 27.0, "learning_rate": 4.792259794482337e-05, "loss": 1.0586, "step": 57400 }, { "epoch": 0.05, "grad_norm": 19.125, "learning_rate": 4.791809887163244e-05, "loss": 1.0268, "step": 57500 }, { "epoch": 0.05, "grad_norm": 39.25, "learning_rate": 4.791359979844152e-05, "loss": 1.1759, "step": 57600 }, { "epoch": 0.05, "grad_norm": 3.125, "learning_rate": 4.79091007252506e-05, "loss": 1.0709, "step": 57700 }, { "epoch": 0.05, "grad_norm": 105.0, "learning_rate": 4.7904601652059676e-05, "loss": 1.0584, "step": 57800 }, { "epoch": 0.05, "grad_norm": 80.0, "learning_rate": 4.790010257886876e-05, "loss": 1.0015, "step": 57900 }, { "epoch": 0.05, "grad_norm": 31.5, "learning_rate": 4.7895603505677834e-05, "loss": 1.0309, "step": 58000 }, { "epoch": 0.05, "grad_norm": 28.625, "learning_rate": 4.789110443248691e-05, "loss": 1.0922, "step": 58100 }, { "epoch": 0.05, "grad_norm": 22.75, "learning_rate": 4.788660535929599e-05, "loss": 1.1791, "step": 58200 }, { "epoch": 0.05, "grad_norm": 3120.0, "learning_rate": 4.7882106286105066e-05, "loss": 1.2718, "step": 58300 }, { "epoch": 0.05, "grad_norm": 38.75, "learning_rate": 4.787760721291414e-05, "loss": 0.9101, "step": 58400 }, { "epoch": 0.05, "grad_norm": 18.75, "learning_rate": 4.7873108139723224e-05, "loss": 0.9628, "step": 58500 }, { "epoch": 0.05, "grad_norm": 26.875, "learning_rate": 4.786860906653229e-05, "loss": 1.0926, "step": 58600 }, { "epoch": 0.05, "grad_norm": 29.0, "learning_rate": 4.7864109993341374e-05, "loss": 1.0893, "step": 58700 }, { "epoch": 0.05, "grad_norm": 32.0, "learning_rate": 4.785961092015045e-05, "loss": 1.0435, "step": 58800 }, { "epoch": 0.05, "grad_norm": 17.625, "learning_rate": 4.7855111846959525e-05, "loss": 1.1382, "step": 58900 }, { "epoch": 0.05, "grad_norm": 165.0, "learning_rate": 4.785061277376861e-05, "loss": 0.9335, "step": 59000 }, { "epoch": 0.05, "grad_norm": 23.875, "learning_rate": 4.784611370057768e-05, "loss": 0.9897, "step": 59100 }, { "epoch": 0.05, "grad_norm": 99.0, "learning_rate": 4.784161462738676e-05, "loss": 1.077, "step": 59200 }, { "epoch": 0.05, "grad_norm": 0.0322265625, "learning_rate": 4.783711555419584e-05, "loss": 0.927, "step": 59300 }, { "epoch": 0.05, "grad_norm": 134.0, "learning_rate": 4.7832616481004915e-05, "loss": 0.9779, "step": 59400 }, { "epoch": 0.05, "grad_norm": 33.25, "learning_rate": 4.782811740781399e-05, "loss": 1.114, "step": 59500 }, { "epoch": 0.05, "grad_norm": 32.75, "learning_rate": 4.782361833462307e-05, "loss": 1.1215, "step": 59600 }, { "epoch": 0.05, "grad_norm": 32.25, "learning_rate": 4.781911926143215e-05, "loss": 1.0068, "step": 59700 }, { "epoch": 0.05, "grad_norm": 87.0, "learning_rate": 4.781462018824123e-05, "loss": 1.0691, "step": 59800 }, { "epoch": 0.05, "grad_norm": 24.75, "learning_rate": 4.78101211150503e-05, "loss": 0.9855, "step": 59900 }, { "epoch": 0.05, "grad_norm": 0.0074462890625, "learning_rate": 4.7805622041859374e-05, "loss": 1.0395, "step": 60000 }, { "epoch": 0.05, "grad_norm": 0.1435546875, "learning_rate": 4.7801122968668456e-05, "loss": 1.0282, "step": 60100 }, { "epoch": 0.05, "grad_norm": 62.25, "learning_rate": 4.779662389547753e-05, "loss": 1.0955, "step": 60200 }, { "epoch": 0.05, "grad_norm": 95.0, "learning_rate": 4.779212482228661e-05, "loss": 0.9779, "step": 60300 }, { "epoch": 0.05, "grad_norm": 76.5, "learning_rate": 4.778762574909569e-05, "loss": 1.1875, "step": 60400 }, { "epoch": 0.05, "grad_norm": 0.00750732421875, "learning_rate": 4.7783126675904764e-05, "loss": 1.144, "step": 60500 }, { "epoch": 0.05, "grad_norm": 123.0, "learning_rate": 4.7778627602713847e-05, "loss": 1.0817, "step": 60600 }, { "epoch": 0.05, "grad_norm": 31.5, "learning_rate": 4.777412852952292e-05, "loss": 1.0511, "step": 60700 }, { "epoch": 0.05, "grad_norm": 38.5, "learning_rate": 4.7769629456332e-05, "loss": 1.1687, "step": 60800 }, { "epoch": 0.05, "grad_norm": 13.4375, "learning_rate": 4.776513038314108e-05, "loss": 0.9554, "step": 60900 }, { "epoch": 0.05, "grad_norm": 51.0, "learning_rate": 4.7760631309950155e-05, "loss": 1.1859, "step": 61000 }, { "epoch": 0.05, "grad_norm": 22.75, "learning_rate": 4.775613223675923e-05, "loss": 1.1318, "step": 61100 }, { "epoch": 0.05, "grad_norm": 23.0, "learning_rate": 4.7751633163568305e-05, "loss": 0.9677, "step": 61200 }, { "epoch": 0.05, "grad_norm": 4.46875, "learning_rate": 4.774713409037738e-05, "loss": 1.0597, "step": 61300 }, { "epoch": 0.05, "grad_norm": 18.75, "learning_rate": 4.774263501718646e-05, "loss": 1.0387, "step": 61400 }, { "epoch": 0.05, "grad_norm": 13.5, "learning_rate": 4.773813594399554e-05, "loss": 1.1281, "step": 61500 }, { "epoch": 0.05, "grad_norm": 47.75, "learning_rate": 4.7733636870804613e-05, "loss": 1.1926, "step": 61600 }, { "epoch": 0.05, "grad_norm": 32.25, "learning_rate": 4.7729137797613696e-05, "loss": 1.0866, "step": 61700 }, { "epoch": 0.06, "grad_norm": 0.003997802734375, "learning_rate": 4.772463872442277e-05, "loss": 1.0067, "step": 61800 }, { "epoch": 0.06, "grad_norm": 0.00518798828125, "learning_rate": 4.7720139651231846e-05, "loss": 0.9618, "step": 61900 }, { "epoch": 0.06, "grad_norm": 38.25, "learning_rate": 4.771564057804093e-05, "loss": 0.9947, "step": 62000 }, { "epoch": 0.06, "grad_norm": 35.25, "learning_rate": 4.7711141504850004e-05, "loss": 0.9221, "step": 62100 }, { "epoch": 0.06, "grad_norm": 17.375, "learning_rate": 4.770664243165908e-05, "loss": 1.0307, "step": 62200 }, { "epoch": 0.06, "grad_norm": 0.146484375, "learning_rate": 4.770214335846816e-05, "loss": 1.1203, "step": 62300 }, { "epoch": 0.06, "grad_norm": 120.5, "learning_rate": 4.7697644285277237e-05, "loss": 1.1088, "step": 62400 }, { "epoch": 0.06, "grad_norm": 78.0, "learning_rate": 4.769314521208631e-05, "loss": 0.9565, "step": 62500 }, { "epoch": 0.06, "grad_norm": 25.625, "learning_rate": 4.768864613889539e-05, "loss": 1.096, "step": 62600 }, { "epoch": 0.06, "grad_norm": 21.25, "learning_rate": 4.768414706570446e-05, "loss": 1.0787, "step": 62700 }, { "epoch": 0.06, "grad_norm": 15.5, "learning_rate": 4.7679647992513545e-05, "loss": 1.0096, "step": 62800 }, { "epoch": 0.06, "grad_norm": 50.25, "learning_rate": 4.767514891932262e-05, "loss": 1.0513, "step": 62900 }, { "epoch": 0.06, "grad_norm": 0.00445556640625, "learning_rate": 4.7670649846131695e-05, "loss": 0.9897, "step": 63000 }, { "epoch": 0.06, "grad_norm": 400.0, "learning_rate": 4.766615077294078e-05, "loss": 1.044, "step": 63100 }, { "epoch": 0.06, "grad_norm": 89.0, "learning_rate": 4.766165169974985e-05, "loss": 1.0449, "step": 63200 }, { "epoch": 0.06, "grad_norm": 58.0, "learning_rate": 4.7657152626558935e-05, "loss": 1.0454, "step": 63300 }, { "epoch": 0.06, "grad_norm": 0.048828125, "learning_rate": 4.765265355336801e-05, "loss": 0.9011, "step": 63400 }, { "epoch": 0.06, "grad_norm": 38.75, "learning_rate": 4.7648154480177086e-05, "loss": 1.1728, "step": 63500 }, { "epoch": 0.06, "grad_norm": 0.055419921875, "learning_rate": 4.764365540698617e-05, "loss": 1.0142, "step": 63600 }, { "epoch": 0.06, "grad_norm": 102.0, "learning_rate": 4.763915633379524e-05, "loss": 1.0692, "step": 63700 }, { "epoch": 0.06, "grad_norm": 22.625, "learning_rate": 4.763465726060432e-05, "loss": 0.8402, "step": 63800 }, { "epoch": 0.06, "grad_norm": 24.25, "learning_rate": 4.7630158187413394e-05, "loss": 1.135, "step": 63900 }, { "epoch": 0.06, "grad_norm": 48.0, "learning_rate": 4.762565911422247e-05, "loss": 0.9995, "step": 64000 }, { "epoch": 0.06, "grad_norm": 45.5, "learning_rate": 4.762116004103155e-05, "loss": 1.085, "step": 64100 }, { "epoch": 0.06, "grad_norm": 65.5, "learning_rate": 4.7616660967840627e-05, "loss": 1.0364, "step": 64200 }, { "epoch": 0.06, "grad_norm": 24.25, "learning_rate": 4.76121618946497e-05, "loss": 1.1136, "step": 64300 }, { "epoch": 0.06, "grad_norm": 12.5625, "learning_rate": 4.7607662821458784e-05, "loss": 1.0889, "step": 64400 }, { "epoch": 0.06, "grad_norm": 103.0, "learning_rate": 4.760316374826786e-05, "loss": 1.1256, "step": 64500 }, { "epoch": 0.06, "grad_norm": 34.25, "learning_rate": 4.7598664675076935e-05, "loss": 0.9965, "step": 64600 }, { "epoch": 0.06, "grad_norm": 25.5, "learning_rate": 4.759416560188602e-05, "loss": 1.0614, "step": 64700 }, { "epoch": 0.06, "grad_norm": 80.5, "learning_rate": 4.758966652869509e-05, "loss": 1.0382, "step": 64800 }, { "epoch": 0.06, "grad_norm": 9.5625, "learning_rate": 4.758516745550417e-05, "loss": 0.8942, "step": 64900 }, { "epoch": 0.06, "grad_norm": 378.0, "learning_rate": 4.758066838231325e-05, "loss": 1.1136, "step": 65000 }, { "epoch": 0.06, "grad_norm": 62.5, "learning_rate": 4.757616930912232e-05, "loss": 1.1306, "step": 65100 }, { "epoch": 0.06, "grad_norm": 0.212890625, "learning_rate": 4.75716702359314e-05, "loss": 0.9875, "step": 65200 }, { "epoch": 0.06, "grad_norm": 28.5, "learning_rate": 4.7567171162740476e-05, "loss": 1.0619, "step": 65300 }, { "epoch": 0.06, "grad_norm": 47.25, "learning_rate": 4.756267208954955e-05, "loss": 0.9667, "step": 65400 }, { "epoch": 0.06, "grad_norm": 79.5, "learning_rate": 4.755817301635863e-05, "loss": 1.0267, "step": 65500 }, { "epoch": 0.06, "grad_norm": 68.5, "learning_rate": 4.755367394316771e-05, "loss": 1.1107, "step": 65600 }, { "epoch": 0.06, "grad_norm": 35.5, "learning_rate": 4.7549174869976784e-05, "loss": 0.9501, "step": 65700 }, { "epoch": 0.06, "grad_norm": 74.5, "learning_rate": 4.7544675796785866e-05, "loss": 1.0483, "step": 65800 }, { "epoch": 0.06, "grad_norm": 54.5, "learning_rate": 4.754017672359494e-05, "loss": 1.1021, "step": 65900 }, { "epoch": 0.06, "grad_norm": 25.125, "learning_rate": 4.753567765040402e-05, "loss": 1.006, "step": 66000 }, { "epoch": 0.06, "grad_norm": 55.25, "learning_rate": 4.75311785772131e-05, "loss": 1.1606, "step": 66100 }, { "epoch": 0.06, "grad_norm": 112.0, "learning_rate": 4.7526679504022174e-05, "loss": 0.9369, "step": 66200 }, { "epoch": 0.06, "grad_norm": 57.0, "learning_rate": 4.7522180430831256e-05, "loss": 1.0697, "step": 66300 }, { "epoch": 0.06, "grad_norm": 62.0, "learning_rate": 4.7517681357640325e-05, "loss": 1.0148, "step": 66400 }, { "epoch": 0.06, "grad_norm": 51.5, "learning_rate": 4.751318228444941e-05, "loss": 1.0458, "step": 66500 }, { "epoch": 0.06, "grad_norm": 64.0, "learning_rate": 4.750868321125848e-05, "loss": 1.0097, "step": 66600 }, { "epoch": 0.06, "grad_norm": 29.375, "learning_rate": 4.750418413806756e-05, "loss": 1.0261, "step": 66700 }, { "epoch": 0.06, "grad_norm": 19.125, "learning_rate": 4.749968506487664e-05, "loss": 1.1002, "step": 66800 }, { "epoch": 0.06, "grad_norm": 19.625, "learning_rate": 4.7495185991685715e-05, "loss": 1.1513, "step": 66900 }, { "epoch": 0.06, "grad_norm": 28.0, "learning_rate": 4.749068691849479e-05, "loss": 1.0036, "step": 67000 }, { "epoch": 0.06, "grad_norm": 1.5703125, "learning_rate": 4.748618784530387e-05, "loss": 0.9781, "step": 67100 }, { "epoch": 0.06, "grad_norm": 1.0390625, "learning_rate": 4.748168877211295e-05, "loss": 1.1338, "step": 67200 }, { "epoch": 0.06, "grad_norm": 160.0, "learning_rate": 4.747718969892202e-05, "loss": 1.104, "step": 67300 }, { "epoch": 0.06, "grad_norm": 21.5, "learning_rate": 4.7472690625731105e-05, "loss": 1.0585, "step": 67400 }, { "epoch": 0.06, "grad_norm": 22.125, "learning_rate": 4.746819155254018e-05, "loss": 0.9737, "step": 67500 }, { "epoch": 0.06, "grad_norm": 29.5, "learning_rate": 4.7463692479349256e-05, "loss": 1.0911, "step": 67600 }, { "epoch": 0.06, "grad_norm": 48.25, "learning_rate": 4.745919340615833e-05, "loss": 1.0497, "step": 67700 }, { "epoch": 0.06, "grad_norm": 139.0, "learning_rate": 4.7454694332967407e-05, "loss": 1.0999, "step": 67800 }, { "epoch": 0.06, "grad_norm": 106.0, "learning_rate": 4.745019525977649e-05, "loss": 0.952, "step": 67900 }, { "epoch": 0.06, "grad_norm": 19.75, "learning_rate": 4.7445696186585564e-05, "loss": 1.0808, "step": 68000 }, { "epoch": 0.06, "grad_norm": 100.0, "learning_rate": 4.744119711339464e-05, "loss": 1.129, "step": 68100 }, { "epoch": 0.06, "grad_norm": 45.75, "learning_rate": 4.743669804020372e-05, "loss": 1.1297, "step": 68200 }, { "epoch": 0.06, "grad_norm": 52.5, "learning_rate": 4.74321989670128e-05, "loss": 0.9746, "step": 68300 }, { "epoch": 0.06, "grad_norm": 41.0, "learning_rate": 4.742769989382187e-05, "loss": 1.0569, "step": 68400 }, { "epoch": 0.06, "grad_norm": 36.0, "learning_rate": 4.7423200820630954e-05, "loss": 1.0026, "step": 68500 }, { "epoch": 0.06, "grad_norm": 43.0, "learning_rate": 4.741870174744003e-05, "loss": 0.8584, "step": 68600 }, { "epoch": 0.06, "grad_norm": 24.625, "learning_rate": 4.741420267424911e-05, "loss": 1.0649, "step": 68700 }, { "epoch": 0.06, "grad_norm": 65.5, "learning_rate": 4.740970360105819e-05, "loss": 1.1102, "step": 68800 }, { "epoch": 0.06, "grad_norm": 28.875, "learning_rate": 4.740520452786726e-05, "loss": 1.0797, "step": 68900 }, { "epoch": 0.06, "grad_norm": 80.0, "learning_rate": 4.740070545467634e-05, "loss": 1.0115, "step": 69000 }, { "epoch": 0.06, "grad_norm": 13.3125, "learning_rate": 4.739620638148541e-05, "loss": 1.0605, "step": 69100 }, { "epoch": 0.06, "grad_norm": 4.375, "learning_rate": 4.7391707308294495e-05, "loss": 1.1096, "step": 69200 }, { "epoch": 0.06, "grad_norm": 69.5, "learning_rate": 4.738720823510357e-05, "loss": 1.1468, "step": 69300 }, { "epoch": 0.06, "grad_norm": 16.375, "learning_rate": 4.7382709161912646e-05, "loss": 1.1837, "step": 69400 }, { "epoch": 0.06, "grad_norm": 98.0, "learning_rate": 4.737821008872173e-05, "loss": 1.0676, "step": 69500 }, { "epoch": 0.06, "grad_norm": 35.75, "learning_rate": 4.73737110155308e-05, "loss": 1.112, "step": 69600 }, { "epoch": 0.06, "grad_norm": 74.0, "learning_rate": 4.736921194233988e-05, "loss": 0.983, "step": 69700 }, { "epoch": 0.06, "grad_norm": 60.0, "learning_rate": 4.736471286914896e-05, "loss": 1.0985, "step": 69800 }, { "epoch": 0.06, "grad_norm": 66.0, "learning_rate": 4.7360213795958036e-05, "loss": 1.1919, "step": 69900 }, { "epoch": 0.06, "grad_norm": 40.25, "learning_rate": 4.735571472276711e-05, "loss": 1.1042, "step": 70000 }, { "epoch": 0.06, "grad_norm": 46.0, "learning_rate": 4.7351215649576194e-05, "loss": 1.0638, "step": 70100 }, { "epoch": 0.06, "grad_norm": 176.0, "learning_rate": 4.734671657638527e-05, "loss": 0.9685, "step": 70200 }, { "epoch": 0.06, "grad_norm": 57.5, "learning_rate": 4.7342217503194344e-05, "loss": 1.0771, "step": 70300 }, { "epoch": 0.06, "grad_norm": 37.25, "learning_rate": 4.733771843000342e-05, "loss": 1.0881, "step": 70400 }, { "epoch": 0.06, "grad_norm": 41.0, "learning_rate": 4.7333219356812495e-05, "loss": 0.9475, "step": 70500 }, { "epoch": 0.06, "grad_norm": 83.5, "learning_rate": 4.732872028362158e-05, "loss": 1.1723, "step": 70600 }, { "epoch": 0.06, "grad_norm": 64.5, "learning_rate": 4.732422121043065e-05, "loss": 1.072, "step": 70700 }, { "epoch": 0.06, "grad_norm": 72.5, "learning_rate": 4.731972213723973e-05, "loss": 0.9838, "step": 70800 }, { "epoch": 0.06, "grad_norm": 41.5, "learning_rate": 4.731522306404881e-05, "loss": 1.078, "step": 70900 }, { "epoch": 0.06, "grad_norm": 115.5, "learning_rate": 4.7310723990857885e-05, "loss": 1.0592, "step": 71000 }, { "epoch": 0.06, "grad_norm": 0.095703125, "learning_rate": 4.730622491766696e-05, "loss": 0.9734, "step": 71100 }, { "epoch": 0.06, "grad_norm": 19.375, "learning_rate": 4.730172584447604e-05, "loss": 0.9183, "step": 71200 }, { "epoch": 0.06, "grad_norm": 15.75, "learning_rate": 4.729722677128512e-05, "loss": 0.9231, "step": 71300 }, { "epoch": 0.06, "grad_norm": 80.0, "learning_rate": 4.72927276980942e-05, "loss": 1.1361, "step": 71400 }, { "epoch": 0.06, "grad_norm": 90.0, "learning_rate": 4.7288228624903275e-05, "loss": 1.0136, "step": 71500 }, { "epoch": 0.06, "grad_norm": 13.9375, "learning_rate": 4.7283729551712344e-05, "loss": 1.0182, "step": 71600 }, { "epoch": 0.06, "grad_norm": 59.5, "learning_rate": 4.7279230478521426e-05, "loss": 0.9439, "step": 71700 }, { "epoch": 0.06, "grad_norm": 54.5, "learning_rate": 4.72747314053305e-05, "loss": 1.0811, "step": 71800 }, { "epoch": 0.06, "grad_norm": 16.25, "learning_rate": 4.727023233213958e-05, "loss": 1.0841, "step": 71900 }, { "epoch": 0.06, "grad_norm": 15.8125, "learning_rate": 4.726573325894866e-05, "loss": 0.988, "step": 72000 }, { "epoch": 0.06, "grad_norm": 73.5, "learning_rate": 4.7261234185757734e-05, "loss": 1.1638, "step": 72100 }, { "epoch": 0.06, "grad_norm": 69.5, "learning_rate": 4.7256735112566816e-05, "loss": 1.111, "step": 72200 }, { "epoch": 0.06, "grad_norm": 16.125, "learning_rate": 4.725223603937589e-05, "loss": 0.9247, "step": 72300 }, { "epoch": 0.06, "grad_norm": 36.75, "learning_rate": 4.724773696618497e-05, "loss": 1.029, "step": 72400 }, { "epoch": 0.06, "grad_norm": 15.0, "learning_rate": 4.724323789299405e-05, "loss": 1.1028, "step": 72500 }, { "epoch": 0.06, "grad_norm": 12.6875, "learning_rate": 4.7238738819803124e-05, "loss": 1.1013, "step": 72600 }, { "epoch": 0.06, "grad_norm": 42.75, "learning_rate": 4.72342397466122e-05, "loss": 1.2265, "step": 72700 }, { "epoch": 0.06, "grad_norm": 30.625, "learning_rate": 4.722974067342128e-05, "loss": 0.9438, "step": 72800 }, { "epoch": 0.06, "grad_norm": 37.75, "learning_rate": 4.722524160023035e-05, "loss": 0.9759, "step": 72900 }, { "epoch": 0.07, "grad_norm": 0.2578125, "learning_rate": 4.722074252703943e-05, "loss": 0.9565, "step": 73000 }, { "epoch": 0.07, "grad_norm": 0.1943359375, "learning_rate": 4.721624345384851e-05, "loss": 0.9849, "step": 73100 }, { "epoch": 0.07, "grad_norm": 74.0, "learning_rate": 4.721174438065758e-05, "loss": 1.0244, "step": 73200 }, { "epoch": 0.07, "grad_norm": 31.375, "learning_rate": 4.7207245307466665e-05, "loss": 1.1064, "step": 73300 }, { "epoch": 0.07, "grad_norm": 84.0, "learning_rate": 4.720274623427574e-05, "loss": 1.0538, "step": 73400 }, { "epoch": 0.07, "grad_norm": 27.875, "learning_rate": 4.7198247161084816e-05, "loss": 1.057, "step": 73500 }, { "epoch": 0.07, "grad_norm": 185.0, "learning_rate": 4.71937480878939e-05, "loss": 1.0565, "step": 73600 }, { "epoch": 0.07, "grad_norm": 90.5, "learning_rate": 4.7189249014702974e-05, "loss": 1.11, "step": 73700 }, { "epoch": 0.07, "grad_norm": 808.0, "learning_rate": 4.718474994151205e-05, "loss": 1.1171, "step": 73800 }, { "epoch": 0.07, "grad_norm": 191.0, "learning_rate": 4.718025086832113e-05, "loss": 1.0666, "step": 73900 }, { "epoch": 0.07, "grad_norm": 50.25, "learning_rate": 4.7175751795130206e-05, "loss": 1.0441, "step": 74000 }, { "epoch": 0.07, "grad_norm": 44.75, "learning_rate": 4.717125272193929e-05, "loss": 0.9349, "step": 74100 }, { "epoch": 0.07, "grad_norm": 54.5, "learning_rate": 4.716675364874836e-05, "loss": 1.0472, "step": 74200 }, { "epoch": 0.07, "grad_norm": 12.25, "learning_rate": 4.716225457555743e-05, "loss": 0.9954, "step": 74300 }, { "epoch": 0.07, "grad_norm": 18.75, "learning_rate": 4.7157755502366514e-05, "loss": 0.9964, "step": 74400 }, { "epoch": 0.07, "grad_norm": 38.0, "learning_rate": 4.715325642917559e-05, "loss": 1.0264, "step": 74500 }, { "epoch": 0.07, "grad_norm": 19.0, "learning_rate": 4.7148757355984665e-05, "loss": 1.1607, "step": 74600 }, { "epoch": 0.07, "grad_norm": 23.125, "learning_rate": 4.714425828279375e-05, "loss": 1.0605, "step": 74700 }, { "epoch": 0.07, "grad_norm": 78.5, "learning_rate": 4.713975920960282e-05, "loss": 0.9948, "step": 74800 }, { "epoch": 0.07, "grad_norm": 48.75, "learning_rate": 4.7135260136411905e-05, "loss": 0.9176, "step": 74900 }, { "epoch": 0.07, "grad_norm": 45.5, "learning_rate": 4.713076106322098e-05, "loss": 0.9902, "step": 75000 }, { "epoch": 0.07, "grad_norm": 0.00396728515625, "learning_rate": 4.7126261990030055e-05, "loss": 1.1327, "step": 75100 }, { "epoch": 0.07, "grad_norm": 34.75, "learning_rate": 4.712176291683914e-05, "loss": 1.054, "step": 75200 }, { "epoch": 0.07, "grad_norm": 38.5, "learning_rate": 4.711726384364821e-05, "loss": 0.9967, "step": 75300 }, { "epoch": 0.07, "grad_norm": 35.0, "learning_rate": 4.711276477045729e-05, "loss": 1.0065, "step": 75400 }, { "epoch": 0.07, "grad_norm": 15.875, "learning_rate": 4.7108265697266364e-05, "loss": 0.9735, "step": 75500 }, { "epoch": 0.07, "grad_norm": 0.039794921875, "learning_rate": 4.710376662407544e-05, "loss": 0.9869, "step": 75600 }, { "epoch": 0.07, "grad_norm": 75.0, "learning_rate": 4.709926755088452e-05, "loss": 1.1004, "step": 75700 }, { "epoch": 0.07, "grad_norm": 27.75, "learning_rate": 4.7094768477693596e-05, "loss": 1.0081, "step": 75800 }, { "epoch": 0.07, "grad_norm": 144.0, "learning_rate": 4.709026940450267e-05, "loss": 1.1244, "step": 75900 }, { "epoch": 0.07, "grad_norm": 196.0, "learning_rate": 4.7085770331311754e-05, "loss": 1.0563, "step": 76000 }, { "epoch": 0.07, "grad_norm": 180.0, "learning_rate": 4.708127125812083e-05, "loss": 1.0529, "step": 76100 }, { "epoch": 0.07, "grad_norm": 38.0, "learning_rate": 4.7076772184929904e-05, "loss": 0.9047, "step": 76200 }, { "epoch": 0.07, "grad_norm": 82.0, "learning_rate": 4.7072273111738987e-05, "loss": 1.048, "step": 76300 }, { "epoch": 0.07, "grad_norm": 28.5, "learning_rate": 4.706777403854806e-05, "loss": 1.146, "step": 76400 }, { "epoch": 0.07, "grad_norm": 74.5, "learning_rate": 4.706327496535714e-05, "loss": 1.0799, "step": 76500 }, { "epoch": 0.07, "grad_norm": 20.75, "learning_rate": 4.705877589216622e-05, "loss": 1.1352, "step": 76600 }, { "epoch": 0.07, "grad_norm": 46.25, "learning_rate": 4.7054276818975295e-05, "loss": 1.0056, "step": 76700 }, { "epoch": 0.07, "grad_norm": 0.96875, "learning_rate": 4.704977774578437e-05, "loss": 1.1245, "step": 76800 }, { "epoch": 0.07, "grad_norm": 20.25, "learning_rate": 4.7045278672593445e-05, "loss": 1.352, "step": 76900 }, { "epoch": 0.07, "grad_norm": 21.5, "learning_rate": 4.704077959940252e-05, "loss": 1.0695, "step": 77000 }, { "epoch": 0.07, "grad_norm": 25.0, "learning_rate": 4.70362805262116e-05, "loss": 1.0156, "step": 77100 }, { "epoch": 0.07, "grad_norm": 40.25, "learning_rate": 4.703178145302068e-05, "loss": 0.9797, "step": 77200 }, { "epoch": 0.07, "grad_norm": 27.875, "learning_rate": 4.7027282379829754e-05, "loss": 1.0771, "step": 77300 }, { "epoch": 0.07, "grad_norm": 80.5, "learning_rate": 4.7022783306638836e-05, "loss": 0.9401, "step": 77400 }, { "epoch": 0.07, "grad_norm": 51.5, "learning_rate": 4.701828423344791e-05, "loss": 0.9542, "step": 77500 }, { "epoch": 0.07, "grad_norm": 42.25, "learning_rate": 4.701378516025699e-05, "loss": 1.1057, "step": 77600 }, { "epoch": 0.07, "grad_norm": 127.5, "learning_rate": 4.700928608706607e-05, "loss": 1.1478, "step": 77700 }, { "epoch": 0.07, "grad_norm": 16.375, "learning_rate": 4.7004787013875144e-05, "loss": 0.964, "step": 77800 }, { "epoch": 0.07, "grad_norm": 213.0, "learning_rate": 4.7000287940684226e-05, "loss": 0.9888, "step": 77900 }, { "epoch": 0.07, "grad_norm": 41.5, "learning_rate": 4.69957888674933e-05, "loss": 1.0407, "step": 78000 }, { "epoch": 0.07, "grad_norm": 124.0, "learning_rate": 4.6991289794302377e-05, "loss": 0.8326, "step": 78100 }, { "epoch": 0.07, "grad_norm": 31.0, "learning_rate": 4.698679072111145e-05, "loss": 1.1145, "step": 78200 }, { "epoch": 0.07, "grad_norm": 42.5, "learning_rate": 4.698229164792053e-05, "loss": 1.0665, "step": 78300 }, { "epoch": 0.07, "grad_norm": 14.875, "learning_rate": 4.697779257472961e-05, "loss": 1.0049, "step": 78400 }, { "epoch": 0.07, "grad_norm": 14.25, "learning_rate": 4.6973293501538685e-05, "loss": 1.2033, "step": 78500 }, { "epoch": 0.07, "grad_norm": 46.25, "learning_rate": 4.696879442834776e-05, "loss": 1.0287, "step": 78600 }, { "epoch": 0.07, "grad_norm": 16.875, "learning_rate": 4.696429535515684e-05, "loss": 1.0317, "step": 78700 }, { "epoch": 0.07, "grad_norm": 0.162109375, "learning_rate": 4.695979628196592e-05, "loss": 1.1977, "step": 78800 }, { "epoch": 0.07, "grad_norm": 36.0, "learning_rate": 4.695529720877499e-05, "loss": 1.0422, "step": 78900 }, { "epoch": 0.07, "grad_norm": 44.5, "learning_rate": 4.6950798135584075e-05, "loss": 0.9911, "step": 79000 }, { "epoch": 0.07, "grad_norm": 25.0, "learning_rate": 4.694629906239315e-05, "loss": 1.2047, "step": 79100 }, { "epoch": 0.07, "grad_norm": 22.75, "learning_rate": 4.6941799989202226e-05, "loss": 1.0781, "step": 79200 }, { "epoch": 0.07, "grad_norm": 11.875, "learning_rate": 4.693730091601131e-05, "loss": 0.9866, "step": 79300 }, { "epoch": 0.07, "grad_norm": 32.25, "learning_rate": 4.6932801842820376e-05, "loss": 1.089, "step": 79400 }, { "epoch": 0.07, "grad_norm": 29.125, "learning_rate": 4.692830276962946e-05, "loss": 0.9768, "step": 79500 }, { "epoch": 0.07, "grad_norm": 6.4375, "learning_rate": 4.6923803696438534e-05, "loss": 0.9505, "step": 79600 }, { "epoch": 0.07, "grad_norm": 61.0, "learning_rate": 4.691930462324761e-05, "loss": 1.0787, "step": 79700 }, { "epoch": 0.07, "grad_norm": 46.0, "learning_rate": 4.691480555005669e-05, "loss": 1.0963, "step": 79800 }, { "epoch": 0.07, "grad_norm": 32.75, "learning_rate": 4.6910306476865767e-05, "loss": 1.1343, "step": 79900 }, { "epoch": 0.07, "grad_norm": 29.5, "learning_rate": 4.690580740367484e-05, "loss": 1.0554, "step": 80000 }, { "epoch": 0.07, "grad_norm": 87.0, "learning_rate": 4.6901308330483924e-05, "loss": 0.9815, "step": 80100 }, { "epoch": 0.07, "grad_norm": 31.625, "learning_rate": 4.6896809257293e-05, "loss": 1.0009, "step": 80200 }, { "epoch": 0.07, "grad_norm": 87.5, "learning_rate": 4.689231018410208e-05, "loss": 1.0204, "step": 80300 }, { "epoch": 0.07, "grad_norm": 63.75, "learning_rate": 4.688781111091116e-05, "loss": 1.0705, "step": 80400 }, { "epoch": 0.07, "grad_norm": 30.875, "learning_rate": 4.688331203772023e-05, "loss": 1.0482, "step": 80500 }, { "epoch": 0.07, "grad_norm": 19.0, "learning_rate": 4.6878812964529314e-05, "loss": 1.0991, "step": 80600 }, { "epoch": 0.07, "grad_norm": 141.0, "learning_rate": 4.687431389133838e-05, "loss": 1.1017, "step": 80700 }, { "epoch": 0.07, "grad_norm": 240.0, "learning_rate": 4.6869814818147465e-05, "loss": 1.045, "step": 80800 }, { "epoch": 0.07, "grad_norm": 16.875, "learning_rate": 4.686531574495654e-05, "loss": 1.1315, "step": 80900 }, { "epoch": 0.07, "grad_norm": 39.5, "learning_rate": 4.6860816671765616e-05, "loss": 1.0082, "step": 81000 }, { "epoch": 0.07, "grad_norm": 68.5, "learning_rate": 4.68563175985747e-05, "loss": 1.0634, "step": 81100 }, { "epoch": 0.07, "grad_norm": 31.625, "learning_rate": 4.685181852538377e-05, "loss": 1.0875, "step": 81200 }, { "epoch": 0.07, "grad_norm": 77.5, "learning_rate": 4.684731945219285e-05, "loss": 0.9573, "step": 81300 }, { "epoch": 0.07, "grad_norm": 0.1123046875, "learning_rate": 4.684282037900193e-05, "loss": 0.8969, "step": 81400 }, { "epoch": 0.07, "grad_norm": 86.5, "learning_rate": 4.6838321305811006e-05, "loss": 1.0937, "step": 81500 }, { "epoch": 0.07, "grad_norm": 22.625, "learning_rate": 4.683382223262008e-05, "loss": 0.9974, "step": 81600 }, { "epoch": 0.07, "grad_norm": 180.0, "learning_rate": 4.682932315942916e-05, "loss": 0.9794, "step": 81700 }, { "epoch": 0.07, "grad_norm": 64.5, "learning_rate": 4.682482408623824e-05, "loss": 1.0085, "step": 81800 }, { "epoch": 0.07, "grad_norm": 42.75, "learning_rate": 4.6820325013047314e-05, "loss": 1.104, "step": 81900 }, { "epoch": 0.07, "grad_norm": 10.0, "learning_rate": 4.681582593985639e-05, "loss": 0.9597, "step": 82000 }, { "epoch": 0.07, "grad_norm": 8.3125, "learning_rate": 4.6811326866665465e-05, "loss": 1.0694, "step": 82100 }, { "epoch": 0.07, "grad_norm": 19.25, "learning_rate": 4.680682779347455e-05, "loss": 1.1842, "step": 82200 }, { "epoch": 0.07, "grad_norm": 20.5, "learning_rate": 4.680232872028362e-05, "loss": 1.1325, "step": 82300 }, { "epoch": 0.07, "grad_norm": 19.5, "learning_rate": 4.67978296470927e-05, "loss": 1.1123, "step": 82400 }, { "epoch": 0.07, "grad_norm": 11.75, "learning_rate": 4.679333057390178e-05, "loss": 1.1663, "step": 82500 }, { "epoch": 0.07, "grad_norm": 39.25, "learning_rate": 4.6788831500710855e-05, "loss": 1.1589, "step": 82600 }, { "epoch": 0.07, "grad_norm": 79.5, "learning_rate": 4.678433242751993e-05, "loss": 1.0633, "step": 82700 }, { "epoch": 0.07, "grad_norm": 8.5, "learning_rate": 4.677983335432901e-05, "loss": 1.0339, "step": 82800 }, { "epoch": 0.07, "grad_norm": 46.0, "learning_rate": 4.677533428113809e-05, "loss": 1.1827, "step": 82900 }, { "epoch": 0.07, "grad_norm": 124.0, "learning_rate": 4.677083520794717e-05, "loss": 1.0051, "step": 83000 }, { "epoch": 0.07, "grad_norm": 35.75, "learning_rate": 4.6766336134756245e-05, "loss": 1.028, "step": 83100 }, { "epoch": 0.07, "grad_norm": 75.0, "learning_rate": 4.676183706156532e-05, "loss": 0.923, "step": 83200 }, { "epoch": 0.07, "grad_norm": 13.875, "learning_rate": 4.6757337988374396e-05, "loss": 0.9642, "step": 83300 }, { "epoch": 0.07, "grad_norm": 12.875, "learning_rate": 4.675283891518347e-05, "loss": 1.138, "step": 83400 }, { "epoch": 0.07, "grad_norm": 42.5, "learning_rate": 4.674833984199255e-05, "loss": 1.2567, "step": 83500 }, { "epoch": 0.07, "grad_norm": 20.625, "learning_rate": 4.674384076880163e-05, "loss": 1.0483, "step": 83600 }, { "epoch": 0.07, "grad_norm": 86.5, "learning_rate": 4.6739341695610704e-05, "loss": 1.2013, "step": 83700 }, { "epoch": 0.07, "grad_norm": 596.0, "learning_rate": 4.6734842622419786e-05, "loss": 1.0766, "step": 83800 }, { "epoch": 0.07, "grad_norm": 35.75, "learning_rate": 4.673034354922886e-05, "loss": 0.9455, "step": 83900 }, { "epoch": 0.07, "grad_norm": 54.0, "learning_rate": 4.672584447603794e-05, "loss": 1.0611, "step": 84000 }, { "epoch": 0.07, "grad_norm": 157.0, "learning_rate": 4.672134540284702e-05, "loss": 1.1654, "step": 84100 }, { "epoch": 0.08, "grad_norm": 59.25, "learning_rate": 4.6716846329656094e-05, "loss": 1.039, "step": 84200 }, { "epoch": 0.08, "grad_norm": 24.375, "learning_rate": 4.671234725646517e-05, "loss": 0.9963, "step": 84300 }, { "epoch": 0.08, "grad_norm": 66.5, "learning_rate": 4.670784818327425e-05, "loss": 1.0757, "step": 84400 }, { "epoch": 0.08, "grad_norm": 17.875, "learning_rate": 4.670334911008333e-05, "loss": 0.9207, "step": 84500 }, { "epoch": 0.08, "grad_norm": 37.75, "learning_rate": 4.66988500368924e-05, "loss": 1.0711, "step": 84600 }, { "epoch": 0.08, "grad_norm": 25.875, "learning_rate": 4.669435096370148e-05, "loss": 0.9419, "step": 84700 }, { "epoch": 0.08, "grad_norm": 108.5, "learning_rate": 4.668985189051055e-05, "loss": 1.0834, "step": 84800 }, { "epoch": 0.08, "grad_norm": 288.0, "learning_rate": 4.6685352817319635e-05, "loss": 1.0528, "step": 84900 }, { "epoch": 0.08, "grad_norm": 14.625, "learning_rate": 4.668085374412871e-05, "loss": 0.8459, "step": 85000 }, { "epoch": 0.08, "grad_norm": 110.5, "learning_rate": 4.6676354670937786e-05, "loss": 0.8702, "step": 85100 }, { "epoch": 0.08, "grad_norm": 15.4375, "learning_rate": 4.667185559774687e-05, "loss": 0.9928, "step": 85200 }, { "epoch": 0.08, "grad_norm": 35.0, "learning_rate": 4.666735652455594e-05, "loss": 0.8058, "step": 85300 }, { "epoch": 0.08, "grad_norm": 13.875, "learning_rate": 4.666285745136502e-05, "loss": 1.072, "step": 85400 }, { "epoch": 0.08, "grad_norm": 47.5, "learning_rate": 4.66583583781741e-05, "loss": 0.9577, "step": 85500 }, { "epoch": 0.08, "grad_norm": 21.125, "learning_rate": 4.6653859304983176e-05, "loss": 1.1584, "step": 85600 }, { "epoch": 0.08, "grad_norm": 23.25, "learning_rate": 4.664936023179226e-05, "loss": 1.003, "step": 85700 }, { "epoch": 0.08, "grad_norm": 29.625, "learning_rate": 4.6644861158601334e-05, "loss": 1.0796, "step": 85800 }, { "epoch": 0.08, "grad_norm": 0.00189971923828125, "learning_rate": 4.66403620854104e-05, "loss": 1.0614, "step": 85900 }, { "epoch": 0.08, "grad_norm": 126.0, "learning_rate": 4.6635863012219484e-05, "loss": 1.0196, "step": 86000 }, { "epoch": 0.08, "grad_norm": 70.0, "learning_rate": 4.663136393902856e-05, "loss": 0.992, "step": 86100 }, { "epoch": 0.08, "grad_norm": 26.5, "learning_rate": 4.662686486583764e-05, "loss": 1.0376, "step": 86200 }, { "epoch": 0.08, "grad_norm": 26.125, "learning_rate": 4.662236579264672e-05, "loss": 1.046, "step": 86300 }, { "epoch": 0.08, "grad_norm": 22.125, "learning_rate": 4.661786671945579e-05, "loss": 0.9676, "step": 86400 }, { "epoch": 0.08, "grad_norm": 5.15625, "learning_rate": 4.6613367646264875e-05, "loss": 0.9605, "step": 86500 }, { "epoch": 0.08, "grad_norm": 15.0625, "learning_rate": 4.660886857307395e-05, "loss": 1.1733, "step": 86600 }, { "epoch": 0.08, "grad_norm": 51.0, "learning_rate": 4.6604369499883025e-05, "loss": 1.0272, "step": 86700 }, { "epoch": 0.08, "grad_norm": 18.375, "learning_rate": 4.659987042669211e-05, "loss": 0.9423, "step": 86800 }, { "epoch": 0.08, "grad_norm": 15.6875, "learning_rate": 4.659537135350118e-05, "loss": 1.0338, "step": 86900 }, { "epoch": 0.08, "grad_norm": 13.4375, "learning_rate": 4.659087228031026e-05, "loss": 1.0813, "step": 87000 }, { "epoch": 0.08, "grad_norm": 12.9375, "learning_rate": 4.658637320711934e-05, "loss": 1.0381, "step": 87100 }, { "epoch": 0.08, "grad_norm": 12.75, "learning_rate": 4.658187413392841e-05, "loss": 0.9795, "step": 87200 }, { "epoch": 0.08, "grad_norm": 44.0, "learning_rate": 4.657737506073749e-05, "loss": 0.9689, "step": 87300 }, { "epoch": 0.08, "grad_norm": 17.5, "learning_rate": 4.6572875987546566e-05, "loss": 0.9152, "step": 87400 }, { "epoch": 0.08, "grad_norm": 11.875, "learning_rate": 4.656837691435564e-05, "loss": 1.0113, "step": 87500 }, { "epoch": 0.08, "grad_norm": 25.75, "learning_rate": 4.6563877841164724e-05, "loss": 1.0188, "step": 87600 }, { "epoch": 0.08, "grad_norm": 0.076171875, "learning_rate": 4.65593787679738e-05, "loss": 1.0329, "step": 87700 }, { "epoch": 0.08, "grad_norm": 105.5, "learning_rate": 4.6554879694782874e-05, "loss": 1.0492, "step": 87800 }, { "epoch": 0.08, "grad_norm": 69.0, "learning_rate": 4.6550380621591956e-05, "loss": 0.9479, "step": 87900 }, { "epoch": 0.08, "grad_norm": 245.0, "learning_rate": 4.654588154840103e-05, "loss": 0.9938, "step": 88000 }, { "epoch": 0.08, "grad_norm": 676.0, "learning_rate": 4.654138247521011e-05, "loss": 0.9638, "step": 88100 }, { "epoch": 0.08, "grad_norm": 39.5, "learning_rate": 4.653688340201919e-05, "loss": 1.0982, "step": 88200 }, { "epoch": 0.08, "grad_norm": 29.625, "learning_rate": 4.6532384328828265e-05, "loss": 1.0785, "step": 88300 }, { "epoch": 0.08, "grad_norm": 5.875, "learning_rate": 4.652788525563735e-05, "loss": 1.0102, "step": 88400 }, { "epoch": 0.08, "grad_norm": 0.034423828125, "learning_rate": 4.6523386182446415e-05, "loss": 1.088, "step": 88500 }, { "epoch": 0.08, "grad_norm": 0.33203125, "learning_rate": 4.651888710925549e-05, "loss": 0.946, "step": 88600 }, { "epoch": 0.08, "grad_norm": 25.125, "learning_rate": 4.651438803606457e-05, "loss": 1.0201, "step": 88700 }, { "epoch": 0.08, "grad_norm": 14.3125, "learning_rate": 4.650988896287365e-05, "loss": 0.9087, "step": 88800 }, { "epoch": 0.08, "grad_norm": 0.01336669921875, "learning_rate": 4.650538988968272e-05, "loss": 1.0322, "step": 88900 }, { "epoch": 0.08, "grad_norm": 19.875, "learning_rate": 4.6500890816491805e-05, "loss": 1.099, "step": 89000 }, { "epoch": 0.08, "grad_norm": 26.125, "learning_rate": 4.649639174330088e-05, "loss": 1.0903, "step": 89100 }, { "epoch": 0.08, "grad_norm": 62.0, "learning_rate": 4.649189267010996e-05, "loss": 1.1445, "step": 89200 }, { "epoch": 0.08, "grad_norm": 165.0, "learning_rate": 4.648739359691904e-05, "loss": 1.0607, "step": 89300 }, { "epoch": 0.08, "grad_norm": 10.75, "learning_rate": 4.6482894523728114e-05, "loss": 1.1787, "step": 89400 }, { "epoch": 0.08, "grad_norm": 38.5, "learning_rate": 4.6478395450537196e-05, "loss": 0.973, "step": 89500 }, { "epoch": 0.08, "grad_norm": 53.25, "learning_rate": 4.647389637734627e-05, "loss": 0.9323, "step": 89600 }, { "epoch": 0.08, "grad_norm": 28.25, "learning_rate": 4.6469397304155346e-05, "loss": 1.1569, "step": 89700 }, { "epoch": 0.08, "grad_norm": 34.25, "learning_rate": 4.646489823096442e-05, "loss": 1.1027, "step": 89800 }, { "epoch": 0.08, "grad_norm": 49.0, "learning_rate": 4.64603991577735e-05, "loss": 1.2724, "step": 89900 }, { "epoch": 0.08, "grad_norm": 20.75, "learning_rate": 4.645590008458258e-05, "loss": 1.0498, "step": 90000 }, { "epoch": 0.08, "grad_norm": 0.3203125, "learning_rate": 4.6451401011391655e-05, "loss": 0.9929, "step": 90100 }, { "epoch": 0.08, "grad_norm": 65.0, "learning_rate": 4.644690193820073e-05, "loss": 1.1277, "step": 90200 }, { "epoch": 0.08, "grad_norm": 24.125, "learning_rate": 4.644240286500981e-05, "loss": 1.1095, "step": 90300 }, { "epoch": 0.08, "grad_norm": 28.375, "learning_rate": 4.643790379181889e-05, "loss": 1.1632, "step": 90400 }, { "epoch": 0.08, "grad_norm": 322.0, "learning_rate": 4.643340471862796e-05, "loss": 1.005, "step": 90500 }, { "epoch": 0.08, "grad_norm": 68.0, "learning_rate": 4.6428905645437045e-05, "loss": 1.0186, "step": 90600 }, { "epoch": 0.08, "grad_norm": 21.625, "learning_rate": 4.642440657224612e-05, "loss": 1.1503, "step": 90700 }, { "epoch": 0.08, "grad_norm": 38.5, "learning_rate": 4.6419907499055195e-05, "loss": 1.1201, "step": 90800 }, { "epoch": 0.08, "grad_norm": 43.25, "learning_rate": 4.641540842586428e-05, "loss": 1.0718, "step": 90900 }, { "epoch": 0.08, "grad_norm": 206.0, "learning_rate": 4.641090935267335e-05, "loss": 1.1328, "step": 91000 }, { "epoch": 0.08, "grad_norm": 31.125, "learning_rate": 4.640641027948243e-05, "loss": 1.0125, "step": 91100 }, { "epoch": 0.08, "grad_norm": 59.25, "learning_rate": 4.6401911206291504e-05, "loss": 0.9647, "step": 91200 }, { "epoch": 0.08, "grad_norm": 21.125, "learning_rate": 4.639741213310058e-05, "loss": 0.9768, "step": 91300 }, { "epoch": 0.08, "grad_norm": 24.5, "learning_rate": 4.639291305990966e-05, "loss": 0.9173, "step": 91400 }, { "epoch": 0.08, "grad_norm": 37.25, "learning_rate": 4.6388413986718736e-05, "loss": 0.9748, "step": 91500 }, { "epoch": 0.08, "grad_norm": 70.0, "learning_rate": 4.638391491352781e-05, "loss": 0.9763, "step": 91600 }, { "epoch": 0.08, "grad_norm": 34.0, "learning_rate": 4.6379415840336894e-05, "loss": 1.0998, "step": 91700 }, { "epoch": 0.08, "grad_norm": 6.8125, "learning_rate": 4.637491676714597e-05, "loss": 0.9437, "step": 91800 }, { "epoch": 0.08, "grad_norm": 740.0, "learning_rate": 4.637041769395505e-05, "loss": 1.094, "step": 91900 }, { "epoch": 0.08, "grad_norm": 2.328125, "learning_rate": 4.636591862076413e-05, "loss": 1.0091, "step": 92000 }, { "epoch": 0.08, "grad_norm": 34.25, "learning_rate": 4.63614195475732e-05, "loss": 1.1075, "step": 92100 }, { "epoch": 0.08, "grad_norm": 163.0, "learning_rate": 4.6356920474382284e-05, "loss": 1.0266, "step": 92200 }, { "epoch": 0.08, "grad_norm": 55.5, "learning_rate": 4.635242140119136e-05, "loss": 0.9571, "step": 92300 }, { "epoch": 0.08, "grad_norm": 16.875, "learning_rate": 4.6347922328000435e-05, "loss": 1.095, "step": 92400 }, { "epoch": 0.08, "grad_norm": 46.25, "learning_rate": 4.634342325480951e-05, "loss": 1.0834, "step": 92500 }, { "epoch": 0.08, "grad_norm": 10.1875, "learning_rate": 4.6338924181618585e-05, "loss": 0.9692, "step": 92600 }, { "epoch": 0.08, "grad_norm": 38.0, "learning_rate": 4.633442510842767e-05, "loss": 1.1476, "step": 92700 }, { "epoch": 0.08, "grad_norm": 1.5625, "learning_rate": 4.632992603523674e-05, "loss": 0.973, "step": 92800 }, { "epoch": 0.08, "grad_norm": 164.0, "learning_rate": 4.632542696204582e-05, "loss": 0.9548, "step": 92900 }, { "epoch": 0.08, "grad_norm": 51.5, "learning_rate": 4.63209278888549e-05, "loss": 1.0253, "step": 93000 }, { "epoch": 0.08, "grad_norm": 19.75, "learning_rate": 4.6316428815663976e-05, "loss": 1.0976, "step": 93100 }, { "epoch": 0.08, "grad_norm": 13.875, "learning_rate": 4.631192974247305e-05, "loss": 0.9796, "step": 93200 }, { "epoch": 0.08, "grad_norm": 872.0, "learning_rate": 4.630743066928213e-05, "loss": 0.8708, "step": 93300 }, { "epoch": 0.08, "grad_norm": 25.375, "learning_rate": 4.630293159609121e-05, "loss": 1.0497, "step": 93400 }, { "epoch": 0.08, "grad_norm": 34.5, "learning_rate": 4.6298432522900284e-05, "loss": 1.1437, "step": 93500 }, { "epoch": 0.08, "grad_norm": 16.625, "learning_rate": 4.6293933449709366e-05, "loss": 0.8803, "step": 93600 }, { "epoch": 0.08, "grad_norm": 1.2265625, "learning_rate": 4.6289434376518435e-05, "loss": 0.9964, "step": 93700 }, { "epoch": 0.08, "grad_norm": 25.75, "learning_rate": 4.628493530332752e-05, "loss": 0.9963, "step": 93800 }, { "epoch": 0.08, "grad_norm": 0.0164794921875, "learning_rate": 4.628043623013659e-05, "loss": 1.0988, "step": 93900 }, { "epoch": 0.08, "grad_norm": 18.875, "learning_rate": 4.627593715694567e-05, "loss": 1.0421, "step": 94000 }, { "epoch": 0.08, "grad_norm": 15.875, "learning_rate": 4.627143808375475e-05, "loss": 0.9977, "step": 94100 }, { "epoch": 0.08, "grad_norm": 32.0, "learning_rate": 4.6266939010563825e-05, "loss": 0.9836, "step": 94200 }, { "epoch": 0.08, "grad_norm": 169.0, "learning_rate": 4.62624399373729e-05, "loss": 1.0128, "step": 94300 }, { "epoch": 0.08, "grad_norm": 86.5, "learning_rate": 4.625794086418198e-05, "loss": 0.952, "step": 94400 }, { "epoch": 0.08, "grad_norm": 0.62109375, "learning_rate": 4.625344179099106e-05, "loss": 1.1159, "step": 94500 }, { "epoch": 0.08, "grad_norm": 0.126953125, "learning_rate": 4.624894271780014e-05, "loss": 1.0818, "step": 94600 }, { "epoch": 0.08, "grad_norm": 78.0, "learning_rate": 4.6244443644609215e-05, "loss": 1.0271, "step": 94700 }, { "epoch": 0.08, "grad_norm": 66.5, "learning_rate": 4.623994457141829e-05, "loss": 1.0018, "step": 94800 }, { "epoch": 0.08, "grad_norm": 12.75, "learning_rate": 4.623544549822737e-05, "loss": 0.9924, "step": 94900 }, { "epoch": 0.08, "grad_norm": 0.0023193359375, "learning_rate": 4.623094642503644e-05, "loss": 0.9408, "step": 95000 }, { "epoch": 0.08, "grad_norm": 35.5, "learning_rate": 4.622644735184552e-05, "loss": 1.2318, "step": 95100 }, { "epoch": 0.08, "grad_norm": 53.0, "learning_rate": 4.62219482786546e-05, "loss": 0.9541, "step": 95200 }, { "epoch": 0.08, "grad_norm": 108.5, "learning_rate": 4.6217449205463674e-05, "loss": 1.0628, "step": 95300 }, { "epoch": 0.08, "grad_norm": 159.0, "learning_rate": 4.6212950132272756e-05, "loss": 1.0599, "step": 95400 }, { "epoch": 0.09, "grad_norm": 17.5, "learning_rate": 4.620845105908183e-05, "loss": 1.1268, "step": 95500 }, { "epoch": 0.09, "grad_norm": 18.375, "learning_rate": 4.620395198589091e-05, "loss": 1.0284, "step": 95600 }, { "epoch": 0.09, "grad_norm": 35.25, "learning_rate": 4.619945291269999e-05, "loss": 1.0674, "step": 95700 }, { "epoch": 0.09, "grad_norm": 51.75, "learning_rate": 4.6194953839509064e-05, "loss": 1.1138, "step": 95800 }, { "epoch": 0.09, "grad_norm": 41.5, "learning_rate": 4.619045476631814e-05, "loss": 1.1292, "step": 95900 }, { "epoch": 0.09, "grad_norm": 40.5, "learning_rate": 4.618595569312722e-05, "loss": 1.0113, "step": 96000 }, { "epoch": 0.09, "grad_norm": 17.5, "learning_rate": 4.61814566199363e-05, "loss": 1.1243, "step": 96100 }, { "epoch": 0.09, "grad_norm": 15.1875, "learning_rate": 4.617695754674537e-05, "loss": 0.9991, "step": 96200 }, { "epoch": 0.09, "grad_norm": 17.25, "learning_rate": 4.617245847355445e-05, "loss": 1.0575, "step": 96300 }, { "epoch": 0.09, "grad_norm": 330.0, "learning_rate": 4.616795940036352e-05, "loss": 1.0142, "step": 96400 }, { "epoch": 0.09, "grad_norm": 16.75, "learning_rate": 4.6163460327172605e-05, "loss": 1.1727, "step": 96500 }, { "epoch": 0.09, "grad_norm": 17.75, "learning_rate": 4.615896125398168e-05, "loss": 1.1062, "step": 96600 }, { "epoch": 0.09, "grad_norm": 117.0, "learning_rate": 4.6154462180790756e-05, "loss": 0.9946, "step": 96700 }, { "epoch": 0.09, "grad_norm": 41.5, "learning_rate": 4.614996310759984e-05, "loss": 1.0037, "step": 96800 }, { "epoch": 0.09, "grad_norm": 0.294921875, "learning_rate": 4.614546403440891e-05, "loss": 0.964, "step": 96900 }, { "epoch": 0.09, "grad_norm": 660.0, "learning_rate": 4.614096496121799e-05, "loss": 1.0423, "step": 97000 }, { "epoch": 0.09, "grad_norm": 29.5, "learning_rate": 4.613646588802707e-05, "loss": 1.0066, "step": 97100 }, { "epoch": 0.09, "grad_norm": 16.0, "learning_rate": 4.6131966814836146e-05, "loss": 0.9157, "step": 97200 }, { "epoch": 0.09, "grad_norm": 94.5, "learning_rate": 4.612746774164523e-05, "loss": 1.0699, "step": 97300 }, { "epoch": 0.09, "grad_norm": 103.5, "learning_rate": 4.61229686684543e-05, "loss": 1.1165, "step": 97400 }, { "epoch": 0.09, "grad_norm": 31.375, "learning_rate": 4.611846959526338e-05, "loss": 1.0617, "step": 97500 }, { "epoch": 0.09, "grad_norm": 159.0, "learning_rate": 4.6113970522072454e-05, "loss": 1.0553, "step": 97600 }, { "epoch": 0.09, "grad_norm": 137.0, "learning_rate": 4.610947144888153e-05, "loss": 1.0892, "step": 97700 }, { "epoch": 0.09, "grad_norm": 20.875, "learning_rate": 4.610497237569061e-05, "loss": 0.8908, "step": 97800 }, { "epoch": 0.09, "grad_norm": 66.0, "learning_rate": 4.610047330249969e-05, "loss": 0.9143, "step": 97900 }, { "epoch": 0.09, "grad_norm": 79.5, "learning_rate": 4.609597422930876e-05, "loss": 1.019, "step": 98000 }, { "epoch": 0.09, "grad_norm": 23.75, "learning_rate": 4.6091475156117844e-05, "loss": 1.1294, "step": 98100 }, { "epoch": 0.09, "grad_norm": 3.921875, "learning_rate": 4.608697608292692e-05, "loss": 1.0638, "step": 98200 }, { "epoch": 0.09, "grad_norm": 40.0, "learning_rate": 4.6082477009735995e-05, "loss": 0.9934, "step": 98300 }, { "epoch": 0.09, "grad_norm": 153.0, "learning_rate": 4.607797793654508e-05, "loss": 1.064, "step": 98400 }, { "epoch": 0.09, "grad_norm": 11.1875, "learning_rate": 4.607347886335415e-05, "loss": 1.2169, "step": 98500 }, { "epoch": 0.09, "grad_norm": 126.5, "learning_rate": 4.606897979016323e-05, "loss": 0.9162, "step": 98600 }, { "epoch": 0.09, "grad_norm": 17.875, "learning_rate": 4.606448071697231e-05, "loss": 1.0007, "step": 98700 }, { "epoch": 0.09, "grad_norm": 36.25, "learning_rate": 4.6059981643781385e-05, "loss": 0.9393, "step": 98800 }, { "epoch": 0.09, "grad_norm": 18.5, "learning_rate": 4.605548257059046e-05, "loss": 1.1701, "step": 98900 }, { "epoch": 0.09, "grad_norm": 25.0, "learning_rate": 4.6050983497399536e-05, "loss": 1.1328, "step": 99000 }, { "epoch": 0.09, "grad_norm": 0.0191650390625, "learning_rate": 4.604648442420861e-05, "loss": 1.0642, "step": 99100 }, { "epoch": 0.09, "grad_norm": 16.375, "learning_rate": 4.604198535101769e-05, "loss": 1.0403, "step": 99200 }, { "epoch": 0.09, "grad_norm": 46.5, "learning_rate": 4.603748627782677e-05, "loss": 1.1214, "step": 99300 }, { "epoch": 0.09, "grad_norm": 12.375, "learning_rate": 4.6032987204635844e-05, "loss": 0.9587, "step": 99400 }, { "epoch": 0.09, "grad_norm": 113.5, "learning_rate": 4.6028488131444926e-05, "loss": 1.1008, "step": 99500 }, { "epoch": 0.09, "grad_norm": 122.0, "learning_rate": 4.6023989058254e-05, "loss": 0.9344, "step": 99600 }, { "epoch": 0.09, "grad_norm": 56.25, "learning_rate": 4.601948998506308e-05, "loss": 1.0311, "step": 99700 }, { "epoch": 0.09, "grad_norm": 1536.0, "learning_rate": 4.601499091187216e-05, "loss": 1.0442, "step": 99800 }, { "epoch": 0.09, "grad_norm": 19.875, "learning_rate": 4.6010491838681234e-05, "loss": 1.1676, "step": 99900 }, { "epoch": 0.09, "grad_norm": 56.75, "learning_rate": 4.6005992765490316e-05, "loss": 1.1096, "step": 100000 }, { "epoch": 0.09, "grad_norm": 107.5, "learning_rate": 4.600149369229939e-05, "loss": 0.947, "step": 100100 }, { "epoch": 0.09, "grad_norm": 15.25, "learning_rate": 4.599699461910846e-05, "loss": 1.0848, "step": 100200 }, { "epoch": 0.09, "grad_norm": 13.8125, "learning_rate": 4.599249554591754e-05, "loss": 1.0569, "step": 100300 }, { "epoch": 0.09, "grad_norm": 48.75, "learning_rate": 4.598799647272662e-05, "loss": 1.0007, "step": 100400 }, { "epoch": 0.09, "grad_norm": 21.5, "learning_rate": 4.59834973995357e-05, "loss": 1.1294, "step": 100500 }, { "epoch": 0.09, "grad_norm": 196.0, "learning_rate": 4.5978998326344775e-05, "loss": 1.1246, "step": 100600 }, { "epoch": 0.09, "grad_norm": 28.0, "learning_rate": 4.597449925315385e-05, "loss": 1.1526, "step": 100700 }, { "epoch": 0.09, "grad_norm": 50.5, "learning_rate": 4.597000017996293e-05, "loss": 1.0116, "step": 100800 }, { "epoch": 0.09, "grad_norm": 4.71875, "learning_rate": 4.596550110677201e-05, "loss": 1.1103, "step": 100900 }, { "epoch": 0.09, "grad_norm": 29.75, "learning_rate": 4.596100203358108e-05, "loss": 1.0276, "step": 101000 }, { "epoch": 0.09, "grad_norm": 32.25, "learning_rate": 4.5956502960390165e-05, "loss": 1.1312, "step": 101100 }, { "epoch": 0.09, "grad_norm": 17.25, "learning_rate": 4.595200388719924e-05, "loss": 0.8968, "step": 101200 }, { "epoch": 0.09, "grad_norm": 45.5, "learning_rate": 4.5947504814008316e-05, "loss": 0.9783, "step": 101300 }, { "epoch": 0.09, "grad_norm": 21.625, "learning_rate": 4.59430057408174e-05, "loss": 1.1372, "step": 101400 }, { "epoch": 0.09, "grad_norm": 29.0, "learning_rate": 4.593850666762647e-05, "loss": 1.1434, "step": 101500 }, { "epoch": 0.09, "grad_norm": 31.0, "learning_rate": 4.593400759443555e-05, "loss": 0.9434, "step": 101600 }, { "epoch": 0.09, "grad_norm": 11.5, "learning_rate": 4.5929508521244624e-05, "loss": 0.9486, "step": 101700 }, { "epoch": 0.09, "grad_norm": 116.5, "learning_rate": 4.59250094480537e-05, "loss": 1.0677, "step": 101800 }, { "epoch": 0.09, "grad_norm": 25.25, "learning_rate": 4.592051037486278e-05, "loss": 0.8866, "step": 101900 }, { "epoch": 0.09, "grad_norm": 82.5, "learning_rate": 4.591601130167186e-05, "loss": 1.0288, "step": 102000 }, { "epoch": 0.09, "grad_norm": 39.75, "learning_rate": 4.591151222848093e-05, "loss": 1.1237, "step": 102100 }, { "epoch": 0.09, "grad_norm": 19.375, "learning_rate": 4.5907013155290015e-05, "loss": 1.0126, "step": 102200 }, { "epoch": 0.09, "grad_norm": 20.75, "learning_rate": 4.590251408209909e-05, "loss": 1.006, "step": 102300 }, { "epoch": 0.09, "grad_norm": 0.482421875, "learning_rate": 4.5898015008908165e-05, "loss": 1.1302, "step": 102400 }, { "epoch": 0.09, "grad_norm": 49.75, "learning_rate": 4.589351593571725e-05, "loss": 1.0853, "step": 102500 }, { "epoch": 0.09, "grad_norm": 144.0, "learning_rate": 4.588901686252632e-05, "loss": 1.0041, "step": 102600 }, { "epoch": 0.09, "grad_norm": 376.0, "learning_rate": 4.5884517789335405e-05, "loss": 1.0391, "step": 102700 }, { "epoch": 0.09, "grad_norm": 16.625, "learning_rate": 4.588001871614447e-05, "loss": 1.0192, "step": 102800 }, { "epoch": 0.09, "grad_norm": 31.375, "learning_rate": 4.587551964295355e-05, "loss": 1.0514, "step": 102900 }, { "epoch": 0.09, "grad_norm": 53.5, "learning_rate": 4.587102056976263e-05, "loss": 1.0099, "step": 103000 }, { "epoch": 0.09, "grad_norm": 1.0390625, "learning_rate": 4.5866521496571706e-05, "loss": 0.9165, "step": 103100 }, { "epoch": 0.09, "grad_norm": 15.1875, "learning_rate": 4.586202242338079e-05, "loss": 0.9151, "step": 103200 }, { "epoch": 0.09, "grad_norm": 36.25, "learning_rate": 4.5857523350189864e-05, "loss": 1.1059, "step": 103300 }, { "epoch": 0.09, "grad_norm": 162.0, "learning_rate": 4.585302427699894e-05, "loss": 1.1004, "step": 103400 }, { "epoch": 0.09, "grad_norm": 22.75, "learning_rate": 4.584852520380802e-05, "loss": 0.9395, "step": 103500 }, { "epoch": 0.09, "grad_norm": 84.5, "learning_rate": 4.5844026130617096e-05, "loss": 1.159, "step": 103600 }, { "epoch": 0.09, "grad_norm": 0.345703125, "learning_rate": 4.583952705742617e-05, "loss": 0.9854, "step": 103700 }, { "epoch": 0.09, "grad_norm": 43.0, "learning_rate": 4.5835027984235254e-05, "loss": 0.9976, "step": 103800 }, { "epoch": 0.09, "grad_norm": 0.08544921875, "learning_rate": 4.583052891104433e-05, "loss": 1.0317, "step": 103900 }, { "epoch": 0.09, "grad_norm": 0.000629425048828125, "learning_rate": 4.5826029837853405e-05, "loss": 1.125, "step": 104000 }, { "epoch": 0.09, "grad_norm": 101.0, "learning_rate": 4.582153076466248e-05, "loss": 1.0035, "step": 104100 }, { "epoch": 0.09, "grad_norm": 6.5, "learning_rate": 4.5817031691471555e-05, "loss": 1.0678, "step": 104200 }, { "epoch": 0.09, "grad_norm": 146.0, "learning_rate": 4.581253261828064e-05, "loss": 1.1163, "step": 104300 }, { "epoch": 0.09, "grad_norm": 17.125, "learning_rate": 4.580803354508971e-05, "loss": 0.9413, "step": 104400 }, { "epoch": 0.09, "grad_norm": 38.0, "learning_rate": 4.580353447189879e-05, "loss": 0.9658, "step": 104500 }, { "epoch": 0.09, "grad_norm": 82.5, "learning_rate": 4.579903539870787e-05, "loss": 1.1039, "step": 104600 }, { "epoch": 0.09, "grad_norm": 30.25, "learning_rate": 4.5794536325516945e-05, "loss": 1.0439, "step": 104700 }, { "epoch": 0.09, "grad_norm": 29.25, "learning_rate": 4.579003725232602e-05, "loss": 0.8397, "step": 104800 }, { "epoch": 0.09, "grad_norm": 58.5, "learning_rate": 4.57855381791351e-05, "loss": 1.0219, "step": 104900 }, { "epoch": 0.09, "grad_norm": 48.25, "learning_rate": 4.578103910594418e-05, "loss": 1.0695, "step": 105000 }, { "epoch": 0.09, "grad_norm": 0.462890625, "learning_rate": 4.5776540032753254e-05, "loss": 0.9501, "step": 105100 }, { "epoch": 0.09, "grad_norm": 16.375, "learning_rate": 4.5772040959562336e-05, "loss": 1.044, "step": 105200 }, { "epoch": 0.09, "grad_norm": 41.25, "learning_rate": 4.576754188637141e-05, "loss": 1.0914, "step": 105300 }, { "epoch": 0.09, "grad_norm": 0.005096435546875, "learning_rate": 4.5763042813180486e-05, "loss": 1.1638, "step": 105400 }, { "epoch": 0.09, "grad_norm": 12.3125, "learning_rate": 4.575854373998956e-05, "loss": 1.0377, "step": 105500 }, { "epoch": 0.09, "grad_norm": 59.25, "learning_rate": 4.575404466679864e-05, "loss": 1.0802, "step": 105600 }, { "epoch": 0.09, "grad_norm": 41.75, "learning_rate": 4.574954559360772e-05, "loss": 1.099, "step": 105700 }, { "epoch": 0.09, "grad_norm": 0.004180908203125, "learning_rate": 4.5745046520416795e-05, "loss": 1.1757, "step": 105800 }, { "epoch": 0.09, "grad_norm": 33.75, "learning_rate": 4.574054744722587e-05, "loss": 1.1173, "step": 105900 }, { "epoch": 0.09, "grad_norm": 4.0, "learning_rate": 4.573604837403495e-05, "loss": 1.0574, "step": 106000 }, { "epoch": 0.09, "grad_norm": 0.004486083984375, "learning_rate": 4.573154930084403e-05, "loss": 1.1521, "step": 106100 }, { "epoch": 0.09, "grad_norm": 63.75, "learning_rate": 4.572705022765311e-05, "loss": 1.1711, "step": 106200 }, { "epoch": 0.09, "grad_norm": 18.5, "learning_rate": 4.5722551154462185e-05, "loss": 1.042, "step": 106300 }, { "epoch": 0.09, "grad_norm": 20.375, "learning_rate": 4.571805208127126e-05, "loss": 1.0231, "step": 106400 }, { "epoch": 0.09, "grad_norm": 25.625, "learning_rate": 4.571355300808034e-05, "loss": 0.9212, "step": 106500 }, { "epoch": 0.09, "grad_norm": 39.5, "learning_rate": 4.570905393488942e-05, "loss": 0.9912, "step": 106600 }, { "epoch": 0.1, "grad_norm": 82.5, "learning_rate": 4.570455486169849e-05, "loss": 1.2595, "step": 106700 }, { "epoch": 0.1, "grad_norm": 0.06787109375, "learning_rate": 4.570005578850757e-05, "loss": 0.9667, "step": 106800 }, { "epoch": 0.1, "grad_norm": 18.75, "learning_rate": 4.5695556715316644e-05, "loss": 1.0125, "step": 106900 }, { "epoch": 0.1, "grad_norm": 276.0, "learning_rate": 4.5691057642125726e-05, "loss": 1.0792, "step": 107000 }, { "epoch": 0.1, "grad_norm": 16.875, "learning_rate": 4.56865585689348e-05, "loss": 0.8907, "step": 107100 }, { "epoch": 0.1, "grad_norm": 33.5, "learning_rate": 4.5682059495743876e-05, "loss": 1.0903, "step": 107200 }, { "epoch": 0.1, "grad_norm": 50.5, "learning_rate": 4.567756042255296e-05, "loss": 1.1117, "step": 107300 }, { "epoch": 0.1, "grad_norm": 9.5, "learning_rate": 4.5673061349362034e-05, "loss": 1.0159, "step": 107400 }, { "epoch": 0.1, "grad_norm": 21.0, "learning_rate": 4.566856227617111e-05, "loss": 0.9291, "step": 107500 }, { "epoch": 0.1, "grad_norm": 58.25, "learning_rate": 4.566406320298019e-05, "loss": 1.0275, "step": 107600 }, { "epoch": 0.1, "grad_norm": 22.25, "learning_rate": 4.565956412978927e-05, "loss": 1.0228, "step": 107700 }, { "epoch": 0.1, "grad_norm": 264.0, "learning_rate": 4.565506505659834e-05, "loss": 0.9558, "step": 107800 }, { "epoch": 0.1, "grad_norm": 36.0, "learning_rate": 4.5650565983407424e-05, "loss": 1.0886, "step": 107900 }, { "epoch": 0.1, "grad_norm": 29.5, "learning_rate": 4.564606691021649e-05, "loss": 1.0162, "step": 108000 }, { "epoch": 0.1, "grad_norm": 141.0, "learning_rate": 4.5641567837025575e-05, "loss": 1.1004, "step": 108100 }, { "epoch": 0.1, "grad_norm": 20.0, "learning_rate": 4.563706876383465e-05, "loss": 1.0725, "step": 108200 }, { "epoch": 0.1, "grad_norm": 20.5, "learning_rate": 4.5632569690643725e-05, "loss": 1.0277, "step": 108300 }, { "epoch": 0.1, "grad_norm": 37.25, "learning_rate": 4.562807061745281e-05, "loss": 1.0413, "step": 108400 }, { "epoch": 0.1, "grad_norm": 92.5, "learning_rate": 4.562357154426188e-05, "loss": 1.1285, "step": 108500 }, { "epoch": 0.1, "grad_norm": 0.83203125, "learning_rate": 4.561907247107096e-05, "loss": 0.9656, "step": 108600 }, { "epoch": 0.1, "grad_norm": 572.0, "learning_rate": 4.561457339788004e-05, "loss": 0.9537, "step": 108700 }, { "epoch": 0.1, "grad_norm": 78.0, "learning_rate": 4.5610074324689116e-05, "loss": 1.024, "step": 108800 }, { "epoch": 0.1, "grad_norm": 31.0, "learning_rate": 4.56055752514982e-05, "loss": 1.0087, "step": 108900 }, { "epoch": 0.1, "grad_norm": 0.0269775390625, "learning_rate": 4.560107617830727e-05, "loss": 1.0142, "step": 109000 }, { "epoch": 0.1, "grad_norm": 17.125, "learning_rate": 4.559657710511635e-05, "loss": 1.0721, "step": 109100 }, { "epoch": 0.1, "grad_norm": 1.9609375, "learning_rate": 4.559207803192543e-05, "loss": 1.1147, "step": 109200 }, { "epoch": 0.1, "grad_norm": 27.375, "learning_rate": 4.55875789587345e-05, "loss": 0.9822, "step": 109300 }, { "epoch": 0.1, "grad_norm": 46.75, "learning_rate": 4.558307988554358e-05, "loss": 1.0803, "step": 109400 }, { "epoch": 0.1, "grad_norm": 0.94921875, "learning_rate": 4.557858081235266e-05, "loss": 1.0385, "step": 109500 }, { "epoch": 0.1, "grad_norm": 17.125, "learning_rate": 4.557408173916173e-05, "loss": 1.0949, "step": 109600 }, { "epoch": 0.1, "grad_norm": 36.75, "learning_rate": 4.5569582665970814e-05, "loss": 0.9582, "step": 109700 }, { "epoch": 0.1, "grad_norm": 30.25, "learning_rate": 4.556508359277989e-05, "loss": 1.0487, "step": 109800 }, { "epoch": 0.1, "grad_norm": 20.625, "learning_rate": 4.5560584519588965e-05, "loss": 1.0011, "step": 109900 }, { "epoch": 0.1, "grad_norm": 66.0, "learning_rate": 4.555608544639805e-05, "loss": 1.0864, "step": 110000 }, { "epoch": 0.1, "grad_norm": 84.0, "learning_rate": 4.555158637320712e-05, "loss": 1.065, "step": 110100 }, { "epoch": 0.1, "grad_norm": 35.0, "learning_rate": 4.55470873000162e-05, "loss": 1.0923, "step": 110200 }, { "epoch": 0.1, "grad_norm": 482.0, "learning_rate": 4.554258822682528e-05, "loss": 1.094, "step": 110300 }, { "epoch": 0.1, "grad_norm": 25.0, "learning_rate": 4.5538089153634355e-05, "loss": 1.043, "step": 110400 }, { "epoch": 0.1, "grad_norm": 28.25, "learning_rate": 4.553359008044343e-05, "loss": 1.016, "step": 110500 }, { "epoch": 0.1, "grad_norm": 41.5, "learning_rate": 4.5529091007252506e-05, "loss": 0.9853, "step": 110600 }, { "epoch": 0.1, "grad_norm": 51.5, "learning_rate": 4.552459193406158e-05, "loss": 1.0946, "step": 110700 }, { "epoch": 0.1, "grad_norm": 78.5, "learning_rate": 4.552009286087066e-05, "loss": 0.9516, "step": 110800 }, { "epoch": 0.1, "grad_norm": 36.5, "learning_rate": 4.551559378767974e-05, "loss": 1.0524, "step": 110900 }, { "epoch": 0.1, "grad_norm": 510.0, "learning_rate": 4.5511094714488814e-05, "loss": 1.0463, "step": 111000 }, { "epoch": 0.1, "grad_norm": 20.25, "learning_rate": 4.5506595641297896e-05, "loss": 1.1404, "step": 111100 }, { "epoch": 0.1, "grad_norm": 92.0, "learning_rate": 4.550209656810697e-05, "loss": 1.035, "step": 111200 }, { "epoch": 0.1, "grad_norm": 95.5, "learning_rate": 4.549759749491605e-05, "loss": 1.0988, "step": 111300 }, { "epoch": 0.1, "grad_norm": 10.8125, "learning_rate": 4.549309842172513e-05, "loss": 1.1427, "step": 111400 }, { "epoch": 0.1, "grad_norm": 37.25, "learning_rate": 4.5488599348534204e-05, "loss": 1.1043, "step": 111500 }, { "epoch": 0.1, "grad_norm": 83.5, "learning_rate": 4.5484100275343286e-05, "loss": 0.9516, "step": 111600 }, { "epoch": 0.1, "grad_norm": 42.25, "learning_rate": 4.547960120215236e-05, "loss": 1.0208, "step": 111700 }, { "epoch": 0.1, "grad_norm": 47.75, "learning_rate": 4.547510212896144e-05, "loss": 0.9251, "step": 111800 }, { "epoch": 0.1, "grad_norm": 13.4375, "learning_rate": 4.547060305577051e-05, "loss": 1.0534, "step": 111900 }, { "epoch": 0.1, "grad_norm": 0.09033203125, "learning_rate": 4.546610398257959e-05, "loss": 1.1408, "step": 112000 }, { "epoch": 0.1, "grad_norm": 36.25, "learning_rate": 4.546160490938867e-05, "loss": 1.0512, "step": 112100 }, { "epoch": 0.1, "grad_norm": 88.0, "learning_rate": 4.5457105836197745e-05, "loss": 0.9643, "step": 112200 }, { "epoch": 0.1, "grad_norm": 139.0, "learning_rate": 4.545260676300682e-05, "loss": 1.024, "step": 112300 }, { "epoch": 0.1, "grad_norm": 62.25, "learning_rate": 4.54481076898159e-05, "loss": 1.0729, "step": 112400 }, { "epoch": 0.1, "grad_norm": 63.75, "learning_rate": 4.544360861662498e-05, "loss": 1.0241, "step": 112500 }, { "epoch": 0.1, "grad_norm": 608.0, "learning_rate": 4.543910954343405e-05, "loss": 0.9974, "step": 112600 }, { "epoch": 0.1, "grad_norm": 28.625, "learning_rate": 4.5434610470243135e-05, "loss": 1.1885, "step": 112700 }, { "epoch": 0.1, "grad_norm": 47.75, "learning_rate": 4.543011139705221e-05, "loss": 1.0307, "step": 112800 }, { "epoch": 0.1, "grad_norm": 30.5, "learning_rate": 4.5425612323861286e-05, "loss": 1.0306, "step": 112900 }, { "epoch": 0.1, "grad_norm": 79.0, "learning_rate": 4.542111325067037e-05, "loss": 1.0288, "step": 113000 }, { "epoch": 0.1, "grad_norm": 77.5, "learning_rate": 4.5416614177479443e-05, "loss": 0.9431, "step": 113100 }, { "epoch": 0.1, "grad_norm": 142.0, "learning_rate": 4.541211510428852e-05, "loss": 1.0245, "step": 113200 }, { "epoch": 0.1, "grad_norm": 21.0, "learning_rate": 4.5407616031097594e-05, "loss": 0.9562, "step": 113300 }, { "epoch": 0.1, "grad_norm": 8.4375, "learning_rate": 4.540311695790667e-05, "loss": 0.8952, "step": 113400 }, { "epoch": 0.1, "grad_norm": 7.34375, "learning_rate": 4.539861788471575e-05, "loss": 0.9978, "step": 113500 }, { "epoch": 0.1, "grad_norm": 0.12060546875, "learning_rate": 4.539411881152483e-05, "loss": 1.0347, "step": 113600 }, { "epoch": 0.1, "grad_norm": 152.0, "learning_rate": 4.53896197383339e-05, "loss": 1.1466, "step": 113700 }, { "epoch": 0.1, "grad_norm": 23.0, "learning_rate": 4.5385120665142984e-05, "loss": 1.0597, "step": 113800 }, { "epoch": 0.1, "grad_norm": 61.25, "learning_rate": 4.538062159195206e-05, "loss": 1.0811, "step": 113900 }, { "epoch": 0.1, "grad_norm": 27.0, "learning_rate": 4.5376122518761135e-05, "loss": 1.0961, "step": 114000 }, { "epoch": 0.1, "grad_norm": 67.0, "learning_rate": 4.537162344557022e-05, "loss": 1.1227, "step": 114100 }, { "epoch": 0.1, "grad_norm": 0.189453125, "learning_rate": 4.536712437237929e-05, "loss": 1.038, "step": 114200 }, { "epoch": 0.1, "grad_norm": 26.625, "learning_rate": 4.5362625299188375e-05, "loss": 0.9473, "step": 114300 }, { "epoch": 0.1, "grad_norm": 12.875, "learning_rate": 4.535812622599745e-05, "loss": 0.8994, "step": 114400 }, { "epoch": 0.1, "grad_norm": 22.25, "learning_rate": 4.535362715280652e-05, "loss": 1.044, "step": 114500 }, { "epoch": 0.1, "grad_norm": 46.0, "learning_rate": 4.53491280796156e-05, "loss": 0.9068, "step": 114600 }, { "epoch": 0.1, "grad_norm": 94.5, "learning_rate": 4.5344629006424676e-05, "loss": 1.1091, "step": 114700 }, { "epoch": 0.1, "grad_norm": 21.875, "learning_rate": 4.534012993323376e-05, "loss": 0.8964, "step": 114800 }, { "epoch": 0.1, "grad_norm": 31.75, "learning_rate": 4.5335630860042833e-05, "loss": 1.1737, "step": 114900 }, { "epoch": 0.1, "grad_norm": 98.5, "learning_rate": 4.533113178685191e-05, "loss": 1.0359, "step": 115000 }, { "epoch": 0.1, "grad_norm": 47.75, "learning_rate": 4.532663271366099e-05, "loss": 0.9983, "step": 115100 }, { "epoch": 0.1, "grad_norm": 40.5, "learning_rate": 4.5322133640470066e-05, "loss": 0.9794, "step": 115200 }, { "epoch": 0.1, "grad_norm": 28.0, "learning_rate": 4.531763456727914e-05, "loss": 1.087, "step": 115300 }, { "epoch": 0.1, "grad_norm": 0.337890625, "learning_rate": 4.5313135494088224e-05, "loss": 0.9429, "step": 115400 }, { "epoch": 0.1, "grad_norm": 12.1875, "learning_rate": 4.53086364208973e-05, "loss": 0.9066, "step": 115500 }, { "epoch": 0.1, "grad_norm": 0.0751953125, "learning_rate": 4.5304137347706374e-05, "loss": 1.0203, "step": 115600 }, { "epoch": 0.1, "grad_norm": 14.0, "learning_rate": 4.5299638274515456e-05, "loss": 1.0212, "step": 115700 }, { "epoch": 0.1, "grad_norm": 218.0, "learning_rate": 4.5295139201324525e-05, "loss": 0.9827, "step": 115800 }, { "epoch": 0.1, "grad_norm": 0.25, "learning_rate": 4.529064012813361e-05, "loss": 1.0948, "step": 115900 }, { "epoch": 0.1, "grad_norm": 27.125, "learning_rate": 4.528614105494268e-05, "loss": 0.9621, "step": 116000 }, { "epoch": 0.1, "grad_norm": 18.125, "learning_rate": 4.528164198175176e-05, "loss": 1.038, "step": 116100 }, { "epoch": 0.1, "grad_norm": 39.0, "learning_rate": 4.527714290856084e-05, "loss": 1.0215, "step": 116200 }, { "epoch": 0.1, "grad_norm": 0.025634765625, "learning_rate": 4.5272643835369915e-05, "loss": 0.993, "step": 116300 }, { "epoch": 0.1, "grad_norm": 161.0, "learning_rate": 4.526814476217899e-05, "loss": 1.095, "step": 116400 }, { "epoch": 0.1, "grad_norm": 10.875, "learning_rate": 4.526364568898807e-05, "loss": 1.049, "step": 116500 }, { "epoch": 0.1, "grad_norm": 0.0068359375, "learning_rate": 4.525914661579715e-05, "loss": 0.9871, "step": 116600 }, { "epoch": 0.1, "grad_norm": 47.5, "learning_rate": 4.5254647542606223e-05, "loss": 0.9609, "step": 116700 }, { "epoch": 0.1, "grad_norm": 11.75, "learning_rate": 4.5250148469415306e-05, "loss": 0.9806, "step": 116800 }, { "epoch": 0.1, "grad_norm": 24.5, "learning_rate": 4.524564939622438e-05, "loss": 0.9371, "step": 116900 }, { "epoch": 0.1, "grad_norm": 55.25, "learning_rate": 4.524115032303346e-05, "loss": 0.9856, "step": 117000 }, { "epoch": 0.1, "grad_norm": 89.5, "learning_rate": 4.523665124984253e-05, "loss": 1.149, "step": 117100 }, { "epoch": 0.1, "grad_norm": 2.46875, "learning_rate": 4.523215217665161e-05, "loss": 1.1008, "step": 117200 }, { "epoch": 0.1, "grad_norm": 6.15625, "learning_rate": 4.522765310346069e-05, "loss": 1.0506, "step": 117300 }, { "epoch": 0.1, "grad_norm": 13.25, "learning_rate": 4.5223154030269764e-05, "loss": 1.0076, "step": 117400 }, { "epoch": 0.1, "grad_norm": 23.375, "learning_rate": 4.5218654957078846e-05, "loss": 1.0853, "step": 117500 }, { "epoch": 0.1, "grad_norm": 83.0, "learning_rate": 4.521415588388792e-05, "loss": 0.9788, "step": 117600 }, { "epoch": 0.1, "grad_norm": 18.625, "learning_rate": 4.5209656810697e-05, "loss": 1.0742, "step": 117700 }, { "epoch": 0.1, "grad_norm": 46.0, "learning_rate": 4.520515773750608e-05, "loss": 1.0977, "step": 117800 }, { "epoch": 0.11, "grad_norm": 0.306640625, "learning_rate": 4.5200658664315155e-05, "loss": 1.0214, "step": 117900 }, { "epoch": 0.11, "grad_norm": 12.25, "learning_rate": 4.519615959112423e-05, "loss": 1.0425, "step": 118000 }, { "epoch": 0.11, "grad_norm": 18.75, "learning_rate": 4.519166051793331e-05, "loss": 1.0495, "step": 118100 }, { "epoch": 0.11, "grad_norm": 28.375, "learning_rate": 4.518716144474239e-05, "loss": 1.0409, "step": 118200 }, { "epoch": 0.11, "grad_norm": 14.375, "learning_rate": 4.518266237155146e-05, "loss": 1.0505, "step": 118300 }, { "epoch": 0.11, "grad_norm": 90.0, "learning_rate": 4.517816329836054e-05, "loss": 0.9198, "step": 118400 }, { "epoch": 0.11, "grad_norm": 35.75, "learning_rate": 4.5173664225169613e-05, "loss": 0.9992, "step": 118500 }, { "epoch": 0.11, "grad_norm": 243.0, "learning_rate": 4.5169165151978696e-05, "loss": 1.1532, "step": 118600 }, { "epoch": 0.11, "grad_norm": 12.25, "learning_rate": 4.516466607878777e-05, "loss": 0.9649, "step": 118700 }, { "epoch": 0.11, "grad_norm": 101.0, "learning_rate": 4.5160167005596846e-05, "loss": 1.2006, "step": 118800 }, { "epoch": 0.11, "grad_norm": 75.5, "learning_rate": 4.515566793240593e-05, "loss": 0.9985, "step": 118900 }, { "epoch": 0.11, "grad_norm": 0.64453125, "learning_rate": 4.5151168859215004e-05, "loss": 1.1021, "step": 119000 }, { "epoch": 0.11, "grad_norm": 15.5625, "learning_rate": 4.514666978602408e-05, "loss": 0.9532, "step": 119100 }, { "epoch": 0.11, "grad_norm": 189.0, "learning_rate": 4.514217071283316e-05, "loss": 1.0827, "step": 119200 }, { "epoch": 0.11, "grad_norm": 109.0, "learning_rate": 4.5137671639642236e-05, "loss": 1.0307, "step": 119300 }, { "epoch": 0.11, "grad_norm": 31.625, "learning_rate": 4.513317256645131e-05, "loss": 1.1052, "step": 119400 }, { "epoch": 0.11, "grad_norm": 110.0, "learning_rate": 4.5128673493260394e-05, "loss": 1.0393, "step": 119500 }, { "epoch": 0.11, "grad_norm": 8.5, "learning_rate": 4.512417442006947e-05, "loss": 0.981, "step": 119600 }, { "epoch": 0.11, "grad_norm": 37.75, "learning_rate": 4.5119675346878545e-05, "loss": 0.9903, "step": 119700 }, { "epoch": 0.11, "grad_norm": 149.0, "learning_rate": 4.511517627368762e-05, "loss": 1.0324, "step": 119800 }, { "epoch": 0.11, "grad_norm": 119.0, "learning_rate": 4.5110677200496695e-05, "loss": 1.0228, "step": 119900 }, { "epoch": 0.11, "grad_norm": 58.75, "learning_rate": 4.510617812730578e-05, "loss": 1.1095, "step": 120000 }, { "epoch": 0.11, "grad_norm": 23.875, "learning_rate": 4.510167905411485e-05, "loss": 1.1055, "step": 120100 }, { "epoch": 0.11, "grad_norm": 36.75, "learning_rate": 4.5097179980923935e-05, "loss": 1.1821, "step": 120200 }, { "epoch": 0.11, "grad_norm": 29.25, "learning_rate": 4.509268090773301e-05, "loss": 0.9721, "step": 120300 }, { "epoch": 0.11, "grad_norm": 22.375, "learning_rate": 4.5088181834542086e-05, "loss": 1.1529, "step": 120400 }, { "epoch": 0.11, "grad_norm": 14.4375, "learning_rate": 4.508368276135117e-05, "loss": 1.1538, "step": 120500 }, { "epoch": 0.11, "grad_norm": 70.0, "learning_rate": 4.507918368816024e-05, "loss": 0.9579, "step": 120600 }, { "epoch": 0.11, "grad_norm": 12.0625, "learning_rate": 4.507468461496932e-05, "loss": 0.8607, "step": 120700 }, { "epoch": 0.11, "grad_norm": 41.0, "learning_rate": 4.50701855417784e-05, "loss": 1.0996, "step": 120800 }, { "epoch": 0.11, "grad_norm": 93.0, "learning_rate": 4.5065686468587476e-05, "loss": 0.9022, "step": 120900 }, { "epoch": 0.11, "grad_norm": 25.75, "learning_rate": 4.506118739539655e-05, "loss": 1.0507, "step": 121000 }, { "epoch": 0.11, "grad_norm": 18.25, "learning_rate": 4.5056688322205626e-05, "loss": 1.0672, "step": 121100 }, { "epoch": 0.11, "grad_norm": 23.125, "learning_rate": 4.50521892490147e-05, "loss": 1.0311, "step": 121200 }, { "epoch": 0.11, "grad_norm": 0.0078125, "learning_rate": 4.5047690175823784e-05, "loss": 1.0323, "step": 121300 }, { "epoch": 0.11, "grad_norm": 69.0, "learning_rate": 4.504319110263286e-05, "loss": 0.9827, "step": 121400 }, { "epoch": 0.11, "grad_norm": 117.5, "learning_rate": 4.5038692029441935e-05, "loss": 1.0411, "step": 121500 }, { "epoch": 0.11, "grad_norm": 7.59375, "learning_rate": 4.503419295625102e-05, "loss": 1.1013, "step": 121600 }, { "epoch": 0.11, "grad_norm": 418.0, "learning_rate": 4.502969388306009e-05, "loss": 0.9148, "step": 121700 }, { "epoch": 0.11, "grad_norm": 40.0, "learning_rate": 4.502519480986917e-05, "loss": 1.0555, "step": 121800 }, { "epoch": 0.11, "grad_norm": 40.5, "learning_rate": 4.502069573667825e-05, "loss": 0.9873, "step": 121900 }, { "epoch": 0.11, "grad_norm": 75.0, "learning_rate": 4.5016196663487325e-05, "loss": 1.2124, "step": 122000 }, { "epoch": 0.11, "grad_norm": 18.625, "learning_rate": 4.50116975902964e-05, "loss": 1.0172, "step": 122100 }, { "epoch": 0.11, "grad_norm": 47.0, "learning_rate": 4.500719851710548e-05, "loss": 1.0121, "step": 122200 }, { "epoch": 0.11, "grad_norm": 26.125, "learning_rate": 4.500269944391455e-05, "loss": 1.0633, "step": 122300 }, { "epoch": 0.11, "grad_norm": 14.625, "learning_rate": 4.499820037072363e-05, "loss": 1.0916, "step": 122400 }, { "epoch": 0.11, "grad_norm": 90.5, "learning_rate": 4.499370129753271e-05, "loss": 0.9899, "step": 122500 }, { "epoch": 0.11, "grad_norm": 26.875, "learning_rate": 4.4989202224341784e-05, "loss": 0.9726, "step": 122600 }, { "epoch": 0.11, "grad_norm": 0.3671875, "learning_rate": 4.4984703151150866e-05, "loss": 0.9431, "step": 122700 }, { "epoch": 0.11, "grad_norm": 76.0, "learning_rate": 4.498020407795994e-05, "loss": 0.9709, "step": 122800 }, { "epoch": 0.11, "grad_norm": 37.5, "learning_rate": 4.4975705004769016e-05, "loss": 0.9988, "step": 122900 }, { "epoch": 0.11, "grad_norm": 25.875, "learning_rate": 4.49712059315781e-05, "loss": 1.089, "step": 123000 }, { "epoch": 0.11, "grad_norm": 12.75, "learning_rate": 4.4966706858387174e-05, "loss": 0.9941, "step": 123100 }, { "epoch": 0.11, "grad_norm": 62.5, "learning_rate": 4.4962207785196256e-05, "loss": 0.9371, "step": 123200 }, { "epoch": 0.11, "grad_norm": 24.375, "learning_rate": 4.495770871200533e-05, "loss": 1.1702, "step": 123300 }, { "epoch": 0.11, "grad_norm": 106.5, "learning_rate": 4.495320963881441e-05, "loss": 0.9995, "step": 123400 }, { "epoch": 0.11, "grad_norm": 18.375, "learning_rate": 4.494871056562349e-05, "loss": 1.1057, "step": 123500 }, { "epoch": 0.11, "grad_norm": 145.0, "learning_rate": 4.494421149243256e-05, "loss": 1.0401, "step": 123600 }, { "epoch": 0.11, "grad_norm": 29.375, "learning_rate": 4.493971241924164e-05, "loss": 1.0329, "step": 123700 }, { "epoch": 0.11, "grad_norm": 42.25, "learning_rate": 4.4935213346050715e-05, "loss": 1.1075, "step": 123800 }, { "epoch": 0.11, "grad_norm": 155.0, "learning_rate": 4.493071427285979e-05, "loss": 1.1021, "step": 123900 }, { "epoch": 0.11, "grad_norm": 12.875, "learning_rate": 4.492621519966887e-05, "loss": 1.048, "step": 124000 }, { "epoch": 0.11, "grad_norm": 1.6953125, "learning_rate": 4.492171612647795e-05, "loss": 0.9331, "step": 124100 }, { "epoch": 0.11, "grad_norm": 12.625, "learning_rate": 4.491721705328702e-05, "loss": 1.1235, "step": 124200 }, { "epoch": 0.11, "grad_norm": 47.5, "learning_rate": 4.4912717980096105e-05, "loss": 1.2219, "step": 124300 }, { "epoch": 0.11, "grad_norm": 69.5, "learning_rate": 4.490821890690518e-05, "loss": 1.0295, "step": 124400 }, { "epoch": 0.11, "grad_norm": 38.25, "learning_rate": 4.4903719833714256e-05, "loss": 1.2938, "step": 124500 }, { "epoch": 0.11, "grad_norm": 35.5, "learning_rate": 4.489922076052334e-05, "loss": 1.0069, "step": 124600 }, { "epoch": 0.11, "grad_norm": 38.0, "learning_rate": 4.489472168733241e-05, "loss": 1.0296, "step": 124700 }, { "epoch": 0.11, "grad_norm": 72.5, "learning_rate": 4.489022261414149e-05, "loss": 0.9872, "step": 124800 }, { "epoch": 0.11, "grad_norm": 4.09375, "learning_rate": 4.4885723540950564e-05, "loss": 1.1223, "step": 124900 }, { "epoch": 0.11, "grad_norm": 51.75, "learning_rate": 4.488122446775964e-05, "loss": 1.046, "step": 125000 }, { "epoch": 0.11, "grad_norm": 29.875, "learning_rate": 4.487672539456872e-05, "loss": 1.1126, "step": 125100 }, { "epoch": 0.11, "grad_norm": 19.0, "learning_rate": 4.48722263213778e-05, "loss": 1.1329, "step": 125200 }, { "epoch": 0.11, "grad_norm": 11.125, "learning_rate": 4.486772724818687e-05, "loss": 1.0184, "step": 125300 }, { "epoch": 0.11, "grad_norm": 41.5, "learning_rate": 4.4863228174995954e-05, "loss": 1.0952, "step": 125400 }, { "epoch": 0.11, "grad_norm": 0.009521484375, "learning_rate": 4.485872910180503e-05, "loss": 0.9959, "step": 125500 }, { "epoch": 0.11, "grad_norm": 39.25, "learning_rate": 4.4854230028614105e-05, "loss": 1.0238, "step": 125600 }, { "epoch": 0.11, "grad_norm": 40.25, "learning_rate": 4.484973095542319e-05, "loss": 0.8617, "step": 125700 }, { "epoch": 0.11, "grad_norm": 56.25, "learning_rate": 4.484523188223226e-05, "loss": 1.1847, "step": 125800 }, { "epoch": 0.11, "grad_norm": 120.0, "learning_rate": 4.4840732809041344e-05, "loss": 0.9826, "step": 125900 }, { "epoch": 0.11, "grad_norm": 40.0, "learning_rate": 4.483623373585042e-05, "loss": 1.1456, "step": 126000 }, { "epoch": 0.11, "grad_norm": 0.005889892578125, "learning_rate": 4.4831734662659495e-05, "loss": 1.136, "step": 126100 }, { "epoch": 0.11, "grad_norm": 0.024658203125, "learning_rate": 4.482723558946857e-05, "loss": 1.0936, "step": 126200 }, { "epoch": 0.11, "grad_norm": 0.036376953125, "learning_rate": 4.4822736516277646e-05, "loss": 0.9123, "step": 126300 }, { "epoch": 0.11, "grad_norm": 26.75, "learning_rate": 4.481823744308673e-05, "loss": 0.9552, "step": 126400 }, { "epoch": 0.11, "grad_norm": 34.25, "learning_rate": 4.48137383698958e-05, "loss": 1.0772, "step": 126500 }, { "epoch": 0.11, "grad_norm": 249.0, "learning_rate": 4.480923929670488e-05, "loss": 1.0825, "step": 126600 }, { "epoch": 0.11, "grad_norm": 44.0, "learning_rate": 4.480474022351396e-05, "loss": 0.9959, "step": 126700 }, { "epoch": 0.11, "grad_norm": 49.0, "learning_rate": 4.4800241150323036e-05, "loss": 0.9977, "step": 126800 }, { "epoch": 0.11, "grad_norm": 8.375, "learning_rate": 4.479574207713211e-05, "loss": 1.0124, "step": 126900 }, { "epoch": 0.11, "grad_norm": 17.375, "learning_rate": 4.4791243003941193e-05, "loss": 1.1152, "step": 127000 }, { "epoch": 0.11, "grad_norm": 0.0201416015625, "learning_rate": 4.478674393075027e-05, "loss": 1.1607, "step": 127100 }, { "epoch": 0.11, "grad_norm": 39.0, "learning_rate": 4.4782244857559344e-05, "loss": 0.9655, "step": 127200 }, { "epoch": 0.11, "grad_norm": 34.25, "learning_rate": 4.4777745784368426e-05, "loss": 0.9306, "step": 127300 }, { "epoch": 0.11, "grad_norm": 20.25, "learning_rate": 4.47732467111775e-05, "loss": 1.1897, "step": 127400 }, { "epoch": 0.11, "grad_norm": 29.25, "learning_rate": 4.476874763798658e-05, "loss": 1.022, "step": 127500 }, { "epoch": 0.11, "grad_norm": 23.5, "learning_rate": 4.476424856479565e-05, "loss": 1.1027, "step": 127600 }, { "epoch": 0.11, "grad_norm": 14.125, "learning_rate": 4.475974949160473e-05, "loss": 0.9885, "step": 127700 }, { "epoch": 0.11, "grad_norm": 24.625, "learning_rate": 4.475525041841381e-05, "loss": 0.9968, "step": 127800 }, { "epoch": 0.11, "grad_norm": 0.77734375, "learning_rate": 4.4750751345222885e-05, "loss": 0.9143, "step": 127900 }, { "epoch": 0.11, "grad_norm": 36.0, "learning_rate": 4.474625227203196e-05, "loss": 0.9545, "step": 128000 }, { "epoch": 0.11, "grad_norm": 16.875, "learning_rate": 4.474175319884104e-05, "loss": 0.9204, "step": 128100 }, { "epoch": 0.11, "grad_norm": 19.5, "learning_rate": 4.473725412565012e-05, "loss": 1.2275, "step": 128200 }, { "epoch": 0.11, "grad_norm": 28.75, "learning_rate": 4.473275505245919e-05, "loss": 0.8557, "step": 128300 }, { "epoch": 0.11, "grad_norm": 23.625, "learning_rate": 4.4728255979268275e-05, "loss": 1.0364, "step": 128400 }, { "epoch": 0.11, "grad_norm": 80.5, "learning_rate": 4.472375690607735e-05, "loss": 1.0122, "step": 128500 }, { "epoch": 0.11, "grad_norm": 0.19921875, "learning_rate": 4.471925783288643e-05, "loss": 1.0095, "step": 128600 }, { "epoch": 0.11, "grad_norm": 93.0, "learning_rate": 4.471475875969551e-05, "loss": 1.1243, "step": 128700 }, { "epoch": 0.11, "grad_norm": 57.75, "learning_rate": 4.471025968650458e-05, "loss": 0.9377, "step": 128800 }, { "epoch": 0.11, "grad_norm": 29.875, "learning_rate": 4.470576061331366e-05, "loss": 1.1993, "step": 128900 }, { "epoch": 0.11, "grad_norm": 30.5, "learning_rate": 4.4701261540122734e-05, "loss": 1.1627, "step": 129000 }, { "epoch": 0.12, "grad_norm": 28.625, "learning_rate": 4.4696762466931816e-05, "loss": 1.0471, "step": 129100 }, { "epoch": 0.12, "grad_norm": 37.75, "learning_rate": 4.469226339374089e-05, "loss": 1.0627, "step": 129200 }, { "epoch": 0.12, "grad_norm": 20.375, "learning_rate": 4.468776432054997e-05, "loss": 1.0888, "step": 129300 }, { "epoch": 0.12, "grad_norm": 88.0, "learning_rate": 4.468326524735905e-05, "loss": 1.1228, "step": 129400 }, { "epoch": 0.12, "grad_norm": 22.875, "learning_rate": 4.4678766174168124e-05, "loss": 0.9295, "step": 129500 }, { "epoch": 0.12, "grad_norm": 35.75, "learning_rate": 4.46742671009772e-05, "loss": 0.9467, "step": 129600 }, { "epoch": 0.12, "grad_norm": 33.5, "learning_rate": 4.466976802778628e-05, "loss": 0.9815, "step": 129700 }, { "epoch": 0.12, "grad_norm": 21.375, "learning_rate": 4.466526895459536e-05, "loss": 1.0445, "step": 129800 }, { "epoch": 0.12, "grad_norm": 50.25, "learning_rate": 4.466076988140443e-05, "loss": 1.0879, "step": 129900 }, { "epoch": 0.12, "grad_norm": 24.0, "learning_rate": 4.4656270808213515e-05, "loss": 1.2399, "step": 130000 }, { "epoch": 0.12, "grad_norm": 34.25, "learning_rate": 4.465177173502258e-05, "loss": 0.9757, "step": 130100 }, { "epoch": 0.12, "grad_norm": 0.251953125, "learning_rate": 4.4647272661831665e-05, "loss": 1.097, "step": 130200 }, { "epoch": 0.12, "grad_norm": 14.0625, "learning_rate": 4.464277358864074e-05, "loss": 1.0073, "step": 130300 }, { "epoch": 0.12, "grad_norm": 113.0, "learning_rate": 4.4638274515449816e-05, "loss": 0.9838, "step": 130400 }, { "epoch": 0.12, "grad_norm": 476.0, "learning_rate": 4.46337754422589e-05, "loss": 1.0932, "step": 130500 }, { "epoch": 0.12, "grad_norm": 65.5, "learning_rate": 4.4629276369067973e-05, "loss": 1.1927, "step": 130600 }, { "epoch": 0.12, "grad_norm": 1.6484375, "learning_rate": 4.462477729587705e-05, "loss": 0.9592, "step": 130700 }, { "epoch": 0.12, "grad_norm": 37.75, "learning_rate": 4.462027822268613e-05, "loss": 1.1063, "step": 130800 }, { "epoch": 0.12, "grad_norm": 53.75, "learning_rate": 4.4615779149495206e-05, "loss": 1.0479, "step": 130900 }, { "epoch": 0.12, "grad_norm": 0.125, "learning_rate": 4.461128007630428e-05, "loss": 0.9726, "step": 131000 }, { "epoch": 0.12, "grad_norm": 67.0, "learning_rate": 4.4606781003113364e-05, "loss": 0.9642, "step": 131100 }, { "epoch": 0.12, "grad_norm": 0.404296875, "learning_rate": 4.460228192992244e-05, "loss": 0.9941, "step": 131200 }, { "epoch": 0.12, "grad_norm": 46.0, "learning_rate": 4.459778285673152e-05, "loss": 1.0464, "step": 131300 }, { "epoch": 0.12, "grad_norm": 14.5625, "learning_rate": 4.459328378354059e-05, "loss": 0.8999, "step": 131400 }, { "epoch": 0.12, "grad_norm": 28.875, "learning_rate": 4.4588784710349665e-05, "loss": 1.0474, "step": 131500 }, { "epoch": 0.12, "grad_norm": 13.625, "learning_rate": 4.458428563715875e-05, "loss": 0.9166, "step": 131600 }, { "epoch": 0.12, "grad_norm": 94.5, "learning_rate": 4.457978656396782e-05, "loss": 1.0516, "step": 131700 }, { "epoch": 0.12, "grad_norm": 35.0, "learning_rate": 4.4575287490776905e-05, "loss": 1.0711, "step": 131800 }, { "epoch": 0.12, "grad_norm": 9.0625, "learning_rate": 4.457078841758598e-05, "loss": 0.9893, "step": 131900 }, { "epoch": 0.12, "grad_norm": 39.5, "learning_rate": 4.4566289344395055e-05, "loss": 1.043, "step": 132000 }, { "epoch": 0.12, "grad_norm": 0.00970458984375, "learning_rate": 4.456179027120414e-05, "loss": 1.0306, "step": 132100 }, { "epoch": 0.12, "grad_norm": 8.8125, "learning_rate": 4.455729119801321e-05, "loss": 1.0423, "step": 132200 }, { "epoch": 0.12, "grad_norm": 30.75, "learning_rate": 4.455279212482229e-05, "loss": 1.0196, "step": 132300 }, { "epoch": 0.12, "grad_norm": 11.3125, "learning_rate": 4.454829305163137e-05, "loss": 0.9924, "step": 132400 }, { "epoch": 0.12, "grad_norm": 16.25, "learning_rate": 4.4543793978440446e-05, "loss": 0.9755, "step": 132500 }, { "epoch": 0.12, "grad_norm": 35.5, "learning_rate": 4.453929490524952e-05, "loss": 1.0687, "step": 132600 }, { "epoch": 0.12, "grad_norm": 41.25, "learning_rate": 4.4534795832058596e-05, "loss": 1.056, "step": 132700 }, { "epoch": 0.12, "grad_norm": 0.028076171875, "learning_rate": 4.453029675886767e-05, "loss": 1.0784, "step": 132800 }, { "epoch": 0.12, "grad_norm": 16.375, "learning_rate": 4.4525797685676754e-05, "loss": 1.1376, "step": 132900 }, { "epoch": 0.12, "grad_norm": 81.5, "learning_rate": 4.452129861248583e-05, "loss": 1.0206, "step": 133000 }, { "epoch": 0.12, "grad_norm": 36.0, "learning_rate": 4.4516799539294904e-05, "loss": 0.9588, "step": 133100 }, { "epoch": 0.12, "grad_norm": 0.026123046875, "learning_rate": 4.4512300466103986e-05, "loss": 1.0697, "step": 133200 }, { "epoch": 0.12, "grad_norm": 80.0, "learning_rate": 4.450780139291306e-05, "loss": 1.0369, "step": 133300 }, { "epoch": 0.12, "grad_norm": 24.125, "learning_rate": 4.450330231972214e-05, "loss": 1.0461, "step": 133400 }, { "epoch": 0.12, "grad_norm": 15.6875, "learning_rate": 4.449880324653122e-05, "loss": 1.0091, "step": 133500 }, { "epoch": 0.12, "grad_norm": 38.25, "learning_rate": 4.4494304173340295e-05, "loss": 1.0225, "step": 133600 }, { "epoch": 0.12, "grad_norm": 70.5, "learning_rate": 4.448980510014937e-05, "loss": 1.0442, "step": 133700 }, { "epoch": 0.12, "grad_norm": 1.578125, "learning_rate": 4.448530602695845e-05, "loss": 1.0823, "step": 133800 }, { "epoch": 0.12, "grad_norm": 42.75, "learning_rate": 4.448080695376753e-05, "loss": 1.0175, "step": 133900 }, { "epoch": 0.12, "grad_norm": 0.032470703125, "learning_rate": 4.44763078805766e-05, "loss": 1.0915, "step": 134000 }, { "epoch": 0.12, "grad_norm": 1.3671875, "learning_rate": 4.447180880738568e-05, "loss": 0.8835, "step": 134100 }, { "epoch": 0.12, "grad_norm": 22.375, "learning_rate": 4.4467309734194753e-05, "loss": 0.9454, "step": 134200 }, { "epoch": 0.12, "grad_norm": 11.3125, "learning_rate": 4.4462810661003836e-05, "loss": 1.2124, "step": 134300 }, { "epoch": 0.12, "grad_norm": 15.25, "learning_rate": 4.445831158781291e-05, "loss": 0.9435, "step": 134400 }, { "epoch": 0.12, "grad_norm": 194.0, "learning_rate": 4.445381251462199e-05, "loss": 1.0023, "step": 134500 }, { "epoch": 0.12, "grad_norm": 13.9375, "learning_rate": 4.444931344143107e-05, "loss": 1.1119, "step": 134600 }, { "epoch": 0.12, "grad_norm": 19.125, "learning_rate": 4.4444814368240144e-05, "loss": 1.042, "step": 134700 }, { "epoch": 0.12, "grad_norm": 17.25, "learning_rate": 4.4440315295049226e-05, "loss": 0.9875, "step": 134800 }, { "epoch": 0.12, "grad_norm": 0.13671875, "learning_rate": 4.44358162218583e-05, "loss": 1.0372, "step": 134900 }, { "epoch": 0.12, "grad_norm": 37.5, "learning_rate": 4.4431317148667376e-05, "loss": 0.8741, "step": 135000 }, { "epoch": 0.12, "grad_norm": 21.375, "learning_rate": 4.442681807547646e-05, "loss": 0.9989, "step": 135100 }, { "epoch": 0.12, "grad_norm": 62.0, "learning_rate": 4.4422319002285534e-05, "loss": 1.0272, "step": 135200 }, { "epoch": 0.12, "grad_norm": 30.625, "learning_rate": 4.441781992909461e-05, "loss": 0.9341, "step": 135300 }, { "epoch": 0.12, "grad_norm": 32.25, "learning_rate": 4.4413320855903685e-05, "loss": 0.9291, "step": 135400 }, { "epoch": 0.12, "grad_norm": 23.25, "learning_rate": 4.440882178271276e-05, "loss": 0.9806, "step": 135500 }, { "epoch": 0.12, "grad_norm": 27.875, "learning_rate": 4.440432270952184e-05, "loss": 1.0683, "step": 135600 }, { "epoch": 0.12, "grad_norm": 0.0166015625, "learning_rate": 4.439982363633092e-05, "loss": 0.9878, "step": 135700 }, { "epoch": 0.12, "grad_norm": 30.25, "learning_rate": 4.439532456313999e-05, "loss": 1.116, "step": 135800 }, { "epoch": 0.12, "grad_norm": 18.125, "learning_rate": 4.4390825489949075e-05, "loss": 1.0541, "step": 135900 }, { "epoch": 0.12, "grad_norm": 18.0, "learning_rate": 4.438632641675815e-05, "loss": 1.0558, "step": 136000 }, { "epoch": 0.12, "grad_norm": 49.25, "learning_rate": 4.4381827343567226e-05, "loss": 0.8569, "step": 136100 }, { "epoch": 0.12, "grad_norm": 20.25, "learning_rate": 4.437732827037631e-05, "loss": 0.9822, "step": 136200 }, { "epoch": 0.12, "grad_norm": 19.875, "learning_rate": 4.437282919718538e-05, "loss": 1.1, "step": 136300 }, { "epoch": 0.12, "grad_norm": 13.3125, "learning_rate": 4.436833012399446e-05, "loss": 0.9849, "step": 136400 }, { "epoch": 0.12, "grad_norm": 43.0, "learning_rate": 4.436383105080354e-05, "loss": 0.9913, "step": 136500 }, { "epoch": 0.12, "grad_norm": 18.0, "learning_rate": 4.435933197761261e-05, "loss": 0.9118, "step": 136600 }, { "epoch": 0.12, "grad_norm": 62.0, "learning_rate": 4.435483290442169e-05, "loss": 1.1478, "step": 136700 }, { "epoch": 0.12, "grad_norm": 52.25, "learning_rate": 4.4350333831230766e-05, "loss": 1.0215, "step": 136800 }, { "epoch": 0.12, "grad_norm": 33.5, "learning_rate": 4.434583475803984e-05, "loss": 1.0647, "step": 136900 }, { "epoch": 0.12, "grad_norm": 29.0, "learning_rate": 4.4341335684848924e-05, "loss": 0.9871, "step": 137000 }, { "epoch": 0.12, "grad_norm": 0.000946044921875, "learning_rate": 4.4336836611658e-05, "loss": 1.0437, "step": 137100 }, { "epoch": 0.12, "grad_norm": 62.25, "learning_rate": 4.433233753846708e-05, "loss": 1.1933, "step": 137200 }, { "epoch": 0.12, "grad_norm": 36.5, "learning_rate": 4.432783846527616e-05, "loss": 1.081, "step": 137300 }, { "epoch": 0.12, "grad_norm": 49.5, "learning_rate": 4.432333939208523e-05, "loss": 1.0511, "step": 137400 }, { "epoch": 0.12, "grad_norm": 37.5, "learning_rate": 4.4318840318894314e-05, "loss": 1.0126, "step": 137500 }, { "epoch": 0.12, "grad_norm": 29.5, "learning_rate": 4.431434124570339e-05, "loss": 0.9972, "step": 137600 }, { "epoch": 0.12, "grad_norm": 32.25, "learning_rate": 4.4309842172512465e-05, "loss": 0.9747, "step": 137700 }, { "epoch": 0.12, "grad_norm": 100.0, "learning_rate": 4.430534309932155e-05, "loss": 0.9739, "step": 137800 }, { "epoch": 0.12, "grad_norm": 15.5625, "learning_rate": 4.4300844026130616e-05, "loss": 0.9225, "step": 137900 }, { "epoch": 0.12, "grad_norm": 43.75, "learning_rate": 4.42963449529397e-05, "loss": 1.0158, "step": 138000 }, { "epoch": 0.12, "grad_norm": 17.375, "learning_rate": 4.429184587974877e-05, "loss": 0.9643, "step": 138100 }, { "epoch": 0.12, "grad_norm": 53.75, "learning_rate": 4.428734680655785e-05, "loss": 1.0333, "step": 138200 }, { "epoch": 0.12, "grad_norm": 916.0, "learning_rate": 4.428284773336693e-05, "loss": 0.8435, "step": 138300 }, { "epoch": 0.12, "grad_norm": 12.9375, "learning_rate": 4.4278348660176006e-05, "loss": 1.1194, "step": 138400 }, { "epoch": 0.12, "grad_norm": 10.625, "learning_rate": 4.427384958698508e-05, "loss": 1.0688, "step": 138500 }, { "epoch": 0.12, "grad_norm": 101.5, "learning_rate": 4.426935051379416e-05, "loss": 1.0035, "step": 138600 }, { "epoch": 0.12, "grad_norm": 34.75, "learning_rate": 4.426485144060324e-05, "loss": 1.2824, "step": 138700 }, { "epoch": 0.12, "grad_norm": 61.5, "learning_rate": 4.4260352367412314e-05, "loss": 1.0244, "step": 138800 }, { "epoch": 0.12, "grad_norm": 3.5, "learning_rate": 4.4255853294221396e-05, "loss": 1.0817, "step": 138900 }, { "epoch": 0.12, "grad_norm": 31.25, "learning_rate": 4.425135422103047e-05, "loss": 0.9969, "step": 139000 }, { "epoch": 0.12, "grad_norm": 32.5, "learning_rate": 4.424685514783955e-05, "loss": 0.9774, "step": 139100 }, { "epoch": 0.12, "grad_norm": 36.75, "learning_rate": 4.424235607464862e-05, "loss": 0.8757, "step": 139200 }, { "epoch": 0.12, "grad_norm": 14.375, "learning_rate": 4.42378570014577e-05, "loss": 0.926, "step": 139300 }, { "epoch": 0.12, "grad_norm": 109.5, "learning_rate": 4.423335792826678e-05, "loss": 0.9832, "step": 139400 }, { "epoch": 0.12, "grad_norm": 3.828125, "learning_rate": 4.4228858855075855e-05, "loss": 0.91, "step": 139500 }, { "epoch": 0.12, "grad_norm": 26.625, "learning_rate": 4.422435978188493e-05, "loss": 0.9774, "step": 139600 }, { "epoch": 0.12, "grad_norm": 0.0057373046875, "learning_rate": 4.421986070869401e-05, "loss": 0.9223, "step": 139700 }, { "epoch": 0.12, "grad_norm": 95.5, "learning_rate": 4.421536163550309e-05, "loss": 0.9779, "step": 139800 }, { "epoch": 0.12, "grad_norm": 0.010009765625, "learning_rate": 4.421086256231216e-05, "loss": 1.0977, "step": 139900 }, { "epoch": 0.12, "grad_norm": 0.2373046875, "learning_rate": 4.4206363489121245e-05, "loss": 1.0138, "step": 140000 }, { "epoch": 0.12, "grad_norm": 47.5, "learning_rate": 4.420186441593032e-05, "loss": 1.1582, "step": 140100 }, { "epoch": 0.12, "grad_norm": 24.25, "learning_rate": 4.41973653427394e-05, "loss": 0.9933, "step": 140200 }, { "epoch": 0.12, "grad_norm": 46.5, "learning_rate": 4.419286626954848e-05, "loss": 1.0703, "step": 140300 }, { "epoch": 0.13, "grad_norm": 29.5, "learning_rate": 4.418836719635755e-05, "loss": 0.984, "step": 140400 }, { "epoch": 0.13, "grad_norm": 63.0, "learning_rate": 4.418386812316663e-05, "loss": 1.1195, "step": 140500 }, { "epoch": 0.13, "grad_norm": 29.0, "learning_rate": 4.4179369049975704e-05, "loss": 1.0109, "step": 140600 }, { "epoch": 0.13, "grad_norm": 12.4375, "learning_rate": 4.4174869976784786e-05, "loss": 1.2296, "step": 140700 }, { "epoch": 0.13, "grad_norm": 35.5, "learning_rate": 4.417037090359386e-05, "loss": 0.7742, "step": 140800 }, { "epoch": 0.13, "grad_norm": 55.75, "learning_rate": 4.416587183040294e-05, "loss": 0.9351, "step": 140900 }, { "epoch": 0.13, "grad_norm": 50.75, "learning_rate": 4.416137275721202e-05, "loss": 0.9272, "step": 141000 }, { "epoch": 0.13, "grad_norm": 32.0, "learning_rate": 4.4156873684021094e-05, "loss": 1.0042, "step": 141100 }, { "epoch": 0.13, "grad_norm": 15.375, "learning_rate": 4.415237461083017e-05, "loss": 1.0413, "step": 141200 }, { "epoch": 0.13, "grad_norm": 12.0, "learning_rate": 4.414787553763925e-05, "loss": 1.0101, "step": 141300 }, { "epoch": 0.13, "grad_norm": 19.5, "learning_rate": 4.414337646444833e-05, "loss": 1.1085, "step": 141400 }, { "epoch": 0.13, "grad_norm": 16.75, "learning_rate": 4.41388773912574e-05, "loss": 0.9453, "step": 141500 }, { "epoch": 0.13, "grad_norm": 117.5, "learning_rate": 4.4134378318066484e-05, "loss": 1.013, "step": 141600 }, { "epoch": 0.13, "grad_norm": 11.5, "learning_rate": 4.412987924487556e-05, "loss": 1.1468, "step": 141700 }, { "epoch": 0.13, "grad_norm": 38.75, "learning_rate": 4.4125380171684635e-05, "loss": 0.9927, "step": 141800 }, { "epoch": 0.13, "grad_norm": 4.8125, "learning_rate": 4.412088109849371e-05, "loss": 0.9451, "step": 141900 }, { "epoch": 0.13, "grad_norm": 5.6875, "learning_rate": 4.4116382025302786e-05, "loss": 0.9307, "step": 142000 }, { "epoch": 0.13, "grad_norm": 15.0625, "learning_rate": 4.411188295211187e-05, "loss": 1.0743, "step": 142100 }, { "epoch": 0.13, "grad_norm": 72.5, "learning_rate": 4.410738387892094e-05, "loss": 1.1525, "step": 142200 }, { "epoch": 0.13, "grad_norm": 21.75, "learning_rate": 4.410288480573002e-05, "loss": 1.0244, "step": 142300 }, { "epoch": 0.13, "grad_norm": 23.25, "learning_rate": 4.40983857325391e-05, "loss": 1.0243, "step": 142400 }, { "epoch": 0.13, "grad_norm": 56.5, "learning_rate": 4.4093886659348176e-05, "loss": 1.0664, "step": 142500 }, { "epoch": 0.13, "grad_norm": 28.125, "learning_rate": 4.408938758615725e-05, "loss": 0.9965, "step": 142600 }, { "epoch": 0.13, "grad_norm": 17.125, "learning_rate": 4.4084888512966333e-05, "loss": 1.2094, "step": 142700 }, { "epoch": 0.13, "grad_norm": 1.109375, "learning_rate": 4.408038943977541e-05, "loss": 0.9811, "step": 142800 }, { "epoch": 0.13, "grad_norm": 32.75, "learning_rate": 4.407589036658449e-05, "loss": 0.9128, "step": 142900 }, { "epoch": 0.13, "grad_norm": 29.25, "learning_rate": 4.4071391293393566e-05, "loss": 0.9645, "step": 143000 }, { "epoch": 0.13, "grad_norm": 19.875, "learning_rate": 4.4066892220202635e-05, "loss": 0.9836, "step": 143100 }, { "epoch": 0.13, "grad_norm": 21.5, "learning_rate": 4.406239314701172e-05, "loss": 0.9767, "step": 143200 }, { "epoch": 0.13, "grad_norm": 48.5, "learning_rate": 4.405789407382079e-05, "loss": 1.2398, "step": 143300 }, { "epoch": 0.13, "grad_norm": 203.0, "learning_rate": 4.4053395000629874e-05, "loss": 0.9407, "step": 143400 }, { "epoch": 0.13, "grad_norm": 0.130859375, "learning_rate": 4.404889592743895e-05, "loss": 1.0356, "step": 143500 }, { "epoch": 0.13, "grad_norm": 48.25, "learning_rate": 4.4044396854248025e-05, "loss": 0.7877, "step": 143600 }, { "epoch": 0.13, "grad_norm": 29.625, "learning_rate": 4.403989778105711e-05, "loss": 1.0302, "step": 143700 }, { "epoch": 0.13, "grad_norm": 23.125, "learning_rate": 4.403539870786618e-05, "loss": 1.1916, "step": 143800 }, { "epoch": 0.13, "grad_norm": 46.5, "learning_rate": 4.403089963467526e-05, "loss": 0.9749, "step": 143900 }, { "epoch": 0.13, "grad_norm": 40.25, "learning_rate": 4.402640056148434e-05, "loss": 1.0308, "step": 144000 }, { "epoch": 0.13, "grad_norm": 3.109375, "learning_rate": 4.4021901488293415e-05, "loss": 0.9644, "step": 144100 }, { "epoch": 0.13, "grad_norm": 29.875, "learning_rate": 4.401740241510249e-05, "loss": 1.2236, "step": 144200 }, { "epoch": 0.13, "grad_norm": 0.189453125, "learning_rate": 4.401290334191157e-05, "loss": 0.9333, "step": 144300 }, { "epoch": 0.13, "grad_norm": 80.5, "learning_rate": 4.400840426872064e-05, "loss": 1.1109, "step": 144400 }, { "epoch": 0.13, "grad_norm": 31.5, "learning_rate": 4.4003905195529723e-05, "loss": 0.9701, "step": 144500 }, { "epoch": 0.13, "grad_norm": 0.1181640625, "learning_rate": 4.39994061223388e-05, "loss": 1.0283, "step": 144600 }, { "epoch": 0.13, "grad_norm": 8.5625, "learning_rate": 4.3994907049147874e-05, "loss": 1.0156, "step": 144700 }, { "epoch": 0.13, "grad_norm": 41.25, "learning_rate": 4.3990407975956956e-05, "loss": 1.0755, "step": 144800 }, { "epoch": 0.13, "grad_norm": 0.052490234375, "learning_rate": 4.398590890276603e-05, "loss": 0.9679, "step": 144900 }, { "epoch": 0.13, "grad_norm": 38.25, "learning_rate": 4.398140982957511e-05, "loss": 0.9356, "step": 145000 }, { "epoch": 0.13, "grad_norm": 10.9375, "learning_rate": 4.397691075638419e-05, "loss": 0.9515, "step": 145100 }, { "epoch": 0.13, "grad_norm": 14.4375, "learning_rate": 4.3972411683193264e-05, "loss": 1.0842, "step": 145200 }, { "epoch": 0.13, "grad_norm": 20.75, "learning_rate": 4.396791261000234e-05, "loss": 1.0994, "step": 145300 }, { "epoch": 0.13, "grad_norm": 11.25, "learning_rate": 4.396341353681142e-05, "loss": 0.987, "step": 145400 }, { "epoch": 0.13, "grad_norm": 45.0, "learning_rate": 4.39589144636205e-05, "loss": 1.0725, "step": 145500 }, { "epoch": 0.13, "grad_norm": 14.3125, "learning_rate": 4.395441539042958e-05, "loss": 1.1202, "step": 145600 }, { "epoch": 0.13, "grad_norm": 33.0, "learning_rate": 4.394991631723865e-05, "loss": 1.137, "step": 145700 }, { "epoch": 0.13, "grad_norm": 33.75, "learning_rate": 4.394541724404772e-05, "loss": 1.2172, "step": 145800 }, { "epoch": 0.13, "grad_norm": 19.25, "learning_rate": 4.3940918170856805e-05, "loss": 0.9738, "step": 145900 }, { "epoch": 0.13, "grad_norm": 13.6875, "learning_rate": 4.393641909766588e-05, "loss": 0.9192, "step": 146000 }, { "epoch": 0.13, "grad_norm": 55.75, "learning_rate": 4.393192002447496e-05, "loss": 0.8443, "step": 146100 }, { "epoch": 0.13, "grad_norm": 23.5, "learning_rate": 4.392742095128404e-05, "loss": 0.9508, "step": 146200 }, { "epoch": 0.13, "grad_norm": 12.0625, "learning_rate": 4.3922921878093113e-05, "loss": 0.9376, "step": 146300 }, { "epoch": 0.13, "grad_norm": 0.0106201171875, "learning_rate": 4.3918422804902196e-05, "loss": 0.9247, "step": 146400 }, { "epoch": 0.13, "grad_norm": 50.75, "learning_rate": 4.391392373171127e-05, "loss": 1.0626, "step": 146500 }, { "epoch": 0.13, "grad_norm": 10.0625, "learning_rate": 4.3909424658520346e-05, "loss": 1.0409, "step": 146600 }, { "epoch": 0.13, "grad_norm": 0.07861328125, "learning_rate": 4.390492558532943e-05, "loss": 0.898, "step": 146700 }, { "epoch": 0.13, "grad_norm": 71.5, "learning_rate": 4.3900426512138504e-05, "loss": 1.086, "step": 146800 }, { "epoch": 0.13, "grad_norm": 0.04833984375, "learning_rate": 4.389592743894758e-05, "loss": 1.2122, "step": 146900 }, { "epoch": 0.13, "grad_norm": 30.5, "learning_rate": 4.3891428365756654e-05, "loss": 1.2413, "step": 147000 }, { "epoch": 0.13, "grad_norm": 48.75, "learning_rate": 4.388692929256573e-05, "loss": 1.0457, "step": 147100 }, { "epoch": 0.13, "grad_norm": 0.01025390625, "learning_rate": 4.388243021937481e-05, "loss": 1.0146, "step": 147200 }, { "epoch": 0.13, "grad_norm": 48.5, "learning_rate": 4.387793114618389e-05, "loss": 1.1213, "step": 147300 }, { "epoch": 0.13, "grad_norm": 20.375, "learning_rate": 4.387343207299296e-05, "loss": 0.7852, "step": 147400 }, { "epoch": 0.13, "grad_norm": 33.75, "learning_rate": 4.3868932999802045e-05, "loss": 1.0076, "step": 147500 }, { "epoch": 0.13, "grad_norm": 18.875, "learning_rate": 4.386443392661112e-05, "loss": 1.0222, "step": 147600 }, { "epoch": 0.13, "grad_norm": 15.1875, "learning_rate": 4.3859934853420195e-05, "loss": 1.0606, "step": 147700 }, { "epoch": 0.13, "grad_norm": 1.3828125, "learning_rate": 4.385543578022928e-05, "loss": 1.0254, "step": 147800 }, { "epoch": 0.13, "grad_norm": 31.75, "learning_rate": 4.385093670703835e-05, "loss": 1.0288, "step": 147900 }, { "epoch": 0.13, "grad_norm": 1904.0, "learning_rate": 4.384643763384743e-05, "loss": 1.0253, "step": 148000 }, { "epoch": 0.13, "grad_norm": 47.5, "learning_rate": 4.384193856065651e-05, "loss": 0.9882, "step": 148100 }, { "epoch": 0.13, "grad_norm": 72.5, "learning_rate": 4.3837439487465586e-05, "loss": 1.0051, "step": 148200 }, { "epoch": 0.13, "grad_norm": 184.0, "learning_rate": 4.383294041427466e-05, "loss": 1.2004, "step": 148300 }, { "epoch": 0.13, "grad_norm": 33.75, "learning_rate": 4.3828441341083736e-05, "loss": 1.1212, "step": 148400 }, { "epoch": 0.13, "grad_norm": 340.0, "learning_rate": 4.382394226789281e-05, "loss": 1.0302, "step": 148500 }, { "epoch": 0.13, "grad_norm": 106.5, "learning_rate": 4.3819443194701894e-05, "loss": 0.9579, "step": 148600 }, { "epoch": 0.13, "grad_norm": 57.0, "learning_rate": 4.381494412151097e-05, "loss": 0.9339, "step": 148700 }, { "epoch": 0.13, "grad_norm": 64.0, "learning_rate": 4.381044504832005e-05, "loss": 1.0883, "step": 148800 }, { "epoch": 0.13, "grad_norm": 47.75, "learning_rate": 4.3805945975129127e-05, "loss": 0.9696, "step": 148900 }, { "epoch": 0.13, "grad_norm": 6.75, "learning_rate": 4.38014469019382e-05, "loss": 1.0203, "step": 149000 }, { "epoch": 0.13, "grad_norm": 78.0, "learning_rate": 4.3796947828747284e-05, "loss": 1.0517, "step": 149100 }, { "epoch": 0.13, "grad_norm": 24.5, "learning_rate": 4.379244875555636e-05, "loss": 0.8561, "step": 149200 }, { "epoch": 0.13, "grad_norm": 87.0, "learning_rate": 4.3787949682365435e-05, "loss": 1.0067, "step": 149300 }, { "epoch": 0.13, "grad_norm": 26.375, "learning_rate": 4.378345060917452e-05, "loss": 1.0164, "step": 149400 }, { "epoch": 0.13, "grad_norm": 106.0, "learning_rate": 4.377895153598359e-05, "loss": 0.9985, "step": 149500 }, { "epoch": 0.13, "grad_norm": 21.25, "learning_rate": 4.377445246279267e-05, "loss": 1.0696, "step": 149600 }, { "epoch": 0.13, "grad_norm": 12.875, "learning_rate": 4.376995338960174e-05, "loss": 0.9058, "step": 149700 }, { "epoch": 0.13, "grad_norm": 34.25, "learning_rate": 4.376545431641082e-05, "loss": 0.9745, "step": 149800 }, { "epoch": 0.13, "grad_norm": 21.625, "learning_rate": 4.37609552432199e-05, "loss": 1.0585, "step": 149900 }, { "epoch": 0.13, "grad_norm": 47.5, "learning_rate": 4.3756456170028976e-05, "loss": 1.0539, "step": 150000 }, { "epoch": 0.13, "grad_norm": 26.25, "learning_rate": 4.375195709683805e-05, "loss": 1.1171, "step": 150100 }, { "epoch": 0.13, "grad_norm": 0.5703125, "learning_rate": 4.374745802364713e-05, "loss": 1.1021, "step": 150200 }, { "epoch": 0.13, "grad_norm": 30.625, "learning_rate": 4.374295895045621e-05, "loss": 1.101, "step": 150300 }, { "epoch": 0.13, "grad_norm": 53.5, "learning_rate": 4.3738459877265284e-05, "loss": 0.9998, "step": 150400 }, { "epoch": 0.13, "grad_norm": 39.0, "learning_rate": 4.3733960804074366e-05, "loss": 1.1215, "step": 150500 }, { "epoch": 0.13, "grad_norm": 2784.0, "learning_rate": 4.372946173088344e-05, "loss": 1.097, "step": 150600 }, { "epoch": 0.13, "grad_norm": 52.0, "learning_rate": 4.3724962657692517e-05, "loss": 1.0714, "step": 150700 }, { "epoch": 0.13, "grad_norm": 11.625, "learning_rate": 4.37204635845016e-05, "loss": 1.0957, "step": 150800 }, { "epoch": 0.13, "grad_norm": 20.75, "learning_rate": 4.371596451131067e-05, "loss": 1.0594, "step": 150900 }, { "epoch": 0.13, "grad_norm": 66.0, "learning_rate": 4.371146543811975e-05, "loss": 1.0495, "step": 151000 }, { "epoch": 0.13, "grad_norm": 67.0, "learning_rate": 4.3706966364928825e-05, "loss": 1.0036, "step": 151100 }, { "epoch": 0.13, "grad_norm": 31.0, "learning_rate": 4.37024672917379e-05, "loss": 1.0579, "step": 151200 }, { "epoch": 0.13, "grad_norm": 25.75, "learning_rate": 4.369796821854698e-05, "loss": 1.0552, "step": 151300 }, { "epoch": 0.13, "grad_norm": 60.75, "learning_rate": 4.369346914535606e-05, "loss": 1.1418, "step": 151400 }, { "epoch": 0.13, "grad_norm": 0.00098419189453125, "learning_rate": 4.368897007216514e-05, "loss": 1.0137, "step": 151500 }, { "epoch": 0.14, "grad_norm": 128.0, "learning_rate": 4.3684470998974215e-05, "loss": 1.1362, "step": 151600 }, { "epoch": 0.14, "grad_norm": 27.625, "learning_rate": 4.367997192578329e-05, "loss": 0.8731, "step": 151700 }, { "epoch": 0.14, "grad_norm": 15.8125, "learning_rate": 4.367547285259237e-05, "loss": 1.028, "step": 151800 }, { "epoch": 0.14, "grad_norm": 82.5, "learning_rate": 4.367097377940145e-05, "loss": 1.1782, "step": 151900 }, { "epoch": 0.14, "grad_norm": 17.625, "learning_rate": 4.366647470621052e-05, "loss": 0.9741, "step": 152000 }, { "epoch": 0.14, "grad_norm": 24.75, "learning_rate": 4.3661975633019605e-05, "loss": 1.049, "step": 152100 }, { "epoch": 0.14, "grad_norm": 15.8125, "learning_rate": 4.3657476559828674e-05, "loss": 1.0108, "step": 152200 }, { "epoch": 0.14, "grad_norm": 55.0, "learning_rate": 4.3652977486637756e-05, "loss": 1.05, "step": 152300 }, { "epoch": 0.14, "grad_norm": 30.125, "learning_rate": 4.364847841344683e-05, "loss": 1.127, "step": 152400 }, { "epoch": 0.14, "grad_norm": 10.8125, "learning_rate": 4.3643979340255907e-05, "loss": 0.9838, "step": 152500 }, { "epoch": 0.14, "grad_norm": 122.5, "learning_rate": 4.363948026706499e-05, "loss": 0.9758, "step": 152600 }, { "epoch": 0.14, "grad_norm": 26.75, "learning_rate": 4.3634981193874064e-05, "loss": 1.0675, "step": 152700 }, { "epoch": 0.14, "grad_norm": 33.25, "learning_rate": 4.363048212068314e-05, "loss": 1.0756, "step": 152800 }, { "epoch": 0.14, "grad_norm": 0.080078125, "learning_rate": 4.362598304749222e-05, "loss": 1.1334, "step": 152900 }, { "epoch": 0.14, "grad_norm": 160.0, "learning_rate": 4.36214839743013e-05, "loss": 1.0536, "step": 153000 }, { "epoch": 0.14, "grad_norm": 0.58984375, "learning_rate": 4.361698490111037e-05, "loss": 0.9971, "step": 153100 }, { "epoch": 0.14, "grad_norm": 8.25, "learning_rate": 4.3612485827919454e-05, "loss": 0.9975, "step": 153200 }, { "epoch": 0.14, "grad_norm": 56.75, "learning_rate": 4.360798675472853e-05, "loss": 0.9696, "step": 153300 }, { "epoch": 0.14, "grad_norm": 21.125, "learning_rate": 4.3603487681537605e-05, "loss": 1.0212, "step": 153400 }, { "epoch": 0.14, "grad_norm": 8.5, "learning_rate": 4.359898860834668e-05, "loss": 0.9699, "step": 153500 }, { "epoch": 0.14, "grad_norm": 30.25, "learning_rate": 4.3594489535155756e-05, "loss": 0.9782, "step": 153600 }, { "epoch": 0.14, "grad_norm": 340.0, "learning_rate": 4.358999046196484e-05, "loss": 1.0191, "step": 153700 }, { "epoch": 0.14, "grad_norm": 50.75, "learning_rate": 4.358549138877391e-05, "loss": 0.9227, "step": 153800 }, { "epoch": 0.14, "grad_norm": 32.25, "learning_rate": 4.358099231558299e-05, "loss": 1.1029, "step": 153900 }, { "epoch": 0.14, "grad_norm": 0.3046875, "learning_rate": 4.357649324239207e-05, "loss": 1.0705, "step": 154000 }, { "epoch": 0.14, "grad_norm": 45.25, "learning_rate": 4.3571994169201146e-05, "loss": 1.0076, "step": 154100 }, { "epoch": 0.14, "grad_norm": 48.75, "learning_rate": 4.356749509601023e-05, "loss": 1.1447, "step": 154200 }, { "epoch": 0.14, "grad_norm": 32.25, "learning_rate": 4.35629960228193e-05, "loss": 1.1206, "step": 154300 }, { "epoch": 0.14, "grad_norm": 44.75, "learning_rate": 4.355849694962838e-05, "loss": 0.9393, "step": 154400 }, { "epoch": 0.14, "grad_norm": 28.75, "learning_rate": 4.355399787643746e-05, "loss": 1.1693, "step": 154500 }, { "epoch": 0.14, "grad_norm": 14.6875, "learning_rate": 4.3549498803246536e-05, "loss": 0.9107, "step": 154600 }, { "epoch": 0.14, "grad_norm": 79.0, "learning_rate": 4.354499973005561e-05, "loss": 1.1249, "step": 154700 }, { "epoch": 0.14, "grad_norm": 22.625, "learning_rate": 4.354050065686469e-05, "loss": 1.0055, "step": 154800 }, { "epoch": 0.14, "grad_norm": 45.5, "learning_rate": 4.353600158367376e-05, "loss": 1.021, "step": 154900 }, { "epoch": 0.14, "grad_norm": 62.5, "learning_rate": 4.3531502510482844e-05, "loss": 0.862, "step": 155000 }, { "epoch": 0.14, "grad_norm": 74.0, "learning_rate": 4.352700343729192e-05, "loss": 1.0474, "step": 155100 }, { "epoch": 0.14, "grad_norm": 260.0, "learning_rate": 4.3522504364100995e-05, "loss": 1.0278, "step": 155200 }, { "epoch": 0.14, "grad_norm": 120.0, "learning_rate": 4.351800529091008e-05, "loss": 1.0703, "step": 155300 }, { "epoch": 0.14, "grad_norm": 30.25, "learning_rate": 4.351350621771915e-05, "loss": 0.9811, "step": 155400 }, { "epoch": 0.14, "grad_norm": 36.5, "learning_rate": 4.350900714452823e-05, "loss": 1.0293, "step": 155500 }, { "epoch": 0.14, "grad_norm": 35.5, "learning_rate": 4.350450807133731e-05, "loss": 0.9772, "step": 155600 }, { "epoch": 0.14, "grad_norm": 118.0, "learning_rate": 4.3500008998146385e-05, "loss": 1.1033, "step": 155700 }, { "epoch": 0.14, "grad_norm": 0.13671875, "learning_rate": 4.349550992495546e-05, "loss": 1.0257, "step": 155800 }, { "epoch": 0.14, "grad_norm": 34.25, "learning_rate": 4.349101085176454e-05, "loss": 1.0198, "step": 155900 }, { "epoch": 0.14, "grad_norm": 51.0, "learning_rate": 4.348651177857362e-05, "loss": 0.9208, "step": 156000 }, { "epoch": 0.14, "grad_norm": 20.125, "learning_rate": 4.348201270538269e-05, "loss": 1.0556, "step": 156100 }, { "epoch": 0.14, "grad_norm": 30.25, "learning_rate": 4.347751363219177e-05, "loss": 1.076, "step": 156200 }, { "epoch": 0.14, "grad_norm": 11.5625, "learning_rate": 4.3473014559000844e-05, "loss": 1.044, "step": 156300 }, { "epoch": 0.14, "grad_norm": 12.25, "learning_rate": 4.3468515485809926e-05, "loss": 1.0219, "step": 156400 }, { "epoch": 0.14, "grad_norm": 23.625, "learning_rate": 4.3464016412619e-05, "loss": 1.1348, "step": 156500 }, { "epoch": 0.14, "grad_norm": 7.90625, "learning_rate": 4.345951733942808e-05, "loss": 0.789, "step": 156600 }, { "epoch": 0.14, "grad_norm": 43.25, "learning_rate": 4.345501826623716e-05, "loss": 0.9491, "step": 156700 }, { "epoch": 0.14, "grad_norm": 0.0015716552734375, "learning_rate": 4.3450519193046234e-05, "loss": 0.9913, "step": 156800 }, { "epoch": 0.14, "grad_norm": 105.0, "learning_rate": 4.3446020119855316e-05, "loss": 1.0048, "step": 156900 }, { "epoch": 0.14, "grad_norm": 33.0, "learning_rate": 4.344152104666439e-05, "loss": 0.8363, "step": 157000 }, { "epoch": 0.14, "grad_norm": 113.0, "learning_rate": 4.343702197347347e-05, "loss": 0.929, "step": 157100 }, { "epoch": 0.14, "grad_norm": 9.1875, "learning_rate": 4.343252290028255e-05, "loss": 0.9984, "step": 157200 }, { "epoch": 0.14, "grad_norm": 102.5, "learning_rate": 4.3428023827091624e-05, "loss": 1.0548, "step": 157300 }, { "epoch": 0.14, "grad_norm": 80.5, "learning_rate": 4.342352475390069e-05, "loss": 0.9331, "step": 157400 }, { "epoch": 0.14, "grad_norm": 0.023193359375, "learning_rate": 4.3419025680709775e-05, "loss": 0.935, "step": 157500 }, { "epoch": 0.14, "grad_norm": 13.75, "learning_rate": 4.341452660751885e-05, "loss": 0.968, "step": 157600 }, { "epoch": 0.14, "grad_norm": 44.25, "learning_rate": 4.341002753432793e-05, "loss": 1.1513, "step": 157700 }, { "epoch": 0.14, "grad_norm": 81.0, "learning_rate": 4.340552846113701e-05, "loss": 0.9997, "step": 157800 }, { "epoch": 0.14, "grad_norm": 26.625, "learning_rate": 4.340102938794608e-05, "loss": 1.0114, "step": 157900 }, { "epoch": 0.14, "grad_norm": 0.099609375, "learning_rate": 4.3396530314755165e-05, "loss": 0.9502, "step": 158000 }, { "epoch": 0.14, "grad_norm": 10.8125, "learning_rate": 4.339203124156424e-05, "loss": 0.889, "step": 158100 }, { "epoch": 0.14, "grad_norm": 18.25, "learning_rate": 4.3387532168373316e-05, "loss": 1.1494, "step": 158200 }, { "epoch": 0.14, "grad_norm": 0.32421875, "learning_rate": 4.33830330951824e-05, "loss": 0.9385, "step": 158300 }, { "epoch": 0.14, "grad_norm": 28.125, "learning_rate": 4.3378534021991474e-05, "loss": 0.815, "step": 158400 }, { "epoch": 0.14, "grad_norm": 0.08251953125, "learning_rate": 4.337403494880055e-05, "loss": 1.0117, "step": 158500 }, { "epoch": 0.14, "grad_norm": 107.5, "learning_rate": 4.3369535875609624e-05, "loss": 1.0547, "step": 158600 }, { "epoch": 0.14, "grad_norm": 0.06005859375, "learning_rate": 4.33650368024187e-05, "loss": 1.0249, "step": 158700 }, { "epoch": 0.14, "grad_norm": 0.2080078125, "learning_rate": 4.336053772922778e-05, "loss": 0.9179, "step": 158800 }, { "epoch": 0.14, "grad_norm": 0.52734375, "learning_rate": 4.335603865603686e-05, "loss": 1.0161, "step": 158900 }, { "epoch": 0.14, "grad_norm": 0.00946044921875, "learning_rate": 4.335153958284593e-05, "loss": 0.987, "step": 159000 }, { "epoch": 0.14, "grad_norm": 0.1630859375, "learning_rate": 4.3347040509655014e-05, "loss": 1.0456, "step": 159100 }, { "epoch": 0.14, "grad_norm": 0.1044921875, "learning_rate": 4.334254143646409e-05, "loss": 1.0138, "step": 159200 }, { "epoch": 0.14, "grad_norm": 20.5, "learning_rate": 4.3338042363273165e-05, "loss": 1.0523, "step": 159300 }, { "epoch": 0.14, "grad_norm": 25.625, "learning_rate": 4.333354329008225e-05, "loss": 0.9729, "step": 159400 }, { "epoch": 0.14, "grad_norm": 79.0, "learning_rate": 4.332904421689132e-05, "loss": 1.1203, "step": 159500 }, { "epoch": 0.14, "grad_norm": 12.4375, "learning_rate": 4.33245451437004e-05, "loss": 1.063, "step": 159600 }, { "epoch": 0.14, "grad_norm": 109.0, "learning_rate": 4.332004607050948e-05, "loss": 1.0203, "step": 159700 }, { "epoch": 0.14, "grad_norm": 75.5, "learning_rate": 4.3315546997318555e-05, "loss": 1.019, "step": 159800 }, { "epoch": 0.14, "grad_norm": 16.125, "learning_rate": 4.331104792412763e-05, "loss": 0.9972, "step": 159900 }, { "epoch": 0.14, "grad_norm": 0.038330078125, "learning_rate": 4.3306548850936706e-05, "loss": 0.9881, "step": 160000 }, { "epoch": 0.14, "grad_norm": 17.125, "learning_rate": 4.330204977774578e-05, "loss": 0.8157, "step": 160100 }, { "epoch": 0.14, "grad_norm": 55.75, "learning_rate": 4.3297550704554864e-05, "loss": 1.1089, "step": 160200 }, { "epoch": 0.14, "grad_norm": 56.75, "learning_rate": 4.329305163136394e-05, "loss": 0.9508, "step": 160300 }, { "epoch": 0.14, "grad_norm": 22.5, "learning_rate": 4.328855255817302e-05, "loss": 1.0825, "step": 160400 }, { "epoch": 0.14, "grad_norm": 27.875, "learning_rate": 4.3284053484982096e-05, "loss": 0.978, "step": 160500 }, { "epoch": 0.14, "grad_norm": 17.75, "learning_rate": 4.327955441179117e-05, "loss": 0.9958, "step": 160600 }, { "epoch": 0.14, "grad_norm": 97.5, "learning_rate": 4.3275055338600254e-05, "loss": 0.8639, "step": 160700 }, { "epoch": 0.14, "grad_norm": 227.0, "learning_rate": 4.327055626540933e-05, "loss": 1.0966, "step": 160800 }, { "epoch": 0.14, "grad_norm": 9.875, "learning_rate": 4.3266057192218404e-05, "loss": 1.0282, "step": 160900 }, { "epoch": 0.14, "grad_norm": 30.5, "learning_rate": 4.3261558119027487e-05, "loss": 0.9126, "step": 161000 }, { "epoch": 0.14, "grad_norm": 290.0, "learning_rate": 4.325705904583656e-05, "loss": 1.1288, "step": 161100 }, { "epoch": 0.14, "grad_norm": 68.5, "learning_rate": 4.325255997264564e-05, "loss": 1.099, "step": 161200 }, { "epoch": 0.14, "grad_norm": 19.25, "learning_rate": 4.324806089945471e-05, "loss": 1.0288, "step": 161300 }, { "epoch": 0.14, "grad_norm": 47.0, "learning_rate": 4.324356182626379e-05, "loss": 1.0872, "step": 161400 }, { "epoch": 0.14, "grad_norm": 149.0, "learning_rate": 4.323906275307287e-05, "loss": 1.0714, "step": 161500 }, { "epoch": 0.14, "grad_norm": 103.0, "learning_rate": 4.3234563679881945e-05, "loss": 1.1005, "step": 161600 }, { "epoch": 0.14, "grad_norm": 45.0, "learning_rate": 4.323006460669102e-05, "loss": 1.0969, "step": 161700 }, { "epoch": 0.14, "grad_norm": 3.625, "learning_rate": 4.32255655335001e-05, "loss": 0.9946, "step": 161800 }, { "epoch": 0.14, "grad_norm": 66.0, "learning_rate": 4.322106646030918e-05, "loss": 0.9462, "step": 161900 }, { "epoch": 0.14, "grad_norm": 23.25, "learning_rate": 4.3216567387118254e-05, "loss": 0.982, "step": 162000 }, { "epoch": 0.14, "grad_norm": 20.875, "learning_rate": 4.3212068313927336e-05, "loss": 0.9534, "step": 162100 }, { "epoch": 0.14, "grad_norm": 20.75, "learning_rate": 4.320756924073641e-05, "loss": 1.0807, "step": 162200 }, { "epoch": 0.14, "grad_norm": 39.5, "learning_rate": 4.3203070167545486e-05, "loss": 1.0738, "step": 162300 }, { "epoch": 0.14, "grad_norm": 8.5, "learning_rate": 4.319857109435457e-05, "loss": 1.0418, "step": 162400 }, { "epoch": 0.14, "grad_norm": 46.75, "learning_rate": 4.319407202116364e-05, "loss": 1.1008, "step": 162500 }, { "epoch": 0.14, "grad_norm": 18.0, "learning_rate": 4.318957294797272e-05, "loss": 0.9745, "step": 162600 }, { "epoch": 0.14, "grad_norm": 39.5, "learning_rate": 4.3185073874781794e-05, "loss": 1.0785, "step": 162700 }, { "epoch": 0.15, "grad_norm": 14.875, "learning_rate": 4.318057480159087e-05, "loss": 0.9414, "step": 162800 }, { "epoch": 0.15, "grad_norm": 61.5, "learning_rate": 4.317607572839995e-05, "loss": 1.02, "step": 162900 }, { "epoch": 0.15, "grad_norm": 53.0, "learning_rate": 4.317157665520903e-05, "loss": 1.0859, "step": 163000 }, { "epoch": 0.15, "grad_norm": 25.375, "learning_rate": 4.316707758201811e-05, "loss": 1.0585, "step": 163100 }, { "epoch": 0.15, "grad_norm": 23.125, "learning_rate": 4.3162578508827185e-05, "loss": 0.9966, "step": 163200 }, { "epoch": 0.15, "grad_norm": 47.0, "learning_rate": 4.315807943563626e-05, "loss": 0.8642, "step": 163300 }, { "epoch": 0.15, "grad_norm": 0.189453125, "learning_rate": 4.315358036244534e-05, "loss": 0.9856, "step": 163400 }, { "epoch": 0.15, "grad_norm": 63.25, "learning_rate": 4.314908128925442e-05, "loss": 0.9111, "step": 163500 }, { "epoch": 0.15, "grad_norm": 13.8125, "learning_rate": 4.314458221606349e-05, "loss": 1.0371, "step": 163600 }, { "epoch": 0.15, "grad_norm": 14.875, "learning_rate": 4.3140083142872575e-05, "loss": 1.1399, "step": 163700 }, { "epoch": 0.15, "grad_norm": 5.96875, "learning_rate": 4.3135584069681644e-05, "loss": 0.8889, "step": 163800 }, { "epoch": 0.15, "grad_norm": 22.875, "learning_rate": 4.3131084996490726e-05, "loss": 1.0474, "step": 163900 }, { "epoch": 0.15, "grad_norm": 47.25, "learning_rate": 4.31265859232998e-05, "loss": 0.9557, "step": 164000 }, { "epoch": 0.15, "grad_norm": 43.75, "learning_rate": 4.3122086850108876e-05, "loss": 1.0484, "step": 164100 }, { "epoch": 0.15, "grad_norm": 65.0, "learning_rate": 4.311758777691796e-05, "loss": 1.22, "step": 164200 }, { "epoch": 0.15, "grad_norm": 44.25, "learning_rate": 4.3113088703727034e-05, "loss": 0.9491, "step": 164300 }, { "epoch": 0.15, "grad_norm": 10.5, "learning_rate": 4.310858963053611e-05, "loss": 0.9652, "step": 164400 }, { "epoch": 0.15, "grad_norm": 10.0, "learning_rate": 4.310409055734519e-05, "loss": 1.0211, "step": 164500 }, { "epoch": 0.15, "grad_norm": 58.75, "learning_rate": 4.3099591484154267e-05, "loss": 0.9964, "step": 164600 }, { "epoch": 0.15, "grad_norm": 59.5, "learning_rate": 4.309509241096334e-05, "loss": 1.0356, "step": 164700 }, { "epoch": 0.15, "grad_norm": 7.4375, "learning_rate": 4.3090593337772424e-05, "loss": 1.0443, "step": 164800 }, { "epoch": 0.15, "grad_norm": 35.25, "learning_rate": 4.30860942645815e-05, "loss": 1.0688, "step": 164900 }, { "epoch": 0.15, "grad_norm": 17.5, "learning_rate": 4.3081595191390575e-05, "loss": 1.0226, "step": 165000 }, { "epoch": 0.15, "grad_norm": 9.875, "learning_rate": 4.307709611819965e-05, "loss": 0.9612, "step": 165100 }, { "epoch": 0.15, "grad_norm": 33.25, "learning_rate": 4.3072597045008725e-05, "loss": 1.1404, "step": 165200 }, { "epoch": 0.15, "grad_norm": 500.0, "learning_rate": 4.306809797181781e-05, "loss": 0.9332, "step": 165300 }, { "epoch": 0.15, "grad_norm": 11.5625, "learning_rate": 4.306359889862688e-05, "loss": 1.0052, "step": 165400 }, { "epoch": 0.15, "grad_norm": 24.25, "learning_rate": 4.305909982543596e-05, "loss": 1.0863, "step": 165500 }, { "epoch": 0.15, "grad_norm": 45.75, "learning_rate": 4.305460075224504e-05, "loss": 1.0065, "step": 165600 }, { "epoch": 0.15, "grad_norm": 12.6875, "learning_rate": 4.3050101679054116e-05, "loss": 1.1437, "step": 165700 }, { "epoch": 0.15, "grad_norm": 41.25, "learning_rate": 4.30456026058632e-05, "loss": 0.9853, "step": 165800 }, { "epoch": 0.15, "grad_norm": 2.609375, "learning_rate": 4.304110353267227e-05, "loss": 1.0444, "step": 165900 }, { "epoch": 0.15, "grad_norm": 1032.0, "learning_rate": 4.303660445948135e-05, "loss": 1.0349, "step": 166000 }, { "epoch": 0.15, "grad_norm": 19.375, "learning_rate": 4.303210538629043e-05, "loss": 0.9965, "step": 166100 }, { "epoch": 0.15, "grad_norm": 0.0537109375, "learning_rate": 4.3027606313099506e-05, "loss": 1.1163, "step": 166200 }, { "epoch": 0.15, "grad_norm": 37.5, "learning_rate": 4.302310723990858e-05, "loss": 1.0659, "step": 166300 }, { "epoch": 0.15, "grad_norm": 32.0, "learning_rate": 4.3018608166717657e-05, "loss": 0.983, "step": 166400 }, { "epoch": 0.15, "grad_norm": 26.125, "learning_rate": 4.301410909352673e-05, "loss": 0.9871, "step": 166500 }, { "epoch": 0.15, "grad_norm": 34.5, "learning_rate": 4.3009610020335814e-05, "loss": 1.1223, "step": 166600 }, { "epoch": 0.15, "grad_norm": 45.5, "learning_rate": 4.300511094714489e-05, "loss": 1.0868, "step": 166700 }, { "epoch": 0.15, "grad_norm": 8.5, "learning_rate": 4.3000611873953965e-05, "loss": 1.0496, "step": 166800 }, { "epoch": 0.15, "grad_norm": 39.25, "learning_rate": 4.299611280076305e-05, "loss": 1.0291, "step": 166900 }, { "epoch": 0.15, "grad_norm": 25.875, "learning_rate": 4.299161372757212e-05, "loss": 1.013, "step": 167000 }, { "epoch": 0.15, "grad_norm": 36.5, "learning_rate": 4.29871146543812e-05, "loss": 1.0793, "step": 167100 }, { "epoch": 0.15, "grad_norm": 134.0, "learning_rate": 4.298261558119028e-05, "loss": 1.126, "step": 167200 }, { "epoch": 0.15, "grad_norm": 44.5, "learning_rate": 4.2978116507999355e-05, "loss": 1.1957, "step": 167300 }, { "epoch": 0.15, "grad_norm": 22.0, "learning_rate": 4.297361743480843e-05, "loss": 1.147, "step": 167400 }, { "epoch": 0.15, "grad_norm": 5.40625, "learning_rate": 4.296911836161751e-05, "loss": 1.011, "step": 167500 }, { "epoch": 0.15, "grad_norm": 266.0, "learning_rate": 4.296461928842659e-05, "loss": 1.1574, "step": 167600 }, { "epoch": 0.15, "grad_norm": 21.625, "learning_rate": 4.296012021523566e-05, "loss": 1.0342, "step": 167700 }, { "epoch": 0.15, "grad_norm": 67.5, "learning_rate": 4.295562114204474e-05, "loss": 1.087, "step": 167800 }, { "epoch": 0.15, "grad_norm": 8.625, "learning_rate": 4.2951122068853814e-05, "loss": 1.0295, "step": 167900 }, { "epoch": 0.15, "grad_norm": 107.5, "learning_rate": 4.2946622995662896e-05, "loss": 1.0703, "step": 168000 }, { "epoch": 0.15, "grad_norm": 31.5, "learning_rate": 4.294212392247197e-05, "loss": 1.0369, "step": 168100 }, { "epoch": 0.15, "grad_norm": 310.0, "learning_rate": 4.2937624849281047e-05, "loss": 1.127, "step": 168200 }, { "epoch": 0.15, "grad_norm": 6.6875, "learning_rate": 4.293312577609013e-05, "loss": 1.0062, "step": 168300 }, { "epoch": 0.15, "grad_norm": 11.4375, "learning_rate": 4.2928626702899204e-05, "loss": 1.0202, "step": 168400 }, { "epoch": 0.15, "grad_norm": 9.0, "learning_rate": 4.2924127629708286e-05, "loss": 0.9727, "step": 168500 }, { "epoch": 0.15, "grad_norm": 208.0, "learning_rate": 4.291962855651736e-05, "loss": 0.9625, "step": 168600 }, { "epoch": 0.15, "grad_norm": 55.25, "learning_rate": 4.291512948332644e-05, "loss": 1.1664, "step": 168700 }, { "epoch": 0.15, "grad_norm": 10.1875, "learning_rate": 4.291063041013552e-05, "loss": 1.1378, "step": 168800 }, { "epoch": 0.15, "grad_norm": 24.25, "learning_rate": 4.2906131336944594e-05, "loss": 0.9084, "step": 168900 }, { "epoch": 0.15, "grad_norm": 21.875, "learning_rate": 4.290163226375366e-05, "loss": 0.9553, "step": 169000 }, { "epoch": 0.15, "grad_norm": 0.0517578125, "learning_rate": 4.2897133190562745e-05, "loss": 0.9911, "step": 169100 }, { "epoch": 0.15, "grad_norm": 0.2138671875, "learning_rate": 4.289263411737182e-05, "loss": 1.0727, "step": 169200 }, { "epoch": 0.15, "grad_norm": 89.0, "learning_rate": 4.28881350441809e-05, "loss": 1.1094, "step": 169300 }, { "epoch": 0.15, "grad_norm": 47.0, "learning_rate": 4.288363597098998e-05, "loss": 0.901, "step": 169400 }, { "epoch": 0.15, "grad_norm": 32.25, "learning_rate": 4.287913689779905e-05, "loss": 0.9228, "step": 169500 }, { "epoch": 0.15, "grad_norm": 19.125, "learning_rate": 4.2874637824608135e-05, "loss": 0.9525, "step": 169600 }, { "epoch": 0.15, "grad_norm": 284.0, "learning_rate": 4.287013875141721e-05, "loss": 1.0846, "step": 169700 }, { "epoch": 0.15, "grad_norm": 33.25, "learning_rate": 4.2865639678226286e-05, "loss": 1.1031, "step": 169800 }, { "epoch": 0.15, "grad_norm": 0.19140625, "learning_rate": 4.286114060503537e-05, "loss": 0.9617, "step": 169900 }, { "epoch": 0.15, "grad_norm": 21.625, "learning_rate": 4.285664153184444e-05, "loss": 0.9865, "step": 170000 }, { "epoch": 0.15, "grad_norm": 44.25, "learning_rate": 4.285214245865352e-05, "loss": 1.0339, "step": 170100 }, { "epoch": 0.15, "grad_norm": 42.0, "learning_rate": 4.28476433854626e-05, "loss": 1.0454, "step": 170200 }, { "epoch": 0.15, "grad_norm": 0.09326171875, "learning_rate": 4.284314431227167e-05, "loss": 1.1814, "step": 170300 }, { "epoch": 0.15, "grad_norm": 22.125, "learning_rate": 4.283864523908075e-05, "loss": 1.0357, "step": 170400 }, { "epoch": 0.15, "grad_norm": 30.125, "learning_rate": 4.283414616588983e-05, "loss": 0.9267, "step": 170500 }, { "epoch": 0.15, "grad_norm": 37.5, "learning_rate": 4.28296470926989e-05, "loss": 1.1452, "step": 170600 }, { "epoch": 0.15, "grad_norm": 26.625, "learning_rate": 4.2825148019507984e-05, "loss": 1.0902, "step": 170700 }, { "epoch": 0.15, "grad_norm": 38.25, "learning_rate": 4.282064894631706e-05, "loss": 0.9456, "step": 170800 }, { "epoch": 0.15, "grad_norm": 27.5, "learning_rate": 4.2816149873126135e-05, "loss": 0.9448, "step": 170900 }, { "epoch": 0.15, "grad_norm": 35.0, "learning_rate": 4.281165079993522e-05, "loss": 1.0657, "step": 171000 }, { "epoch": 0.15, "grad_norm": 26.25, "learning_rate": 4.280715172674429e-05, "loss": 0.9641, "step": 171100 }, { "epoch": 0.15, "grad_norm": 17.375, "learning_rate": 4.2802652653553375e-05, "loss": 0.9344, "step": 171200 }, { "epoch": 0.15, "grad_norm": 20.375, "learning_rate": 4.279815358036245e-05, "loss": 0.9064, "step": 171300 }, { "epoch": 0.15, "grad_norm": 59.5, "learning_rate": 4.2793654507171525e-05, "loss": 0.9617, "step": 171400 }, { "epoch": 0.15, "grad_norm": 18.5, "learning_rate": 4.278915543398061e-05, "loss": 1.0616, "step": 171500 }, { "epoch": 0.15, "grad_norm": 16.5, "learning_rate": 4.2784656360789676e-05, "loss": 1.013, "step": 171600 }, { "epoch": 0.15, "grad_norm": 0.56640625, "learning_rate": 4.278015728759875e-05, "loss": 0.9883, "step": 171700 }, { "epoch": 0.15, "grad_norm": 78.5, "learning_rate": 4.277565821440783e-05, "loss": 0.9044, "step": 171800 }, { "epoch": 0.15, "grad_norm": 29.875, "learning_rate": 4.277115914121691e-05, "loss": 0.9471, "step": 171900 }, { "epoch": 0.15, "grad_norm": 0.9921875, "learning_rate": 4.276666006802599e-05, "loss": 1.041, "step": 172000 }, { "epoch": 0.15, "grad_norm": 15.5625, "learning_rate": 4.2762160994835066e-05, "loss": 1.1452, "step": 172100 }, { "epoch": 0.15, "grad_norm": 67.5, "learning_rate": 4.275766192164414e-05, "loss": 1.1141, "step": 172200 }, { "epoch": 0.15, "grad_norm": 39.75, "learning_rate": 4.2753162848453224e-05, "loss": 0.8427, "step": 172300 }, { "epoch": 0.15, "grad_norm": 67.5, "learning_rate": 4.27486637752623e-05, "loss": 0.9316, "step": 172400 }, { "epoch": 0.15, "grad_norm": 0.8046875, "learning_rate": 4.2744164702071374e-05, "loss": 1.1593, "step": 172500 }, { "epoch": 0.15, "grad_norm": 127.0, "learning_rate": 4.2739665628880456e-05, "loss": 1.0972, "step": 172600 }, { "epoch": 0.15, "grad_norm": 70.5, "learning_rate": 4.273516655568953e-05, "loss": 0.8319, "step": 172700 }, { "epoch": 0.15, "grad_norm": 19.25, "learning_rate": 4.273066748249861e-05, "loss": 0.913, "step": 172800 }, { "epoch": 0.15, "grad_norm": 35.75, "learning_rate": 4.272616840930768e-05, "loss": 1.0221, "step": 172900 }, { "epoch": 0.15, "grad_norm": 19.375, "learning_rate": 4.272166933611676e-05, "loss": 0.961, "step": 173000 }, { "epoch": 0.15, "grad_norm": 12.9375, "learning_rate": 4.271717026292584e-05, "loss": 1.1848, "step": 173100 }, { "epoch": 0.15, "grad_norm": 0.0203857421875, "learning_rate": 4.2712671189734915e-05, "loss": 1.1035, "step": 173200 }, { "epoch": 0.15, "grad_norm": 26.375, "learning_rate": 4.270817211654399e-05, "loss": 1.1222, "step": 173300 }, { "epoch": 0.15, "grad_norm": 20.375, "learning_rate": 4.270367304335307e-05, "loss": 1.0297, "step": 173400 }, { "epoch": 0.15, "grad_norm": 78.0, "learning_rate": 4.269917397016215e-05, "loss": 1.0118, "step": 173500 }, { "epoch": 0.15, "grad_norm": 13.0, "learning_rate": 4.269467489697122e-05, "loss": 0.9917, "step": 173600 }, { "epoch": 0.15, "grad_norm": 0.97265625, "learning_rate": 4.2690175823780305e-05, "loss": 0.903, "step": 173700 }, { "epoch": 0.15, "grad_norm": 29.75, "learning_rate": 4.268567675058938e-05, "loss": 0.8788, "step": 173800 }, { "epoch": 0.15, "grad_norm": 78.5, "learning_rate": 4.268117767739846e-05, "loss": 1.0553, "step": 173900 }, { "epoch": 0.16, "grad_norm": 29.75, "learning_rate": 4.267667860420754e-05, "loss": 0.9663, "step": 174000 }, { "epoch": 0.16, "grad_norm": 38.5, "learning_rate": 4.2672179531016614e-05, "loss": 1.0341, "step": 174100 }, { "epoch": 0.16, "grad_norm": 53.5, "learning_rate": 4.266768045782569e-05, "loss": 0.931, "step": 174200 }, { "epoch": 0.16, "grad_norm": 3.765625, "learning_rate": 4.2663181384634764e-05, "loss": 0.9649, "step": 174300 }, { "epoch": 0.16, "grad_norm": 0.2138671875, "learning_rate": 4.265868231144384e-05, "loss": 0.9726, "step": 174400 }, { "epoch": 0.16, "grad_norm": 27.875, "learning_rate": 4.265418323825292e-05, "loss": 0.8931, "step": 174500 }, { "epoch": 0.16, "grad_norm": 19.375, "learning_rate": 4.2649684165062e-05, "loss": 1.0549, "step": 174600 }, { "epoch": 0.16, "grad_norm": 1.0703125, "learning_rate": 4.264518509187108e-05, "loss": 1.0041, "step": 174700 }, { "epoch": 0.16, "grad_norm": 20.75, "learning_rate": 4.2640686018680155e-05, "loss": 0.9753, "step": 174800 }, { "epoch": 0.16, "grad_norm": 20.875, "learning_rate": 4.263618694548923e-05, "loss": 1.0315, "step": 174900 }, { "epoch": 0.16, "grad_norm": 27.25, "learning_rate": 4.263168787229831e-05, "loss": 1.0886, "step": 175000 }, { "epoch": 0.16, "grad_norm": 55.0, "learning_rate": 4.262718879910739e-05, "loss": 0.9983, "step": 175100 }, { "epoch": 0.16, "grad_norm": 19.75, "learning_rate": 4.262268972591646e-05, "loss": 0.9282, "step": 175200 }, { "epoch": 0.16, "grad_norm": 13.1875, "learning_rate": 4.2618190652725545e-05, "loss": 1.1496, "step": 175300 }, { "epoch": 0.16, "grad_norm": 20.5, "learning_rate": 4.261369157953462e-05, "loss": 1.0169, "step": 175400 }, { "epoch": 0.16, "grad_norm": 48.75, "learning_rate": 4.2609192506343695e-05, "loss": 1.0309, "step": 175500 }, { "epoch": 0.16, "grad_norm": 48.75, "learning_rate": 4.260469343315277e-05, "loss": 1.0293, "step": 175600 }, { "epoch": 0.16, "grad_norm": 43.25, "learning_rate": 4.2600194359961846e-05, "loss": 1.0975, "step": 175700 }, { "epoch": 0.16, "grad_norm": 24.75, "learning_rate": 4.259569528677093e-05, "loss": 1.0853, "step": 175800 }, { "epoch": 0.16, "grad_norm": 24.25, "learning_rate": 4.2591196213580004e-05, "loss": 1.0873, "step": 175900 }, { "epoch": 0.16, "grad_norm": 50.0, "learning_rate": 4.258669714038908e-05, "loss": 1.0547, "step": 176000 }, { "epoch": 0.16, "grad_norm": 16.375, "learning_rate": 4.258219806719816e-05, "loss": 1.0271, "step": 176100 }, { "epoch": 0.16, "grad_norm": 19.625, "learning_rate": 4.2577698994007236e-05, "loss": 0.9698, "step": 176200 }, { "epoch": 0.16, "grad_norm": 39.25, "learning_rate": 4.257319992081631e-05, "loss": 1.1282, "step": 176300 }, { "epoch": 0.16, "grad_norm": 21.875, "learning_rate": 4.2568700847625394e-05, "loss": 1.0358, "step": 176400 }, { "epoch": 0.16, "grad_norm": 31.75, "learning_rate": 4.256420177443447e-05, "loss": 1.0171, "step": 176500 }, { "epoch": 0.16, "grad_norm": 186.0, "learning_rate": 4.2559702701243544e-05, "loss": 0.8717, "step": 176600 }, { "epoch": 0.16, "grad_norm": 1.5625, "learning_rate": 4.255520362805263e-05, "loss": 1.044, "step": 176700 }, { "epoch": 0.16, "grad_norm": 262.0, "learning_rate": 4.2550704554861695e-05, "loss": 0.9051, "step": 176800 }, { "epoch": 0.16, "grad_norm": 0.490234375, "learning_rate": 4.254620548167078e-05, "loss": 1.1272, "step": 176900 }, { "epoch": 0.16, "grad_norm": 5.84375, "learning_rate": 4.254170640847985e-05, "loss": 1.0549, "step": 177000 }, { "epoch": 0.16, "grad_norm": 37.0, "learning_rate": 4.253720733528893e-05, "loss": 1.0612, "step": 177100 }, { "epoch": 0.16, "grad_norm": 223.0, "learning_rate": 4.253270826209801e-05, "loss": 0.9417, "step": 177200 }, { "epoch": 0.16, "grad_norm": 556.0, "learning_rate": 4.2528209188907085e-05, "loss": 0.8947, "step": 177300 }, { "epoch": 0.16, "grad_norm": 40.75, "learning_rate": 4.252371011571617e-05, "loss": 1.0793, "step": 177400 }, { "epoch": 0.16, "grad_norm": 50.5, "learning_rate": 4.251921104252524e-05, "loss": 1.03, "step": 177500 }, { "epoch": 0.16, "grad_norm": 0.85546875, "learning_rate": 4.251471196933432e-05, "loss": 1.0201, "step": 177600 }, { "epoch": 0.16, "grad_norm": 27.125, "learning_rate": 4.25102128961434e-05, "loss": 1.0034, "step": 177700 }, { "epoch": 0.16, "grad_norm": 0.00274658203125, "learning_rate": 4.2505713822952476e-05, "loss": 0.9403, "step": 177800 }, { "epoch": 0.16, "grad_norm": 29.375, "learning_rate": 4.250121474976155e-05, "loss": 0.9524, "step": 177900 }, { "epoch": 0.16, "grad_norm": 19.25, "learning_rate": 4.249671567657063e-05, "loss": 0.9907, "step": 178000 }, { "epoch": 0.16, "grad_norm": 8.5625, "learning_rate": 4.24922166033797e-05, "loss": 0.8962, "step": 178100 }, { "epoch": 0.16, "grad_norm": 860.0, "learning_rate": 4.2487717530188784e-05, "loss": 0.8981, "step": 178200 }, { "epoch": 0.16, "grad_norm": 410.0, "learning_rate": 4.248321845699786e-05, "loss": 1.0501, "step": 178300 }, { "epoch": 0.16, "grad_norm": 41.25, "learning_rate": 4.2478719383806934e-05, "loss": 0.8877, "step": 178400 }, { "epoch": 0.16, "grad_norm": 42.75, "learning_rate": 4.2474220310616017e-05, "loss": 1.0879, "step": 178500 }, { "epoch": 0.16, "grad_norm": 56.0, "learning_rate": 4.246972123742509e-05, "loss": 0.9904, "step": 178600 }, { "epoch": 0.16, "grad_norm": 24.0, "learning_rate": 4.246522216423417e-05, "loss": 1.1506, "step": 178700 }, { "epoch": 0.16, "grad_norm": 50.25, "learning_rate": 4.246072309104325e-05, "loss": 1.1529, "step": 178800 }, { "epoch": 0.16, "grad_norm": 58.0, "learning_rate": 4.2456224017852325e-05, "loss": 1.0478, "step": 178900 }, { "epoch": 0.16, "grad_norm": 33.0, "learning_rate": 4.24517249446614e-05, "loss": 1.0825, "step": 179000 }, { "epoch": 0.16, "grad_norm": 3.375, "learning_rate": 4.244722587147048e-05, "loss": 0.9179, "step": 179100 }, { "epoch": 0.16, "grad_norm": 86.0, "learning_rate": 4.244272679827956e-05, "loss": 1.0197, "step": 179200 }, { "epoch": 0.16, "grad_norm": 30.25, "learning_rate": 4.243822772508863e-05, "loss": 1.1412, "step": 179300 }, { "epoch": 0.16, "grad_norm": 88.5, "learning_rate": 4.243372865189771e-05, "loss": 1.1946, "step": 179400 }, { "epoch": 0.16, "grad_norm": 0.232421875, "learning_rate": 4.2429229578706784e-05, "loss": 0.9457, "step": 179500 }, { "epoch": 0.16, "grad_norm": 41.25, "learning_rate": 4.2424730505515866e-05, "loss": 1.1139, "step": 179600 }, { "epoch": 0.16, "grad_norm": 6.0, "learning_rate": 4.242023143232494e-05, "loss": 1.1794, "step": 179700 }, { "epoch": 0.16, "grad_norm": 61.25, "learning_rate": 4.2415732359134016e-05, "loss": 0.98, "step": 179800 }, { "epoch": 0.16, "grad_norm": 39.0, "learning_rate": 4.24112332859431e-05, "loss": 1.0908, "step": 179900 }, { "epoch": 0.16, "grad_norm": 27.0, "learning_rate": 4.2406734212752174e-05, "loss": 1.149, "step": 180000 }, { "epoch": 0.16, "grad_norm": 19.75, "learning_rate": 4.2402235139561256e-05, "loss": 1.1205, "step": 180100 }, { "epoch": 0.16, "grad_norm": 0.007171630859375, "learning_rate": 4.239773606637033e-05, "loss": 0.8894, "step": 180200 }, { "epoch": 0.16, "grad_norm": 15.125, "learning_rate": 4.2393236993179407e-05, "loss": 0.9269, "step": 180300 }, { "epoch": 0.16, "grad_norm": 23.0, "learning_rate": 4.238873791998849e-05, "loss": 0.9371, "step": 180400 }, { "epoch": 0.16, "grad_norm": 70.0, "learning_rate": 4.2384238846797564e-05, "loss": 0.9446, "step": 180500 }, { "epoch": 0.16, "grad_norm": 28.375, "learning_rate": 4.237973977360664e-05, "loss": 0.988, "step": 180600 }, { "epoch": 0.16, "grad_norm": 75.5, "learning_rate": 4.2375240700415715e-05, "loss": 0.9541, "step": 180700 }, { "epoch": 0.16, "grad_norm": 0.36328125, "learning_rate": 4.237074162722479e-05, "loss": 0.9531, "step": 180800 }, { "epoch": 0.16, "grad_norm": 11.4375, "learning_rate": 4.236624255403387e-05, "loss": 0.9431, "step": 180900 }, { "epoch": 0.16, "grad_norm": 20.0, "learning_rate": 4.236174348084295e-05, "loss": 0.9906, "step": 181000 }, { "epoch": 0.16, "grad_norm": 50.75, "learning_rate": 4.235724440765202e-05, "loss": 1.0392, "step": 181100 }, { "epoch": 0.16, "grad_norm": 0.037109375, "learning_rate": 4.2352745334461105e-05, "loss": 1.0021, "step": 181200 }, { "epoch": 0.16, "grad_norm": 2.40625, "learning_rate": 4.234824626127018e-05, "loss": 1.122, "step": 181300 }, { "epoch": 0.16, "grad_norm": 28.0, "learning_rate": 4.2343747188079256e-05, "loss": 1.1188, "step": 181400 }, { "epoch": 0.16, "grad_norm": 100.0, "learning_rate": 4.233924811488834e-05, "loss": 1.0672, "step": 181500 }, { "epoch": 0.16, "grad_norm": 0.0184326171875, "learning_rate": 4.233474904169741e-05, "loss": 1.031, "step": 181600 }, { "epoch": 0.16, "grad_norm": 0.00154876708984375, "learning_rate": 4.233024996850649e-05, "loss": 0.9535, "step": 181700 }, { "epoch": 0.16, "grad_norm": 0.162109375, "learning_rate": 4.232575089531557e-05, "loss": 0.9562, "step": 181800 }, { "epoch": 0.16, "grad_norm": 14.3125, "learning_rate": 4.2321251822124646e-05, "loss": 0.8395, "step": 181900 }, { "epoch": 0.16, "grad_norm": 60.0, "learning_rate": 4.231675274893372e-05, "loss": 0.9971, "step": 182000 }, { "epoch": 0.16, "grad_norm": 38.5, "learning_rate": 4.2312253675742797e-05, "loss": 1.0235, "step": 182100 }, { "epoch": 0.16, "grad_norm": 0.0186767578125, "learning_rate": 4.230775460255187e-05, "loss": 1.1027, "step": 182200 }, { "epoch": 0.16, "grad_norm": 78.5, "learning_rate": 4.2303255529360954e-05, "loss": 1.0614, "step": 182300 }, { "epoch": 0.16, "grad_norm": 13.8125, "learning_rate": 4.229875645617003e-05, "loss": 1.0475, "step": 182400 }, { "epoch": 0.16, "grad_norm": 3.125, "learning_rate": 4.2294257382979105e-05, "loss": 0.991, "step": 182500 }, { "epoch": 0.16, "grad_norm": 21.375, "learning_rate": 4.228975830978819e-05, "loss": 1.0521, "step": 182600 }, { "epoch": 0.16, "grad_norm": 23.875, "learning_rate": 4.228525923659726e-05, "loss": 1.168, "step": 182700 }, { "epoch": 0.16, "grad_norm": 0.03515625, "learning_rate": 4.2280760163406344e-05, "loss": 1.0485, "step": 182800 }, { "epoch": 0.16, "grad_norm": 88.5, "learning_rate": 4.227626109021542e-05, "loss": 0.9773, "step": 182900 }, { "epoch": 0.16, "grad_norm": 89.0, "learning_rate": 4.2271762017024495e-05, "loss": 0.9462, "step": 183000 }, { "epoch": 0.16, "grad_norm": 46.25, "learning_rate": 4.226726294383358e-05, "loss": 1.1444, "step": 183100 }, { "epoch": 0.16, "grad_norm": 26.375, "learning_rate": 4.226276387064265e-05, "loss": 0.8055, "step": 183200 }, { "epoch": 0.16, "grad_norm": 85.5, "learning_rate": 4.225826479745172e-05, "loss": 0.9264, "step": 183300 }, { "epoch": 0.16, "grad_norm": 18.375, "learning_rate": 4.22537657242608e-05, "loss": 0.8415, "step": 183400 }, { "epoch": 0.16, "grad_norm": 58.75, "learning_rate": 4.224926665106988e-05, "loss": 1.0071, "step": 183500 }, { "epoch": 0.16, "grad_norm": 20.0, "learning_rate": 4.224476757787896e-05, "loss": 1.0227, "step": 183600 }, { "epoch": 0.16, "grad_norm": 23.625, "learning_rate": 4.2240268504688036e-05, "loss": 1.2288, "step": 183700 }, { "epoch": 0.16, "grad_norm": 7.21875, "learning_rate": 4.223576943149711e-05, "loss": 0.8231, "step": 183800 }, { "epoch": 0.16, "grad_norm": 38.25, "learning_rate": 4.223127035830619e-05, "loss": 1.1156, "step": 183900 }, { "epoch": 0.16, "grad_norm": 19.875, "learning_rate": 4.222677128511527e-05, "loss": 0.9293, "step": 184000 }, { "epoch": 0.16, "grad_norm": 17.375, "learning_rate": 4.2222272211924344e-05, "loss": 0.9727, "step": 184100 }, { "epoch": 0.16, "grad_norm": 278.0, "learning_rate": 4.2217773138733426e-05, "loss": 1.0683, "step": 184200 }, { "epoch": 0.16, "grad_norm": 10.375, "learning_rate": 4.22132740655425e-05, "loss": 0.9945, "step": 184300 }, { "epoch": 0.16, "grad_norm": 26.375, "learning_rate": 4.220877499235158e-05, "loss": 1.0406, "step": 184400 }, { "epoch": 0.16, "grad_norm": 8.8125, "learning_rate": 4.220427591916066e-05, "loss": 1.1124, "step": 184500 }, { "epoch": 0.16, "grad_norm": 40.5, "learning_rate": 4.219977684596973e-05, "loss": 1.0967, "step": 184600 }, { "epoch": 0.16, "grad_norm": 231.0, "learning_rate": 4.219527777277881e-05, "loss": 1.0927, "step": 184700 }, { "epoch": 0.16, "grad_norm": 27.75, "learning_rate": 4.2190778699587885e-05, "loss": 1.1185, "step": 184800 }, { "epoch": 0.16, "grad_norm": 35.0, "learning_rate": 4.218627962639696e-05, "loss": 1.1544, "step": 184900 }, { "epoch": 0.16, "grad_norm": 90.0, "learning_rate": 4.218178055320604e-05, "loss": 0.9832, "step": 185000 }, { "epoch": 0.16, "grad_norm": 0.296875, "learning_rate": 4.217728148001512e-05, "loss": 1.0514, "step": 185100 }, { "epoch": 0.16, "grad_norm": 62.5, "learning_rate": 4.217278240682419e-05, "loss": 0.9806, "step": 185200 }, { "epoch": 0.17, "grad_norm": 58.25, "learning_rate": 4.2168283333633275e-05, "loss": 1.0324, "step": 185300 }, { "epoch": 0.17, "grad_norm": 16.0, "learning_rate": 4.216378426044235e-05, "loss": 1.0414, "step": 185400 }, { "epoch": 0.17, "grad_norm": 29.5, "learning_rate": 4.215928518725143e-05, "loss": 1.1363, "step": 185500 }, { "epoch": 0.17, "grad_norm": 9.625, "learning_rate": 4.215478611406051e-05, "loss": 1.0124, "step": 185600 }, { "epoch": 0.17, "grad_norm": 19.25, "learning_rate": 4.215028704086958e-05, "loss": 1.1343, "step": 185700 }, { "epoch": 0.17, "grad_norm": 10.3125, "learning_rate": 4.2145787967678665e-05, "loss": 0.9466, "step": 185800 }, { "epoch": 0.17, "grad_norm": 0.0986328125, "learning_rate": 4.2141288894487734e-05, "loss": 0.9751, "step": 185900 }, { "epoch": 0.17, "grad_norm": 51.5, "learning_rate": 4.213678982129681e-05, "loss": 1.2408, "step": 186000 }, { "epoch": 0.17, "grad_norm": 7.96875, "learning_rate": 4.213229074810589e-05, "loss": 1.0506, "step": 186100 }, { "epoch": 0.17, "grad_norm": 47.0, "learning_rate": 4.212779167491497e-05, "loss": 1.0667, "step": 186200 }, { "epoch": 0.17, "grad_norm": 124.5, "learning_rate": 4.212329260172405e-05, "loss": 1.0205, "step": 186300 }, { "epoch": 0.17, "grad_norm": 0.1279296875, "learning_rate": 4.2118793528533124e-05, "loss": 1.0697, "step": 186400 }, { "epoch": 0.17, "grad_norm": 1.3984375, "learning_rate": 4.21142944553422e-05, "loss": 0.8599, "step": 186500 }, { "epoch": 0.17, "grad_norm": 0.0031585693359375, "learning_rate": 4.210979538215128e-05, "loss": 0.9359, "step": 186600 }, { "epoch": 0.17, "grad_norm": 0.1455078125, "learning_rate": 4.210529630896036e-05, "loss": 1.0606, "step": 186700 }, { "epoch": 0.17, "grad_norm": 49.25, "learning_rate": 4.210079723576943e-05, "loss": 1.1038, "step": 186800 }, { "epoch": 0.17, "grad_norm": 27.0, "learning_rate": 4.2096298162578515e-05, "loss": 0.9061, "step": 186900 }, { "epoch": 0.17, "grad_norm": 516.0, "learning_rate": 4.209179908938759e-05, "loss": 0.9282, "step": 187000 }, { "epoch": 0.17, "grad_norm": 61.25, "learning_rate": 4.2087300016196665e-05, "loss": 1.0016, "step": 187100 }, { "epoch": 0.17, "grad_norm": 0.1689453125, "learning_rate": 4.208280094300574e-05, "loss": 0.9262, "step": 187200 }, { "epoch": 0.17, "grad_norm": 44.25, "learning_rate": 4.2078301869814816e-05, "loss": 0.9576, "step": 187300 }, { "epoch": 0.17, "grad_norm": 25.5, "learning_rate": 4.20738027966239e-05, "loss": 0.9147, "step": 187400 }, { "epoch": 0.17, "grad_norm": 0.7265625, "learning_rate": 4.206930372343297e-05, "loss": 1.1562, "step": 187500 }, { "epoch": 0.17, "grad_norm": 19.625, "learning_rate": 4.206480465024205e-05, "loss": 0.9978, "step": 187600 }, { "epoch": 0.17, "grad_norm": 18.5, "learning_rate": 4.206030557705113e-05, "loss": 1.1307, "step": 187700 }, { "epoch": 0.17, "grad_norm": 52.0, "learning_rate": 4.2055806503860206e-05, "loss": 0.9417, "step": 187800 }, { "epoch": 0.17, "grad_norm": 57.75, "learning_rate": 4.205130743066928e-05, "loss": 0.8818, "step": 187900 }, { "epoch": 0.17, "grad_norm": 14.75, "learning_rate": 4.2046808357478364e-05, "loss": 1.0595, "step": 188000 }, { "epoch": 0.17, "grad_norm": 46.75, "learning_rate": 4.204230928428744e-05, "loss": 1.063, "step": 188100 }, { "epoch": 0.17, "grad_norm": 0.2119140625, "learning_rate": 4.203781021109652e-05, "loss": 1.0472, "step": 188200 }, { "epoch": 0.17, "grad_norm": 57.5, "learning_rate": 4.2033311137905596e-05, "loss": 1.0571, "step": 188300 }, { "epoch": 0.17, "grad_norm": 76.0, "learning_rate": 4.202881206471467e-05, "loss": 1.0313, "step": 188400 }, { "epoch": 0.17, "grad_norm": 87.5, "learning_rate": 4.202431299152375e-05, "loss": 1.0989, "step": 188500 }, { "epoch": 0.17, "grad_norm": 79.5, "learning_rate": 4.201981391833282e-05, "loss": 0.9495, "step": 188600 }, { "epoch": 0.17, "grad_norm": 266.0, "learning_rate": 4.20153148451419e-05, "loss": 1.1051, "step": 188700 }, { "epoch": 0.17, "grad_norm": 28.75, "learning_rate": 4.201081577195098e-05, "loss": 1.0755, "step": 188800 }, { "epoch": 0.17, "grad_norm": 17.625, "learning_rate": 4.2006316698760055e-05, "loss": 0.854, "step": 188900 }, { "epoch": 0.17, "grad_norm": 23.875, "learning_rate": 4.200181762556914e-05, "loss": 0.9885, "step": 189000 }, { "epoch": 0.17, "grad_norm": 64.0, "learning_rate": 4.199731855237821e-05, "loss": 0.9234, "step": 189100 }, { "epoch": 0.17, "grad_norm": 596.0, "learning_rate": 4.199281947918729e-05, "loss": 1.0651, "step": 189200 }, { "epoch": 0.17, "grad_norm": 24.5, "learning_rate": 4.198832040599637e-05, "loss": 1.0574, "step": 189300 }, { "epoch": 0.17, "grad_norm": 38.75, "learning_rate": 4.1983821332805445e-05, "loss": 1.0161, "step": 189400 }, { "epoch": 0.17, "grad_norm": 32.5, "learning_rate": 4.197932225961452e-05, "loss": 1.1001, "step": 189500 }, { "epoch": 0.17, "grad_norm": 10.6875, "learning_rate": 4.19748231864236e-05, "loss": 1.0056, "step": 189600 }, { "epoch": 0.17, "grad_norm": 58.5, "learning_rate": 4.197032411323268e-05, "loss": 0.9923, "step": 189700 }, { "epoch": 0.17, "grad_norm": 35.5, "learning_rate": 4.1965825040041754e-05, "loss": 1.0193, "step": 189800 }, { "epoch": 0.17, "grad_norm": 27.625, "learning_rate": 4.196132596685083e-05, "loss": 1.0113, "step": 189900 }, { "epoch": 0.17, "grad_norm": 51.0, "learning_rate": 4.1956826893659904e-05, "loss": 0.9433, "step": 190000 }, { "epoch": 0.17, "grad_norm": 0.267578125, "learning_rate": 4.1952327820468986e-05, "loss": 1.0347, "step": 190100 }, { "epoch": 0.17, "grad_norm": 35.0, "learning_rate": 4.194782874727806e-05, "loss": 0.9434, "step": 190200 }, { "epoch": 0.17, "grad_norm": 56.5, "learning_rate": 4.194332967408714e-05, "loss": 0.9746, "step": 190300 }, { "epoch": 0.17, "grad_norm": 81.5, "learning_rate": 4.193883060089622e-05, "loss": 1.1085, "step": 190400 }, { "epoch": 0.17, "grad_norm": 95.5, "learning_rate": 4.1934331527705295e-05, "loss": 0.9829, "step": 190500 }, { "epoch": 0.17, "grad_norm": 35.25, "learning_rate": 4.192983245451437e-05, "loss": 0.8869, "step": 190600 }, { "epoch": 0.17, "grad_norm": 280.0, "learning_rate": 4.192533338132345e-05, "loss": 1.0473, "step": 190700 }, { "epoch": 0.17, "grad_norm": 9.375, "learning_rate": 4.192083430813253e-05, "loss": 1.0365, "step": 190800 }, { "epoch": 0.17, "grad_norm": 360.0, "learning_rate": 4.191633523494161e-05, "loss": 0.9568, "step": 190900 }, { "epoch": 0.17, "grad_norm": 46.25, "learning_rate": 4.1911836161750685e-05, "loss": 0.9797, "step": 191000 }, { "epoch": 0.17, "grad_norm": 18.625, "learning_rate": 4.190733708855975e-05, "loss": 1.0198, "step": 191100 }, { "epoch": 0.17, "grad_norm": 35.0, "learning_rate": 4.1902838015368835e-05, "loss": 1.1112, "step": 191200 }, { "epoch": 0.17, "grad_norm": 84.0, "learning_rate": 4.189833894217791e-05, "loss": 1.0277, "step": 191300 }, { "epoch": 0.17, "grad_norm": 10.6875, "learning_rate": 4.1893839868986986e-05, "loss": 1.0063, "step": 191400 }, { "epoch": 0.17, "grad_norm": 82.0, "learning_rate": 4.188934079579607e-05, "loss": 1.0716, "step": 191500 }, { "epoch": 0.17, "grad_norm": 24.75, "learning_rate": 4.1884841722605144e-05, "loss": 1.0852, "step": 191600 }, { "epoch": 0.17, "grad_norm": 23.5, "learning_rate": 4.1880342649414226e-05, "loss": 1.105, "step": 191700 }, { "epoch": 0.17, "grad_norm": 24.0, "learning_rate": 4.18758435762233e-05, "loss": 0.8718, "step": 191800 }, { "epoch": 0.17, "grad_norm": 42.0, "learning_rate": 4.1871344503032376e-05, "loss": 0.9035, "step": 191900 }, { "epoch": 0.17, "grad_norm": 0.0517578125, "learning_rate": 4.186684542984146e-05, "loss": 0.9971, "step": 192000 }, { "epoch": 0.17, "grad_norm": 68.5, "learning_rate": 4.1862346356650534e-05, "loss": 1.016, "step": 192100 }, { "epoch": 0.17, "grad_norm": 22.5, "learning_rate": 4.185784728345961e-05, "loss": 0.9734, "step": 192200 }, { "epoch": 0.17, "grad_norm": 49.0, "learning_rate": 4.185334821026869e-05, "loss": 1.0572, "step": 192300 }, { "epoch": 0.17, "grad_norm": 59.25, "learning_rate": 4.184884913707776e-05, "loss": 1.0149, "step": 192400 }, { "epoch": 0.17, "grad_norm": 12.5, "learning_rate": 4.184435006388684e-05, "loss": 0.8981, "step": 192500 }, { "epoch": 0.17, "grad_norm": 20.625, "learning_rate": 4.183985099069592e-05, "loss": 0.9419, "step": 192600 }, { "epoch": 0.17, "grad_norm": 14.0625, "learning_rate": 4.183535191750499e-05, "loss": 1.0664, "step": 192700 }, { "epoch": 0.17, "grad_norm": 156.0, "learning_rate": 4.1830852844314075e-05, "loss": 1.0785, "step": 192800 }, { "epoch": 0.17, "grad_norm": 28.625, "learning_rate": 4.182635377112315e-05, "loss": 0.9858, "step": 192900 }, { "epoch": 0.17, "grad_norm": 21.5, "learning_rate": 4.1821854697932225e-05, "loss": 1.1164, "step": 193000 }, { "epoch": 0.17, "grad_norm": 52.25, "learning_rate": 4.181735562474131e-05, "loss": 1.0492, "step": 193100 }, { "epoch": 0.17, "grad_norm": 25.375, "learning_rate": 4.181285655155038e-05, "loss": 0.8838, "step": 193200 }, { "epoch": 0.17, "grad_norm": 0.015625, "learning_rate": 4.180835747835946e-05, "loss": 0.8675, "step": 193300 }, { "epoch": 0.17, "grad_norm": 12.75, "learning_rate": 4.180385840516854e-05, "loss": 1.0667, "step": 193400 }, { "epoch": 0.17, "grad_norm": 21.25, "learning_rate": 4.1799359331977616e-05, "loss": 0.9365, "step": 193500 }, { "epoch": 0.17, "grad_norm": 53.75, "learning_rate": 4.179486025878669e-05, "loss": 1.1836, "step": 193600 }, { "epoch": 0.17, "grad_norm": 26.375, "learning_rate": 4.1790361185595766e-05, "loss": 1.0216, "step": 193700 }, { "epoch": 0.17, "grad_norm": 41.25, "learning_rate": 4.178586211240484e-05, "loss": 1.0362, "step": 193800 }, { "epoch": 0.17, "grad_norm": 70.5, "learning_rate": 4.1781363039213924e-05, "loss": 0.9231, "step": 193900 }, { "epoch": 0.17, "grad_norm": 15.8125, "learning_rate": 4.1776863966023e-05, "loss": 0.9694, "step": 194000 }, { "epoch": 0.17, "grad_norm": 39.5, "learning_rate": 4.1772364892832075e-05, "loss": 0.9674, "step": 194100 }, { "epoch": 0.17, "grad_norm": 77.5, "learning_rate": 4.176786581964116e-05, "loss": 0.9426, "step": 194200 }, { "epoch": 0.17, "grad_norm": 52.25, "learning_rate": 4.176336674645023e-05, "loss": 0.9666, "step": 194300 }, { "epoch": 0.17, "grad_norm": 47.0, "learning_rate": 4.1758867673259314e-05, "loss": 0.9828, "step": 194400 }, { "epoch": 0.17, "grad_norm": 177.0, "learning_rate": 4.175436860006839e-05, "loss": 1.0319, "step": 194500 }, { "epoch": 0.17, "grad_norm": 44.0, "learning_rate": 4.1749869526877465e-05, "loss": 1.0749, "step": 194600 }, { "epoch": 0.17, "grad_norm": 9.6875, "learning_rate": 4.174537045368655e-05, "loss": 1.0851, "step": 194700 }, { "epoch": 0.17, "grad_norm": 33.75, "learning_rate": 4.174087138049562e-05, "loss": 1.0028, "step": 194800 }, { "epoch": 0.17, "grad_norm": 9.75, "learning_rate": 4.17363723073047e-05, "loss": 1.1624, "step": 194900 }, { "epoch": 0.17, "grad_norm": 0.703125, "learning_rate": 4.173187323411377e-05, "loss": 0.8968, "step": 195000 }, { "epoch": 0.17, "grad_norm": 19.375, "learning_rate": 4.172737416092285e-05, "loss": 1.0454, "step": 195100 }, { "epoch": 0.17, "grad_norm": 17.625, "learning_rate": 4.172287508773193e-05, "loss": 1.0142, "step": 195200 }, { "epoch": 0.17, "grad_norm": 29.25, "learning_rate": 4.1718376014541006e-05, "loss": 0.8795, "step": 195300 }, { "epoch": 0.17, "grad_norm": 16.0, "learning_rate": 4.171387694135008e-05, "loss": 0.967, "step": 195400 }, { "epoch": 0.17, "grad_norm": 25.375, "learning_rate": 4.170937786815916e-05, "loss": 0.9382, "step": 195500 }, { "epoch": 0.17, "grad_norm": 240.0, "learning_rate": 4.170487879496824e-05, "loss": 0.9448, "step": 195600 }, { "epoch": 0.17, "grad_norm": 46.5, "learning_rate": 4.1700379721777314e-05, "loss": 0.9801, "step": 195700 }, { "epoch": 0.17, "grad_norm": 25.25, "learning_rate": 4.1695880648586396e-05, "loss": 0.8991, "step": 195800 }, { "epoch": 0.17, "grad_norm": 69.5, "learning_rate": 4.169138157539547e-05, "loss": 1.0617, "step": 195900 }, { "epoch": 0.17, "grad_norm": 94.5, "learning_rate": 4.168688250220455e-05, "loss": 1.0327, "step": 196000 }, { "epoch": 0.17, "grad_norm": 134.0, "learning_rate": 4.168238342901363e-05, "loss": 1.0698, "step": 196100 }, { "epoch": 0.17, "grad_norm": 31.125, "learning_rate": 4.1677884355822704e-05, "loss": 1.0747, "step": 196200 }, { "epoch": 0.17, "grad_norm": 0.01495361328125, "learning_rate": 4.167338528263178e-05, "loss": 1.0298, "step": 196300 }, { "epoch": 0.17, "grad_norm": 0.1298828125, "learning_rate": 4.1668886209440855e-05, "loss": 1.1018, "step": 196400 }, { "epoch": 0.18, "grad_norm": 13.75, "learning_rate": 4.166438713624993e-05, "loss": 0.9481, "step": 196500 }, { "epoch": 0.18, "grad_norm": 73.0, "learning_rate": 4.165988806305901e-05, "loss": 1.054, "step": 196600 }, { "epoch": 0.18, "grad_norm": 1.5546875, "learning_rate": 4.165538898986809e-05, "loss": 1.0025, "step": 196700 }, { "epoch": 0.18, "grad_norm": 11.5, "learning_rate": 4.165088991667716e-05, "loss": 1.0205, "step": 196800 }, { "epoch": 0.18, "grad_norm": 30.25, "learning_rate": 4.1646390843486245e-05, "loss": 1.108, "step": 196900 }, { "epoch": 0.18, "grad_norm": 32.0, "learning_rate": 4.164189177029532e-05, "loss": 1.0031, "step": 197000 }, { "epoch": 0.18, "grad_norm": 1.921875, "learning_rate": 4.16373926971044e-05, "loss": 0.8931, "step": 197100 }, { "epoch": 0.18, "grad_norm": 13.375, "learning_rate": 4.163289362391348e-05, "loss": 0.937, "step": 197200 }, { "epoch": 0.18, "grad_norm": 219.0, "learning_rate": 4.162839455072255e-05, "loss": 1.0134, "step": 197300 }, { "epoch": 0.18, "grad_norm": 61.75, "learning_rate": 4.1623895477531635e-05, "loss": 1.0368, "step": 197400 }, { "epoch": 0.18, "grad_norm": 240.0, "learning_rate": 4.161939640434071e-05, "loss": 1.1006, "step": 197500 }, { "epoch": 0.18, "grad_norm": 42.0, "learning_rate": 4.161489733114978e-05, "loss": 0.9246, "step": 197600 }, { "epoch": 0.18, "grad_norm": 1.1796875, "learning_rate": 4.161039825795886e-05, "loss": 0.9707, "step": 197700 }, { "epoch": 0.18, "grad_norm": 25.375, "learning_rate": 4.160589918476794e-05, "loss": 1.123, "step": 197800 }, { "epoch": 0.18, "grad_norm": 68.5, "learning_rate": 4.160140011157702e-05, "loss": 1.0692, "step": 197900 }, { "epoch": 0.18, "grad_norm": 22.0, "learning_rate": 4.1596901038386094e-05, "loss": 0.9804, "step": 198000 }, { "epoch": 0.18, "grad_norm": 41.75, "learning_rate": 4.159240196519517e-05, "loss": 1.0816, "step": 198100 }, { "epoch": 0.18, "grad_norm": 0.91015625, "learning_rate": 4.158790289200425e-05, "loss": 1.0439, "step": 198200 }, { "epoch": 0.18, "grad_norm": 43.5, "learning_rate": 4.158340381881333e-05, "loss": 0.9404, "step": 198300 }, { "epoch": 0.18, "grad_norm": 52.5, "learning_rate": 4.15789047456224e-05, "loss": 1.0073, "step": 198400 }, { "epoch": 0.18, "grad_norm": 11.6875, "learning_rate": 4.1574405672431484e-05, "loss": 1.1357, "step": 198500 }, { "epoch": 0.18, "grad_norm": 0.00360107421875, "learning_rate": 4.156990659924056e-05, "loss": 1.0117, "step": 198600 }, { "epoch": 0.18, "grad_norm": 36.75, "learning_rate": 4.1565407526049635e-05, "loss": 1.038, "step": 198700 }, { "epoch": 0.18, "grad_norm": 22.375, "learning_rate": 4.156090845285872e-05, "loss": 1.1316, "step": 198800 }, { "epoch": 0.18, "grad_norm": 227.0, "learning_rate": 4.1556409379667786e-05, "loss": 0.9967, "step": 198900 }, { "epoch": 0.18, "grad_norm": 27.0, "learning_rate": 4.155191030647687e-05, "loss": 0.9362, "step": 199000 }, { "epoch": 0.18, "grad_norm": 7.75, "learning_rate": 4.154741123328594e-05, "loss": 1.0924, "step": 199100 }, { "epoch": 0.18, "grad_norm": 36.5, "learning_rate": 4.154291216009502e-05, "loss": 1.1432, "step": 199200 }, { "epoch": 0.18, "grad_norm": 1.0390625, "learning_rate": 4.15384130869041e-05, "loss": 0.9493, "step": 199300 }, { "epoch": 0.18, "grad_norm": 57.25, "learning_rate": 4.1533914013713176e-05, "loss": 0.9189, "step": 199400 }, { "epoch": 0.18, "grad_norm": 7.5, "learning_rate": 4.152941494052225e-05, "loss": 0.9244, "step": 199500 }, { "epoch": 0.18, "grad_norm": 21.0, "learning_rate": 4.1524915867331333e-05, "loss": 0.9848, "step": 199600 }, { "epoch": 0.18, "grad_norm": 57.75, "learning_rate": 4.152041679414041e-05, "loss": 0.9719, "step": 199700 }, { "epoch": 0.18, "grad_norm": 27.625, "learning_rate": 4.151591772094949e-05, "loss": 0.9785, "step": 199800 }, { "epoch": 0.18, "grad_norm": 136.0, "learning_rate": 4.1511418647758566e-05, "loss": 1.0904, "step": 199900 }, { "epoch": 0.18, "grad_norm": 30.0, "learning_rate": 4.150691957456764e-05, "loss": 0.952, "step": 200000 }, { "epoch": 0.18, "grad_norm": 137.0, "learning_rate": 4.1502420501376724e-05, "loss": 0.9827, "step": 200100 }, { "epoch": 0.18, "grad_norm": 44.5, "learning_rate": 4.149792142818579e-05, "loss": 1.0779, "step": 200200 }, { "epoch": 0.18, "grad_norm": 12.4375, "learning_rate": 4.149342235499487e-05, "loss": 1.0018, "step": 200300 }, { "epoch": 0.18, "grad_norm": 0.1005859375, "learning_rate": 4.148892328180395e-05, "loss": 1.0206, "step": 200400 }, { "epoch": 0.18, "grad_norm": 17.25, "learning_rate": 4.1484424208613025e-05, "loss": 1.2204, "step": 200500 }, { "epoch": 0.18, "grad_norm": 44.75, "learning_rate": 4.147992513542211e-05, "loss": 1.0894, "step": 200600 }, { "epoch": 0.18, "grad_norm": 91.0, "learning_rate": 4.147542606223118e-05, "loss": 1.1154, "step": 200700 }, { "epoch": 0.18, "grad_norm": 38.0, "learning_rate": 4.147092698904026e-05, "loss": 0.9523, "step": 200800 }, { "epoch": 0.18, "grad_norm": 43.25, "learning_rate": 4.146642791584934e-05, "loss": 1.0039, "step": 200900 }, { "epoch": 0.18, "grad_norm": 17.0, "learning_rate": 4.1461928842658415e-05, "loss": 1.0288, "step": 201000 }, { "epoch": 0.18, "grad_norm": 0.025146484375, "learning_rate": 4.145742976946749e-05, "loss": 0.9594, "step": 201100 }, { "epoch": 0.18, "grad_norm": 16.75, "learning_rate": 4.145293069627657e-05, "loss": 1.0622, "step": 201200 }, { "epoch": 0.18, "grad_norm": 58.0, "learning_rate": 4.144843162308565e-05, "loss": 1.0108, "step": 201300 }, { "epoch": 0.18, "grad_norm": 39.25, "learning_rate": 4.1443932549894723e-05, "loss": 1.1088, "step": 201400 }, { "epoch": 0.18, "grad_norm": 23.25, "learning_rate": 4.14394334767038e-05, "loss": 1.0485, "step": 201500 }, { "epoch": 0.18, "grad_norm": 81.0, "learning_rate": 4.1434934403512874e-05, "loss": 1.0323, "step": 201600 }, { "epoch": 0.18, "grad_norm": 20.375, "learning_rate": 4.1430435330321956e-05, "loss": 1.0112, "step": 201700 }, { "epoch": 0.18, "grad_norm": 25.25, "learning_rate": 4.142593625713103e-05, "loss": 1.0242, "step": 201800 }, { "epoch": 0.18, "grad_norm": 272.0, "learning_rate": 4.142143718394011e-05, "loss": 1.0801, "step": 201900 }, { "epoch": 0.18, "grad_norm": 69.0, "learning_rate": 4.141693811074919e-05, "loss": 0.8958, "step": 202000 }, { "epoch": 0.18, "grad_norm": 34.25, "learning_rate": 4.1412439037558264e-05, "loss": 1.025, "step": 202100 }, { "epoch": 0.18, "grad_norm": 47.0, "learning_rate": 4.140793996436734e-05, "loss": 1.058, "step": 202200 }, { "epoch": 0.18, "grad_norm": 122.5, "learning_rate": 4.140344089117642e-05, "loss": 0.9138, "step": 202300 }, { "epoch": 0.18, "grad_norm": 53.0, "learning_rate": 4.13989418179855e-05, "loss": 1.0506, "step": 202400 }, { "epoch": 0.18, "grad_norm": 50.25, "learning_rate": 4.139444274479458e-05, "loss": 0.8935, "step": 202500 }, { "epoch": 0.18, "grad_norm": 33.5, "learning_rate": 4.1389943671603655e-05, "loss": 1.0334, "step": 202600 }, { "epoch": 0.18, "grad_norm": 40.0, "learning_rate": 4.138544459841273e-05, "loss": 0.9227, "step": 202700 }, { "epoch": 0.18, "grad_norm": 23.25, "learning_rate": 4.1380945525221805e-05, "loss": 0.9657, "step": 202800 }, { "epoch": 0.18, "grad_norm": 0.578125, "learning_rate": 4.137644645203088e-05, "loss": 1.0117, "step": 202900 }, { "epoch": 0.18, "grad_norm": 0.00537109375, "learning_rate": 4.1371947378839956e-05, "loss": 1.1096, "step": 203000 }, { "epoch": 0.18, "grad_norm": 0.37890625, "learning_rate": 4.136744830564904e-05, "loss": 0.9909, "step": 203100 }, { "epoch": 0.18, "grad_norm": 42.75, "learning_rate": 4.1362949232458113e-05, "loss": 1.0561, "step": 203200 }, { "epoch": 0.18, "grad_norm": 19.25, "learning_rate": 4.1358450159267196e-05, "loss": 1.0458, "step": 203300 }, { "epoch": 0.18, "grad_norm": 23.0, "learning_rate": 4.135395108607627e-05, "loss": 1.0198, "step": 203400 }, { "epoch": 0.18, "grad_norm": 57.5, "learning_rate": 4.1349452012885346e-05, "loss": 1.0858, "step": 203500 }, { "epoch": 0.18, "grad_norm": 43.0, "learning_rate": 4.134495293969443e-05, "loss": 1.0132, "step": 203600 }, { "epoch": 0.18, "grad_norm": 0.4375, "learning_rate": 4.1340453866503504e-05, "loss": 1.0476, "step": 203700 }, { "epoch": 0.18, "grad_norm": 31.25, "learning_rate": 4.133595479331258e-05, "loss": 1.0296, "step": 203800 }, { "epoch": 0.18, "grad_norm": 207.0, "learning_rate": 4.133145572012166e-05, "loss": 1.0641, "step": 203900 }, { "epoch": 0.18, "grad_norm": 0.244140625, "learning_rate": 4.1326956646930736e-05, "loss": 0.934, "step": 204000 }, { "epoch": 0.18, "grad_norm": 32.0, "learning_rate": 4.132245757373981e-05, "loss": 1.0494, "step": 204100 }, { "epoch": 0.18, "grad_norm": 12.375, "learning_rate": 4.131795850054889e-05, "loss": 0.9449, "step": 204200 }, { "epoch": 0.18, "grad_norm": 21.375, "learning_rate": 4.131345942735796e-05, "loss": 1.1892, "step": 204300 }, { "epoch": 0.18, "grad_norm": 19.125, "learning_rate": 4.1308960354167045e-05, "loss": 0.9864, "step": 204400 }, { "epoch": 0.18, "grad_norm": 14.0625, "learning_rate": 4.130446128097612e-05, "loss": 1.0519, "step": 204500 }, { "epoch": 0.18, "grad_norm": 17.0, "learning_rate": 4.1299962207785195e-05, "loss": 0.8999, "step": 204600 }, { "epoch": 0.18, "grad_norm": 7.6875, "learning_rate": 4.129546313459428e-05, "loss": 1.1317, "step": 204700 }, { "epoch": 0.18, "grad_norm": 113.5, "learning_rate": 4.129096406140335e-05, "loss": 1.0446, "step": 204800 }, { "epoch": 0.18, "grad_norm": 39.0, "learning_rate": 4.128646498821243e-05, "loss": 1.0604, "step": 204900 }, { "epoch": 0.18, "grad_norm": 18.375, "learning_rate": 4.128196591502151e-05, "loss": 1.0099, "step": 205000 }, { "epoch": 0.18, "grad_norm": 34.0, "learning_rate": 4.1277466841830586e-05, "loss": 0.9905, "step": 205100 }, { "epoch": 0.18, "grad_norm": 45.5, "learning_rate": 4.127296776863967e-05, "loss": 0.8917, "step": 205200 }, { "epoch": 0.18, "grad_norm": 66.0, "learning_rate": 4.126846869544874e-05, "loss": 1.2461, "step": 205300 }, { "epoch": 0.18, "grad_norm": 37.0, "learning_rate": 4.126396962225781e-05, "loss": 1.0329, "step": 205400 }, { "epoch": 0.18, "grad_norm": 38.25, "learning_rate": 4.1259470549066894e-05, "loss": 1.0766, "step": 205500 }, { "epoch": 0.18, "grad_norm": 28.25, "learning_rate": 4.125497147587597e-05, "loss": 1.0072, "step": 205600 }, { "epoch": 0.18, "grad_norm": 92.5, "learning_rate": 4.1250472402685044e-05, "loss": 1.122, "step": 205700 }, { "epoch": 0.18, "grad_norm": 18.25, "learning_rate": 4.1245973329494126e-05, "loss": 0.9677, "step": 205800 }, { "epoch": 0.18, "grad_norm": 21.375, "learning_rate": 4.12414742563032e-05, "loss": 0.9124, "step": 205900 }, { "epoch": 0.18, "grad_norm": 25.0, "learning_rate": 4.1236975183112284e-05, "loss": 1.0021, "step": 206000 }, { "epoch": 0.18, "grad_norm": 19.75, "learning_rate": 4.123247610992136e-05, "loss": 0.9075, "step": 206100 }, { "epoch": 0.18, "grad_norm": 18.25, "learning_rate": 4.1227977036730435e-05, "loss": 0.9195, "step": 206200 }, { "epoch": 0.18, "grad_norm": 31.5, "learning_rate": 4.122347796353952e-05, "loss": 1.0138, "step": 206300 }, { "epoch": 0.18, "grad_norm": 68.5, "learning_rate": 4.121897889034859e-05, "loss": 1.0014, "step": 206400 }, { "epoch": 0.18, "grad_norm": 33.75, "learning_rate": 4.121447981715767e-05, "loss": 0.9441, "step": 206500 }, { "epoch": 0.18, "grad_norm": 18.125, "learning_rate": 4.120998074396675e-05, "loss": 0.9736, "step": 206600 }, { "epoch": 0.18, "grad_norm": 46.25, "learning_rate": 4.120548167077582e-05, "loss": 0.9764, "step": 206700 }, { "epoch": 0.18, "grad_norm": 45.5, "learning_rate": 4.12009825975849e-05, "loss": 1.0639, "step": 206800 }, { "epoch": 0.18, "grad_norm": 83.0, "learning_rate": 4.1196483524393976e-05, "loss": 1.1208, "step": 206900 }, { "epoch": 0.18, "grad_norm": 10.0, "learning_rate": 4.119198445120305e-05, "loss": 0.9852, "step": 207000 }, { "epoch": 0.18, "grad_norm": 0.0032501220703125, "learning_rate": 4.118748537801213e-05, "loss": 0.9806, "step": 207100 }, { "epoch": 0.18, "grad_norm": 158.0, "learning_rate": 4.118298630482121e-05, "loss": 1.0451, "step": 207200 }, { "epoch": 0.18, "grad_norm": 27.625, "learning_rate": 4.1178487231630284e-05, "loss": 0.9113, "step": 207300 }, { "epoch": 0.18, "grad_norm": 17.375, "learning_rate": 4.1173988158439366e-05, "loss": 0.9102, "step": 207400 }, { "epoch": 0.18, "grad_norm": 0.1767578125, "learning_rate": 4.116948908524844e-05, "loss": 1.0888, "step": 207500 }, { "epoch": 0.18, "grad_norm": 45.25, "learning_rate": 4.1164990012057516e-05, "loss": 0.9614, "step": 207600 }, { "epoch": 0.19, "grad_norm": 91.5, "learning_rate": 4.11604909388666e-05, "loss": 1.0828, "step": 207700 }, { "epoch": 0.19, "grad_norm": 14.125, "learning_rate": 4.1155991865675674e-05, "loss": 0.975, "step": 207800 }, { "epoch": 0.19, "grad_norm": 40.25, "learning_rate": 4.1151492792484756e-05, "loss": 1.1039, "step": 207900 }, { "epoch": 0.19, "grad_norm": 41.0, "learning_rate": 4.1146993719293825e-05, "loss": 1.0925, "step": 208000 }, { "epoch": 0.19, "grad_norm": 3.234375, "learning_rate": 4.11424946461029e-05, "loss": 1.0953, "step": 208100 }, { "epoch": 0.19, "grad_norm": 0.0021209716796875, "learning_rate": 4.113799557291198e-05, "loss": 1.0687, "step": 208200 }, { "epoch": 0.19, "grad_norm": 7.09375, "learning_rate": 4.113349649972106e-05, "loss": 0.8731, "step": 208300 }, { "epoch": 0.19, "grad_norm": 53.5, "learning_rate": 4.112899742653013e-05, "loss": 1.0374, "step": 208400 }, { "epoch": 0.19, "grad_norm": 3.625, "learning_rate": 4.1124498353339215e-05, "loss": 1.2792, "step": 208500 }, { "epoch": 0.19, "grad_norm": 19.875, "learning_rate": 4.111999928014829e-05, "loss": 0.9054, "step": 208600 }, { "epoch": 0.19, "grad_norm": 33.0, "learning_rate": 4.111550020695737e-05, "loss": 1.1398, "step": 208700 }, { "epoch": 0.19, "grad_norm": 23.0, "learning_rate": 4.111100113376645e-05, "loss": 1.0491, "step": 208800 }, { "epoch": 0.19, "grad_norm": 54.0, "learning_rate": 4.110650206057552e-05, "loss": 0.9972, "step": 208900 }, { "epoch": 0.19, "grad_norm": 22.875, "learning_rate": 4.1102002987384605e-05, "loss": 0.9967, "step": 209000 }, { "epoch": 0.19, "grad_norm": 24.25, "learning_rate": 4.109750391419368e-05, "loss": 1.0893, "step": 209100 }, { "epoch": 0.19, "grad_norm": 117.5, "learning_rate": 4.1093004841002756e-05, "loss": 1.0223, "step": 209200 }, { "epoch": 0.19, "grad_norm": 0.00634765625, "learning_rate": 4.108850576781183e-05, "loss": 1.0812, "step": 209300 }, { "epoch": 0.19, "grad_norm": 0.012939453125, "learning_rate": 4.1084006694620906e-05, "loss": 0.9931, "step": 209400 }, { "epoch": 0.19, "grad_norm": 20.625, "learning_rate": 4.107950762142999e-05, "loss": 0.9572, "step": 209500 }, { "epoch": 0.19, "grad_norm": 15.3125, "learning_rate": 4.1075008548239064e-05, "loss": 1.0195, "step": 209600 }, { "epoch": 0.19, "grad_norm": 104.5, "learning_rate": 4.107050947504814e-05, "loss": 0.9802, "step": 209700 }, { "epoch": 0.19, "grad_norm": 9.5, "learning_rate": 4.106601040185722e-05, "loss": 1.0643, "step": 209800 }, { "epoch": 0.19, "grad_norm": 84.0, "learning_rate": 4.10615113286663e-05, "loss": 0.8721, "step": 209900 }, { "epoch": 0.19, "grad_norm": 27.375, "learning_rate": 4.105701225547537e-05, "loss": 0.9599, "step": 210000 }, { "epoch": 0.19, "grad_norm": 0.037841796875, "learning_rate": 4.1052513182284454e-05, "loss": 0.9251, "step": 210100 }, { "epoch": 0.19, "grad_norm": 25.25, "learning_rate": 4.104801410909353e-05, "loss": 1.015, "step": 210200 }, { "epoch": 0.19, "grad_norm": 100.5, "learning_rate": 4.1043515035902605e-05, "loss": 1.0445, "step": 210300 }, { "epoch": 0.19, "grad_norm": 75.0, "learning_rate": 4.103901596271169e-05, "loss": 0.9973, "step": 210400 }, { "epoch": 0.19, "grad_norm": 249.0, "learning_rate": 4.103451688952076e-05, "loss": 0.9951, "step": 210500 }, { "epoch": 0.19, "grad_norm": 0.201171875, "learning_rate": 4.103001781632984e-05, "loss": 1.1118, "step": 210600 }, { "epoch": 0.19, "grad_norm": 60.75, "learning_rate": 4.102551874313891e-05, "loss": 0.9056, "step": 210700 }, { "epoch": 0.19, "grad_norm": 17.0, "learning_rate": 4.102101966994799e-05, "loss": 1.0998, "step": 210800 }, { "epoch": 0.19, "grad_norm": 20.625, "learning_rate": 4.101652059675707e-05, "loss": 1.0499, "step": 210900 }, { "epoch": 0.19, "grad_norm": 25.5, "learning_rate": 4.1012021523566146e-05, "loss": 0.9896, "step": 211000 }, { "epoch": 0.19, "grad_norm": 0.002227783203125, "learning_rate": 4.100752245037522e-05, "loss": 0.9199, "step": 211100 }, { "epoch": 0.19, "grad_norm": 9.6875, "learning_rate": 4.10030233771843e-05, "loss": 0.9904, "step": 211200 }, { "epoch": 0.19, "grad_norm": 14.8125, "learning_rate": 4.099852430399338e-05, "loss": 1.1601, "step": 211300 }, { "epoch": 0.19, "grad_norm": 22.5, "learning_rate": 4.099402523080246e-05, "loss": 1.0315, "step": 211400 }, { "epoch": 0.19, "grad_norm": 43.75, "learning_rate": 4.0989526157611536e-05, "loss": 0.9983, "step": 211500 }, { "epoch": 0.19, "grad_norm": 0.36328125, "learning_rate": 4.098502708442061e-05, "loss": 1.0844, "step": 211600 }, { "epoch": 0.19, "grad_norm": 10.25, "learning_rate": 4.0980528011229693e-05, "loss": 1.0916, "step": 211700 }, { "epoch": 0.19, "grad_norm": 29.0, "learning_rate": 4.097602893803877e-05, "loss": 1.0555, "step": 211800 }, { "epoch": 0.19, "grad_norm": 27.875, "learning_rate": 4.097152986484784e-05, "loss": 1.0497, "step": 211900 }, { "epoch": 0.19, "grad_norm": 16.625, "learning_rate": 4.096703079165692e-05, "loss": 0.8473, "step": 212000 }, { "epoch": 0.19, "grad_norm": 19.125, "learning_rate": 4.0962531718465995e-05, "loss": 1.0515, "step": 212100 }, { "epoch": 0.19, "grad_norm": 35.5, "learning_rate": 4.095803264527508e-05, "loss": 0.965, "step": 212200 }, { "epoch": 0.19, "grad_norm": 50.5, "learning_rate": 4.095353357208415e-05, "loss": 1.0608, "step": 212300 }, { "epoch": 0.19, "grad_norm": 11.9375, "learning_rate": 4.094903449889323e-05, "loss": 1.0581, "step": 212400 }, { "epoch": 0.19, "grad_norm": 61.5, "learning_rate": 4.094453542570231e-05, "loss": 0.973, "step": 212500 }, { "epoch": 0.19, "grad_norm": 26.5, "learning_rate": 4.0940036352511385e-05, "loss": 0.866, "step": 212600 }, { "epoch": 0.19, "grad_norm": 21.0, "learning_rate": 4.093553727932046e-05, "loss": 1.1619, "step": 212700 }, { "epoch": 0.19, "grad_norm": 20.5, "learning_rate": 4.093103820612954e-05, "loss": 1.0849, "step": 212800 }, { "epoch": 0.19, "grad_norm": 20.875, "learning_rate": 4.092653913293862e-05, "loss": 0.8512, "step": 212900 }, { "epoch": 0.19, "grad_norm": 984.0, "learning_rate": 4.092204005974769e-05, "loss": 1.0211, "step": 213000 }, { "epoch": 0.19, "grad_norm": 12.875, "learning_rate": 4.0917540986556775e-05, "loss": 0.996, "step": 213100 }, { "epoch": 0.19, "grad_norm": 11.5, "learning_rate": 4.0913041913365844e-05, "loss": 1.1139, "step": 213200 }, { "epoch": 0.19, "grad_norm": 11.125, "learning_rate": 4.0908542840174926e-05, "loss": 1.0224, "step": 213300 }, { "epoch": 0.19, "grad_norm": 1.484375, "learning_rate": 4.0904043766984e-05, "loss": 1.0536, "step": 213400 }, { "epoch": 0.19, "grad_norm": 60.25, "learning_rate": 4.089954469379308e-05, "loss": 1.1269, "step": 213500 }, { "epoch": 0.19, "grad_norm": 12.3125, "learning_rate": 4.089504562060216e-05, "loss": 1.1374, "step": 213600 }, { "epoch": 0.19, "grad_norm": 14.5625, "learning_rate": 4.0890546547411234e-05, "loss": 1.0655, "step": 213700 }, { "epoch": 0.19, "grad_norm": 4.15625, "learning_rate": 4.088604747422031e-05, "loss": 1.0169, "step": 213800 }, { "epoch": 0.19, "grad_norm": 154.0, "learning_rate": 4.088154840102939e-05, "loss": 0.9249, "step": 213900 }, { "epoch": 0.19, "grad_norm": 0.12255859375, "learning_rate": 4.087704932783847e-05, "loss": 0.9403, "step": 214000 }, { "epoch": 0.19, "grad_norm": 29.25, "learning_rate": 4.087255025464755e-05, "loss": 0.9346, "step": 214100 }, { "epoch": 0.19, "grad_norm": 175.0, "learning_rate": 4.0868051181456624e-05, "loss": 0.8497, "step": 214200 }, { "epoch": 0.19, "grad_norm": 66.5, "learning_rate": 4.08635521082657e-05, "loss": 1.0652, "step": 214300 }, { "epoch": 0.19, "grad_norm": 17.25, "learning_rate": 4.085905303507478e-05, "loss": 0.9883, "step": 214400 }, { "epoch": 0.19, "grad_norm": 27.75, "learning_rate": 4.085455396188385e-05, "loss": 0.8639, "step": 214500 }, { "epoch": 0.19, "grad_norm": 41.5, "learning_rate": 4.0850054888692926e-05, "loss": 1.0497, "step": 214600 }, { "epoch": 0.19, "grad_norm": 20.25, "learning_rate": 4.084555581550201e-05, "loss": 1.037, "step": 214700 }, { "epoch": 0.19, "grad_norm": 21.25, "learning_rate": 4.084105674231108e-05, "loss": 0.9879, "step": 214800 }, { "epoch": 0.19, "grad_norm": 43.0, "learning_rate": 4.0836557669120165e-05, "loss": 0.9982, "step": 214900 }, { "epoch": 0.19, "grad_norm": 39.75, "learning_rate": 4.083205859592924e-05, "loss": 1.0842, "step": 215000 }, { "epoch": 0.19, "grad_norm": 34.0, "learning_rate": 4.0827559522738316e-05, "loss": 0.9924, "step": 215100 }, { "epoch": 0.19, "grad_norm": 13.75, "learning_rate": 4.08230604495474e-05, "loss": 1.0327, "step": 215200 }, { "epoch": 0.19, "grad_norm": 15.0625, "learning_rate": 4.0818561376356473e-05, "loss": 1.0646, "step": 215300 }, { "epoch": 0.19, "grad_norm": 0.02099609375, "learning_rate": 4.081406230316555e-05, "loss": 0.9434, "step": 215400 }, { "epoch": 0.19, "grad_norm": 177.0, "learning_rate": 4.080956322997463e-05, "loss": 1.0415, "step": 215500 }, { "epoch": 0.19, "grad_norm": 94.5, "learning_rate": 4.0805064156783706e-05, "loss": 0.9127, "step": 215600 }, { "epoch": 0.19, "grad_norm": 44.0, "learning_rate": 4.080056508359278e-05, "loss": 1.0076, "step": 215700 }, { "epoch": 0.19, "grad_norm": 34.5, "learning_rate": 4.079606601040186e-05, "loss": 1.0544, "step": 215800 }, { "epoch": 0.19, "grad_norm": 44.75, "learning_rate": 4.079156693721093e-05, "loss": 0.9754, "step": 215900 }, { "epoch": 0.19, "grad_norm": 17.5, "learning_rate": 4.0787067864020014e-05, "loss": 0.9611, "step": 216000 }, { "epoch": 0.19, "grad_norm": 67.0, "learning_rate": 4.078256879082909e-05, "loss": 1.1344, "step": 216100 }, { "epoch": 0.19, "grad_norm": 21.625, "learning_rate": 4.0778069717638165e-05, "loss": 0.921, "step": 216200 }, { "epoch": 0.19, "grad_norm": 6.21875, "learning_rate": 4.077357064444725e-05, "loss": 1.1117, "step": 216300 }, { "epoch": 0.19, "grad_norm": 21.5, "learning_rate": 4.076907157125632e-05, "loss": 1.0584, "step": 216400 }, { "epoch": 0.19, "grad_norm": 28.875, "learning_rate": 4.07645724980654e-05, "loss": 0.8819, "step": 216500 }, { "epoch": 0.19, "grad_norm": 14.3125, "learning_rate": 4.076007342487448e-05, "loss": 1.1318, "step": 216600 }, { "epoch": 0.19, "grad_norm": 106.5, "learning_rate": 4.0755574351683555e-05, "loss": 1.033, "step": 216700 }, { "epoch": 0.19, "grad_norm": 67.0, "learning_rate": 4.075107527849264e-05, "loss": 0.989, "step": 216800 }, { "epoch": 0.19, "grad_norm": 17.5, "learning_rate": 4.074657620530171e-05, "loss": 0.9674, "step": 216900 }, { "epoch": 0.19, "grad_norm": 20.0, "learning_rate": 4.074207713211079e-05, "loss": 1.0205, "step": 217000 }, { "epoch": 0.19, "grad_norm": 56.5, "learning_rate": 4.0737578058919863e-05, "loss": 1.0161, "step": 217100 }, { "epoch": 0.19, "grad_norm": 41.25, "learning_rate": 4.073307898572894e-05, "loss": 1.0059, "step": 217200 }, { "epoch": 0.19, "grad_norm": 19.0, "learning_rate": 4.0728579912538014e-05, "loss": 1.0873, "step": 217300 }, { "epoch": 0.19, "grad_norm": 99.0, "learning_rate": 4.0724080839347096e-05, "loss": 1.0004, "step": 217400 }, { "epoch": 0.19, "grad_norm": 9.0625, "learning_rate": 4.071958176615617e-05, "loss": 1.08, "step": 217500 }, { "epoch": 0.19, "grad_norm": 0.388671875, "learning_rate": 4.0715082692965254e-05, "loss": 0.8686, "step": 217600 }, { "epoch": 0.19, "grad_norm": 99.5, "learning_rate": 4.071058361977433e-05, "loss": 0.9807, "step": 217700 }, { "epoch": 0.19, "grad_norm": 76.5, "learning_rate": 4.0706084546583404e-05, "loss": 1.188, "step": 217800 }, { "epoch": 0.19, "grad_norm": 40.0, "learning_rate": 4.0701585473392486e-05, "loss": 1.0733, "step": 217900 }, { "epoch": 0.19, "grad_norm": 29.625, "learning_rate": 4.069708640020156e-05, "loss": 1.0241, "step": 218000 }, { "epoch": 0.19, "grad_norm": 53.75, "learning_rate": 4.069258732701064e-05, "loss": 1.0858, "step": 218100 }, { "epoch": 0.19, "grad_norm": 154.0, "learning_rate": 4.068808825381972e-05, "loss": 0.8302, "step": 218200 }, { "epoch": 0.19, "grad_norm": 270.0, "learning_rate": 4.0683589180628795e-05, "loss": 1.0766, "step": 218300 }, { "epoch": 0.19, "grad_norm": 18.75, "learning_rate": 4.067909010743787e-05, "loss": 1.0029, "step": 218400 }, { "epoch": 0.19, "grad_norm": 0.453125, "learning_rate": 4.0674591034246945e-05, "loss": 1.0616, "step": 218500 }, { "epoch": 0.19, "grad_norm": 6.21875, "learning_rate": 4.067009196105602e-05, "loss": 1.1286, "step": 218600 }, { "epoch": 0.19, "grad_norm": 1.390625, "learning_rate": 4.06655928878651e-05, "loss": 1.0094, "step": 218700 }, { "epoch": 0.19, "grad_norm": 0.0283203125, "learning_rate": 4.066109381467418e-05, "loss": 1.0558, "step": 218800 }, { "epoch": 0.19, "grad_norm": 6.46875, "learning_rate": 4.0656594741483253e-05, "loss": 0.9468, "step": 218900 }, { "epoch": 0.2, "grad_norm": 27.25, "learning_rate": 4.0652095668292336e-05, "loss": 0.9764, "step": 219000 }, { "epoch": 0.2, "grad_norm": 61.25, "learning_rate": 4.064759659510141e-05, "loss": 1.0133, "step": 219100 }, { "epoch": 0.2, "grad_norm": 67.5, "learning_rate": 4.0643097521910486e-05, "loss": 1.0726, "step": 219200 }, { "epoch": 0.2, "grad_norm": 33.75, "learning_rate": 4.063859844871957e-05, "loss": 1.1053, "step": 219300 }, { "epoch": 0.2, "grad_norm": 0.07470703125, "learning_rate": 4.0634099375528644e-05, "loss": 1.0408, "step": 219400 }, { "epoch": 0.2, "grad_norm": 35.0, "learning_rate": 4.0629600302337726e-05, "loss": 1.1241, "step": 219500 }, { "epoch": 0.2, "grad_norm": 174.0, "learning_rate": 4.06251012291468e-05, "loss": 0.8645, "step": 219600 }, { "epoch": 0.2, "grad_norm": 49.5, "learning_rate": 4.062060215595587e-05, "loss": 0.9771, "step": 219700 }, { "epoch": 0.2, "grad_norm": 0.1708984375, "learning_rate": 4.061610308276495e-05, "loss": 1.0401, "step": 219800 }, { "epoch": 0.2, "grad_norm": 101.5, "learning_rate": 4.061160400957403e-05, "loss": 1.0759, "step": 219900 }, { "epoch": 0.2, "grad_norm": 82.0, "learning_rate": 4.06071049363831e-05, "loss": 0.9021, "step": 220000 }, { "epoch": 0.2, "grad_norm": 23.875, "learning_rate": 4.0602605863192185e-05, "loss": 1.0224, "step": 220100 }, { "epoch": 0.2, "grad_norm": 28.875, "learning_rate": 4.059810679000126e-05, "loss": 1.015, "step": 220200 }, { "epoch": 0.2, "grad_norm": 51.75, "learning_rate": 4.059360771681034e-05, "loss": 0.926, "step": 220300 }, { "epoch": 0.2, "grad_norm": 21.25, "learning_rate": 4.058910864361942e-05, "loss": 0.9364, "step": 220400 }, { "epoch": 0.2, "grad_norm": 61.75, "learning_rate": 4.058460957042849e-05, "loss": 1.0805, "step": 220500 }, { "epoch": 0.2, "grad_norm": 17.625, "learning_rate": 4.0580110497237575e-05, "loss": 0.9326, "step": 220600 }, { "epoch": 0.2, "grad_norm": 54.75, "learning_rate": 4.057561142404665e-05, "loss": 1.1103, "step": 220700 }, { "epoch": 0.2, "grad_norm": 42.0, "learning_rate": 4.0571112350855726e-05, "loss": 1.1509, "step": 220800 }, { "epoch": 0.2, "grad_norm": 52.5, "learning_rate": 4.056661327766481e-05, "loss": 0.9183, "step": 220900 }, { "epoch": 0.2, "grad_norm": 66.5, "learning_rate": 4.0562114204473876e-05, "loss": 1.103, "step": 221000 }, { "epoch": 0.2, "grad_norm": 25.75, "learning_rate": 4.055761513128296e-05, "loss": 1.091, "step": 221100 }, { "epoch": 0.2, "grad_norm": 32.5, "learning_rate": 4.0553116058092034e-05, "loss": 1.1038, "step": 221200 }, { "epoch": 0.2, "grad_norm": 0.0054931640625, "learning_rate": 4.054861698490111e-05, "loss": 1.0682, "step": 221300 }, { "epoch": 0.2, "grad_norm": 15.4375, "learning_rate": 4.054411791171019e-05, "loss": 1.1342, "step": 221400 }, { "epoch": 0.2, "grad_norm": 8.5, "learning_rate": 4.0539618838519266e-05, "loss": 0.8835, "step": 221500 }, { "epoch": 0.2, "grad_norm": 78.5, "learning_rate": 4.053511976532834e-05, "loss": 0.9309, "step": 221600 }, { "epoch": 0.2, "grad_norm": 14.0625, "learning_rate": 4.0530620692137424e-05, "loss": 0.9811, "step": 221700 }, { "epoch": 0.2, "grad_norm": 40.0, "learning_rate": 4.05261216189465e-05, "loss": 0.9188, "step": 221800 }, { "epoch": 0.2, "grad_norm": 14.875, "learning_rate": 4.0521622545755575e-05, "loss": 1.0862, "step": 221900 }, { "epoch": 0.2, "grad_norm": 103.5, "learning_rate": 4.051712347256466e-05, "loss": 0.9534, "step": 222000 }, { "epoch": 0.2, "grad_norm": 2.671875, "learning_rate": 4.051262439937373e-05, "loss": 1.0598, "step": 222100 }, { "epoch": 0.2, "grad_norm": 32.75, "learning_rate": 4.0508125326182814e-05, "loss": 0.9286, "step": 222200 }, { "epoch": 0.2, "grad_norm": 16.125, "learning_rate": 4.050362625299188e-05, "loss": 1.0089, "step": 222300 }, { "epoch": 0.2, "grad_norm": 30.75, "learning_rate": 4.049912717980096e-05, "loss": 1.0282, "step": 222400 }, { "epoch": 0.2, "grad_norm": 28.125, "learning_rate": 4.049462810661004e-05, "loss": 1.1468, "step": 222500 }, { "epoch": 0.2, "grad_norm": 23.875, "learning_rate": 4.0490129033419116e-05, "loss": 1.0639, "step": 222600 }, { "epoch": 0.2, "grad_norm": 24.875, "learning_rate": 4.048562996022819e-05, "loss": 1.0247, "step": 222700 }, { "epoch": 0.2, "grad_norm": 65.0, "learning_rate": 4.048113088703727e-05, "loss": 1.0286, "step": 222800 }, { "epoch": 0.2, "grad_norm": 27.375, "learning_rate": 4.047663181384635e-05, "loss": 1.024, "step": 222900 }, { "epoch": 0.2, "grad_norm": 77.5, "learning_rate": 4.047213274065543e-05, "loss": 0.9071, "step": 223000 }, { "epoch": 0.2, "grad_norm": 50.75, "learning_rate": 4.0467633667464506e-05, "loss": 1.0451, "step": 223100 }, { "epoch": 0.2, "grad_norm": 20.25, "learning_rate": 4.046313459427358e-05, "loss": 0.9831, "step": 223200 }, { "epoch": 0.2, "grad_norm": 14.4375, "learning_rate": 4.045863552108266e-05, "loss": 0.955, "step": 223300 }, { "epoch": 0.2, "grad_norm": 16.375, "learning_rate": 4.045413644789174e-05, "loss": 1.0547, "step": 223400 }, { "epoch": 0.2, "grad_norm": 10.25, "learning_rate": 4.0449637374700814e-05, "loss": 1.034, "step": 223500 }, { "epoch": 0.2, "grad_norm": 11.625, "learning_rate": 4.044513830150989e-05, "loss": 1.0968, "step": 223600 }, { "epoch": 0.2, "grad_norm": 100.0, "learning_rate": 4.0440639228318965e-05, "loss": 1.0565, "step": 223700 }, { "epoch": 0.2, "grad_norm": 20.375, "learning_rate": 4.043614015512805e-05, "loss": 1.0507, "step": 223800 }, { "epoch": 0.2, "grad_norm": 12.5, "learning_rate": 4.043164108193712e-05, "loss": 0.9951, "step": 223900 }, { "epoch": 0.2, "grad_norm": 524.0, "learning_rate": 4.04271420087462e-05, "loss": 0.9851, "step": 224000 }, { "epoch": 0.2, "grad_norm": 61.25, "learning_rate": 4.042264293555528e-05, "loss": 1.0471, "step": 224100 }, { "epoch": 0.2, "grad_norm": 29.875, "learning_rate": 4.0418143862364355e-05, "loss": 1.0684, "step": 224200 }, { "epoch": 0.2, "grad_norm": 21.625, "learning_rate": 4.041364478917343e-05, "loss": 0.9652, "step": 224300 }, { "epoch": 0.2, "grad_norm": 7.25, "learning_rate": 4.040914571598251e-05, "loss": 1.1236, "step": 224400 }, { "epoch": 0.2, "grad_norm": 0.46484375, "learning_rate": 4.040464664279159e-05, "loss": 1.0768, "step": 224500 }, { "epoch": 0.2, "grad_norm": 14.4375, "learning_rate": 4.040014756960066e-05, "loss": 1.1431, "step": 224600 }, { "epoch": 0.2, "grad_norm": 11.0625, "learning_rate": 4.0395648496409745e-05, "loss": 0.9607, "step": 224700 }, { "epoch": 0.2, "grad_norm": 29.125, "learning_rate": 4.039114942321882e-05, "loss": 0.905, "step": 224800 }, { "epoch": 0.2, "grad_norm": 21.125, "learning_rate": 4.0386650350027896e-05, "loss": 1.0986, "step": 224900 }, { "epoch": 0.2, "grad_norm": 0.00518798828125, "learning_rate": 4.038215127683697e-05, "loss": 0.9506, "step": 225000 }, { "epoch": 0.2, "grad_norm": 8.3125, "learning_rate": 4.0377652203646046e-05, "loss": 1.1942, "step": 225100 }, { "epoch": 0.2, "grad_norm": 21.25, "learning_rate": 4.037315313045513e-05, "loss": 1.0557, "step": 225200 }, { "epoch": 0.2, "grad_norm": 20.875, "learning_rate": 4.0368654057264204e-05, "loss": 1.0554, "step": 225300 }, { "epoch": 0.2, "grad_norm": 28.0, "learning_rate": 4.036415498407328e-05, "loss": 1.0826, "step": 225400 }, { "epoch": 0.2, "grad_norm": 161.0, "learning_rate": 4.035965591088236e-05, "loss": 1.1307, "step": 225500 }, { "epoch": 0.2, "grad_norm": 0.047607421875, "learning_rate": 4.035515683769144e-05, "loss": 1.0308, "step": 225600 }, { "epoch": 0.2, "grad_norm": 16.0, "learning_rate": 4.035065776450052e-05, "loss": 0.9722, "step": 225700 }, { "epoch": 0.2, "grad_norm": 13.4375, "learning_rate": 4.0346158691309594e-05, "loss": 1.0784, "step": 225800 }, { "epoch": 0.2, "grad_norm": 0.84765625, "learning_rate": 4.034165961811867e-05, "loss": 1.0371, "step": 225900 }, { "epoch": 0.2, "grad_norm": 18.875, "learning_rate": 4.033716054492775e-05, "loss": 1.1102, "step": 226000 }, { "epoch": 0.2, "grad_norm": 22.875, "learning_rate": 4.033266147173683e-05, "loss": 0.9886, "step": 226100 }, { "epoch": 0.2, "grad_norm": 1.3671875, "learning_rate": 4.03281623985459e-05, "loss": 1.126, "step": 226200 }, { "epoch": 0.2, "grad_norm": 0.043212890625, "learning_rate": 4.032366332535498e-05, "loss": 1.0707, "step": 226300 }, { "epoch": 0.2, "grad_norm": 24.125, "learning_rate": 4.031916425216405e-05, "loss": 1.0397, "step": 226400 }, { "epoch": 0.2, "grad_norm": 6.125, "learning_rate": 4.0314665178973135e-05, "loss": 1.1157, "step": 226500 }, { "epoch": 0.2, "grad_norm": 0.10009765625, "learning_rate": 4.031016610578221e-05, "loss": 1.0381, "step": 226600 }, { "epoch": 0.2, "grad_norm": 22.25, "learning_rate": 4.0305667032591286e-05, "loss": 1.0159, "step": 226700 }, { "epoch": 0.2, "grad_norm": 0.031982421875, "learning_rate": 4.030116795940037e-05, "loss": 1.0236, "step": 226800 }, { "epoch": 0.2, "grad_norm": 7.53125, "learning_rate": 4.029666888620944e-05, "loss": 1.0486, "step": 226900 }, { "epoch": 0.2, "grad_norm": 0.00921630859375, "learning_rate": 4.029216981301852e-05, "loss": 1.0281, "step": 227000 }, { "epoch": 0.2, "grad_norm": 51.75, "learning_rate": 4.02876707398276e-05, "loss": 1.1135, "step": 227100 }, { "epoch": 0.2, "grad_norm": 15.25, "learning_rate": 4.0283171666636676e-05, "loss": 1.1518, "step": 227200 }, { "epoch": 0.2, "grad_norm": 0.0830078125, "learning_rate": 4.027867259344575e-05, "loss": 0.9774, "step": 227300 }, { "epoch": 0.2, "grad_norm": 32.0, "learning_rate": 4.0274173520254833e-05, "loss": 0.9246, "step": 227400 }, { "epoch": 0.2, "grad_norm": 296.0, "learning_rate": 4.02696744470639e-05, "loss": 1.0522, "step": 227500 }, { "epoch": 0.2, "grad_norm": 952.0, "learning_rate": 4.0265175373872984e-05, "loss": 0.8872, "step": 227600 }, { "epoch": 0.2, "grad_norm": 44.25, "learning_rate": 4.026067630068206e-05, "loss": 0.9439, "step": 227700 }, { "epoch": 0.2, "grad_norm": 0.01416015625, "learning_rate": 4.0256177227491135e-05, "loss": 0.9362, "step": 227800 }, { "epoch": 0.2, "grad_norm": 14.9375, "learning_rate": 4.025167815430022e-05, "loss": 1.0321, "step": 227900 }, { "epoch": 0.2, "grad_norm": 49.75, "learning_rate": 4.024717908110929e-05, "loss": 0.8969, "step": 228000 }, { "epoch": 0.2, "grad_norm": 34.5, "learning_rate": 4.024268000791837e-05, "loss": 0.9787, "step": 228100 }, { "epoch": 0.2, "grad_norm": 18.25, "learning_rate": 4.023818093472745e-05, "loss": 1.0048, "step": 228200 }, { "epoch": 0.2, "grad_norm": 18.125, "learning_rate": 4.0233681861536525e-05, "loss": 1.1526, "step": 228300 }, { "epoch": 0.2, "grad_norm": 96.0, "learning_rate": 4.022918278834561e-05, "loss": 1.0252, "step": 228400 }, { "epoch": 0.2, "grad_norm": 18.375, "learning_rate": 4.022468371515468e-05, "loss": 0.9873, "step": 228500 }, { "epoch": 0.2, "grad_norm": 28.125, "learning_rate": 4.022018464196376e-05, "loss": 1.0147, "step": 228600 }, { "epoch": 0.2, "grad_norm": 22.375, "learning_rate": 4.021568556877284e-05, "loss": 0.9519, "step": 228700 }, { "epoch": 0.2, "grad_norm": 40.0, "learning_rate": 4.021118649558191e-05, "loss": 1.1077, "step": 228800 }, { "epoch": 0.2, "grad_norm": 4.84375, "learning_rate": 4.0206687422390984e-05, "loss": 0.9326, "step": 228900 }, { "epoch": 0.2, "grad_norm": 0.341796875, "learning_rate": 4.0202188349200066e-05, "loss": 1.0734, "step": 229000 }, { "epoch": 0.2, "grad_norm": 5.9375, "learning_rate": 4.019768927600914e-05, "loss": 0.9805, "step": 229100 }, { "epoch": 0.2, "grad_norm": 48.5, "learning_rate": 4.0193190202818223e-05, "loss": 0.9345, "step": 229200 }, { "epoch": 0.2, "grad_norm": 85.0, "learning_rate": 4.01886911296273e-05, "loss": 0.9318, "step": 229300 }, { "epoch": 0.2, "grad_norm": 56.25, "learning_rate": 4.0184192056436374e-05, "loss": 1.0994, "step": 229400 }, { "epoch": 0.2, "grad_norm": 25.25, "learning_rate": 4.0179692983245456e-05, "loss": 1.113, "step": 229500 }, { "epoch": 0.2, "grad_norm": 37.0, "learning_rate": 4.017519391005453e-05, "loss": 0.8794, "step": 229600 }, { "epoch": 0.2, "grad_norm": 35.5, "learning_rate": 4.017069483686361e-05, "loss": 1.0169, "step": 229700 }, { "epoch": 0.2, "grad_norm": 25.625, "learning_rate": 4.016619576367269e-05, "loss": 1.022, "step": 229800 }, { "epoch": 0.2, "grad_norm": 49.25, "learning_rate": 4.0161696690481764e-05, "loss": 1.0459, "step": 229900 }, { "epoch": 0.2, "grad_norm": 334.0, "learning_rate": 4.015719761729084e-05, "loss": 0.9685, "step": 230000 }, { "epoch": 0.2, "grad_norm": 17.375, "learning_rate": 4.0152698544099915e-05, "loss": 1.1808, "step": 230100 }, { "epoch": 0.21, "grad_norm": 0.310546875, "learning_rate": 4.014819947090899e-05, "loss": 1.0379, "step": 230200 }, { "epoch": 0.21, "grad_norm": 89.0, "learning_rate": 4.014370039771807e-05, "loss": 0.8973, "step": 230300 }, { "epoch": 0.21, "grad_norm": 3.28125, "learning_rate": 4.013920132452715e-05, "loss": 1.0262, "step": 230400 }, { "epoch": 0.21, "grad_norm": 14.4375, "learning_rate": 4.013470225133622e-05, "loss": 0.9842, "step": 230500 }, { "epoch": 0.21, "grad_norm": 7.625, "learning_rate": 4.0130203178145305e-05, "loss": 1.067, "step": 230600 }, { "epoch": 0.21, "grad_norm": 284.0, "learning_rate": 4.012570410495438e-05, "loss": 0.9661, "step": 230700 }, { "epoch": 0.21, "grad_norm": 24.125, "learning_rate": 4.0121205031763456e-05, "loss": 0.9486, "step": 230800 }, { "epoch": 0.21, "grad_norm": 0.01275634765625, "learning_rate": 4.011670595857254e-05, "loss": 1.0253, "step": 230900 }, { "epoch": 0.21, "grad_norm": 43.75, "learning_rate": 4.0112206885381613e-05, "loss": 1.0081, "step": 231000 }, { "epoch": 0.21, "grad_norm": 66.0, "learning_rate": 4.0107707812190696e-05, "loss": 0.9609, "step": 231100 }, { "epoch": 0.21, "grad_norm": 26.875, "learning_rate": 4.010320873899977e-05, "loss": 0.9894, "step": 231200 }, { "epoch": 0.21, "grad_norm": 19.75, "learning_rate": 4.0098709665808846e-05, "loss": 1.0742, "step": 231300 }, { "epoch": 0.21, "grad_norm": 41.25, "learning_rate": 4.009421059261792e-05, "loss": 1.0614, "step": 231400 }, { "epoch": 0.21, "grad_norm": 49.0, "learning_rate": 4.0089711519427e-05, "loss": 1.0053, "step": 231500 }, { "epoch": 0.21, "grad_norm": 34.5, "learning_rate": 4.008521244623607e-05, "loss": 0.9822, "step": 231600 }, { "epoch": 0.21, "grad_norm": 32.0, "learning_rate": 4.0080713373045154e-05, "loss": 1.0565, "step": 231700 }, { "epoch": 0.21, "grad_norm": 10.8125, "learning_rate": 4.007621429985423e-05, "loss": 1.0608, "step": 231800 }, { "epoch": 0.21, "grad_norm": 31.25, "learning_rate": 4.007171522666331e-05, "loss": 1.0291, "step": 231900 }, { "epoch": 0.21, "grad_norm": 43.5, "learning_rate": 4.006721615347239e-05, "loss": 1.005, "step": 232000 }, { "epoch": 0.21, "grad_norm": 110.5, "learning_rate": 4.006271708028146e-05, "loss": 0.8585, "step": 232100 }, { "epoch": 0.21, "grad_norm": 31.125, "learning_rate": 4.0058218007090545e-05, "loss": 1.0246, "step": 232200 }, { "epoch": 0.21, "grad_norm": 30.125, "learning_rate": 4.005371893389962e-05, "loss": 1.0897, "step": 232300 }, { "epoch": 0.21, "grad_norm": 0.032958984375, "learning_rate": 4.0049219860708695e-05, "loss": 1.1274, "step": 232400 }, { "epoch": 0.21, "grad_norm": 15.5625, "learning_rate": 4.004472078751778e-05, "loss": 0.9087, "step": 232500 }, { "epoch": 0.21, "grad_norm": 115.5, "learning_rate": 4.004022171432685e-05, "loss": 0.9183, "step": 232600 }, { "epoch": 0.21, "grad_norm": 43.75, "learning_rate": 4.003572264113593e-05, "loss": 0.9738, "step": 232700 }, { "epoch": 0.21, "grad_norm": 147.0, "learning_rate": 4.0031223567945003e-05, "loss": 0.9961, "step": 232800 }, { "epoch": 0.21, "grad_norm": 19.5, "learning_rate": 4.002672449475408e-05, "loss": 1.1077, "step": 232900 }, { "epoch": 0.21, "grad_norm": 0.05859375, "learning_rate": 4.002222542156316e-05, "loss": 1.0937, "step": 233000 }, { "epoch": 0.21, "grad_norm": 22.875, "learning_rate": 4.0017726348372236e-05, "loss": 0.9967, "step": 233100 }, { "epoch": 0.21, "grad_norm": 318.0, "learning_rate": 4.001322727518131e-05, "loss": 1.0481, "step": 233200 }, { "epoch": 0.21, "grad_norm": 28.0, "learning_rate": 4.0008728201990394e-05, "loss": 1.0507, "step": 233300 }, { "epoch": 0.21, "grad_norm": 96.5, "learning_rate": 4.000422912879947e-05, "loss": 0.933, "step": 233400 }, { "epoch": 0.21, "grad_norm": 11.0625, "learning_rate": 3.9999730055608544e-05, "loss": 0.9831, "step": 233500 }, { "epoch": 0.21, "grad_norm": 202.0, "learning_rate": 3.9995230982417627e-05, "loss": 0.9308, "step": 233600 }, { "epoch": 0.21, "grad_norm": 11.75, "learning_rate": 3.99907319092267e-05, "loss": 1.0269, "step": 233700 }, { "epoch": 0.21, "grad_norm": 15.4375, "learning_rate": 3.9986232836035784e-05, "loss": 0.9417, "step": 233800 }, { "epoch": 0.21, "grad_norm": 42.0, "learning_rate": 3.998173376284486e-05, "loss": 0.9465, "step": 233900 }, { "epoch": 0.21, "grad_norm": 13.875, "learning_rate": 3.997723468965393e-05, "loss": 0.9392, "step": 234000 }, { "epoch": 0.21, "grad_norm": 53.25, "learning_rate": 3.997273561646301e-05, "loss": 0.9887, "step": 234100 }, { "epoch": 0.21, "grad_norm": 0.005462646484375, "learning_rate": 3.9968236543272085e-05, "loss": 1.0167, "step": 234200 }, { "epoch": 0.21, "grad_norm": 27.25, "learning_rate": 3.996373747008116e-05, "loss": 1.0032, "step": 234300 }, { "epoch": 0.21, "grad_norm": 0.328125, "learning_rate": 3.995923839689024e-05, "loss": 1.0075, "step": 234400 }, { "epoch": 0.21, "grad_norm": 229.0, "learning_rate": 3.995473932369932e-05, "loss": 1.0626, "step": 234500 }, { "epoch": 0.21, "grad_norm": 0.025146484375, "learning_rate": 3.99502402505084e-05, "loss": 1.128, "step": 234600 }, { "epoch": 0.21, "grad_norm": 31.75, "learning_rate": 3.9945741177317476e-05, "loss": 1.1265, "step": 234700 }, { "epoch": 0.21, "grad_norm": 10.6875, "learning_rate": 3.994124210412655e-05, "loss": 1.1406, "step": 234800 }, { "epoch": 0.21, "grad_norm": 0.0174560546875, "learning_rate": 3.993674303093563e-05, "loss": 0.9755, "step": 234900 }, { "epoch": 0.21, "grad_norm": 18.0, "learning_rate": 3.993224395774471e-05, "loss": 1.0672, "step": 235000 }, { "epoch": 0.21, "grad_norm": 17.375, "learning_rate": 3.9927744884553784e-05, "loss": 0.9761, "step": 235100 }, { "epoch": 0.21, "grad_norm": 57.25, "learning_rate": 3.9923245811362866e-05, "loss": 0.9685, "step": 235200 }, { "epoch": 0.21, "grad_norm": 15.0, "learning_rate": 3.9918746738171934e-05, "loss": 1.0695, "step": 235300 }, { "epoch": 0.21, "grad_norm": 80.5, "learning_rate": 3.9914247664981017e-05, "loss": 1.0753, "step": 235400 }, { "epoch": 0.21, "grad_norm": 24.25, "learning_rate": 3.990974859179009e-05, "loss": 0.9355, "step": 235500 }, { "epoch": 0.21, "grad_norm": 0.036376953125, "learning_rate": 3.990524951859917e-05, "loss": 0.8267, "step": 235600 }, { "epoch": 0.21, "grad_norm": 28.0, "learning_rate": 3.990075044540825e-05, "loss": 1.0094, "step": 235700 }, { "epoch": 0.21, "grad_norm": 22.875, "learning_rate": 3.9896251372217325e-05, "loss": 0.8892, "step": 235800 }, { "epoch": 0.21, "grad_norm": 36.25, "learning_rate": 3.98917522990264e-05, "loss": 0.9947, "step": 235900 }, { "epoch": 0.21, "grad_norm": 22.125, "learning_rate": 3.988725322583548e-05, "loss": 0.9337, "step": 236000 }, { "epoch": 0.21, "grad_norm": 14.4375, "learning_rate": 3.988275415264456e-05, "loss": 1.2244, "step": 236100 }, { "epoch": 0.21, "grad_norm": 29.0, "learning_rate": 3.987825507945363e-05, "loss": 1.0279, "step": 236200 }, { "epoch": 0.21, "grad_norm": 29.0, "learning_rate": 3.9873756006262715e-05, "loss": 1.1059, "step": 236300 }, { "epoch": 0.21, "grad_norm": 27.0, "learning_rate": 3.986925693307179e-05, "loss": 1.0736, "step": 236400 }, { "epoch": 0.21, "grad_norm": 0.1298828125, "learning_rate": 3.986475785988087e-05, "loss": 1.014, "step": 236500 }, { "epoch": 0.21, "grad_norm": 40.5, "learning_rate": 3.986025878668994e-05, "loss": 0.9059, "step": 236600 }, { "epoch": 0.21, "grad_norm": 120.5, "learning_rate": 3.9855759713499016e-05, "loss": 0.9994, "step": 236700 }, { "epoch": 0.21, "grad_norm": 22.375, "learning_rate": 3.98512606403081e-05, "loss": 0.8617, "step": 236800 }, { "epoch": 0.21, "grad_norm": 12.6875, "learning_rate": 3.9846761567117174e-05, "loss": 0.9958, "step": 236900 }, { "epoch": 0.21, "grad_norm": 35.0, "learning_rate": 3.984226249392625e-05, "loss": 0.9294, "step": 237000 }, { "epoch": 0.21, "grad_norm": 0.2275390625, "learning_rate": 3.983776342073533e-05, "loss": 0.9823, "step": 237100 }, { "epoch": 0.21, "grad_norm": 155.0, "learning_rate": 3.9833264347544407e-05, "loss": 1.0118, "step": 237200 }, { "epoch": 0.21, "grad_norm": 0.004852294921875, "learning_rate": 3.982876527435349e-05, "loss": 1.0626, "step": 237300 }, { "epoch": 0.21, "grad_norm": 12.9375, "learning_rate": 3.9824266201162564e-05, "loss": 1.0751, "step": 237400 }, { "epoch": 0.21, "grad_norm": 39.0, "learning_rate": 3.981976712797164e-05, "loss": 0.9286, "step": 237500 }, { "epoch": 0.21, "grad_norm": 37.75, "learning_rate": 3.981526805478072e-05, "loss": 1.0099, "step": 237600 }, { "epoch": 0.21, "grad_norm": 316.0, "learning_rate": 3.98107689815898e-05, "loss": 1.0549, "step": 237700 }, { "epoch": 0.21, "grad_norm": 44.5, "learning_rate": 3.980626990839887e-05, "loss": 1.0088, "step": 237800 }, { "epoch": 0.21, "grad_norm": 49.5, "learning_rate": 3.980177083520795e-05, "loss": 0.8736, "step": 237900 }, { "epoch": 0.21, "grad_norm": 80.0, "learning_rate": 3.979727176201702e-05, "loss": 1.1077, "step": 238000 }, { "epoch": 0.21, "grad_norm": 16.875, "learning_rate": 3.9792772688826105e-05, "loss": 0.9999, "step": 238100 }, { "epoch": 0.21, "grad_norm": 12.625, "learning_rate": 3.978827361563518e-05, "loss": 0.9204, "step": 238200 }, { "epoch": 0.21, "grad_norm": 39.75, "learning_rate": 3.9783774542444256e-05, "loss": 0.8332, "step": 238300 }, { "epoch": 0.21, "grad_norm": 23.75, "learning_rate": 3.977927546925334e-05, "loss": 1.014, "step": 238400 }, { "epoch": 0.21, "grad_norm": 35.0, "learning_rate": 3.977477639606241e-05, "loss": 1.0413, "step": 238500 }, { "epoch": 0.21, "grad_norm": 46.75, "learning_rate": 3.977027732287149e-05, "loss": 0.9752, "step": 238600 }, { "epoch": 0.21, "grad_norm": 10.375, "learning_rate": 3.976577824968057e-05, "loss": 1.0089, "step": 238700 }, { "epoch": 0.21, "grad_norm": 25.0, "learning_rate": 3.9761279176489646e-05, "loss": 0.9386, "step": 238800 }, { "epoch": 0.21, "grad_norm": 0.0140380859375, "learning_rate": 3.975678010329872e-05, "loss": 1.1574, "step": 238900 }, { "epoch": 0.21, "grad_norm": 17.25, "learning_rate": 3.97522810301078e-05, "loss": 1.1042, "step": 239000 }, { "epoch": 0.21, "grad_norm": 20.625, "learning_rate": 3.974778195691688e-05, "loss": 1.1184, "step": 239100 }, { "epoch": 0.21, "grad_norm": 20.0, "learning_rate": 3.9743282883725954e-05, "loss": 1.0072, "step": 239200 }, { "epoch": 0.21, "grad_norm": 62.5, "learning_rate": 3.973878381053503e-05, "loss": 1.1051, "step": 239300 }, { "epoch": 0.21, "grad_norm": 46.0, "learning_rate": 3.9734284737344105e-05, "loss": 1.024, "step": 239400 }, { "epoch": 0.21, "grad_norm": 46.25, "learning_rate": 3.972978566415319e-05, "loss": 0.9284, "step": 239500 }, { "epoch": 0.21, "grad_norm": 20.375, "learning_rate": 3.972528659096226e-05, "loss": 0.9666, "step": 239600 }, { "epoch": 0.21, "grad_norm": 4.28125, "learning_rate": 3.972078751777134e-05, "loss": 0.9218, "step": 239700 }, { "epoch": 0.21, "grad_norm": 41.75, "learning_rate": 3.971628844458042e-05, "loss": 0.8954, "step": 239800 }, { "epoch": 0.21, "grad_norm": 39.75, "learning_rate": 3.9711789371389495e-05, "loss": 0.9357, "step": 239900 }, { "epoch": 0.21, "grad_norm": 0.546875, "learning_rate": 3.970729029819858e-05, "loss": 1.0388, "step": 240000 }, { "epoch": 0.21, "grad_norm": 46.5, "learning_rate": 3.970279122500765e-05, "loss": 0.8919, "step": 240100 }, { "epoch": 0.21, "grad_norm": 79.0, "learning_rate": 3.969829215181673e-05, "loss": 0.9717, "step": 240200 }, { "epoch": 0.21, "grad_norm": 35.25, "learning_rate": 3.969379307862581e-05, "loss": 1.0627, "step": 240300 }, { "epoch": 0.21, "grad_norm": 32.75, "learning_rate": 3.9689294005434885e-05, "loss": 0.8763, "step": 240400 }, { "epoch": 0.21, "grad_norm": 65.0, "learning_rate": 3.968479493224396e-05, "loss": 1.007, "step": 240500 }, { "epoch": 0.21, "grad_norm": 45.25, "learning_rate": 3.9680295859053036e-05, "loss": 0.9996, "step": 240600 }, { "epoch": 0.21, "grad_norm": 27.875, "learning_rate": 3.967579678586211e-05, "loss": 0.9441, "step": 240700 }, { "epoch": 0.21, "grad_norm": 38.0, "learning_rate": 3.967129771267119e-05, "loss": 1.0479, "step": 240800 }, { "epoch": 0.21, "grad_norm": 0.0262451171875, "learning_rate": 3.966679863948027e-05, "loss": 1.0767, "step": 240900 }, { "epoch": 0.21, "grad_norm": 21.375, "learning_rate": 3.9662299566289344e-05, "loss": 1.0097, "step": 241000 }, { "epoch": 0.21, "grad_norm": 32.25, "learning_rate": 3.9657800493098426e-05, "loss": 1.0596, "step": 241100 }, { "epoch": 0.21, "grad_norm": 42.25, "learning_rate": 3.96533014199075e-05, "loss": 1.0287, "step": 241200 }, { "epoch": 0.21, "grad_norm": 1.0, "learning_rate": 3.964880234671658e-05, "loss": 1.0252, "step": 241300 }, { "epoch": 0.22, "grad_norm": 11.3125, "learning_rate": 3.964430327352566e-05, "loss": 1.0113, "step": 241400 }, { "epoch": 0.22, "grad_norm": 141.0, "learning_rate": 3.9639804200334734e-05, "loss": 0.9814, "step": 241500 }, { "epoch": 0.22, "grad_norm": 39.75, "learning_rate": 3.963530512714381e-05, "loss": 0.8839, "step": 241600 }, { "epoch": 0.22, "grad_norm": 32.25, "learning_rate": 3.963080605395289e-05, "loss": 0.9375, "step": 241700 }, { "epoch": 0.22, "grad_norm": 29.125, "learning_rate": 3.962630698076196e-05, "loss": 0.8722, "step": 241800 }, { "epoch": 0.22, "grad_norm": 21.125, "learning_rate": 3.962180790757104e-05, "loss": 0.9342, "step": 241900 }, { "epoch": 0.22, "grad_norm": 91.0, "learning_rate": 3.961730883438012e-05, "loss": 0.8654, "step": 242000 }, { "epoch": 0.22, "grad_norm": 16.25, "learning_rate": 3.961280976118919e-05, "loss": 1.0902, "step": 242100 }, { "epoch": 0.22, "grad_norm": 91.5, "learning_rate": 3.9608310687998275e-05, "loss": 1.0038, "step": 242200 }, { "epoch": 0.22, "grad_norm": 2.53125, "learning_rate": 3.960381161480735e-05, "loss": 0.9835, "step": 242300 }, { "epoch": 0.22, "grad_norm": 13.1875, "learning_rate": 3.9599312541616426e-05, "loss": 1.0077, "step": 242400 }, { "epoch": 0.22, "grad_norm": 9.6875, "learning_rate": 3.959481346842551e-05, "loss": 1.1076, "step": 242500 }, { "epoch": 0.22, "grad_norm": 34.75, "learning_rate": 3.959031439523458e-05, "loss": 0.9872, "step": 242600 }, { "epoch": 0.22, "grad_norm": 95.5, "learning_rate": 3.9585815322043665e-05, "loss": 1.1495, "step": 242700 }, { "epoch": 0.22, "grad_norm": 29.375, "learning_rate": 3.958131624885274e-05, "loss": 1.1064, "step": 242800 }, { "epoch": 0.22, "grad_norm": 27.125, "learning_rate": 3.9576817175661816e-05, "loss": 1.061, "step": 242900 }, { "epoch": 0.22, "grad_norm": 0.84375, "learning_rate": 3.95723181024709e-05, "loss": 1.024, "step": 243000 }, { "epoch": 0.22, "grad_norm": 0.412109375, "learning_rate": 3.956781902927997e-05, "loss": 0.9002, "step": 243100 }, { "epoch": 0.22, "grad_norm": 54.75, "learning_rate": 3.956331995608905e-05, "loss": 1.1241, "step": 243200 }, { "epoch": 0.22, "grad_norm": 23.875, "learning_rate": 3.9558820882898124e-05, "loss": 1.0088, "step": 243300 }, { "epoch": 0.22, "grad_norm": 26.125, "learning_rate": 3.95543218097072e-05, "loss": 0.992, "step": 243400 }, { "epoch": 0.22, "grad_norm": 25.875, "learning_rate": 3.954982273651628e-05, "loss": 1.024, "step": 243500 }, { "epoch": 0.22, "grad_norm": 34.75, "learning_rate": 3.954532366332536e-05, "loss": 0.9248, "step": 243600 }, { "epoch": 0.22, "grad_norm": 12.6875, "learning_rate": 3.954082459013443e-05, "loss": 0.9332, "step": 243700 }, { "epoch": 0.22, "grad_norm": 39.75, "learning_rate": 3.9536325516943514e-05, "loss": 1.0014, "step": 243800 }, { "epoch": 0.22, "grad_norm": 23.75, "learning_rate": 3.953182644375259e-05, "loss": 1.1147, "step": 243900 }, { "epoch": 0.22, "grad_norm": 30.375, "learning_rate": 3.9527327370561665e-05, "loss": 1.0165, "step": 244000 }, { "epoch": 0.22, "grad_norm": 14.5, "learning_rate": 3.952282829737075e-05, "loss": 1.0904, "step": 244100 }, { "epoch": 0.22, "grad_norm": 100.5, "learning_rate": 3.951832922417982e-05, "loss": 1.0877, "step": 244200 }, { "epoch": 0.22, "grad_norm": 22.625, "learning_rate": 3.95138301509889e-05, "loss": 0.8984, "step": 244300 }, { "epoch": 0.22, "grad_norm": 70.5, "learning_rate": 3.950933107779797e-05, "loss": 0.9939, "step": 244400 }, { "epoch": 0.22, "grad_norm": 9.8125, "learning_rate": 3.950483200460705e-05, "loss": 1.1845, "step": 244500 }, { "epoch": 0.22, "grad_norm": 10.5625, "learning_rate": 3.950033293141613e-05, "loss": 0.9884, "step": 244600 }, { "epoch": 0.22, "grad_norm": 24.625, "learning_rate": 3.9495833858225206e-05, "loss": 1.0626, "step": 244700 }, { "epoch": 0.22, "grad_norm": 40.75, "learning_rate": 3.949133478503428e-05, "loss": 0.9637, "step": 244800 }, { "epoch": 0.22, "grad_norm": 40.5, "learning_rate": 3.9486835711843364e-05, "loss": 1.1104, "step": 244900 }, { "epoch": 0.22, "grad_norm": 38.75, "learning_rate": 3.948233663865244e-05, "loss": 0.9862, "step": 245000 }, { "epoch": 0.22, "grad_norm": 24.25, "learning_rate": 3.9477837565461514e-05, "loss": 1.0989, "step": 245100 }, { "epoch": 0.22, "grad_norm": 56.75, "learning_rate": 3.9473338492270596e-05, "loss": 1.0406, "step": 245200 }, { "epoch": 0.22, "grad_norm": 6.6875, "learning_rate": 3.946883941907967e-05, "loss": 1.0767, "step": 245300 }, { "epoch": 0.22, "grad_norm": 15.3125, "learning_rate": 3.9464340345888754e-05, "loss": 0.8896, "step": 245400 }, { "epoch": 0.22, "grad_norm": 5.8125, "learning_rate": 3.945984127269783e-05, "loss": 1.022, "step": 245500 }, { "epoch": 0.22, "grad_norm": 20.5, "learning_rate": 3.9455342199506904e-05, "loss": 1.0143, "step": 245600 }, { "epoch": 0.22, "grad_norm": 1.15625, "learning_rate": 3.945084312631598e-05, "loss": 0.909, "step": 245700 }, { "epoch": 0.22, "grad_norm": 0.01409912109375, "learning_rate": 3.9446344053125055e-05, "loss": 1.0102, "step": 245800 }, { "epoch": 0.22, "grad_norm": 0.15234375, "learning_rate": 3.944184497993413e-05, "loss": 0.901, "step": 245900 }, { "epoch": 0.22, "grad_norm": 13.5625, "learning_rate": 3.943734590674321e-05, "loss": 1.1719, "step": 246000 }, { "epoch": 0.22, "grad_norm": 35.75, "learning_rate": 3.943284683355229e-05, "loss": 1.0459, "step": 246100 }, { "epoch": 0.22, "grad_norm": 448.0, "learning_rate": 3.942834776036137e-05, "loss": 1.0996, "step": 246200 }, { "epoch": 0.22, "grad_norm": 21.375, "learning_rate": 3.9423848687170445e-05, "loss": 0.9833, "step": 246300 }, { "epoch": 0.22, "grad_norm": 91.5, "learning_rate": 3.941934961397952e-05, "loss": 1.0997, "step": 246400 }, { "epoch": 0.22, "grad_norm": 56.75, "learning_rate": 3.94148505407886e-05, "loss": 0.8731, "step": 246500 }, { "epoch": 0.22, "grad_norm": 620.0, "learning_rate": 3.941035146759768e-05, "loss": 1.0876, "step": 246600 }, { "epoch": 0.22, "grad_norm": 16.875, "learning_rate": 3.9405852394406754e-05, "loss": 0.9109, "step": 246700 }, { "epoch": 0.22, "grad_norm": 97.0, "learning_rate": 3.9401353321215836e-05, "loss": 1.0517, "step": 246800 }, { "epoch": 0.22, "grad_norm": 36.0, "learning_rate": 3.939685424802491e-05, "loss": 1.0084, "step": 246900 }, { "epoch": 0.22, "grad_norm": 63.0, "learning_rate": 3.9392355174833986e-05, "loss": 0.977, "step": 247000 }, { "epoch": 0.22, "grad_norm": 17.0, "learning_rate": 3.938785610164306e-05, "loss": 1.1385, "step": 247100 }, { "epoch": 0.22, "grad_norm": 23.625, "learning_rate": 3.938335702845214e-05, "loss": 1.1118, "step": 247200 }, { "epoch": 0.22, "grad_norm": 25.5, "learning_rate": 3.937885795526122e-05, "loss": 0.9351, "step": 247300 }, { "epoch": 0.22, "grad_norm": 28.25, "learning_rate": 3.9374358882070294e-05, "loss": 1.1201, "step": 247400 }, { "epoch": 0.22, "grad_norm": 29.75, "learning_rate": 3.936985980887937e-05, "loss": 1.1406, "step": 247500 }, { "epoch": 0.22, "grad_norm": 52.5, "learning_rate": 3.936536073568845e-05, "loss": 1.1221, "step": 247600 }, { "epoch": 0.22, "grad_norm": 60.5, "learning_rate": 3.936086166249753e-05, "loss": 0.9895, "step": 247700 }, { "epoch": 0.22, "grad_norm": 1104.0, "learning_rate": 3.93563625893066e-05, "loss": 0.9288, "step": 247800 }, { "epoch": 0.22, "grad_norm": 12.3125, "learning_rate": 3.9351863516115685e-05, "loss": 1.0274, "step": 247900 }, { "epoch": 0.22, "grad_norm": 133.0, "learning_rate": 3.934736444292476e-05, "loss": 0.98, "step": 248000 }, { "epoch": 0.22, "grad_norm": 7.90625, "learning_rate": 3.934286536973384e-05, "loss": 0.8549, "step": 248100 }, { "epoch": 0.22, "grad_norm": 52.25, "learning_rate": 3.933836629654292e-05, "loss": 0.9437, "step": 248200 }, { "epoch": 0.22, "grad_norm": 30.625, "learning_rate": 3.9333867223351986e-05, "loss": 1.0633, "step": 248300 }, { "epoch": 0.22, "grad_norm": 57.0, "learning_rate": 3.932936815016107e-05, "loss": 0.9639, "step": 248400 }, { "epoch": 0.22, "grad_norm": 188.0, "learning_rate": 3.9324869076970144e-05, "loss": 0.8477, "step": 248500 }, { "epoch": 0.22, "grad_norm": 56.75, "learning_rate": 3.932037000377922e-05, "loss": 1.1589, "step": 248600 }, { "epoch": 0.22, "grad_norm": 30.375, "learning_rate": 3.93158709305883e-05, "loss": 1.0893, "step": 248700 }, { "epoch": 0.22, "grad_norm": 128.0, "learning_rate": 3.9311371857397376e-05, "loss": 1.1633, "step": 248800 }, { "epoch": 0.22, "grad_norm": 17.625, "learning_rate": 3.930687278420646e-05, "loss": 0.9066, "step": 248900 }, { "epoch": 0.22, "grad_norm": 59.5, "learning_rate": 3.9302373711015534e-05, "loss": 1.1544, "step": 249000 }, { "epoch": 0.22, "grad_norm": 0.037109375, "learning_rate": 3.929787463782461e-05, "loss": 1.0615, "step": 249100 }, { "epoch": 0.22, "grad_norm": 368.0, "learning_rate": 3.929337556463369e-05, "loss": 1.1389, "step": 249200 }, { "epoch": 0.22, "grad_norm": 45.75, "learning_rate": 3.9288876491442767e-05, "loss": 1.2061, "step": 249300 }, { "epoch": 0.22, "grad_norm": 12.375, "learning_rate": 3.928437741825184e-05, "loss": 1.0681, "step": 249400 }, { "epoch": 0.22, "grad_norm": 193.0, "learning_rate": 3.9279878345060924e-05, "loss": 0.9814, "step": 249500 }, { "epoch": 0.22, "grad_norm": 39.75, "learning_rate": 3.927537927186999e-05, "loss": 0.9327, "step": 249600 }, { "epoch": 0.22, "grad_norm": 23.25, "learning_rate": 3.9270880198679075e-05, "loss": 0.9946, "step": 249700 }, { "epoch": 0.22, "grad_norm": 35.5, "learning_rate": 3.926638112548815e-05, "loss": 1.063, "step": 249800 }, { "epoch": 0.22, "grad_norm": 288.0, "learning_rate": 3.9261882052297225e-05, "loss": 0.9444, "step": 249900 }, { "epoch": 0.22, "grad_norm": 51.25, "learning_rate": 3.925738297910631e-05, "loss": 1.071, "step": 250000 }, { "epoch": 0.22, "grad_norm": 0.1572265625, "learning_rate": 3.925288390591538e-05, "loss": 0.801, "step": 250100 }, { "epoch": 0.22, "grad_norm": 0.032958984375, "learning_rate": 3.924838483272446e-05, "loss": 0.9229, "step": 250200 }, { "epoch": 0.22, "grad_norm": 14.25, "learning_rate": 3.924388575953354e-05, "loss": 1.0075, "step": 250300 }, { "epoch": 0.22, "grad_norm": 73.5, "learning_rate": 3.9239386686342616e-05, "loss": 1.1117, "step": 250400 }, { "epoch": 0.22, "grad_norm": 22.375, "learning_rate": 3.923488761315169e-05, "loss": 0.8667, "step": 250500 }, { "epoch": 0.22, "grad_norm": 104.0, "learning_rate": 3.923038853996077e-05, "loss": 0.9855, "step": 250600 }, { "epoch": 0.22, "grad_norm": 0.060791015625, "learning_rate": 3.922588946676985e-05, "loss": 1.0965, "step": 250700 }, { "epoch": 0.22, "grad_norm": 0.01336669921875, "learning_rate": 3.922139039357893e-05, "loss": 1.0748, "step": 250800 }, { "epoch": 0.22, "grad_norm": 9.375, "learning_rate": 3.9216891320388e-05, "loss": 1.1585, "step": 250900 }, { "epoch": 0.22, "grad_norm": 46.25, "learning_rate": 3.9212392247197074e-05, "loss": 1.1373, "step": 251000 }, { "epoch": 0.22, "grad_norm": 89.5, "learning_rate": 3.9207893174006157e-05, "loss": 1.0942, "step": 251100 }, { "epoch": 0.22, "grad_norm": 49.75, "learning_rate": 3.920339410081523e-05, "loss": 0.9776, "step": 251200 }, { "epoch": 0.22, "grad_norm": 12.6875, "learning_rate": 3.919889502762431e-05, "loss": 1.0352, "step": 251300 }, { "epoch": 0.22, "grad_norm": 47.5, "learning_rate": 3.919439595443339e-05, "loss": 1.0185, "step": 251400 }, { "epoch": 0.22, "grad_norm": 49.75, "learning_rate": 3.9189896881242465e-05, "loss": 0.9082, "step": 251500 }, { "epoch": 0.22, "grad_norm": 20.375, "learning_rate": 3.918539780805155e-05, "loss": 0.9834, "step": 251600 }, { "epoch": 0.22, "grad_norm": 24.375, "learning_rate": 3.918089873486062e-05, "loss": 0.9561, "step": 251700 }, { "epoch": 0.22, "grad_norm": 24.125, "learning_rate": 3.91763996616697e-05, "loss": 1.0995, "step": 251800 }, { "epoch": 0.22, "grad_norm": 33.0, "learning_rate": 3.917190058847878e-05, "loss": 1.1076, "step": 251900 }, { "epoch": 0.22, "grad_norm": 62.0, "learning_rate": 3.9167401515287855e-05, "loss": 0.8693, "step": 252000 }, { "epoch": 0.22, "grad_norm": 0.2470703125, "learning_rate": 3.916290244209693e-05, "loss": 1.0181, "step": 252100 }, { "epoch": 0.22, "grad_norm": 41.25, "learning_rate": 3.9158403368906006e-05, "loss": 0.9503, "step": 252200 }, { "epoch": 0.22, "grad_norm": 17.5, "learning_rate": 3.915390429571508e-05, "loss": 1.0744, "step": 252300 }, { "epoch": 0.22, "grad_norm": 41.5, "learning_rate": 3.914940522252416e-05, "loss": 1.003, "step": 252400 }, { "epoch": 0.22, "grad_norm": 22.75, "learning_rate": 3.914490614933324e-05, "loss": 1.0178, "step": 252500 }, { "epoch": 0.23, "grad_norm": 23.125, "learning_rate": 3.9140407076142314e-05, "loss": 1.0478, "step": 252600 }, { "epoch": 0.23, "grad_norm": 8.8125, "learning_rate": 3.9135908002951396e-05, "loss": 1.0237, "step": 252700 }, { "epoch": 0.23, "grad_norm": 20.0, "learning_rate": 3.913140892976047e-05, "loss": 0.9183, "step": 252800 }, { "epoch": 0.23, "grad_norm": 19.5, "learning_rate": 3.9126909856569547e-05, "loss": 1.2169, "step": 252900 }, { "epoch": 0.23, "grad_norm": 213.0, "learning_rate": 3.912241078337863e-05, "loss": 1.0258, "step": 253000 }, { "epoch": 0.23, "grad_norm": 16.625, "learning_rate": 3.9117911710187704e-05, "loss": 0.9357, "step": 253100 }, { "epoch": 0.23, "grad_norm": 30.0, "learning_rate": 3.911341263699678e-05, "loss": 1.0019, "step": 253200 }, { "epoch": 0.23, "grad_norm": 17.5, "learning_rate": 3.910891356380586e-05, "loss": 0.8838, "step": 253300 }, { "epoch": 0.23, "grad_norm": 7.71875, "learning_rate": 3.910441449061494e-05, "loss": 0.9828, "step": 253400 }, { "epoch": 0.23, "grad_norm": 13.375, "learning_rate": 3.909991541742401e-05, "loss": 0.9326, "step": 253500 }, { "epoch": 0.23, "grad_norm": 14.8125, "learning_rate": 3.909541634423309e-05, "loss": 1.0334, "step": 253600 }, { "epoch": 0.23, "grad_norm": 13.75, "learning_rate": 3.909091727104216e-05, "loss": 1.0168, "step": 253700 }, { "epoch": 0.23, "grad_norm": 33.0, "learning_rate": 3.9086418197851245e-05, "loss": 0.9915, "step": 253800 }, { "epoch": 0.23, "grad_norm": 0.193359375, "learning_rate": 3.908191912466032e-05, "loss": 1.0294, "step": 253900 }, { "epoch": 0.23, "grad_norm": 29.625, "learning_rate": 3.9077420051469396e-05, "loss": 1.0334, "step": 254000 }, { "epoch": 0.23, "grad_norm": 9.3125, "learning_rate": 3.907292097827848e-05, "loss": 1.1583, "step": 254100 }, { "epoch": 0.23, "grad_norm": 33.0, "learning_rate": 3.906842190508755e-05, "loss": 0.9236, "step": 254200 }, { "epoch": 0.23, "grad_norm": 0.007110595703125, "learning_rate": 3.9063922831896635e-05, "loss": 0.9179, "step": 254300 }, { "epoch": 0.23, "grad_norm": 43.25, "learning_rate": 3.905942375870571e-05, "loss": 0.9746, "step": 254400 }, { "epoch": 0.23, "grad_norm": 19.125, "learning_rate": 3.9054924685514786e-05, "loss": 1.0515, "step": 254500 }, { "epoch": 0.23, "grad_norm": 153.0, "learning_rate": 3.905042561232387e-05, "loss": 1.1089, "step": 254600 }, { "epoch": 0.23, "grad_norm": 18.375, "learning_rate": 3.904592653913294e-05, "loss": 0.9498, "step": 254700 }, { "epoch": 0.23, "grad_norm": 66.0, "learning_rate": 3.904142746594202e-05, "loss": 1.1103, "step": 254800 }, { "epoch": 0.23, "grad_norm": 86.5, "learning_rate": 3.9036928392751094e-05, "loss": 1.0989, "step": 254900 }, { "epoch": 0.23, "grad_norm": 114.5, "learning_rate": 3.903242931956017e-05, "loss": 1.1017, "step": 255000 }, { "epoch": 0.23, "grad_norm": 27.125, "learning_rate": 3.902793024636925e-05, "loss": 1.0728, "step": 255100 }, { "epoch": 0.23, "grad_norm": 15.5, "learning_rate": 3.902343117317833e-05, "loss": 0.9304, "step": 255200 }, { "epoch": 0.23, "grad_norm": 8.6875, "learning_rate": 3.90189320999874e-05, "loss": 1.062, "step": 255300 }, { "epoch": 0.23, "grad_norm": 20.625, "learning_rate": 3.9014433026796484e-05, "loss": 1.021, "step": 255400 }, { "epoch": 0.23, "grad_norm": 98.0, "learning_rate": 3.900993395360556e-05, "loss": 0.9793, "step": 255500 }, { "epoch": 0.23, "grad_norm": 57.5, "learning_rate": 3.9005434880414635e-05, "loss": 0.9593, "step": 255600 }, { "epoch": 0.23, "grad_norm": 21.875, "learning_rate": 3.900093580722372e-05, "loss": 0.8503, "step": 255700 }, { "epoch": 0.23, "grad_norm": 22.0, "learning_rate": 3.899643673403279e-05, "loss": 1.0543, "step": 255800 }, { "epoch": 0.23, "grad_norm": 56.25, "learning_rate": 3.899193766084187e-05, "loss": 1.0556, "step": 255900 }, { "epoch": 0.23, "grad_norm": 58.75, "learning_rate": 3.898743858765095e-05, "loss": 1.0963, "step": 256000 }, { "epoch": 0.23, "grad_norm": 20.25, "learning_rate": 3.898293951446002e-05, "loss": 0.9329, "step": 256100 }, { "epoch": 0.23, "grad_norm": 39.25, "learning_rate": 3.89784404412691e-05, "loss": 1.0692, "step": 256200 }, { "epoch": 0.23, "grad_norm": 18.125, "learning_rate": 3.8973941368078176e-05, "loss": 0.9528, "step": 256300 }, { "epoch": 0.23, "grad_norm": 0.5703125, "learning_rate": 3.896944229488725e-05, "loss": 0.951, "step": 256400 }, { "epoch": 0.23, "grad_norm": 24.5, "learning_rate": 3.896494322169633e-05, "loss": 0.9127, "step": 256500 }, { "epoch": 0.23, "grad_norm": 0.166015625, "learning_rate": 3.896044414850541e-05, "loss": 0.9868, "step": 256600 }, { "epoch": 0.23, "grad_norm": 0.21484375, "learning_rate": 3.8955945075314484e-05, "loss": 1.1349, "step": 256700 }, { "epoch": 0.23, "grad_norm": 37.5, "learning_rate": 3.8951446002123566e-05, "loss": 0.9999, "step": 256800 }, { "epoch": 0.23, "grad_norm": 44.25, "learning_rate": 3.894694692893264e-05, "loss": 1.0133, "step": 256900 }, { "epoch": 0.23, "grad_norm": 628.0, "learning_rate": 3.8942447855741724e-05, "loss": 1.0403, "step": 257000 }, { "epoch": 0.23, "grad_norm": 17.125, "learning_rate": 3.89379487825508e-05, "loss": 1.0205, "step": 257100 }, { "epoch": 0.23, "grad_norm": 39.0, "learning_rate": 3.8933449709359874e-05, "loss": 0.9979, "step": 257200 }, { "epoch": 0.23, "grad_norm": 31.875, "learning_rate": 3.8928950636168956e-05, "loss": 1.1494, "step": 257300 }, { "epoch": 0.23, "grad_norm": 68.5, "learning_rate": 3.8924451562978025e-05, "loss": 1.0635, "step": 257400 }, { "epoch": 0.23, "grad_norm": 62.0, "learning_rate": 3.891995248978711e-05, "loss": 1.1233, "step": 257500 }, { "epoch": 0.23, "grad_norm": 71.0, "learning_rate": 3.891545341659618e-05, "loss": 1.0237, "step": 257600 }, { "epoch": 0.23, "grad_norm": 21.625, "learning_rate": 3.891095434340526e-05, "loss": 1.0337, "step": 257700 }, { "epoch": 0.23, "grad_norm": 21.625, "learning_rate": 3.890645527021434e-05, "loss": 0.9592, "step": 257800 }, { "epoch": 0.23, "grad_norm": 0.37890625, "learning_rate": 3.8901956197023415e-05, "loss": 1.0695, "step": 257900 }, { "epoch": 0.23, "grad_norm": 21.125, "learning_rate": 3.889745712383249e-05, "loss": 1.1847, "step": 258000 }, { "epoch": 0.23, "grad_norm": 13.125, "learning_rate": 3.889295805064157e-05, "loss": 0.843, "step": 258100 }, { "epoch": 0.23, "grad_norm": 15.375, "learning_rate": 3.888845897745065e-05, "loss": 1.1496, "step": 258200 }, { "epoch": 0.23, "grad_norm": 16.75, "learning_rate": 3.888395990425972e-05, "loss": 0.9838, "step": 258300 }, { "epoch": 0.23, "grad_norm": 4.3125, "learning_rate": 3.8879460831068805e-05, "loss": 0.9052, "step": 258400 }, { "epoch": 0.23, "grad_norm": 51.0, "learning_rate": 3.887496175787788e-05, "loss": 1.0382, "step": 258500 }, { "epoch": 0.23, "grad_norm": 5.125, "learning_rate": 3.8870462684686956e-05, "loss": 1.0682, "step": 258600 }, { "epoch": 0.23, "grad_norm": 23.625, "learning_rate": 3.886596361149603e-05, "loss": 0.959, "step": 258700 }, { "epoch": 0.23, "grad_norm": 0.384765625, "learning_rate": 3.886146453830511e-05, "loss": 1.1241, "step": 258800 }, { "epoch": 0.23, "grad_norm": 0.32421875, "learning_rate": 3.885696546511419e-05, "loss": 0.9618, "step": 258900 }, { "epoch": 0.23, "grad_norm": 90.5, "learning_rate": 3.8852466391923264e-05, "loss": 1.1003, "step": 259000 }, { "epoch": 0.23, "grad_norm": 0.01068115234375, "learning_rate": 3.884796731873234e-05, "loss": 1.1317, "step": 259100 }, { "epoch": 0.23, "grad_norm": 12.75, "learning_rate": 3.884346824554142e-05, "loss": 1.0843, "step": 259200 }, { "epoch": 0.23, "grad_norm": 0.031005859375, "learning_rate": 3.88389691723505e-05, "loss": 1.0241, "step": 259300 }, { "epoch": 0.23, "grad_norm": 0.0081787109375, "learning_rate": 3.883447009915957e-05, "loss": 1.0026, "step": 259400 }, { "epoch": 0.23, "grad_norm": 13.0625, "learning_rate": 3.8829971025968654e-05, "loss": 0.978, "step": 259500 }, { "epoch": 0.23, "grad_norm": 50.25, "learning_rate": 3.882547195277773e-05, "loss": 1.0678, "step": 259600 }, { "epoch": 0.23, "grad_norm": 22.625, "learning_rate": 3.882097287958681e-05, "loss": 0.9762, "step": 259700 }, { "epoch": 0.23, "grad_norm": 69.5, "learning_rate": 3.881647380639589e-05, "loss": 1.0388, "step": 259800 }, { "epoch": 0.23, "grad_norm": 0.2470703125, "learning_rate": 3.881197473320496e-05, "loss": 0.9259, "step": 259900 }, { "epoch": 0.23, "grad_norm": 19.0, "learning_rate": 3.880747566001404e-05, "loss": 1.1373, "step": 260000 }, { "epoch": 0.23, "grad_norm": 0.08642578125, "learning_rate": 3.880297658682311e-05, "loss": 0.9832, "step": 260100 }, { "epoch": 0.23, "grad_norm": 18.75, "learning_rate": 3.8798477513632195e-05, "loss": 1.0725, "step": 260200 }, { "epoch": 0.23, "grad_norm": 39.0, "learning_rate": 3.879397844044127e-05, "loss": 1.0731, "step": 260300 }, { "epoch": 0.23, "grad_norm": 49.25, "learning_rate": 3.8789479367250346e-05, "loss": 1.0931, "step": 260400 }, { "epoch": 0.23, "grad_norm": 28.0, "learning_rate": 3.878498029405943e-05, "loss": 0.9126, "step": 260500 }, { "epoch": 0.23, "grad_norm": 96.5, "learning_rate": 3.8780481220868504e-05, "loss": 0.9054, "step": 260600 }, { "epoch": 0.23, "grad_norm": 14.625, "learning_rate": 3.877598214767758e-05, "loss": 1.0186, "step": 260700 }, { "epoch": 0.23, "grad_norm": 0.0224609375, "learning_rate": 3.877148307448666e-05, "loss": 0.9594, "step": 260800 }, { "epoch": 0.23, "grad_norm": 17.125, "learning_rate": 3.8766984001295736e-05, "loss": 1.0147, "step": 260900 }, { "epoch": 0.23, "grad_norm": 39.5, "learning_rate": 3.876248492810481e-05, "loss": 0.989, "step": 261000 }, { "epoch": 0.23, "grad_norm": 12.625, "learning_rate": 3.8757985854913894e-05, "loss": 0.9604, "step": 261100 }, { "epoch": 0.23, "grad_norm": 0.2392578125, "learning_rate": 3.875348678172297e-05, "loss": 1.0045, "step": 261200 }, { "epoch": 0.23, "grad_norm": 164.0, "learning_rate": 3.8748987708532044e-05, "loss": 1.1207, "step": 261300 }, { "epoch": 0.23, "grad_norm": 79.5, "learning_rate": 3.874448863534112e-05, "loss": 1.0898, "step": 261400 }, { "epoch": 0.23, "grad_norm": 118.5, "learning_rate": 3.8739989562150195e-05, "loss": 0.9858, "step": 261500 }, { "epoch": 0.23, "grad_norm": 0.1171875, "learning_rate": 3.873549048895928e-05, "loss": 1.0268, "step": 261600 }, { "epoch": 0.23, "grad_norm": 58.25, "learning_rate": 3.873099141576835e-05, "loss": 1.0691, "step": 261700 }, { "epoch": 0.23, "grad_norm": 808.0, "learning_rate": 3.872649234257743e-05, "loss": 1.0907, "step": 261800 }, { "epoch": 0.23, "grad_norm": 0.498046875, "learning_rate": 3.872199326938651e-05, "loss": 1.0142, "step": 261900 }, { "epoch": 0.23, "grad_norm": 25.375, "learning_rate": 3.8717494196195585e-05, "loss": 1.1959, "step": 262000 }, { "epoch": 0.23, "grad_norm": 0.55859375, "learning_rate": 3.871299512300466e-05, "loss": 1.0116, "step": 262100 }, { "epoch": 0.23, "grad_norm": 38.75, "learning_rate": 3.870849604981374e-05, "loss": 0.8472, "step": 262200 }, { "epoch": 0.23, "grad_norm": 14.8125, "learning_rate": 3.870399697662282e-05, "loss": 0.999, "step": 262300 }, { "epoch": 0.23, "grad_norm": 12.0625, "learning_rate": 3.86994979034319e-05, "loss": 1.097, "step": 262400 }, { "epoch": 0.23, "grad_norm": 48.0, "learning_rate": 3.8694998830240976e-05, "loss": 0.9909, "step": 262500 }, { "epoch": 0.23, "grad_norm": 49.25, "learning_rate": 3.8690499757050044e-05, "loss": 0.9547, "step": 262600 }, { "epoch": 0.23, "grad_norm": 41.25, "learning_rate": 3.8686000683859126e-05, "loss": 0.8995, "step": 262700 }, { "epoch": 0.23, "grad_norm": 0.007171630859375, "learning_rate": 3.86815016106682e-05, "loss": 1.1877, "step": 262800 }, { "epoch": 0.23, "grad_norm": 20.25, "learning_rate": 3.8677002537477284e-05, "loss": 1.046, "step": 262900 }, { "epoch": 0.23, "grad_norm": 32.25, "learning_rate": 3.867250346428636e-05, "loss": 0.9654, "step": 263000 }, { "epoch": 0.23, "grad_norm": 0.00732421875, "learning_rate": 3.8668004391095434e-05, "loss": 0.9455, "step": 263100 }, { "epoch": 0.23, "grad_norm": 24.375, "learning_rate": 3.8663505317904517e-05, "loss": 1.0026, "step": 263200 }, { "epoch": 0.23, "grad_norm": 21.625, "learning_rate": 3.865900624471359e-05, "loss": 1.0009, "step": 263300 }, { "epoch": 0.23, "grad_norm": 0.1630859375, "learning_rate": 3.865450717152267e-05, "loss": 0.9792, "step": 263400 }, { "epoch": 0.23, "grad_norm": 66.5, "learning_rate": 3.865000809833175e-05, "loss": 1.1434, "step": 263500 }, { "epoch": 0.23, "grad_norm": 16.375, "learning_rate": 3.8645509025140825e-05, "loss": 1.0509, "step": 263600 }, { "epoch": 0.23, "grad_norm": 11.25, "learning_rate": 3.86410099519499e-05, "loss": 0.9607, "step": 263700 }, { "epoch": 0.23, "grad_norm": 43.75, "learning_rate": 3.863651087875898e-05, "loss": 1.1413, "step": 263800 }, { "epoch": 0.24, "grad_norm": 49.75, "learning_rate": 3.863201180556805e-05, "loss": 1.0306, "step": 263900 }, { "epoch": 0.24, "grad_norm": 46.25, "learning_rate": 3.862751273237713e-05, "loss": 1.0876, "step": 264000 }, { "epoch": 0.24, "grad_norm": 75.5, "learning_rate": 3.862301365918621e-05, "loss": 1.1211, "step": 264100 }, { "epoch": 0.24, "grad_norm": 62.5, "learning_rate": 3.8618514585995284e-05, "loss": 1.016, "step": 264200 }, { "epoch": 0.24, "grad_norm": 38.0, "learning_rate": 3.8614015512804366e-05, "loss": 0.8601, "step": 264300 }, { "epoch": 0.24, "grad_norm": 16.5, "learning_rate": 3.860951643961344e-05, "loss": 1.0999, "step": 264400 }, { "epoch": 0.24, "grad_norm": 0.02978515625, "learning_rate": 3.8605017366422516e-05, "loss": 1.0766, "step": 264500 }, { "epoch": 0.24, "grad_norm": 10.5, "learning_rate": 3.86005182932316e-05, "loss": 0.9753, "step": 264600 }, { "epoch": 0.24, "grad_norm": 78.0, "learning_rate": 3.8596019220040674e-05, "loss": 0.9668, "step": 264700 }, { "epoch": 0.24, "grad_norm": 19.25, "learning_rate": 3.859152014684975e-05, "loss": 0.8695, "step": 264800 }, { "epoch": 0.24, "grad_norm": 11.75, "learning_rate": 3.858702107365883e-05, "loss": 1.0599, "step": 264900 }, { "epoch": 0.24, "grad_norm": 22.125, "learning_rate": 3.8582522000467907e-05, "loss": 1.028, "step": 265000 }, { "epoch": 0.24, "grad_norm": 13.5, "learning_rate": 3.857802292727699e-05, "loss": 1.1109, "step": 265100 }, { "epoch": 0.24, "grad_norm": 37.5, "learning_rate": 3.857352385408606e-05, "loss": 0.9585, "step": 265200 }, { "epoch": 0.24, "grad_norm": 108.0, "learning_rate": 3.856902478089513e-05, "loss": 0.8214, "step": 265300 }, { "epoch": 0.24, "grad_norm": 63.0, "learning_rate": 3.8564525707704215e-05, "loss": 1.0191, "step": 265400 }, { "epoch": 0.24, "grad_norm": 0.05517578125, "learning_rate": 3.856002663451329e-05, "loss": 1.0374, "step": 265500 }, { "epoch": 0.24, "grad_norm": 43.75, "learning_rate": 3.8555527561322365e-05, "loss": 1.1707, "step": 265600 }, { "epoch": 0.24, "grad_norm": 52.25, "learning_rate": 3.855102848813145e-05, "loss": 1.0462, "step": 265700 }, { "epoch": 0.24, "grad_norm": 21.0, "learning_rate": 3.854652941494052e-05, "loss": 1.0787, "step": 265800 }, { "epoch": 0.24, "grad_norm": 21.125, "learning_rate": 3.8542030341749605e-05, "loss": 1.0676, "step": 265900 }, { "epoch": 0.24, "grad_norm": 0.0201416015625, "learning_rate": 3.853753126855868e-05, "loss": 1.0169, "step": 266000 }, { "epoch": 0.24, "grad_norm": 5.4375, "learning_rate": 3.8533032195367756e-05, "loss": 0.9729, "step": 266100 }, { "epoch": 0.24, "grad_norm": 78.5, "learning_rate": 3.852853312217684e-05, "loss": 0.9858, "step": 266200 }, { "epoch": 0.24, "grad_norm": 0.0216064453125, "learning_rate": 3.852403404898591e-05, "loss": 1.1963, "step": 266300 }, { "epoch": 0.24, "grad_norm": 0.040771484375, "learning_rate": 3.851953497579499e-05, "loss": 0.9544, "step": 266400 }, { "epoch": 0.24, "grad_norm": 19.25, "learning_rate": 3.8515035902604064e-05, "loss": 1.0033, "step": 266500 }, { "epoch": 0.24, "grad_norm": 28.0, "learning_rate": 3.851053682941314e-05, "loss": 1.0625, "step": 266600 }, { "epoch": 0.24, "grad_norm": 14.3125, "learning_rate": 3.850603775622222e-05, "loss": 0.873, "step": 266700 }, { "epoch": 0.24, "grad_norm": 11.625, "learning_rate": 3.8501538683031297e-05, "loss": 1.0363, "step": 266800 }, { "epoch": 0.24, "grad_norm": 17.5, "learning_rate": 3.849703960984037e-05, "loss": 1.035, "step": 266900 }, { "epoch": 0.24, "grad_norm": 215.0, "learning_rate": 3.8492540536649454e-05, "loss": 1.0572, "step": 267000 }, { "epoch": 0.24, "grad_norm": 21.875, "learning_rate": 3.848804146345853e-05, "loss": 1.116, "step": 267100 }, { "epoch": 0.24, "grad_norm": 101.0, "learning_rate": 3.8483542390267605e-05, "loss": 1.0954, "step": 267200 }, { "epoch": 0.24, "grad_norm": 1.234375, "learning_rate": 3.847904331707669e-05, "loss": 1.0449, "step": 267300 }, { "epoch": 0.24, "grad_norm": 47.25, "learning_rate": 3.847454424388576e-05, "loss": 1.055, "step": 267400 }, { "epoch": 0.24, "grad_norm": 23.375, "learning_rate": 3.847004517069484e-05, "loss": 0.9842, "step": 267500 }, { "epoch": 0.24, "grad_norm": 20.25, "learning_rate": 3.846554609750392e-05, "loss": 1.1345, "step": 267600 }, { "epoch": 0.24, "grad_norm": 0.189453125, "learning_rate": 3.8461047024312995e-05, "loss": 1.0896, "step": 267700 }, { "epoch": 0.24, "grad_norm": 22.0, "learning_rate": 3.845654795112207e-05, "loss": 0.9783, "step": 267800 }, { "epoch": 0.24, "grad_norm": 23.0, "learning_rate": 3.8452048877931146e-05, "loss": 1.0952, "step": 267900 }, { "epoch": 0.24, "grad_norm": 106.0, "learning_rate": 3.844754980474022e-05, "loss": 1.0655, "step": 268000 }, { "epoch": 0.24, "grad_norm": 38.0, "learning_rate": 3.84430507315493e-05, "loss": 0.9758, "step": 268100 }, { "epoch": 0.24, "grad_norm": 54.5, "learning_rate": 3.843855165835838e-05, "loss": 0.989, "step": 268200 }, { "epoch": 0.24, "grad_norm": 0.0062255859375, "learning_rate": 3.8434052585167454e-05, "loss": 1.0824, "step": 268300 }, { "epoch": 0.24, "grad_norm": 47.75, "learning_rate": 3.8429553511976536e-05, "loss": 0.8245, "step": 268400 }, { "epoch": 0.24, "grad_norm": 0.12255859375, "learning_rate": 3.842505443878561e-05, "loss": 0.8582, "step": 268500 }, { "epoch": 0.24, "grad_norm": 0.0269775390625, "learning_rate": 3.842055536559469e-05, "loss": 0.9881, "step": 268600 }, { "epoch": 0.24, "grad_norm": 93.0, "learning_rate": 3.841605629240377e-05, "loss": 0.9859, "step": 268700 }, { "epoch": 0.24, "grad_norm": 10.125, "learning_rate": 3.8411557219212844e-05, "loss": 0.9935, "step": 268800 }, { "epoch": 0.24, "grad_norm": 0.158203125, "learning_rate": 3.8407058146021926e-05, "loss": 0.8388, "step": 268900 }, { "epoch": 0.24, "grad_norm": 35.0, "learning_rate": 3.8402559072831e-05, "loss": 1.0776, "step": 269000 }, { "epoch": 0.24, "grad_norm": 53.75, "learning_rate": 3.839805999964008e-05, "loss": 1.0681, "step": 269100 }, { "epoch": 0.24, "grad_norm": 10.75, "learning_rate": 3.839356092644915e-05, "loss": 0.9243, "step": 269200 }, { "epoch": 0.24, "grad_norm": 14.875, "learning_rate": 3.838906185325823e-05, "loss": 0.9357, "step": 269300 }, { "epoch": 0.24, "grad_norm": 17.875, "learning_rate": 3.838456278006731e-05, "loss": 1.041, "step": 269400 }, { "epoch": 0.24, "grad_norm": 35.0, "learning_rate": 3.8380063706876385e-05, "loss": 1.0491, "step": 269500 }, { "epoch": 0.24, "grad_norm": 13.875, "learning_rate": 3.837556463368546e-05, "loss": 0.9161, "step": 269600 }, { "epoch": 0.24, "grad_norm": 41.75, "learning_rate": 3.837106556049454e-05, "loss": 1.0651, "step": 269700 }, { "epoch": 0.24, "grad_norm": 0.234375, "learning_rate": 3.836656648730362e-05, "loss": 1.1058, "step": 269800 }, { "epoch": 0.24, "grad_norm": 0.01171875, "learning_rate": 3.836206741411269e-05, "loss": 0.8572, "step": 269900 }, { "epoch": 0.24, "grad_norm": 66.0, "learning_rate": 3.8357568340921775e-05, "loss": 1.1186, "step": 270000 }, { "epoch": 0.24, "grad_norm": 0.1591796875, "learning_rate": 3.835306926773085e-05, "loss": 0.9725, "step": 270100 }, { "epoch": 0.24, "grad_norm": 25.75, "learning_rate": 3.8348570194539926e-05, "loss": 1.0514, "step": 270200 }, { "epoch": 0.24, "grad_norm": 13.8125, "learning_rate": 3.834407112134901e-05, "loss": 1.0911, "step": 270300 }, { "epoch": 0.24, "grad_norm": 0.005950927734375, "learning_rate": 3.8339572048158077e-05, "loss": 0.9847, "step": 270400 }, { "epoch": 0.24, "grad_norm": 13.6875, "learning_rate": 3.833507297496716e-05, "loss": 0.9347, "step": 270500 }, { "epoch": 0.24, "grad_norm": 17.0, "learning_rate": 3.8330573901776234e-05, "loss": 1.045, "step": 270600 }, { "epoch": 0.24, "grad_norm": 0.189453125, "learning_rate": 3.832607482858531e-05, "loss": 1.0941, "step": 270700 }, { "epoch": 0.24, "grad_norm": 0.1845703125, "learning_rate": 3.832157575539439e-05, "loss": 1.0154, "step": 270800 }, { "epoch": 0.24, "grad_norm": 36.25, "learning_rate": 3.831707668220347e-05, "loss": 0.888, "step": 270900 }, { "epoch": 0.24, "grad_norm": 105.5, "learning_rate": 3.831257760901254e-05, "loss": 0.9389, "step": 271000 }, { "epoch": 0.24, "grad_norm": 178.0, "learning_rate": 3.8308078535821624e-05, "loss": 0.8784, "step": 271100 }, { "epoch": 0.24, "grad_norm": 47.75, "learning_rate": 3.83035794626307e-05, "loss": 0.894, "step": 271200 }, { "epoch": 0.24, "grad_norm": 0.546875, "learning_rate": 3.829908038943978e-05, "loss": 1.119, "step": 271300 }, { "epoch": 0.24, "grad_norm": 34.75, "learning_rate": 3.829458131624886e-05, "loss": 1.0578, "step": 271400 }, { "epoch": 0.24, "grad_norm": 96.0, "learning_rate": 3.829008224305793e-05, "loss": 1.1486, "step": 271500 }, { "epoch": 0.24, "grad_norm": 33.0, "learning_rate": 3.8285583169867015e-05, "loss": 0.8229, "step": 271600 }, { "epoch": 0.24, "grad_norm": 32.25, "learning_rate": 3.828108409667608e-05, "loss": 0.8392, "step": 271700 }, { "epoch": 0.24, "grad_norm": 19.625, "learning_rate": 3.8276585023485165e-05, "loss": 0.9758, "step": 271800 }, { "epoch": 0.24, "grad_norm": 24.625, "learning_rate": 3.827208595029424e-05, "loss": 1.055, "step": 271900 }, { "epoch": 0.24, "grad_norm": 91.0, "learning_rate": 3.8267586877103316e-05, "loss": 0.954, "step": 272000 }, { "epoch": 0.24, "grad_norm": 0.640625, "learning_rate": 3.82630878039124e-05, "loss": 1.0351, "step": 272100 }, { "epoch": 0.24, "grad_norm": 39.75, "learning_rate": 3.825858873072147e-05, "loss": 1.0571, "step": 272200 }, { "epoch": 0.24, "grad_norm": 0.3203125, "learning_rate": 3.825408965753055e-05, "loss": 1.0111, "step": 272300 }, { "epoch": 0.24, "grad_norm": 10.8125, "learning_rate": 3.824959058433963e-05, "loss": 1.0805, "step": 272400 }, { "epoch": 0.24, "grad_norm": 23.375, "learning_rate": 3.8245091511148706e-05, "loss": 0.957, "step": 272500 }, { "epoch": 0.24, "grad_norm": 12.8125, "learning_rate": 3.824059243795778e-05, "loss": 0.9219, "step": 272600 }, { "epoch": 0.24, "grad_norm": 22.625, "learning_rate": 3.8236093364766864e-05, "loss": 1.0054, "step": 272700 }, { "epoch": 0.24, "grad_norm": 166.0, "learning_rate": 3.823159429157594e-05, "loss": 1.0138, "step": 272800 }, { "epoch": 0.24, "grad_norm": 157.0, "learning_rate": 3.8227095218385014e-05, "loss": 1.0631, "step": 272900 }, { "epoch": 0.24, "grad_norm": 38.75, "learning_rate": 3.822259614519409e-05, "loss": 0.9157, "step": 273000 }, { "epoch": 0.24, "grad_norm": 23.625, "learning_rate": 3.8218097072003165e-05, "loss": 0.9601, "step": 273100 }, { "epoch": 0.24, "grad_norm": 0.291015625, "learning_rate": 3.821359799881225e-05, "loss": 0.8231, "step": 273200 }, { "epoch": 0.24, "grad_norm": 29.875, "learning_rate": 3.820909892562132e-05, "loss": 1.0122, "step": 273300 }, { "epoch": 0.24, "grad_norm": 0.09521484375, "learning_rate": 3.82045998524304e-05, "loss": 1.0578, "step": 273400 }, { "epoch": 0.24, "grad_norm": 45.0, "learning_rate": 3.820010077923948e-05, "loss": 0.9441, "step": 273500 }, { "epoch": 0.24, "grad_norm": 139.0, "learning_rate": 3.8195601706048555e-05, "loss": 0.9708, "step": 273600 }, { "epoch": 0.24, "grad_norm": 29.625, "learning_rate": 3.819110263285763e-05, "loss": 1.0925, "step": 273700 }, { "epoch": 0.24, "grad_norm": 30.75, "learning_rate": 3.818660355966671e-05, "loss": 1.1019, "step": 273800 }, { "epoch": 0.24, "grad_norm": 37.0, "learning_rate": 3.818210448647579e-05, "loss": 0.8965, "step": 273900 }, { "epoch": 0.24, "grad_norm": 41.75, "learning_rate": 3.817760541328487e-05, "loss": 0.8904, "step": 274000 }, { "epoch": 0.24, "grad_norm": 108.5, "learning_rate": 3.8173106340093945e-05, "loss": 0.8578, "step": 274100 }, { "epoch": 0.24, "grad_norm": 9.5625, "learning_rate": 3.816860726690302e-05, "loss": 0.9038, "step": 274200 }, { "epoch": 0.24, "grad_norm": 0.00384521484375, "learning_rate": 3.8164108193712096e-05, "loss": 0.9433, "step": 274300 }, { "epoch": 0.24, "grad_norm": 0.12255859375, "learning_rate": 3.815960912052117e-05, "loss": 0.9756, "step": 274400 }, { "epoch": 0.24, "grad_norm": 64.5, "learning_rate": 3.8155110047330254e-05, "loss": 0.9267, "step": 274500 }, { "epoch": 0.24, "grad_norm": 220.0, "learning_rate": 3.815061097413933e-05, "loss": 1.0132, "step": 274600 }, { "epoch": 0.24, "grad_norm": 552.0, "learning_rate": 3.8146111900948404e-05, "loss": 0.937, "step": 274700 }, { "epoch": 0.24, "grad_norm": 7.96875, "learning_rate": 3.8141612827757486e-05, "loss": 1.0196, "step": 274800 }, { "epoch": 0.24, "grad_norm": 0.01904296875, "learning_rate": 3.813711375456656e-05, "loss": 0.9484, "step": 274900 }, { "epoch": 0.24, "grad_norm": 0.08154296875, "learning_rate": 3.813261468137564e-05, "loss": 0.9945, "step": 275000 }, { "epoch": 0.25, "grad_norm": 572.0, "learning_rate": 3.812811560818472e-05, "loss": 1.0876, "step": 275100 }, { "epoch": 0.25, "grad_norm": 1.1953125, "learning_rate": 3.8123616534993795e-05, "loss": 0.8657, "step": 275200 }, { "epoch": 0.25, "grad_norm": 37.75, "learning_rate": 3.811911746180287e-05, "loss": 0.9201, "step": 275300 }, { "epoch": 0.25, "grad_norm": 7.375, "learning_rate": 3.811461838861195e-05, "loss": 1.115, "step": 275400 }, { "epoch": 0.25, "grad_norm": 0.201171875, "learning_rate": 3.811011931542103e-05, "loss": 0.9549, "step": 275500 }, { "epoch": 0.25, "grad_norm": 22.375, "learning_rate": 3.81056202422301e-05, "loss": 0.9049, "step": 275600 }, { "epoch": 0.25, "grad_norm": 1.46875, "learning_rate": 3.810112116903918e-05, "loss": 0.9228, "step": 275700 }, { "epoch": 0.25, "grad_norm": 186.0, "learning_rate": 3.809662209584825e-05, "loss": 0.9166, "step": 275800 }, { "epoch": 0.25, "grad_norm": 67.5, "learning_rate": 3.8092123022657335e-05, "loss": 0.9586, "step": 275900 }, { "epoch": 0.25, "grad_norm": 41.75, "learning_rate": 3.808762394946641e-05, "loss": 0.9645, "step": 276000 }, { "epoch": 0.25, "grad_norm": 22.375, "learning_rate": 3.8083124876275486e-05, "loss": 1.1052, "step": 276100 }, { "epoch": 0.25, "grad_norm": 19.0, "learning_rate": 3.807862580308457e-05, "loss": 0.9588, "step": 276200 }, { "epoch": 0.25, "grad_norm": 22.875, "learning_rate": 3.8074126729893644e-05, "loss": 1.0982, "step": 276300 }, { "epoch": 0.25, "grad_norm": 23.875, "learning_rate": 3.806962765670272e-05, "loss": 0.9041, "step": 276400 }, { "epoch": 0.25, "grad_norm": 188.0, "learning_rate": 3.80651285835118e-05, "loss": 1.0159, "step": 276500 }, { "epoch": 0.25, "grad_norm": 19.25, "learning_rate": 3.8060629510320876e-05, "loss": 1.0217, "step": 276600 }, { "epoch": 0.25, "grad_norm": 129.0, "learning_rate": 3.805613043712996e-05, "loss": 1.0292, "step": 276700 }, { "epoch": 0.25, "grad_norm": 28.0, "learning_rate": 3.805163136393903e-05, "loss": 1.0513, "step": 276800 }, { "epoch": 0.25, "grad_norm": 48.75, "learning_rate": 3.80471322907481e-05, "loss": 0.9502, "step": 276900 }, { "epoch": 0.25, "grad_norm": 28.625, "learning_rate": 3.8042633217557185e-05, "loss": 1.0215, "step": 277000 }, { "epoch": 0.25, "grad_norm": 30.125, "learning_rate": 3.803813414436626e-05, "loss": 1.1013, "step": 277100 }, { "epoch": 0.25, "grad_norm": 44.5, "learning_rate": 3.803363507117534e-05, "loss": 1.0524, "step": 277200 }, { "epoch": 0.25, "grad_norm": 25.5, "learning_rate": 3.802913599798442e-05, "loss": 0.8994, "step": 277300 }, { "epoch": 0.25, "grad_norm": 3.171875, "learning_rate": 3.802463692479349e-05, "loss": 0.9864, "step": 277400 }, { "epoch": 0.25, "grad_norm": 106.0, "learning_rate": 3.8020137851602575e-05, "loss": 1.0491, "step": 277500 }, { "epoch": 0.25, "grad_norm": 33.75, "learning_rate": 3.801563877841165e-05, "loss": 1.0561, "step": 277600 }, { "epoch": 0.25, "grad_norm": 31.75, "learning_rate": 3.8011139705220725e-05, "loss": 0.9155, "step": 277700 }, { "epoch": 0.25, "grad_norm": 37.75, "learning_rate": 3.800664063202981e-05, "loss": 0.8815, "step": 277800 }, { "epoch": 0.25, "grad_norm": 57.75, "learning_rate": 3.800214155883888e-05, "loss": 1.0203, "step": 277900 }, { "epoch": 0.25, "grad_norm": 41.0, "learning_rate": 3.799764248564796e-05, "loss": 0.8779, "step": 278000 }, { "epoch": 0.25, "grad_norm": 0.05908203125, "learning_rate": 3.7993143412457034e-05, "loss": 0.9544, "step": 278100 }, { "epoch": 0.25, "grad_norm": 42.5, "learning_rate": 3.798864433926611e-05, "loss": 0.8568, "step": 278200 }, { "epoch": 0.25, "grad_norm": 32.75, "learning_rate": 3.798414526607519e-05, "loss": 1.054, "step": 278300 }, { "epoch": 0.25, "grad_norm": 53.25, "learning_rate": 3.7979646192884266e-05, "loss": 1.0164, "step": 278400 }, { "epoch": 0.25, "grad_norm": 16.625, "learning_rate": 3.797514711969334e-05, "loss": 0.9564, "step": 278500 }, { "epoch": 0.25, "grad_norm": 59.75, "learning_rate": 3.7970648046502424e-05, "loss": 1.1013, "step": 278600 }, { "epoch": 0.25, "grad_norm": 79.0, "learning_rate": 3.79661489733115e-05, "loss": 1.0964, "step": 278700 }, { "epoch": 0.25, "grad_norm": 63.75, "learning_rate": 3.7961649900120575e-05, "loss": 1.0402, "step": 278800 }, { "epoch": 0.25, "grad_norm": 24.0, "learning_rate": 3.795715082692966e-05, "loss": 1.015, "step": 278900 }, { "epoch": 0.25, "grad_norm": 54.0, "learning_rate": 3.795265175373873e-05, "loss": 0.9332, "step": 279000 }, { "epoch": 0.25, "grad_norm": 2.859375, "learning_rate": 3.794815268054781e-05, "loss": 1.0376, "step": 279100 }, { "epoch": 0.25, "grad_norm": 49.75, "learning_rate": 3.794365360735689e-05, "loss": 1.0458, "step": 279200 }, { "epoch": 0.25, "grad_norm": 24.375, "learning_rate": 3.7939154534165965e-05, "loss": 1.0153, "step": 279300 }, { "epoch": 0.25, "grad_norm": 41.75, "learning_rate": 3.793465546097504e-05, "loss": 1.0531, "step": 279400 }, { "epoch": 0.25, "grad_norm": 40.75, "learning_rate": 3.7930156387784115e-05, "loss": 0.9474, "step": 279500 }, { "epoch": 0.25, "grad_norm": 26.75, "learning_rate": 3.792565731459319e-05, "loss": 1.1138, "step": 279600 }, { "epoch": 0.25, "grad_norm": 28.375, "learning_rate": 3.792115824140227e-05, "loss": 1.1098, "step": 279700 }, { "epoch": 0.25, "grad_norm": 147.0, "learning_rate": 3.791665916821135e-05, "loss": 1.1804, "step": 279800 }, { "epoch": 0.25, "grad_norm": 16.5, "learning_rate": 3.791216009502043e-05, "loss": 0.9355, "step": 279900 }, { "epoch": 0.25, "grad_norm": 38.0, "learning_rate": 3.7907661021829506e-05, "loss": 1.0924, "step": 280000 }, { "epoch": 0.25, "grad_norm": 0.310546875, "learning_rate": 3.790316194863858e-05, "loss": 1.0058, "step": 280100 }, { "epoch": 0.25, "grad_norm": 26.75, "learning_rate": 3.789866287544766e-05, "loss": 1.0419, "step": 280200 }, { "epoch": 0.25, "grad_norm": 176.0, "learning_rate": 3.789416380225674e-05, "loss": 0.8807, "step": 280300 }, { "epoch": 0.25, "grad_norm": 1.2734375, "learning_rate": 3.7889664729065814e-05, "loss": 1.2521, "step": 280400 }, { "epoch": 0.25, "grad_norm": 39.0, "learning_rate": 3.7885165655874896e-05, "loss": 1.0638, "step": 280500 }, { "epoch": 0.25, "grad_norm": 50.75, "learning_rate": 3.788066658268397e-05, "loss": 0.9331, "step": 280600 }, { "epoch": 0.25, "grad_norm": 43.0, "learning_rate": 3.787616750949305e-05, "loss": 1.0003, "step": 280700 }, { "epoch": 0.25, "grad_norm": 24.125, "learning_rate": 3.787166843630212e-05, "loss": 1.0782, "step": 280800 }, { "epoch": 0.25, "grad_norm": 0.036865234375, "learning_rate": 3.78671693631112e-05, "loss": 1.0198, "step": 280900 }, { "epoch": 0.25, "grad_norm": 20.625, "learning_rate": 3.786267028992028e-05, "loss": 1.018, "step": 281000 }, { "epoch": 0.25, "grad_norm": 36.5, "learning_rate": 3.7858171216729355e-05, "loss": 0.9739, "step": 281100 }, { "epoch": 0.25, "grad_norm": 49.25, "learning_rate": 3.785367214353843e-05, "loss": 1.1613, "step": 281200 }, { "epoch": 0.25, "grad_norm": 25.125, "learning_rate": 3.784917307034751e-05, "loss": 0.9868, "step": 281300 }, { "epoch": 0.25, "grad_norm": 134.0, "learning_rate": 3.784467399715659e-05, "loss": 0.9122, "step": 281400 }, { "epoch": 0.25, "grad_norm": 33.25, "learning_rate": 3.784017492396566e-05, "loss": 0.8729, "step": 281500 }, { "epoch": 0.25, "grad_norm": 0.4140625, "learning_rate": 3.7835675850774745e-05, "loss": 1.0995, "step": 281600 }, { "epoch": 0.25, "grad_norm": 25.375, "learning_rate": 3.783117677758382e-05, "loss": 0.9733, "step": 281700 }, { "epoch": 0.25, "grad_norm": 0.1455078125, "learning_rate": 3.7826677704392896e-05, "loss": 1.1197, "step": 281800 }, { "epoch": 0.25, "grad_norm": 11.9375, "learning_rate": 3.782217863120198e-05, "loss": 0.8921, "step": 281900 }, { "epoch": 0.25, "grad_norm": 23.25, "learning_rate": 3.7817679558011046e-05, "loss": 0.9566, "step": 282000 }, { "epoch": 0.25, "grad_norm": 28.5, "learning_rate": 3.781318048482013e-05, "loss": 1.1232, "step": 282100 }, { "epoch": 0.25, "grad_norm": 22.625, "learning_rate": 3.7808681411629204e-05, "loss": 1.1001, "step": 282200 }, { "epoch": 0.25, "grad_norm": 38.0, "learning_rate": 3.780418233843828e-05, "loss": 1.0222, "step": 282300 }, { "epoch": 0.25, "grad_norm": 32.25, "learning_rate": 3.779968326524736e-05, "loss": 0.9127, "step": 282400 }, { "epoch": 0.25, "grad_norm": 127.5, "learning_rate": 3.779518419205644e-05, "loss": 1.1838, "step": 282500 }, { "epoch": 0.25, "grad_norm": 184.0, "learning_rate": 3.779068511886551e-05, "loss": 0.9834, "step": 282600 }, { "epoch": 0.25, "grad_norm": 19.75, "learning_rate": 3.7786186045674594e-05, "loss": 1.097, "step": 282700 }, { "epoch": 0.25, "grad_norm": 0.03662109375, "learning_rate": 3.778168697248367e-05, "loss": 0.9354, "step": 282800 }, { "epoch": 0.25, "grad_norm": 42.75, "learning_rate": 3.777718789929275e-05, "loss": 1.0269, "step": 282900 }, { "epoch": 0.25, "grad_norm": 10.5625, "learning_rate": 3.777268882610183e-05, "loss": 1.0666, "step": 283000 }, { "epoch": 0.25, "grad_norm": 57.0, "learning_rate": 3.77681897529109e-05, "loss": 1.0767, "step": 283100 }, { "epoch": 0.25, "grad_norm": 113.0, "learning_rate": 3.7763690679719984e-05, "loss": 0.9977, "step": 283200 }, { "epoch": 0.25, "grad_norm": 109.0, "learning_rate": 3.775919160652905e-05, "loss": 0.9438, "step": 283300 }, { "epoch": 0.25, "grad_norm": 51.25, "learning_rate": 3.7754692533338135e-05, "loss": 1.0746, "step": 283400 }, { "epoch": 0.25, "grad_norm": 0.07080078125, "learning_rate": 3.775019346014721e-05, "loss": 0.9062, "step": 283500 }, { "epoch": 0.25, "grad_norm": 74.0, "learning_rate": 3.7745694386956286e-05, "loss": 0.9957, "step": 283600 }, { "epoch": 0.25, "grad_norm": 8.375, "learning_rate": 3.774119531376537e-05, "loss": 1.0649, "step": 283700 }, { "epoch": 0.25, "grad_norm": 10.875, "learning_rate": 3.773669624057444e-05, "loss": 0.8601, "step": 283800 }, { "epoch": 0.25, "grad_norm": 60.25, "learning_rate": 3.773219716738352e-05, "loss": 0.9882, "step": 283900 }, { "epoch": 0.25, "grad_norm": 19.125, "learning_rate": 3.77276980941926e-05, "loss": 0.9612, "step": 284000 }, { "epoch": 0.25, "grad_norm": 0.1787109375, "learning_rate": 3.7723199021001676e-05, "loss": 0.8524, "step": 284100 }, { "epoch": 0.25, "grad_norm": 69.5, "learning_rate": 3.771869994781075e-05, "loss": 0.9708, "step": 284200 }, { "epoch": 0.25, "grad_norm": 17.25, "learning_rate": 3.7714200874619833e-05, "loss": 1.0929, "step": 284300 }, { "epoch": 0.25, "grad_norm": 26.875, "learning_rate": 3.770970180142891e-05, "loss": 1.1401, "step": 284400 }, { "epoch": 0.25, "grad_norm": 118.5, "learning_rate": 3.7705202728237984e-05, "loss": 1.1539, "step": 284500 }, { "epoch": 0.25, "grad_norm": 7.28125, "learning_rate": 3.770070365504706e-05, "loss": 0.988, "step": 284600 }, { "epoch": 0.25, "grad_norm": 42.25, "learning_rate": 3.7696204581856135e-05, "loss": 1.018, "step": 284700 }, { "epoch": 0.25, "grad_norm": 26.5, "learning_rate": 3.769170550866522e-05, "loss": 0.9587, "step": 284800 }, { "epoch": 0.25, "grad_norm": 72.5, "learning_rate": 3.768720643547429e-05, "loss": 0.9842, "step": 284900 }, { "epoch": 0.25, "grad_norm": 43.5, "learning_rate": 3.768270736228337e-05, "loss": 1.055, "step": 285000 }, { "epoch": 0.25, "grad_norm": 55.0, "learning_rate": 3.767820828909245e-05, "loss": 0.8903, "step": 285100 }, { "epoch": 0.25, "grad_norm": 60.5, "learning_rate": 3.7673709215901525e-05, "loss": 0.9298, "step": 285200 }, { "epoch": 0.25, "grad_norm": 20.125, "learning_rate": 3.76692101427106e-05, "loss": 0.9636, "step": 285300 }, { "epoch": 0.25, "grad_norm": 18.625, "learning_rate": 3.766471106951968e-05, "loss": 0.9638, "step": 285400 }, { "epoch": 0.25, "grad_norm": 110.0, "learning_rate": 3.766021199632876e-05, "loss": 1.0231, "step": 285500 }, { "epoch": 0.25, "grad_norm": 28.625, "learning_rate": 3.765571292313784e-05, "loss": 1.0215, "step": 285600 }, { "epoch": 0.25, "grad_norm": 24.625, "learning_rate": 3.7651213849946915e-05, "loss": 0.9749, "step": 285700 }, { "epoch": 0.25, "grad_norm": 41.75, "learning_rate": 3.764671477675599e-05, "loss": 1.0126, "step": 285800 }, { "epoch": 0.25, "grad_norm": 17.625, "learning_rate": 3.7642215703565066e-05, "loss": 1.0122, "step": 285900 }, { "epoch": 0.25, "grad_norm": 33.25, "learning_rate": 3.763771663037414e-05, "loss": 0.9669, "step": 286000 }, { "epoch": 0.25, "grad_norm": 12.125, "learning_rate": 3.7633217557183223e-05, "loss": 0.9164, "step": 286100 }, { "epoch": 0.25, "grad_norm": 18.5, "learning_rate": 3.76287184839923e-05, "loss": 1.0143, "step": 286200 }, { "epoch": 0.26, "grad_norm": 18.5, "learning_rate": 3.7624219410801374e-05, "loss": 0.9201, "step": 286300 }, { "epoch": 0.26, "grad_norm": 119.0, "learning_rate": 3.7619720337610456e-05, "loss": 1.0825, "step": 286400 }, { "epoch": 0.26, "grad_norm": 9.1875, "learning_rate": 3.761522126441953e-05, "loss": 1.0808, "step": 286500 }, { "epoch": 0.26, "grad_norm": 70.5, "learning_rate": 3.761072219122861e-05, "loss": 0.8891, "step": 286600 }, { "epoch": 0.26, "grad_norm": 134.0, "learning_rate": 3.760622311803769e-05, "loss": 0.9578, "step": 286700 }, { "epoch": 0.26, "grad_norm": 21.875, "learning_rate": 3.7601724044846764e-05, "loss": 1.0709, "step": 286800 }, { "epoch": 0.26, "grad_norm": 0.020263671875, "learning_rate": 3.759722497165584e-05, "loss": 1.1112, "step": 286900 }, { "epoch": 0.26, "grad_norm": 37.25, "learning_rate": 3.759272589846492e-05, "loss": 1.0595, "step": 287000 }, { "epoch": 0.26, "grad_norm": 42.25, "learning_rate": 3.7588226825274e-05, "loss": 1.0129, "step": 287100 }, { "epoch": 0.26, "grad_norm": 28.5, "learning_rate": 3.758372775208307e-05, "loss": 1.0426, "step": 287200 }, { "epoch": 0.26, "grad_norm": 75.5, "learning_rate": 3.757922867889215e-05, "loss": 0.9792, "step": 287300 }, { "epoch": 0.26, "grad_norm": 0.02392578125, "learning_rate": 3.757472960570122e-05, "loss": 1.0452, "step": 287400 }, { "epoch": 0.26, "grad_norm": 28.0, "learning_rate": 3.7570230532510305e-05, "loss": 1.0643, "step": 287500 }, { "epoch": 0.26, "grad_norm": 30.5, "learning_rate": 3.756573145931938e-05, "loss": 1.0373, "step": 287600 }, { "epoch": 0.26, "grad_norm": 27.875, "learning_rate": 3.7561232386128456e-05, "loss": 0.9409, "step": 287700 }, { "epoch": 0.26, "grad_norm": 0.1416015625, "learning_rate": 3.755673331293754e-05, "loss": 0.9208, "step": 287800 }, { "epoch": 0.26, "grad_norm": 74.0, "learning_rate": 3.755223423974661e-05, "loss": 0.8953, "step": 287900 }, { "epoch": 0.26, "grad_norm": 28.375, "learning_rate": 3.754773516655569e-05, "loss": 0.906, "step": 288000 }, { "epoch": 0.26, "grad_norm": 62.75, "learning_rate": 3.754323609336477e-05, "loss": 1.039, "step": 288100 }, { "epoch": 0.26, "grad_norm": 22.75, "learning_rate": 3.7538737020173846e-05, "loss": 1.0885, "step": 288200 }, { "epoch": 0.26, "grad_norm": 30.0, "learning_rate": 3.753423794698293e-05, "loss": 1.0548, "step": 288300 }, { "epoch": 0.26, "grad_norm": 27.375, "learning_rate": 3.7529738873792004e-05, "loss": 1.1686, "step": 288400 }, { "epoch": 0.26, "grad_norm": 21.125, "learning_rate": 3.752523980060107e-05, "loss": 1.013, "step": 288500 }, { "epoch": 0.26, "grad_norm": 47.25, "learning_rate": 3.7520740727410154e-05, "loss": 0.9353, "step": 288600 }, { "epoch": 0.26, "grad_norm": 0.0164794921875, "learning_rate": 3.751624165421923e-05, "loss": 0.9683, "step": 288700 }, { "epoch": 0.26, "grad_norm": 69.0, "learning_rate": 3.751174258102831e-05, "loss": 0.9861, "step": 288800 }, { "epoch": 0.26, "grad_norm": 268.0, "learning_rate": 3.750724350783739e-05, "loss": 0.9531, "step": 288900 }, { "epoch": 0.26, "grad_norm": 13.9375, "learning_rate": 3.750274443464646e-05, "loss": 1.1006, "step": 289000 }, { "epoch": 0.26, "grad_norm": 21.5, "learning_rate": 3.7498245361455545e-05, "loss": 0.999, "step": 289100 }, { "epoch": 0.26, "grad_norm": 52.25, "learning_rate": 3.749374628826462e-05, "loss": 0.8388, "step": 289200 }, { "epoch": 0.26, "grad_norm": 45.0, "learning_rate": 3.7489247215073695e-05, "loss": 1.055, "step": 289300 }, { "epoch": 0.26, "grad_norm": 42.25, "learning_rate": 3.748474814188278e-05, "loss": 1.066, "step": 289400 }, { "epoch": 0.26, "grad_norm": 51.5, "learning_rate": 3.748024906869185e-05, "loss": 0.9666, "step": 289500 }, { "epoch": 0.26, "grad_norm": 8.3125, "learning_rate": 3.747574999550093e-05, "loss": 1.1302, "step": 289600 }, { "epoch": 0.26, "grad_norm": 14.25, "learning_rate": 3.747125092231001e-05, "loss": 1.0545, "step": 289700 }, { "epoch": 0.26, "grad_norm": 0.006591796875, "learning_rate": 3.746675184911908e-05, "loss": 0.965, "step": 289800 }, { "epoch": 0.26, "grad_norm": 44.0, "learning_rate": 3.746225277592816e-05, "loss": 0.9575, "step": 289900 }, { "epoch": 0.26, "grad_norm": 23.375, "learning_rate": 3.7457753702737236e-05, "loss": 1.1131, "step": 290000 }, { "epoch": 0.26, "grad_norm": 9.5, "learning_rate": 3.745325462954631e-05, "loss": 0.9814, "step": 290100 }, { "epoch": 0.26, "grad_norm": 0.201171875, "learning_rate": 3.7448755556355394e-05, "loss": 0.9384, "step": 290200 }, { "epoch": 0.26, "grad_norm": 25.5, "learning_rate": 3.744425648316447e-05, "loss": 0.9451, "step": 290300 }, { "epoch": 0.26, "grad_norm": 17.25, "learning_rate": 3.7439757409973544e-05, "loss": 1.0421, "step": 290400 }, { "epoch": 0.26, "grad_norm": 48.25, "learning_rate": 3.7435258336782626e-05, "loss": 0.9799, "step": 290500 }, { "epoch": 0.26, "grad_norm": 57.25, "learning_rate": 3.74307592635917e-05, "loss": 0.9903, "step": 290600 }, { "epoch": 0.26, "grad_norm": 22.375, "learning_rate": 3.742626019040078e-05, "loss": 0.9827, "step": 290700 }, { "epoch": 0.26, "grad_norm": 18.5, "learning_rate": 3.742176111720986e-05, "loss": 1.0665, "step": 290800 }, { "epoch": 0.26, "grad_norm": 0.00347900390625, "learning_rate": 3.7417262044018935e-05, "loss": 0.9944, "step": 290900 }, { "epoch": 0.26, "grad_norm": 11.75, "learning_rate": 3.741276297082802e-05, "loss": 0.9932, "step": 291000 }, { "epoch": 0.26, "grad_norm": 10.375, "learning_rate": 3.7408263897637085e-05, "loss": 1.1128, "step": 291100 }, { "epoch": 0.26, "grad_norm": 41.5, "learning_rate": 3.740376482444616e-05, "loss": 1.1565, "step": 291200 }, { "epoch": 0.26, "grad_norm": 12.75, "learning_rate": 3.739926575125524e-05, "loss": 0.9661, "step": 291300 }, { "epoch": 0.26, "grad_norm": 66.5, "learning_rate": 3.739476667806432e-05, "loss": 1.0081, "step": 291400 }, { "epoch": 0.26, "grad_norm": 69.5, "learning_rate": 3.73902676048734e-05, "loss": 1.0014, "step": 291500 }, { "epoch": 0.26, "grad_norm": 70.0, "learning_rate": 3.7385768531682475e-05, "loss": 1.0749, "step": 291600 }, { "epoch": 0.26, "grad_norm": 1.9140625, "learning_rate": 3.738126945849155e-05, "loss": 0.9294, "step": 291700 }, { "epoch": 0.26, "grad_norm": 24.625, "learning_rate": 3.737677038530063e-05, "loss": 1.1287, "step": 291800 }, { "epoch": 0.26, "grad_norm": 22.0, "learning_rate": 3.737227131210971e-05, "loss": 0.9587, "step": 291900 }, { "epoch": 0.26, "grad_norm": 0.62890625, "learning_rate": 3.7367772238918784e-05, "loss": 0.9536, "step": 292000 }, { "epoch": 0.26, "grad_norm": 25.25, "learning_rate": 3.7363273165727866e-05, "loss": 0.9381, "step": 292100 }, { "epoch": 0.26, "grad_norm": 19.375, "learning_rate": 3.735877409253694e-05, "loss": 0.9211, "step": 292200 }, { "epoch": 0.26, "grad_norm": 15.0, "learning_rate": 3.7354275019346016e-05, "loss": 1.0364, "step": 292300 }, { "epoch": 0.26, "grad_norm": 65.0, "learning_rate": 3.734977594615509e-05, "loss": 0.9292, "step": 292400 }, { "epoch": 0.26, "grad_norm": 14.4375, "learning_rate": 3.734527687296417e-05, "loss": 1.1398, "step": 292500 }, { "epoch": 0.26, "grad_norm": 13.5625, "learning_rate": 3.734077779977325e-05, "loss": 1.0559, "step": 292600 }, { "epoch": 0.26, "grad_norm": 0.0068359375, "learning_rate": 3.7336278726582325e-05, "loss": 1.0671, "step": 292700 }, { "epoch": 0.26, "grad_norm": 9.3125, "learning_rate": 3.73317796533914e-05, "loss": 0.9718, "step": 292800 }, { "epoch": 0.26, "grad_norm": 76.0, "learning_rate": 3.732728058020048e-05, "loss": 1.1268, "step": 292900 }, { "epoch": 0.26, "grad_norm": 13.8125, "learning_rate": 3.732278150700956e-05, "loss": 1.0402, "step": 293000 }, { "epoch": 0.26, "grad_norm": 184.0, "learning_rate": 3.731828243381863e-05, "loss": 1.0138, "step": 293100 }, { "epoch": 0.26, "grad_norm": 25.25, "learning_rate": 3.7313783360627715e-05, "loss": 1.0434, "step": 293200 }, { "epoch": 0.26, "grad_norm": 23.375, "learning_rate": 3.730928428743679e-05, "loss": 0.9349, "step": 293300 }, { "epoch": 0.26, "grad_norm": 42.75, "learning_rate": 3.7304785214245865e-05, "loss": 1.0041, "step": 293400 }, { "epoch": 0.26, "grad_norm": 848.0, "learning_rate": 3.730028614105495e-05, "loss": 0.9763, "step": 293500 }, { "epoch": 0.26, "grad_norm": 59.25, "learning_rate": 3.729578706786402e-05, "loss": 1.0238, "step": 293600 }, { "epoch": 0.26, "grad_norm": 6.46875, "learning_rate": 3.72912879946731e-05, "loss": 1.011, "step": 293700 }, { "epoch": 0.26, "grad_norm": 47.75, "learning_rate": 3.7286788921482174e-05, "loss": 0.996, "step": 293800 }, { "epoch": 0.26, "grad_norm": 27.625, "learning_rate": 3.728228984829125e-05, "loss": 1.0576, "step": 293900 }, { "epoch": 0.26, "grad_norm": 86.0, "learning_rate": 3.727779077510033e-05, "loss": 0.9976, "step": 294000 }, { "epoch": 0.26, "grad_norm": 98.0, "learning_rate": 3.7273291701909406e-05, "loss": 0.9828, "step": 294100 }, { "epoch": 0.26, "grad_norm": 40.5, "learning_rate": 3.726879262871849e-05, "loss": 0.9299, "step": 294200 }, { "epoch": 0.26, "grad_norm": 7.15625, "learning_rate": 3.7264293555527564e-05, "loss": 0.9068, "step": 294300 }, { "epoch": 0.26, "grad_norm": 12.1875, "learning_rate": 3.725979448233664e-05, "loss": 1.1035, "step": 294400 }, { "epoch": 0.26, "grad_norm": 159.0, "learning_rate": 3.725529540914572e-05, "loss": 1.0225, "step": 294500 }, { "epoch": 0.26, "grad_norm": 296.0, "learning_rate": 3.72507963359548e-05, "loss": 1.0048, "step": 294600 }, { "epoch": 0.26, "grad_norm": 33.5, "learning_rate": 3.724629726276387e-05, "loss": 1.0528, "step": 294700 }, { "epoch": 0.26, "grad_norm": 74.0, "learning_rate": 3.7241798189572954e-05, "loss": 1.0722, "step": 294800 }, { "epoch": 0.26, "grad_norm": 0.443359375, "learning_rate": 3.723729911638203e-05, "loss": 0.9493, "step": 294900 }, { "epoch": 0.26, "grad_norm": 33.25, "learning_rate": 3.7232800043191105e-05, "loss": 0.9789, "step": 295000 }, { "epoch": 0.26, "grad_norm": 114.5, "learning_rate": 3.722830097000018e-05, "loss": 0.944, "step": 295100 }, { "epoch": 0.26, "grad_norm": 104.0, "learning_rate": 3.7223801896809255e-05, "loss": 1.1147, "step": 295200 }, { "epoch": 0.26, "grad_norm": 0.2578125, "learning_rate": 3.721930282361834e-05, "loss": 1.0557, "step": 295300 }, { "epoch": 0.26, "grad_norm": 24.25, "learning_rate": 3.721480375042741e-05, "loss": 0.9894, "step": 295400 }, { "epoch": 0.26, "grad_norm": 340.0, "learning_rate": 3.721030467723649e-05, "loss": 0.9389, "step": 295500 }, { "epoch": 0.26, "grad_norm": 0.0791015625, "learning_rate": 3.720580560404557e-05, "loss": 1.0486, "step": 295600 }, { "epoch": 0.26, "grad_norm": 26.5, "learning_rate": 3.7201306530854646e-05, "loss": 1.0906, "step": 295700 }, { "epoch": 0.26, "grad_norm": 34.0, "learning_rate": 3.719680745766372e-05, "loss": 1.0111, "step": 295800 }, { "epoch": 0.26, "grad_norm": 18.5, "learning_rate": 3.71923083844728e-05, "loss": 1.0953, "step": 295900 }, { "epoch": 0.26, "grad_norm": 19.625, "learning_rate": 3.718780931128188e-05, "loss": 0.9447, "step": 296000 }, { "epoch": 0.26, "grad_norm": 18.25, "learning_rate": 3.7183310238090954e-05, "loss": 0.9725, "step": 296100 }, { "epoch": 0.26, "grad_norm": 23.875, "learning_rate": 3.7178811164900036e-05, "loss": 0.9777, "step": 296200 }, { "epoch": 0.26, "grad_norm": 9.4375, "learning_rate": 3.7174312091709105e-05, "loss": 0.8518, "step": 296300 }, { "epoch": 0.26, "grad_norm": 1472.0, "learning_rate": 3.716981301851819e-05, "loss": 1.0447, "step": 296400 }, { "epoch": 0.26, "grad_norm": 37.5, "learning_rate": 3.716531394532726e-05, "loss": 0.9507, "step": 296500 }, { "epoch": 0.26, "grad_norm": 12.0, "learning_rate": 3.716081487213634e-05, "loss": 1.1126, "step": 296600 }, { "epoch": 0.26, "grad_norm": 21.0, "learning_rate": 3.715631579894542e-05, "loss": 1.0372, "step": 296700 }, { "epoch": 0.26, "grad_norm": 11.6875, "learning_rate": 3.7151816725754495e-05, "loss": 0.9917, "step": 296800 }, { "epoch": 0.26, "grad_norm": 59.25, "learning_rate": 3.714731765256358e-05, "loss": 0.9659, "step": 296900 }, { "epoch": 0.26, "grad_norm": 125.5, "learning_rate": 3.714281857937265e-05, "loss": 1.0493, "step": 297000 }, { "epoch": 0.26, "grad_norm": 0.498046875, "learning_rate": 3.713831950618173e-05, "loss": 1.0106, "step": 297100 }, { "epoch": 0.26, "grad_norm": 26.25, "learning_rate": 3.713382043299081e-05, "loss": 0.9621, "step": 297200 }, { "epoch": 0.26, "grad_norm": 51.0, "learning_rate": 3.7129321359799885e-05, "loss": 0.9449, "step": 297300 }, { "epoch": 0.26, "grad_norm": 124.0, "learning_rate": 3.712482228660896e-05, "loss": 1.129, "step": 297400 }, { "epoch": 0.27, "grad_norm": 46.5, "learning_rate": 3.712032321341804e-05, "loss": 0.9938, "step": 297500 }, { "epoch": 0.27, "grad_norm": 29.625, "learning_rate": 3.711582414022711e-05, "loss": 1.0287, "step": 297600 }, { "epoch": 0.27, "grad_norm": 16.375, "learning_rate": 3.711132506703619e-05, "loss": 1.153, "step": 297700 }, { "epoch": 0.27, "grad_norm": 99.5, "learning_rate": 3.710682599384527e-05, "loss": 1.0182, "step": 297800 }, { "epoch": 0.27, "grad_norm": 17.75, "learning_rate": 3.7102326920654344e-05, "loss": 0.9651, "step": 297900 }, { "epoch": 0.27, "grad_norm": 10.1875, "learning_rate": 3.7097827847463426e-05, "loss": 0.8857, "step": 298000 }, { "epoch": 0.27, "grad_norm": 26.375, "learning_rate": 3.70933287742725e-05, "loss": 1.1251, "step": 298100 }, { "epoch": 0.27, "grad_norm": 21.625, "learning_rate": 3.708882970108158e-05, "loss": 0.9032, "step": 298200 }, { "epoch": 0.27, "grad_norm": 87.0, "learning_rate": 3.708433062789066e-05, "loss": 1.0085, "step": 298300 }, { "epoch": 0.27, "grad_norm": 37.5, "learning_rate": 3.7079831554699734e-05, "loss": 0.9449, "step": 298400 }, { "epoch": 0.27, "grad_norm": 13.9375, "learning_rate": 3.707533248150881e-05, "loss": 0.9516, "step": 298500 }, { "epoch": 0.27, "grad_norm": 6.875, "learning_rate": 3.707083340831789e-05, "loss": 1.0265, "step": 298600 }, { "epoch": 0.27, "grad_norm": 372.0, "learning_rate": 3.706633433512697e-05, "loss": 0.9236, "step": 298700 }, { "epoch": 0.27, "grad_norm": 63.0, "learning_rate": 3.706183526193604e-05, "loss": 0.9214, "step": 298800 }, { "epoch": 0.27, "grad_norm": 94.5, "learning_rate": 3.705733618874512e-05, "loss": 1.0358, "step": 298900 }, { "epoch": 0.27, "grad_norm": 62.75, "learning_rate": 3.705283711555419e-05, "loss": 0.9155, "step": 299000 }, { "epoch": 0.27, "grad_norm": 73.5, "learning_rate": 3.7048338042363275e-05, "loss": 1.0036, "step": 299100 }, { "epoch": 0.27, "grad_norm": 26.375, "learning_rate": 3.704383896917235e-05, "loss": 1.0941, "step": 299200 }, { "epoch": 0.27, "grad_norm": 15.8125, "learning_rate": 3.7039339895981426e-05, "loss": 1.0197, "step": 299300 }, { "epoch": 0.27, "grad_norm": 0.00860595703125, "learning_rate": 3.703484082279051e-05, "loss": 0.9353, "step": 299400 }, { "epoch": 0.27, "grad_norm": 13.375, "learning_rate": 3.703034174959958e-05, "loss": 0.9616, "step": 299500 }, { "epoch": 0.27, "grad_norm": 22.0, "learning_rate": 3.702584267640866e-05, "loss": 1.1224, "step": 299600 }, { "epoch": 0.27, "grad_norm": 0.251953125, "learning_rate": 3.702134360321774e-05, "loss": 0.9859, "step": 299700 }, { "epoch": 0.27, "grad_norm": 20.875, "learning_rate": 3.7016844530026816e-05, "loss": 1.0369, "step": 299800 }, { "epoch": 0.27, "grad_norm": 0.1796875, "learning_rate": 3.70123454568359e-05, "loss": 0.9392, "step": 299900 }, { "epoch": 0.27, "grad_norm": 2.734375, "learning_rate": 3.7007846383644973e-05, "loss": 1.1231, "step": 300000 }, { "epoch": 0.27, "grad_norm": 73.0, "learning_rate": 3.700334731045405e-05, "loss": 1.1582, "step": 300100 }, { "epoch": 0.27, "grad_norm": 0.1923828125, "learning_rate": 3.6998848237263124e-05, "loss": 0.9658, "step": 300200 }, { "epoch": 0.27, "grad_norm": 27.25, "learning_rate": 3.69943491640722e-05, "loss": 0.8286, "step": 300300 }, { "epoch": 0.27, "grad_norm": 96.0, "learning_rate": 3.698985009088128e-05, "loss": 0.8997, "step": 300400 }, { "epoch": 0.27, "grad_norm": 0.1650390625, "learning_rate": 3.698535101769036e-05, "loss": 1.0215, "step": 300500 }, { "epoch": 0.27, "grad_norm": 31.5, "learning_rate": 3.698085194449943e-05, "loss": 0.9601, "step": 300600 }, { "epoch": 0.27, "grad_norm": 0.09033203125, "learning_rate": 3.6976352871308514e-05, "loss": 1.0356, "step": 300700 }, { "epoch": 0.27, "grad_norm": 156.0, "learning_rate": 3.697185379811759e-05, "loss": 1.0337, "step": 300800 }, { "epoch": 0.27, "grad_norm": 14.0, "learning_rate": 3.6967354724926665e-05, "loss": 1.0733, "step": 300900 }, { "epoch": 0.27, "grad_norm": 90.0, "learning_rate": 3.696285565173575e-05, "loss": 0.9862, "step": 301000 }, { "epoch": 0.27, "grad_norm": 33.75, "learning_rate": 3.695835657854482e-05, "loss": 1.0186, "step": 301100 }, { "epoch": 0.27, "grad_norm": 0.08349609375, "learning_rate": 3.69538575053539e-05, "loss": 1.0127, "step": 301200 }, { "epoch": 0.27, "grad_norm": 122.0, "learning_rate": 3.694935843216298e-05, "loss": 1.01, "step": 301300 }, { "epoch": 0.27, "grad_norm": 14.5625, "learning_rate": 3.6944859358972055e-05, "loss": 0.96, "step": 301400 }, { "epoch": 0.27, "grad_norm": 5.46875, "learning_rate": 3.694036028578113e-05, "loss": 1.0052, "step": 301500 }, { "epoch": 0.27, "grad_norm": 0.6796875, "learning_rate": 3.6935861212590206e-05, "loss": 1.0166, "step": 301600 }, { "epoch": 0.27, "grad_norm": 150.0, "learning_rate": 3.693136213939928e-05, "loss": 0.9057, "step": 301700 }, { "epoch": 0.27, "grad_norm": 15.3125, "learning_rate": 3.6926863066208363e-05, "loss": 1.0006, "step": 301800 }, { "epoch": 0.27, "grad_norm": 14.625, "learning_rate": 3.692236399301744e-05, "loss": 1.0435, "step": 301900 }, { "epoch": 0.27, "grad_norm": 64.5, "learning_rate": 3.6917864919826514e-05, "loss": 1.0164, "step": 302000 }, { "epoch": 0.27, "grad_norm": 4.3125, "learning_rate": 3.6913365846635596e-05, "loss": 0.9632, "step": 302100 }, { "epoch": 0.27, "grad_norm": 28.125, "learning_rate": 3.690886677344467e-05, "loss": 1.0699, "step": 302200 }, { "epoch": 0.27, "grad_norm": 20.0, "learning_rate": 3.690436770025375e-05, "loss": 1.0983, "step": 302300 }, { "epoch": 0.27, "grad_norm": 10.625, "learning_rate": 3.689986862706283e-05, "loss": 0.9954, "step": 302400 }, { "epoch": 0.27, "grad_norm": 50.25, "learning_rate": 3.6895369553871904e-05, "loss": 1.1556, "step": 302500 }, { "epoch": 0.27, "grad_norm": 0.037109375, "learning_rate": 3.6890870480680986e-05, "loss": 0.867, "step": 302600 }, { "epoch": 0.27, "grad_norm": 218.0, "learning_rate": 3.688637140749006e-05, "loss": 1.0612, "step": 302700 }, { "epoch": 0.27, "grad_norm": 0.080078125, "learning_rate": 3.688187233429913e-05, "loss": 0.9617, "step": 302800 }, { "epoch": 0.27, "grad_norm": 18.625, "learning_rate": 3.687737326110821e-05, "loss": 0.9486, "step": 302900 }, { "epoch": 0.27, "grad_norm": 46.25, "learning_rate": 3.687287418791729e-05, "loss": 0.9948, "step": 303000 }, { "epoch": 0.27, "grad_norm": 6.5625, "learning_rate": 3.686837511472637e-05, "loss": 1.0681, "step": 303100 }, { "epoch": 0.27, "grad_norm": 21.375, "learning_rate": 3.6863876041535445e-05, "loss": 1.0904, "step": 303200 }, { "epoch": 0.27, "grad_norm": 0.046142578125, "learning_rate": 3.685937696834452e-05, "loss": 1.1284, "step": 303300 }, { "epoch": 0.27, "grad_norm": 20.5, "learning_rate": 3.68548778951536e-05, "loss": 1.0434, "step": 303400 }, { "epoch": 0.27, "grad_norm": 50.0, "learning_rate": 3.685037882196268e-05, "loss": 1.1216, "step": 303500 }, { "epoch": 0.27, "grad_norm": 18.875, "learning_rate": 3.6845879748771753e-05, "loss": 1.0383, "step": 303600 }, { "epoch": 0.27, "grad_norm": 0.19921875, "learning_rate": 3.6841380675580836e-05, "loss": 0.8577, "step": 303700 }, { "epoch": 0.27, "grad_norm": 284.0, "learning_rate": 3.683688160238991e-05, "loss": 1.0803, "step": 303800 }, { "epoch": 0.27, "grad_norm": 52.75, "learning_rate": 3.6832382529198986e-05, "loss": 0.8806, "step": 303900 }, { "epoch": 0.27, "grad_norm": 27.125, "learning_rate": 3.682788345600807e-05, "loss": 0.9574, "step": 304000 }, { "epoch": 0.27, "grad_norm": 482.0, "learning_rate": 3.682338438281714e-05, "loss": 1.1519, "step": 304100 }, { "epoch": 0.27, "grad_norm": 49.5, "learning_rate": 3.681888530962622e-05, "loss": 0.9249, "step": 304200 }, { "epoch": 0.27, "grad_norm": 17.25, "learning_rate": 3.6814386236435294e-05, "loss": 1.0271, "step": 304300 }, { "epoch": 0.27, "grad_norm": 0.0390625, "learning_rate": 3.680988716324437e-05, "loss": 1.0063, "step": 304400 }, { "epoch": 0.27, "grad_norm": 11.1875, "learning_rate": 3.680538809005345e-05, "loss": 1.0237, "step": 304500 }, { "epoch": 0.27, "grad_norm": 35.5, "learning_rate": 3.680088901686253e-05, "loss": 1.0015, "step": 304600 }, { "epoch": 0.27, "grad_norm": 0.01397705078125, "learning_rate": 3.67963899436716e-05, "loss": 1.0098, "step": 304700 }, { "epoch": 0.27, "grad_norm": 19.25, "learning_rate": 3.6791890870480685e-05, "loss": 1.1084, "step": 304800 }, { "epoch": 0.27, "grad_norm": 42.25, "learning_rate": 3.678739179728976e-05, "loss": 1.0012, "step": 304900 }, { "epoch": 0.27, "grad_norm": 37.25, "learning_rate": 3.6782892724098835e-05, "loss": 0.9744, "step": 305000 }, { "epoch": 0.27, "grad_norm": 34.5, "learning_rate": 3.677839365090792e-05, "loss": 1.079, "step": 305100 }, { "epoch": 0.27, "grad_norm": 0.08203125, "learning_rate": 3.677389457771699e-05, "loss": 1.0196, "step": 305200 }, { "epoch": 0.27, "grad_norm": 940.0, "learning_rate": 3.6769395504526075e-05, "loss": 1.1222, "step": 305300 }, { "epoch": 0.27, "grad_norm": 38.25, "learning_rate": 3.6764896431335143e-05, "loss": 0.8667, "step": 305400 }, { "epoch": 0.27, "grad_norm": 11.875, "learning_rate": 3.676039735814422e-05, "loss": 0.9969, "step": 305500 }, { "epoch": 0.27, "grad_norm": 59.0, "learning_rate": 3.67558982849533e-05, "loss": 0.9233, "step": 305600 }, { "epoch": 0.27, "grad_norm": 1.2578125, "learning_rate": 3.6751399211762376e-05, "loss": 1.0392, "step": 305700 }, { "epoch": 0.27, "grad_norm": 48.75, "learning_rate": 3.674690013857146e-05, "loss": 1.0305, "step": 305800 }, { "epoch": 0.27, "grad_norm": 21.625, "learning_rate": 3.6742401065380534e-05, "loss": 0.9824, "step": 305900 }, { "epoch": 0.27, "grad_norm": 26.75, "learning_rate": 3.673790199218961e-05, "loss": 1.1893, "step": 306000 }, { "epoch": 0.27, "grad_norm": 8.0, "learning_rate": 3.673340291899869e-05, "loss": 0.8237, "step": 306100 }, { "epoch": 0.27, "grad_norm": 43.75, "learning_rate": 3.6728903845807766e-05, "loss": 0.9386, "step": 306200 }, { "epoch": 0.27, "grad_norm": 11.5, "learning_rate": 3.672440477261684e-05, "loss": 0.9905, "step": 306300 }, { "epoch": 0.27, "grad_norm": 30.5, "learning_rate": 3.6719905699425924e-05, "loss": 0.9236, "step": 306400 }, { "epoch": 0.27, "grad_norm": 0.0120849609375, "learning_rate": 3.6715406626235e-05, "loss": 0.9911, "step": 306500 }, { "epoch": 0.27, "grad_norm": 9.8125, "learning_rate": 3.6710907553044075e-05, "loss": 1.0119, "step": 306600 }, { "epoch": 0.27, "grad_norm": 33.75, "learning_rate": 3.670640847985315e-05, "loss": 1.0005, "step": 306700 }, { "epoch": 0.27, "grad_norm": 173.0, "learning_rate": 3.6701909406662225e-05, "loss": 1.0375, "step": 306800 }, { "epoch": 0.27, "grad_norm": 13.9375, "learning_rate": 3.669741033347131e-05, "loss": 0.9426, "step": 306900 }, { "epoch": 0.27, "grad_norm": 23.875, "learning_rate": 3.669291126028038e-05, "loss": 1.0287, "step": 307000 }, { "epoch": 0.27, "grad_norm": 21.75, "learning_rate": 3.668841218708946e-05, "loss": 1.0525, "step": 307100 }, { "epoch": 0.27, "grad_norm": 0.0849609375, "learning_rate": 3.668391311389854e-05, "loss": 0.9764, "step": 307200 }, { "epoch": 0.27, "grad_norm": 108.0, "learning_rate": 3.6679414040707616e-05, "loss": 1.0604, "step": 307300 }, { "epoch": 0.27, "grad_norm": 140.0, "learning_rate": 3.667491496751669e-05, "loss": 1.0525, "step": 307400 }, { "epoch": 0.27, "grad_norm": 18.75, "learning_rate": 3.667041589432577e-05, "loss": 1.0677, "step": 307500 }, { "epoch": 0.27, "grad_norm": 0.48046875, "learning_rate": 3.666591682113485e-05, "loss": 0.9733, "step": 307600 }, { "epoch": 0.27, "grad_norm": 31.875, "learning_rate": 3.6661417747943924e-05, "loss": 0.9332, "step": 307700 }, { "epoch": 0.27, "grad_norm": 26.625, "learning_rate": 3.6656918674753006e-05, "loss": 1.0315, "step": 307800 }, { "epoch": 0.27, "grad_norm": 31.25, "learning_rate": 3.665241960156208e-05, "loss": 0.9553, "step": 307900 }, { "epoch": 0.27, "grad_norm": 506.0, "learning_rate": 3.6647920528371156e-05, "loss": 1.0836, "step": 308000 }, { "epoch": 0.27, "grad_norm": 12.0, "learning_rate": 3.664342145518023e-05, "loss": 0.9197, "step": 308100 }, { "epoch": 0.27, "grad_norm": 25.875, "learning_rate": 3.663892238198931e-05, "loss": 1.0806, "step": 308200 }, { "epoch": 0.27, "grad_norm": 33.0, "learning_rate": 3.663442330879839e-05, "loss": 1.0173, "step": 308300 }, { "epoch": 0.27, "grad_norm": 0.080078125, "learning_rate": 3.6629924235607465e-05, "loss": 0.9192, "step": 308400 }, { "epoch": 0.27, "grad_norm": 110.5, "learning_rate": 3.662542516241655e-05, "loss": 0.9926, "step": 308500 }, { "epoch": 0.27, "grad_norm": 50.75, "learning_rate": 3.662092608922562e-05, "loss": 1.0538, "step": 308600 }, { "epoch": 0.27, "grad_norm": 25.75, "learning_rate": 3.66164270160347e-05, "loss": 1.0892, "step": 308700 }, { "epoch": 0.28, "grad_norm": 116.5, "learning_rate": 3.661192794284378e-05, "loss": 1.0033, "step": 308800 }, { "epoch": 0.28, "grad_norm": 46.75, "learning_rate": 3.6607428869652855e-05, "loss": 1.0094, "step": 308900 }, { "epoch": 0.28, "grad_norm": 87.0, "learning_rate": 3.660292979646193e-05, "loss": 0.9548, "step": 309000 }, { "epoch": 0.28, "grad_norm": 0.1630859375, "learning_rate": 3.659843072327101e-05, "loss": 1.1314, "step": 309100 }, { "epoch": 0.28, "grad_norm": 43.25, "learning_rate": 3.659393165008009e-05, "loss": 0.9092, "step": 309200 }, { "epoch": 0.28, "grad_norm": 20.375, "learning_rate": 3.658943257688916e-05, "loss": 0.9552, "step": 309300 }, { "epoch": 0.28, "grad_norm": 17.25, "learning_rate": 3.658493350369824e-05, "loss": 1.1365, "step": 309400 }, { "epoch": 0.28, "grad_norm": 18.625, "learning_rate": 3.6580434430507314e-05, "loss": 0.9769, "step": 309500 }, { "epoch": 0.28, "grad_norm": 2.515625, "learning_rate": 3.6575935357316396e-05, "loss": 1.0819, "step": 309600 }, { "epoch": 0.28, "grad_norm": 26.625, "learning_rate": 3.657143628412547e-05, "loss": 0.9747, "step": 309700 }, { "epoch": 0.28, "grad_norm": 14.25, "learning_rate": 3.6566937210934546e-05, "loss": 1.068, "step": 309800 }, { "epoch": 0.28, "grad_norm": 0.28125, "learning_rate": 3.656243813774363e-05, "loss": 0.9789, "step": 309900 }, { "epoch": 0.28, "grad_norm": 87.0, "learning_rate": 3.6557939064552704e-05, "loss": 1.0288, "step": 310000 }, { "epoch": 0.28, "grad_norm": 65.5, "learning_rate": 3.655343999136178e-05, "loss": 0.9174, "step": 310100 }, { "epoch": 0.28, "grad_norm": 41.0, "learning_rate": 3.654894091817086e-05, "loss": 1.0209, "step": 310200 }, { "epoch": 0.28, "grad_norm": 31.875, "learning_rate": 3.654444184497994e-05, "loss": 0.975, "step": 310300 }, { "epoch": 0.28, "grad_norm": 0.0174560546875, "learning_rate": 3.653994277178901e-05, "loss": 0.9641, "step": 310400 }, { "epoch": 0.28, "grad_norm": 25.0, "learning_rate": 3.6535443698598094e-05, "loss": 1.0333, "step": 310500 }, { "epoch": 0.28, "grad_norm": 32.25, "learning_rate": 3.653094462540716e-05, "loss": 0.9952, "step": 310600 }, { "epoch": 0.28, "grad_norm": 26.875, "learning_rate": 3.6526445552216245e-05, "loss": 1.0131, "step": 310700 }, { "epoch": 0.28, "grad_norm": 71.5, "learning_rate": 3.652194647902532e-05, "loss": 1.0748, "step": 310800 }, { "epoch": 0.28, "grad_norm": 34.75, "learning_rate": 3.6517447405834396e-05, "loss": 0.9656, "step": 310900 }, { "epoch": 0.28, "grad_norm": 49.0, "learning_rate": 3.651294833264348e-05, "loss": 1.0691, "step": 311000 }, { "epoch": 0.28, "grad_norm": 23.25, "learning_rate": 3.650844925945255e-05, "loss": 1.0165, "step": 311100 }, { "epoch": 0.28, "grad_norm": 14.625, "learning_rate": 3.6503950186261635e-05, "loss": 1.1299, "step": 311200 }, { "epoch": 0.28, "grad_norm": 0.0126953125, "learning_rate": 3.649945111307071e-05, "loss": 0.9627, "step": 311300 }, { "epoch": 0.28, "grad_norm": 107.0, "learning_rate": 3.6494952039879786e-05, "loss": 1.0488, "step": 311400 }, { "epoch": 0.28, "grad_norm": 77.0, "learning_rate": 3.649045296668887e-05, "loss": 1.0094, "step": 311500 }, { "epoch": 0.28, "grad_norm": 80.5, "learning_rate": 3.648595389349794e-05, "loss": 1.0298, "step": 311600 }, { "epoch": 0.28, "grad_norm": 0.44921875, "learning_rate": 3.648145482030702e-05, "loss": 0.9935, "step": 311700 }, { "epoch": 0.28, "grad_norm": 39.5, "learning_rate": 3.64769557471161e-05, "loss": 1.0381, "step": 311800 }, { "epoch": 0.28, "grad_norm": 29.625, "learning_rate": 3.647245667392517e-05, "loss": 0.9743, "step": 311900 }, { "epoch": 0.28, "grad_norm": 34.0, "learning_rate": 3.646795760073425e-05, "loss": 1.0455, "step": 312000 }, { "epoch": 0.28, "grad_norm": 20.875, "learning_rate": 3.646345852754333e-05, "loss": 1.0269, "step": 312100 }, { "epoch": 0.28, "grad_norm": 58.25, "learning_rate": 3.64589594543524e-05, "loss": 1.0804, "step": 312200 }, { "epoch": 0.28, "grad_norm": 90.0, "learning_rate": 3.6454460381161484e-05, "loss": 0.8518, "step": 312300 }, { "epoch": 0.28, "grad_norm": 36.0, "learning_rate": 3.644996130797056e-05, "loss": 1.1012, "step": 312400 }, { "epoch": 0.28, "grad_norm": 31.5, "learning_rate": 3.6445462234779635e-05, "loss": 1.0819, "step": 312500 }, { "epoch": 0.28, "grad_norm": 87.0, "learning_rate": 3.644096316158872e-05, "loss": 1.0311, "step": 312600 }, { "epoch": 0.28, "grad_norm": 110.5, "learning_rate": 3.643646408839779e-05, "loss": 0.984, "step": 312700 }, { "epoch": 0.28, "grad_norm": 0.021240234375, "learning_rate": 3.643196501520687e-05, "loss": 0.9403, "step": 312800 }, { "epoch": 0.28, "grad_norm": 109.5, "learning_rate": 3.642746594201595e-05, "loss": 0.8858, "step": 312900 }, { "epoch": 0.28, "grad_norm": 44.25, "learning_rate": 3.6422966868825025e-05, "loss": 1.057, "step": 313000 }, { "epoch": 0.28, "grad_norm": 45.0, "learning_rate": 3.64184677956341e-05, "loss": 0.9359, "step": 313100 }, { "epoch": 0.28, "grad_norm": 32.0, "learning_rate": 3.6413968722443176e-05, "loss": 1.0052, "step": 313200 }, { "epoch": 0.28, "grad_norm": 27.25, "learning_rate": 3.640946964925225e-05, "loss": 1.1168, "step": 313300 }, { "epoch": 0.28, "grad_norm": 103.5, "learning_rate": 3.640497057606133e-05, "loss": 1.0036, "step": 313400 }, { "epoch": 0.28, "grad_norm": 27.625, "learning_rate": 3.640047150287041e-05, "loss": 1.0273, "step": 313500 }, { "epoch": 0.28, "grad_norm": 53.5, "learning_rate": 3.6395972429679484e-05, "loss": 0.9393, "step": 313600 }, { "epoch": 0.28, "grad_norm": 14.0625, "learning_rate": 3.6391473356488566e-05, "loss": 0.9945, "step": 313700 }, { "epoch": 0.28, "grad_norm": 41.0, "learning_rate": 3.638697428329764e-05, "loss": 0.9459, "step": 313800 }, { "epoch": 0.28, "grad_norm": 0.029296875, "learning_rate": 3.6382475210106723e-05, "loss": 0.8752, "step": 313900 }, { "epoch": 0.28, "grad_norm": 42.75, "learning_rate": 3.63779761369158e-05, "loss": 1.0804, "step": 314000 }, { "epoch": 0.28, "grad_norm": 0.1484375, "learning_rate": 3.6373477063724874e-05, "loss": 1.2319, "step": 314100 }, { "epoch": 0.28, "grad_norm": 23.25, "learning_rate": 3.6368977990533956e-05, "loss": 0.9201, "step": 314200 }, { "epoch": 0.28, "grad_norm": 48.5, "learning_rate": 3.636447891734303e-05, "loss": 0.9143, "step": 314300 }, { "epoch": 0.28, "grad_norm": 37.75, "learning_rate": 3.635997984415211e-05, "loss": 0.8795, "step": 314400 }, { "epoch": 0.28, "grad_norm": 8.875, "learning_rate": 3.635548077096118e-05, "loss": 0.8999, "step": 314500 }, { "epoch": 0.28, "grad_norm": 4.5625, "learning_rate": 3.635098169777026e-05, "loss": 1.011, "step": 314600 }, { "epoch": 0.28, "grad_norm": 0.36328125, "learning_rate": 3.634648262457934e-05, "loss": 1.0106, "step": 314700 }, { "epoch": 0.28, "grad_norm": 23.0, "learning_rate": 3.6341983551388415e-05, "loss": 0.8728, "step": 314800 }, { "epoch": 0.28, "grad_norm": 90.0, "learning_rate": 3.633748447819749e-05, "loss": 1.1142, "step": 314900 }, { "epoch": 0.28, "grad_norm": 0.04150390625, "learning_rate": 3.633298540500657e-05, "loss": 0.9321, "step": 315000 }, { "epoch": 0.28, "grad_norm": 46.0, "learning_rate": 3.632848633181565e-05, "loss": 0.9976, "step": 315100 }, { "epoch": 0.28, "grad_norm": 22.625, "learning_rate": 3.632398725862472e-05, "loss": 0.9879, "step": 315200 }, { "epoch": 0.28, "grad_norm": 24.0, "learning_rate": 3.6319488185433805e-05, "loss": 0.9861, "step": 315300 }, { "epoch": 0.28, "grad_norm": 15.1875, "learning_rate": 3.631498911224288e-05, "loss": 0.9907, "step": 315400 }, { "epoch": 0.28, "grad_norm": 79.0, "learning_rate": 3.6310490039051956e-05, "loss": 1.0425, "step": 315500 }, { "epoch": 0.28, "grad_norm": 34.5, "learning_rate": 3.630599096586104e-05, "loss": 1.135, "step": 315600 }, { "epoch": 0.28, "grad_norm": 211.0, "learning_rate": 3.6301491892670113e-05, "loss": 1.045, "step": 315700 }, { "epoch": 0.28, "grad_norm": 87.0, "learning_rate": 3.629699281947919e-05, "loss": 0.9732, "step": 315800 }, { "epoch": 0.28, "grad_norm": 16.125, "learning_rate": 3.6292493746288264e-05, "loss": 1.1904, "step": 315900 }, { "epoch": 0.28, "grad_norm": 0.54296875, "learning_rate": 3.628799467309734e-05, "loss": 0.9783, "step": 316000 }, { "epoch": 0.28, "grad_norm": 0.6015625, "learning_rate": 3.628349559990642e-05, "loss": 0.9245, "step": 316100 }, { "epoch": 0.28, "grad_norm": 13.0, "learning_rate": 3.62789965267155e-05, "loss": 1.0189, "step": 316200 }, { "epoch": 0.28, "grad_norm": 44.25, "learning_rate": 3.627449745352457e-05, "loss": 0.975, "step": 316300 }, { "epoch": 0.28, "grad_norm": 67.0, "learning_rate": 3.6269998380333654e-05, "loss": 0.871, "step": 316400 }, { "epoch": 0.28, "grad_norm": 15.25, "learning_rate": 3.626549930714273e-05, "loss": 0.9881, "step": 316500 }, { "epoch": 0.28, "grad_norm": 150.0, "learning_rate": 3.6261000233951805e-05, "loss": 1.1318, "step": 316600 }, { "epoch": 0.28, "grad_norm": 12.75, "learning_rate": 3.625650116076089e-05, "loss": 1.1347, "step": 316700 }, { "epoch": 0.28, "grad_norm": 9.4375, "learning_rate": 3.625200208756996e-05, "loss": 0.9635, "step": 316800 }, { "epoch": 0.28, "grad_norm": 16.875, "learning_rate": 3.6247503014379045e-05, "loss": 1.0946, "step": 316900 }, { "epoch": 0.28, "grad_norm": 0.408203125, "learning_rate": 3.624300394118812e-05, "loss": 0.9249, "step": 317000 }, { "epoch": 0.28, "grad_norm": 5.5625, "learning_rate": 3.623850486799719e-05, "loss": 0.8626, "step": 317100 }, { "epoch": 0.28, "grad_norm": 20.0, "learning_rate": 3.623400579480627e-05, "loss": 0.9188, "step": 317200 }, { "epoch": 0.28, "grad_norm": 15.4375, "learning_rate": 3.6229506721615346e-05, "loss": 1.048, "step": 317300 }, { "epoch": 0.28, "grad_norm": 18.625, "learning_rate": 3.622500764842443e-05, "loss": 1.1118, "step": 317400 }, { "epoch": 0.28, "grad_norm": 72.0, "learning_rate": 3.6220508575233503e-05, "loss": 0.9467, "step": 317500 }, { "epoch": 0.28, "grad_norm": 46.75, "learning_rate": 3.621600950204258e-05, "loss": 1.1107, "step": 317600 }, { "epoch": 0.28, "grad_norm": 23.75, "learning_rate": 3.621151042885166e-05, "loss": 1.0043, "step": 317700 }, { "epoch": 0.28, "grad_norm": 28.375, "learning_rate": 3.6207011355660736e-05, "loss": 0.9025, "step": 317800 }, { "epoch": 0.28, "grad_norm": 36.5, "learning_rate": 3.620251228246981e-05, "loss": 0.8554, "step": 317900 }, { "epoch": 0.28, "grad_norm": 12.375, "learning_rate": 3.6198013209278894e-05, "loss": 1.026, "step": 318000 }, { "epoch": 0.28, "grad_norm": 12.3125, "learning_rate": 3.619351413608797e-05, "loss": 1.1124, "step": 318100 }, { "epoch": 0.28, "grad_norm": 22.75, "learning_rate": 3.6189015062897044e-05, "loss": 1.1842, "step": 318200 }, { "epoch": 0.28, "grad_norm": 0.004486083984375, "learning_rate": 3.6184515989706127e-05, "loss": 0.9869, "step": 318300 }, { "epoch": 0.28, "grad_norm": 183.0, "learning_rate": 3.6180016916515195e-05, "loss": 1.2416, "step": 318400 }, { "epoch": 0.28, "grad_norm": 49.25, "learning_rate": 3.617551784332428e-05, "loss": 1.0201, "step": 318500 }, { "epoch": 0.28, "grad_norm": 21.625, "learning_rate": 3.617101877013335e-05, "loss": 1.0957, "step": 318600 }, { "epoch": 0.28, "grad_norm": 54.5, "learning_rate": 3.616651969694243e-05, "loss": 0.9934, "step": 318700 }, { "epoch": 0.28, "grad_norm": 14.375, "learning_rate": 3.616202062375151e-05, "loss": 0.9565, "step": 318800 }, { "epoch": 0.28, "grad_norm": 62.5, "learning_rate": 3.6157521550560585e-05, "loss": 1.0589, "step": 318900 }, { "epoch": 0.28, "grad_norm": 17.125, "learning_rate": 3.615302247736966e-05, "loss": 0.9754, "step": 319000 }, { "epoch": 0.28, "grad_norm": 9.375, "learning_rate": 3.614852340417874e-05, "loss": 0.9177, "step": 319100 }, { "epoch": 0.28, "grad_norm": 117.0, "learning_rate": 3.614402433098782e-05, "loss": 0.9928, "step": 319200 }, { "epoch": 0.28, "grad_norm": 25.375, "learning_rate": 3.6139525257796893e-05, "loss": 0.9426, "step": 319300 }, { "epoch": 0.28, "grad_norm": 136.0, "learning_rate": 3.6135026184605976e-05, "loss": 0.9318, "step": 319400 }, { "epoch": 0.28, "grad_norm": 17.375, "learning_rate": 3.613052711141505e-05, "loss": 0.9935, "step": 319500 }, { "epoch": 0.28, "grad_norm": 21.875, "learning_rate": 3.612602803822413e-05, "loss": 1.0611, "step": 319600 }, { "epoch": 0.28, "grad_norm": 0.036376953125, "learning_rate": 3.61215289650332e-05, "loss": 0.9981, "step": 319700 }, { "epoch": 0.28, "grad_norm": 0.009521484375, "learning_rate": 3.611702989184228e-05, "loss": 1.1049, "step": 319800 }, { "epoch": 0.28, "grad_norm": 63.25, "learning_rate": 3.611253081865136e-05, "loss": 1.0493, "step": 319900 }, { "epoch": 0.29, "grad_norm": 12.625, "learning_rate": 3.6108031745460434e-05, "loss": 0.9991, "step": 320000 }, { "epoch": 0.29, "grad_norm": 19.25, "learning_rate": 3.6103532672269517e-05, "loss": 0.9046, "step": 320100 }, { "epoch": 0.29, "grad_norm": 51.25, "learning_rate": 3.609903359907859e-05, "loss": 0.9621, "step": 320200 }, { "epoch": 0.29, "grad_norm": 0.12890625, "learning_rate": 3.609453452588767e-05, "loss": 1.0321, "step": 320300 }, { "epoch": 0.29, "grad_norm": 21.75, "learning_rate": 3.609003545269675e-05, "loss": 0.9761, "step": 320400 }, { "epoch": 0.29, "grad_norm": 1600.0, "learning_rate": 3.6085536379505825e-05, "loss": 1.0054, "step": 320500 }, { "epoch": 0.29, "grad_norm": 20.875, "learning_rate": 3.60810373063149e-05, "loss": 0.895, "step": 320600 }, { "epoch": 0.29, "grad_norm": 33.75, "learning_rate": 3.607653823312398e-05, "loss": 1.0764, "step": 320700 }, { "epoch": 0.29, "grad_norm": 5.5625, "learning_rate": 3.607203915993306e-05, "loss": 0.9927, "step": 320800 }, { "epoch": 0.29, "grad_norm": 18.5, "learning_rate": 3.606754008674213e-05, "loss": 0.9858, "step": 320900 }, { "epoch": 0.29, "grad_norm": 20.125, "learning_rate": 3.606304101355121e-05, "loss": 1.142, "step": 321000 }, { "epoch": 0.29, "grad_norm": 92.0, "learning_rate": 3.6058541940360283e-05, "loss": 1.0967, "step": 321100 }, { "epoch": 0.29, "grad_norm": 12.5625, "learning_rate": 3.6054042867169366e-05, "loss": 0.9635, "step": 321200 }, { "epoch": 0.29, "grad_norm": 26.0, "learning_rate": 3.604954379397844e-05, "loss": 0.9771, "step": 321300 }, { "epoch": 0.29, "grad_norm": 0.03076171875, "learning_rate": 3.6045044720787516e-05, "loss": 0.9096, "step": 321400 }, { "epoch": 0.29, "grad_norm": 40.5, "learning_rate": 3.60405456475966e-05, "loss": 0.9777, "step": 321500 }, { "epoch": 0.29, "grad_norm": 0.005950927734375, "learning_rate": 3.6036046574405674e-05, "loss": 1.0101, "step": 321600 }, { "epoch": 0.29, "grad_norm": 3.359375, "learning_rate": 3.603154750121475e-05, "loss": 1.0459, "step": 321700 }, { "epoch": 0.29, "grad_norm": 38.0, "learning_rate": 3.602704842802383e-05, "loss": 1.1353, "step": 321800 }, { "epoch": 0.29, "grad_norm": 94.0, "learning_rate": 3.6022549354832907e-05, "loss": 0.9828, "step": 321900 }, { "epoch": 0.29, "grad_norm": 0.03857421875, "learning_rate": 3.601805028164198e-05, "loss": 0.9718, "step": 322000 }, { "epoch": 0.29, "grad_norm": 19.375, "learning_rate": 3.6013551208451064e-05, "loss": 0.9502, "step": 322100 }, { "epoch": 0.29, "grad_norm": 1.28125, "learning_rate": 3.600905213526014e-05, "loss": 0.923, "step": 322200 }, { "epoch": 0.29, "grad_norm": 19.375, "learning_rate": 3.6004553062069215e-05, "loss": 0.7945, "step": 322300 }, { "epoch": 0.29, "grad_norm": 0.2236328125, "learning_rate": 3.600005398887829e-05, "loss": 1.0133, "step": 322400 }, { "epoch": 0.29, "grad_norm": 27.375, "learning_rate": 3.5995554915687365e-05, "loss": 1.022, "step": 322500 }, { "epoch": 0.29, "grad_norm": 21.25, "learning_rate": 3.599105584249645e-05, "loss": 1.1105, "step": 322600 }, { "epoch": 0.29, "grad_norm": 98.0, "learning_rate": 3.598655676930552e-05, "loss": 1.184, "step": 322700 }, { "epoch": 0.29, "grad_norm": 26.25, "learning_rate": 3.5982057696114605e-05, "loss": 1.0759, "step": 322800 }, { "epoch": 0.29, "grad_norm": 31.0, "learning_rate": 3.597755862292368e-05, "loss": 0.966, "step": 322900 }, { "epoch": 0.29, "grad_norm": 33.25, "learning_rate": 3.5973059549732756e-05, "loss": 1.1839, "step": 323000 }, { "epoch": 0.29, "grad_norm": 37.75, "learning_rate": 3.596856047654184e-05, "loss": 1.0618, "step": 323100 }, { "epoch": 0.29, "grad_norm": 26.125, "learning_rate": 3.596406140335091e-05, "loss": 0.9178, "step": 323200 }, { "epoch": 0.29, "grad_norm": 88.5, "learning_rate": 3.595956233015999e-05, "loss": 1.0137, "step": 323300 }, { "epoch": 0.29, "grad_norm": 40.25, "learning_rate": 3.595506325696907e-05, "loss": 0.9731, "step": 323400 }, { "epoch": 0.29, "grad_norm": 26.5, "learning_rate": 3.5950564183778146e-05, "loss": 0.8936, "step": 323500 }, { "epoch": 0.29, "grad_norm": 0.01519775390625, "learning_rate": 3.594606511058722e-05, "loss": 1.0664, "step": 323600 }, { "epoch": 0.29, "grad_norm": 15.75, "learning_rate": 3.5941566037396296e-05, "loss": 0.9372, "step": 323700 }, { "epoch": 0.29, "grad_norm": 41.25, "learning_rate": 3.593706696420537e-05, "loss": 1.1182, "step": 323800 }, { "epoch": 0.29, "grad_norm": 25.5, "learning_rate": 3.5932567891014454e-05, "loss": 1.0484, "step": 323900 }, { "epoch": 0.29, "grad_norm": 780.0, "learning_rate": 3.592806881782353e-05, "loss": 0.9203, "step": 324000 }, { "epoch": 0.29, "grad_norm": 17.5, "learning_rate": 3.5923569744632605e-05, "loss": 0.8622, "step": 324100 }, { "epoch": 0.29, "grad_norm": 22.25, "learning_rate": 3.591907067144169e-05, "loss": 1.1688, "step": 324200 }, { "epoch": 0.29, "grad_norm": 14.125, "learning_rate": 3.591457159825076e-05, "loss": 0.9461, "step": 324300 }, { "epoch": 0.29, "grad_norm": 25.25, "learning_rate": 3.591007252505984e-05, "loss": 0.9347, "step": 324400 }, { "epoch": 0.29, "grad_norm": 16.25, "learning_rate": 3.590557345186892e-05, "loss": 1.1219, "step": 324500 }, { "epoch": 0.29, "grad_norm": 34.75, "learning_rate": 3.5901074378677995e-05, "loss": 1.0158, "step": 324600 }, { "epoch": 0.29, "grad_norm": 34.75, "learning_rate": 3.589657530548707e-05, "loss": 1.1649, "step": 324700 }, { "epoch": 0.29, "grad_norm": 63.0, "learning_rate": 3.589207623229615e-05, "loss": 0.9947, "step": 324800 }, { "epoch": 0.29, "grad_norm": 57.5, "learning_rate": 3.588757715910522e-05, "loss": 0.9695, "step": 324900 }, { "epoch": 0.29, "grad_norm": 15.375, "learning_rate": 3.58830780859143e-05, "loss": 1.104, "step": 325000 }, { "epoch": 0.29, "grad_norm": 33.75, "learning_rate": 3.587857901272338e-05, "loss": 1.0179, "step": 325100 }, { "epoch": 0.29, "grad_norm": 29.0, "learning_rate": 3.5874079939532454e-05, "loss": 1.0921, "step": 325200 }, { "epoch": 0.29, "grad_norm": 26.125, "learning_rate": 3.5869580866341536e-05, "loss": 0.9883, "step": 325300 }, { "epoch": 0.29, "grad_norm": 12.625, "learning_rate": 3.586508179315061e-05, "loss": 0.9916, "step": 325400 }, { "epoch": 0.29, "grad_norm": 32.25, "learning_rate": 3.586058271995969e-05, "loss": 1.0047, "step": 325500 }, { "epoch": 0.29, "grad_norm": 48.0, "learning_rate": 3.585608364676877e-05, "loss": 1.0746, "step": 325600 }, { "epoch": 0.29, "grad_norm": 24.0, "learning_rate": 3.5851584573577844e-05, "loss": 1.2072, "step": 325700 }, { "epoch": 0.29, "grad_norm": 54.0, "learning_rate": 3.5847085500386926e-05, "loss": 1.2004, "step": 325800 }, { "epoch": 0.29, "grad_norm": 19.0, "learning_rate": 3.5842586427196e-05, "loss": 0.9823, "step": 325900 }, { "epoch": 0.29, "grad_norm": 48.0, "learning_rate": 3.583808735400508e-05, "loss": 1.0313, "step": 326000 }, { "epoch": 0.29, "grad_norm": 192.0, "learning_rate": 3.583358828081416e-05, "loss": 1.0086, "step": 326100 }, { "epoch": 0.29, "grad_norm": 36.75, "learning_rate": 3.582908920762323e-05, "loss": 0.9467, "step": 326200 }, { "epoch": 0.29, "grad_norm": 26.125, "learning_rate": 3.582459013443231e-05, "loss": 1.0403, "step": 326300 }, { "epoch": 0.29, "grad_norm": 117.0, "learning_rate": 3.5820091061241385e-05, "loss": 0.9163, "step": 326400 }, { "epoch": 0.29, "grad_norm": 33.75, "learning_rate": 3.581559198805046e-05, "loss": 1.1902, "step": 326500 }, { "epoch": 0.29, "grad_norm": 28.375, "learning_rate": 3.581109291485954e-05, "loss": 1.0054, "step": 326600 }, { "epoch": 0.29, "grad_norm": 1.5859375, "learning_rate": 3.580659384166862e-05, "loss": 0.9536, "step": 326700 }, { "epoch": 0.29, "grad_norm": 0.042236328125, "learning_rate": 3.580209476847769e-05, "loss": 1.115, "step": 326800 }, { "epoch": 0.29, "grad_norm": 15.375, "learning_rate": 3.5797595695286775e-05, "loss": 0.9789, "step": 326900 }, { "epoch": 0.29, "grad_norm": 10.5, "learning_rate": 3.579309662209585e-05, "loss": 1.0123, "step": 327000 }, { "epoch": 0.29, "grad_norm": 38.5, "learning_rate": 3.5788597548904926e-05, "loss": 0.9962, "step": 327100 }, { "epoch": 0.29, "grad_norm": 40.5, "learning_rate": 3.578409847571401e-05, "loss": 1.0599, "step": 327200 }, { "epoch": 0.29, "grad_norm": 13.75, "learning_rate": 3.577959940252308e-05, "loss": 0.9727, "step": 327300 }, { "epoch": 0.29, "grad_norm": 21.5, "learning_rate": 3.577510032933216e-05, "loss": 0.9051, "step": 327400 }, { "epoch": 0.29, "grad_norm": 85.5, "learning_rate": 3.5770601256141234e-05, "loss": 0.9214, "step": 327500 }, { "epoch": 0.29, "grad_norm": 43.75, "learning_rate": 3.576610218295031e-05, "loss": 0.8978, "step": 327600 }, { "epoch": 0.29, "grad_norm": 54.0, "learning_rate": 3.576160310975939e-05, "loss": 0.9393, "step": 327700 }, { "epoch": 0.29, "grad_norm": 10.3125, "learning_rate": 3.575710403656847e-05, "loss": 0.9979, "step": 327800 }, { "epoch": 0.29, "grad_norm": 0.002655029296875, "learning_rate": 3.575260496337754e-05, "loss": 0.9615, "step": 327900 }, { "epoch": 0.29, "grad_norm": 0.138671875, "learning_rate": 3.5748105890186624e-05, "loss": 1.0525, "step": 328000 }, { "epoch": 0.29, "grad_norm": 672.0, "learning_rate": 3.57436068169957e-05, "loss": 0.8802, "step": 328100 }, { "epoch": 0.29, "grad_norm": 45.0, "learning_rate": 3.573910774380478e-05, "loss": 0.9863, "step": 328200 }, { "epoch": 0.29, "grad_norm": 13.0625, "learning_rate": 3.573460867061386e-05, "loss": 1.0176, "step": 328300 }, { "epoch": 0.29, "grad_norm": 6.03125, "learning_rate": 3.573010959742293e-05, "loss": 0.9135, "step": 328400 }, { "epoch": 0.29, "grad_norm": 117.0, "learning_rate": 3.5725610524232014e-05, "loss": 1.0044, "step": 328500 }, { "epoch": 0.29, "grad_norm": 143.0, "learning_rate": 3.572111145104109e-05, "loss": 1.1217, "step": 328600 }, { "epoch": 0.29, "grad_norm": 0.039794921875, "learning_rate": 3.5716612377850165e-05, "loss": 0.9841, "step": 328700 }, { "epoch": 0.29, "grad_norm": 17.375, "learning_rate": 3.571211330465924e-05, "loss": 0.9709, "step": 328800 }, { "epoch": 0.29, "grad_norm": 36.25, "learning_rate": 3.5707614231468316e-05, "loss": 1.0775, "step": 328900 }, { "epoch": 0.29, "grad_norm": 67.0, "learning_rate": 3.57031151582774e-05, "loss": 0.9732, "step": 329000 }, { "epoch": 0.29, "grad_norm": 40.25, "learning_rate": 3.569861608508647e-05, "loss": 1.049, "step": 329100 }, { "epoch": 0.29, "grad_norm": 27.875, "learning_rate": 3.569411701189555e-05, "loss": 0.9743, "step": 329200 }, { "epoch": 0.29, "grad_norm": 26.25, "learning_rate": 3.568961793870463e-05, "loss": 0.8575, "step": 329300 }, { "epoch": 0.29, "grad_norm": 0.055908203125, "learning_rate": 3.5685118865513706e-05, "loss": 0.9061, "step": 329400 }, { "epoch": 0.29, "grad_norm": 71.0, "learning_rate": 3.568061979232278e-05, "loss": 1.0331, "step": 329500 }, { "epoch": 0.29, "grad_norm": 22.375, "learning_rate": 3.5676120719131864e-05, "loss": 0.8514, "step": 329600 }, { "epoch": 0.29, "grad_norm": 19.125, "learning_rate": 3.567162164594094e-05, "loss": 1.1511, "step": 329700 }, { "epoch": 0.29, "grad_norm": 0.006500244140625, "learning_rate": 3.5667122572750014e-05, "loss": 0.8774, "step": 329800 }, { "epoch": 0.29, "grad_norm": 13.875, "learning_rate": 3.5662623499559096e-05, "loss": 0.9567, "step": 329900 }, { "epoch": 0.29, "grad_norm": 91.0, "learning_rate": 3.565812442636817e-05, "loss": 0.9579, "step": 330000 }, { "epoch": 0.29, "grad_norm": 0.1708984375, "learning_rate": 3.565362535317725e-05, "loss": 1.027, "step": 330100 }, { "epoch": 0.29, "grad_norm": 23.375, "learning_rate": 3.564912627998632e-05, "loss": 0.9305, "step": 330200 }, { "epoch": 0.29, "grad_norm": 13.1875, "learning_rate": 3.56446272067954e-05, "loss": 1.0634, "step": 330300 }, { "epoch": 0.29, "grad_norm": 40.75, "learning_rate": 3.564012813360448e-05, "loss": 0.9462, "step": 330400 }, { "epoch": 0.29, "grad_norm": 28.75, "learning_rate": 3.5635629060413555e-05, "loss": 0.9551, "step": 330500 }, { "epoch": 0.29, "grad_norm": 139.0, "learning_rate": 3.563112998722263e-05, "loss": 1.1876, "step": 330600 }, { "epoch": 0.29, "grad_norm": 22.75, "learning_rate": 3.562663091403171e-05, "loss": 0.9847, "step": 330700 }, { "epoch": 0.29, "grad_norm": 23.875, "learning_rate": 3.562213184084079e-05, "loss": 0.9629, "step": 330800 }, { "epoch": 0.29, "grad_norm": 0.15234375, "learning_rate": 3.561763276764987e-05, "loss": 0.9678, "step": 330900 }, { "epoch": 0.29, "grad_norm": 115.0, "learning_rate": 3.5613133694458945e-05, "loss": 1.0061, "step": 331000 }, { "epoch": 0.29, "grad_norm": 32.75, "learning_rate": 3.560863462126802e-05, "loss": 1.0344, "step": 331100 }, { "epoch": 0.3, "grad_norm": 65.0, "learning_rate": 3.56041355480771e-05, "loss": 0.9472, "step": 331200 }, { "epoch": 0.3, "grad_norm": 0.0498046875, "learning_rate": 3.559963647488618e-05, "loss": 1.0289, "step": 331300 }, { "epoch": 0.3, "grad_norm": 65.0, "learning_rate": 3.559513740169525e-05, "loss": 0.9999, "step": 331400 }, { "epoch": 0.3, "grad_norm": 38.5, "learning_rate": 3.559063832850433e-05, "loss": 1.0686, "step": 331500 }, { "epoch": 0.3, "grad_norm": 10.3125, "learning_rate": 3.5586139255313404e-05, "loss": 0.9857, "step": 331600 }, { "epoch": 0.3, "grad_norm": 2.984375, "learning_rate": 3.5581640182122486e-05, "loss": 0.9316, "step": 331700 }, { "epoch": 0.3, "grad_norm": 3.828125, "learning_rate": 3.557714110893156e-05, "loss": 1.0798, "step": 331800 }, { "epoch": 0.3, "grad_norm": 135.0, "learning_rate": 3.557264203574064e-05, "loss": 0.9377, "step": 331900 }, { "epoch": 0.3, "grad_norm": 20.625, "learning_rate": 3.556814296254972e-05, "loss": 0.9441, "step": 332000 }, { "epoch": 0.3, "grad_norm": 0.8125, "learning_rate": 3.5563643889358794e-05, "loss": 0.9284, "step": 332100 }, { "epoch": 0.3, "grad_norm": 50.25, "learning_rate": 3.555914481616787e-05, "loss": 1.0711, "step": 332200 }, { "epoch": 0.3, "grad_norm": 35.75, "learning_rate": 3.555464574297695e-05, "loss": 1.0742, "step": 332300 }, { "epoch": 0.3, "grad_norm": 29.125, "learning_rate": 3.555014666978603e-05, "loss": 0.8731, "step": 332400 }, { "epoch": 0.3, "grad_norm": 36.25, "learning_rate": 3.55456475965951e-05, "loss": 0.9239, "step": 332500 }, { "epoch": 0.3, "grad_norm": 3.421875, "learning_rate": 3.5541148523404185e-05, "loss": 1.078, "step": 332600 }, { "epoch": 0.3, "grad_norm": 24.625, "learning_rate": 3.553664945021325e-05, "loss": 0.9942, "step": 332700 }, { "epoch": 0.3, "grad_norm": 101.5, "learning_rate": 3.5532150377022335e-05, "loss": 1.0795, "step": 332800 }, { "epoch": 0.3, "grad_norm": 74.0, "learning_rate": 3.552765130383141e-05, "loss": 1.0356, "step": 332900 }, { "epoch": 0.3, "grad_norm": 57.25, "learning_rate": 3.5523152230640486e-05, "loss": 1.1318, "step": 333000 }, { "epoch": 0.3, "grad_norm": 118.5, "learning_rate": 3.551865315744957e-05, "loss": 0.9528, "step": 333100 }, { "epoch": 0.3, "grad_norm": 25.0, "learning_rate": 3.5514154084258643e-05, "loss": 0.9894, "step": 333200 }, { "epoch": 0.3, "grad_norm": 0.0018310546875, "learning_rate": 3.550965501106772e-05, "loss": 0.9249, "step": 333300 }, { "epoch": 0.3, "grad_norm": 56.0, "learning_rate": 3.55051559378768e-05, "loss": 1.0157, "step": 333400 }, { "epoch": 0.3, "grad_norm": 0.0771484375, "learning_rate": 3.5500656864685876e-05, "loss": 1.1021, "step": 333500 }, { "epoch": 0.3, "grad_norm": 7.84375, "learning_rate": 3.549615779149495e-05, "loss": 1.0148, "step": 333600 }, { "epoch": 0.3, "grad_norm": 49.25, "learning_rate": 3.5491658718304034e-05, "loss": 1.0459, "step": 333700 }, { "epoch": 0.3, "grad_norm": 5.96875, "learning_rate": 3.548715964511311e-05, "loss": 1.0988, "step": 333800 }, { "epoch": 0.3, "grad_norm": 36.5, "learning_rate": 3.548266057192219e-05, "loss": 1.0738, "step": 333900 }, { "epoch": 0.3, "grad_norm": 35.0, "learning_rate": 3.547816149873126e-05, "loss": 1.0493, "step": 334000 }, { "epoch": 0.3, "grad_norm": 304.0, "learning_rate": 3.5473662425540335e-05, "loss": 1.0974, "step": 334100 }, { "epoch": 0.3, "grad_norm": 56.75, "learning_rate": 3.546916335234942e-05, "loss": 0.9967, "step": 334200 }, { "epoch": 0.3, "grad_norm": 13.0625, "learning_rate": 3.546466427915849e-05, "loss": 0.9931, "step": 334300 }, { "epoch": 0.3, "grad_norm": 22.375, "learning_rate": 3.5460165205967575e-05, "loss": 0.9772, "step": 334400 }, { "epoch": 0.3, "grad_norm": 6.53125, "learning_rate": 3.545566613277665e-05, "loss": 1.0019, "step": 334500 }, { "epoch": 0.3, "grad_norm": 1.640625, "learning_rate": 3.5451167059585725e-05, "loss": 1.0308, "step": 334600 }, { "epoch": 0.3, "grad_norm": 42.25, "learning_rate": 3.544666798639481e-05, "loss": 1.032, "step": 334700 }, { "epoch": 0.3, "grad_norm": 26.25, "learning_rate": 3.544216891320388e-05, "loss": 0.9726, "step": 334800 }, { "epoch": 0.3, "grad_norm": 0.054443359375, "learning_rate": 3.543766984001296e-05, "loss": 1.074, "step": 334900 }, { "epoch": 0.3, "grad_norm": 29.0, "learning_rate": 3.543317076682204e-05, "loss": 0.9047, "step": 335000 }, { "epoch": 0.3, "grad_norm": 60.5, "learning_rate": 3.5428671693631116e-05, "loss": 0.9681, "step": 335100 }, { "epoch": 0.3, "grad_norm": 9.3125, "learning_rate": 3.542417262044019e-05, "loss": 0.9256, "step": 335200 }, { "epoch": 0.3, "grad_norm": 39.0, "learning_rate": 3.5419673547249266e-05, "loss": 0.9005, "step": 335300 }, { "epoch": 0.3, "grad_norm": 0.0260009765625, "learning_rate": 3.541517447405834e-05, "loss": 1.0256, "step": 335400 }, { "epoch": 0.3, "grad_norm": 117.5, "learning_rate": 3.5410675400867424e-05, "loss": 1.0543, "step": 335500 }, { "epoch": 0.3, "grad_norm": 12.8125, "learning_rate": 3.54061763276765e-05, "loss": 1.0191, "step": 335600 }, { "epoch": 0.3, "grad_norm": 15.4375, "learning_rate": 3.5401677254485574e-05, "loss": 1.0416, "step": 335700 }, { "epoch": 0.3, "grad_norm": 31.25, "learning_rate": 3.5397178181294657e-05, "loss": 1.0673, "step": 335800 }, { "epoch": 0.3, "grad_norm": 20.875, "learning_rate": 3.539267910810373e-05, "loss": 0.9754, "step": 335900 }, { "epoch": 0.3, "grad_norm": 2.140625, "learning_rate": 3.538818003491281e-05, "loss": 1.0473, "step": 336000 }, { "epoch": 0.3, "grad_norm": 9.75, "learning_rate": 3.538368096172189e-05, "loss": 0.9843, "step": 336100 }, { "epoch": 0.3, "grad_norm": 27.75, "learning_rate": 3.5379181888530965e-05, "loss": 1.0561, "step": 336200 }, { "epoch": 0.3, "grad_norm": 16.875, "learning_rate": 3.537468281534004e-05, "loss": 0.809, "step": 336300 }, { "epoch": 0.3, "grad_norm": 43.25, "learning_rate": 3.537018374214912e-05, "loss": 1.0007, "step": 336400 }, { "epoch": 0.3, "grad_norm": 30.625, "learning_rate": 3.53656846689582e-05, "loss": 0.9604, "step": 336500 }, { "epoch": 0.3, "grad_norm": 66.5, "learning_rate": 3.536118559576727e-05, "loss": 1.0873, "step": 336600 }, { "epoch": 0.3, "grad_norm": 10.5, "learning_rate": 3.535668652257635e-05, "loss": 1.2278, "step": 336700 }, { "epoch": 0.3, "grad_norm": 28.25, "learning_rate": 3.5352187449385423e-05, "loss": 1.078, "step": 336800 }, { "epoch": 0.3, "grad_norm": 103.5, "learning_rate": 3.5347688376194506e-05, "loss": 0.9048, "step": 336900 }, { "epoch": 0.3, "grad_norm": 13.5, "learning_rate": 3.534318930300358e-05, "loss": 1.0114, "step": 337000 }, { "epoch": 0.3, "grad_norm": 0.003631591796875, "learning_rate": 3.533869022981266e-05, "loss": 1.0559, "step": 337100 }, { "epoch": 0.3, "grad_norm": 100.5, "learning_rate": 3.533419115662174e-05, "loss": 0.9829, "step": 337200 }, { "epoch": 0.3, "grad_norm": 44.75, "learning_rate": 3.5329692083430814e-05, "loss": 1.0918, "step": 337300 }, { "epoch": 0.3, "grad_norm": 27.875, "learning_rate": 3.5325193010239896e-05, "loss": 0.8966, "step": 337400 }, { "epoch": 0.3, "grad_norm": 27.875, "learning_rate": 3.532069393704897e-05, "loss": 0.9762, "step": 337500 }, { "epoch": 0.3, "grad_norm": 22.625, "learning_rate": 3.5316194863858047e-05, "loss": 0.975, "step": 337600 }, { "epoch": 0.3, "grad_norm": 15.75, "learning_rate": 3.531169579066713e-05, "loss": 0.9939, "step": 337700 }, { "epoch": 0.3, "grad_norm": 17.875, "learning_rate": 3.5307196717476204e-05, "loss": 0.9822, "step": 337800 }, { "epoch": 0.3, "grad_norm": 136.0, "learning_rate": 3.530269764428528e-05, "loss": 0.9633, "step": 337900 }, { "epoch": 0.3, "grad_norm": 68.5, "learning_rate": 3.5298198571094355e-05, "loss": 0.9347, "step": 338000 }, { "epoch": 0.3, "grad_norm": 38.0, "learning_rate": 3.529369949790343e-05, "loss": 0.9838, "step": 338100 }, { "epoch": 0.3, "grad_norm": 24.25, "learning_rate": 3.528920042471251e-05, "loss": 0.8962, "step": 338200 }, { "epoch": 0.3, "grad_norm": 21.375, "learning_rate": 3.528470135152159e-05, "loss": 0.9742, "step": 338300 }, { "epoch": 0.3, "grad_norm": 310.0, "learning_rate": 3.528020227833066e-05, "loss": 1.0379, "step": 338400 }, { "epoch": 0.3, "grad_norm": 40.75, "learning_rate": 3.5275703205139745e-05, "loss": 0.9511, "step": 338500 }, { "epoch": 0.3, "grad_norm": 19.375, "learning_rate": 3.527120413194882e-05, "loss": 1.1389, "step": 338600 }, { "epoch": 0.3, "grad_norm": 95.0, "learning_rate": 3.5266705058757896e-05, "loss": 0.9534, "step": 338700 }, { "epoch": 0.3, "grad_norm": 32.25, "learning_rate": 3.526220598556698e-05, "loss": 0.8412, "step": 338800 }, { "epoch": 0.3, "grad_norm": 4.90625, "learning_rate": 3.525770691237605e-05, "loss": 0.8912, "step": 338900 }, { "epoch": 0.3, "grad_norm": 0.138671875, "learning_rate": 3.525320783918513e-05, "loss": 0.9501, "step": 339000 }, { "epoch": 0.3, "grad_norm": 22.5, "learning_rate": 3.524870876599421e-05, "loss": 0.9967, "step": 339100 }, { "epoch": 0.3, "grad_norm": 23.125, "learning_rate": 3.524420969280328e-05, "loss": 0.9, "step": 339200 }, { "epoch": 0.3, "grad_norm": 28.25, "learning_rate": 3.523971061961236e-05, "loss": 1.0166, "step": 339300 }, { "epoch": 0.3, "grad_norm": 8.0, "learning_rate": 3.5235211546421437e-05, "loss": 0.9221, "step": 339400 }, { "epoch": 0.3, "grad_norm": 39.0, "learning_rate": 3.523071247323051e-05, "loss": 0.9019, "step": 339500 }, { "epoch": 0.3, "grad_norm": 39.5, "learning_rate": 3.5226213400039594e-05, "loss": 0.927, "step": 339600 }, { "epoch": 0.3, "grad_norm": 33.25, "learning_rate": 3.522171432684867e-05, "loss": 1.0473, "step": 339700 }, { "epoch": 0.3, "grad_norm": 24.625, "learning_rate": 3.521721525365775e-05, "loss": 0.9213, "step": 339800 }, { "epoch": 0.3, "grad_norm": 31.125, "learning_rate": 3.521271618046683e-05, "loss": 1.0288, "step": 339900 }, { "epoch": 0.3, "grad_norm": 54.75, "learning_rate": 3.52082171072759e-05, "loss": 0.8142, "step": 340000 }, { "epoch": 0.3, "grad_norm": 58.0, "learning_rate": 3.5203718034084984e-05, "loss": 1.0138, "step": 340100 }, { "epoch": 0.3, "grad_norm": 61.25, "learning_rate": 3.519921896089406e-05, "loss": 0.8596, "step": 340200 }, { "epoch": 0.3, "grad_norm": 16.875, "learning_rate": 3.5194719887703135e-05, "loss": 0.8493, "step": 340300 }, { "epoch": 0.3, "grad_norm": 0.4140625, "learning_rate": 3.519022081451222e-05, "loss": 1.0531, "step": 340400 }, { "epoch": 0.3, "grad_norm": 25.625, "learning_rate": 3.5185721741321286e-05, "loss": 0.9826, "step": 340500 }, { "epoch": 0.3, "grad_norm": 68.0, "learning_rate": 3.518122266813037e-05, "loss": 0.9105, "step": 340600 }, { "epoch": 0.3, "grad_norm": 1.0, "learning_rate": 3.517672359493944e-05, "loss": 0.9738, "step": 340700 }, { "epoch": 0.3, "grad_norm": 0.8515625, "learning_rate": 3.517222452174852e-05, "loss": 0.8979, "step": 340800 }, { "epoch": 0.3, "grad_norm": 86.5, "learning_rate": 3.51677254485576e-05, "loss": 1.028, "step": 340900 }, { "epoch": 0.3, "grad_norm": 0.060302734375, "learning_rate": 3.5163226375366676e-05, "loss": 0.9634, "step": 341000 }, { "epoch": 0.3, "grad_norm": 17.25, "learning_rate": 3.515872730217575e-05, "loss": 1.0268, "step": 341100 }, { "epoch": 0.3, "grad_norm": 32.75, "learning_rate": 3.515422822898483e-05, "loss": 0.9466, "step": 341200 }, { "epoch": 0.3, "grad_norm": 48.5, "learning_rate": 3.514972915579391e-05, "loss": 0.956, "step": 341300 }, { "epoch": 0.3, "grad_norm": 15.0, "learning_rate": 3.5145230082602984e-05, "loss": 1.0007, "step": 341400 }, { "epoch": 0.3, "grad_norm": 62.25, "learning_rate": 3.5140731009412066e-05, "loss": 0.8858, "step": 341500 }, { "epoch": 0.3, "grad_norm": 0.016845703125, "learning_rate": 3.513623193622114e-05, "loss": 1.0892, "step": 341600 }, { "epoch": 0.3, "grad_norm": 10.4375, "learning_rate": 3.513173286303022e-05, "loss": 0.9665, "step": 341700 }, { "epoch": 0.3, "grad_norm": 72.0, "learning_rate": 3.512723378983929e-05, "loss": 0.962, "step": 341800 }, { "epoch": 0.3, "grad_norm": 41.25, "learning_rate": 3.512273471664837e-05, "loss": 1.023, "step": 341900 }, { "epoch": 0.3, "grad_norm": 53.25, "learning_rate": 3.511823564345745e-05, "loss": 1.0495, "step": 342000 }, { "epoch": 0.3, "grad_norm": 0.019287109375, "learning_rate": 3.5113736570266525e-05, "loss": 1.119, "step": 342100 }, { "epoch": 0.3, "grad_norm": 97.5, "learning_rate": 3.51092374970756e-05, "loss": 1.1437, "step": 342200 }, { "epoch": 0.3, "grad_norm": 19.25, "learning_rate": 3.510473842388468e-05, "loss": 1.0745, "step": 342300 }, { "epoch": 0.31, "grad_norm": 17.125, "learning_rate": 3.510023935069376e-05, "loss": 0.9692, "step": 342400 }, { "epoch": 0.31, "grad_norm": 33.25, "learning_rate": 3.509574027750284e-05, "loss": 0.8827, "step": 342500 }, { "epoch": 0.31, "grad_norm": 26.25, "learning_rate": 3.5091241204311915e-05, "loss": 0.9616, "step": 342600 }, { "epoch": 0.31, "grad_norm": 49.5, "learning_rate": 3.508674213112099e-05, "loss": 0.9545, "step": 342700 }, { "epoch": 0.31, "grad_norm": 19.875, "learning_rate": 3.508224305793007e-05, "loss": 0.9747, "step": 342800 }, { "epoch": 0.31, "grad_norm": 0.4296875, "learning_rate": 3.507774398473915e-05, "loss": 1.014, "step": 342900 }, { "epoch": 0.31, "grad_norm": 0.318359375, "learning_rate": 3.507324491154822e-05, "loss": 0.8575, "step": 343000 }, { "epoch": 0.31, "grad_norm": 0.052978515625, "learning_rate": 3.50687458383573e-05, "loss": 0.9437, "step": 343100 }, { "epoch": 0.31, "grad_norm": 39.75, "learning_rate": 3.5064246765166374e-05, "loss": 1.1519, "step": 343200 }, { "epoch": 0.31, "grad_norm": 43.5, "learning_rate": 3.5059747691975456e-05, "loss": 1.1083, "step": 343300 }, { "epoch": 0.31, "grad_norm": 39.75, "learning_rate": 3.505524861878453e-05, "loss": 0.9813, "step": 343400 }, { "epoch": 0.31, "grad_norm": 46.75, "learning_rate": 3.505074954559361e-05, "loss": 0.9439, "step": 343500 }, { "epoch": 0.31, "grad_norm": 15.75, "learning_rate": 3.504625047240269e-05, "loss": 1.1206, "step": 343600 }, { "epoch": 0.31, "grad_norm": 60.75, "learning_rate": 3.5041751399211764e-05, "loss": 0.9304, "step": 343700 }, { "epoch": 0.31, "grad_norm": 17.5, "learning_rate": 3.503725232602084e-05, "loss": 0.9854, "step": 343800 }, { "epoch": 0.31, "grad_norm": 36.25, "learning_rate": 3.503275325282992e-05, "loss": 1.0822, "step": 343900 }, { "epoch": 0.31, "grad_norm": 4.65625, "learning_rate": 3.5028254179639e-05, "loss": 0.999, "step": 344000 }, { "epoch": 0.31, "grad_norm": 20.125, "learning_rate": 3.502375510644807e-05, "loss": 0.995, "step": 344100 }, { "epoch": 0.31, "grad_norm": 17.5, "learning_rate": 3.5019256033257154e-05, "loss": 0.9917, "step": 344200 }, { "epoch": 0.31, "grad_norm": 54.75, "learning_rate": 3.501475696006623e-05, "loss": 0.9688, "step": 344300 }, { "epoch": 0.31, "grad_norm": 8.5625, "learning_rate": 3.5010257886875305e-05, "loss": 1.1924, "step": 344400 }, { "epoch": 0.31, "grad_norm": 98.5, "learning_rate": 3.500575881368438e-05, "loss": 1.0433, "step": 344500 }, { "epoch": 0.31, "grad_norm": 42.25, "learning_rate": 3.5001259740493456e-05, "loss": 1.0896, "step": 344600 }, { "epoch": 0.31, "grad_norm": 70.5, "learning_rate": 3.499676066730254e-05, "loss": 1.0824, "step": 344700 }, { "epoch": 0.31, "grad_norm": 36.25, "learning_rate": 3.499226159411161e-05, "loss": 0.9353, "step": 344800 }, { "epoch": 0.31, "grad_norm": 0.859375, "learning_rate": 3.498776252092069e-05, "loss": 0.9933, "step": 344900 }, { "epoch": 0.31, "grad_norm": 250.0, "learning_rate": 3.498326344772977e-05, "loss": 1.0695, "step": 345000 }, { "epoch": 0.31, "grad_norm": 53.25, "learning_rate": 3.4978764374538846e-05, "loss": 0.9949, "step": 345100 }, { "epoch": 0.31, "grad_norm": 38.0, "learning_rate": 3.497426530134793e-05, "loss": 0.9965, "step": 345200 }, { "epoch": 0.31, "grad_norm": 0.0257568359375, "learning_rate": 3.4969766228157004e-05, "loss": 0.9574, "step": 345300 }, { "epoch": 0.31, "grad_norm": 65.0, "learning_rate": 3.496526715496608e-05, "loss": 0.8941, "step": 345400 }, { "epoch": 0.31, "grad_norm": 22.375, "learning_rate": 3.496076808177516e-05, "loss": 0.9229, "step": 345500 }, { "epoch": 0.31, "grad_norm": 15.3125, "learning_rate": 3.4956269008584236e-05, "loss": 1.0172, "step": 345600 }, { "epoch": 0.31, "grad_norm": 24.25, "learning_rate": 3.4951769935393305e-05, "loss": 1.047, "step": 345700 }, { "epoch": 0.31, "grad_norm": 26.125, "learning_rate": 3.494727086220239e-05, "loss": 1.1756, "step": 345800 }, { "epoch": 0.31, "grad_norm": 0.033935546875, "learning_rate": 3.494277178901146e-05, "loss": 0.9654, "step": 345900 }, { "epoch": 0.31, "grad_norm": 0.015625, "learning_rate": 3.4938272715820544e-05, "loss": 0.9827, "step": 346000 }, { "epoch": 0.31, "grad_norm": 0.0517578125, "learning_rate": 3.493377364262962e-05, "loss": 0.9261, "step": 346100 }, { "epoch": 0.31, "grad_norm": 63.0, "learning_rate": 3.4929274569438695e-05, "loss": 1.0586, "step": 346200 }, { "epoch": 0.31, "grad_norm": 33.75, "learning_rate": 3.492477549624778e-05, "loss": 0.974, "step": 346300 }, { "epoch": 0.31, "grad_norm": 53.5, "learning_rate": 3.492027642305685e-05, "loss": 0.9945, "step": 346400 }, { "epoch": 0.31, "grad_norm": 30.75, "learning_rate": 3.491577734986593e-05, "loss": 0.9252, "step": 346500 }, { "epoch": 0.31, "grad_norm": 14.125, "learning_rate": 3.491127827667501e-05, "loss": 1.0636, "step": 346600 }, { "epoch": 0.31, "grad_norm": 9.625, "learning_rate": 3.4906779203484085e-05, "loss": 0.9651, "step": 346700 }, { "epoch": 0.31, "grad_norm": 13.0625, "learning_rate": 3.490228013029316e-05, "loss": 0.9804, "step": 346800 }, { "epoch": 0.31, "grad_norm": 346.0, "learning_rate": 3.489778105710224e-05, "loss": 0.8737, "step": 346900 }, { "epoch": 0.31, "grad_norm": 41.0, "learning_rate": 3.489328198391131e-05, "loss": 0.9257, "step": 347000 }, { "epoch": 0.31, "grad_norm": 28.375, "learning_rate": 3.4888782910720394e-05, "loss": 1.0121, "step": 347100 }, { "epoch": 0.31, "grad_norm": 0.009033203125, "learning_rate": 3.488428383752947e-05, "loss": 0.9638, "step": 347200 }, { "epoch": 0.31, "grad_norm": 35.75, "learning_rate": 3.4879784764338544e-05, "loss": 0.9795, "step": 347300 }, { "epoch": 0.31, "grad_norm": 39.75, "learning_rate": 3.4875285691147626e-05, "loss": 0.9371, "step": 347400 }, { "epoch": 0.31, "grad_norm": 38.5, "learning_rate": 3.48707866179567e-05, "loss": 1.0468, "step": 347500 }, { "epoch": 0.31, "grad_norm": 158.0, "learning_rate": 3.486628754476578e-05, "loss": 1.0942, "step": 347600 }, { "epoch": 0.31, "grad_norm": 7.84375, "learning_rate": 3.486178847157486e-05, "loss": 0.9997, "step": 347700 }, { "epoch": 0.31, "grad_norm": 111.0, "learning_rate": 3.4857289398383934e-05, "loss": 1.037, "step": 347800 }, { "epoch": 0.31, "grad_norm": 35.5, "learning_rate": 3.4852790325193017e-05, "loss": 1.0475, "step": 347900 }, { "epoch": 0.31, "grad_norm": 21.125, "learning_rate": 3.484829125200209e-05, "loss": 1.155, "step": 348000 }, { "epoch": 0.31, "grad_norm": 0.01043701171875, "learning_rate": 3.484379217881117e-05, "loss": 0.9906, "step": 348100 }, { "epoch": 0.31, "grad_norm": 0.044189453125, "learning_rate": 3.483929310562025e-05, "loss": 0.9364, "step": 348200 }, { "epoch": 0.31, "grad_norm": 0.2119140625, "learning_rate": 3.483479403242932e-05, "loss": 1.0356, "step": 348300 }, { "epoch": 0.31, "grad_norm": 42.25, "learning_rate": 3.483029495923839e-05, "loss": 1.0871, "step": 348400 }, { "epoch": 0.31, "grad_norm": 16.25, "learning_rate": 3.4825795886047475e-05, "loss": 1.0965, "step": 348500 }, { "epoch": 0.31, "grad_norm": 0.0172119140625, "learning_rate": 3.482129681285655e-05, "loss": 0.9703, "step": 348600 }, { "epoch": 0.31, "grad_norm": 31.875, "learning_rate": 3.481679773966563e-05, "loss": 0.9847, "step": 348700 }, { "epoch": 0.31, "grad_norm": 23.375, "learning_rate": 3.481229866647471e-05, "loss": 0.9492, "step": 348800 }, { "epoch": 0.31, "grad_norm": 75.5, "learning_rate": 3.4807799593283784e-05, "loss": 1.0443, "step": 348900 }, { "epoch": 0.31, "grad_norm": 31.625, "learning_rate": 3.4803300520092866e-05, "loss": 0.8512, "step": 349000 }, { "epoch": 0.31, "grad_norm": 352.0, "learning_rate": 3.479880144690194e-05, "loss": 0.908, "step": 349100 }, { "epoch": 0.31, "grad_norm": 24.0, "learning_rate": 3.4794302373711016e-05, "loss": 0.9246, "step": 349200 }, { "epoch": 0.31, "grad_norm": 7.03125, "learning_rate": 3.47898033005201e-05, "loss": 0.9786, "step": 349300 }, { "epoch": 0.31, "grad_norm": 22.0, "learning_rate": 3.4785304227329174e-05, "loss": 1.125, "step": 349400 }, { "epoch": 0.31, "grad_norm": 62.5, "learning_rate": 3.478080515413825e-05, "loss": 0.8742, "step": 349500 }, { "epoch": 0.31, "grad_norm": 164.0, "learning_rate": 3.4776306080947324e-05, "loss": 1.0632, "step": 349600 }, { "epoch": 0.31, "grad_norm": 146.0, "learning_rate": 3.47718070077564e-05, "loss": 1.0378, "step": 349700 }, { "epoch": 0.31, "grad_norm": 99.5, "learning_rate": 3.476730793456548e-05, "loss": 1.0271, "step": 349800 }, { "epoch": 0.31, "grad_norm": 23.25, "learning_rate": 3.476280886137456e-05, "loss": 0.9971, "step": 349900 }, { "epoch": 0.31, "grad_norm": 23.625, "learning_rate": 3.475830978818363e-05, "loss": 1.0899, "step": 350000 }, { "epoch": 0.31, "grad_norm": 0.470703125, "learning_rate": 3.4753810714992715e-05, "loss": 0.9545, "step": 350100 }, { "epoch": 0.31, "grad_norm": 20.625, "learning_rate": 3.474931164180179e-05, "loss": 0.9789, "step": 350200 }, { "epoch": 0.31, "grad_norm": 86.5, "learning_rate": 3.4744812568610865e-05, "loss": 0.9448, "step": 350300 }, { "epoch": 0.31, "grad_norm": 8.25, "learning_rate": 3.474031349541995e-05, "loss": 1.1435, "step": 350400 }, { "epoch": 0.31, "grad_norm": 165.0, "learning_rate": 3.473581442222902e-05, "loss": 1.1196, "step": 350500 }, { "epoch": 0.31, "grad_norm": 17.5, "learning_rate": 3.4731315349038105e-05, "loss": 1.0311, "step": 350600 }, { "epoch": 0.31, "grad_norm": 76.5, "learning_rate": 3.472681627584718e-05, "loss": 1.0222, "step": 350700 }, { "epoch": 0.31, "grad_norm": 0.0205078125, "learning_rate": 3.4722317202656256e-05, "loss": 1.0105, "step": 350800 }, { "epoch": 0.31, "grad_norm": 32.0, "learning_rate": 3.471781812946533e-05, "loss": 0.8543, "step": 350900 }, { "epoch": 0.31, "grad_norm": 43.25, "learning_rate": 3.4713319056274406e-05, "loss": 1.0365, "step": 351000 }, { "epoch": 0.31, "grad_norm": 31.5, "learning_rate": 3.470881998308348e-05, "loss": 0.9772, "step": 351100 }, { "epoch": 0.31, "grad_norm": 41.0, "learning_rate": 3.4704320909892564e-05, "loss": 0.9123, "step": 351200 }, { "epoch": 0.31, "grad_norm": 34.25, "learning_rate": 3.469982183670164e-05, "loss": 0.8525, "step": 351300 }, { "epoch": 0.31, "grad_norm": 374.0, "learning_rate": 3.469532276351072e-05, "loss": 0.8905, "step": 351400 }, { "epoch": 0.31, "grad_norm": 35.75, "learning_rate": 3.4690823690319797e-05, "loss": 0.9893, "step": 351500 }, { "epoch": 0.31, "grad_norm": 9.0, "learning_rate": 3.468632461712887e-05, "loss": 1.1033, "step": 351600 }, { "epoch": 0.31, "grad_norm": 14.75, "learning_rate": 3.4681825543937954e-05, "loss": 0.9942, "step": 351700 }, { "epoch": 0.31, "grad_norm": 0.0269775390625, "learning_rate": 3.467732647074703e-05, "loss": 0.8974, "step": 351800 }, { "epoch": 0.31, "grad_norm": 16.75, "learning_rate": 3.4672827397556105e-05, "loss": 1.0212, "step": 351900 }, { "epoch": 0.31, "grad_norm": 0.455078125, "learning_rate": 3.466832832436519e-05, "loss": 0.9682, "step": 352000 }, { "epoch": 0.31, "grad_norm": 0.0206298828125, "learning_rate": 3.466382925117426e-05, "loss": 1.0093, "step": 352100 }, { "epoch": 0.31, "grad_norm": 8.875, "learning_rate": 3.465933017798334e-05, "loss": 0.7791, "step": 352200 }, { "epoch": 0.31, "grad_norm": 0.087890625, "learning_rate": 3.465483110479241e-05, "loss": 1.0043, "step": 352300 }, { "epoch": 0.31, "grad_norm": 43.5, "learning_rate": 3.465033203160149e-05, "loss": 0.9916, "step": 352400 }, { "epoch": 0.31, "grad_norm": 1040.0, "learning_rate": 3.464583295841057e-05, "loss": 1.0605, "step": 352500 }, { "epoch": 0.31, "grad_norm": 2.546875, "learning_rate": 3.4641333885219646e-05, "loss": 1.0786, "step": 352600 }, { "epoch": 0.31, "grad_norm": 28.875, "learning_rate": 3.463683481202872e-05, "loss": 0.8849, "step": 352700 }, { "epoch": 0.31, "grad_norm": 68.0, "learning_rate": 3.46323357388378e-05, "loss": 0.8792, "step": 352800 }, { "epoch": 0.31, "grad_norm": 11.6875, "learning_rate": 3.462783666564688e-05, "loss": 1.131, "step": 352900 }, { "epoch": 0.31, "grad_norm": 27.625, "learning_rate": 3.4623337592455954e-05, "loss": 1.0118, "step": 353000 }, { "epoch": 0.31, "grad_norm": 24.5, "learning_rate": 3.4618838519265036e-05, "loss": 1.0404, "step": 353100 }, { "epoch": 0.31, "grad_norm": 16.75, "learning_rate": 3.461433944607411e-05, "loss": 0.9962, "step": 353200 }, { "epoch": 0.31, "grad_norm": 20.75, "learning_rate": 3.4609840372883187e-05, "loss": 0.8976, "step": 353300 }, { "epoch": 0.31, "grad_norm": 36.25, "learning_rate": 3.460534129969227e-05, "loss": 1.1852, "step": 353400 }, { "epoch": 0.31, "grad_norm": 34.5, "learning_rate": 3.460084222650134e-05, "loss": 0.9735, "step": 353500 }, { "epoch": 0.31, "grad_norm": 56.0, "learning_rate": 3.459634315331042e-05, "loss": 0.9293, "step": 353600 }, { "epoch": 0.32, "grad_norm": 13.1875, "learning_rate": 3.4591844080119495e-05, "loss": 0.9346, "step": 353700 }, { "epoch": 0.32, "grad_norm": 0.0966796875, "learning_rate": 3.458734500692857e-05, "loss": 1.143, "step": 353800 }, { "epoch": 0.32, "grad_norm": 46.75, "learning_rate": 3.458284593373765e-05, "loss": 0.9157, "step": 353900 }, { "epoch": 0.32, "grad_norm": 68.0, "learning_rate": 3.457834686054673e-05, "loss": 0.971, "step": 354000 }, { "epoch": 0.32, "grad_norm": 47.25, "learning_rate": 3.457384778735581e-05, "loss": 1.0524, "step": 354100 }, { "epoch": 0.32, "grad_norm": 100.0, "learning_rate": 3.4569348714164885e-05, "loss": 1.2063, "step": 354200 }, { "epoch": 0.32, "grad_norm": 63.75, "learning_rate": 3.456484964097396e-05, "loss": 0.9138, "step": 354300 }, { "epoch": 0.32, "grad_norm": 10.5625, "learning_rate": 3.456035056778304e-05, "loss": 0.8864, "step": 354400 }, { "epoch": 0.32, "grad_norm": 1688.0, "learning_rate": 3.455585149459212e-05, "loss": 1.1033, "step": 354500 }, { "epoch": 0.32, "grad_norm": 3.078125, "learning_rate": 3.455135242140119e-05, "loss": 0.9511, "step": 354600 }, { "epoch": 0.32, "grad_norm": 9.25, "learning_rate": 3.4546853348210275e-05, "loss": 0.9354, "step": 354700 }, { "epoch": 0.32, "grad_norm": 42.75, "learning_rate": 3.4542354275019344e-05, "loss": 1.0069, "step": 354800 }, { "epoch": 0.32, "grad_norm": 162.0, "learning_rate": 3.4537855201828426e-05, "loss": 1.0169, "step": 354900 }, { "epoch": 0.32, "grad_norm": 9.125, "learning_rate": 3.45333561286375e-05, "loss": 0.911, "step": 355000 }, { "epoch": 0.32, "grad_norm": 42.0, "learning_rate": 3.4528857055446577e-05, "loss": 1.1081, "step": 355100 }, { "epoch": 0.32, "grad_norm": 28.25, "learning_rate": 3.452435798225566e-05, "loss": 0.9224, "step": 355200 }, { "epoch": 0.32, "grad_norm": 56.0, "learning_rate": 3.4519858909064734e-05, "loss": 1.0065, "step": 355300 }, { "epoch": 0.32, "grad_norm": 0.72265625, "learning_rate": 3.451535983587381e-05, "loss": 1.0743, "step": 355400 }, { "epoch": 0.32, "grad_norm": 25.875, "learning_rate": 3.451086076268289e-05, "loss": 1.0195, "step": 355500 }, { "epoch": 0.32, "grad_norm": 16.125, "learning_rate": 3.450636168949197e-05, "loss": 1.161, "step": 355600 }, { "epoch": 0.32, "grad_norm": 11.625, "learning_rate": 3.450186261630104e-05, "loss": 0.913, "step": 355700 }, { "epoch": 0.32, "grad_norm": 10.5625, "learning_rate": 3.4497363543110124e-05, "loss": 0.8156, "step": 355800 }, { "epoch": 0.32, "grad_norm": 11.0, "learning_rate": 3.44928644699192e-05, "loss": 0.9391, "step": 355900 }, { "epoch": 0.32, "grad_norm": 40.5, "learning_rate": 3.4488365396728275e-05, "loss": 0.8788, "step": 356000 }, { "epoch": 0.32, "grad_norm": 22.375, "learning_rate": 3.448386632353735e-05, "loss": 0.9859, "step": 356100 }, { "epoch": 0.32, "grad_norm": 23.625, "learning_rate": 3.4479367250346426e-05, "loss": 1.0059, "step": 356200 }, { "epoch": 0.32, "grad_norm": 16.5, "learning_rate": 3.447486817715551e-05, "loss": 1.115, "step": 356300 }, { "epoch": 0.32, "grad_norm": 18.0, "learning_rate": 3.447036910396458e-05, "loss": 0.9832, "step": 356400 }, { "epoch": 0.32, "grad_norm": 11.9375, "learning_rate": 3.446587003077366e-05, "loss": 0.9601, "step": 356500 }, { "epoch": 0.32, "grad_norm": 8.0625, "learning_rate": 3.446137095758274e-05, "loss": 1.1986, "step": 356600 }, { "epoch": 0.32, "grad_norm": 1.21875, "learning_rate": 3.4456871884391816e-05, "loss": 1.0122, "step": 356700 }, { "epoch": 0.32, "grad_norm": 46.25, "learning_rate": 3.44523728112009e-05, "loss": 0.7792, "step": 356800 }, { "epoch": 0.32, "grad_norm": 45.0, "learning_rate": 3.444787373800997e-05, "loss": 1.0879, "step": 356900 }, { "epoch": 0.32, "grad_norm": 9.0, "learning_rate": 3.444337466481905e-05, "loss": 0.9483, "step": 357000 }, { "epoch": 0.32, "grad_norm": 20.125, "learning_rate": 3.443887559162813e-05, "loss": 0.9914, "step": 357100 }, { "epoch": 0.32, "grad_norm": 21.5, "learning_rate": 3.4434376518437206e-05, "loss": 0.9563, "step": 357200 }, { "epoch": 0.32, "grad_norm": 20.875, "learning_rate": 3.442987744524628e-05, "loss": 1.0086, "step": 357300 }, { "epoch": 0.32, "grad_norm": 51.25, "learning_rate": 3.442537837205536e-05, "loss": 1.0248, "step": 357400 }, { "epoch": 0.32, "grad_norm": 17.25, "learning_rate": 3.442087929886443e-05, "loss": 0.9769, "step": 357500 }, { "epoch": 0.32, "grad_norm": 9.25, "learning_rate": 3.4416380225673514e-05, "loss": 1.0876, "step": 357600 }, { "epoch": 0.32, "grad_norm": 47.5, "learning_rate": 3.441188115248259e-05, "loss": 0.9682, "step": 357700 }, { "epoch": 0.32, "grad_norm": 99.5, "learning_rate": 3.4407382079291665e-05, "loss": 1.031, "step": 357800 }, { "epoch": 0.32, "grad_norm": 10.875, "learning_rate": 3.440288300610075e-05, "loss": 1.0673, "step": 357900 }, { "epoch": 0.32, "grad_norm": 10.8125, "learning_rate": 3.439838393290982e-05, "loss": 1.0278, "step": 358000 }, { "epoch": 0.32, "grad_norm": 35.5, "learning_rate": 3.43938848597189e-05, "loss": 1.0423, "step": 358100 }, { "epoch": 0.32, "grad_norm": 13.8125, "learning_rate": 3.438938578652798e-05, "loss": 0.9551, "step": 358200 }, { "epoch": 0.32, "grad_norm": 18.125, "learning_rate": 3.4384886713337055e-05, "loss": 0.9817, "step": 358300 }, { "epoch": 0.32, "grad_norm": 43.0, "learning_rate": 3.438038764014613e-05, "loss": 1.0417, "step": 358400 }, { "epoch": 0.32, "grad_norm": 16.25, "learning_rate": 3.437588856695521e-05, "loss": 0.8936, "step": 358500 }, { "epoch": 0.32, "grad_norm": 43.5, "learning_rate": 3.437138949376429e-05, "loss": 1.0977, "step": 358600 }, { "epoch": 0.32, "grad_norm": 11.1875, "learning_rate": 3.436689042057336e-05, "loss": 1.1079, "step": 358700 }, { "epoch": 0.32, "grad_norm": 113.5, "learning_rate": 3.436239134738244e-05, "loss": 0.9851, "step": 358800 }, { "epoch": 0.32, "grad_norm": 36.25, "learning_rate": 3.4357892274191514e-05, "loss": 1.0251, "step": 358900 }, { "epoch": 0.32, "grad_norm": 73.0, "learning_rate": 3.4353393201000596e-05, "loss": 0.9387, "step": 359000 }, { "epoch": 0.32, "grad_norm": 14.0625, "learning_rate": 3.434889412780967e-05, "loss": 1.0755, "step": 359100 }, { "epoch": 0.32, "grad_norm": 14.625, "learning_rate": 3.434439505461875e-05, "loss": 1.0748, "step": 359200 }, { "epoch": 0.32, "grad_norm": 12.0, "learning_rate": 3.433989598142783e-05, "loss": 0.9022, "step": 359300 }, { "epoch": 0.32, "grad_norm": 103.5, "learning_rate": 3.4335396908236904e-05, "loss": 1.0759, "step": 359400 }, { "epoch": 0.32, "grad_norm": 13.5, "learning_rate": 3.4330897835045986e-05, "loss": 0.9412, "step": 359500 }, { "epoch": 0.32, "grad_norm": 4640.0, "learning_rate": 3.432639876185506e-05, "loss": 0.7769, "step": 359600 }, { "epoch": 0.32, "grad_norm": 0.004974365234375, "learning_rate": 3.432189968866414e-05, "loss": 1.0443, "step": 359700 }, { "epoch": 0.32, "grad_norm": 55.5, "learning_rate": 3.431740061547322e-05, "loss": 1.0175, "step": 359800 }, { "epoch": 0.32, "grad_norm": 17.125, "learning_rate": 3.4312901542282295e-05, "loss": 0.9163, "step": 359900 }, { "epoch": 0.32, "grad_norm": 3.265625, "learning_rate": 3.430840246909136e-05, "loss": 0.8714, "step": 360000 }, { "epoch": 0.32, "grad_norm": 27.0, "learning_rate": 3.4303903395900445e-05, "loss": 0.903, "step": 360100 }, { "epoch": 0.32, "grad_norm": 52.25, "learning_rate": 3.429940432270952e-05, "loss": 1.0778, "step": 360200 }, { "epoch": 0.32, "grad_norm": 16.75, "learning_rate": 3.42949052495186e-05, "loss": 0.7507, "step": 360300 }, { "epoch": 0.32, "grad_norm": 44.0, "learning_rate": 3.429040617632768e-05, "loss": 0.9221, "step": 360400 }, { "epoch": 0.32, "grad_norm": 13.25, "learning_rate": 3.428590710313675e-05, "loss": 1.0659, "step": 360500 }, { "epoch": 0.32, "grad_norm": 21.625, "learning_rate": 3.4281408029945835e-05, "loss": 1.0189, "step": 360600 }, { "epoch": 0.32, "grad_norm": 28.0, "learning_rate": 3.427690895675491e-05, "loss": 0.9743, "step": 360700 }, { "epoch": 0.32, "grad_norm": 6.875, "learning_rate": 3.4272409883563986e-05, "loss": 0.9364, "step": 360800 }, { "epoch": 0.32, "grad_norm": 23.5, "learning_rate": 3.426791081037307e-05, "loss": 1.0681, "step": 360900 }, { "epoch": 0.32, "grad_norm": 21.625, "learning_rate": 3.4263411737182144e-05, "loss": 0.954, "step": 361000 }, { "epoch": 0.32, "grad_norm": 0.030517578125, "learning_rate": 3.425891266399122e-05, "loss": 0.7981, "step": 361100 }, { "epoch": 0.32, "grad_norm": 0.353515625, "learning_rate": 3.42544135908003e-05, "loss": 0.9293, "step": 361200 }, { "epoch": 0.32, "grad_norm": 14.25, "learning_rate": 3.424991451760937e-05, "loss": 0.9397, "step": 361300 }, { "epoch": 0.32, "grad_norm": 58.25, "learning_rate": 3.424541544441845e-05, "loss": 1.0275, "step": 361400 }, { "epoch": 0.32, "grad_norm": 46.0, "learning_rate": 3.424091637122753e-05, "loss": 1.0131, "step": 361500 }, { "epoch": 0.32, "grad_norm": 16.875, "learning_rate": 3.42364172980366e-05, "loss": 0.904, "step": 361600 }, { "epoch": 0.32, "grad_norm": 26.25, "learning_rate": 3.4231918224845685e-05, "loss": 0.9013, "step": 361700 }, { "epoch": 0.32, "grad_norm": 33.5, "learning_rate": 3.422741915165476e-05, "loss": 1.0755, "step": 361800 }, { "epoch": 0.32, "grad_norm": 0.1416015625, "learning_rate": 3.4222920078463835e-05, "loss": 1.0032, "step": 361900 }, { "epoch": 0.32, "grad_norm": 6.09375, "learning_rate": 3.421842100527292e-05, "loss": 1.1277, "step": 362000 }, { "epoch": 0.32, "grad_norm": 34.25, "learning_rate": 3.421392193208199e-05, "loss": 0.8355, "step": 362100 }, { "epoch": 0.32, "grad_norm": 0.6015625, "learning_rate": 3.4209422858891075e-05, "loss": 0.8673, "step": 362200 }, { "epoch": 0.32, "grad_norm": 11.9375, "learning_rate": 3.420492378570015e-05, "loss": 1.0876, "step": 362300 }, { "epoch": 0.32, "grad_norm": 36.5, "learning_rate": 3.4200424712509225e-05, "loss": 0.9968, "step": 362400 }, { "epoch": 0.32, "grad_norm": 58.0, "learning_rate": 3.419592563931831e-05, "loss": 1.1515, "step": 362500 }, { "epoch": 0.32, "grad_norm": 33.5, "learning_rate": 3.4191426566127376e-05, "loss": 0.9773, "step": 362600 }, { "epoch": 0.32, "grad_norm": 130.0, "learning_rate": 3.418692749293645e-05, "loss": 0.9652, "step": 362700 }, { "epoch": 0.32, "grad_norm": 52.0, "learning_rate": 3.4182428419745534e-05, "loss": 0.8992, "step": 362800 }, { "epoch": 0.32, "grad_norm": 13.5, "learning_rate": 3.417792934655461e-05, "loss": 1.0248, "step": 362900 }, { "epoch": 0.32, "grad_norm": 13.375, "learning_rate": 3.417343027336369e-05, "loss": 0.9995, "step": 363000 }, { "epoch": 0.32, "grad_norm": 60.0, "learning_rate": 3.4168931200172766e-05, "loss": 1.0267, "step": 363100 }, { "epoch": 0.32, "grad_norm": 19.625, "learning_rate": 3.416443212698184e-05, "loss": 1.1082, "step": 363200 }, { "epoch": 0.32, "grad_norm": 47.0, "learning_rate": 3.4159933053790924e-05, "loss": 1.0299, "step": 363300 }, { "epoch": 0.32, "grad_norm": 58.75, "learning_rate": 3.41554339806e-05, "loss": 0.9064, "step": 363400 }, { "epoch": 0.32, "grad_norm": 9.4375, "learning_rate": 3.4150934907409075e-05, "loss": 1.0907, "step": 363500 }, { "epoch": 0.32, "grad_norm": 22.875, "learning_rate": 3.414643583421816e-05, "loss": 0.832, "step": 363600 }, { "epoch": 0.32, "grad_norm": 0.25390625, "learning_rate": 3.414193676102723e-05, "loss": 1.0826, "step": 363700 }, { "epoch": 0.32, "grad_norm": 35.75, "learning_rate": 3.413743768783631e-05, "loss": 1.0584, "step": 363800 }, { "epoch": 0.32, "grad_norm": 0.057373046875, "learning_rate": 3.413293861464538e-05, "loss": 0.9768, "step": 363900 }, { "epoch": 0.32, "grad_norm": 34.5, "learning_rate": 3.412843954145446e-05, "loss": 1.1146, "step": 364000 }, { "epoch": 0.32, "grad_norm": 0.326171875, "learning_rate": 3.412394046826354e-05, "loss": 1.0217, "step": 364100 }, { "epoch": 0.32, "grad_norm": 31.5, "learning_rate": 3.4119441395072615e-05, "loss": 1.0237, "step": 364200 }, { "epoch": 0.32, "grad_norm": 0.00518798828125, "learning_rate": 3.411494232188169e-05, "loss": 0.8757, "step": 364300 }, { "epoch": 0.32, "grad_norm": 44.25, "learning_rate": 3.411044324869077e-05, "loss": 1.0125, "step": 364400 }, { "epoch": 0.32, "grad_norm": 84.0, "learning_rate": 3.410594417549985e-05, "loss": 1.0171, "step": 364500 }, { "epoch": 0.32, "grad_norm": 7.0, "learning_rate": 3.4101445102308924e-05, "loss": 1.0036, "step": 364600 }, { "epoch": 0.32, "grad_norm": 7.3125, "learning_rate": 3.4096946029118006e-05, "loss": 0.9506, "step": 364700 }, { "epoch": 0.32, "grad_norm": 21.625, "learning_rate": 3.409244695592708e-05, "loss": 0.8712, "step": 364800 }, { "epoch": 0.33, "grad_norm": 59.25, "learning_rate": 3.408794788273616e-05, "loss": 0.9358, "step": 364900 }, { "epoch": 0.33, "grad_norm": 21.375, "learning_rate": 3.408344880954524e-05, "loss": 1.0977, "step": 365000 }, { "epoch": 0.33, "grad_norm": 33.75, "learning_rate": 3.4078949736354314e-05, "loss": 1.0174, "step": 365100 }, { "epoch": 0.33, "grad_norm": 24.875, "learning_rate": 3.407445066316339e-05, "loss": 1.0531, "step": 365200 }, { "epoch": 0.33, "grad_norm": 0.14453125, "learning_rate": 3.4069951589972465e-05, "loss": 0.9368, "step": 365300 }, { "epoch": 0.33, "grad_norm": 49.75, "learning_rate": 3.406545251678154e-05, "loss": 1.1169, "step": 365400 }, { "epoch": 0.33, "grad_norm": 33.0, "learning_rate": 3.406095344359062e-05, "loss": 0.9764, "step": 365500 }, { "epoch": 0.33, "grad_norm": 50.0, "learning_rate": 3.40564543703997e-05, "loss": 1.1753, "step": 365600 }, { "epoch": 0.33, "grad_norm": 65.0, "learning_rate": 3.405195529720878e-05, "loss": 0.8653, "step": 365700 }, { "epoch": 0.33, "grad_norm": 78.0, "learning_rate": 3.4047456224017855e-05, "loss": 0.9461, "step": 365800 }, { "epoch": 0.33, "grad_norm": 16.125, "learning_rate": 3.404295715082693e-05, "loss": 1.1266, "step": 365900 }, { "epoch": 0.33, "grad_norm": 96.5, "learning_rate": 3.403845807763601e-05, "loss": 1.1212, "step": 366000 }, { "epoch": 0.33, "grad_norm": 32.75, "learning_rate": 3.403395900444509e-05, "loss": 1.0391, "step": 366100 }, { "epoch": 0.33, "grad_norm": 29.875, "learning_rate": 3.402945993125416e-05, "loss": 1.0962, "step": 366200 }, { "epoch": 0.33, "grad_norm": 41.75, "learning_rate": 3.4024960858063245e-05, "loss": 0.982, "step": 366300 }, { "epoch": 0.33, "grad_norm": 832.0, "learning_rate": 3.402046178487232e-05, "loss": 0.9597, "step": 366400 }, { "epoch": 0.33, "grad_norm": 17.0, "learning_rate": 3.4015962711681396e-05, "loss": 0.8734, "step": 366500 }, { "epoch": 0.33, "grad_norm": 32.5, "learning_rate": 3.401146363849047e-05, "loss": 1.1441, "step": 366600 }, { "epoch": 0.33, "grad_norm": 9.5, "learning_rate": 3.4006964565299546e-05, "loss": 1.0511, "step": 366700 }, { "epoch": 0.33, "grad_norm": 22.625, "learning_rate": 3.400246549210863e-05, "loss": 1.0366, "step": 366800 }, { "epoch": 0.33, "grad_norm": 17.25, "learning_rate": 3.3997966418917704e-05, "loss": 0.9527, "step": 366900 }, { "epoch": 0.33, "grad_norm": 0.2734375, "learning_rate": 3.399346734572678e-05, "loss": 0.9249, "step": 367000 }, { "epoch": 0.33, "grad_norm": 29.375, "learning_rate": 3.398896827253586e-05, "loss": 0.944, "step": 367100 }, { "epoch": 0.33, "grad_norm": 83.0, "learning_rate": 3.398446919934494e-05, "loss": 0.8695, "step": 367200 }, { "epoch": 0.33, "grad_norm": 13.25, "learning_rate": 3.397997012615401e-05, "loss": 0.9981, "step": 367300 }, { "epoch": 0.33, "grad_norm": 22.5, "learning_rate": 3.3975471052963094e-05, "loss": 1.0779, "step": 367400 }, { "epoch": 0.33, "grad_norm": 18.25, "learning_rate": 3.397097197977217e-05, "loss": 0.954, "step": 367500 }, { "epoch": 0.33, "grad_norm": 26.0, "learning_rate": 3.396647290658125e-05, "loss": 1.163, "step": 367600 }, { "epoch": 0.33, "grad_norm": 47.5, "learning_rate": 3.396197383339033e-05, "loss": 0.98, "step": 367700 }, { "epoch": 0.33, "grad_norm": 7.3125, "learning_rate": 3.3957474760199395e-05, "loss": 1.0356, "step": 367800 }, { "epoch": 0.33, "grad_norm": 1960.0, "learning_rate": 3.395297568700848e-05, "loss": 0.9846, "step": 367900 }, { "epoch": 0.33, "grad_norm": 16.25, "learning_rate": 3.394847661381755e-05, "loss": 0.8628, "step": 368000 }, { "epoch": 0.33, "grad_norm": 0.07470703125, "learning_rate": 3.394397754062663e-05, "loss": 1.1052, "step": 368100 }, { "epoch": 0.33, "grad_norm": 0.004608154296875, "learning_rate": 3.393947846743571e-05, "loss": 1.0655, "step": 368200 }, { "epoch": 0.33, "grad_norm": 0.4375, "learning_rate": 3.3934979394244786e-05, "loss": 0.9498, "step": 368300 }, { "epoch": 0.33, "grad_norm": 112.5, "learning_rate": 3.393048032105387e-05, "loss": 0.9401, "step": 368400 }, { "epoch": 0.33, "grad_norm": 34.75, "learning_rate": 3.392598124786294e-05, "loss": 1.0478, "step": 368500 }, { "epoch": 0.33, "grad_norm": 0.1474609375, "learning_rate": 3.392148217467202e-05, "loss": 1.188, "step": 368600 }, { "epoch": 0.33, "grad_norm": 1576.0, "learning_rate": 3.39169831014811e-05, "loss": 1.0604, "step": 368700 }, { "epoch": 0.33, "grad_norm": 5.71875, "learning_rate": 3.3912484028290176e-05, "loss": 1.0789, "step": 368800 }, { "epoch": 0.33, "grad_norm": 40.75, "learning_rate": 3.390798495509925e-05, "loss": 0.973, "step": 368900 }, { "epoch": 0.33, "grad_norm": 14.9375, "learning_rate": 3.390348588190833e-05, "loss": 0.9703, "step": 369000 }, { "epoch": 0.33, "grad_norm": 32.75, "learning_rate": 3.38989868087174e-05, "loss": 1.1804, "step": 369100 }, { "epoch": 0.33, "grad_norm": 0.04052734375, "learning_rate": 3.3894487735526484e-05, "loss": 0.9602, "step": 369200 }, { "epoch": 0.33, "grad_norm": 6.46875, "learning_rate": 3.388998866233556e-05, "loss": 1.0113, "step": 369300 }, { "epoch": 0.33, "grad_norm": 18.375, "learning_rate": 3.3885489589144635e-05, "loss": 1.0388, "step": 369400 }, { "epoch": 0.33, "grad_norm": 21.75, "learning_rate": 3.388099051595372e-05, "loss": 1.0397, "step": 369500 }, { "epoch": 0.33, "grad_norm": 1.359375, "learning_rate": 3.387649144276279e-05, "loss": 0.982, "step": 369600 }, { "epoch": 0.33, "grad_norm": 199.0, "learning_rate": 3.387199236957187e-05, "loss": 0.9649, "step": 369700 }, { "epoch": 0.33, "grad_norm": 56.0, "learning_rate": 3.386749329638095e-05, "loss": 1.0288, "step": 369800 }, { "epoch": 0.33, "grad_norm": 0.0673828125, "learning_rate": 3.3862994223190025e-05, "loss": 1.1247, "step": 369900 }, { "epoch": 0.33, "grad_norm": 35.25, "learning_rate": 3.38584951499991e-05, "loss": 0.9348, "step": 370000 }, { "epoch": 0.33, "grad_norm": 20.75, "learning_rate": 3.385399607680818e-05, "loss": 1.1281, "step": 370100 }, { "epoch": 0.33, "grad_norm": 66.0, "learning_rate": 3.384949700361726e-05, "loss": 1.0813, "step": 370200 }, { "epoch": 0.33, "grad_norm": 170.0, "learning_rate": 3.384499793042633e-05, "loss": 1.1549, "step": 370300 }, { "epoch": 0.33, "grad_norm": 16.25, "learning_rate": 3.384049885723541e-05, "loss": 1.0384, "step": 370400 }, { "epoch": 0.33, "grad_norm": 13.5, "learning_rate": 3.3835999784044484e-05, "loss": 0.9709, "step": 370500 }, { "epoch": 0.33, "grad_norm": 44.5, "learning_rate": 3.3831500710853566e-05, "loss": 1.0332, "step": 370600 }, { "epoch": 0.33, "grad_norm": 64.0, "learning_rate": 3.382700163766264e-05, "loss": 1.0561, "step": 370700 }, { "epoch": 0.33, "grad_norm": 12.375, "learning_rate": 3.3822502564471717e-05, "loss": 1.0035, "step": 370800 }, { "epoch": 0.33, "grad_norm": 173.0, "learning_rate": 3.38180034912808e-05, "loss": 0.9603, "step": 370900 }, { "epoch": 0.33, "grad_norm": 33.0, "learning_rate": 3.3813504418089874e-05, "loss": 1.0326, "step": 371000 }, { "epoch": 0.33, "grad_norm": 29.375, "learning_rate": 3.3809005344898956e-05, "loss": 1.0167, "step": 371100 }, { "epoch": 0.33, "grad_norm": 231.0, "learning_rate": 3.380450627170803e-05, "loss": 0.9685, "step": 371200 }, { "epoch": 0.33, "grad_norm": 19.25, "learning_rate": 3.380000719851711e-05, "loss": 0.9463, "step": 371300 }, { "epoch": 0.33, "grad_norm": 32.75, "learning_rate": 3.379550812532619e-05, "loss": 0.8895, "step": 371400 }, { "epoch": 0.33, "grad_norm": 24.375, "learning_rate": 3.3791009052135264e-05, "loss": 0.8625, "step": 371500 }, { "epoch": 0.33, "grad_norm": 9.1875, "learning_rate": 3.378650997894434e-05, "loss": 1.0003, "step": 371600 }, { "epoch": 0.33, "grad_norm": 17.875, "learning_rate": 3.3782010905753415e-05, "loss": 0.9104, "step": 371700 }, { "epoch": 0.33, "grad_norm": 41.75, "learning_rate": 3.377751183256249e-05, "loss": 1.0575, "step": 371800 }, { "epoch": 0.33, "grad_norm": 15.0, "learning_rate": 3.377301275937157e-05, "loss": 0.9208, "step": 371900 }, { "epoch": 0.33, "grad_norm": 17.25, "learning_rate": 3.376851368618065e-05, "loss": 0.8582, "step": 372000 }, { "epoch": 0.33, "grad_norm": 18.875, "learning_rate": 3.376401461298972e-05, "loss": 0.9082, "step": 372100 }, { "epoch": 0.33, "grad_norm": 21.625, "learning_rate": 3.3759515539798805e-05, "loss": 1.0285, "step": 372200 }, { "epoch": 0.33, "grad_norm": 73.5, "learning_rate": 3.375501646660788e-05, "loss": 1.0365, "step": 372300 }, { "epoch": 0.33, "grad_norm": 47.5, "learning_rate": 3.3750517393416956e-05, "loss": 0.9553, "step": 372400 }, { "epoch": 0.33, "grad_norm": 27.125, "learning_rate": 3.374601832022604e-05, "loss": 1.1219, "step": 372500 }, { "epoch": 0.33, "grad_norm": 27.0, "learning_rate": 3.374151924703511e-05, "loss": 0.997, "step": 372600 }, { "epoch": 0.33, "grad_norm": 86.5, "learning_rate": 3.373702017384419e-05, "loss": 0.9543, "step": 372700 }, { "epoch": 0.33, "grad_norm": 25.5, "learning_rate": 3.373252110065327e-05, "loss": 0.9207, "step": 372800 }, { "epoch": 0.33, "grad_norm": 27.125, "learning_rate": 3.3728022027462346e-05, "loss": 1.085, "step": 372900 }, { "epoch": 0.33, "grad_norm": 99.5, "learning_rate": 3.372352295427142e-05, "loss": 1.0742, "step": 373000 }, { "epoch": 0.33, "grad_norm": 32.5, "learning_rate": 3.37190238810805e-05, "loss": 1.0516, "step": 373100 }, { "epoch": 0.33, "grad_norm": 29.5, "learning_rate": 3.371452480788957e-05, "loss": 1.0393, "step": 373200 }, { "epoch": 0.33, "grad_norm": 33.75, "learning_rate": 3.3710025734698654e-05, "loss": 0.8406, "step": 373300 }, { "epoch": 0.33, "grad_norm": 76.5, "learning_rate": 3.370552666150773e-05, "loss": 1.0643, "step": 373400 }, { "epoch": 0.33, "grad_norm": 31.875, "learning_rate": 3.3701027588316805e-05, "loss": 0.8349, "step": 373500 }, { "epoch": 0.33, "grad_norm": 22.125, "learning_rate": 3.369652851512589e-05, "loss": 0.98, "step": 373600 }, { "epoch": 0.33, "grad_norm": 302.0, "learning_rate": 3.369202944193496e-05, "loss": 0.9419, "step": 373700 }, { "epoch": 0.33, "grad_norm": 12.3125, "learning_rate": 3.3687530368744045e-05, "loss": 0.9734, "step": 373800 }, { "epoch": 0.33, "grad_norm": 59.0, "learning_rate": 3.368303129555312e-05, "loss": 0.9372, "step": 373900 }, { "epoch": 0.33, "grad_norm": 9.9375, "learning_rate": 3.3678532222362195e-05, "loss": 0.9892, "step": 374000 }, { "epoch": 0.33, "grad_norm": 7.90625, "learning_rate": 3.367403314917128e-05, "loss": 1.0475, "step": 374100 }, { "epoch": 0.33, "grad_norm": 17.875, "learning_rate": 3.366953407598035e-05, "loss": 0.8914, "step": 374200 }, { "epoch": 0.33, "grad_norm": 75.0, "learning_rate": 3.366503500278942e-05, "loss": 0.9592, "step": 374300 }, { "epoch": 0.33, "grad_norm": 20.375, "learning_rate": 3.36605359295985e-05, "loss": 1.1901, "step": 374400 }, { "epoch": 0.33, "grad_norm": 0.130859375, "learning_rate": 3.365603685640758e-05, "loss": 0.9409, "step": 374500 }, { "epoch": 0.33, "grad_norm": 57.5, "learning_rate": 3.365153778321666e-05, "loss": 1.0367, "step": 374600 }, { "epoch": 0.33, "grad_norm": 6.96875, "learning_rate": 3.3647038710025736e-05, "loss": 0.9422, "step": 374700 }, { "epoch": 0.33, "grad_norm": 30.625, "learning_rate": 3.364253963683481e-05, "loss": 1.0069, "step": 374800 }, { "epoch": 0.33, "grad_norm": 0.330078125, "learning_rate": 3.3638040563643894e-05, "loss": 0.9289, "step": 374900 }, { "epoch": 0.33, "grad_norm": 17.125, "learning_rate": 3.363354149045297e-05, "loss": 1.0581, "step": 375000 }, { "epoch": 0.33, "grad_norm": 12.0625, "learning_rate": 3.3629042417262044e-05, "loss": 1.0849, "step": 375100 }, { "epoch": 0.33, "grad_norm": 0.04248046875, "learning_rate": 3.3624543344071126e-05, "loss": 0.976, "step": 375200 }, { "epoch": 0.33, "grad_norm": 0.640625, "learning_rate": 3.36200442708802e-05, "loss": 1.0362, "step": 375300 }, { "epoch": 0.33, "grad_norm": 9.1875, "learning_rate": 3.361554519768928e-05, "loss": 1.0939, "step": 375400 }, { "epoch": 0.33, "grad_norm": 366.0, "learning_rate": 3.361104612449836e-05, "loss": 1.0964, "step": 375500 }, { "epoch": 0.33, "grad_norm": 86.5, "learning_rate": 3.360654705130743e-05, "loss": 0.9822, "step": 375600 }, { "epoch": 0.33, "grad_norm": 100.0, "learning_rate": 3.360204797811651e-05, "loss": 1.1474, "step": 375700 }, { "epoch": 0.33, "grad_norm": 48.0, "learning_rate": 3.3597548904925585e-05, "loss": 1.045, "step": 375800 }, { "epoch": 0.33, "grad_norm": 0.03466796875, "learning_rate": 3.359304983173466e-05, "loss": 1.0746, "step": 375900 }, { "epoch": 0.33, "grad_norm": 109.0, "learning_rate": 3.358855075854374e-05, "loss": 1.0199, "step": 376000 }, { "epoch": 0.34, "grad_norm": 0.91796875, "learning_rate": 3.358405168535282e-05, "loss": 1.0031, "step": 376100 }, { "epoch": 0.34, "grad_norm": 28.5, "learning_rate": 3.357955261216189e-05, "loss": 0.8974, "step": 376200 }, { "epoch": 0.34, "grad_norm": 47.0, "learning_rate": 3.3575053538970975e-05, "loss": 1.1332, "step": 376300 }, { "epoch": 0.34, "grad_norm": 62.25, "learning_rate": 3.357055446578005e-05, "loss": 0.9769, "step": 376400 }, { "epoch": 0.34, "grad_norm": 39.0, "learning_rate": 3.356605539258913e-05, "loss": 1.076, "step": 376500 }, { "epoch": 0.34, "grad_norm": 39.5, "learning_rate": 3.356155631939821e-05, "loss": 1.109, "step": 376600 }, { "epoch": 0.34, "grad_norm": 43.5, "learning_rate": 3.3557057246207284e-05, "loss": 1.0835, "step": 376700 }, { "epoch": 0.34, "grad_norm": 141.0, "learning_rate": 3.3552558173016366e-05, "loss": 0.9525, "step": 376800 }, { "epoch": 0.34, "grad_norm": 35.5, "learning_rate": 3.3548059099825434e-05, "loss": 1.0132, "step": 376900 }, { "epoch": 0.34, "grad_norm": 13.625, "learning_rate": 3.354356002663451e-05, "loss": 0.9435, "step": 377000 }, { "epoch": 0.34, "grad_norm": 39.75, "learning_rate": 3.353906095344359e-05, "loss": 0.9291, "step": 377100 }, { "epoch": 0.34, "grad_norm": 31.125, "learning_rate": 3.353456188025267e-05, "loss": 1.1039, "step": 377200 }, { "epoch": 0.34, "grad_norm": 0.0223388671875, "learning_rate": 3.353006280706175e-05, "loss": 1.0395, "step": 377300 }, { "epoch": 0.34, "grad_norm": 29.75, "learning_rate": 3.3525563733870825e-05, "loss": 1.0247, "step": 377400 }, { "epoch": 0.34, "grad_norm": 69.0, "learning_rate": 3.35210646606799e-05, "loss": 0.9708, "step": 377500 }, { "epoch": 0.34, "grad_norm": 1.09375, "learning_rate": 3.351656558748898e-05, "loss": 0.9882, "step": 377600 }, { "epoch": 0.34, "grad_norm": 13.75, "learning_rate": 3.351206651429806e-05, "loss": 0.9816, "step": 377700 }, { "epoch": 0.34, "grad_norm": 26.375, "learning_rate": 3.350756744110713e-05, "loss": 0.9454, "step": 377800 }, { "epoch": 0.34, "grad_norm": 39.5, "learning_rate": 3.3503068367916215e-05, "loss": 1.0316, "step": 377900 }, { "epoch": 0.34, "grad_norm": 48.5, "learning_rate": 3.349856929472529e-05, "loss": 0.856, "step": 378000 }, { "epoch": 0.34, "grad_norm": 1320.0, "learning_rate": 3.3494070221534365e-05, "loss": 1.0977, "step": 378100 }, { "epoch": 0.34, "grad_norm": 119.5, "learning_rate": 3.348957114834344e-05, "loss": 0.9325, "step": 378200 }, { "epoch": 0.34, "grad_norm": 77.5, "learning_rate": 3.3485072075152516e-05, "loss": 1.28, "step": 378300 }, { "epoch": 0.34, "grad_norm": 28.625, "learning_rate": 3.34805730019616e-05, "loss": 0.9999, "step": 378400 }, { "epoch": 0.34, "grad_norm": 28.0, "learning_rate": 3.3476073928770674e-05, "loss": 0.9379, "step": 378500 }, { "epoch": 0.34, "grad_norm": 23.125, "learning_rate": 3.347157485557975e-05, "loss": 1.0134, "step": 378600 }, { "epoch": 0.34, "grad_norm": 252.0, "learning_rate": 3.346707578238883e-05, "loss": 0.9712, "step": 378700 }, { "epoch": 0.34, "grad_norm": 17.375, "learning_rate": 3.3462576709197906e-05, "loss": 0.9244, "step": 378800 }, { "epoch": 0.34, "grad_norm": 1.3515625, "learning_rate": 3.345807763600698e-05, "loss": 0.9613, "step": 378900 }, { "epoch": 0.34, "grad_norm": 11.375, "learning_rate": 3.3453578562816064e-05, "loss": 0.9041, "step": 379000 }, { "epoch": 0.34, "grad_norm": 12.5, "learning_rate": 3.344907948962514e-05, "loss": 0.9524, "step": 379100 }, { "epoch": 0.34, "grad_norm": 39.0, "learning_rate": 3.344458041643422e-05, "loss": 1.017, "step": 379200 }, { "epoch": 0.34, "grad_norm": 15.5, "learning_rate": 3.34400813432433e-05, "loss": 1.0178, "step": 379300 }, { "epoch": 0.34, "grad_norm": 84.5, "learning_rate": 3.343558227005237e-05, "loss": 1.0021, "step": 379400 }, { "epoch": 0.34, "grad_norm": 28.25, "learning_rate": 3.343108319686145e-05, "loss": 0.9867, "step": 379500 }, { "epoch": 0.34, "grad_norm": 1.8515625, "learning_rate": 3.342658412367052e-05, "loss": 1.0452, "step": 379600 }, { "epoch": 0.34, "grad_norm": 87.5, "learning_rate": 3.34220850504796e-05, "loss": 1.0257, "step": 379700 }, { "epoch": 0.34, "grad_norm": 0.0201416015625, "learning_rate": 3.341758597728868e-05, "loss": 1.1778, "step": 379800 }, { "epoch": 0.34, "grad_norm": 81.0, "learning_rate": 3.3413086904097755e-05, "loss": 0.9022, "step": 379900 }, { "epoch": 0.34, "grad_norm": 22.25, "learning_rate": 3.340858783090684e-05, "loss": 1.0625, "step": 380000 }, { "epoch": 0.34, "grad_norm": 43.0, "learning_rate": 3.340408875771591e-05, "loss": 1.0564, "step": 380100 }, { "epoch": 0.34, "grad_norm": 42.0, "learning_rate": 3.339958968452499e-05, "loss": 1.0455, "step": 380200 }, { "epoch": 0.34, "grad_norm": 34.25, "learning_rate": 3.339509061133407e-05, "loss": 0.9761, "step": 380300 }, { "epoch": 0.34, "grad_norm": 28.25, "learning_rate": 3.3390591538143146e-05, "loss": 1.1074, "step": 380400 }, { "epoch": 0.34, "grad_norm": 49.75, "learning_rate": 3.338609246495222e-05, "loss": 0.9837, "step": 380500 }, { "epoch": 0.34, "grad_norm": 9.0, "learning_rate": 3.33815933917613e-05, "loss": 0.7835, "step": 380600 }, { "epoch": 0.34, "grad_norm": 15.4375, "learning_rate": 3.337709431857038e-05, "loss": 0.938, "step": 380700 }, { "epoch": 0.34, "grad_norm": 20.0, "learning_rate": 3.3372595245379454e-05, "loss": 0.8742, "step": 380800 }, { "epoch": 0.34, "grad_norm": 149.0, "learning_rate": 3.336809617218853e-05, "loss": 1.0751, "step": 380900 }, { "epoch": 0.34, "grad_norm": 34.75, "learning_rate": 3.3363597098997605e-05, "loss": 1.0227, "step": 381000 }, { "epoch": 0.34, "grad_norm": 110.0, "learning_rate": 3.335909802580669e-05, "loss": 1.0275, "step": 381100 }, { "epoch": 0.34, "grad_norm": 72.5, "learning_rate": 3.335459895261576e-05, "loss": 1.0093, "step": 381200 }, { "epoch": 0.34, "grad_norm": 30.25, "learning_rate": 3.335009987942484e-05, "loss": 1.0325, "step": 381300 }, { "epoch": 0.34, "grad_norm": 127.5, "learning_rate": 3.334560080623392e-05, "loss": 0.9621, "step": 381400 }, { "epoch": 0.34, "grad_norm": 36.0, "learning_rate": 3.3341101733042995e-05, "loss": 1.0984, "step": 381500 }, { "epoch": 0.34, "grad_norm": 37.75, "learning_rate": 3.333660265985207e-05, "loss": 1.1509, "step": 381600 }, { "epoch": 0.34, "grad_norm": 6.625, "learning_rate": 3.333210358666115e-05, "loss": 0.9006, "step": 381700 }, { "epoch": 0.34, "grad_norm": 20.75, "learning_rate": 3.332760451347023e-05, "loss": 0.9851, "step": 381800 }, { "epoch": 0.34, "grad_norm": 0.0115966796875, "learning_rate": 3.332310544027931e-05, "loss": 0.9302, "step": 381900 }, { "epoch": 0.34, "grad_norm": 12.8125, "learning_rate": 3.3318606367088385e-05, "loss": 0.943, "step": 382000 }, { "epoch": 0.34, "grad_norm": 31.875, "learning_rate": 3.3314107293897454e-05, "loss": 1.1274, "step": 382100 }, { "epoch": 0.34, "grad_norm": 29.25, "learning_rate": 3.3309608220706536e-05, "loss": 0.9225, "step": 382200 }, { "epoch": 0.34, "grad_norm": 11.0625, "learning_rate": 3.330510914751561e-05, "loss": 0.9638, "step": 382300 }, { "epoch": 0.34, "grad_norm": 22.375, "learning_rate": 3.3300610074324686e-05, "loss": 0.8293, "step": 382400 }, { "epoch": 0.34, "grad_norm": 94.5, "learning_rate": 3.329611100113377e-05, "loss": 0.8677, "step": 382500 }, { "epoch": 0.34, "grad_norm": 44.25, "learning_rate": 3.3291611927942844e-05, "loss": 1.037, "step": 382600 }, { "epoch": 0.34, "grad_norm": 9.875, "learning_rate": 3.3287112854751926e-05, "loss": 0.9874, "step": 382700 }, { "epoch": 0.34, "grad_norm": 17.875, "learning_rate": 3.3282613781561e-05, "loss": 1.1759, "step": 382800 }, { "epoch": 0.34, "grad_norm": 40.5, "learning_rate": 3.327811470837008e-05, "loss": 1.024, "step": 382900 }, { "epoch": 0.34, "grad_norm": 10.1875, "learning_rate": 3.327361563517916e-05, "loss": 1.0623, "step": 383000 }, { "epoch": 0.34, "grad_norm": 29.25, "learning_rate": 3.3269116561988234e-05, "loss": 0.8774, "step": 383100 }, { "epoch": 0.34, "grad_norm": 195.0, "learning_rate": 3.326461748879731e-05, "loss": 0.983, "step": 383200 }, { "epoch": 0.34, "grad_norm": 100.0, "learning_rate": 3.326011841560639e-05, "loss": 1.0287, "step": 383300 }, { "epoch": 0.34, "grad_norm": 14.6875, "learning_rate": 3.325561934241546e-05, "loss": 0.9744, "step": 383400 }, { "epoch": 0.34, "grad_norm": 14.125, "learning_rate": 3.325112026922454e-05, "loss": 0.9021, "step": 383500 }, { "epoch": 0.34, "grad_norm": 5.75, "learning_rate": 3.324662119603362e-05, "loss": 1.023, "step": 383600 }, { "epoch": 0.34, "grad_norm": 24.875, "learning_rate": 3.324212212284269e-05, "loss": 0.9416, "step": 383700 }, { "epoch": 0.34, "grad_norm": 42.75, "learning_rate": 3.3237623049651775e-05, "loss": 0.9394, "step": 383800 }, { "epoch": 0.34, "grad_norm": 22.125, "learning_rate": 3.323312397646085e-05, "loss": 0.9702, "step": 383900 }, { "epoch": 0.34, "grad_norm": 33.0, "learning_rate": 3.3228624903269926e-05, "loss": 0.8742, "step": 384000 }, { "epoch": 0.34, "grad_norm": 26.125, "learning_rate": 3.322412583007901e-05, "loss": 1.0123, "step": 384100 }, { "epoch": 0.34, "grad_norm": 10.0, "learning_rate": 3.321962675688808e-05, "loss": 0.936, "step": 384200 }, { "epoch": 0.34, "grad_norm": 42.25, "learning_rate": 3.321512768369716e-05, "loss": 0.9998, "step": 384300 }, { "epoch": 0.34, "grad_norm": 25.25, "learning_rate": 3.321062861050624e-05, "loss": 1.0996, "step": 384400 }, { "epoch": 0.34, "grad_norm": 11.9375, "learning_rate": 3.3206129537315316e-05, "loss": 1.1348, "step": 384500 }, { "epoch": 0.34, "grad_norm": 23.5, "learning_rate": 3.32016304641244e-05, "loss": 1.0157, "step": 384600 }, { "epoch": 0.34, "grad_norm": 15.375, "learning_rate": 3.319713139093347e-05, "loss": 1.0609, "step": 384700 }, { "epoch": 0.34, "grad_norm": 20.75, "learning_rate": 3.319263231774254e-05, "loss": 1.0987, "step": 384800 }, { "epoch": 0.34, "grad_norm": 40.5, "learning_rate": 3.3188133244551624e-05, "loss": 0.9281, "step": 384900 }, { "epoch": 0.34, "grad_norm": 34.75, "learning_rate": 3.31836341713607e-05, "loss": 1.0413, "step": 385000 }, { "epoch": 0.34, "grad_norm": 36.75, "learning_rate": 3.3179135098169775e-05, "loss": 0.9547, "step": 385100 }, { "epoch": 0.34, "grad_norm": 26.75, "learning_rate": 3.317463602497886e-05, "loss": 1.0339, "step": 385200 }, { "epoch": 0.34, "grad_norm": 10.625, "learning_rate": 3.317013695178793e-05, "loss": 1.0669, "step": 385300 }, { "epoch": 0.34, "grad_norm": 73.5, "learning_rate": 3.3165637878597014e-05, "loss": 1.0708, "step": 385400 }, { "epoch": 0.34, "grad_norm": 66.5, "learning_rate": 3.316113880540609e-05, "loss": 1.0079, "step": 385500 }, { "epoch": 0.34, "grad_norm": 0.080078125, "learning_rate": 3.3156639732215165e-05, "loss": 0.9695, "step": 385600 }, { "epoch": 0.34, "grad_norm": 13.25, "learning_rate": 3.315214065902425e-05, "loss": 1.1168, "step": 385700 }, { "epoch": 0.34, "grad_norm": 24.125, "learning_rate": 3.314764158583332e-05, "loss": 1.1013, "step": 385800 }, { "epoch": 0.34, "grad_norm": 22.875, "learning_rate": 3.31431425126424e-05, "loss": 1.1192, "step": 385900 }, { "epoch": 0.34, "grad_norm": 21.875, "learning_rate": 3.313864343945147e-05, "loss": 1.0347, "step": 386000 }, { "epoch": 0.34, "grad_norm": 12.4375, "learning_rate": 3.313414436626055e-05, "loss": 1.0131, "step": 386100 }, { "epoch": 0.34, "grad_norm": 104.5, "learning_rate": 3.312964529306963e-05, "loss": 1.032, "step": 386200 }, { "epoch": 0.34, "grad_norm": 18.5, "learning_rate": 3.3125146219878706e-05, "loss": 1.1368, "step": 386300 }, { "epoch": 0.34, "grad_norm": 17.0, "learning_rate": 3.312064714668778e-05, "loss": 0.9886, "step": 386400 }, { "epoch": 0.34, "grad_norm": 0.1748046875, "learning_rate": 3.3116148073496863e-05, "loss": 0.8901, "step": 386500 }, { "epoch": 0.34, "grad_norm": 23.875, "learning_rate": 3.311164900030594e-05, "loss": 1.107, "step": 386600 }, { "epoch": 0.34, "grad_norm": 24.375, "learning_rate": 3.3107149927115014e-05, "loss": 1.0496, "step": 386700 }, { "epoch": 0.34, "grad_norm": 20.375, "learning_rate": 3.3102650853924096e-05, "loss": 1.1145, "step": 386800 }, { "epoch": 0.34, "grad_norm": 73.5, "learning_rate": 3.309815178073317e-05, "loss": 0.9504, "step": 386900 }, { "epoch": 0.34, "grad_norm": 0.060791015625, "learning_rate": 3.309365270754225e-05, "loss": 0.8844, "step": 387000 }, { "epoch": 0.34, "grad_norm": 124.0, "learning_rate": 3.308915363435133e-05, "loss": 1.0265, "step": 387100 }, { "epoch": 0.34, "grad_norm": 12.5625, "learning_rate": 3.3084654561160404e-05, "loss": 1.0152, "step": 387200 }, { "epoch": 0.35, "grad_norm": 27.25, "learning_rate": 3.308015548796948e-05, "loss": 1.0995, "step": 387300 }, { "epoch": 0.35, "grad_norm": 73.0, "learning_rate": 3.3075656414778555e-05, "loss": 0.9643, "step": 387400 }, { "epoch": 0.35, "grad_norm": 23.625, "learning_rate": 3.307115734158763e-05, "loss": 0.9915, "step": 387500 }, { "epoch": 0.35, "grad_norm": 26.625, "learning_rate": 3.306665826839671e-05, "loss": 0.8849, "step": 387600 }, { "epoch": 0.35, "grad_norm": 0.263671875, "learning_rate": 3.306215919520579e-05, "loss": 0.9846, "step": 387700 }, { "epoch": 0.35, "grad_norm": 5.125, "learning_rate": 3.305766012201486e-05, "loss": 1.0451, "step": 387800 }, { "epoch": 0.35, "grad_norm": 0.10791015625, "learning_rate": 3.3053161048823945e-05, "loss": 1.0468, "step": 387900 }, { "epoch": 0.35, "grad_norm": 21.0, "learning_rate": 3.304866197563302e-05, "loss": 1.0574, "step": 388000 }, { "epoch": 0.35, "grad_norm": 12.3125, "learning_rate": 3.30441629024421e-05, "loss": 1.0069, "step": 388100 }, { "epoch": 0.35, "grad_norm": 27.0, "learning_rate": 3.303966382925118e-05, "loss": 1.1069, "step": 388200 }, { "epoch": 0.35, "grad_norm": 17.5, "learning_rate": 3.3035164756060253e-05, "loss": 0.8954, "step": 388300 }, { "epoch": 0.35, "grad_norm": 20.125, "learning_rate": 3.3030665682869336e-05, "loss": 1.0044, "step": 388400 }, { "epoch": 0.35, "grad_norm": 0.55078125, "learning_rate": 3.302616660967841e-05, "loss": 0.9934, "step": 388500 }, { "epoch": 0.35, "grad_norm": 11.25, "learning_rate": 3.302166753648748e-05, "loss": 0.9737, "step": 388600 }, { "epoch": 0.35, "grad_norm": 19.375, "learning_rate": 3.301716846329656e-05, "loss": 0.9581, "step": 388700 }, { "epoch": 0.35, "grad_norm": 95.0, "learning_rate": 3.301266939010564e-05, "loss": 1.0132, "step": 388800 }, { "epoch": 0.35, "grad_norm": 3.546875, "learning_rate": 3.300817031691472e-05, "loss": 0.9096, "step": 388900 }, { "epoch": 0.35, "grad_norm": 21.5, "learning_rate": 3.3003671243723794e-05, "loss": 1.0295, "step": 389000 }, { "epoch": 0.35, "grad_norm": 21.5, "learning_rate": 3.299917217053287e-05, "loss": 0.9456, "step": 389100 }, { "epoch": 0.35, "grad_norm": 30.5, "learning_rate": 3.299467309734195e-05, "loss": 1.064, "step": 389200 }, { "epoch": 0.35, "grad_norm": 28.75, "learning_rate": 3.299017402415103e-05, "loss": 0.775, "step": 389300 }, { "epoch": 0.35, "grad_norm": 21.0, "learning_rate": 3.29856749509601e-05, "loss": 0.9785, "step": 389400 }, { "epoch": 0.35, "grad_norm": 22.5, "learning_rate": 3.2981175877769185e-05, "loss": 1.0503, "step": 389500 }, { "epoch": 0.35, "grad_norm": 0.0849609375, "learning_rate": 3.297667680457826e-05, "loss": 0.915, "step": 389600 }, { "epoch": 0.35, "grad_norm": 14.5, "learning_rate": 3.2972177731387335e-05, "loss": 1.0422, "step": 389700 }, { "epoch": 0.35, "grad_norm": 99.5, "learning_rate": 3.296767865819642e-05, "loss": 0.9847, "step": 389800 }, { "epoch": 0.35, "grad_norm": 60.25, "learning_rate": 3.2963179585005486e-05, "loss": 1.0998, "step": 389900 }, { "epoch": 0.35, "grad_norm": 18.375, "learning_rate": 3.295868051181457e-05, "loss": 0.9966, "step": 390000 } ], "logging_steps": 100, "max_steps": 1122566, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 6000, "total_flos": 6.141852751798272e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }