{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.09086325436544489, "eval_steps": 500, "global_step": 102000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 173.0, "learning_rate": 4.4539461963299484e-07, "loss": 2.153, "step": 100 }, { "epoch": 0.0, "grad_norm": 40.5, "learning_rate": 8.907892392659897e-07, "loss": 2.398, "step": 200 }, { "epoch": 0.0, "grad_norm": 223.0, "learning_rate": 1.3361838588989846e-06, "loss": 1.8912, "step": 300 }, { "epoch": 0.0, "grad_norm": 95.5, "learning_rate": 1.7815784785319793e-06, "loss": 1.6403, "step": 400 }, { "epoch": 0.0, "grad_norm": 600.0, "learning_rate": 2.226973098164974e-06, "loss": 1.4652, "step": 500 }, { "epoch": 0.0, "grad_norm": 51.75, "learning_rate": 2.672367717797969e-06, "loss": 1.2489, "step": 600 }, { "epoch": 0.0, "grad_norm": 290.0, "learning_rate": 3.1177623374309637e-06, "loss": 1.2421, "step": 700 }, { "epoch": 0.0, "grad_norm": 27.75, "learning_rate": 3.5631569570639587e-06, "loss": 1.2188, "step": 800 }, { "epoch": 0.0, "grad_norm": 13.0, "learning_rate": 4.008551576696954e-06, "loss": 1.123, "step": 900 }, { "epoch": 0.0, "grad_norm": 39.75, "learning_rate": 4.453946196329948e-06, "loss": 1.0388, "step": 1000 }, { "epoch": 0.0, "grad_norm": 21.375, "learning_rate": 4.899340815962943e-06, "loss": 1.3034, "step": 1100 }, { "epoch": 0.0, "grad_norm": 36.75, "learning_rate": 5.344735435595938e-06, "loss": 1.2272, "step": 1200 }, { "epoch": 0.0, "grad_norm": 71.5, "learning_rate": 5.790130055228933e-06, "loss": 1.1825, "step": 1300 }, { "epoch": 0.0, "grad_norm": 54.5, "learning_rate": 6.235524674861927e-06, "loss": 1.3466, "step": 1400 }, { "epoch": 0.0, "grad_norm": 169.0, "learning_rate": 6.680919294494922e-06, "loss": 1.0543, "step": 1500 }, { "epoch": 0.0, "grad_norm": 47.75, "learning_rate": 7.126313914127917e-06, "loss": 1.1117, "step": 1600 }, { "epoch": 0.0, "grad_norm": 9.5, "learning_rate": 7.571708533760913e-06, "loss": 1.1458, "step": 1700 }, { "epoch": 0.0, "grad_norm": 119.0, "learning_rate": 8.017103153393907e-06, "loss": 1.1823, "step": 1800 }, { "epoch": 0.0, "grad_norm": 31.0, "learning_rate": 8.462497773026902e-06, "loss": 1.2171, "step": 1900 }, { "epoch": 0.0, "grad_norm": 57.25, "learning_rate": 8.907892392659896e-06, "loss": 1.16, "step": 2000 }, { "epoch": 0.0, "grad_norm": 77.0, "learning_rate": 9.353287012292893e-06, "loss": 1.1243, "step": 2100 }, { "epoch": 0.0, "grad_norm": 31.5, "learning_rate": 9.798681631925886e-06, "loss": 1.1218, "step": 2200 }, { "epoch": 0.0, "grad_norm": 98.0, "learning_rate": 1.0244076251558882e-05, "loss": 1.1527, "step": 2300 }, { "epoch": 0.0, "grad_norm": 32.25, "learning_rate": 1.0689470871191876e-05, "loss": 1.1042, "step": 2400 }, { "epoch": 0.0, "grad_norm": 29.0, "learning_rate": 1.1134865490824871e-05, "loss": 1.0131, "step": 2500 }, { "epoch": 0.0, "grad_norm": 50.0, "learning_rate": 1.1580260110457866e-05, "loss": 1.2544, "step": 2600 }, { "epoch": 0.0, "grad_norm": 20.625, "learning_rate": 1.2025654730090862e-05, "loss": 1.1475, "step": 2700 }, { "epoch": 0.0, "grad_norm": 23.375, "learning_rate": 1.2471049349723855e-05, "loss": 1.2345, "step": 2800 }, { "epoch": 0.0, "grad_norm": 39.5, "learning_rate": 1.2916443969356851e-05, "loss": 1.269, "step": 2900 }, { "epoch": 0.0, "grad_norm": 30.5, "learning_rate": 1.3361838588989844e-05, "loss": 1.1995, "step": 3000 }, { "epoch": 0.0, "grad_norm": 17.875, "learning_rate": 1.3807233208622842e-05, "loss": 1.1367, "step": 3100 }, { "epoch": 0.0, "grad_norm": 255.0, "learning_rate": 1.4252627828255835e-05, "loss": 1.1993, "step": 3200 }, { "epoch": 0.0, "grad_norm": 73.5, "learning_rate": 1.469802244788883e-05, "loss": 1.039, "step": 3300 }, { "epoch": 0.0, "grad_norm": 106.5, "learning_rate": 1.5143417067521826e-05, "loss": 1.2538, "step": 3400 }, { "epoch": 0.0, "grad_norm": 153.0, "learning_rate": 1.558881168715482e-05, "loss": 1.156, "step": 3500 }, { "epoch": 0.0, "grad_norm": 111.0, "learning_rate": 1.6034206306787815e-05, "loss": 1.2418, "step": 3600 }, { "epoch": 0.0, "grad_norm": 20.25, "learning_rate": 1.647960092642081e-05, "loss": 1.0018, "step": 3700 }, { "epoch": 0.0, "grad_norm": 68.0, "learning_rate": 1.6924995546053804e-05, "loss": 1.101, "step": 3800 }, { "epoch": 0.0, "grad_norm": 79.5, "learning_rate": 1.7370390165686802e-05, "loss": 1.1538, "step": 3900 }, { "epoch": 0.0, "grad_norm": 26.5, "learning_rate": 1.7815784785319793e-05, "loss": 1.176, "step": 4000 }, { "epoch": 0.0, "grad_norm": 96.5, "learning_rate": 1.8261179404952788e-05, "loss": 0.9909, "step": 4100 }, { "epoch": 0.0, "grad_norm": 45.75, "learning_rate": 1.8706574024585786e-05, "loss": 1.1521, "step": 4200 }, { "epoch": 0.0, "grad_norm": 42.0, "learning_rate": 1.915196864421878e-05, "loss": 1.1438, "step": 4300 }, { "epoch": 0.0, "grad_norm": 16.125, "learning_rate": 1.959736326385177e-05, "loss": 1.1887, "step": 4400 }, { "epoch": 0.0, "grad_norm": 38.75, "learning_rate": 2.004275788348477e-05, "loss": 1.2946, "step": 4500 }, { "epoch": 0.0, "grad_norm": 99.5, "learning_rate": 2.0488152503117764e-05, "loss": 1.2574, "step": 4600 }, { "epoch": 0.0, "grad_norm": 65.5, "learning_rate": 2.093354712275076e-05, "loss": 1.1517, "step": 4700 }, { "epoch": 0.0, "grad_norm": 79.5, "learning_rate": 2.1378941742383753e-05, "loss": 1.187, "step": 4800 }, { "epoch": 0.0, "grad_norm": 162.0, "learning_rate": 2.1824336362016748e-05, "loss": 0.9974, "step": 4900 }, { "epoch": 0.0, "grad_norm": 48.5, "learning_rate": 2.2269730981649742e-05, "loss": 1.2278, "step": 5000 }, { "epoch": 0.0, "grad_norm": 0.09423828125, "learning_rate": 2.271512560128274e-05, "loss": 1.2128, "step": 5100 }, { "epoch": 0.0, "grad_norm": 34.75, "learning_rate": 2.316052022091573e-05, "loss": 1.1712, "step": 5200 }, { "epoch": 0.0, "grad_norm": 0.01483154296875, "learning_rate": 2.3605914840548726e-05, "loss": 0.9784, "step": 5300 }, { "epoch": 0.0, "grad_norm": 89.0, "learning_rate": 2.4051309460181724e-05, "loss": 1.1382, "step": 5400 }, { "epoch": 0.0, "grad_norm": 59.5, "learning_rate": 2.449670407981472e-05, "loss": 1.291, "step": 5500 }, { "epoch": 0.0, "grad_norm": 29.5, "learning_rate": 2.494209869944771e-05, "loss": 1.1317, "step": 5600 }, { "epoch": 0.01, "grad_norm": 24.125, "learning_rate": 2.5387493319080707e-05, "loss": 1.1689, "step": 5700 }, { "epoch": 0.01, "grad_norm": 34.0, "learning_rate": 2.5832887938713702e-05, "loss": 1.2427, "step": 5800 }, { "epoch": 0.01, "grad_norm": 34.5, "learning_rate": 2.6278282558346697e-05, "loss": 1.353, "step": 5900 }, { "epoch": 0.01, "grad_norm": 32.25, "learning_rate": 2.6723677177979688e-05, "loss": 1.1593, "step": 6000 }, { "epoch": 0.01, "grad_norm": 33.75, "learning_rate": 2.716907179761269e-05, "loss": 1.1391, "step": 6100 }, { "epoch": 0.01, "grad_norm": 82.0, "learning_rate": 2.7614466417245684e-05, "loss": 1.1999, "step": 6200 }, { "epoch": 0.01, "grad_norm": 25.375, "learning_rate": 2.8059861036878675e-05, "loss": 1.0865, "step": 6300 }, { "epoch": 0.01, "grad_norm": 34.25, "learning_rate": 2.850525565651167e-05, "loss": 1.0761, "step": 6400 }, { "epoch": 0.01, "grad_norm": 44.0, "learning_rate": 2.8950650276144664e-05, "loss": 1.1187, "step": 6500 }, { "epoch": 0.01, "grad_norm": 38.25, "learning_rate": 2.939604489577766e-05, "loss": 1.0824, "step": 6600 }, { "epoch": 0.01, "grad_norm": 49.75, "learning_rate": 2.9841439515410657e-05, "loss": 1.0826, "step": 6700 }, { "epoch": 0.01, "grad_norm": 47.0, "learning_rate": 3.028683413504365e-05, "loss": 1.1387, "step": 6800 }, { "epoch": 0.01, "grad_norm": 61.0, "learning_rate": 3.073222875467664e-05, "loss": 1.1661, "step": 6900 }, { "epoch": 0.01, "grad_norm": 31.625, "learning_rate": 3.117762337430964e-05, "loss": 1.2022, "step": 7000 }, { "epoch": 0.01, "grad_norm": 38.75, "learning_rate": 3.162301799394263e-05, "loss": 1.2461, "step": 7100 }, { "epoch": 0.01, "grad_norm": 28.5, "learning_rate": 3.206841261357563e-05, "loss": 1.2583, "step": 7200 }, { "epoch": 0.01, "grad_norm": 12.0, "learning_rate": 3.251380723320863e-05, "loss": 1.1094, "step": 7300 }, { "epoch": 0.01, "grad_norm": 68.0, "learning_rate": 3.295920185284162e-05, "loss": 1.2244, "step": 7400 }, { "epoch": 0.01, "grad_norm": 42.0, "learning_rate": 3.3404596472474617e-05, "loss": 1.1486, "step": 7500 }, { "epoch": 0.01, "grad_norm": 344.0, "learning_rate": 3.384999109210761e-05, "loss": 1.2049, "step": 7600 }, { "epoch": 0.01, "grad_norm": 11.6875, "learning_rate": 3.42953857117406e-05, "loss": 1.0226, "step": 7700 }, { "epoch": 0.01, "grad_norm": 43.75, "learning_rate": 3.4740780331373604e-05, "loss": 1.2662, "step": 7800 }, { "epoch": 0.01, "grad_norm": 51.5, "learning_rate": 3.5186174951006595e-05, "loss": 1.1029, "step": 7900 }, { "epoch": 0.01, "grad_norm": 36.25, "learning_rate": 3.5631569570639586e-05, "loss": 1.1623, "step": 8000 }, { "epoch": 0.01, "grad_norm": 32.75, "learning_rate": 3.6076964190272584e-05, "loss": 1.0682, "step": 8100 }, { "epoch": 0.01, "grad_norm": 57.75, "learning_rate": 3.6522358809905575e-05, "loss": 1.1232, "step": 8200 }, { "epoch": 0.01, "grad_norm": 27.125, "learning_rate": 3.696775342953857e-05, "loss": 1.1474, "step": 8300 }, { "epoch": 0.01, "grad_norm": 93.0, "learning_rate": 3.741314804917157e-05, "loss": 1.0526, "step": 8400 }, { "epoch": 0.01, "grad_norm": 20.125, "learning_rate": 3.785854266880456e-05, "loss": 1.2, "step": 8500 }, { "epoch": 0.01, "grad_norm": 46.75, "learning_rate": 3.830393728843756e-05, "loss": 1.1716, "step": 8600 }, { "epoch": 0.01, "grad_norm": 54.75, "learning_rate": 3.874933190807055e-05, "loss": 1.042, "step": 8700 }, { "epoch": 0.01, "grad_norm": 51.0, "learning_rate": 3.919472652770354e-05, "loss": 1.1756, "step": 8800 }, { "epoch": 0.01, "grad_norm": 48.25, "learning_rate": 3.964012114733654e-05, "loss": 1.2597, "step": 8900 }, { "epoch": 0.01, "grad_norm": 14.75, "learning_rate": 4.008551576696954e-05, "loss": 1.0449, "step": 9000 }, { "epoch": 0.01, "grad_norm": 18.25, "learning_rate": 4.0530910386602536e-05, "loss": 1.2163, "step": 9100 }, { "epoch": 0.01, "grad_norm": 80.0, "learning_rate": 4.097630500623553e-05, "loss": 1.2302, "step": 9200 }, { "epoch": 0.01, "grad_norm": 32.0, "learning_rate": 4.142169962586852e-05, "loss": 1.1843, "step": 9300 }, { "epoch": 0.01, "grad_norm": 44.25, "learning_rate": 4.186709424550152e-05, "loss": 1.0384, "step": 9400 }, { "epoch": 0.01, "grad_norm": 66.5, "learning_rate": 4.231248886513451e-05, "loss": 1.0315, "step": 9500 }, { "epoch": 0.01, "grad_norm": 72.5, "learning_rate": 4.2757883484767506e-05, "loss": 1.1746, "step": 9600 }, { "epoch": 0.01, "grad_norm": 37.5, "learning_rate": 4.3203278104400504e-05, "loss": 1.2093, "step": 9700 }, { "epoch": 0.01, "grad_norm": 81.5, "learning_rate": 4.3648672724033495e-05, "loss": 1.3342, "step": 9800 }, { "epoch": 0.01, "grad_norm": 88.5, "learning_rate": 4.409406734366649e-05, "loss": 1.1162, "step": 9900 }, { "epoch": 0.01, "grad_norm": 356.0, "learning_rate": 4.4539461963299484e-05, "loss": 0.8431, "step": 10000 }, { "epoch": 0.01, "grad_norm": 75.5, "learning_rate": 4.498485658293248e-05, "loss": 1.15, "step": 10100 }, { "epoch": 0.01, "grad_norm": 51.0, "learning_rate": 4.543025120256548e-05, "loss": 1.1635, "step": 10200 }, { "epoch": 0.01, "grad_norm": 136.0, "learning_rate": 4.587564582219847e-05, "loss": 1.1175, "step": 10300 }, { "epoch": 0.01, "grad_norm": 180.0, "learning_rate": 4.632104044183146e-05, "loss": 1.0931, "step": 10400 }, { "epoch": 0.01, "grad_norm": 107.5, "learning_rate": 4.676643506146446e-05, "loss": 1.3118, "step": 10500 }, { "epoch": 0.01, "grad_norm": 34.0, "learning_rate": 4.721182968109745e-05, "loss": 1.0301, "step": 10600 }, { "epoch": 0.01, "grad_norm": 33.75, "learning_rate": 4.765722430073045e-05, "loss": 0.9912, "step": 10700 }, { "epoch": 0.01, "grad_norm": 48.5, "learning_rate": 4.810261892036345e-05, "loss": 1.1649, "step": 10800 }, { "epoch": 0.01, "grad_norm": 88.5, "learning_rate": 4.854801353999644e-05, "loss": 1.4056, "step": 10900 }, { "epoch": 0.01, "grad_norm": 143.0, "learning_rate": 4.899340815962944e-05, "loss": 1.1315, "step": 11000 }, { "epoch": 0.01, "grad_norm": 26.5, "learning_rate": 4.943880277926243e-05, "loss": 1.231, "step": 11100 }, { "epoch": 0.01, "grad_norm": 47.75, "learning_rate": 4.988419739889542e-05, "loss": 1.2519, "step": 11200 }, { "epoch": 0.01, "grad_norm": 4.5625, "learning_rate": 4.999667068583872e-05, "loss": 1.1036, "step": 11300 }, { "epoch": 0.01, "grad_norm": 72.5, "learning_rate": 4.99921716126478e-05, "loss": 1.0886, "step": 11400 }, { "epoch": 0.01, "grad_norm": 102.5, "learning_rate": 4.998767253945688e-05, "loss": 1.1742, "step": 11500 }, { "epoch": 0.01, "grad_norm": 127.0, "learning_rate": 4.9983173466265954e-05, "loss": 1.2533, "step": 11600 }, { "epoch": 0.01, "grad_norm": 221.0, "learning_rate": 4.997867439307503e-05, "loss": 1.1356, "step": 11700 }, { "epoch": 0.01, "grad_norm": 132.0, "learning_rate": 4.997417531988411e-05, "loss": 1.0534, "step": 11800 }, { "epoch": 0.01, "grad_norm": 30.625, "learning_rate": 4.996967624669318e-05, "loss": 0.936, "step": 11900 }, { "epoch": 0.01, "grad_norm": 56.5, "learning_rate": 4.9965177173502256e-05, "loss": 1.0134, "step": 12000 }, { "epoch": 0.01, "grad_norm": 37.75, "learning_rate": 4.996067810031134e-05, "loss": 1.1623, "step": 12100 }, { "epoch": 0.01, "grad_norm": 175.0, "learning_rate": 4.995617902712041e-05, "loss": 1.0928, "step": 12200 }, { "epoch": 0.01, "grad_norm": 44.5, "learning_rate": 4.9951679953929495e-05, "loss": 1.2636, "step": 12300 }, { "epoch": 0.01, "grad_norm": 20.625, "learning_rate": 4.994718088073857e-05, "loss": 1.07, "step": 12400 }, { "epoch": 0.01, "grad_norm": 28.75, "learning_rate": 4.9942681807547646e-05, "loss": 0.9899, "step": 12500 }, { "epoch": 0.01, "grad_norm": 132.0, "learning_rate": 4.993818273435673e-05, "loss": 1.1851, "step": 12600 }, { "epoch": 0.01, "grad_norm": 133.0, "learning_rate": 4.9933683661165803e-05, "loss": 1.0886, "step": 12700 }, { "epoch": 0.01, "grad_norm": 84.0, "learning_rate": 4.992918458797488e-05, "loss": 1.0992, "step": 12800 }, { "epoch": 0.01, "grad_norm": 0.48046875, "learning_rate": 4.992468551478396e-05, "loss": 1.1081, "step": 12900 }, { "epoch": 0.01, "grad_norm": 33.0, "learning_rate": 4.9920186441593036e-05, "loss": 1.3126, "step": 13000 }, { "epoch": 0.01, "grad_norm": 27.5, "learning_rate": 4.991568736840211e-05, "loss": 1.1685, "step": 13100 }, { "epoch": 0.01, "grad_norm": 53.25, "learning_rate": 4.991118829521119e-05, "loss": 1.118, "step": 13200 }, { "epoch": 0.01, "grad_norm": 410.0, "learning_rate": 4.990668922202026e-05, "loss": 1.0968, "step": 13300 }, { "epoch": 0.01, "grad_norm": 56.25, "learning_rate": 4.9902190148829344e-05, "loss": 1.1793, "step": 13400 }, { "epoch": 0.01, "grad_norm": 25.75, "learning_rate": 4.989769107563842e-05, "loss": 1.1671, "step": 13500 }, { "epoch": 0.01, "grad_norm": 26.25, "learning_rate": 4.9893192002447495e-05, "loss": 1.1825, "step": 13600 }, { "epoch": 0.01, "grad_norm": 105.5, "learning_rate": 4.988869292925658e-05, "loss": 1.1509, "step": 13700 }, { "epoch": 0.01, "grad_norm": 79.5, "learning_rate": 4.988419385606565e-05, "loss": 1.1337, "step": 13800 }, { "epoch": 0.01, "grad_norm": 36.75, "learning_rate": 4.987969478287473e-05, "loss": 1.0261, "step": 13900 }, { "epoch": 0.01, "grad_norm": 48.75, "learning_rate": 4.987519570968381e-05, "loss": 1.2461, "step": 14000 }, { "epoch": 0.01, "grad_norm": 52.75, "learning_rate": 4.9870696636492885e-05, "loss": 1.1742, "step": 14100 }, { "epoch": 0.01, "grad_norm": 236.0, "learning_rate": 4.986619756330197e-05, "loss": 0.9742, "step": 14200 }, { "epoch": 0.01, "grad_norm": 402.0, "learning_rate": 4.986169849011104e-05, "loss": 1.2615, "step": 14300 }, { "epoch": 0.01, "grad_norm": 120.0, "learning_rate": 4.985719941692012e-05, "loss": 1.3339, "step": 14400 }, { "epoch": 0.01, "grad_norm": 66.0, "learning_rate": 4.9852700343729193e-05, "loss": 1.2984, "step": 14500 }, { "epoch": 0.01, "grad_norm": 37.5, "learning_rate": 4.984820127053827e-05, "loss": 0.9931, "step": 14600 }, { "epoch": 0.01, "grad_norm": 26.625, "learning_rate": 4.9843702197347344e-05, "loss": 1.1177, "step": 14700 }, { "epoch": 0.01, "grad_norm": 143.0, "learning_rate": 4.9839203124156426e-05, "loss": 1.1625, "step": 14800 }, { "epoch": 0.01, "grad_norm": 58.75, "learning_rate": 4.98347040509655e-05, "loss": 1.2321, "step": 14900 }, { "epoch": 0.01, "grad_norm": 27.625, "learning_rate": 4.9830204977774584e-05, "loss": 1.2545, "step": 15000 }, { "epoch": 0.01, "grad_norm": 40.75, "learning_rate": 4.982570590458366e-05, "loss": 1.1757, "step": 15100 }, { "epoch": 0.01, "grad_norm": 44.5, "learning_rate": 4.9821206831392734e-05, "loss": 1.1684, "step": 15200 }, { "epoch": 0.01, "grad_norm": 31.0, "learning_rate": 4.9816707758201816e-05, "loss": 1.2152, "step": 15300 }, { "epoch": 0.01, "grad_norm": 56.5, "learning_rate": 4.981220868501089e-05, "loss": 1.2536, "step": 15400 }, { "epoch": 0.01, "grad_norm": 93.0, "learning_rate": 4.980770961181997e-05, "loss": 1.2398, "step": 15500 }, { "epoch": 0.01, "grad_norm": 63.5, "learning_rate": 4.980321053862905e-05, "loss": 1.0495, "step": 15600 }, { "epoch": 0.01, "grad_norm": 294.0, "learning_rate": 4.9798711465438125e-05, "loss": 1.2303, "step": 15700 }, { "epoch": 0.01, "grad_norm": 97.0, "learning_rate": 4.97942123922472e-05, "loss": 1.0398, "step": 15800 }, { "epoch": 0.01, "grad_norm": 30.125, "learning_rate": 4.9789713319056275e-05, "loss": 1.2357, "step": 15900 }, { "epoch": 0.01, "grad_norm": 64.0, "learning_rate": 4.978521424586535e-05, "loss": 1.1963, "step": 16000 }, { "epoch": 0.01, "grad_norm": 69.0, "learning_rate": 4.978071517267443e-05, "loss": 1.1217, "step": 16100 }, { "epoch": 0.01, "grad_norm": 73.0, "learning_rate": 4.977621609948351e-05, "loss": 1.3172, "step": 16200 }, { "epoch": 0.01, "grad_norm": 6.4375, "learning_rate": 4.977171702629258e-05, "loss": 1.0252, "step": 16300 }, { "epoch": 0.01, "grad_norm": 13.5625, "learning_rate": 4.9767217953101666e-05, "loss": 1.0254, "step": 16400 }, { "epoch": 0.01, "grad_norm": 28.375, "learning_rate": 4.976271887991074e-05, "loss": 1.116, "step": 16500 }, { "epoch": 0.01, "grad_norm": 664.0, "learning_rate": 4.9758219806719816e-05, "loss": 1.1861, "step": 16600 }, { "epoch": 0.01, "grad_norm": 157.0, "learning_rate": 4.97537207335289e-05, "loss": 1.1762, "step": 16700 }, { "epoch": 0.01, "grad_norm": 42.75, "learning_rate": 4.9749221660337974e-05, "loss": 1.2372, "step": 16800 }, { "epoch": 0.02, "grad_norm": 2.578125, "learning_rate": 4.974472258714705e-05, "loss": 1.1629, "step": 16900 }, { "epoch": 0.02, "grad_norm": 196.0, "learning_rate": 4.974022351395613e-05, "loss": 1.1746, "step": 17000 }, { "epoch": 0.02, "grad_norm": 43.5, "learning_rate": 4.97357244407652e-05, "loss": 1.1731, "step": 17100 }, { "epoch": 0.02, "grad_norm": 22.375, "learning_rate": 4.973122536757428e-05, "loss": 1.0003, "step": 17200 }, { "epoch": 0.02, "grad_norm": 44.75, "learning_rate": 4.972672629438336e-05, "loss": 1.0736, "step": 17300 }, { "epoch": 0.02, "grad_norm": 2.5625, "learning_rate": 4.972222722119243e-05, "loss": 1.2785, "step": 17400 }, { "epoch": 0.02, "grad_norm": 31.5, "learning_rate": 4.9717728148001515e-05, "loss": 1.0953, "step": 17500 }, { "epoch": 0.02, "grad_norm": 83.5, "learning_rate": 4.971322907481059e-05, "loss": 1.1549, "step": 17600 }, { "epoch": 0.02, "grad_norm": 54.0, "learning_rate": 4.970873000161967e-05, "loss": 1.3649, "step": 17700 }, { "epoch": 0.02, "grad_norm": 182.0, "learning_rate": 4.970423092842875e-05, "loss": 1.109, "step": 17800 }, { "epoch": 0.02, "grad_norm": 233.0, "learning_rate": 4.969973185523782e-05, "loss": 1.1443, "step": 17900 }, { "epoch": 0.02, "grad_norm": 65.5, "learning_rate": 4.9695232782046905e-05, "loss": 1.1159, "step": 18000 }, { "epoch": 0.02, "grad_norm": 494.0, "learning_rate": 4.969073370885598e-05, "loss": 1.1072, "step": 18100 }, { "epoch": 0.02, "grad_norm": 105.0, "learning_rate": 4.9686234635665055e-05, "loss": 1.0828, "step": 18200 }, { "epoch": 0.02, "grad_norm": 32.0, "learning_rate": 4.968173556247414e-05, "loss": 1.092, "step": 18300 }, { "epoch": 0.02, "grad_norm": 40.5, "learning_rate": 4.9677236489283206e-05, "loss": 1.093, "step": 18400 }, { "epoch": 0.02, "grad_norm": 20.0, "learning_rate": 4.967273741609229e-05, "loss": 1.283, "step": 18500 }, { "epoch": 0.02, "grad_norm": 69.0, "learning_rate": 4.9668238342901364e-05, "loss": 1.2988, "step": 18600 }, { "epoch": 0.02, "grad_norm": 102.5, "learning_rate": 4.966373926971044e-05, "loss": 1.2539, "step": 18700 }, { "epoch": 0.02, "grad_norm": 54.5, "learning_rate": 4.965924019651952e-05, "loss": 0.9519, "step": 18800 }, { "epoch": 0.02, "grad_norm": 52.0, "learning_rate": 4.9654741123328596e-05, "loss": 1.0343, "step": 18900 }, { "epoch": 0.02, "grad_norm": 91.0, "learning_rate": 4.965024205013767e-05, "loss": 1.0141, "step": 19000 }, { "epoch": 0.02, "grad_norm": 18.5, "learning_rate": 4.9645742976946754e-05, "loss": 1.0664, "step": 19100 }, { "epoch": 0.02, "grad_norm": 82.0, "learning_rate": 4.964124390375583e-05, "loss": 1.1607, "step": 19200 }, { "epoch": 0.02, "grad_norm": 133.0, "learning_rate": 4.9636744830564905e-05, "loss": 1.0688, "step": 19300 }, { "epoch": 0.02, "grad_norm": 23.5, "learning_rate": 4.963224575737399e-05, "loss": 1.006, "step": 19400 }, { "epoch": 0.02, "grad_norm": 29.125, "learning_rate": 4.962774668418306e-05, "loss": 1.2201, "step": 19500 }, { "epoch": 0.02, "grad_norm": 214.0, "learning_rate": 4.962324761099214e-05, "loss": 1.1192, "step": 19600 }, { "epoch": 0.02, "grad_norm": 34.75, "learning_rate": 4.961874853780121e-05, "loss": 1.1362, "step": 19700 }, { "epoch": 0.02, "grad_norm": 156.0, "learning_rate": 4.961424946461029e-05, "loss": 1.1204, "step": 19800 }, { "epoch": 0.02, "grad_norm": 157.0, "learning_rate": 4.960975039141937e-05, "loss": 1.0584, "step": 19900 }, { "epoch": 0.02, "grad_norm": 37.5, "learning_rate": 4.9605251318228445e-05, "loss": 1.2305, "step": 20000 }, { "epoch": 0.02, "grad_norm": 127.0, "learning_rate": 4.960075224503752e-05, "loss": 1.0515, "step": 20100 }, { "epoch": 0.02, "grad_norm": 126.5, "learning_rate": 4.95962531718466e-05, "loss": 1.1291, "step": 20200 }, { "epoch": 0.02, "grad_norm": 20.75, "learning_rate": 4.959175409865568e-05, "loss": 1.2646, "step": 20300 }, { "epoch": 0.02, "grad_norm": 37.25, "learning_rate": 4.958725502546476e-05, "loss": 1.0123, "step": 20400 }, { "epoch": 0.02, "grad_norm": 39.25, "learning_rate": 4.9582755952273836e-05, "loss": 1.2387, "step": 20500 }, { "epoch": 0.02, "grad_norm": 103.5, "learning_rate": 4.957825687908291e-05, "loss": 1.097, "step": 20600 }, { "epoch": 0.02, "grad_norm": 12.25, "learning_rate": 4.957375780589199e-05, "loss": 0.9842, "step": 20700 }, { "epoch": 0.02, "grad_norm": 41.75, "learning_rate": 4.956925873270107e-05, "loss": 1.018, "step": 20800 }, { "epoch": 0.02, "grad_norm": 20.5, "learning_rate": 4.9564759659510144e-05, "loss": 1.0691, "step": 20900 }, { "epoch": 0.02, "grad_norm": 72.0, "learning_rate": 4.956026058631922e-05, "loss": 1.1096, "step": 21000 }, { "epoch": 0.02, "grad_norm": 0.0006103515625, "learning_rate": 4.9555761513128295e-05, "loss": 1.2363, "step": 21100 }, { "epoch": 0.02, "grad_norm": 37.75, "learning_rate": 4.955126243993738e-05, "loss": 1.1209, "step": 21200 }, { "epoch": 0.02, "grad_norm": 43.25, "learning_rate": 4.954676336674645e-05, "loss": 1.3024, "step": 21300 }, { "epoch": 0.02, "grad_norm": 2.125, "learning_rate": 4.954226429355553e-05, "loss": 1.1737, "step": 21400 }, { "epoch": 0.02, "grad_norm": 203.0, "learning_rate": 4.953776522036461e-05, "loss": 1.0662, "step": 21500 }, { "epoch": 0.02, "grad_norm": 22.0, "learning_rate": 4.9533266147173685e-05, "loss": 1.0818, "step": 21600 }, { "epoch": 0.02, "grad_norm": 15.3125, "learning_rate": 4.952876707398276e-05, "loss": 1.1323, "step": 21700 }, { "epoch": 0.02, "grad_norm": 6.59375, "learning_rate": 4.952426800079184e-05, "loss": 0.9627, "step": 21800 }, { "epoch": 0.02, "grad_norm": 145.0, "learning_rate": 4.951976892760092e-05, "loss": 1.1693, "step": 21900 }, { "epoch": 0.02, "grad_norm": 65.5, "learning_rate": 4.951526985440999e-05, "loss": 1.2508, "step": 22000 }, { "epoch": 0.02, "grad_norm": 26.875, "learning_rate": 4.9510770781219075e-05, "loss": 0.9449, "step": 22100 }, { "epoch": 0.02, "grad_norm": 23.625, "learning_rate": 4.950627170802815e-05, "loss": 0.9817, "step": 22200 }, { "epoch": 0.02, "grad_norm": 83.0, "learning_rate": 4.9501772634837226e-05, "loss": 1.1505, "step": 22300 }, { "epoch": 0.02, "grad_norm": 33.0, "learning_rate": 4.94972735616463e-05, "loss": 1.0348, "step": 22400 }, { "epoch": 0.02, "grad_norm": 77.5, "learning_rate": 4.9492774488455376e-05, "loss": 1.1222, "step": 22500 }, { "epoch": 0.02, "grad_norm": 40.25, "learning_rate": 4.948827541526446e-05, "loss": 1.2024, "step": 22600 }, { "epoch": 0.02, "grad_norm": 103.5, "learning_rate": 4.9483776342073534e-05, "loss": 1.1287, "step": 22700 }, { "epoch": 0.02, "grad_norm": 45.25, "learning_rate": 4.947927726888261e-05, "loss": 1.1186, "step": 22800 }, { "epoch": 0.02, "grad_norm": 14.625, "learning_rate": 4.947477819569169e-05, "loss": 1.0359, "step": 22900 }, { "epoch": 0.02, "grad_norm": 36.25, "learning_rate": 4.947027912250077e-05, "loss": 0.9546, "step": 23000 }, { "epoch": 0.02, "grad_norm": 73.5, "learning_rate": 4.946578004930985e-05, "loss": 1.2416, "step": 23100 }, { "epoch": 0.02, "grad_norm": 33.0, "learning_rate": 4.9461280976118924e-05, "loss": 1.0936, "step": 23200 }, { "epoch": 0.02, "grad_norm": 74.5, "learning_rate": 4.9456781902928e-05, "loss": 1.1805, "step": 23300 }, { "epoch": 0.02, "grad_norm": 4.90625, "learning_rate": 4.945228282973708e-05, "loss": 1.1044, "step": 23400 }, { "epoch": 0.02, "grad_norm": 22.375, "learning_rate": 4.944778375654616e-05, "loss": 1.1009, "step": 23500 }, { "epoch": 0.02, "grad_norm": 18.125, "learning_rate": 4.9443284683355225e-05, "loss": 1.1798, "step": 23600 }, { "epoch": 0.02, "grad_norm": 72.0, "learning_rate": 4.943878561016431e-05, "loss": 1.0636, "step": 23700 }, { "epoch": 0.02, "grad_norm": 13.5625, "learning_rate": 4.943428653697338e-05, "loss": 1.2526, "step": 23800 }, { "epoch": 0.02, "grad_norm": 61.75, "learning_rate": 4.9429787463782465e-05, "loss": 1.2469, "step": 23900 }, { "epoch": 0.02, "grad_norm": 48.75, "learning_rate": 4.942528839059154e-05, "loss": 1.1861, "step": 24000 }, { "epoch": 0.02, "grad_norm": 216.0, "learning_rate": 4.9420789317400616e-05, "loss": 1.1375, "step": 24100 }, { "epoch": 0.02, "grad_norm": 20.625, "learning_rate": 4.94162902442097e-05, "loss": 1.1822, "step": 24200 }, { "epoch": 0.02, "grad_norm": 26.875, "learning_rate": 4.941179117101877e-05, "loss": 1.1196, "step": 24300 }, { "epoch": 0.02, "grad_norm": 39.0, "learning_rate": 4.940729209782785e-05, "loss": 1.1318, "step": 24400 }, { "epoch": 0.02, "grad_norm": 24.625, "learning_rate": 4.940279302463693e-05, "loss": 1.0661, "step": 24500 }, { "epoch": 0.02, "grad_norm": 52.75, "learning_rate": 4.9398293951446006e-05, "loss": 1.0103, "step": 24600 }, { "epoch": 0.02, "grad_norm": 124.5, "learning_rate": 4.939379487825508e-05, "loss": 1.1614, "step": 24700 }, { "epoch": 0.02, "grad_norm": 14.1875, "learning_rate": 4.9389295805064163e-05, "loss": 1.0209, "step": 24800 }, { "epoch": 0.02, "grad_norm": 47.0, "learning_rate": 4.938479673187323e-05, "loss": 1.1871, "step": 24900 }, { "epoch": 0.02, "grad_norm": 278.0, "learning_rate": 4.9380297658682314e-05, "loss": 1.0667, "step": 25000 }, { "epoch": 0.02, "grad_norm": 2.359375, "learning_rate": 4.937579858549139e-05, "loss": 1.0638, "step": 25100 }, { "epoch": 0.02, "grad_norm": 37.75, "learning_rate": 4.9371299512300465e-05, "loss": 0.8903, "step": 25200 }, { "epoch": 0.02, "grad_norm": 464.0, "learning_rate": 4.936680043910955e-05, "loss": 1.0502, "step": 25300 }, { "epoch": 0.02, "grad_norm": 38.5, "learning_rate": 4.936230136591862e-05, "loss": 1.1984, "step": 25400 }, { "epoch": 0.02, "grad_norm": 32.5, "learning_rate": 4.93578022927277e-05, "loss": 1.2607, "step": 25500 }, { "epoch": 0.02, "grad_norm": 48.5, "learning_rate": 4.935330321953678e-05, "loss": 1.058, "step": 25600 }, { "epoch": 0.02, "grad_norm": 59.5, "learning_rate": 4.9348804146345855e-05, "loss": 1.0378, "step": 25700 }, { "epoch": 0.02, "grad_norm": 512.0, "learning_rate": 4.934430507315494e-05, "loss": 1.0418, "step": 25800 }, { "epoch": 0.02, "grad_norm": 47.25, "learning_rate": 4.933980599996401e-05, "loss": 1.1618, "step": 25900 }, { "epoch": 0.02, "grad_norm": 44.75, "learning_rate": 4.933530692677309e-05, "loss": 1.0471, "step": 26000 }, { "epoch": 0.02, "grad_norm": 23.125, "learning_rate": 4.933080785358217e-05, "loss": 1.0217, "step": 26100 }, { "epoch": 0.02, "grad_norm": 112.0, "learning_rate": 4.932630878039124e-05, "loss": 1.0314, "step": 26200 }, { "epoch": 0.02, "grad_norm": 36.25, "learning_rate": 4.9321809707200314e-05, "loss": 1.1879, "step": 26300 }, { "epoch": 0.02, "grad_norm": 36.25, "learning_rate": 4.9317310634009396e-05, "loss": 1.0381, "step": 26400 }, { "epoch": 0.02, "grad_norm": 29.625, "learning_rate": 4.931281156081847e-05, "loss": 1.1444, "step": 26500 }, { "epoch": 0.02, "grad_norm": 37.25, "learning_rate": 4.9308312487627553e-05, "loss": 0.9723, "step": 26600 }, { "epoch": 0.02, "grad_norm": 65.0, "learning_rate": 4.930381341443663e-05, "loss": 1.13, "step": 26700 }, { "epoch": 0.02, "grad_norm": 23.875, "learning_rate": 4.9299314341245704e-05, "loss": 0.9686, "step": 26800 }, { "epoch": 0.02, "grad_norm": 68.0, "learning_rate": 4.9294815268054786e-05, "loss": 1.0283, "step": 26900 }, { "epoch": 0.02, "grad_norm": 42.25, "learning_rate": 4.929031619486386e-05, "loss": 1.1077, "step": 27000 }, { "epoch": 0.02, "grad_norm": 38.25, "learning_rate": 4.928581712167294e-05, "loss": 1.1153, "step": 27100 }, { "epoch": 0.02, "grad_norm": 28.5, "learning_rate": 4.928131804848202e-05, "loss": 1.1458, "step": 27200 }, { "epoch": 0.02, "grad_norm": 352.0, "learning_rate": 4.9276818975291094e-05, "loss": 1.0234, "step": 27300 }, { "epoch": 0.02, "grad_norm": 58.25, "learning_rate": 4.927231990210017e-05, "loss": 1.1267, "step": 27400 }, { "epoch": 0.02, "grad_norm": 326.0, "learning_rate": 4.9267820828909245e-05, "loss": 1.0168, "step": 27500 }, { "epoch": 0.02, "grad_norm": 42.25, "learning_rate": 4.926332175571832e-05, "loss": 1.0986, "step": 27600 }, { "epoch": 0.02, "grad_norm": 81.0, "learning_rate": 4.92588226825274e-05, "loss": 0.9895, "step": 27700 }, { "epoch": 0.02, "grad_norm": 160.0, "learning_rate": 4.925432360933648e-05, "loss": 1.1471, "step": 27800 }, { "epoch": 0.02, "grad_norm": 2.578125, "learning_rate": 4.924982453614555e-05, "loss": 1.0556, "step": 27900 }, { "epoch": 0.02, "grad_norm": 2.421875, "learning_rate": 4.9245325462954635e-05, "loss": 1.0177, "step": 28000 }, { "epoch": 0.03, "grad_norm": 82.5, "learning_rate": 4.924082638976371e-05, "loss": 1.1341, "step": 28100 }, { "epoch": 0.03, "grad_norm": 98.5, "learning_rate": 4.9236327316572786e-05, "loss": 1.2145, "step": 28200 }, { "epoch": 0.03, "grad_norm": 0.05224609375, "learning_rate": 4.923182824338187e-05, "loss": 1.0562, "step": 28300 }, { "epoch": 0.03, "grad_norm": 43.0, "learning_rate": 4.9227329170190943e-05, "loss": 1.2268, "step": 28400 }, { "epoch": 0.03, "grad_norm": 59.25, "learning_rate": 4.9222830097000026e-05, "loss": 1.1499, "step": 28500 }, { "epoch": 0.03, "grad_norm": 592.0, "learning_rate": 4.92183310238091e-05, "loss": 1.215, "step": 28600 }, { "epoch": 0.03, "grad_norm": 16.75, "learning_rate": 4.9213831950618176e-05, "loss": 1.0979, "step": 28700 }, { "epoch": 0.03, "grad_norm": 38.25, "learning_rate": 4.920933287742725e-05, "loss": 1.128, "step": 28800 }, { "epoch": 0.03, "grad_norm": 62.5, "learning_rate": 4.920483380423633e-05, "loss": 1.046, "step": 28900 }, { "epoch": 0.03, "grad_norm": 2.703125, "learning_rate": 4.92003347310454e-05, "loss": 1.063, "step": 29000 }, { "epoch": 0.03, "grad_norm": 7.875, "learning_rate": 4.9195835657854484e-05, "loss": 1.0012, "step": 29100 }, { "epoch": 0.03, "grad_norm": 19.25, "learning_rate": 4.919133658466356e-05, "loss": 1.0919, "step": 29200 }, { "epoch": 0.03, "grad_norm": 14.6875, "learning_rate": 4.918683751147264e-05, "loss": 0.9316, "step": 29300 }, { "epoch": 0.03, "grad_norm": 50.25, "learning_rate": 4.918233843828172e-05, "loss": 1.0796, "step": 29400 }, { "epoch": 0.03, "grad_norm": 0.01422119140625, "learning_rate": 4.917783936509079e-05, "loss": 1.0421, "step": 29500 }, { "epoch": 0.03, "grad_norm": 20.75, "learning_rate": 4.9173340291899875e-05, "loss": 1.1928, "step": 29600 }, { "epoch": 0.03, "grad_norm": 13.25, "learning_rate": 4.916884121870895e-05, "loss": 1.1847, "step": 29700 }, { "epoch": 0.03, "grad_norm": 12.9375, "learning_rate": 4.9164342145518025e-05, "loss": 1.1563, "step": 29800 }, { "epoch": 0.03, "grad_norm": 32.5, "learning_rate": 4.915984307232711e-05, "loss": 1.179, "step": 29900 }, { "epoch": 0.03, "grad_norm": 44.0, "learning_rate": 4.915534399913618e-05, "loss": 1.1422, "step": 30000 }, { "epoch": 0.03, "grad_norm": 46.75, "learning_rate": 4.915084492594526e-05, "loss": 1.0485, "step": 30100 }, { "epoch": 0.03, "grad_norm": 88.0, "learning_rate": 4.9146345852754333e-05, "loss": 1.0639, "step": 30200 }, { "epoch": 0.03, "grad_norm": 31.5, "learning_rate": 4.914184677956341e-05, "loss": 1.1275, "step": 30300 }, { "epoch": 0.03, "grad_norm": 65.5, "learning_rate": 4.913734770637249e-05, "loss": 1.1221, "step": 30400 }, { "epoch": 0.03, "grad_norm": 84.0, "learning_rate": 4.9132848633181566e-05, "loss": 1.1461, "step": 30500 }, { "epoch": 0.03, "grad_norm": 0.09326171875, "learning_rate": 4.912834955999064e-05, "loss": 1.0183, "step": 30600 }, { "epoch": 0.03, "grad_norm": 0.7578125, "learning_rate": 4.9123850486799724e-05, "loss": 1.1438, "step": 30700 }, { "epoch": 0.03, "grad_norm": 43.25, "learning_rate": 4.91193514136088e-05, "loss": 0.9956, "step": 30800 }, { "epoch": 0.03, "grad_norm": 120.0, "learning_rate": 4.9114852340417874e-05, "loss": 1.1497, "step": 30900 }, { "epoch": 0.03, "grad_norm": 29.625, "learning_rate": 4.9110353267226956e-05, "loss": 1.1515, "step": 31000 }, { "epoch": 0.03, "grad_norm": 13.5625, "learning_rate": 4.910585419403603e-05, "loss": 1.0692, "step": 31100 }, { "epoch": 0.03, "grad_norm": 113.0, "learning_rate": 4.9101355120845114e-05, "loss": 1.1573, "step": 31200 }, { "epoch": 0.03, "grad_norm": 14.125, "learning_rate": 4.909685604765419e-05, "loss": 1.0614, "step": 31300 }, { "epoch": 0.03, "grad_norm": 2.359375, "learning_rate": 4.909235697446326e-05, "loss": 1.1084, "step": 31400 }, { "epoch": 0.03, "grad_norm": 0.66796875, "learning_rate": 4.908785790127234e-05, "loss": 0.9684, "step": 31500 }, { "epoch": 0.03, "grad_norm": 36.0, "learning_rate": 4.9083358828081415e-05, "loss": 0.9898, "step": 31600 }, { "epoch": 0.03, "grad_norm": 29.625, "learning_rate": 4.907885975489049e-05, "loss": 1.0407, "step": 31700 }, { "epoch": 0.03, "grad_norm": 12.3125, "learning_rate": 4.907436068169957e-05, "loss": 1.1401, "step": 31800 }, { "epoch": 0.03, "grad_norm": 2.171875, "learning_rate": 4.906986160850865e-05, "loss": 0.9899, "step": 31900 }, { "epoch": 0.03, "grad_norm": 63.25, "learning_rate": 4.906536253531773e-05, "loss": 1.1188, "step": 32000 }, { "epoch": 0.03, "grad_norm": 116.5, "learning_rate": 4.9060863462126806e-05, "loss": 1.1581, "step": 32100 }, { "epoch": 0.03, "grad_norm": 5.1875, "learning_rate": 4.905636438893588e-05, "loss": 1.1116, "step": 32200 }, { "epoch": 0.03, "grad_norm": 71.0, "learning_rate": 4.905186531574496e-05, "loss": 1.0821, "step": 32300 }, { "epoch": 0.03, "grad_norm": 24.875, "learning_rate": 4.904736624255404e-05, "loss": 1.0241, "step": 32400 }, { "epoch": 0.03, "grad_norm": 71.0, "learning_rate": 4.9042867169363114e-05, "loss": 0.9884, "step": 32500 }, { "epoch": 0.03, "grad_norm": 14.25, "learning_rate": 4.9038368096172196e-05, "loss": 1.0877, "step": 32600 }, { "epoch": 0.03, "grad_norm": 14.0625, "learning_rate": 4.9033869022981264e-05, "loss": 1.1312, "step": 32700 }, { "epoch": 0.03, "grad_norm": 28.25, "learning_rate": 4.9029369949790346e-05, "loss": 1.1194, "step": 32800 }, { "epoch": 0.03, "grad_norm": 72.5, "learning_rate": 4.902487087659942e-05, "loss": 1.2167, "step": 32900 }, { "epoch": 0.03, "grad_norm": 60.5, "learning_rate": 4.90203718034085e-05, "loss": 1.0633, "step": 33000 }, { "epoch": 0.03, "grad_norm": 62.25, "learning_rate": 4.901587273021758e-05, "loss": 1.1538, "step": 33100 }, { "epoch": 0.03, "grad_norm": 0.1943359375, "learning_rate": 4.9011373657026655e-05, "loss": 0.9982, "step": 33200 }, { "epoch": 0.03, "grad_norm": 45.25, "learning_rate": 4.900687458383573e-05, "loss": 1.1433, "step": 33300 }, { "epoch": 0.03, "grad_norm": 548.0, "learning_rate": 4.900237551064481e-05, "loss": 1.1228, "step": 33400 }, { "epoch": 0.03, "grad_norm": 24.125, "learning_rate": 4.899787643745389e-05, "loss": 1.0949, "step": 33500 }, { "epoch": 0.03, "grad_norm": 29.625, "learning_rate": 4.899337736426296e-05, "loss": 0.9741, "step": 33600 }, { "epoch": 0.03, "grad_norm": 68.5, "learning_rate": 4.8988878291072045e-05, "loss": 1.1313, "step": 33700 }, { "epoch": 0.03, "grad_norm": 24.125, "learning_rate": 4.898437921788112e-05, "loss": 0.9327, "step": 33800 }, { "epoch": 0.03, "grad_norm": 30.625, "learning_rate": 4.8979880144690196e-05, "loss": 1.0312, "step": 33900 }, { "epoch": 0.03, "grad_norm": 28.0, "learning_rate": 4.897538107149927e-05, "loss": 1.0795, "step": 34000 }, { "epoch": 0.03, "grad_norm": 0.23828125, "learning_rate": 4.8970881998308346e-05, "loss": 1.028, "step": 34100 }, { "epoch": 0.03, "grad_norm": 336.0, "learning_rate": 4.896638292511743e-05, "loss": 0.9252, "step": 34200 }, { "epoch": 0.03, "grad_norm": 59.5, "learning_rate": 4.8961883851926504e-05, "loss": 1.0244, "step": 34300 }, { "epoch": 0.03, "grad_norm": 91.5, "learning_rate": 4.895738477873558e-05, "loss": 1.2062, "step": 34400 }, { "epoch": 0.03, "grad_norm": 24.25, "learning_rate": 4.895288570554466e-05, "loss": 1.016, "step": 34500 }, { "epoch": 0.03, "grad_norm": 326.0, "learning_rate": 4.8948386632353736e-05, "loss": 0.946, "step": 34600 }, { "epoch": 0.03, "grad_norm": 18.75, "learning_rate": 4.894388755916282e-05, "loss": 1.0384, "step": 34700 }, { "epoch": 0.03, "grad_norm": 0.361328125, "learning_rate": 4.8939388485971894e-05, "loss": 1.0267, "step": 34800 }, { "epoch": 0.03, "grad_norm": 26.75, "learning_rate": 4.893488941278097e-05, "loss": 1.0807, "step": 34900 }, { "epoch": 0.03, "grad_norm": 72.0, "learning_rate": 4.893039033959005e-05, "loss": 1.0735, "step": 35000 }, { "epoch": 0.03, "grad_norm": 5.875, "learning_rate": 4.892589126639913e-05, "loss": 1.2151, "step": 35100 }, { "epoch": 0.03, "grad_norm": 170.0, "learning_rate": 4.89213921932082e-05, "loss": 1.189, "step": 35200 }, { "epoch": 0.03, "grad_norm": 56.75, "learning_rate": 4.891689312001728e-05, "loss": 1.1221, "step": 35300 }, { "epoch": 0.03, "grad_norm": 63.75, "learning_rate": 4.891239404682635e-05, "loss": 1.1452, "step": 35400 }, { "epoch": 0.03, "grad_norm": 39.75, "learning_rate": 4.8907894973635435e-05, "loss": 1.2071, "step": 35500 }, { "epoch": 0.03, "grad_norm": 48.0, "learning_rate": 4.890339590044451e-05, "loss": 1.1314, "step": 35600 }, { "epoch": 0.03, "grad_norm": 17.5, "learning_rate": 4.8898896827253586e-05, "loss": 1.1872, "step": 35700 }, { "epoch": 0.03, "grad_norm": 222.0, "learning_rate": 4.889439775406267e-05, "loss": 1.1534, "step": 35800 }, { "epoch": 0.03, "grad_norm": 13.25, "learning_rate": 4.888989868087174e-05, "loss": 1.1041, "step": 35900 }, { "epoch": 0.03, "grad_norm": 79.0, "learning_rate": 4.888539960768082e-05, "loss": 1.0419, "step": 36000 }, { "epoch": 0.03, "grad_norm": 35.75, "learning_rate": 4.88809005344899e-05, "loss": 1.1561, "step": 36100 }, { "epoch": 0.03, "grad_norm": 19.0, "learning_rate": 4.8876401461298976e-05, "loss": 1.0917, "step": 36200 }, { "epoch": 0.03, "grad_norm": 45.5, "learning_rate": 4.887190238810805e-05, "loss": 1.1914, "step": 36300 }, { "epoch": 0.03, "grad_norm": 20.0, "learning_rate": 4.886740331491713e-05, "loss": 1.0026, "step": 36400 }, { "epoch": 0.03, "grad_norm": 20.125, "learning_rate": 4.886290424172621e-05, "loss": 1.1217, "step": 36500 }, { "epoch": 0.03, "grad_norm": 31.125, "learning_rate": 4.8858405168535284e-05, "loss": 1.0603, "step": 36600 }, { "epoch": 0.03, "grad_norm": 42.5, "learning_rate": 4.885390609534436e-05, "loss": 1.1187, "step": 36700 }, { "epoch": 0.03, "grad_norm": 368.0, "learning_rate": 4.8849407022153435e-05, "loss": 1.0626, "step": 36800 }, { "epoch": 0.03, "grad_norm": 30.25, "learning_rate": 4.884490794896252e-05, "loss": 1.1262, "step": 36900 }, { "epoch": 0.03, "grad_norm": 62.0, "learning_rate": 4.884040887577159e-05, "loss": 1.1231, "step": 37000 }, { "epoch": 0.03, "grad_norm": 52.25, "learning_rate": 4.883590980258067e-05, "loss": 1.0812, "step": 37100 }, { "epoch": 0.03, "grad_norm": 114.0, "learning_rate": 4.883141072938975e-05, "loss": 0.9969, "step": 37200 }, { "epoch": 0.03, "grad_norm": 45.0, "learning_rate": 4.8826911656198825e-05, "loss": 1.1014, "step": 37300 }, { "epoch": 0.03, "grad_norm": 102.0, "learning_rate": 4.882241258300791e-05, "loss": 1.1065, "step": 37400 }, { "epoch": 0.03, "grad_norm": 20.375, "learning_rate": 4.881791350981698e-05, "loss": 1.0295, "step": 37500 }, { "epoch": 0.03, "grad_norm": 0.24609375, "learning_rate": 4.881341443662606e-05, "loss": 1.0879, "step": 37600 }, { "epoch": 0.03, "grad_norm": 23.0, "learning_rate": 4.880891536343514e-05, "loss": 1.2481, "step": 37700 }, { "epoch": 0.03, "grad_norm": 127.0, "learning_rate": 4.8804416290244215e-05, "loss": 1.1123, "step": 37800 }, { "epoch": 0.03, "grad_norm": 46.25, "learning_rate": 4.8799917217053284e-05, "loss": 1.1555, "step": 37900 }, { "epoch": 0.03, "grad_norm": 234.0, "learning_rate": 4.8795418143862366e-05, "loss": 1.0754, "step": 38000 }, { "epoch": 0.03, "grad_norm": 31.875, "learning_rate": 4.879091907067144e-05, "loss": 1.029, "step": 38100 }, { "epoch": 0.03, "grad_norm": 2.34375, "learning_rate": 4.878641999748052e-05, "loss": 1.1966, "step": 38200 }, { "epoch": 0.03, "grad_norm": 96.0, "learning_rate": 4.87819209242896e-05, "loss": 1.0488, "step": 38300 }, { "epoch": 0.03, "grad_norm": 13.8125, "learning_rate": 4.8777421851098674e-05, "loss": 1.075, "step": 38400 }, { "epoch": 0.03, "grad_norm": 35.75, "learning_rate": 4.8772922777907756e-05, "loss": 1.0888, "step": 38500 }, { "epoch": 0.03, "grad_norm": 14.3125, "learning_rate": 4.876842370471683e-05, "loss": 1.0776, "step": 38600 }, { "epoch": 0.03, "grad_norm": 29.0, "learning_rate": 4.876392463152591e-05, "loss": 1.1729, "step": 38700 }, { "epoch": 0.03, "grad_norm": 48.25, "learning_rate": 4.875942555833499e-05, "loss": 1.0547, "step": 38800 }, { "epoch": 0.03, "grad_norm": 24.125, "learning_rate": 4.8754926485144064e-05, "loss": 1.033, "step": 38900 }, { "epoch": 0.03, "grad_norm": 22.625, "learning_rate": 4.875042741195314e-05, "loss": 1.0052, "step": 39000 }, { "epoch": 0.03, "grad_norm": 11.5625, "learning_rate": 4.874592833876222e-05, "loss": 1.0835, "step": 39100 }, { "epoch": 0.03, "grad_norm": 38.0, "learning_rate": 4.874142926557129e-05, "loss": 0.9219, "step": 39200 }, { "epoch": 0.04, "grad_norm": 121.5, "learning_rate": 4.873693019238037e-05, "loss": 1.0016, "step": 39300 }, { "epoch": 0.04, "grad_norm": 49.25, "learning_rate": 4.873243111918945e-05, "loss": 0.9643, "step": 39400 }, { "epoch": 0.04, "grad_norm": 40.0, "learning_rate": 4.872793204599852e-05, "loss": 1.1834, "step": 39500 }, { "epoch": 0.04, "grad_norm": 23.125, "learning_rate": 4.8723432972807605e-05, "loss": 1.1207, "step": 39600 }, { "epoch": 0.04, "grad_norm": 0.01385498046875, "learning_rate": 4.871893389961668e-05, "loss": 1.1989, "step": 39700 }, { "epoch": 0.04, "grad_norm": 101.0, "learning_rate": 4.8714434826425756e-05, "loss": 1.0312, "step": 39800 }, { "epoch": 0.04, "grad_norm": 25.625, "learning_rate": 4.870993575323484e-05, "loss": 1.0033, "step": 39900 }, { "epoch": 0.04, "grad_norm": 19.375, "learning_rate": 4.870543668004391e-05, "loss": 1.1085, "step": 40000 }, { "epoch": 0.04, "grad_norm": 71.0, "learning_rate": 4.8700937606852995e-05, "loss": 1.0862, "step": 40100 }, { "epoch": 0.04, "grad_norm": 0.00616455078125, "learning_rate": 4.869643853366207e-05, "loss": 1.0734, "step": 40200 }, { "epoch": 0.04, "grad_norm": 162.0, "learning_rate": 4.8691939460471146e-05, "loss": 1.1716, "step": 40300 }, { "epoch": 0.04, "grad_norm": 8.6875, "learning_rate": 4.868744038728022e-05, "loss": 1.0932, "step": 40400 }, { "epoch": 0.04, "grad_norm": 13.0, "learning_rate": 4.86829413140893e-05, "loss": 1.2164, "step": 40500 }, { "epoch": 0.04, "grad_norm": 46.5, "learning_rate": 4.867844224089837e-05, "loss": 1.1104, "step": 40600 }, { "epoch": 0.04, "grad_norm": 262.0, "learning_rate": 4.8673943167707454e-05, "loss": 1.0848, "step": 40700 }, { "epoch": 0.04, "grad_norm": 42.75, "learning_rate": 4.866944409451653e-05, "loss": 1.0706, "step": 40800 }, { "epoch": 0.04, "grad_norm": 78.5, "learning_rate": 4.866494502132561e-05, "loss": 1.2088, "step": 40900 }, { "epoch": 0.04, "grad_norm": 49.0, "learning_rate": 4.866044594813469e-05, "loss": 1.0286, "step": 41000 }, { "epoch": 0.04, "grad_norm": 77.0, "learning_rate": 4.865594687494376e-05, "loss": 1.0746, "step": 41100 }, { "epoch": 0.04, "grad_norm": 89.0, "learning_rate": 4.8651447801752844e-05, "loss": 1.0762, "step": 41200 }, { "epoch": 0.04, "grad_norm": 81.5, "learning_rate": 4.864694872856192e-05, "loss": 1.0354, "step": 41300 }, { "epoch": 0.04, "grad_norm": 29.0, "learning_rate": 4.8642449655370995e-05, "loss": 1.1843, "step": 41400 }, { "epoch": 0.04, "grad_norm": 33.0, "learning_rate": 4.863795058218008e-05, "loss": 1.1052, "step": 41500 }, { "epoch": 0.04, "grad_norm": 19.625, "learning_rate": 4.863345150898915e-05, "loss": 1.0887, "step": 41600 }, { "epoch": 0.04, "grad_norm": 98.5, "learning_rate": 4.862895243579823e-05, "loss": 1.1573, "step": 41700 }, { "epoch": 0.04, "grad_norm": 47.75, "learning_rate": 4.86244533626073e-05, "loss": 1.2246, "step": 41800 }, { "epoch": 0.04, "grad_norm": 132.0, "learning_rate": 4.861995428941638e-05, "loss": 1.1189, "step": 41900 }, { "epoch": 0.04, "grad_norm": 28.375, "learning_rate": 4.861545521622546e-05, "loss": 1.0678, "step": 42000 }, { "epoch": 0.04, "grad_norm": 7.1875, "learning_rate": 4.8610956143034536e-05, "loss": 1.1277, "step": 42100 }, { "epoch": 0.04, "grad_norm": 107.0, "learning_rate": 4.860645706984361e-05, "loss": 0.9764, "step": 42200 }, { "epoch": 0.04, "grad_norm": 7.5, "learning_rate": 4.8601957996652693e-05, "loss": 1.0772, "step": 42300 }, { "epoch": 0.04, "grad_norm": 61.0, "learning_rate": 4.859745892346177e-05, "loss": 0.8331, "step": 42400 }, { "epoch": 0.04, "grad_norm": 43.5, "learning_rate": 4.8592959850270844e-05, "loss": 1.1747, "step": 42500 }, { "epoch": 0.04, "grad_norm": 0.0189208984375, "learning_rate": 4.8588460777079926e-05, "loss": 1.2166, "step": 42600 }, { "epoch": 0.04, "grad_norm": 44.75, "learning_rate": 4.8583961703889e-05, "loss": 1.0518, "step": 42700 }, { "epoch": 0.04, "grad_norm": 147.0, "learning_rate": 4.8579462630698084e-05, "loss": 1.1158, "step": 42800 }, { "epoch": 0.04, "grad_norm": 99.0, "learning_rate": 4.857496355750716e-05, "loss": 0.9201, "step": 42900 }, { "epoch": 0.04, "grad_norm": 17.0, "learning_rate": 4.857046448431623e-05, "loss": 1.1372, "step": 43000 }, { "epoch": 0.04, "grad_norm": 69.0, "learning_rate": 4.856596541112531e-05, "loss": 0.9705, "step": 43100 }, { "epoch": 0.04, "grad_norm": 25.75, "learning_rate": 4.8561466337934385e-05, "loss": 0.9867, "step": 43200 }, { "epoch": 0.04, "grad_norm": 226.0, "learning_rate": 4.855696726474346e-05, "loss": 1.119, "step": 43300 }, { "epoch": 0.04, "grad_norm": 36.5, "learning_rate": 4.855246819155254e-05, "loss": 1.1015, "step": 43400 }, { "epoch": 0.04, "grad_norm": 32.75, "learning_rate": 4.854796911836162e-05, "loss": 1.1207, "step": 43500 }, { "epoch": 0.04, "grad_norm": 77.0, "learning_rate": 4.85434700451707e-05, "loss": 1.1423, "step": 43600 }, { "epoch": 0.04, "grad_norm": 0.59765625, "learning_rate": 4.8538970971979775e-05, "loss": 1.1564, "step": 43700 }, { "epoch": 0.04, "grad_norm": 48.0, "learning_rate": 4.853447189878885e-05, "loss": 1.237, "step": 43800 }, { "epoch": 0.04, "grad_norm": 28.125, "learning_rate": 4.852997282559793e-05, "loss": 1.1437, "step": 43900 }, { "epoch": 0.04, "grad_norm": 20.5, "learning_rate": 4.852547375240701e-05, "loss": 0.9318, "step": 44000 }, { "epoch": 0.04, "grad_norm": 1408.0, "learning_rate": 4.8520974679216083e-05, "loss": 0.9653, "step": 44100 }, { "epoch": 0.04, "grad_norm": 24.125, "learning_rate": 4.8516475606025166e-05, "loss": 0.9739, "step": 44200 }, { "epoch": 0.04, "grad_norm": 0.035888671875, "learning_rate": 4.8511976532834234e-05, "loss": 1.0071, "step": 44300 }, { "epoch": 0.04, "grad_norm": 33.5, "learning_rate": 4.8507477459643316e-05, "loss": 1.103, "step": 44400 }, { "epoch": 0.04, "grad_norm": 20.375, "learning_rate": 4.850297838645239e-05, "loss": 1.0733, "step": 44500 }, { "epoch": 0.04, "grad_norm": 23.0, "learning_rate": 4.849847931326147e-05, "loss": 1.1211, "step": 44600 }, { "epoch": 0.04, "grad_norm": 15.8125, "learning_rate": 4.849398024007055e-05, "loss": 0.9089, "step": 44700 }, { "epoch": 0.04, "grad_norm": 38.0, "learning_rate": 4.8489481166879624e-05, "loss": 1.069, "step": 44800 }, { "epoch": 0.04, "grad_norm": 0.474609375, "learning_rate": 4.84849820936887e-05, "loss": 1.0493, "step": 44900 }, { "epoch": 0.04, "grad_norm": 31.125, "learning_rate": 4.848048302049778e-05, "loss": 0.9652, "step": 45000 }, { "epoch": 0.04, "grad_norm": 41.25, "learning_rate": 4.847598394730686e-05, "loss": 1.2541, "step": 45100 }, { "epoch": 0.04, "grad_norm": 7.21875, "learning_rate": 4.847148487411593e-05, "loss": 1.0102, "step": 45200 }, { "epoch": 0.04, "grad_norm": 52.5, "learning_rate": 4.8466985800925015e-05, "loss": 1.0949, "step": 45300 }, { "epoch": 0.04, "grad_norm": 16.75, "learning_rate": 4.846248672773409e-05, "loss": 1.0092, "step": 45400 }, { "epoch": 0.04, "grad_norm": 36.5, "learning_rate": 4.845798765454317e-05, "loss": 1.0239, "step": 45500 }, { "epoch": 0.04, "grad_norm": 23.375, "learning_rate": 4.845348858135224e-05, "loss": 0.9919, "step": 45600 }, { "epoch": 0.04, "grad_norm": 48.0, "learning_rate": 4.8448989508161316e-05, "loss": 1.0217, "step": 45700 }, { "epoch": 0.04, "grad_norm": 35.5, "learning_rate": 4.84444904349704e-05, "loss": 1.038, "step": 45800 }, { "epoch": 0.04, "grad_norm": 7.34375, "learning_rate": 4.8439991361779473e-05, "loss": 1.0612, "step": 45900 }, { "epoch": 0.04, "grad_norm": 12.8125, "learning_rate": 4.843549228858855e-05, "loss": 1.1405, "step": 46000 }, { "epoch": 0.04, "grad_norm": 16.125, "learning_rate": 4.843099321539763e-05, "loss": 1.0643, "step": 46100 }, { "epoch": 0.04, "grad_norm": 22.125, "learning_rate": 4.8426494142206706e-05, "loss": 1.0632, "step": 46200 }, { "epoch": 0.04, "grad_norm": 19.875, "learning_rate": 4.842199506901579e-05, "loss": 1.2547, "step": 46300 }, { "epoch": 0.04, "grad_norm": 143.0, "learning_rate": 4.8417495995824864e-05, "loss": 1.0667, "step": 46400 }, { "epoch": 0.04, "grad_norm": 26.75, "learning_rate": 4.841299692263394e-05, "loss": 1.0599, "step": 46500 }, { "epoch": 0.04, "grad_norm": 12.0, "learning_rate": 4.840849784944302e-05, "loss": 1.0818, "step": 46600 }, { "epoch": 0.04, "grad_norm": 852.0, "learning_rate": 4.8403998776252097e-05, "loss": 1.1998, "step": 46700 }, { "epoch": 0.04, "grad_norm": 48.5, "learning_rate": 4.839949970306117e-05, "loss": 1.1431, "step": 46800 }, { "epoch": 0.04, "grad_norm": 33.0, "learning_rate": 4.839500062987025e-05, "loss": 1.1179, "step": 46900 }, { "epoch": 0.04, "grad_norm": 20.625, "learning_rate": 4.839050155667932e-05, "loss": 0.9942, "step": 47000 }, { "epoch": 0.04, "grad_norm": 51.5, "learning_rate": 4.8386002483488405e-05, "loss": 1.0463, "step": 47100 }, { "epoch": 0.04, "grad_norm": 0.045654296875, "learning_rate": 4.838150341029748e-05, "loss": 1.1198, "step": 47200 }, { "epoch": 0.04, "grad_norm": 50.5, "learning_rate": 4.8377004337106555e-05, "loss": 1.0793, "step": 47300 }, { "epoch": 0.04, "grad_norm": 25.125, "learning_rate": 4.837250526391564e-05, "loss": 0.9773, "step": 47400 }, { "epoch": 0.04, "grad_norm": 72.0, "learning_rate": 4.836800619072471e-05, "loss": 1.1212, "step": 47500 }, { "epoch": 0.04, "grad_norm": 119.0, "learning_rate": 4.836350711753379e-05, "loss": 1.077, "step": 47600 }, { "epoch": 0.04, "grad_norm": 0.0037384033203125, "learning_rate": 4.835900804434287e-05, "loss": 0.9038, "step": 47700 }, { "epoch": 0.04, "grad_norm": 154.0, "learning_rate": 4.8354508971151946e-05, "loss": 1.0858, "step": 47800 }, { "epoch": 0.04, "grad_norm": 1.8359375, "learning_rate": 4.835000989796102e-05, "loss": 0.9363, "step": 47900 }, { "epoch": 0.04, "grad_norm": 46.25, "learning_rate": 4.83455108247701e-05, "loss": 1.1046, "step": 48000 }, { "epoch": 0.04, "grad_norm": 28.125, "learning_rate": 4.834101175157918e-05, "loss": 1.0324, "step": 48100 }, { "epoch": 0.04, "grad_norm": 29.5, "learning_rate": 4.8336512678388254e-05, "loss": 1.139, "step": 48200 }, { "epoch": 0.04, "grad_norm": 81.5, "learning_rate": 4.833201360519733e-05, "loss": 1.0533, "step": 48300 }, { "epoch": 0.04, "grad_norm": 124.5, "learning_rate": 4.8327514532006404e-05, "loss": 0.9259, "step": 48400 }, { "epoch": 0.04, "grad_norm": 97.0, "learning_rate": 4.8323015458815487e-05, "loss": 0.9167, "step": 48500 }, { "epoch": 0.04, "grad_norm": 19.625, "learning_rate": 4.831851638562456e-05, "loss": 1.1529, "step": 48600 }, { "epoch": 0.04, "grad_norm": 72.0, "learning_rate": 4.831401731243364e-05, "loss": 1.1205, "step": 48700 }, { "epoch": 0.04, "grad_norm": 0.003814697265625, "learning_rate": 4.830951823924272e-05, "loss": 1.0103, "step": 48800 }, { "epoch": 0.04, "grad_norm": 1.15625, "learning_rate": 4.8305019166051795e-05, "loss": 1.1474, "step": 48900 }, { "epoch": 0.04, "grad_norm": 21.875, "learning_rate": 4.830052009286088e-05, "loss": 1.0274, "step": 49000 }, { "epoch": 0.04, "grad_norm": 28.125, "learning_rate": 4.829602101966995e-05, "loss": 1.0594, "step": 49100 }, { "epoch": 0.04, "grad_norm": 4.84375, "learning_rate": 4.829152194647903e-05, "loss": 1.076, "step": 49200 }, { "epoch": 0.04, "grad_norm": 20.625, "learning_rate": 4.828702287328811e-05, "loss": 0.9598, "step": 49300 }, { "epoch": 0.04, "grad_norm": 6.96875, "learning_rate": 4.8282523800097185e-05, "loss": 1.1013, "step": 49400 }, { "epoch": 0.04, "grad_norm": 15.625, "learning_rate": 4.827802472690626e-05, "loss": 1.0096, "step": 49500 }, { "epoch": 0.04, "grad_norm": 45.25, "learning_rate": 4.8273525653715336e-05, "loss": 1.1038, "step": 49600 }, { "epoch": 0.04, "grad_norm": 0.8203125, "learning_rate": 4.826902658052441e-05, "loss": 1.1955, "step": 49700 }, { "epoch": 0.04, "grad_norm": 29.75, "learning_rate": 4.826452750733349e-05, "loss": 0.9651, "step": 49800 }, { "epoch": 0.04, "grad_norm": 16.0, "learning_rate": 4.826002843414257e-05, "loss": 1.0677, "step": 49900 }, { "epoch": 0.04, "grad_norm": 0.0140380859375, "learning_rate": 4.8255529360951644e-05, "loss": 0.9213, "step": 50000 }, { "epoch": 0.04, "grad_norm": 27.75, "learning_rate": 4.8251030287760726e-05, "loss": 1.1253, "step": 50100 }, { "epoch": 0.04, "grad_norm": 1048.0, "learning_rate": 4.82465312145698e-05, "loss": 1.0198, "step": 50200 }, { "epoch": 0.04, "grad_norm": 80.5, "learning_rate": 4.8242032141378877e-05, "loss": 1.0778, "step": 50300 }, { "epoch": 0.04, "grad_norm": 120.0, "learning_rate": 4.823753306818796e-05, "loss": 1.0936, "step": 50400 }, { "epoch": 0.04, "grad_norm": 26.0, "learning_rate": 4.8233033994997034e-05, "loss": 1.145, "step": 50500 }, { "epoch": 0.05, "grad_norm": 82.0, "learning_rate": 4.822853492180611e-05, "loss": 1.0195, "step": 50600 }, { "epoch": 0.05, "grad_norm": 0.07568359375, "learning_rate": 4.822403584861519e-05, "loss": 1.1293, "step": 50700 }, { "epoch": 0.05, "grad_norm": 592.0, "learning_rate": 4.821953677542426e-05, "loss": 1.0629, "step": 50800 }, { "epoch": 0.05, "grad_norm": 0.06787109375, "learning_rate": 4.821503770223334e-05, "loss": 1.0955, "step": 50900 }, { "epoch": 0.05, "grad_norm": 31.875, "learning_rate": 4.821053862904242e-05, "loss": 1.0129, "step": 51000 }, { "epoch": 0.05, "grad_norm": 53.25, "learning_rate": 4.820603955585149e-05, "loss": 1.0709, "step": 51100 }, { "epoch": 0.05, "grad_norm": 11.9375, "learning_rate": 4.8201540482660575e-05, "loss": 1.0282, "step": 51200 }, { "epoch": 0.05, "grad_norm": 44.5, "learning_rate": 4.819704140946965e-05, "loss": 1.1032, "step": 51300 }, { "epoch": 0.05, "grad_norm": 0.004547119140625, "learning_rate": 4.8192542336278726e-05, "loss": 1.1192, "step": 51400 }, { "epoch": 0.05, "grad_norm": 16.5, "learning_rate": 4.818804326308781e-05, "loss": 0.9293, "step": 51500 }, { "epoch": 0.05, "grad_norm": 34.0, "learning_rate": 4.818354418989688e-05, "loss": 0.9689, "step": 51600 }, { "epoch": 0.05, "grad_norm": 280.0, "learning_rate": 4.8179045116705965e-05, "loss": 1.0233, "step": 51700 }, { "epoch": 0.05, "grad_norm": 72.0, "learning_rate": 4.817454604351504e-05, "loss": 1.0125, "step": 51800 }, { "epoch": 0.05, "grad_norm": 33.5, "learning_rate": 4.8170046970324116e-05, "loss": 1.0061, "step": 51900 }, { "epoch": 0.05, "grad_norm": 39.75, "learning_rate": 4.81655478971332e-05, "loss": 1.0524, "step": 52000 }, { "epoch": 0.05, "grad_norm": 28.0, "learning_rate": 4.8161048823942266e-05, "loss": 1.1626, "step": 52100 }, { "epoch": 0.05, "grad_norm": 47.75, "learning_rate": 4.815654975075135e-05, "loss": 1.0667, "step": 52200 }, { "epoch": 0.05, "grad_norm": 135.0, "learning_rate": 4.8152050677560424e-05, "loss": 1.0158, "step": 52300 }, { "epoch": 0.05, "grad_norm": 41.25, "learning_rate": 4.81475516043695e-05, "loss": 1.0858, "step": 52400 }, { "epoch": 0.05, "grad_norm": 524.0, "learning_rate": 4.814305253117858e-05, "loss": 1.0831, "step": 52500 }, { "epoch": 0.05, "grad_norm": 0.2470703125, "learning_rate": 4.813855345798766e-05, "loss": 1.0296, "step": 52600 }, { "epoch": 0.05, "grad_norm": 17.875, "learning_rate": 4.813405438479673e-05, "loss": 1.0289, "step": 52700 }, { "epoch": 0.05, "grad_norm": 24.875, "learning_rate": 4.8129555311605814e-05, "loss": 1.115, "step": 52800 }, { "epoch": 0.05, "grad_norm": 203.0, "learning_rate": 4.812505623841489e-05, "loss": 0.9661, "step": 52900 }, { "epoch": 0.05, "grad_norm": 5.40625, "learning_rate": 4.8120557165223965e-05, "loss": 0.9604, "step": 53000 }, { "epoch": 0.05, "grad_norm": 19.25, "learning_rate": 4.811605809203305e-05, "loss": 1.0973, "step": 53100 }, { "epoch": 0.05, "grad_norm": 12.0625, "learning_rate": 4.811155901884212e-05, "loss": 1.1134, "step": 53200 }, { "epoch": 0.05, "grad_norm": 21.875, "learning_rate": 4.81070599456512e-05, "loss": 0.9593, "step": 53300 }, { "epoch": 0.05, "grad_norm": 1616.0, "learning_rate": 4.810256087246027e-05, "loss": 0.925, "step": 53400 }, { "epoch": 0.05, "grad_norm": 122.5, "learning_rate": 4.809806179926935e-05, "loss": 1.021, "step": 53500 }, { "epoch": 0.05, "grad_norm": 0.061767578125, "learning_rate": 4.809356272607843e-05, "loss": 1.1105, "step": 53600 }, { "epoch": 0.05, "grad_norm": 788.0, "learning_rate": 4.8089063652887506e-05, "loss": 1.2342, "step": 53700 }, { "epoch": 0.05, "grad_norm": 25.375, "learning_rate": 4.808456457969658e-05, "loss": 1.0668, "step": 53800 }, { "epoch": 0.05, "grad_norm": 113.0, "learning_rate": 4.808006550650566e-05, "loss": 1.129, "step": 53900 }, { "epoch": 0.05, "grad_norm": 31.5, "learning_rate": 4.807556643331474e-05, "loss": 1.1183, "step": 54000 }, { "epoch": 0.05, "grad_norm": 79.5, "learning_rate": 4.8071067360123814e-05, "loss": 1.0775, "step": 54100 }, { "epoch": 0.05, "grad_norm": 296.0, "learning_rate": 4.8066568286932896e-05, "loss": 1.1357, "step": 54200 }, { "epoch": 0.05, "grad_norm": 34.0, "learning_rate": 4.806206921374197e-05, "loss": 1.1471, "step": 54300 }, { "epoch": 0.05, "grad_norm": 13.9375, "learning_rate": 4.8057570140551054e-05, "loss": 1.0694, "step": 54400 }, { "epoch": 0.05, "grad_norm": 0.0159912109375, "learning_rate": 4.805307106736013e-05, "loss": 1.1395, "step": 54500 }, { "epoch": 0.05, "grad_norm": 37.75, "learning_rate": 4.8048571994169204e-05, "loss": 1.1149, "step": 54600 }, { "epoch": 0.05, "grad_norm": 16.75, "learning_rate": 4.804407292097828e-05, "loss": 1.2619, "step": 54700 }, { "epoch": 0.05, "grad_norm": 30.625, "learning_rate": 4.8039573847787355e-05, "loss": 1.0838, "step": 54800 }, { "epoch": 0.05, "grad_norm": 51.25, "learning_rate": 4.803507477459643e-05, "loss": 1.0213, "step": 54900 }, { "epoch": 0.05, "grad_norm": 161.0, "learning_rate": 4.803057570140551e-05, "loss": 0.9585, "step": 55000 }, { "epoch": 0.05, "grad_norm": 103.0, "learning_rate": 4.802607662821459e-05, "loss": 1.0219, "step": 55100 }, { "epoch": 0.05, "grad_norm": 41.0, "learning_rate": 4.802157755502367e-05, "loss": 1.0754, "step": 55200 }, { "epoch": 0.05, "grad_norm": 0.267578125, "learning_rate": 4.8017078481832745e-05, "loss": 1.058, "step": 55300 }, { "epoch": 0.05, "grad_norm": 23.875, "learning_rate": 4.801257940864182e-05, "loss": 1.0358, "step": 55400 }, { "epoch": 0.05, "grad_norm": 38.5, "learning_rate": 4.80080803354509e-05, "loss": 1.255, "step": 55500 }, { "epoch": 0.05, "grad_norm": 89.5, "learning_rate": 4.800358126225998e-05, "loss": 1.1077, "step": 55600 }, { "epoch": 0.05, "grad_norm": 22.625, "learning_rate": 4.799908218906905e-05, "loss": 1.19, "step": 55700 }, { "epoch": 0.05, "grad_norm": 141.0, "learning_rate": 4.7994583115878135e-05, "loss": 1.046, "step": 55800 }, { "epoch": 0.05, "grad_norm": 18.375, "learning_rate": 4.799008404268721e-05, "loss": 1.0007, "step": 55900 }, { "epoch": 0.05, "grad_norm": 89.0, "learning_rate": 4.7985584969496286e-05, "loss": 0.957, "step": 56000 }, { "epoch": 0.05, "grad_norm": 94.0, "learning_rate": 4.798108589630536e-05, "loss": 1.0887, "step": 56100 }, { "epoch": 0.05, "grad_norm": 25.625, "learning_rate": 4.797658682311444e-05, "loss": 1.1572, "step": 56200 }, { "epoch": 0.05, "grad_norm": 52.25, "learning_rate": 4.797208774992352e-05, "loss": 1.096, "step": 56300 }, { "epoch": 0.05, "grad_norm": 17.5, "learning_rate": 4.7967588676732594e-05, "loss": 1.1291, "step": 56400 }, { "epoch": 0.05, "grad_norm": 46.5, "learning_rate": 4.796308960354167e-05, "loss": 1.0168, "step": 56500 }, { "epoch": 0.05, "grad_norm": 18.625, "learning_rate": 4.795859053035075e-05, "loss": 0.9168, "step": 56600 }, { "epoch": 0.05, "grad_norm": 0.01611328125, "learning_rate": 4.795409145715983e-05, "loss": 0.9272, "step": 56700 }, { "epoch": 0.05, "grad_norm": 60.5, "learning_rate": 4.79495923839689e-05, "loss": 1.0723, "step": 56800 }, { "epoch": 0.05, "grad_norm": 54.5, "learning_rate": 4.7945093310777984e-05, "loss": 1.1016, "step": 56900 }, { "epoch": 0.05, "grad_norm": 1.359375, "learning_rate": 4.794059423758706e-05, "loss": 1.0385, "step": 57000 }, { "epoch": 0.05, "grad_norm": 16.625, "learning_rate": 4.793609516439614e-05, "loss": 1.0596, "step": 57100 }, { "epoch": 0.05, "grad_norm": 36.0, "learning_rate": 4.793159609120522e-05, "loss": 1.0204, "step": 57200 }, { "epoch": 0.05, "grad_norm": 26.25, "learning_rate": 4.7927097018014286e-05, "loss": 1.0025, "step": 57300 }, { "epoch": 0.05, "grad_norm": 27.0, "learning_rate": 4.792259794482337e-05, "loss": 1.0586, "step": 57400 }, { "epoch": 0.05, "grad_norm": 19.125, "learning_rate": 4.791809887163244e-05, "loss": 1.0268, "step": 57500 }, { "epoch": 0.05, "grad_norm": 39.25, "learning_rate": 4.791359979844152e-05, "loss": 1.1759, "step": 57600 }, { "epoch": 0.05, "grad_norm": 3.125, "learning_rate": 4.79091007252506e-05, "loss": 1.0709, "step": 57700 }, { "epoch": 0.05, "grad_norm": 105.0, "learning_rate": 4.7904601652059676e-05, "loss": 1.0584, "step": 57800 }, { "epoch": 0.05, "grad_norm": 80.0, "learning_rate": 4.790010257886876e-05, "loss": 1.0015, "step": 57900 }, { "epoch": 0.05, "grad_norm": 31.5, "learning_rate": 4.7895603505677834e-05, "loss": 1.0309, "step": 58000 }, { "epoch": 0.05, "grad_norm": 28.625, "learning_rate": 4.789110443248691e-05, "loss": 1.0922, "step": 58100 }, { "epoch": 0.05, "grad_norm": 22.75, "learning_rate": 4.788660535929599e-05, "loss": 1.1791, "step": 58200 }, { "epoch": 0.05, "grad_norm": 3120.0, "learning_rate": 4.7882106286105066e-05, "loss": 1.2718, "step": 58300 }, { "epoch": 0.05, "grad_norm": 38.75, "learning_rate": 4.787760721291414e-05, "loss": 0.9101, "step": 58400 }, { "epoch": 0.05, "grad_norm": 18.75, "learning_rate": 4.7873108139723224e-05, "loss": 0.9628, "step": 58500 }, { "epoch": 0.05, "grad_norm": 26.875, "learning_rate": 4.786860906653229e-05, "loss": 1.0926, "step": 58600 }, { "epoch": 0.05, "grad_norm": 29.0, "learning_rate": 4.7864109993341374e-05, "loss": 1.0893, "step": 58700 }, { "epoch": 0.05, "grad_norm": 32.0, "learning_rate": 4.785961092015045e-05, "loss": 1.0435, "step": 58800 }, { "epoch": 0.05, "grad_norm": 17.625, "learning_rate": 4.7855111846959525e-05, "loss": 1.1382, "step": 58900 }, { "epoch": 0.05, "grad_norm": 165.0, "learning_rate": 4.785061277376861e-05, "loss": 0.9335, "step": 59000 }, { "epoch": 0.05, "grad_norm": 23.875, "learning_rate": 4.784611370057768e-05, "loss": 0.9897, "step": 59100 }, { "epoch": 0.05, "grad_norm": 99.0, "learning_rate": 4.784161462738676e-05, "loss": 1.077, "step": 59200 }, { "epoch": 0.05, "grad_norm": 0.0322265625, "learning_rate": 4.783711555419584e-05, "loss": 0.927, "step": 59300 }, { "epoch": 0.05, "grad_norm": 134.0, "learning_rate": 4.7832616481004915e-05, "loss": 0.9779, "step": 59400 }, { "epoch": 0.05, "grad_norm": 33.25, "learning_rate": 4.782811740781399e-05, "loss": 1.114, "step": 59500 }, { "epoch": 0.05, "grad_norm": 32.75, "learning_rate": 4.782361833462307e-05, "loss": 1.1215, "step": 59600 }, { "epoch": 0.05, "grad_norm": 32.25, "learning_rate": 4.781911926143215e-05, "loss": 1.0068, "step": 59700 }, { "epoch": 0.05, "grad_norm": 87.0, "learning_rate": 4.781462018824123e-05, "loss": 1.0691, "step": 59800 }, { "epoch": 0.05, "grad_norm": 24.75, "learning_rate": 4.78101211150503e-05, "loss": 0.9855, "step": 59900 }, { "epoch": 0.05, "grad_norm": 0.0074462890625, "learning_rate": 4.7805622041859374e-05, "loss": 1.0395, "step": 60000 }, { "epoch": 0.05, "grad_norm": 0.1435546875, "learning_rate": 4.7801122968668456e-05, "loss": 1.0282, "step": 60100 }, { "epoch": 0.05, "grad_norm": 62.25, "learning_rate": 4.779662389547753e-05, "loss": 1.0955, "step": 60200 }, { "epoch": 0.05, "grad_norm": 95.0, "learning_rate": 4.779212482228661e-05, "loss": 0.9779, "step": 60300 }, { "epoch": 0.05, "grad_norm": 76.5, "learning_rate": 4.778762574909569e-05, "loss": 1.1875, "step": 60400 }, { "epoch": 0.05, "grad_norm": 0.00750732421875, "learning_rate": 4.7783126675904764e-05, "loss": 1.144, "step": 60500 }, { "epoch": 0.05, "grad_norm": 123.0, "learning_rate": 4.7778627602713847e-05, "loss": 1.0817, "step": 60600 }, { "epoch": 0.05, "grad_norm": 31.5, "learning_rate": 4.777412852952292e-05, "loss": 1.0511, "step": 60700 }, { "epoch": 0.05, "grad_norm": 38.5, "learning_rate": 4.7769629456332e-05, "loss": 1.1687, "step": 60800 }, { "epoch": 0.05, "grad_norm": 13.4375, "learning_rate": 4.776513038314108e-05, "loss": 0.9554, "step": 60900 }, { "epoch": 0.05, "grad_norm": 51.0, "learning_rate": 4.7760631309950155e-05, "loss": 1.1859, "step": 61000 }, { "epoch": 0.05, "grad_norm": 22.75, "learning_rate": 4.775613223675923e-05, "loss": 1.1318, "step": 61100 }, { "epoch": 0.05, "grad_norm": 23.0, "learning_rate": 4.7751633163568305e-05, "loss": 0.9677, "step": 61200 }, { "epoch": 0.05, "grad_norm": 4.46875, "learning_rate": 4.774713409037738e-05, "loss": 1.0597, "step": 61300 }, { "epoch": 0.05, "grad_norm": 18.75, "learning_rate": 4.774263501718646e-05, "loss": 1.0387, "step": 61400 }, { "epoch": 0.05, "grad_norm": 13.5, "learning_rate": 4.773813594399554e-05, "loss": 1.1281, "step": 61500 }, { "epoch": 0.05, "grad_norm": 47.75, "learning_rate": 4.7733636870804613e-05, "loss": 1.1926, "step": 61600 }, { "epoch": 0.05, "grad_norm": 32.25, "learning_rate": 4.7729137797613696e-05, "loss": 1.0866, "step": 61700 }, { "epoch": 0.06, "grad_norm": 0.003997802734375, "learning_rate": 4.772463872442277e-05, "loss": 1.0067, "step": 61800 }, { "epoch": 0.06, "grad_norm": 0.00518798828125, "learning_rate": 4.7720139651231846e-05, "loss": 0.9618, "step": 61900 }, { "epoch": 0.06, "grad_norm": 38.25, "learning_rate": 4.771564057804093e-05, "loss": 0.9947, "step": 62000 }, { "epoch": 0.06, "grad_norm": 35.25, "learning_rate": 4.7711141504850004e-05, "loss": 0.9221, "step": 62100 }, { "epoch": 0.06, "grad_norm": 17.375, "learning_rate": 4.770664243165908e-05, "loss": 1.0307, "step": 62200 }, { "epoch": 0.06, "grad_norm": 0.146484375, "learning_rate": 4.770214335846816e-05, "loss": 1.1203, "step": 62300 }, { "epoch": 0.06, "grad_norm": 120.5, "learning_rate": 4.7697644285277237e-05, "loss": 1.1088, "step": 62400 }, { "epoch": 0.06, "grad_norm": 78.0, "learning_rate": 4.769314521208631e-05, "loss": 0.9565, "step": 62500 }, { "epoch": 0.06, "grad_norm": 25.625, "learning_rate": 4.768864613889539e-05, "loss": 1.096, "step": 62600 }, { "epoch": 0.06, "grad_norm": 21.25, "learning_rate": 4.768414706570446e-05, "loss": 1.0787, "step": 62700 }, { "epoch": 0.06, "grad_norm": 15.5, "learning_rate": 4.7679647992513545e-05, "loss": 1.0096, "step": 62800 }, { "epoch": 0.06, "grad_norm": 50.25, "learning_rate": 4.767514891932262e-05, "loss": 1.0513, "step": 62900 }, { "epoch": 0.06, "grad_norm": 0.00445556640625, "learning_rate": 4.7670649846131695e-05, "loss": 0.9897, "step": 63000 }, { "epoch": 0.06, "grad_norm": 400.0, "learning_rate": 4.766615077294078e-05, "loss": 1.044, "step": 63100 }, { "epoch": 0.06, "grad_norm": 89.0, "learning_rate": 4.766165169974985e-05, "loss": 1.0449, "step": 63200 }, { "epoch": 0.06, "grad_norm": 58.0, "learning_rate": 4.7657152626558935e-05, "loss": 1.0454, "step": 63300 }, { "epoch": 0.06, "grad_norm": 0.048828125, "learning_rate": 4.765265355336801e-05, "loss": 0.9011, "step": 63400 }, { "epoch": 0.06, "grad_norm": 38.75, "learning_rate": 4.7648154480177086e-05, "loss": 1.1728, "step": 63500 }, { "epoch": 0.06, "grad_norm": 0.055419921875, "learning_rate": 4.764365540698617e-05, "loss": 1.0142, "step": 63600 }, { "epoch": 0.06, "grad_norm": 102.0, "learning_rate": 4.763915633379524e-05, "loss": 1.0692, "step": 63700 }, { "epoch": 0.06, "grad_norm": 22.625, "learning_rate": 4.763465726060432e-05, "loss": 0.8402, "step": 63800 }, { "epoch": 0.06, "grad_norm": 24.25, "learning_rate": 4.7630158187413394e-05, "loss": 1.135, "step": 63900 }, { "epoch": 0.06, "grad_norm": 48.0, "learning_rate": 4.762565911422247e-05, "loss": 0.9995, "step": 64000 }, { "epoch": 0.06, "grad_norm": 45.5, "learning_rate": 4.762116004103155e-05, "loss": 1.085, "step": 64100 }, { "epoch": 0.06, "grad_norm": 65.5, "learning_rate": 4.7616660967840627e-05, "loss": 1.0364, "step": 64200 }, { "epoch": 0.06, "grad_norm": 24.25, "learning_rate": 4.76121618946497e-05, "loss": 1.1136, "step": 64300 }, { "epoch": 0.06, "grad_norm": 12.5625, "learning_rate": 4.7607662821458784e-05, "loss": 1.0889, "step": 64400 }, { "epoch": 0.06, "grad_norm": 103.0, "learning_rate": 4.760316374826786e-05, "loss": 1.1256, "step": 64500 }, { "epoch": 0.06, "grad_norm": 34.25, "learning_rate": 4.7598664675076935e-05, "loss": 0.9965, "step": 64600 }, { "epoch": 0.06, "grad_norm": 25.5, "learning_rate": 4.759416560188602e-05, "loss": 1.0614, "step": 64700 }, { "epoch": 0.06, "grad_norm": 80.5, "learning_rate": 4.758966652869509e-05, "loss": 1.0382, "step": 64800 }, { "epoch": 0.06, "grad_norm": 9.5625, "learning_rate": 4.758516745550417e-05, "loss": 0.8942, "step": 64900 }, { "epoch": 0.06, "grad_norm": 378.0, "learning_rate": 4.758066838231325e-05, "loss": 1.1136, "step": 65000 }, { "epoch": 0.06, "grad_norm": 62.5, "learning_rate": 4.757616930912232e-05, "loss": 1.1306, "step": 65100 }, { "epoch": 0.06, "grad_norm": 0.212890625, "learning_rate": 4.75716702359314e-05, "loss": 0.9875, "step": 65200 }, { "epoch": 0.06, "grad_norm": 28.5, "learning_rate": 4.7567171162740476e-05, "loss": 1.0619, "step": 65300 }, { "epoch": 0.06, "grad_norm": 47.25, "learning_rate": 4.756267208954955e-05, "loss": 0.9667, "step": 65400 }, { "epoch": 0.06, "grad_norm": 79.5, "learning_rate": 4.755817301635863e-05, "loss": 1.0267, "step": 65500 }, { "epoch": 0.06, "grad_norm": 68.5, "learning_rate": 4.755367394316771e-05, "loss": 1.1107, "step": 65600 }, { "epoch": 0.06, "grad_norm": 35.5, "learning_rate": 4.7549174869976784e-05, "loss": 0.9501, "step": 65700 }, { "epoch": 0.06, "grad_norm": 74.5, "learning_rate": 4.7544675796785866e-05, "loss": 1.0483, "step": 65800 }, { "epoch": 0.06, "grad_norm": 54.5, "learning_rate": 4.754017672359494e-05, "loss": 1.1021, "step": 65900 }, { "epoch": 0.06, "grad_norm": 25.125, "learning_rate": 4.753567765040402e-05, "loss": 1.006, "step": 66000 }, { "epoch": 0.06, "grad_norm": 55.25, "learning_rate": 4.75311785772131e-05, "loss": 1.1606, "step": 66100 }, { "epoch": 0.06, "grad_norm": 112.0, "learning_rate": 4.7526679504022174e-05, "loss": 0.9369, "step": 66200 }, { "epoch": 0.06, "grad_norm": 57.0, "learning_rate": 4.7522180430831256e-05, "loss": 1.0697, "step": 66300 }, { "epoch": 0.06, "grad_norm": 62.0, "learning_rate": 4.7517681357640325e-05, "loss": 1.0148, "step": 66400 }, { "epoch": 0.06, "grad_norm": 51.5, "learning_rate": 4.751318228444941e-05, "loss": 1.0458, "step": 66500 }, { "epoch": 0.06, "grad_norm": 64.0, "learning_rate": 4.750868321125848e-05, "loss": 1.0097, "step": 66600 }, { "epoch": 0.06, "grad_norm": 29.375, "learning_rate": 4.750418413806756e-05, "loss": 1.0261, "step": 66700 }, { "epoch": 0.06, "grad_norm": 19.125, "learning_rate": 4.749968506487664e-05, "loss": 1.1002, "step": 66800 }, { "epoch": 0.06, "grad_norm": 19.625, "learning_rate": 4.7495185991685715e-05, "loss": 1.1513, "step": 66900 }, { "epoch": 0.06, "grad_norm": 28.0, "learning_rate": 4.749068691849479e-05, "loss": 1.0036, "step": 67000 }, { "epoch": 0.06, "grad_norm": 1.5703125, "learning_rate": 4.748618784530387e-05, "loss": 0.9781, "step": 67100 }, { "epoch": 0.06, "grad_norm": 1.0390625, "learning_rate": 4.748168877211295e-05, "loss": 1.1338, "step": 67200 }, { "epoch": 0.06, "grad_norm": 160.0, "learning_rate": 4.747718969892202e-05, "loss": 1.104, "step": 67300 }, { "epoch": 0.06, "grad_norm": 21.5, "learning_rate": 4.7472690625731105e-05, "loss": 1.0585, "step": 67400 }, { "epoch": 0.06, "grad_norm": 22.125, "learning_rate": 4.746819155254018e-05, "loss": 0.9737, "step": 67500 }, { "epoch": 0.06, "grad_norm": 29.5, "learning_rate": 4.7463692479349256e-05, "loss": 1.0911, "step": 67600 }, { "epoch": 0.06, "grad_norm": 48.25, "learning_rate": 4.745919340615833e-05, "loss": 1.0497, "step": 67700 }, { "epoch": 0.06, "grad_norm": 139.0, "learning_rate": 4.7454694332967407e-05, "loss": 1.0999, "step": 67800 }, { "epoch": 0.06, "grad_norm": 106.0, "learning_rate": 4.745019525977649e-05, "loss": 0.952, "step": 67900 }, { "epoch": 0.06, "grad_norm": 19.75, "learning_rate": 4.7445696186585564e-05, "loss": 1.0808, "step": 68000 }, { "epoch": 0.06, "grad_norm": 100.0, "learning_rate": 4.744119711339464e-05, "loss": 1.129, "step": 68100 }, { "epoch": 0.06, "grad_norm": 45.75, "learning_rate": 4.743669804020372e-05, "loss": 1.1297, "step": 68200 }, { "epoch": 0.06, "grad_norm": 52.5, "learning_rate": 4.74321989670128e-05, "loss": 0.9746, "step": 68300 }, { "epoch": 0.06, "grad_norm": 41.0, "learning_rate": 4.742769989382187e-05, "loss": 1.0569, "step": 68400 }, { "epoch": 0.06, "grad_norm": 36.0, "learning_rate": 4.7423200820630954e-05, "loss": 1.0026, "step": 68500 }, { "epoch": 0.06, "grad_norm": 43.0, "learning_rate": 4.741870174744003e-05, "loss": 0.8584, "step": 68600 }, { "epoch": 0.06, "grad_norm": 24.625, "learning_rate": 4.741420267424911e-05, "loss": 1.0649, "step": 68700 }, { "epoch": 0.06, "grad_norm": 65.5, "learning_rate": 4.740970360105819e-05, "loss": 1.1102, "step": 68800 }, { "epoch": 0.06, "grad_norm": 28.875, "learning_rate": 4.740520452786726e-05, "loss": 1.0797, "step": 68900 }, { "epoch": 0.06, "grad_norm": 80.0, "learning_rate": 4.740070545467634e-05, "loss": 1.0115, "step": 69000 }, { "epoch": 0.06, "grad_norm": 13.3125, "learning_rate": 4.739620638148541e-05, "loss": 1.0605, "step": 69100 }, { "epoch": 0.06, "grad_norm": 4.375, "learning_rate": 4.7391707308294495e-05, "loss": 1.1096, "step": 69200 }, { "epoch": 0.06, "grad_norm": 69.5, "learning_rate": 4.738720823510357e-05, "loss": 1.1468, "step": 69300 }, { "epoch": 0.06, "grad_norm": 16.375, "learning_rate": 4.7382709161912646e-05, "loss": 1.1837, "step": 69400 }, { "epoch": 0.06, "grad_norm": 98.0, "learning_rate": 4.737821008872173e-05, "loss": 1.0676, "step": 69500 }, { "epoch": 0.06, "grad_norm": 35.75, "learning_rate": 4.73737110155308e-05, "loss": 1.112, "step": 69600 }, { "epoch": 0.06, "grad_norm": 74.0, "learning_rate": 4.736921194233988e-05, "loss": 0.983, "step": 69700 }, { "epoch": 0.06, "grad_norm": 60.0, "learning_rate": 4.736471286914896e-05, "loss": 1.0985, "step": 69800 }, { "epoch": 0.06, "grad_norm": 66.0, "learning_rate": 4.7360213795958036e-05, "loss": 1.1919, "step": 69900 }, { "epoch": 0.06, "grad_norm": 40.25, "learning_rate": 4.735571472276711e-05, "loss": 1.1042, "step": 70000 }, { "epoch": 0.06, "grad_norm": 46.0, "learning_rate": 4.7351215649576194e-05, "loss": 1.0638, "step": 70100 }, { "epoch": 0.06, "grad_norm": 176.0, "learning_rate": 4.734671657638527e-05, "loss": 0.9685, "step": 70200 }, { "epoch": 0.06, "grad_norm": 57.5, "learning_rate": 4.7342217503194344e-05, "loss": 1.0771, "step": 70300 }, { "epoch": 0.06, "grad_norm": 37.25, "learning_rate": 4.733771843000342e-05, "loss": 1.0881, "step": 70400 }, { "epoch": 0.06, "grad_norm": 41.0, "learning_rate": 4.7333219356812495e-05, "loss": 0.9475, "step": 70500 }, { "epoch": 0.06, "grad_norm": 83.5, "learning_rate": 4.732872028362158e-05, "loss": 1.1723, "step": 70600 }, { "epoch": 0.06, "grad_norm": 64.5, "learning_rate": 4.732422121043065e-05, "loss": 1.072, "step": 70700 }, { "epoch": 0.06, "grad_norm": 72.5, "learning_rate": 4.731972213723973e-05, "loss": 0.9838, "step": 70800 }, { "epoch": 0.06, "grad_norm": 41.5, "learning_rate": 4.731522306404881e-05, "loss": 1.078, "step": 70900 }, { "epoch": 0.06, "grad_norm": 115.5, "learning_rate": 4.7310723990857885e-05, "loss": 1.0592, "step": 71000 }, { "epoch": 0.06, "grad_norm": 0.095703125, "learning_rate": 4.730622491766696e-05, "loss": 0.9734, "step": 71100 }, { "epoch": 0.06, "grad_norm": 19.375, "learning_rate": 4.730172584447604e-05, "loss": 0.9183, "step": 71200 }, { "epoch": 0.06, "grad_norm": 15.75, "learning_rate": 4.729722677128512e-05, "loss": 0.9231, "step": 71300 }, { "epoch": 0.06, "grad_norm": 80.0, "learning_rate": 4.72927276980942e-05, "loss": 1.1361, "step": 71400 }, { "epoch": 0.06, "grad_norm": 90.0, "learning_rate": 4.7288228624903275e-05, "loss": 1.0136, "step": 71500 }, { "epoch": 0.06, "grad_norm": 13.9375, "learning_rate": 4.7283729551712344e-05, "loss": 1.0182, "step": 71600 }, { "epoch": 0.06, "grad_norm": 59.5, "learning_rate": 4.7279230478521426e-05, "loss": 0.9439, "step": 71700 }, { "epoch": 0.06, "grad_norm": 54.5, "learning_rate": 4.72747314053305e-05, "loss": 1.0811, "step": 71800 }, { "epoch": 0.06, "grad_norm": 16.25, "learning_rate": 4.727023233213958e-05, "loss": 1.0841, "step": 71900 }, { "epoch": 0.06, "grad_norm": 15.8125, "learning_rate": 4.726573325894866e-05, "loss": 0.988, "step": 72000 }, { "epoch": 0.06, "grad_norm": 73.5, "learning_rate": 4.7261234185757734e-05, "loss": 1.1638, "step": 72100 }, { "epoch": 0.06, "grad_norm": 69.5, "learning_rate": 4.7256735112566816e-05, "loss": 1.111, "step": 72200 }, { "epoch": 0.06, "grad_norm": 16.125, "learning_rate": 4.725223603937589e-05, "loss": 0.9247, "step": 72300 }, { "epoch": 0.06, "grad_norm": 36.75, "learning_rate": 4.724773696618497e-05, "loss": 1.029, "step": 72400 }, { "epoch": 0.06, "grad_norm": 15.0, "learning_rate": 4.724323789299405e-05, "loss": 1.1028, "step": 72500 }, { "epoch": 0.06, "grad_norm": 12.6875, "learning_rate": 4.7238738819803124e-05, "loss": 1.1013, "step": 72600 }, { "epoch": 0.06, "grad_norm": 42.75, "learning_rate": 4.72342397466122e-05, "loss": 1.2265, "step": 72700 }, { "epoch": 0.06, "grad_norm": 30.625, "learning_rate": 4.722974067342128e-05, "loss": 0.9438, "step": 72800 }, { "epoch": 0.06, "grad_norm": 37.75, "learning_rate": 4.722524160023035e-05, "loss": 0.9759, "step": 72900 }, { "epoch": 0.07, "grad_norm": 0.2578125, "learning_rate": 4.722074252703943e-05, "loss": 0.9565, "step": 73000 }, { "epoch": 0.07, "grad_norm": 0.1943359375, "learning_rate": 4.721624345384851e-05, "loss": 0.9849, "step": 73100 }, { "epoch": 0.07, "grad_norm": 74.0, "learning_rate": 4.721174438065758e-05, "loss": 1.0244, "step": 73200 }, { "epoch": 0.07, "grad_norm": 31.375, "learning_rate": 4.7207245307466665e-05, "loss": 1.1064, "step": 73300 }, { "epoch": 0.07, "grad_norm": 84.0, "learning_rate": 4.720274623427574e-05, "loss": 1.0538, "step": 73400 }, { "epoch": 0.07, "grad_norm": 27.875, "learning_rate": 4.7198247161084816e-05, "loss": 1.057, "step": 73500 }, { "epoch": 0.07, "grad_norm": 185.0, "learning_rate": 4.71937480878939e-05, "loss": 1.0565, "step": 73600 }, { "epoch": 0.07, "grad_norm": 90.5, "learning_rate": 4.7189249014702974e-05, "loss": 1.11, "step": 73700 }, { "epoch": 0.07, "grad_norm": 808.0, "learning_rate": 4.718474994151205e-05, "loss": 1.1171, "step": 73800 }, { "epoch": 0.07, "grad_norm": 191.0, "learning_rate": 4.718025086832113e-05, "loss": 1.0666, "step": 73900 }, { "epoch": 0.07, "grad_norm": 50.25, "learning_rate": 4.7175751795130206e-05, "loss": 1.0441, "step": 74000 }, { "epoch": 0.07, "grad_norm": 44.75, "learning_rate": 4.717125272193929e-05, "loss": 0.9349, "step": 74100 }, { "epoch": 0.07, "grad_norm": 54.5, "learning_rate": 4.716675364874836e-05, "loss": 1.0472, "step": 74200 }, { "epoch": 0.07, "grad_norm": 12.25, "learning_rate": 4.716225457555743e-05, "loss": 0.9954, "step": 74300 }, { "epoch": 0.07, "grad_norm": 18.75, "learning_rate": 4.7157755502366514e-05, "loss": 0.9964, "step": 74400 }, { "epoch": 0.07, "grad_norm": 38.0, "learning_rate": 4.715325642917559e-05, "loss": 1.0264, "step": 74500 }, { "epoch": 0.07, "grad_norm": 19.0, "learning_rate": 4.7148757355984665e-05, "loss": 1.1607, "step": 74600 }, { "epoch": 0.07, "grad_norm": 23.125, "learning_rate": 4.714425828279375e-05, "loss": 1.0605, "step": 74700 }, { "epoch": 0.07, "grad_norm": 78.5, "learning_rate": 4.713975920960282e-05, "loss": 0.9948, "step": 74800 }, { "epoch": 0.07, "grad_norm": 48.75, "learning_rate": 4.7135260136411905e-05, "loss": 0.9176, "step": 74900 }, { "epoch": 0.07, "grad_norm": 45.5, "learning_rate": 4.713076106322098e-05, "loss": 0.9902, "step": 75000 }, { "epoch": 0.07, "grad_norm": 0.00396728515625, "learning_rate": 4.7126261990030055e-05, "loss": 1.1327, "step": 75100 }, { "epoch": 0.07, "grad_norm": 34.75, "learning_rate": 4.712176291683914e-05, "loss": 1.054, "step": 75200 }, { "epoch": 0.07, "grad_norm": 38.5, "learning_rate": 4.711726384364821e-05, "loss": 0.9967, "step": 75300 }, { "epoch": 0.07, "grad_norm": 35.0, "learning_rate": 4.711276477045729e-05, "loss": 1.0065, "step": 75400 }, { "epoch": 0.07, "grad_norm": 15.875, "learning_rate": 4.7108265697266364e-05, "loss": 0.9735, "step": 75500 }, { "epoch": 0.07, "grad_norm": 0.039794921875, "learning_rate": 4.710376662407544e-05, "loss": 0.9869, "step": 75600 }, { "epoch": 0.07, "grad_norm": 75.0, "learning_rate": 4.709926755088452e-05, "loss": 1.1004, "step": 75700 }, { "epoch": 0.07, "grad_norm": 27.75, "learning_rate": 4.7094768477693596e-05, "loss": 1.0081, "step": 75800 }, { "epoch": 0.07, "grad_norm": 144.0, "learning_rate": 4.709026940450267e-05, "loss": 1.1244, "step": 75900 }, { "epoch": 0.07, "grad_norm": 196.0, "learning_rate": 4.7085770331311754e-05, "loss": 1.0563, "step": 76000 }, { "epoch": 0.07, "grad_norm": 180.0, "learning_rate": 4.708127125812083e-05, "loss": 1.0529, "step": 76100 }, { "epoch": 0.07, "grad_norm": 38.0, "learning_rate": 4.7076772184929904e-05, "loss": 0.9047, "step": 76200 }, { "epoch": 0.07, "grad_norm": 82.0, "learning_rate": 4.7072273111738987e-05, "loss": 1.048, "step": 76300 }, { "epoch": 0.07, "grad_norm": 28.5, "learning_rate": 4.706777403854806e-05, "loss": 1.146, "step": 76400 }, { "epoch": 0.07, "grad_norm": 74.5, "learning_rate": 4.706327496535714e-05, "loss": 1.0799, "step": 76500 }, { "epoch": 0.07, "grad_norm": 20.75, "learning_rate": 4.705877589216622e-05, "loss": 1.1352, "step": 76600 }, { "epoch": 0.07, "grad_norm": 46.25, "learning_rate": 4.7054276818975295e-05, "loss": 1.0056, "step": 76700 }, { "epoch": 0.07, "grad_norm": 0.96875, "learning_rate": 4.704977774578437e-05, "loss": 1.1245, "step": 76800 }, { "epoch": 0.07, "grad_norm": 20.25, "learning_rate": 4.7045278672593445e-05, "loss": 1.352, "step": 76900 }, { "epoch": 0.07, "grad_norm": 21.5, "learning_rate": 4.704077959940252e-05, "loss": 1.0695, "step": 77000 }, { "epoch": 0.07, "grad_norm": 25.0, "learning_rate": 4.70362805262116e-05, "loss": 1.0156, "step": 77100 }, { "epoch": 0.07, "grad_norm": 40.25, "learning_rate": 4.703178145302068e-05, "loss": 0.9797, "step": 77200 }, { "epoch": 0.07, "grad_norm": 27.875, "learning_rate": 4.7027282379829754e-05, "loss": 1.0771, "step": 77300 }, { "epoch": 0.07, "grad_norm": 80.5, "learning_rate": 4.7022783306638836e-05, "loss": 0.9401, "step": 77400 }, { "epoch": 0.07, "grad_norm": 51.5, "learning_rate": 4.701828423344791e-05, "loss": 0.9542, "step": 77500 }, { "epoch": 0.07, "grad_norm": 42.25, "learning_rate": 4.701378516025699e-05, "loss": 1.1057, "step": 77600 }, { "epoch": 0.07, "grad_norm": 127.5, "learning_rate": 4.700928608706607e-05, "loss": 1.1478, "step": 77700 }, { "epoch": 0.07, "grad_norm": 16.375, "learning_rate": 4.7004787013875144e-05, "loss": 0.964, "step": 77800 }, { "epoch": 0.07, "grad_norm": 213.0, "learning_rate": 4.7000287940684226e-05, "loss": 0.9888, "step": 77900 }, { "epoch": 0.07, "grad_norm": 41.5, "learning_rate": 4.69957888674933e-05, "loss": 1.0407, "step": 78000 }, { "epoch": 0.07, "grad_norm": 124.0, "learning_rate": 4.6991289794302377e-05, "loss": 0.8326, "step": 78100 }, { "epoch": 0.07, "grad_norm": 31.0, "learning_rate": 4.698679072111145e-05, "loss": 1.1145, "step": 78200 }, { "epoch": 0.07, "grad_norm": 42.5, "learning_rate": 4.698229164792053e-05, "loss": 1.0665, "step": 78300 }, { "epoch": 0.07, "grad_norm": 14.875, "learning_rate": 4.697779257472961e-05, "loss": 1.0049, "step": 78400 }, { "epoch": 0.07, "grad_norm": 14.25, "learning_rate": 4.6973293501538685e-05, "loss": 1.2033, "step": 78500 }, { "epoch": 0.07, "grad_norm": 46.25, "learning_rate": 4.696879442834776e-05, "loss": 1.0287, "step": 78600 }, { "epoch": 0.07, "grad_norm": 16.875, "learning_rate": 4.696429535515684e-05, "loss": 1.0317, "step": 78700 }, { "epoch": 0.07, "grad_norm": 0.162109375, "learning_rate": 4.695979628196592e-05, "loss": 1.1977, "step": 78800 }, { "epoch": 0.07, "grad_norm": 36.0, "learning_rate": 4.695529720877499e-05, "loss": 1.0422, "step": 78900 }, { "epoch": 0.07, "grad_norm": 44.5, "learning_rate": 4.6950798135584075e-05, "loss": 0.9911, "step": 79000 }, { "epoch": 0.07, "grad_norm": 25.0, "learning_rate": 4.694629906239315e-05, "loss": 1.2047, "step": 79100 }, { "epoch": 0.07, "grad_norm": 22.75, "learning_rate": 4.6941799989202226e-05, "loss": 1.0781, "step": 79200 }, { "epoch": 0.07, "grad_norm": 11.875, "learning_rate": 4.693730091601131e-05, "loss": 0.9866, "step": 79300 }, { "epoch": 0.07, "grad_norm": 32.25, "learning_rate": 4.6932801842820376e-05, "loss": 1.089, "step": 79400 }, { "epoch": 0.07, "grad_norm": 29.125, "learning_rate": 4.692830276962946e-05, "loss": 0.9768, "step": 79500 }, { "epoch": 0.07, "grad_norm": 6.4375, "learning_rate": 4.6923803696438534e-05, "loss": 0.9505, "step": 79600 }, { "epoch": 0.07, "grad_norm": 61.0, "learning_rate": 4.691930462324761e-05, "loss": 1.0787, "step": 79700 }, { "epoch": 0.07, "grad_norm": 46.0, "learning_rate": 4.691480555005669e-05, "loss": 1.0963, "step": 79800 }, { "epoch": 0.07, "grad_norm": 32.75, "learning_rate": 4.6910306476865767e-05, "loss": 1.1343, "step": 79900 }, { "epoch": 0.07, "grad_norm": 29.5, "learning_rate": 4.690580740367484e-05, "loss": 1.0554, "step": 80000 }, { "epoch": 0.07, "grad_norm": 87.0, "learning_rate": 4.6901308330483924e-05, "loss": 0.9815, "step": 80100 }, { "epoch": 0.07, "grad_norm": 31.625, "learning_rate": 4.6896809257293e-05, "loss": 1.0009, "step": 80200 }, { "epoch": 0.07, "grad_norm": 87.5, "learning_rate": 4.689231018410208e-05, "loss": 1.0204, "step": 80300 }, { "epoch": 0.07, "grad_norm": 63.75, "learning_rate": 4.688781111091116e-05, "loss": 1.0705, "step": 80400 }, { "epoch": 0.07, "grad_norm": 30.875, "learning_rate": 4.688331203772023e-05, "loss": 1.0482, "step": 80500 }, { "epoch": 0.07, "grad_norm": 19.0, "learning_rate": 4.6878812964529314e-05, "loss": 1.0991, "step": 80600 }, { "epoch": 0.07, "grad_norm": 141.0, "learning_rate": 4.687431389133838e-05, "loss": 1.1017, "step": 80700 }, { "epoch": 0.07, "grad_norm": 240.0, "learning_rate": 4.6869814818147465e-05, "loss": 1.045, "step": 80800 }, { "epoch": 0.07, "grad_norm": 16.875, "learning_rate": 4.686531574495654e-05, "loss": 1.1315, "step": 80900 }, { "epoch": 0.07, "grad_norm": 39.5, "learning_rate": 4.6860816671765616e-05, "loss": 1.0082, "step": 81000 }, { "epoch": 0.07, "grad_norm": 68.5, "learning_rate": 4.68563175985747e-05, "loss": 1.0634, "step": 81100 }, { "epoch": 0.07, "grad_norm": 31.625, "learning_rate": 4.685181852538377e-05, "loss": 1.0875, "step": 81200 }, { "epoch": 0.07, "grad_norm": 77.5, "learning_rate": 4.684731945219285e-05, "loss": 0.9573, "step": 81300 }, { "epoch": 0.07, "grad_norm": 0.1123046875, "learning_rate": 4.684282037900193e-05, "loss": 0.8969, "step": 81400 }, { "epoch": 0.07, "grad_norm": 86.5, "learning_rate": 4.6838321305811006e-05, "loss": 1.0937, "step": 81500 }, { "epoch": 0.07, "grad_norm": 22.625, "learning_rate": 4.683382223262008e-05, "loss": 0.9974, "step": 81600 }, { "epoch": 0.07, "grad_norm": 180.0, "learning_rate": 4.682932315942916e-05, "loss": 0.9794, "step": 81700 }, { "epoch": 0.07, "grad_norm": 64.5, "learning_rate": 4.682482408623824e-05, "loss": 1.0085, "step": 81800 }, { "epoch": 0.07, "grad_norm": 42.75, "learning_rate": 4.6820325013047314e-05, "loss": 1.104, "step": 81900 }, { "epoch": 0.07, "grad_norm": 10.0, "learning_rate": 4.681582593985639e-05, "loss": 0.9597, "step": 82000 }, { "epoch": 0.07, "grad_norm": 8.3125, "learning_rate": 4.6811326866665465e-05, "loss": 1.0694, "step": 82100 }, { "epoch": 0.07, "grad_norm": 19.25, "learning_rate": 4.680682779347455e-05, "loss": 1.1842, "step": 82200 }, { "epoch": 0.07, "grad_norm": 20.5, "learning_rate": 4.680232872028362e-05, "loss": 1.1325, "step": 82300 }, { "epoch": 0.07, "grad_norm": 19.5, "learning_rate": 4.67978296470927e-05, "loss": 1.1123, "step": 82400 }, { "epoch": 0.07, "grad_norm": 11.75, "learning_rate": 4.679333057390178e-05, "loss": 1.1663, "step": 82500 }, { "epoch": 0.07, "grad_norm": 39.25, "learning_rate": 4.6788831500710855e-05, "loss": 1.1589, "step": 82600 }, { "epoch": 0.07, "grad_norm": 79.5, "learning_rate": 4.678433242751993e-05, "loss": 1.0633, "step": 82700 }, { "epoch": 0.07, "grad_norm": 8.5, "learning_rate": 4.677983335432901e-05, "loss": 1.0339, "step": 82800 }, { "epoch": 0.07, "grad_norm": 46.0, "learning_rate": 4.677533428113809e-05, "loss": 1.1827, "step": 82900 }, { "epoch": 0.07, "grad_norm": 124.0, "learning_rate": 4.677083520794717e-05, "loss": 1.0051, "step": 83000 }, { "epoch": 0.07, "grad_norm": 35.75, "learning_rate": 4.6766336134756245e-05, "loss": 1.028, "step": 83100 }, { "epoch": 0.07, "grad_norm": 75.0, "learning_rate": 4.676183706156532e-05, "loss": 0.923, "step": 83200 }, { "epoch": 0.07, "grad_norm": 13.875, "learning_rate": 4.6757337988374396e-05, "loss": 0.9642, "step": 83300 }, { "epoch": 0.07, "grad_norm": 12.875, "learning_rate": 4.675283891518347e-05, "loss": 1.138, "step": 83400 }, { "epoch": 0.07, "grad_norm": 42.5, "learning_rate": 4.674833984199255e-05, "loss": 1.2567, "step": 83500 }, { "epoch": 0.07, "grad_norm": 20.625, "learning_rate": 4.674384076880163e-05, "loss": 1.0483, "step": 83600 }, { "epoch": 0.07, "grad_norm": 86.5, "learning_rate": 4.6739341695610704e-05, "loss": 1.2013, "step": 83700 }, { "epoch": 0.07, "grad_norm": 596.0, "learning_rate": 4.6734842622419786e-05, "loss": 1.0766, "step": 83800 }, { "epoch": 0.07, "grad_norm": 35.75, "learning_rate": 4.673034354922886e-05, "loss": 0.9455, "step": 83900 }, { "epoch": 0.07, "grad_norm": 54.0, "learning_rate": 4.672584447603794e-05, "loss": 1.0611, "step": 84000 }, { "epoch": 0.07, "grad_norm": 157.0, "learning_rate": 4.672134540284702e-05, "loss": 1.1654, "step": 84100 }, { "epoch": 0.08, "grad_norm": 59.25, "learning_rate": 4.6716846329656094e-05, "loss": 1.039, "step": 84200 }, { "epoch": 0.08, "grad_norm": 24.375, "learning_rate": 4.671234725646517e-05, "loss": 0.9963, "step": 84300 }, { "epoch": 0.08, "grad_norm": 66.5, "learning_rate": 4.670784818327425e-05, "loss": 1.0757, "step": 84400 }, { "epoch": 0.08, "grad_norm": 17.875, "learning_rate": 4.670334911008333e-05, "loss": 0.9207, "step": 84500 }, { "epoch": 0.08, "grad_norm": 37.75, "learning_rate": 4.66988500368924e-05, "loss": 1.0711, "step": 84600 }, { "epoch": 0.08, "grad_norm": 25.875, "learning_rate": 4.669435096370148e-05, "loss": 0.9419, "step": 84700 }, { "epoch": 0.08, "grad_norm": 108.5, "learning_rate": 4.668985189051055e-05, "loss": 1.0834, "step": 84800 }, { "epoch": 0.08, "grad_norm": 288.0, "learning_rate": 4.6685352817319635e-05, "loss": 1.0528, "step": 84900 }, { "epoch": 0.08, "grad_norm": 14.625, "learning_rate": 4.668085374412871e-05, "loss": 0.8459, "step": 85000 }, { "epoch": 0.08, "grad_norm": 110.5, "learning_rate": 4.6676354670937786e-05, "loss": 0.8702, "step": 85100 }, { "epoch": 0.08, "grad_norm": 15.4375, "learning_rate": 4.667185559774687e-05, "loss": 0.9928, "step": 85200 }, { "epoch": 0.08, "grad_norm": 35.0, "learning_rate": 4.666735652455594e-05, "loss": 0.8058, "step": 85300 }, { "epoch": 0.08, "grad_norm": 13.875, "learning_rate": 4.666285745136502e-05, "loss": 1.072, "step": 85400 }, { "epoch": 0.08, "grad_norm": 47.5, "learning_rate": 4.66583583781741e-05, "loss": 0.9577, "step": 85500 }, { "epoch": 0.08, "grad_norm": 21.125, "learning_rate": 4.6653859304983176e-05, "loss": 1.1584, "step": 85600 }, { "epoch": 0.08, "grad_norm": 23.25, "learning_rate": 4.664936023179226e-05, "loss": 1.003, "step": 85700 }, { "epoch": 0.08, "grad_norm": 29.625, "learning_rate": 4.6644861158601334e-05, "loss": 1.0796, "step": 85800 }, { "epoch": 0.08, "grad_norm": 0.00189971923828125, "learning_rate": 4.66403620854104e-05, "loss": 1.0614, "step": 85900 }, { "epoch": 0.08, "grad_norm": 126.0, "learning_rate": 4.6635863012219484e-05, "loss": 1.0196, "step": 86000 }, { "epoch": 0.08, "grad_norm": 70.0, "learning_rate": 4.663136393902856e-05, "loss": 0.992, "step": 86100 }, { "epoch": 0.08, "grad_norm": 26.5, "learning_rate": 4.662686486583764e-05, "loss": 1.0376, "step": 86200 }, { "epoch": 0.08, "grad_norm": 26.125, "learning_rate": 4.662236579264672e-05, "loss": 1.046, "step": 86300 }, { "epoch": 0.08, "grad_norm": 22.125, "learning_rate": 4.661786671945579e-05, "loss": 0.9676, "step": 86400 }, { "epoch": 0.08, "grad_norm": 5.15625, "learning_rate": 4.6613367646264875e-05, "loss": 0.9605, "step": 86500 }, { "epoch": 0.08, "grad_norm": 15.0625, "learning_rate": 4.660886857307395e-05, "loss": 1.1733, "step": 86600 }, { "epoch": 0.08, "grad_norm": 51.0, "learning_rate": 4.6604369499883025e-05, "loss": 1.0272, "step": 86700 }, { "epoch": 0.08, "grad_norm": 18.375, "learning_rate": 4.659987042669211e-05, "loss": 0.9423, "step": 86800 }, { "epoch": 0.08, "grad_norm": 15.6875, "learning_rate": 4.659537135350118e-05, "loss": 1.0338, "step": 86900 }, { "epoch": 0.08, "grad_norm": 13.4375, "learning_rate": 4.659087228031026e-05, "loss": 1.0813, "step": 87000 }, { "epoch": 0.08, "grad_norm": 12.9375, "learning_rate": 4.658637320711934e-05, "loss": 1.0381, "step": 87100 }, { "epoch": 0.08, "grad_norm": 12.75, "learning_rate": 4.658187413392841e-05, "loss": 0.9795, "step": 87200 }, { "epoch": 0.08, "grad_norm": 44.0, "learning_rate": 4.657737506073749e-05, "loss": 0.9689, "step": 87300 }, { "epoch": 0.08, "grad_norm": 17.5, "learning_rate": 4.6572875987546566e-05, "loss": 0.9152, "step": 87400 }, { "epoch": 0.08, "grad_norm": 11.875, "learning_rate": 4.656837691435564e-05, "loss": 1.0113, "step": 87500 }, { "epoch": 0.08, "grad_norm": 25.75, "learning_rate": 4.6563877841164724e-05, "loss": 1.0188, "step": 87600 }, { "epoch": 0.08, "grad_norm": 0.076171875, "learning_rate": 4.65593787679738e-05, "loss": 1.0329, "step": 87700 }, { "epoch": 0.08, "grad_norm": 105.5, "learning_rate": 4.6554879694782874e-05, "loss": 1.0492, "step": 87800 }, { "epoch": 0.08, "grad_norm": 69.0, "learning_rate": 4.6550380621591956e-05, "loss": 0.9479, "step": 87900 }, { "epoch": 0.08, "grad_norm": 245.0, "learning_rate": 4.654588154840103e-05, "loss": 0.9938, "step": 88000 }, { "epoch": 0.08, "grad_norm": 676.0, "learning_rate": 4.654138247521011e-05, "loss": 0.9638, "step": 88100 }, { "epoch": 0.08, "grad_norm": 39.5, "learning_rate": 4.653688340201919e-05, "loss": 1.0982, "step": 88200 }, { "epoch": 0.08, "grad_norm": 29.625, "learning_rate": 4.6532384328828265e-05, "loss": 1.0785, "step": 88300 }, { "epoch": 0.08, "grad_norm": 5.875, "learning_rate": 4.652788525563735e-05, "loss": 1.0102, "step": 88400 }, { "epoch": 0.08, "grad_norm": 0.034423828125, "learning_rate": 4.6523386182446415e-05, "loss": 1.088, "step": 88500 }, { "epoch": 0.08, "grad_norm": 0.33203125, "learning_rate": 4.651888710925549e-05, "loss": 0.946, "step": 88600 }, { "epoch": 0.08, "grad_norm": 25.125, "learning_rate": 4.651438803606457e-05, "loss": 1.0201, "step": 88700 }, { "epoch": 0.08, "grad_norm": 14.3125, "learning_rate": 4.650988896287365e-05, "loss": 0.9087, "step": 88800 }, { "epoch": 0.08, "grad_norm": 0.01336669921875, "learning_rate": 4.650538988968272e-05, "loss": 1.0322, "step": 88900 }, { "epoch": 0.08, "grad_norm": 19.875, "learning_rate": 4.6500890816491805e-05, "loss": 1.099, "step": 89000 }, { "epoch": 0.08, "grad_norm": 26.125, "learning_rate": 4.649639174330088e-05, "loss": 1.0903, "step": 89100 }, { "epoch": 0.08, "grad_norm": 62.0, "learning_rate": 4.649189267010996e-05, "loss": 1.1445, "step": 89200 }, { "epoch": 0.08, "grad_norm": 165.0, "learning_rate": 4.648739359691904e-05, "loss": 1.0607, "step": 89300 }, { "epoch": 0.08, "grad_norm": 10.75, "learning_rate": 4.6482894523728114e-05, "loss": 1.1787, "step": 89400 }, { "epoch": 0.08, "grad_norm": 38.5, "learning_rate": 4.6478395450537196e-05, "loss": 0.973, "step": 89500 }, { "epoch": 0.08, "grad_norm": 53.25, "learning_rate": 4.647389637734627e-05, "loss": 0.9323, "step": 89600 }, { "epoch": 0.08, "grad_norm": 28.25, "learning_rate": 4.6469397304155346e-05, "loss": 1.1569, "step": 89700 }, { "epoch": 0.08, "grad_norm": 34.25, "learning_rate": 4.646489823096442e-05, "loss": 1.1027, "step": 89800 }, { "epoch": 0.08, "grad_norm": 49.0, "learning_rate": 4.64603991577735e-05, "loss": 1.2724, "step": 89900 }, { "epoch": 0.08, "grad_norm": 20.75, "learning_rate": 4.645590008458258e-05, "loss": 1.0498, "step": 90000 }, { "epoch": 0.08, "grad_norm": 0.3203125, "learning_rate": 4.6451401011391655e-05, "loss": 0.9929, "step": 90100 }, { "epoch": 0.08, "grad_norm": 65.0, "learning_rate": 4.644690193820073e-05, "loss": 1.1277, "step": 90200 }, { "epoch": 0.08, "grad_norm": 24.125, "learning_rate": 4.644240286500981e-05, "loss": 1.1095, "step": 90300 }, { "epoch": 0.08, "grad_norm": 28.375, "learning_rate": 4.643790379181889e-05, "loss": 1.1632, "step": 90400 }, { "epoch": 0.08, "grad_norm": 322.0, "learning_rate": 4.643340471862796e-05, "loss": 1.005, "step": 90500 }, { "epoch": 0.08, "grad_norm": 68.0, "learning_rate": 4.6428905645437045e-05, "loss": 1.0186, "step": 90600 }, { "epoch": 0.08, "grad_norm": 21.625, "learning_rate": 4.642440657224612e-05, "loss": 1.1503, "step": 90700 }, { "epoch": 0.08, "grad_norm": 38.5, "learning_rate": 4.6419907499055195e-05, "loss": 1.1201, "step": 90800 }, { "epoch": 0.08, "grad_norm": 43.25, "learning_rate": 4.641540842586428e-05, "loss": 1.0718, "step": 90900 }, { "epoch": 0.08, "grad_norm": 206.0, "learning_rate": 4.641090935267335e-05, "loss": 1.1328, "step": 91000 }, { "epoch": 0.08, "grad_norm": 31.125, "learning_rate": 4.640641027948243e-05, "loss": 1.0125, "step": 91100 }, { "epoch": 0.08, "grad_norm": 59.25, "learning_rate": 4.6401911206291504e-05, "loss": 0.9647, "step": 91200 }, { "epoch": 0.08, "grad_norm": 21.125, "learning_rate": 4.639741213310058e-05, "loss": 0.9768, "step": 91300 }, { "epoch": 0.08, "grad_norm": 24.5, "learning_rate": 4.639291305990966e-05, "loss": 0.9173, "step": 91400 }, { "epoch": 0.08, "grad_norm": 37.25, "learning_rate": 4.6388413986718736e-05, "loss": 0.9748, "step": 91500 }, { "epoch": 0.08, "grad_norm": 70.0, "learning_rate": 4.638391491352781e-05, "loss": 0.9763, "step": 91600 }, { "epoch": 0.08, "grad_norm": 34.0, "learning_rate": 4.6379415840336894e-05, "loss": 1.0998, "step": 91700 }, { "epoch": 0.08, "grad_norm": 6.8125, "learning_rate": 4.637491676714597e-05, "loss": 0.9437, "step": 91800 }, { "epoch": 0.08, "grad_norm": 740.0, "learning_rate": 4.637041769395505e-05, "loss": 1.094, "step": 91900 }, { "epoch": 0.08, "grad_norm": 2.328125, "learning_rate": 4.636591862076413e-05, "loss": 1.0091, "step": 92000 }, { "epoch": 0.08, "grad_norm": 34.25, "learning_rate": 4.63614195475732e-05, "loss": 1.1075, "step": 92100 }, { "epoch": 0.08, "grad_norm": 163.0, "learning_rate": 4.6356920474382284e-05, "loss": 1.0266, "step": 92200 }, { "epoch": 0.08, "grad_norm": 55.5, "learning_rate": 4.635242140119136e-05, "loss": 0.9571, "step": 92300 }, { "epoch": 0.08, "grad_norm": 16.875, "learning_rate": 4.6347922328000435e-05, "loss": 1.095, "step": 92400 }, { "epoch": 0.08, "grad_norm": 46.25, "learning_rate": 4.634342325480951e-05, "loss": 1.0834, "step": 92500 }, { "epoch": 0.08, "grad_norm": 10.1875, "learning_rate": 4.6338924181618585e-05, "loss": 0.9692, "step": 92600 }, { "epoch": 0.08, "grad_norm": 38.0, "learning_rate": 4.633442510842767e-05, "loss": 1.1476, "step": 92700 }, { "epoch": 0.08, "grad_norm": 1.5625, "learning_rate": 4.632992603523674e-05, "loss": 0.973, "step": 92800 }, { "epoch": 0.08, "grad_norm": 164.0, "learning_rate": 4.632542696204582e-05, "loss": 0.9548, "step": 92900 }, { "epoch": 0.08, "grad_norm": 51.5, "learning_rate": 4.63209278888549e-05, "loss": 1.0253, "step": 93000 }, { "epoch": 0.08, "grad_norm": 19.75, "learning_rate": 4.6316428815663976e-05, "loss": 1.0976, "step": 93100 }, { "epoch": 0.08, "grad_norm": 13.875, "learning_rate": 4.631192974247305e-05, "loss": 0.9796, "step": 93200 }, { "epoch": 0.08, "grad_norm": 872.0, "learning_rate": 4.630743066928213e-05, "loss": 0.8708, "step": 93300 }, { "epoch": 0.08, "grad_norm": 25.375, "learning_rate": 4.630293159609121e-05, "loss": 1.0497, "step": 93400 }, { "epoch": 0.08, "grad_norm": 34.5, "learning_rate": 4.6298432522900284e-05, "loss": 1.1437, "step": 93500 }, { "epoch": 0.08, "grad_norm": 16.625, "learning_rate": 4.6293933449709366e-05, "loss": 0.8803, "step": 93600 }, { "epoch": 0.08, "grad_norm": 1.2265625, "learning_rate": 4.6289434376518435e-05, "loss": 0.9964, "step": 93700 }, { "epoch": 0.08, "grad_norm": 25.75, "learning_rate": 4.628493530332752e-05, "loss": 0.9963, "step": 93800 }, { "epoch": 0.08, "grad_norm": 0.0164794921875, "learning_rate": 4.628043623013659e-05, "loss": 1.0988, "step": 93900 }, { "epoch": 0.08, "grad_norm": 18.875, "learning_rate": 4.627593715694567e-05, "loss": 1.0421, "step": 94000 }, { "epoch": 0.08, "grad_norm": 15.875, "learning_rate": 4.627143808375475e-05, "loss": 0.9977, "step": 94100 }, { "epoch": 0.08, "grad_norm": 32.0, "learning_rate": 4.6266939010563825e-05, "loss": 0.9836, "step": 94200 }, { "epoch": 0.08, "grad_norm": 169.0, "learning_rate": 4.62624399373729e-05, "loss": 1.0128, "step": 94300 }, { "epoch": 0.08, "grad_norm": 86.5, "learning_rate": 4.625794086418198e-05, "loss": 0.952, "step": 94400 }, { "epoch": 0.08, "grad_norm": 0.62109375, "learning_rate": 4.625344179099106e-05, "loss": 1.1159, "step": 94500 }, { "epoch": 0.08, "grad_norm": 0.126953125, "learning_rate": 4.624894271780014e-05, "loss": 1.0818, "step": 94600 }, { "epoch": 0.08, "grad_norm": 78.0, "learning_rate": 4.6244443644609215e-05, "loss": 1.0271, "step": 94700 }, { "epoch": 0.08, "grad_norm": 66.5, "learning_rate": 4.623994457141829e-05, "loss": 1.0018, "step": 94800 }, { "epoch": 0.08, "grad_norm": 12.75, "learning_rate": 4.623544549822737e-05, "loss": 0.9924, "step": 94900 }, { "epoch": 0.08, "grad_norm": 0.0023193359375, "learning_rate": 4.623094642503644e-05, "loss": 0.9408, "step": 95000 }, { "epoch": 0.08, "grad_norm": 35.5, "learning_rate": 4.622644735184552e-05, "loss": 1.2318, "step": 95100 }, { "epoch": 0.08, "grad_norm": 53.0, "learning_rate": 4.62219482786546e-05, "loss": 0.9541, "step": 95200 }, { "epoch": 0.08, "grad_norm": 108.5, "learning_rate": 4.6217449205463674e-05, "loss": 1.0628, "step": 95300 }, { "epoch": 0.08, "grad_norm": 159.0, "learning_rate": 4.6212950132272756e-05, "loss": 1.0599, "step": 95400 }, { "epoch": 0.09, "grad_norm": 17.5, "learning_rate": 4.620845105908183e-05, "loss": 1.1268, "step": 95500 }, { "epoch": 0.09, "grad_norm": 18.375, "learning_rate": 4.620395198589091e-05, "loss": 1.0284, "step": 95600 }, { "epoch": 0.09, "grad_norm": 35.25, "learning_rate": 4.619945291269999e-05, "loss": 1.0674, "step": 95700 }, { "epoch": 0.09, "grad_norm": 51.75, "learning_rate": 4.6194953839509064e-05, "loss": 1.1138, "step": 95800 }, { "epoch": 0.09, "grad_norm": 41.5, "learning_rate": 4.619045476631814e-05, "loss": 1.1292, "step": 95900 }, { "epoch": 0.09, "grad_norm": 40.5, "learning_rate": 4.618595569312722e-05, "loss": 1.0113, "step": 96000 }, { "epoch": 0.09, "grad_norm": 17.5, "learning_rate": 4.61814566199363e-05, "loss": 1.1243, "step": 96100 }, { "epoch": 0.09, "grad_norm": 15.1875, "learning_rate": 4.617695754674537e-05, "loss": 0.9991, "step": 96200 }, { "epoch": 0.09, "grad_norm": 17.25, "learning_rate": 4.617245847355445e-05, "loss": 1.0575, "step": 96300 }, { "epoch": 0.09, "grad_norm": 330.0, "learning_rate": 4.616795940036352e-05, "loss": 1.0142, "step": 96400 }, { "epoch": 0.09, "grad_norm": 16.75, "learning_rate": 4.6163460327172605e-05, "loss": 1.1727, "step": 96500 }, { "epoch": 0.09, "grad_norm": 17.75, "learning_rate": 4.615896125398168e-05, "loss": 1.1062, "step": 96600 }, { "epoch": 0.09, "grad_norm": 117.0, "learning_rate": 4.6154462180790756e-05, "loss": 0.9946, "step": 96700 }, { "epoch": 0.09, "grad_norm": 41.5, "learning_rate": 4.614996310759984e-05, "loss": 1.0037, "step": 96800 }, { "epoch": 0.09, "grad_norm": 0.294921875, "learning_rate": 4.614546403440891e-05, "loss": 0.964, "step": 96900 }, { "epoch": 0.09, "grad_norm": 660.0, "learning_rate": 4.614096496121799e-05, "loss": 1.0423, "step": 97000 }, { "epoch": 0.09, "grad_norm": 29.5, "learning_rate": 4.613646588802707e-05, "loss": 1.0066, "step": 97100 }, { "epoch": 0.09, "grad_norm": 16.0, "learning_rate": 4.6131966814836146e-05, "loss": 0.9157, "step": 97200 }, { "epoch": 0.09, "grad_norm": 94.5, "learning_rate": 4.612746774164523e-05, "loss": 1.0699, "step": 97300 }, { "epoch": 0.09, "grad_norm": 103.5, "learning_rate": 4.61229686684543e-05, "loss": 1.1165, "step": 97400 }, { "epoch": 0.09, "grad_norm": 31.375, "learning_rate": 4.611846959526338e-05, "loss": 1.0617, "step": 97500 }, { "epoch": 0.09, "grad_norm": 159.0, "learning_rate": 4.6113970522072454e-05, "loss": 1.0553, "step": 97600 }, { "epoch": 0.09, "grad_norm": 137.0, "learning_rate": 4.610947144888153e-05, "loss": 1.0892, "step": 97700 }, { "epoch": 0.09, "grad_norm": 20.875, "learning_rate": 4.610497237569061e-05, "loss": 0.8908, "step": 97800 }, { "epoch": 0.09, "grad_norm": 66.0, "learning_rate": 4.610047330249969e-05, "loss": 0.9143, "step": 97900 }, { "epoch": 0.09, "grad_norm": 79.5, "learning_rate": 4.609597422930876e-05, "loss": 1.019, "step": 98000 }, { "epoch": 0.09, "grad_norm": 23.75, "learning_rate": 4.6091475156117844e-05, "loss": 1.1294, "step": 98100 }, { "epoch": 0.09, "grad_norm": 3.921875, "learning_rate": 4.608697608292692e-05, "loss": 1.0638, "step": 98200 }, { "epoch": 0.09, "grad_norm": 40.0, "learning_rate": 4.6082477009735995e-05, "loss": 0.9934, "step": 98300 }, { "epoch": 0.09, "grad_norm": 153.0, "learning_rate": 4.607797793654508e-05, "loss": 1.064, "step": 98400 }, { "epoch": 0.09, "grad_norm": 11.1875, "learning_rate": 4.607347886335415e-05, "loss": 1.2169, "step": 98500 }, { "epoch": 0.09, "grad_norm": 126.5, "learning_rate": 4.606897979016323e-05, "loss": 0.9162, "step": 98600 }, { "epoch": 0.09, "grad_norm": 17.875, "learning_rate": 4.606448071697231e-05, "loss": 1.0007, "step": 98700 }, { "epoch": 0.09, "grad_norm": 36.25, "learning_rate": 4.6059981643781385e-05, "loss": 0.9393, "step": 98800 }, { "epoch": 0.09, "grad_norm": 18.5, "learning_rate": 4.605548257059046e-05, "loss": 1.1701, "step": 98900 }, { "epoch": 0.09, "grad_norm": 25.0, "learning_rate": 4.6050983497399536e-05, "loss": 1.1328, "step": 99000 }, { "epoch": 0.09, "grad_norm": 0.0191650390625, "learning_rate": 4.604648442420861e-05, "loss": 1.0642, "step": 99100 }, { "epoch": 0.09, "grad_norm": 16.375, "learning_rate": 4.604198535101769e-05, "loss": 1.0403, "step": 99200 }, { "epoch": 0.09, "grad_norm": 46.5, "learning_rate": 4.603748627782677e-05, "loss": 1.1214, "step": 99300 }, { "epoch": 0.09, "grad_norm": 12.375, "learning_rate": 4.6032987204635844e-05, "loss": 0.9587, "step": 99400 }, { "epoch": 0.09, "grad_norm": 113.5, "learning_rate": 4.6028488131444926e-05, "loss": 1.1008, "step": 99500 }, { "epoch": 0.09, "grad_norm": 122.0, "learning_rate": 4.6023989058254e-05, "loss": 0.9344, "step": 99600 }, { "epoch": 0.09, "grad_norm": 56.25, "learning_rate": 4.601948998506308e-05, "loss": 1.0311, "step": 99700 }, { "epoch": 0.09, "grad_norm": 1536.0, "learning_rate": 4.601499091187216e-05, "loss": 1.0442, "step": 99800 }, { "epoch": 0.09, "grad_norm": 19.875, "learning_rate": 4.6010491838681234e-05, "loss": 1.1676, "step": 99900 }, { "epoch": 0.09, "grad_norm": 56.75, "learning_rate": 4.6005992765490316e-05, "loss": 1.1096, "step": 100000 }, { "epoch": 0.09, "grad_norm": 107.5, "learning_rate": 4.600149369229939e-05, "loss": 0.947, "step": 100100 }, { "epoch": 0.09, "grad_norm": 15.25, "learning_rate": 4.599699461910846e-05, "loss": 1.0848, "step": 100200 }, { "epoch": 0.09, "grad_norm": 13.8125, "learning_rate": 4.599249554591754e-05, "loss": 1.0569, "step": 100300 }, { "epoch": 0.09, "grad_norm": 48.75, "learning_rate": 4.598799647272662e-05, "loss": 1.0007, "step": 100400 }, { "epoch": 0.09, "grad_norm": 21.5, "learning_rate": 4.59834973995357e-05, "loss": 1.1294, "step": 100500 }, { "epoch": 0.09, "grad_norm": 196.0, "learning_rate": 4.5978998326344775e-05, "loss": 1.1246, "step": 100600 }, { "epoch": 0.09, "grad_norm": 28.0, "learning_rate": 4.597449925315385e-05, "loss": 1.1526, "step": 100700 }, { "epoch": 0.09, "grad_norm": 50.5, "learning_rate": 4.597000017996293e-05, "loss": 1.0116, "step": 100800 }, { "epoch": 0.09, "grad_norm": 4.71875, "learning_rate": 4.596550110677201e-05, "loss": 1.1103, "step": 100900 }, { "epoch": 0.09, "grad_norm": 29.75, "learning_rate": 4.596100203358108e-05, "loss": 1.0276, "step": 101000 }, { "epoch": 0.09, "grad_norm": 32.25, "learning_rate": 4.5956502960390165e-05, "loss": 1.1312, "step": 101100 }, { "epoch": 0.09, "grad_norm": 17.25, "learning_rate": 4.595200388719924e-05, "loss": 0.8968, "step": 101200 }, { "epoch": 0.09, "grad_norm": 45.5, "learning_rate": 4.5947504814008316e-05, "loss": 0.9783, "step": 101300 }, { "epoch": 0.09, "grad_norm": 21.625, "learning_rate": 4.59430057408174e-05, "loss": 1.1372, "step": 101400 }, { "epoch": 0.09, "grad_norm": 29.0, "learning_rate": 4.593850666762647e-05, "loss": 1.1434, "step": 101500 }, { "epoch": 0.09, "grad_norm": 31.0, "learning_rate": 4.593400759443555e-05, "loss": 0.9434, "step": 101600 }, { "epoch": 0.09, "grad_norm": 11.5, "learning_rate": 4.5929508521244624e-05, "loss": 0.9486, "step": 101700 }, { "epoch": 0.09, "grad_norm": 116.5, "learning_rate": 4.59250094480537e-05, "loss": 1.0677, "step": 101800 }, { "epoch": 0.09, "grad_norm": 25.25, "learning_rate": 4.592051037486278e-05, "loss": 0.8866, "step": 101900 }, { "epoch": 0.09, "grad_norm": 82.5, "learning_rate": 4.591601130167186e-05, "loss": 1.0288, "step": 102000 } ], "logging_steps": 100, "max_steps": 1122566, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 6000, "total_flos": 1.6045021391939174e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }