{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 2461.7163655359695, "learning_rate": 2.0833333333333333e-07, "loss": 13.4422, "step": 1 }, { "epoch": 0.01, "grad_norm": 1188.3611863310348, "learning_rate": 1.0416666666666667e-06, "loss": 12.564, "step": 5 }, { "epoch": 0.01, "grad_norm": 383.2777721914162, "learning_rate": 2.0833333333333334e-06, "loss": 7.5043, "step": 10 }, { "epoch": 0.02, "grad_norm": 85.32523705996411, "learning_rate": 3.125e-06, "loss": 4.3848, "step": 15 }, { "epoch": 0.02, "grad_norm": 44.902597609978045, "learning_rate": 4.166666666666667e-06, "loss": 3.608, "step": 20 }, { "epoch": 0.03, "grad_norm": 33.8945502589581, "learning_rate": 5.208333333333334e-06, "loss": 3.3411, "step": 25 }, { "epoch": 0.03, "grad_norm": 22.789039733408682, "learning_rate": 6.25e-06, "loss": 3.1891, "step": 30 }, { "epoch": 0.04, "grad_norm": 15.294909208517709, "learning_rate": 7.291666666666667e-06, "loss": 3.0935, "step": 35 }, { "epoch": 0.04, "grad_norm": 16.346336894842942, "learning_rate": 8.333333333333334e-06, "loss": 2.9414, "step": 40 }, { "epoch": 0.05, "grad_norm": 23.553918714950836, "learning_rate": 9.375000000000001e-06, "loss": 2.8648, "step": 45 }, { "epoch": 0.05, "grad_norm": 164.07588077953187, "learning_rate": 1.0416666666666668e-05, "loss": 2.5263, "step": 50 }, { "epoch": 0.06, "grad_norm": 48.559212060939444, "learning_rate": 1.1458333333333333e-05, "loss": 2.037, "step": 55 }, { "epoch": 0.06, "grad_norm": 23.491477625412887, "learning_rate": 1.25e-05, "loss": 1.5635, "step": 60 }, { "epoch": 0.07, "grad_norm": 18.534543889923896, "learning_rate": 1.3541666666666668e-05, "loss": 1.4948, "step": 65 }, { "epoch": 0.07, "grad_norm": 12.384878393353059, "learning_rate": 1.4583333333333333e-05, "loss": 1.4257, "step": 70 }, { "epoch": 0.08, "grad_norm": 9.477210018385332, "learning_rate": 1.5625e-05, "loss": 1.3939, "step": 75 }, { "epoch": 0.08, "grad_norm": 6.144109465129273, "learning_rate": 1.6666666666666667e-05, "loss": 1.3457, "step": 80 }, { "epoch": 0.09, "grad_norm": 5.552451526784706, "learning_rate": 1.7708333333333335e-05, "loss": 1.3328, "step": 85 }, { "epoch": 0.09, "grad_norm": 3.9924625599541206, "learning_rate": 1.8750000000000002e-05, "loss": 1.3063, "step": 90 }, { "epoch": 0.1, "grad_norm": 3.7886605684614403, "learning_rate": 1.979166666666667e-05, "loss": 1.3082, "step": 95 }, { "epoch": 0.1, "grad_norm": 4.590180516177269, "learning_rate": 1.999892997072575e-05, "loss": 1.2868, "step": 100 }, { "epoch": 0.11, "grad_norm": 4.258640200275494, "learning_rate": 1.99945833692589e-05, "loss": 1.3019, "step": 105 }, { "epoch": 0.12, "grad_norm": 4.200726638847038, "learning_rate": 1.9986894771071707e-05, "loss": 1.2737, "step": 110 }, { "epoch": 0.12, "grad_norm": 4.226850310714147, "learning_rate": 1.9975866747083734e-05, "loss": 1.2763, "step": 115 }, { "epoch": 0.13, "grad_norm": 3.9708049477307656, "learning_rate": 1.9961502984854394e-05, "loss": 1.2526, "step": 120 }, { "epoch": 0.13, "grad_norm": 4.498450336726286, "learning_rate": 1.9943808287349902e-05, "loss": 1.2729, "step": 125 }, { "epoch": 0.14, "grad_norm": 3.3080885736994894, "learning_rate": 1.992278857133726e-05, "loss": 1.243, "step": 130 }, { "epoch": 0.14, "grad_norm": 5.053212283450933, "learning_rate": 1.9898450865405786e-05, "loss": 1.2731, "step": 135 }, { "epoch": 0.15, "grad_norm": 4.17535563832229, "learning_rate": 1.9870803307616916e-05, "loss": 1.2692, "step": 140 }, { "epoch": 0.15, "grad_norm": 4.822523160059189, "learning_rate": 1.983985514278296e-05, "loss": 1.2534, "step": 145 }, { "epoch": 0.16, "grad_norm": 4.4132366650716515, "learning_rate": 1.9805616719375852e-05, "loss": 1.2642, "step": 150 }, { "epoch": 0.16, "grad_norm": 4.773367552594801, "learning_rate": 1.9768099486066776e-05, "loss": 1.2669, "step": 155 }, { "epoch": 0.17, "grad_norm": 6.505204667225315, "learning_rate": 1.9727315987897993e-05, "loss": 1.2221, "step": 160 }, { "epoch": 0.17, "grad_norm": 4.378575933551222, "learning_rate": 1.9683279862087986e-05, "loss": 1.2432, "step": 165 }, { "epoch": 0.18, "grad_norm": 3.9521011616333936, "learning_rate": 1.963600583347147e-05, "loss": 1.2243, "step": 170 }, { "epoch": 0.18, "grad_norm": 6.126328532378981, "learning_rate": 1.9585509709575646e-05, "loss": 1.242, "step": 175 }, { "epoch": 0.19, "grad_norm": 4.314912808408647, "learning_rate": 1.9531808375334512e-05, "loss": 1.2545, "step": 180 }, { "epoch": 0.19, "grad_norm": 4.0121454881889225, "learning_rate": 1.9474919787442835e-05, "loss": 1.2378, "step": 185 }, { "epoch": 0.2, "grad_norm": 3.369424629625938, "learning_rate": 1.9414862968351788e-05, "loss": 1.2304, "step": 190 }, { "epoch": 0.2, "grad_norm": 4.135366351640658, "learning_rate": 1.935165799990821e-05, "loss": 1.2384, "step": 195 }, { "epoch": 0.21, "grad_norm": 5.468708898225178, "learning_rate": 1.9285326016639624e-05, "loss": 1.2372, "step": 200 }, { "epoch": 0.21, "grad_norm": 5.141211198337755, "learning_rate": 1.9215889198687245e-05, "loss": 1.2342, "step": 205 }, { "epoch": 0.22, "grad_norm": 5.564399747906429, "learning_rate": 1.9143370764389374e-05, "loss": 1.245, "step": 210 }, { "epoch": 0.23, "grad_norm": 4.423614603504273, "learning_rate": 1.906779496251763e-05, "loss": 1.2127, "step": 215 }, { "epoch": 0.23, "grad_norm": 4.77469168667574, "learning_rate": 1.8989187064168643e-05, "loss": 1.2114, "step": 220 }, { "epoch": 0.24, "grad_norm": 3.7507426379732873, "learning_rate": 1.8907573354313853e-05, "loss": 1.2057, "step": 225 }, { "epoch": 0.24, "grad_norm": 3.2625231428295396, "learning_rate": 1.8822981123010343e-05, "loss": 1.2005, "step": 230 }, { "epoch": 0.25, "grad_norm": 4.559565219376083, "learning_rate": 1.873543865627556e-05, "loss": 1.2121, "step": 235 }, { "epoch": 0.25, "grad_norm": 4.674000836928357, "learning_rate": 1.8644975226629025e-05, "loss": 1.2064, "step": 240 }, { "epoch": 0.26, "grad_norm": 3.054211528310419, "learning_rate": 1.8551621083304147e-05, "loss": 1.206, "step": 245 }, { "epoch": 0.26, "grad_norm": 3.8005222642398317, "learning_rate": 1.8455407442133467e-05, "loss": 1.1824, "step": 250 }, { "epoch": 0.27, "grad_norm": 4.270369602281788, "learning_rate": 1.8356366475110697e-05, "loss": 1.2048, "step": 255 }, { "epoch": 0.27, "grad_norm": 3.904842228016003, "learning_rate": 1.8254531299633007e-05, "loss": 1.2052, "step": 260 }, { "epoch": 0.28, "grad_norm": 3.286875472560863, "learning_rate": 1.81499359674272e-05, "loss": 1.2018, "step": 265 }, { "epoch": 0.28, "grad_norm": 3.1899104367297864, "learning_rate": 1.8042615453163484e-05, "loss": 1.2018, "step": 270 }, { "epoch": 0.29, "grad_norm": 4.273989115747855, "learning_rate": 1.7932605642760607e-05, "loss": 1.1888, "step": 275 }, { "epoch": 0.29, "grad_norm": 4.268115493890591, "learning_rate": 1.7819943321386295e-05, "loss": 1.1906, "step": 280 }, { "epoch": 0.3, "grad_norm": 5.421685752864093, "learning_rate": 1.7704666161156994e-05, "loss": 1.2086, "step": 285 }, { "epoch": 0.3, "grad_norm": 3.715983707085352, "learning_rate": 1.7586812708541046e-05, "loss": 1.1922, "step": 290 }, { "epoch": 0.31, "grad_norm": 3.727172850716651, "learning_rate": 1.746642237146948e-05, "loss": 1.2142, "step": 295 }, { "epoch": 0.31, "grad_norm": 5.115772105064366, "learning_rate": 1.7343535406158773e-05, "loss": 1.1973, "step": 300 }, { "epoch": 0.32, "grad_norm": 3.907139227773489, "learning_rate": 1.7218192903649926e-05, "loss": 1.1804, "step": 305 }, { "epoch": 0.32, "grad_norm": 5.857633937230002, "learning_rate": 1.7090436776068422e-05, "loss": 1.2183, "step": 310 }, { "epoch": 0.33, "grad_norm": 3.6278739870653705, "learning_rate": 1.6960309742609603e-05, "loss": 1.1918, "step": 315 }, { "epoch": 0.34, "grad_norm": 4.037905874676671, "learning_rate": 1.682785531525422e-05, "loss": 1.1793, "step": 320 }, { "epoch": 0.34, "grad_norm": 3.042020400450931, "learning_rate": 1.6693117784218818e-05, "loss": 1.1942, "step": 325 }, { "epoch": 0.35, "grad_norm": 3.6059688835627632, "learning_rate": 1.655614220314598e-05, "loss": 1.1901, "step": 330 }, { "epoch": 0.35, "grad_norm": 3.027640956254004, "learning_rate": 1.6416974374039227e-05, "loss": 1.1815, "step": 335 }, { "epoch": 0.36, "grad_norm": 3.469869352005406, "learning_rate": 1.6275660831947725e-05, "loss": 1.1882, "step": 340 }, { "epoch": 0.36, "grad_norm": 3.913192938827672, "learning_rate": 1.6132248829405845e-05, "loss": 1.1799, "step": 345 }, { "epoch": 0.37, "grad_norm": 4.193881408589, "learning_rate": 1.5986786320632842e-05, "loss": 1.1993, "step": 350 }, { "epoch": 0.37, "grad_norm": 3.863252905360088, "learning_rate": 1.5839321945497847e-05, "loss": 1.1824, "step": 355 }, { "epoch": 0.38, "grad_norm": 3.775458735232249, "learning_rate": 1.5689905013255683e-05, "loss": 1.1721, "step": 360 }, { "epoch": 0.38, "grad_norm": 3.4362566052272228, "learning_rate": 1.5538585486058747e-05, "loss": 1.1846, "step": 365 }, { "epoch": 0.39, "grad_norm": 3.812829346931678, "learning_rate": 1.5385413962250657e-05, "loss": 1.1828, "step": 370 }, { "epoch": 0.39, "grad_norm": 5.6565563697093495, "learning_rate": 1.5230441659447128e-05, "loss": 1.1707, "step": 375 }, { "epoch": 0.4, "grad_norm": 3.7129216895293524, "learning_rate": 1.507372039740978e-05, "loss": 1.1778, "step": 380 }, { "epoch": 0.4, "grad_norm": 3.934774129904253, "learning_rate": 1.4915302580718614e-05, "loss": 1.1913, "step": 385 }, { "epoch": 0.41, "grad_norm": 3.941744536837656, "learning_rate": 1.4755241181248923e-05, "loss": 1.1825, "step": 390 }, { "epoch": 0.41, "grad_norm": 5.725933867619445, "learning_rate": 1.4593589720458507e-05, "loss": 1.1804, "step": 395 }, { "epoch": 0.42, "grad_norm": 4.884385616617823, "learning_rate": 1.443040225149114e-05, "loss": 1.1766, "step": 400 }, { "epoch": 0.42, "grad_norm": 6.448418045396495, "learning_rate": 1.4265733341102235e-05, "loss": 1.1677, "step": 405 }, { "epoch": 0.43, "grad_norm": 4.543829508747149, "learning_rate": 1.4099638051412745e-05, "loss": 1.1802, "step": 410 }, { "epoch": 0.43, "grad_norm": 3.2565990410366017, "learning_rate": 1.3932171921497483e-05, "loss": 1.1866, "step": 415 }, { "epoch": 0.44, "grad_norm": 3.5069328829068236, "learning_rate": 1.3763390948813897e-05, "loss": 1.1622, "step": 420 }, { "epoch": 0.45, "grad_norm": 4.3809314070147165, "learning_rate": 1.3593351570477608e-05, "loss": 1.1941, "step": 425 }, { "epoch": 0.45, "grad_norm": 4.910200688628565, "learning_rate": 1.3422110644390911e-05, "loss": 1.1709, "step": 430 }, { "epoch": 0.46, "grad_norm": 4.302365264776221, "learning_rate": 1.3249725430230595e-05, "loss": 1.1739, "step": 435 }, { "epoch": 0.46, "grad_norm": 4.246681366222887, "learning_rate": 1.3076253570301409e-05, "loss": 1.1603, "step": 440 }, { "epoch": 0.47, "grad_norm": 3.280421565698981, "learning_rate": 1.2901753070261565e-05, "loss": 1.186, "step": 445 }, { "epoch": 0.47, "grad_norm": 3.81317992852912, "learning_rate": 1.2726282279726788e-05, "loss": 1.1658, "step": 450 }, { "epoch": 0.48, "grad_norm": 3.8610620659207617, "learning_rate": 1.2549899872759288e-05, "loss": 1.1825, "step": 455 }, { "epoch": 0.48, "grad_norm": 6.4010239256735115, "learning_rate": 1.237266482824832e-05, "loss": 1.1496, "step": 460 }, { "epoch": 0.49, "grad_norm": 8.49710325570422, "learning_rate": 1.2194636410188748e-05, "loss": 1.173, "step": 465 }, { "epoch": 0.49, "grad_norm": 3.6548860275475397, "learning_rate": 1.2015874147864314e-05, "loss": 1.1591, "step": 470 }, { "epoch": 0.5, "grad_norm": 3.5679037646309197, "learning_rate": 1.183643781594219e-05, "loss": 1.1691, "step": 475 }, { "epoch": 0.5, "grad_norm": 4.749061818607719, "learning_rate": 1.165638741448548e-05, "loss": 1.1716, "step": 480 }, { "epoch": 0.51, "grad_norm": 3.8240982671284898, "learning_rate": 1.147578314889033e-05, "loss": 1.1539, "step": 485 }, { "epoch": 0.51, "grad_norm": 2.9542808971197947, "learning_rate": 1.1294685409754434e-05, "loss": 1.159, "step": 490 }, { "epoch": 0.52, "grad_norm": 3.1350411827108906, "learning_rate": 1.1113154752683548e-05, "loss": 1.2067, "step": 495 }, { "epoch": 0.52, "grad_norm": 2.8905318360977628, "learning_rate": 1.0931251878042882e-05, "loss": 1.1769, "step": 500 }, { "epoch": 0.53, "grad_norm": 3.0053111002447745, "learning_rate": 1.0749037610660041e-05, "loss": 1.1723, "step": 505 }, { "epoch": 0.53, "grad_norm": 3.3485613915497026, "learning_rate": 1.0566572879486388e-05, "loss": 1.1653, "step": 510 }, { "epoch": 0.54, "grad_norm": 4.082048046997961, "learning_rate": 1.0383918697223564e-05, "loss": 1.1785, "step": 515 }, { "epoch": 0.54, "grad_norm": 2.8712520171978, "learning_rate": 1.020113613992203e-05, "loss": 1.1746, "step": 520 }, { "epoch": 0.55, "grad_norm": 3.26766401286956, "learning_rate": 1.001828632655837e-05, "loss": 1.1372, "step": 525 }, { "epoch": 0.55, "grad_norm": 3.0082362180315343, "learning_rate": 9.835430398598319e-06, "loss": 1.1699, "step": 530 }, { "epoch": 0.56, "grad_norm": 3.6001344186192736, "learning_rate": 9.652629499552216e-06, "loss": 1.187, "step": 535 }, { "epoch": 0.57, "grad_norm": 4.455098713036843, "learning_rate": 9.469944754529784e-06, "loss": 1.1526, "step": 540 }, { "epoch": 0.57, "grad_norm": 4.506173198506875, "learning_rate": 9.28743724980107e-06, "loss": 1.1593, "step": 545 }, { "epoch": 0.58, "grad_norm": 3.492647004795416, "learning_rate": 9.105168012370372e-06, "loss": 1.1407, "step": 550 }, { "epoch": 0.58, "grad_norm": 3.070429402307121, "learning_rate": 8.923197989569981e-06, "loss": 1.1662, "step": 555 }, { "epoch": 0.59, "grad_norm": 3.7831387429713894, "learning_rate": 8.741588028680566e-06, "loss": 1.1552, "step": 560 }, { "epoch": 0.59, "grad_norm": 3.5233461063094142, "learning_rate": 8.560398856585002e-06, "loss": 1.165, "step": 565 }, { "epoch": 0.6, "grad_norm": 3.079554638580303, "learning_rate": 8.379691059462478e-06, "loss": 1.1741, "step": 570 }, { "epoch": 0.6, "grad_norm": 3.367550043194744, "learning_rate": 8.199525062529626e-06, "loss": 1.1572, "step": 575 }, { "epoch": 0.61, "grad_norm": 3.442018148642513, "learning_rate": 8.01996110983552e-06, "loss": 1.1591, "step": 580 }, { "epoch": 0.61, "grad_norm": 3.0524742405717142, "learning_rate": 7.841059244117189e-06, "loss": 1.1678, "step": 585 }, { "epoch": 0.62, "grad_norm": 2.747340823312245, "learning_rate": 7.662879286722496e-06, "loss": 1.1598, "step": 590 }, { "epoch": 0.62, "grad_norm": 2.9472846603989944, "learning_rate": 7.485480817607031e-06, "loss": 1.1753, "step": 595 }, { "epoch": 0.63, "grad_norm": 3.9279530098870943, "learning_rate": 7.30892315541171e-06, "loss": 1.1462, "step": 600 }, { "epoch": 0.63, "grad_norm": 3.056461777781048, "learning_rate": 7.133265337627757e-06, "loss": 1.1319, "step": 605 }, { "epoch": 0.64, "grad_norm": 2.5655491221643256, "learning_rate": 6.958566100855716e-06, "loss": 1.1469, "step": 610 }, { "epoch": 0.64, "grad_norm": 4.033014172979536, "learning_rate": 6.78488386116505e-06, "loss": 1.1522, "step": 615 }, { "epoch": 0.65, "grad_norm": 3.2636560884902512, "learning_rate": 6.612276694560927e-06, "loss": 1.1653, "step": 620 }, { "epoch": 0.65, "grad_norm": 3.099825834412963, "learning_rate": 6.44080231756473e-06, "loss": 1.1695, "step": 625 }, { "epoch": 0.66, "grad_norm": 3.5276354474143465, "learning_rate": 6.2705180679147455e-06, "loss": 1.1586, "step": 630 }, { "epoch": 0.66, "grad_norm": 2.8120221788177417, "learning_rate": 6.101480885393537e-06, "loss": 1.1735, "step": 635 }, { "epoch": 0.67, "grad_norm": 2.7317060856823705, "learning_rate": 5.933747292788369e-06, "loss": 1.1601, "step": 640 }, { "epoch": 0.68, "grad_norm": 2.8630486147702414, "learning_rate": 5.767373376991082e-06, "loss": 1.1548, "step": 645 }, { "epoch": 0.68, "grad_norm": 3.044565653563942, "learning_rate": 5.602414770243698e-06, "loss": 1.1431, "step": 650 }, { "epoch": 0.69, "grad_norm": 2.784721573945372, "learning_rate": 5.438926631536087e-06, "loss": 1.1562, "step": 655 }, { "epoch": 0.69, "grad_norm": 2.9182503829263386, "learning_rate": 5.276963628161833e-06, "loss": 1.1501, "step": 660 }, { "epoch": 0.7, "grad_norm": 3.0148200179539995, "learning_rate": 5.116579917438564e-06, "loss": 1.1599, "step": 665 }, { "epoch": 0.7, "grad_norm": 2.779138701850297, "learning_rate": 4.957829128598781e-06, "loss": 1.1407, "step": 670 }, { "epoch": 0.71, "grad_norm": 2.8364467956103447, "learning_rate": 4.80076434485727e-06, "loss": 1.1632, "step": 675 }, { "epoch": 0.71, "grad_norm": 2.9955667796891445, "learning_rate": 4.645438085661085e-06, "loss": 1.1653, "step": 680 }, { "epoch": 0.72, "grad_norm": 3.2640655907001874, "learning_rate": 4.4919022891280725e-06, "loss": 1.1526, "step": 685 }, { "epoch": 0.72, "grad_norm": 3.2789098333817797, "learning_rate": 4.340208294679745e-06, "loss": 1.1529, "step": 690 }, { "epoch": 0.73, "grad_norm": 2.755334882067621, "learning_rate": 4.190406825874377e-06, "loss": 1.1461, "step": 695 }, { "epoch": 0.73, "grad_norm": 3.3859515382051506, "learning_rate": 4.042547973446017e-06, "loss": 1.136, "step": 700 }, { "epoch": 0.74, "grad_norm": 2.7768674529680197, "learning_rate": 3.896681178555099e-06, "loss": 1.1494, "step": 705 }, { "epoch": 0.74, "grad_norm": 2.7160887414907453, "learning_rate": 3.7528552162562858e-06, "loss": 1.1435, "step": 710 }, { "epoch": 0.75, "grad_norm": 3.445482448946737, "learning_rate": 3.6111181791890184e-06, "loss": 1.1518, "step": 715 }, { "epoch": 0.75, "grad_norm": 3.0678976022117186, "learning_rate": 3.471517461496253e-06, "loss": 1.1191, "step": 720 }, { "epoch": 0.76, "grad_norm": 2.9920996388866588, "learning_rate": 3.3340997429767786e-06, "loss": 1.1509, "step": 725 }, { "epoch": 0.76, "grad_norm": 3.5189782505268377, "learning_rate": 3.1989109734763936e-06, "loss": 1.1447, "step": 730 }, { "epoch": 0.77, "grad_norm": 4.465010089218058, "learning_rate": 3.0659963575231544e-06, "loss": 1.1384, "step": 735 }, { "epoch": 0.77, "grad_norm": 2.8116282143136586, "learning_rate": 2.935400339211841e-06, "loss": 1.1448, "step": 740 }, { "epoch": 0.78, "grad_norm": 2.821536237846906, "learning_rate": 2.8071665873427244e-06, "loss": 1.1529, "step": 745 }, { "epoch": 0.79, "grad_norm": 2.616142918937676, "learning_rate": 2.681337980819536e-06, "loss": 1.1276, "step": 750 }, { "epoch": 0.79, "grad_norm": 2.648221961084194, "learning_rate": 2.5579565943116092e-06, "loss": 1.1511, "step": 755 }, { "epoch": 0.8, "grad_norm": 4.305338632641424, "learning_rate": 2.437063684184893e-06, "loss": 1.139, "step": 760 }, { "epoch": 0.8, "grad_norm": 3.820239172457965, "learning_rate": 2.318699674706639e-06, "loss": 1.1305, "step": 765 }, { "epoch": 0.81, "grad_norm": 2.813379017123292, "learning_rate": 2.202904144528295e-06, "loss": 1.1465, "step": 770 }, { "epoch": 0.81, "grad_norm": 3.5727532652839655, "learning_rate": 2.08971581345115e-06, "loss": 1.1392, "step": 775 }, { "epoch": 0.82, "grad_norm": 3.2395630905470822, "learning_rate": 1.979172529479193e-06, "loss": 1.1545, "step": 780 }, { "epoch": 0.82, "grad_norm": 2.6177906399433017, "learning_rate": 1.8713112561634671e-06, "loss": 1.1372, "step": 785 }, { "epoch": 0.83, "grad_norm": 2.523775559650496, "learning_rate": 1.7661680602421594e-06, "loss": 1.1374, "step": 790 }, { "epoch": 0.83, "grad_norm": 2.8009704991006927, "learning_rate": 1.663778099580583e-06, "loss": 1.1272, "step": 795 }, { "epoch": 0.84, "grad_norm": 2.4906631285299548, "learning_rate": 1.5641756114150552e-06, "loss": 1.1294, "step": 800 }, { "epoch": 0.84, "grad_norm": 3.1402684266522565, "learning_rate": 1.4673939009046268e-06, "loss": 1.1361, "step": 805 }, { "epoch": 0.85, "grad_norm": 2.8346706660144414, "learning_rate": 1.3734653299944834e-06, "loss": 1.1416, "step": 810 }, { "epoch": 0.85, "grad_norm": 2.656713151507134, "learning_rate": 1.2824213065947232e-06, "loss": 1.1123, "step": 815 }, { "epoch": 0.86, "grad_norm": 2.6781138539963876, "learning_rate": 1.194292274078156e-06, "loss": 1.1428, "step": 820 }, { "epoch": 0.86, "grad_norm": 2.683665650227597, "learning_rate": 1.1091077011006302e-06, "loss": 1.1546, "step": 825 }, { "epoch": 0.87, "grad_norm": 2.6042088842327704, "learning_rate": 1.0268960717472742e-06, "loss": 1.1501, "step": 830 }, { "epoch": 0.87, "grad_norm": 2.5487361412205423, "learning_rate": 9.476848760079671e-07, "loss": 1.1409, "step": 835 }, { "epoch": 0.88, "grad_norm": 2.5405778981950227, "learning_rate": 8.715006005852144e-07, "loss": 1.1482, "step": 840 }, { "epoch": 0.88, "grad_norm": 2.384660533885737, "learning_rate": 7.983687200375046e-07, "loss": 1.1196, "step": 845 }, { "epoch": 0.89, "grad_norm": 3.7475252922006828, "learning_rate": 7.283136882611063e-07, "loss": 1.1417, "step": 850 }, { "epoch": 0.9, "grad_norm": 2.832637666387188, "learning_rate": 6.613589303131506e-07, "loss": 1.1508, "step": 855 }, { "epoch": 0.9, "grad_norm": 2.7620043818781497, "learning_rate": 5.975268345787455e-07, "loss": 1.1787, "step": 860 }, { "epoch": 0.91, "grad_norm": 2.548067253837801, "learning_rate": 5.368387452847312e-07, "loss": 1.1385, "step": 865 }, { "epoch": 0.91, "grad_norm": 2.6676850416420477, "learning_rate": 4.793149553625786e-07, "loss": 1.1464, "step": 870 }, { "epoch": 0.92, "grad_norm": 2.813724959820899, "learning_rate": 4.2497469966282125e-07, "loss": 1.1306, "step": 875 }, { "epoch": 0.92, "grad_norm": 2.48368594730713, "learning_rate": 3.738361485232922e-07, "loss": 1.1174, "step": 880 }, { "epoch": 0.93, "grad_norm": 2.4430892492916625, "learning_rate": 3.2591640169331697e-07, "loss": 1.1436, "step": 885 }, { "epoch": 0.93, "grad_norm": 2.857473623835687, "learning_rate": 2.8123148261587465e-07, "loss": 1.1365, "step": 890 }, { "epoch": 0.94, "grad_norm": 2.5325081829773244, "learning_rate": 2.397963330696751e-07, "loss": 1.1367, "step": 895 }, { "epoch": 0.94, "grad_norm": 2.5616316720827585, "learning_rate": 2.0162480817291442e-07, "loss": 1.1283, "step": 900 }, { "epoch": 0.95, "grad_norm": 2.427762885310401, "learning_rate": 1.6672967175038634e-07, "loss": 1.1387, "step": 905 }, { "epoch": 0.95, "grad_norm": 2.621680915045004, "learning_rate": 1.3512259206550748e-07, "loss": 1.1372, "step": 910 }, { "epoch": 0.96, "grad_norm": 2.485476210774143, "learning_rate": 1.0681413791867157e-07, "loss": 1.1432, "step": 915 }, { "epoch": 0.96, "grad_norm": 2.4391825872607704, "learning_rate": 8.181377511324306e-08, "loss": 1.1405, "step": 920 }, { "epoch": 0.97, "grad_norm": 2.7849881804956573, "learning_rate": 6.012986329038462e-08, "loss": 1.1186, "step": 925 }, { "epoch": 0.97, "grad_norm": 2.401471198249117, "learning_rate": 4.1769653133743036e-08, "loss": 1.1486, "step": 930 }, { "epoch": 0.98, "grad_norm": 2.416110237940305, "learning_rate": 2.673928394496206e-08, "loss": 1.1432, "step": 935 }, { "epoch": 0.98, "grad_norm": 2.4944820214028582, "learning_rate": 1.5043781590823313e-08, "loss": 1.1578, "step": 940 }, { "epoch": 0.99, "grad_norm": 2.41912854884359, "learning_rate": 6.687056822688442e-09, "loss": 1.1183, "step": 945 }, { "epoch": 0.99, "grad_norm": 2.738704813702846, "learning_rate": 1.6719039688162242e-09, "loss": 1.1294, "step": 950 }, { "epoch": 1.0, "grad_norm": 2.4680956929047855, "learning_rate": 0.0, "loss": 1.1598, "step": 955 }, { "epoch": 1.0, "step": 955, "total_flos": 262883405463552.0, "train_loss": 1.3692507653960382, "train_runtime": 2783.1362, "train_samples_per_second": 43.907, "train_steps_per_second": 0.343 } ], "logging_steps": 5, "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 262883405463552.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }