{ "best_metric": null, "best_model_checkpoint": null, "epoch": 25.0, "global_step": 95050, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5e-09, "loss": 10.5513, "step": 1 }, { "epoch": 0.13, "learning_rate": 2.5e-06, "loss": 9.5944, "step": 500 }, { "epoch": 0.26, "learning_rate": 5e-06, "loss": 8.0091, "step": 1000 }, { "epoch": 0.39, "learning_rate": 7.5e-06, "loss": 6.96, "step": 1500 }, { "epoch": 0.53, "learning_rate": 1e-05, "loss": 6.6546, "step": 2000 }, { "epoch": 0.66, "learning_rate": 1.25e-05, "loss": 6.4887, "step": 2500 }, { "epoch": 0.79, "learning_rate": 1.5e-05, "loss": 6.3737, "step": 3000 }, { "epoch": 0.92, "learning_rate": 1.75e-05, "loss": 6.2809, "step": 3500 }, { "epoch": 1.05, "learning_rate": 2e-05, "loss": 6.211, "step": 4000 }, { "epoch": 1.18, "learning_rate": 2.25e-05, "loss": 6.1475, "step": 4500 }, { "epoch": 1.32, "learning_rate": 2.5e-05, "loss": 6.0923, "step": 5000 }, { "epoch": 1.45, "learning_rate": 2.7500000000000004e-05, "loss": 6.0436, "step": 5500 }, { "epoch": 1.58, "learning_rate": 3e-05, "loss": 6.0011, "step": 6000 }, { "epoch": 1.71, "learning_rate": 3.2495000000000007e-05, "loss": 5.9694, "step": 6500 }, { "epoch": 1.84, "learning_rate": 3.4995e-05, "loss": 5.9366, "step": 7000 }, { "epoch": 1.97, "learning_rate": 3.7495e-05, "loss": 5.9044, "step": 7500 }, { "epoch": 2.1, "learning_rate": 3.9995000000000006e-05, "loss": 5.8788, "step": 8000 }, { "epoch": 2.24, "learning_rate": 4.2495e-05, "loss": 5.8576, "step": 8500 }, { "epoch": 2.37, "learning_rate": 4.4995000000000005e-05, "loss": 5.8413, "step": 9000 }, { "epoch": 2.5, "learning_rate": 4.7495e-05, "loss": 5.818, "step": 9500 }, { "epoch": 2.63, "learning_rate": 4.9995000000000005e-05, "loss": 5.8055, "step": 10000 }, { "epoch": 2.76, "learning_rate": 4.998348557055865e-05, "loss": 5.7852, "step": 10500 }, { "epoch": 2.89, "learning_rate": 4.9966971141117293e-05, "loss": 5.768, "step": 11000 }, { "epoch": 3.02, "learning_rate": 4.995042361662696e-05, "loss": 5.7555, "step": 11500 }, { "epoch": 3.16, "learning_rate": 4.993387609213662e-05, "loss": 5.7434, "step": 12000 }, { "epoch": 3.29, "learning_rate": 4.991732856764628e-05, "loss": 5.7306, "step": 12500 }, { "epoch": 3.42, "learning_rate": 4.9900814138204925e-05, "loss": 5.7245, "step": 13000 }, { "epoch": 3.55, "learning_rate": 4.988426661371459e-05, "loss": 5.7136, "step": 13500 }, { "epoch": 3.68, "learning_rate": 4.986771908922425e-05, "loss": 5.7022, "step": 14000 }, { "epoch": 3.81, "learning_rate": 4.985117156473392e-05, "loss": 5.6951, "step": 14500 }, { "epoch": 3.95, "learning_rate": 4.983465713529256e-05, "loss": 5.6872, "step": 15000 }, { "epoch": 4.08, "learning_rate": 4.981810961080223e-05, "loss": 5.6745, "step": 15500 }, { "epoch": 4.21, "learning_rate": 4.9801562086311895e-05, "loss": 5.6695, "step": 16000 }, { "epoch": 4.34, "learning_rate": 4.9785014561821555e-05, "loss": 5.6663, "step": 16500 }, { "epoch": 4.47, "learning_rate": 4.9768500132380194e-05, "loss": 5.6611, "step": 17000 }, { "epoch": 4.6, "learning_rate": 4.975195260788986e-05, "loss": 5.6569, "step": 17500 }, { "epoch": 4.73, "learning_rate": 4.9735405083399527e-05, "loss": 5.6487, "step": 18000 }, { "epoch": 4.87, "learning_rate": 4.9718857558909186e-05, "loss": 5.6432, "step": 18500 }, { "epoch": 5.0, "learning_rate": 4.970231003441885e-05, "loss": 5.6399, "step": 19000 }, { "epoch": 5.13, "learning_rate": 4.96857956049775e-05, "loss": 5.6297, "step": 19500 }, { "epoch": 5.26, "learning_rate": 4.9669248080487165e-05, "loss": 5.625, "step": 20000 }, { "epoch": 5.39, "learning_rate": 4.9652700555996824e-05, "loss": 5.6231, "step": 20500 }, { "epoch": 5.52, "learning_rate": 4.963615303150649e-05, "loss": 5.6183, "step": 21000 }, { "epoch": 5.65, "learning_rate": 4.961963860206513e-05, "loss": 5.6134, "step": 21500 }, { "epoch": 5.79, "learning_rate": 4.9603091077574796e-05, "loss": 5.6107, "step": 22000 }, { "epoch": 5.92, "learning_rate": 4.958654355308446e-05, "loss": 5.6064, "step": 22500 }, { "epoch": 6.05, "learning_rate": 4.956999602859412e-05, "loss": 5.6008, "step": 23000 }, { "epoch": 6.18, "learning_rate": 4.955348159915277e-05, "loss": 5.5932, "step": 23500 }, { "epoch": 6.31, "learning_rate": 4.9536934074662434e-05, "loss": 5.5929, "step": 24000 }, { "epoch": 6.44, "learning_rate": 4.95203865501721e-05, "loss": 5.5891, "step": 24500 }, { "epoch": 6.58, "learning_rate": 4.9503872120730746e-05, "loss": 5.5828, "step": 25000 }, { "epoch": 6.71, "learning_rate": 4.9487324596240405e-05, "loss": 5.5846, "step": 25500 }, { "epoch": 6.84, "learning_rate": 4.947077707175007e-05, "loss": 5.581, "step": 26000 }, { "epoch": 6.97, "learning_rate": 4.945422954725973e-05, "loss": 5.5753, "step": 26500 }, { "epoch": 7.1, "learning_rate": 4.943768202276939e-05, "loss": 5.5719, "step": 27000 }, { "epoch": 7.23, "learning_rate": 4.942113449827906e-05, "loss": 5.5668, "step": 27500 }, { "epoch": 7.36, "learning_rate": 4.9404586973788724e-05, "loss": 5.564, "step": 28000 }, { "epoch": 7.5, "learning_rate": 4.938807254434737e-05, "loss": 5.5632, "step": 28500 }, { "epoch": 7.63, "learning_rate": 4.9371525019857036e-05, "loss": 5.3943, "step": 29000 }, { "epoch": 7.76, "learning_rate": 4.9354977495366695e-05, "loss": 5.1613, "step": 29500 }, { "epoch": 7.89, "learning_rate": 4.933842997087636e-05, "loss": 4.9594, "step": 30000 }, { "epoch": 8.02, "learning_rate": 4.932188244638603e-05, "loss": 4.7739, "step": 30500 }, { "epoch": 8.15, "learning_rate": 4.930533492189569e-05, "loss": 4.6014, "step": 31000 }, { "epoch": 8.29, "learning_rate": 4.928878739740535e-05, "loss": 4.4413, "step": 31500 }, { "epoch": 8.42, "learning_rate": 4.9272239872915014e-05, "loss": 4.2832, "step": 32000 }, { "epoch": 8.55, "learning_rate": 4.925572544347366e-05, "loss": 4.1308, "step": 32500 }, { "epoch": 8.68, "learning_rate": 4.923917791898332e-05, "loss": 3.9864, "step": 33000 }, { "epoch": 8.81, "learning_rate": 4.9222630394492985e-05, "loss": 3.8488, "step": 33500 }, { "epoch": 8.94, "learning_rate": 4.920608287000265e-05, "loss": 3.699, "step": 34000 }, { "epoch": 9.07, "learning_rate": 4.918953534551232e-05, "loss": 3.5521, "step": 34500 }, { "epoch": 9.21, "learning_rate": 4.917302091607096e-05, "loss": 3.3783, "step": 35000 }, { "epoch": 9.34, "learning_rate": 4.915647339158062e-05, "loss": 2.8694, "step": 35500 }, { "epoch": 9.47, "learning_rate": 4.913992586709028e-05, "loss": 2.3898, "step": 36000 }, { "epoch": 9.6, "learning_rate": 4.912337834259995e-05, "loss": 2.1786, "step": 36500 }, { "epoch": 9.73, "learning_rate": 4.9106863913158595e-05, "loss": 2.0476, "step": 37000 }, { "epoch": 9.86, "learning_rate": 4.9090316388668254e-05, "loss": 1.9528, "step": 37500 }, { "epoch": 9.99, "learning_rate": 4.90738019592269e-05, "loss": 1.8765, "step": 38000 }, { "epoch": 10.13, "learning_rate": 4.9057254434736566e-05, "loss": 1.8172, "step": 38500 }, { "epoch": 10.26, "learning_rate": 4.904070691024623e-05, "loss": 1.7658, "step": 39000 }, { "epoch": 10.39, "learning_rate": 4.902415938575589e-05, "loss": 1.7213, "step": 39500 }, { "epoch": 10.52, "learning_rate": 4.900764495631454e-05, "loss": 1.6845, "step": 40000 }, { "epoch": 10.65, "learning_rate": 4.8991097431824204e-05, "loss": 1.6475, "step": 40500 }, { "epoch": 10.78, "learning_rate": 4.8974549907333864e-05, "loss": 1.6163, "step": 41000 }, { "epoch": 10.92, "learning_rate": 4.8958002382843524e-05, "loss": 1.5924, "step": 41500 }, { "epoch": 11.05, "learning_rate": 4.894148795340217e-05, "loss": 1.5644, "step": 42000 }, { "epoch": 11.18, "learning_rate": 4.8924940428911836e-05, "loss": 1.5381, "step": 42500 }, { "epoch": 11.31, "learning_rate": 4.89083929044215e-05, "loss": 1.5176, "step": 43000 }, { "epoch": 11.44, "learning_rate": 4.889184537993117e-05, "loss": 1.4933, "step": 43500 }, { "epoch": 11.57, "learning_rate": 4.887529785544083e-05, "loss": 1.4755, "step": 44000 }, { "epoch": 11.7, "learning_rate": 4.8858783425999473e-05, "loss": 1.4564, "step": 44500 }, { "epoch": 11.84, "learning_rate": 4.884223590150914e-05, "loss": 1.4382, "step": 45000 }, { "epoch": 11.97, "learning_rate": 4.88256883770188e-05, "loss": 1.4251, "step": 45500 }, { "epoch": 12.1, "learning_rate": 4.880914085252846e-05, "loss": 1.4069, "step": 46000 }, { "epoch": 12.23, "learning_rate": 4.8792593328038126e-05, "loss": 1.3901, "step": 46500 }, { "epoch": 12.36, "learning_rate": 4.877604580354779e-05, "loss": 1.3754, "step": 47000 }, { "epoch": 12.49, "learning_rate": 4.875949827905745e-05, "loss": 1.3633, "step": 47500 }, { "epoch": 12.62, "learning_rate": 4.874295075456712e-05, "loss": 1.3459, "step": 48000 }, { "epoch": 12.76, "learning_rate": 4.8726436325125763e-05, "loss": 1.3374, "step": 48500 }, { "epoch": 12.89, "learning_rate": 4.870992189568441e-05, "loss": 1.3237, "step": 49000 }, { "epoch": 13.02, "learning_rate": 4.8693374371194075e-05, "loss": 1.3117, "step": 49500 }, { "epoch": 13.15, "learning_rate": 4.8676826846703735e-05, "loss": 1.3009, "step": 50000 }, { "epoch": 13.28, "learning_rate": 4.8660279322213395e-05, "loss": 1.2906, "step": 50500 }, { "epoch": 13.41, "learning_rate": 4.864373179772306e-05, "loss": 1.2816, "step": 51000 }, { "epoch": 13.55, "learning_rate": 4.8627217368281707e-05, "loss": 1.2717, "step": 51500 }, { "epoch": 13.68, "learning_rate": 4.861066984379137e-05, "loss": 1.2648, "step": 52000 }, { "epoch": 13.81, "learning_rate": 4.859412231930103e-05, "loss": 1.2561, "step": 52500 }, { "epoch": 13.94, "learning_rate": 4.85775747948107e-05, "loss": 1.2473, "step": 53000 }, { "epoch": 14.07, "learning_rate": 4.8561027270320365e-05, "loss": 1.2351, "step": 53500 }, { "epoch": 14.2, "learning_rate": 4.854451284087901e-05, "loss": 1.2292, "step": 54000 }, { "epoch": 14.33, "learning_rate": 4.852796531638867e-05, "loss": 1.2221, "step": 54500 }, { "epoch": 14.47, "learning_rate": 4.851141779189834e-05, "loss": 1.2137, "step": 55000 }, { "epoch": 14.6, "learning_rate": 4.8494870267407997e-05, "loss": 1.2065, "step": 55500 }, { "epoch": 14.73, "learning_rate": 4.847832274291766e-05, "loss": 1.1981, "step": 56000 }, { "epoch": 14.86, "learning_rate": 4.846180831347631e-05, "loss": 1.1931, "step": 56500 }, { "epoch": 14.99, "learning_rate": 4.844526078898597e-05, "loss": 1.1876, "step": 57000 }, { "epoch": 15.12, "learning_rate": 4.8428713264495635e-05, "loss": 1.1777, "step": 57500 }, { "epoch": 15.26, "learning_rate": 4.84121657400053e-05, "loss": 1.1734, "step": 58000 }, { "epoch": 15.39, "learning_rate": 4.8395651310563947e-05, "loss": 1.1639, "step": 58500 }, { "epoch": 15.52, "learning_rate": 4.8379103786073606e-05, "loss": 1.1619, "step": 59000 }, { "epoch": 15.65, "learning_rate": 4.836255626158327e-05, "loss": 1.1534, "step": 59500 }, { "epoch": 15.78, "learning_rate": 4.834600873709293e-05, "loss": 1.1484, "step": 60000 }, { "epoch": 15.91, "learning_rate": 4.832946121260259e-05, "loss": 1.1453, "step": 60500 }, { "epoch": 16.04, "learning_rate": 4.831291368811226e-05, "loss": 1.1395, "step": 61000 }, { "epoch": 16.18, "learning_rate": 4.8296366163621925e-05, "loss": 1.1322, "step": 61500 }, { "epoch": 16.31, "learning_rate": 4.827985173418057e-05, "loss": 1.1269, "step": 62000 }, { "epoch": 16.44, "learning_rate": 4.826330420969023e-05, "loss": 1.1231, "step": 62500 }, { "epoch": 16.57, "learning_rate": 4.8246756685199896e-05, "loss": 1.1167, "step": 63000 }, { "epoch": 16.7, "learning_rate": 4.823020916070956e-05, "loss": 1.1136, "step": 63500 }, { "epoch": 16.83, "learning_rate": 4.821366163621923e-05, "loss": 1.107, "step": 64000 }, { "epoch": 16.96, "learning_rate": 4.819714720677787e-05, "loss": 1.1027, "step": 64500 }, { "epoch": 17.1, "learning_rate": 4.818059968228753e-05, "loss": 1.0964, "step": 65000 }, { "epoch": 17.23, "learning_rate": 4.8164052157797194e-05, "loss": 1.0933, "step": 65500 }, { "epoch": 17.36, "learning_rate": 4.814750463330686e-05, "loss": 1.088, "step": 66000 }, { "epoch": 17.49, "learning_rate": 4.813095710881652e-05, "loss": 1.0861, "step": 66500 }, { "epoch": 17.62, "learning_rate": 4.8114442679375165e-05, "loss": 1.0817, "step": 67000 }, { "epoch": 17.75, "learning_rate": 4.809789515488483e-05, "loss": 1.0783, "step": 67500 }, { "epoch": 17.89, "learning_rate": 4.80813476303945e-05, "loss": 1.075, "step": 68000 }, { "epoch": 18.02, "learning_rate": 4.8064833200953144e-05, "loss": 1.0672, "step": 68500 }, { "epoch": 18.15, "learning_rate": 4.80482856764628e-05, "loss": 1.0647, "step": 69000 }, { "epoch": 18.28, "learning_rate": 4.803173815197247e-05, "loss": 1.0618, "step": 69500 }, { "epoch": 18.41, "learning_rate": 4.801519062748213e-05, "loss": 1.057, "step": 70000 }, { "epoch": 18.54, "learning_rate": 4.7998643102991796e-05, "loss": 1.0528, "step": 70500 }, { "epoch": 18.67, "learning_rate": 4.7982095578501455e-05, "loss": 1.0503, "step": 71000 }, { "epoch": 18.81, "learning_rate": 4.79655811490601e-05, "loss": 1.0455, "step": 71500 }, { "epoch": 18.94, "learning_rate": 4.794903362456977e-05, "loss": 1.0429, "step": 72000 }, { "epoch": 19.07, "learning_rate": 4.7932486100079434e-05, "loss": 1.0371, "step": 72500 }, { "epoch": 19.2, "learning_rate": 4.791593857558909e-05, "loss": 1.0341, "step": 73000 }, { "epoch": 19.33, "learning_rate": 4.789939105109876e-05, "loss": 1.0291, "step": 73500 }, { "epoch": 19.46, "learning_rate": 4.7882876621657405e-05, "loss": 1.0261, "step": 74000 }, { "epoch": 19.59, "learning_rate": 4.7866329097167065e-05, "loss": 1.0241, "step": 74500 }, { "epoch": 19.73, "learning_rate": 4.7849781572676724e-05, "loss": 1.02, "step": 75000 }, { "epoch": 19.86, "learning_rate": 4.783323404818639e-05, "loss": 1.018, "step": 75500 }, { "epoch": 19.99, "learning_rate": 4.781668652369606e-05, "loss": 1.0154, "step": 76000 }, { "epoch": 20.12, "learning_rate": 4.7800138999205724e-05, "loss": 1.0116, "step": 76500 }, { "epoch": 20.25, "learning_rate": 4.778359147471538e-05, "loss": 1.0065, "step": 77000 }, { "epoch": 20.38, "learning_rate": 4.776704395022505e-05, "loss": 1.005, "step": 77500 }, { "epoch": 20.52, "learning_rate": 4.7750529520783695e-05, "loss": 1.0025, "step": 78000 }, { "epoch": 20.65, "learning_rate": 4.773398199629336e-05, "loss": 1.0008, "step": 78500 }, { "epoch": 20.78, "learning_rate": 4.771743447180302e-05, "loss": 0.9965, "step": 79000 }, { "epoch": 20.91, "learning_rate": 4.770088694731268e-05, "loss": 0.9943, "step": 79500 }, { "epoch": 21.04, "learning_rate": 4.7684372517871326e-05, "loss": 0.9907, "step": 80000 }, { "epoch": 21.17, "learning_rate": 4.766782499338099e-05, "loss": 0.9882, "step": 80500 }, { "epoch": 21.3, "learning_rate": 4.765127746889065e-05, "loss": 0.9861, "step": 81000 }, { "epoch": 21.44, "learning_rate": 4.763472994440032e-05, "loss": 0.9809, "step": 81500 }, { "epoch": 21.57, "learning_rate": 4.7618215514958964e-05, "loss": 0.9793, "step": 82000 }, { "epoch": 21.7, "learning_rate": 4.760166799046863e-05, "loss": 0.9778, "step": 82500 }, { "epoch": 21.83, "learning_rate": 4.7585153561027276e-05, "loss": 0.9756, "step": 83000 }, { "epoch": 21.96, "learning_rate": 4.7568606036536936e-05, "loss": 0.9732, "step": 83500 }, { "epoch": 22.09, "learning_rate": 4.7552058512046596e-05, "loss": 0.97, "step": 84000 }, { "epoch": 22.23, "learning_rate": 4.753551098755626e-05, "loss": 0.9687, "step": 84500 }, { "epoch": 22.36, "learning_rate": 4.751899655811491e-05, "loss": 0.965, "step": 85000 }, { "epoch": 22.49, "learning_rate": 4.7502449033624574e-05, "loss": 0.9627, "step": 85500 }, { "epoch": 22.62, "learning_rate": 4.7485901509134233e-05, "loss": 0.9628, "step": 86000 }, { "epoch": 22.75, "learning_rate": 4.74693539846439e-05, "loss": 0.9605, "step": 86500 }, { "epoch": 22.88, "learning_rate": 4.7452806460153566e-05, "loss": 0.9565, "step": 87000 }, { "epoch": 23.01, "learning_rate": 4.7436258935663226e-05, "loss": 0.9553, "step": 87500 }, { "epoch": 23.15, "learning_rate": 4.741971141117289e-05, "loss": 0.9518, "step": 88000 }, { "epoch": 23.28, "learning_rate": 4.740316388668256e-05, "loss": 0.9504, "step": 88500 }, { "epoch": 23.41, "learning_rate": 4.73866494572412e-05, "loss": 0.9482, "step": 89000 }, { "epoch": 23.54, "learning_rate": 4.7370101932750864e-05, "loss": 0.946, "step": 89500 }, { "epoch": 23.67, "learning_rate": 4.7353554408260523e-05, "loss": 0.9442, "step": 90000 }, { "epoch": 23.8, "learning_rate": 4.733700688377019e-05, "loss": 0.9428, "step": 90500 }, { "epoch": 23.93, "learning_rate": 4.7320492454328835e-05, "loss": 0.9421, "step": 91000 }, { "epoch": 24.07, "learning_rate": 4.73039449298385e-05, "loss": 0.9389, "step": 91500 }, { "epoch": 24.2, "learning_rate": 4.728739740534816e-05, "loss": 0.9374, "step": 92000 }, { "epoch": 24.33, "learning_rate": 4.727084988085783e-05, "loss": 0.9337, "step": 92500 }, { "epoch": 24.46, "learning_rate": 4.7254302356367494e-05, "loss": 0.9332, "step": 93000 }, { "epoch": 24.59, "learning_rate": 4.7237754831877154e-05, "loss": 0.931, "step": 93500 }, { "epoch": 24.72, "learning_rate": 4.722124040243579e-05, "loss": 0.9301, "step": 94000 }, { "epoch": 24.86, "learning_rate": 4.720469287794546e-05, "loss": 0.9296, "step": 94500 }, { "epoch": 24.99, "learning_rate": 4.7188145353455125e-05, "loss": 0.926, "step": 95000 } ], "max_steps": 1520800, "num_train_epochs": 400, "total_flos": 2.561490423429831e+19, "trial_name": null, "trial_params": null }