{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9938916950546224, "eval_steps": 500, "global_step": 34000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06, "learning_rate": 1.4679976512037582e-06, "loss": 0.412, "step": 500 }, { "epoch": 0.06, "eval_loss": 0.06404077261686325, "eval_runtime": 119.6505, "eval_samples_per_second": 54.676, "eval_steps_per_second": 6.837, "step": 500 }, { "epoch": 0.12, "learning_rate": 2.9359953024075165e-06, "loss": 0.0572, "step": 1000 }, { "epoch": 0.12, "eval_loss": 0.033456169068813324, "eval_runtime": 119.4711, "eval_samples_per_second": 54.758, "eval_steps_per_second": 6.847, "step": 1000 }, { "epoch": 0.18, "learning_rate": 4.403992953611275e-06, "loss": 0.0392, "step": 1500 }, { "epoch": 0.18, "eval_loss": 0.027742423117160797, "eval_runtime": 119.6338, "eval_samples_per_second": 54.684, "eval_steps_per_second": 6.838, "step": 1500 }, { "epoch": 0.23, "learning_rate": 5.871990604815033e-06, "loss": 0.0339, "step": 2000 }, { "epoch": 0.23, "eval_loss": 0.024982118979096413, "eval_runtime": 119.4896, "eval_samples_per_second": 54.75, "eval_steps_per_second": 6.846, "step": 2000 }, { "epoch": 0.29, "learning_rate": 7.33998825601879e-06, "loss": 0.0321, "step": 2500 }, { "epoch": 0.29, "eval_loss": 0.02286355197429657, "eval_runtime": 119.7537, "eval_samples_per_second": 54.629, "eval_steps_per_second": 6.831, "step": 2500 }, { "epoch": 0.35, "learning_rate": 8.80798590722255e-06, "loss": 0.0286, "step": 3000 }, { "epoch": 0.35, "eval_loss": 0.02225133590400219, "eval_runtime": 115.351, "eval_samples_per_second": 56.714, "eval_steps_per_second": 7.091, "step": 3000 }, { "epoch": 0.41, "learning_rate": 9.969327155256804e-06, "loss": 0.0265, "step": 3500 }, { "epoch": 0.41, "eval_loss": 0.02050725743174553, "eval_runtime": 115.364, "eval_samples_per_second": 56.707, "eval_steps_per_second": 7.091, "step": 3500 }, { "epoch": 0.47, "learning_rate": 9.806173725771716e-06, "loss": 0.026, "step": 4000 }, { "epoch": 0.47, "eval_loss": 0.020657476037740707, "eval_runtime": 118.0643, "eval_samples_per_second": 55.41, "eval_steps_per_second": 6.928, "step": 4000 }, { "epoch": 0.53, "learning_rate": 9.64302029628663e-06, "loss": 0.0238, "step": 4500 }, { "epoch": 0.53, "eval_loss": 0.019942762330174446, "eval_runtime": 119.4625, "eval_samples_per_second": 54.762, "eval_steps_per_second": 6.847, "step": 4500 }, { "epoch": 0.59, "learning_rate": 9.479866866801542e-06, "loss": 0.0251, "step": 5000 }, { "epoch": 0.59, "eval_loss": 0.020365213975310326, "eval_runtime": 119.5062, "eval_samples_per_second": 54.742, "eval_steps_per_second": 6.845, "step": 5000 }, { "epoch": 0.65, "learning_rate": 9.316713437316454e-06, "loss": 0.0244, "step": 5500 }, { "epoch": 0.65, "eval_loss": 0.02077455259859562, "eval_runtime": 119.2739, "eval_samples_per_second": 54.849, "eval_steps_per_second": 6.858, "step": 5500 }, { "epoch": 0.7, "learning_rate": 9.153560007831366e-06, "loss": 0.0235, "step": 6000 }, { "epoch": 0.7, "eval_loss": 0.019606556743383408, "eval_runtime": 119.4635, "eval_samples_per_second": 54.761, "eval_steps_per_second": 6.847, "step": 6000 }, { "epoch": 0.76, "learning_rate": 8.990406578346278e-06, "loss": 0.0232, "step": 6500 }, { "epoch": 0.76, "eval_loss": 0.01969091035425663, "eval_runtime": 119.7331, "eval_samples_per_second": 54.638, "eval_steps_per_second": 6.832, "step": 6500 }, { "epoch": 0.82, "learning_rate": 8.82725314886119e-06, "loss": 0.0225, "step": 7000 }, { "epoch": 0.82, "eval_loss": 0.019132908433675766, "eval_runtime": 119.5514, "eval_samples_per_second": 54.721, "eval_steps_per_second": 6.842, "step": 7000 }, { "epoch": 0.88, "learning_rate": 8.664099719376103e-06, "loss": 0.0212, "step": 7500 }, { "epoch": 0.88, "eval_loss": 0.018609512597322464, "eval_runtime": 119.4541, "eval_samples_per_second": 54.766, "eval_steps_per_second": 6.848, "step": 7500 }, { "epoch": 0.94, "learning_rate": 8.500946289891015e-06, "loss": 0.0225, "step": 8000 }, { "epoch": 0.94, "eval_loss": 0.018011104315519333, "eval_runtime": 116.0368, "eval_samples_per_second": 56.379, "eval_steps_per_second": 7.049, "step": 8000 }, { "epoch": 1.0, "learning_rate": 8.337792860405927e-06, "loss": 0.0231, "step": 8500 }, { "epoch": 1.0, "eval_loss": 0.01797027327120304, "eval_runtime": 115.2093, "eval_samples_per_second": 56.784, "eval_steps_per_second": 7.1, "step": 8500 }, { "epoch": 1.06, "learning_rate": 8.174639430920839e-06, "loss": 0.018, "step": 9000 }, { "epoch": 1.06, "eval_loss": 0.018307719379663467, "eval_runtime": 116.9177, "eval_samples_per_second": 55.954, "eval_steps_per_second": 6.996, "step": 9000 }, { "epoch": 1.12, "learning_rate": 8.01148600143575e-06, "loss": 0.0178, "step": 9500 }, { "epoch": 1.12, "eval_loss": 0.018220532685518265, "eval_runtime": 119.4643, "eval_samples_per_second": 54.761, "eval_steps_per_second": 6.847, "step": 9500 }, { "epoch": 1.17, "learning_rate": 7.848332571950663e-06, "loss": 0.0173, "step": 10000 }, { "epoch": 1.17, "eval_loss": 0.018342604860663414, "eval_runtime": 119.7353, "eval_samples_per_second": 54.637, "eval_steps_per_second": 6.832, "step": 10000 }, { "epoch": 1.23, "learning_rate": 7.685179142465575e-06, "loss": 0.0176, "step": 10500 }, { "epoch": 1.23, "eval_loss": 0.01870131492614746, "eval_runtime": 119.9453, "eval_samples_per_second": 54.542, "eval_steps_per_second": 6.82, "step": 10500 }, { "epoch": 1.29, "learning_rate": 7.5220257129804875e-06, "loss": 0.0177, "step": 11000 }, { "epoch": 1.29, "eval_loss": 0.018137916922569275, "eval_runtime": 119.817, "eval_samples_per_second": 54.6, "eval_steps_per_second": 6.827, "step": 11000 }, { "epoch": 1.35, "learning_rate": 7.358872283495399e-06, "loss": 0.0171, "step": 11500 }, { "epoch": 1.35, "eval_loss": 0.018740132451057434, "eval_runtime": 119.4709, "eval_samples_per_second": 54.758, "eval_steps_per_second": 6.847, "step": 11500 }, { "epoch": 1.41, "learning_rate": 7.195718854010312e-06, "loss": 0.019, "step": 12000 }, { "epoch": 1.41, "eval_loss": 0.018057728186249733, "eval_runtime": 119.9707, "eval_samples_per_second": 54.53, "eval_steps_per_second": 6.818, "step": 12000 }, { "epoch": 1.47, "learning_rate": 7.032565424525224e-06, "loss": 0.0174, "step": 12500 }, { "epoch": 1.47, "eval_loss": 0.018135011196136475, "eval_runtime": 119.6311, "eval_samples_per_second": 54.685, "eval_steps_per_second": 6.838, "step": 12500 }, { "epoch": 1.53, "learning_rate": 6.869411995040136e-06, "loss": 0.0179, "step": 13000 }, { "epoch": 1.53, "eval_loss": 0.017942175269126892, "eval_runtime": 117.6519, "eval_samples_per_second": 55.605, "eval_steps_per_second": 6.953, "step": 13000 }, { "epoch": 1.59, "learning_rate": 6.706258565555048e-06, "loss": 0.0166, "step": 13500 }, { "epoch": 1.59, "eval_loss": 0.01796996220946312, "eval_runtime": 115.4295, "eval_samples_per_second": 56.675, "eval_steps_per_second": 7.087, "step": 13500 }, { "epoch": 1.64, "learning_rate": 6.543105136069961e-06, "loss": 0.0174, "step": 14000 }, { "epoch": 1.64, "eval_loss": 0.018622903153300285, "eval_runtime": 116.043, "eval_samples_per_second": 56.376, "eval_steps_per_second": 7.049, "step": 14000 }, { "epoch": 1.7, "learning_rate": 6.379951706584873e-06, "loss": 0.0162, "step": 14500 }, { "epoch": 1.7, "eval_loss": 0.017875785008072853, "eval_runtime": 119.775, "eval_samples_per_second": 54.619, "eval_steps_per_second": 6.829, "step": 14500 }, { "epoch": 1.76, "learning_rate": 6.216798277099785e-06, "loss": 0.0163, "step": 15000 }, { "epoch": 1.76, "eval_loss": 0.018203964456915855, "eval_runtime": 119.7603, "eval_samples_per_second": 54.626, "eval_steps_per_second": 6.83, "step": 15000 }, { "epoch": 1.82, "learning_rate": 6.0536448476146966e-06, "loss": 0.0168, "step": 15500 }, { "epoch": 1.82, "eval_loss": 0.017764363437891006, "eval_runtime": 119.5774, "eval_samples_per_second": 54.709, "eval_steps_per_second": 6.841, "step": 15500 }, { "epoch": 1.88, "learning_rate": 5.890491418129609e-06, "loss": 0.0178, "step": 16000 }, { "epoch": 1.88, "eval_loss": 0.017852840945124626, "eval_runtime": 119.5232, "eval_samples_per_second": 54.734, "eval_steps_per_second": 6.844, "step": 16000 }, { "epoch": 1.94, "learning_rate": 5.727337988644521e-06, "loss": 0.0168, "step": 16500 }, { "epoch": 1.94, "eval_loss": 0.017764879390597343, "eval_runtime": 119.6082, "eval_samples_per_second": 54.695, "eval_steps_per_second": 6.839, "step": 16500 }, { "epoch": 2.0, "learning_rate": 5.564184559159433e-06, "loss": 0.0168, "step": 17000 }, { "epoch": 2.0, "eval_loss": 0.017377818003296852, "eval_runtime": 119.6291, "eval_samples_per_second": 54.686, "eval_steps_per_second": 6.838, "step": 17000 }, { "epoch": 2.06, "learning_rate": 5.401031129674347e-06, "loss": 0.0143, "step": 17500 }, { "epoch": 2.06, "eval_loss": 0.017800554633140564, "eval_runtime": 119.9539, "eval_samples_per_second": 54.538, "eval_steps_per_second": 6.819, "step": 17500 }, { "epoch": 2.11, "learning_rate": 5.237877700189259e-06, "loss": 0.014, "step": 18000 }, { "epoch": 2.11, "eval_loss": 0.0179632306098938, "eval_runtime": 118.0782, "eval_samples_per_second": 55.404, "eval_steps_per_second": 6.928, "step": 18000 }, { "epoch": 2.17, "learning_rate": 5.074724270704171e-06, "loss": 0.0143, "step": 18500 }, { "epoch": 2.17, "eval_loss": 0.018571963533759117, "eval_runtime": 115.3792, "eval_samples_per_second": 56.7, "eval_steps_per_second": 7.09, "step": 18500 }, { "epoch": 2.23, "learning_rate": 4.911570841219083e-06, "loss": 0.0137, "step": 19000 }, { "epoch": 2.23, "eval_loss": 0.018732914701104164, "eval_runtime": 116.2594, "eval_samples_per_second": 56.271, "eval_steps_per_second": 7.036, "step": 19000 }, { "epoch": 2.29, "learning_rate": 4.748417411733995e-06, "loss": 0.0131, "step": 19500 }, { "epoch": 2.29, "eval_loss": 0.018157465383410454, "eval_runtime": 119.6325, "eval_samples_per_second": 54.684, "eval_steps_per_second": 6.838, "step": 19500 }, { "epoch": 2.35, "learning_rate": 4.585263982248907e-06, "loss": 0.0134, "step": 20000 }, { "epoch": 2.35, "eval_loss": 0.01858236826956272, "eval_runtime": 119.626, "eval_samples_per_second": 54.687, "eval_steps_per_second": 6.838, "step": 20000 }, { "epoch": 2.41, "learning_rate": 4.42211055276382e-06, "loss": 0.0131, "step": 20500 }, { "epoch": 2.41, "eval_loss": 0.01760929264128208, "eval_runtime": 119.8276, "eval_samples_per_second": 54.595, "eval_steps_per_second": 6.826, "step": 20500 }, { "epoch": 2.47, "learning_rate": 4.258957123278732e-06, "loss": 0.0138, "step": 21000 }, { "epoch": 2.47, "eval_loss": 0.01776733435690403, "eval_runtime": 119.5072, "eval_samples_per_second": 54.741, "eval_steps_per_second": 6.845, "step": 21000 }, { "epoch": 2.53, "learning_rate": 4.095803693793644e-06, "loss": 0.0131, "step": 21500 }, { "epoch": 2.53, "eval_loss": 0.018140822649002075, "eval_runtime": 119.8335, "eval_samples_per_second": 54.592, "eval_steps_per_second": 6.826, "step": 21500 }, { "epoch": 2.58, "learning_rate": 3.932650264308556e-06, "loss": 0.0139, "step": 22000 }, { "epoch": 2.58, "eval_loss": 0.018083902075886726, "eval_runtime": 120.1704, "eval_samples_per_second": 54.439, "eval_steps_per_second": 6.807, "step": 22000 }, { "epoch": 2.64, "learning_rate": 3.7694968348234683e-06, "loss": 0.0139, "step": 22500 }, { "epoch": 2.64, "eval_loss": 0.018096571788191795, "eval_runtime": 119.7812, "eval_samples_per_second": 54.616, "eval_steps_per_second": 6.829, "step": 22500 }, { "epoch": 2.7, "learning_rate": 3.6063434053383807e-06, "loss": 0.0133, "step": 23000 }, { "epoch": 2.7, "eval_loss": 0.01770329661667347, "eval_runtime": 118.2737, "eval_samples_per_second": 55.312, "eval_steps_per_second": 6.916, "step": 23000 }, { "epoch": 2.76, "learning_rate": 3.4431899758532926e-06, "loss": 0.0135, "step": 23500 }, { "epoch": 2.76, "eval_loss": 0.01808938756585121, "eval_runtime": 115.7874, "eval_samples_per_second": 56.5, "eval_steps_per_second": 7.065, "step": 23500 }, { "epoch": 2.82, "learning_rate": 3.280036546368205e-06, "loss": 0.0131, "step": 24000 }, { "epoch": 2.82, "eval_loss": 0.017787907272577286, "eval_runtime": 115.8567, "eval_samples_per_second": 56.466, "eval_steps_per_second": 7.06, "step": 24000 }, { "epoch": 2.88, "learning_rate": 3.116883116883117e-06, "loss": 0.0137, "step": 24500 }, { "epoch": 2.88, "eval_loss": 0.017733994871377945, "eval_runtime": 120.2603, "eval_samples_per_second": 54.399, "eval_steps_per_second": 6.802, "step": 24500 }, { "epoch": 2.94, "learning_rate": 2.9537296873980292e-06, "loss": 0.0133, "step": 25000 }, { "epoch": 2.94, "eval_loss": 0.017949102446436882, "eval_runtime": 119.971, "eval_samples_per_second": 54.53, "eval_steps_per_second": 6.818, "step": 25000 }, { "epoch": 3.0, "learning_rate": 2.7905762579129416e-06, "loss": 0.0136, "step": 25500 }, { "epoch": 3.0, "eval_loss": 0.017474107444286346, "eval_runtime": 119.96, "eval_samples_per_second": 54.535, "eval_steps_per_second": 6.819, "step": 25500 }, { "epoch": 3.05, "learning_rate": 2.6274228284278535e-06, "loss": 0.0124, "step": 26000 }, { "epoch": 3.05, "eval_loss": 0.018201593309640884, "eval_runtime": 119.9656, "eval_samples_per_second": 54.532, "eval_steps_per_second": 6.819, "step": 26000 }, { "epoch": 3.11, "learning_rate": 2.464269398942766e-06, "loss": 0.0121, "step": 26500 }, { "epoch": 3.11, "eval_loss": 0.01811986044049263, "eval_runtime": 119.7914, "eval_samples_per_second": 54.612, "eval_steps_per_second": 6.829, "step": 26500 }, { "epoch": 3.17, "learning_rate": 2.3011159694576783e-06, "loss": 0.012, "step": 27000 }, { "epoch": 3.17, "eval_loss": 0.018191542476415634, "eval_runtime": 119.8265, "eval_samples_per_second": 54.596, "eval_steps_per_second": 6.827, "step": 27000 }, { "epoch": 3.23, "learning_rate": 2.13796253997259e-06, "loss": 0.0115, "step": 27500 }, { "epoch": 3.23, "eval_loss": 0.018120231106877327, "eval_runtime": 119.6169, "eval_samples_per_second": 54.691, "eval_steps_per_second": 6.839, "step": 27500 }, { "epoch": 3.29, "learning_rate": 1.9748091104875025e-06, "loss": 0.0117, "step": 28000 }, { "epoch": 3.29, "eval_loss": 0.017889145761728287, "eval_runtime": 118.9939, "eval_samples_per_second": 54.978, "eval_steps_per_second": 6.874, "step": 28000 }, { "epoch": 3.35, "learning_rate": 1.811655681002415e-06, "loss": 0.0113, "step": 28500 }, { "epoch": 3.35, "eval_loss": 0.017741482704877853, "eval_runtime": 115.6814, "eval_samples_per_second": 56.552, "eval_steps_per_second": 7.071, "step": 28500 }, { "epoch": 3.41, "learning_rate": 1.648502251517327e-06, "loss": 0.0124, "step": 29000 }, { "epoch": 3.41, "eval_loss": 0.017794128507375717, "eval_runtime": 115.7328, "eval_samples_per_second": 56.527, "eval_steps_per_second": 7.068, "step": 29000 }, { "epoch": 3.47, "learning_rate": 1.4853488220322392e-06, "loss": 0.012, "step": 29500 }, { "epoch": 3.47, "eval_loss": 0.018301891162991524, "eval_runtime": 119.5898, "eval_samples_per_second": 54.704, "eval_steps_per_second": 6.84, "step": 29500 }, { "epoch": 3.52, "learning_rate": 1.3221953925471516e-06, "loss": 0.0119, "step": 30000 }, { "epoch": 3.52, "eval_loss": 0.01817336678504944, "eval_runtime": 120.0384, "eval_samples_per_second": 54.499, "eval_steps_per_second": 6.814, "step": 30000 }, { "epoch": 3.58, "learning_rate": 1.1590419630620637e-06, "loss": 0.0115, "step": 30500 }, { "epoch": 3.58, "eval_loss": 0.018085774034261703, "eval_runtime": 119.7931, "eval_samples_per_second": 54.611, "eval_steps_per_second": 6.828, "step": 30500 }, { "epoch": 3.64, "learning_rate": 9.958885335769758e-07, "loss": 0.012, "step": 31000 }, { "epoch": 3.64, "eval_loss": 0.017980104312300682, "eval_runtime": 119.8348, "eval_samples_per_second": 54.592, "eval_steps_per_second": 6.826, "step": 31000 }, { "epoch": 3.7, "learning_rate": 8.327351040918881e-07, "loss": 0.0116, "step": 31500 }, { "epoch": 3.7, "eval_loss": 0.0181511789560318, "eval_runtime": 119.9569, "eval_samples_per_second": 54.536, "eval_steps_per_second": 6.819, "step": 31500 }, { "epoch": 3.76, "learning_rate": 6.695816746068002e-07, "loss": 0.0108, "step": 32000 }, { "epoch": 3.76, "eval_loss": 0.018154002726078033, "eval_runtime": 119.8401, "eval_samples_per_second": 54.589, "eval_steps_per_second": 6.826, "step": 32000 }, { "epoch": 3.82, "learning_rate": 5.064282451217125e-07, "loss": 0.0118, "step": 32500 }, { "epoch": 3.82, "eval_loss": 0.01812034100294113, "eval_runtime": 120.1728, "eval_samples_per_second": 54.438, "eval_steps_per_second": 6.807, "step": 32500 }, { "epoch": 3.88, "learning_rate": 3.4327481563662475e-07, "loss": 0.0114, "step": 33000 }, { "epoch": 3.88, "eval_loss": 0.018135515972971916, "eval_runtime": 119.8212, "eval_samples_per_second": 54.598, "eval_steps_per_second": 6.827, "step": 33000 }, { "epoch": 3.94, "learning_rate": 1.801213861515369e-07, "loss": 0.0121, "step": 33500 }, { "epoch": 3.94, "eval_loss": 0.01807536743581295, "eval_runtime": 115.6623, "eval_samples_per_second": 56.561, "eval_steps_per_second": 7.072, "step": 33500 }, { "epoch": 3.99, "learning_rate": 1.6967956666449132e-08, "loss": 0.0112, "step": 34000 }, { "epoch": 3.99, "eval_loss": 0.01807805709540844, "eval_runtime": 115.5079, "eval_samples_per_second": 56.637, "eval_steps_per_second": 7.082, "step": 34000 } ], "logging_steps": 500, "max_steps": 34052, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "total_flos": 3.237991334295552e+16, "train_batch_size": 18, "trial_name": null, "trial_params": null }