|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.9938916950546224, |
|
"eval_steps": 500, |
|
"global_step": 34000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 1.4679976512037582e-06, |
|
"loss": 0.412, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 0.06404077261686325, |
|
"eval_runtime": 119.6505, |
|
"eval_samples_per_second": 54.676, |
|
"eval_steps_per_second": 6.837, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 2.9359953024075165e-06, |
|
"loss": 0.0572, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_loss": 0.033456169068813324, |
|
"eval_runtime": 119.4711, |
|
"eval_samples_per_second": 54.758, |
|
"eval_steps_per_second": 6.847, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 4.403992953611275e-06, |
|
"loss": 0.0392, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"eval_loss": 0.027742423117160797, |
|
"eval_runtime": 119.6338, |
|
"eval_samples_per_second": 54.684, |
|
"eval_steps_per_second": 6.838, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 5.871990604815033e-06, |
|
"loss": 0.0339, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"eval_loss": 0.024982118979096413, |
|
"eval_runtime": 119.4896, |
|
"eval_samples_per_second": 54.75, |
|
"eval_steps_per_second": 6.846, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 7.33998825601879e-06, |
|
"loss": 0.0321, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"eval_loss": 0.02286355197429657, |
|
"eval_runtime": 119.7537, |
|
"eval_samples_per_second": 54.629, |
|
"eval_steps_per_second": 6.831, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 8.80798590722255e-06, |
|
"loss": 0.0286, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"eval_loss": 0.02225133590400219, |
|
"eval_runtime": 115.351, |
|
"eval_samples_per_second": 56.714, |
|
"eval_steps_per_second": 7.091, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 9.969327155256804e-06, |
|
"loss": 0.0265, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"eval_loss": 0.02050725743174553, |
|
"eval_runtime": 115.364, |
|
"eval_samples_per_second": 56.707, |
|
"eval_steps_per_second": 7.091, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 9.806173725771716e-06, |
|
"loss": 0.026, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"eval_loss": 0.020657476037740707, |
|
"eval_runtime": 118.0643, |
|
"eval_samples_per_second": 55.41, |
|
"eval_steps_per_second": 6.928, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 9.64302029628663e-06, |
|
"loss": 0.0238, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_loss": 0.019942762330174446, |
|
"eval_runtime": 119.4625, |
|
"eval_samples_per_second": 54.762, |
|
"eval_steps_per_second": 6.847, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 9.479866866801542e-06, |
|
"loss": 0.0251, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"eval_loss": 0.020365213975310326, |
|
"eval_runtime": 119.5062, |
|
"eval_samples_per_second": 54.742, |
|
"eval_steps_per_second": 6.845, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 9.316713437316454e-06, |
|
"loss": 0.0244, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"eval_loss": 0.02077455259859562, |
|
"eval_runtime": 119.2739, |
|
"eval_samples_per_second": 54.849, |
|
"eval_steps_per_second": 6.858, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 9.153560007831366e-06, |
|
"loss": 0.0235, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"eval_loss": 0.019606556743383408, |
|
"eval_runtime": 119.4635, |
|
"eval_samples_per_second": 54.761, |
|
"eval_steps_per_second": 6.847, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 8.990406578346278e-06, |
|
"loss": 0.0232, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"eval_loss": 0.01969091035425663, |
|
"eval_runtime": 119.7331, |
|
"eval_samples_per_second": 54.638, |
|
"eval_steps_per_second": 6.832, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 8.82725314886119e-06, |
|
"loss": 0.0225, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"eval_loss": 0.019132908433675766, |
|
"eval_runtime": 119.5514, |
|
"eval_samples_per_second": 54.721, |
|
"eval_steps_per_second": 6.842, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 8.664099719376103e-06, |
|
"loss": 0.0212, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_loss": 0.018609512597322464, |
|
"eval_runtime": 119.4541, |
|
"eval_samples_per_second": 54.766, |
|
"eval_steps_per_second": 6.848, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 8.500946289891015e-06, |
|
"loss": 0.0225, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"eval_loss": 0.018011104315519333, |
|
"eval_runtime": 116.0368, |
|
"eval_samples_per_second": 56.379, |
|
"eval_steps_per_second": 7.049, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 8.337792860405927e-06, |
|
"loss": 0.0231, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.01797027327120304, |
|
"eval_runtime": 115.2093, |
|
"eval_samples_per_second": 56.784, |
|
"eval_steps_per_second": 7.1, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"learning_rate": 8.174639430920839e-06, |
|
"loss": 0.018, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"eval_loss": 0.018307719379663467, |
|
"eval_runtime": 116.9177, |
|
"eval_samples_per_second": 55.954, |
|
"eval_steps_per_second": 6.996, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"learning_rate": 8.01148600143575e-06, |
|
"loss": 0.0178, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"eval_loss": 0.018220532685518265, |
|
"eval_runtime": 119.4643, |
|
"eval_samples_per_second": 54.761, |
|
"eval_steps_per_second": 6.847, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"learning_rate": 7.848332571950663e-06, |
|
"loss": 0.0173, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"eval_loss": 0.018342604860663414, |
|
"eval_runtime": 119.7353, |
|
"eval_samples_per_second": 54.637, |
|
"eval_steps_per_second": 6.832, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"learning_rate": 7.685179142465575e-06, |
|
"loss": 0.0176, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"eval_loss": 0.01870131492614746, |
|
"eval_runtime": 119.9453, |
|
"eval_samples_per_second": 54.542, |
|
"eval_steps_per_second": 6.82, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"learning_rate": 7.5220257129804875e-06, |
|
"loss": 0.0177, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"eval_loss": 0.018137916922569275, |
|
"eval_runtime": 119.817, |
|
"eval_samples_per_second": 54.6, |
|
"eval_steps_per_second": 6.827, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"learning_rate": 7.358872283495399e-06, |
|
"loss": 0.0171, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"eval_loss": 0.018740132451057434, |
|
"eval_runtime": 119.4709, |
|
"eval_samples_per_second": 54.758, |
|
"eval_steps_per_second": 6.847, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"learning_rate": 7.195718854010312e-06, |
|
"loss": 0.019, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"eval_loss": 0.018057728186249733, |
|
"eval_runtime": 119.9707, |
|
"eval_samples_per_second": 54.53, |
|
"eval_steps_per_second": 6.818, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"learning_rate": 7.032565424525224e-06, |
|
"loss": 0.0174, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"eval_loss": 0.018135011196136475, |
|
"eval_runtime": 119.6311, |
|
"eval_samples_per_second": 54.685, |
|
"eval_steps_per_second": 6.838, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"learning_rate": 6.869411995040136e-06, |
|
"loss": 0.0179, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"eval_loss": 0.017942175269126892, |
|
"eval_runtime": 117.6519, |
|
"eval_samples_per_second": 55.605, |
|
"eval_steps_per_second": 6.953, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"learning_rate": 6.706258565555048e-06, |
|
"loss": 0.0166, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"eval_loss": 0.01796996220946312, |
|
"eval_runtime": 115.4295, |
|
"eval_samples_per_second": 56.675, |
|
"eval_steps_per_second": 7.087, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"learning_rate": 6.543105136069961e-06, |
|
"loss": 0.0174, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"eval_loss": 0.018622903153300285, |
|
"eval_runtime": 116.043, |
|
"eval_samples_per_second": 56.376, |
|
"eval_steps_per_second": 7.049, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"learning_rate": 6.379951706584873e-06, |
|
"loss": 0.0162, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"eval_loss": 0.017875785008072853, |
|
"eval_runtime": 119.775, |
|
"eval_samples_per_second": 54.619, |
|
"eval_steps_per_second": 6.829, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"learning_rate": 6.216798277099785e-06, |
|
"loss": 0.0163, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"eval_loss": 0.018203964456915855, |
|
"eval_runtime": 119.7603, |
|
"eval_samples_per_second": 54.626, |
|
"eval_steps_per_second": 6.83, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"learning_rate": 6.0536448476146966e-06, |
|
"loss": 0.0168, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"eval_loss": 0.017764363437891006, |
|
"eval_runtime": 119.5774, |
|
"eval_samples_per_second": 54.709, |
|
"eval_steps_per_second": 6.841, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"learning_rate": 5.890491418129609e-06, |
|
"loss": 0.0178, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"eval_loss": 0.017852840945124626, |
|
"eval_runtime": 119.5232, |
|
"eval_samples_per_second": 54.734, |
|
"eval_steps_per_second": 6.844, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"learning_rate": 5.727337988644521e-06, |
|
"loss": 0.0168, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"eval_loss": 0.017764879390597343, |
|
"eval_runtime": 119.6082, |
|
"eval_samples_per_second": 54.695, |
|
"eval_steps_per_second": 6.839, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"learning_rate": 5.564184559159433e-06, |
|
"loss": 0.0168, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.017377818003296852, |
|
"eval_runtime": 119.6291, |
|
"eval_samples_per_second": 54.686, |
|
"eval_steps_per_second": 6.838, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"learning_rate": 5.401031129674347e-06, |
|
"loss": 0.0143, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"eval_loss": 0.017800554633140564, |
|
"eval_runtime": 119.9539, |
|
"eval_samples_per_second": 54.538, |
|
"eval_steps_per_second": 6.819, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"learning_rate": 5.237877700189259e-06, |
|
"loss": 0.014, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"eval_loss": 0.0179632306098938, |
|
"eval_runtime": 118.0782, |
|
"eval_samples_per_second": 55.404, |
|
"eval_steps_per_second": 6.928, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"learning_rate": 5.074724270704171e-06, |
|
"loss": 0.0143, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"eval_loss": 0.018571963533759117, |
|
"eval_runtime": 115.3792, |
|
"eval_samples_per_second": 56.7, |
|
"eval_steps_per_second": 7.09, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"learning_rate": 4.911570841219083e-06, |
|
"loss": 0.0137, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"eval_loss": 0.018732914701104164, |
|
"eval_runtime": 116.2594, |
|
"eval_samples_per_second": 56.271, |
|
"eval_steps_per_second": 7.036, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"learning_rate": 4.748417411733995e-06, |
|
"loss": 0.0131, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"eval_loss": 0.018157465383410454, |
|
"eval_runtime": 119.6325, |
|
"eval_samples_per_second": 54.684, |
|
"eval_steps_per_second": 6.838, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"learning_rate": 4.585263982248907e-06, |
|
"loss": 0.0134, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"eval_loss": 0.01858236826956272, |
|
"eval_runtime": 119.626, |
|
"eval_samples_per_second": 54.687, |
|
"eval_steps_per_second": 6.838, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"learning_rate": 4.42211055276382e-06, |
|
"loss": 0.0131, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"eval_loss": 0.01760929264128208, |
|
"eval_runtime": 119.8276, |
|
"eval_samples_per_second": 54.595, |
|
"eval_steps_per_second": 6.826, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"learning_rate": 4.258957123278732e-06, |
|
"loss": 0.0138, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"eval_loss": 0.01776733435690403, |
|
"eval_runtime": 119.5072, |
|
"eval_samples_per_second": 54.741, |
|
"eval_steps_per_second": 6.845, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"learning_rate": 4.095803693793644e-06, |
|
"loss": 0.0131, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"eval_loss": 0.018140822649002075, |
|
"eval_runtime": 119.8335, |
|
"eval_samples_per_second": 54.592, |
|
"eval_steps_per_second": 6.826, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"learning_rate": 3.932650264308556e-06, |
|
"loss": 0.0139, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"eval_loss": 0.018083902075886726, |
|
"eval_runtime": 120.1704, |
|
"eval_samples_per_second": 54.439, |
|
"eval_steps_per_second": 6.807, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"learning_rate": 3.7694968348234683e-06, |
|
"loss": 0.0139, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"eval_loss": 0.018096571788191795, |
|
"eval_runtime": 119.7812, |
|
"eval_samples_per_second": 54.616, |
|
"eval_steps_per_second": 6.829, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"learning_rate": 3.6063434053383807e-06, |
|
"loss": 0.0133, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"eval_loss": 0.01770329661667347, |
|
"eval_runtime": 118.2737, |
|
"eval_samples_per_second": 55.312, |
|
"eval_steps_per_second": 6.916, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"learning_rate": 3.4431899758532926e-06, |
|
"loss": 0.0135, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"eval_loss": 0.01808938756585121, |
|
"eval_runtime": 115.7874, |
|
"eval_samples_per_second": 56.5, |
|
"eval_steps_per_second": 7.065, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"learning_rate": 3.280036546368205e-06, |
|
"loss": 0.0131, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"eval_loss": 0.017787907272577286, |
|
"eval_runtime": 115.8567, |
|
"eval_samples_per_second": 56.466, |
|
"eval_steps_per_second": 7.06, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"learning_rate": 3.116883116883117e-06, |
|
"loss": 0.0137, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"eval_loss": 0.017733994871377945, |
|
"eval_runtime": 120.2603, |
|
"eval_samples_per_second": 54.399, |
|
"eval_steps_per_second": 6.802, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"learning_rate": 2.9537296873980292e-06, |
|
"loss": 0.0133, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"eval_loss": 0.017949102446436882, |
|
"eval_runtime": 119.971, |
|
"eval_samples_per_second": 54.53, |
|
"eval_steps_per_second": 6.818, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"learning_rate": 2.7905762579129416e-06, |
|
"loss": 0.0136, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.017474107444286346, |
|
"eval_runtime": 119.96, |
|
"eval_samples_per_second": 54.535, |
|
"eval_steps_per_second": 6.819, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"learning_rate": 2.6274228284278535e-06, |
|
"loss": 0.0124, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"eval_loss": 0.018201593309640884, |
|
"eval_runtime": 119.9656, |
|
"eval_samples_per_second": 54.532, |
|
"eval_steps_per_second": 6.819, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"learning_rate": 2.464269398942766e-06, |
|
"loss": 0.0121, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"eval_loss": 0.01811986044049263, |
|
"eval_runtime": 119.7914, |
|
"eval_samples_per_second": 54.612, |
|
"eval_steps_per_second": 6.829, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"learning_rate": 2.3011159694576783e-06, |
|
"loss": 0.012, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"eval_loss": 0.018191542476415634, |
|
"eval_runtime": 119.8265, |
|
"eval_samples_per_second": 54.596, |
|
"eval_steps_per_second": 6.827, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"learning_rate": 2.13796253997259e-06, |
|
"loss": 0.0115, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"eval_loss": 0.018120231106877327, |
|
"eval_runtime": 119.6169, |
|
"eval_samples_per_second": 54.691, |
|
"eval_steps_per_second": 6.839, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"learning_rate": 1.9748091104875025e-06, |
|
"loss": 0.0117, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"eval_loss": 0.017889145761728287, |
|
"eval_runtime": 118.9939, |
|
"eval_samples_per_second": 54.978, |
|
"eval_steps_per_second": 6.874, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"learning_rate": 1.811655681002415e-06, |
|
"loss": 0.0113, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"eval_loss": 0.017741482704877853, |
|
"eval_runtime": 115.6814, |
|
"eval_samples_per_second": 56.552, |
|
"eval_steps_per_second": 7.071, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"learning_rate": 1.648502251517327e-06, |
|
"loss": 0.0124, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"eval_loss": 0.017794128507375717, |
|
"eval_runtime": 115.7328, |
|
"eval_samples_per_second": 56.527, |
|
"eval_steps_per_second": 7.068, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"learning_rate": 1.4853488220322392e-06, |
|
"loss": 0.012, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"eval_loss": 0.018301891162991524, |
|
"eval_runtime": 119.5898, |
|
"eval_samples_per_second": 54.704, |
|
"eval_steps_per_second": 6.84, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"learning_rate": 1.3221953925471516e-06, |
|
"loss": 0.0119, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"eval_loss": 0.01817336678504944, |
|
"eval_runtime": 120.0384, |
|
"eval_samples_per_second": 54.499, |
|
"eval_steps_per_second": 6.814, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"learning_rate": 1.1590419630620637e-06, |
|
"loss": 0.0115, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"eval_loss": 0.018085774034261703, |
|
"eval_runtime": 119.7931, |
|
"eval_samples_per_second": 54.611, |
|
"eval_steps_per_second": 6.828, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"learning_rate": 9.958885335769758e-07, |
|
"loss": 0.012, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"eval_loss": 0.017980104312300682, |
|
"eval_runtime": 119.8348, |
|
"eval_samples_per_second": 54.592, |
|
"eval_steps_per_second": 6.826, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"learning_rate": 8.327351040918881e-07, |
|
"loss": 0.0116, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"eval_loss": 0.0181511789560318, |
|
"eval_runtime": 119.9569, |
|
"eval_samples_per_second": 54.536, |
|
"eval_steps_per_second": 6.819, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"learning_rate": 6.695816746068002e-07, |
|
"loss": 0.0108, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"eval_loss": 0.018154002726078033, |
|
"eval_runtime": 119.8401, |
|
"eval_samples_per_second": 54.589, |
|
"eval_steps_per_second": 6.826, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"learning_rate": 5.064282451217125e-07, |
|
"loss": 0.0118, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"eval_loss": 0.01812034100294113, |
|
"eval_runtime": 120.1728, |
|
"eval_samples_per_second": 54.438, |
|
"eval_steps_per_second": 6.807, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"learning_rate": 3.4327481563662475e-07, |
|
"loss": 0.0114, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"eval_loss": 0.018135515972971916, |
|
"eval_runtime": 119.8212, |
|
"eval_samples_per_second": 54.598, |
|
"eval_steps_per_second": 6.827, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"learning_rate": 1.801213861515369e-07, |
|
"loss": 0.0121, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"eval_loss": 0.01807536743581295, |
|
"eval_runtime": 115.6623, |
|
"eval_samples_per_second": 56.561, |
|
"eval_steps_per_second": 7.072, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"learning_rate": 1.6967956666449132e-08, |
|
"loss": 0.0112, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"eval_loss": 0.01807805709540844, |
|
"eval_runtime": 115.5079, |
|
"eval_samples_per_second": 56.637, |
|
"eval_steps_per_second": 7.082, |
|
"step": 34000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 34052, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"total_flos": 3.237991334295552e+16, |
|
"train_batch_size": 18, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|